Implement streaming support for chat endpoint and add async chat completions

2026-05-10 18:54:46 +02:00
parent 2677d381ce
commit 54ac77ab51
1 changed files with 119 additions and 7 deletions
@@ -1,6 +1,9 @@
 from fastapi import APIRouter, Body, Depends
+from fastapi.responses import StreamingResponse
 from openai import OpenAI
 from pydantic import BaseModel
+import json
+import asyncio

 from api.dependencies import get_llm_client

@@ -12,7 +15,18 @@ class ChatRequest(BaseModel):
    session_id: str | None = None


+class ChatCompletionRequest(BaseModel):
+    messages: list[dict]
+    stream: bool = False
+    model: str = "deepseek-chat"
+
+
+# ---------------------------------------------------------------------------
+# Core helpers
+# ---------------------------------------------------------------------------
+
 def run_agent(client: OpenAI, message: str, session_id: str | None = None) -> str:
+    """Non-streaming: returns the full response as a single string."""
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
@@ -23,13 +37,68 @@ def run_agent(client: OpenAI, message: str, session_id: str | None = None) -> st
    return response.choices[0].message.content


+async def run_agent_stream(client: OpenAI, message: str, session_id: str | None = None):
+    """Async generator that yields text tokens as they arrive from the LLM."""
+    loop = asyncio.get_running_loop()
+
+    # OpenAI's sync streaming iterator must run in a thread so it doesn't block the event loop
+    def _sync_stream():
+        stream = client.chat.completions.create(
+            model="deepseek-chat",
+            messages=[
+                {"role": "system", "content": "You are a helpful agent."},
+                {"role": "user", "content": message},
+            ],
+            stream=True,
+        )
+        for chunk in stream:
+            delta = chunk.choices[0].delta
+            if delta and delta.content:
+                yield delta.content
+
+    # Run the sync generator in a thread, yield results back to the async world
+    gen = _sync_stream()
+    while True:
+        token = await loop.run_in_executor(None, next, gen, None)
+        if token is None:
+            break
+        yield token
+
+
+# ---------------------------------------------------------------------------
+# Endpoints
+# ---------------------------------------------------------------------------
+
@router.get("/")
 def root():
    return {"status": "ok"}


@router.post("/chat")
-def chat(req: ChatRequest, client: OpenAI = Depends(get_llm_client)):
+async def chat(req: ChatRequest, client: OpenAI = Depends(get_llm_client)):
+    """Streaming chat endpoint — returns Server-Sent Events."""
+    async def event_stream():
+        async for token in run_agent_stream(client, req.message, req.session_id):
+            payload = json.dumps({"token": token, "session_id": req.session_id})
+            yield f"data: {payload}\n\n"
+
+        # Signal completion
+        yield f"data: {json.dumps({'done': True, 'session_id': req.session_id})}\n\n"
+
+    return StreamingResponse(
+        event_stream(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "X-Accel-Buffering": "no",   # Disable nginx buffering if behind a proxy
+        },
+    )
+
+
+@router.post("/chat/sync")
+def chat_sync(req: ChatRequest, client: OpenAI = Depends(get_llm_client)):
+    """Non-streaming fallback — returns the full response at once."""
    response = run_agent(client, req.message, req.session_id)
    return {"response": response, "session_id": req.session_id}

@@ -44,27 +113,70 @@ def list_models():
                "object": "model",
                "created": 0,
                "owned_by": "local-agent",
-            }
+            },
        ],
    }


@router.post("/chat/completions")
-def chat_completions(
-    payload: dict = Body(...),
+async def chat_completions(
+    req: ChatCompletionRequest,
    client: OpenAI = Depends(get_llm_client),
 ):
-    messages = payload["messages"]
-    user_message = messages[-1]["content"]
-    response = run_agent(client, user_message)
+    """OpenAI-compatible /chat/completions — supports stream=True."""
+    user_message = req.messages[-1]["content"]

+    if req.stream:
+        async def sse_stream():
+            async for token in run_agent_stream(client, user_message):
+                chunk = {
+                    "id": "chatcmpl-local",
+                    "object": "chat.completion.chunk",
+                    "choices": [
+                        {
+                            "index": 0,
+                            "delta": {"content": token},
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+                yield f"data: {json.dumps(chunk)}\n\n"
+            # Final chunk with finish_reason
+            final_chunk = {
+                "id": "chatcmpl-local",
+                "object": "chat.completion.chunk",
+                "choices": [
+                    {
+                        "index": 0,
+                        "delta": {},
+                        "finish_reason": "stop",
+                    }
+                ],
+            }
+            yield f"data: {json.dumps(final_chunk)}\n\n"
+            yield "data: [DONE]\n\n"
+
+        return StreamingResponse(
+            sse_stream(),
+            media_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+            },
+        )
+
+    # Non-streaming path
+    response = run_agent(client, user_message)
    return {
        "id": "chatcmpl-local",
        "object": "chat.completion",
+        "created": 0,
+        "model": req.model,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": response},
+                "finish_reason": "stop",
            }
        ],
    }