Implement streaming support for chat endpoint and add async chat completions

2026-05-10 18:54:46 +02:00
parent 2677d381ce
commit 54ac77ab51
1 changed files with 119 additions and 7 deletions
@@ -1,6 +1,9 @@
 from fastapi import APIRouter, Body, Depends
 from fastapi.responses import StreamingResponse
 from openai import OpenAI
 from pydantic import BaseModel
 import json
 import asyncio
 from api.dependencies import get_llm_client
@@ -12,7 +15,18 @@ class ChatRequest(BaseModel):
    session_id: str | None = None
 class ChatCompletionRequest(BaseModel):
    messages: list[dict]
    stream: bool = False
    model: str = "deepseek-chat"
 # ---------------------------------------------------------------------------
 # Core helpers
 # ---------------------------------------------------------------------------
 def run_agent(client: OpenAI, message: str, session_id: str | None = None) -> str:
    """Non-streaming: returns the full response as a single string."""
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
@@ -23,13 +37,68 @@ def run_agent(client: OpenAI, message: str, session_id: str | None = None) -> st
    return response.choices[0].message.content
 async def run_agent_stream(client: OpenAI, message: str, session_id: str | None = None):
    """Async generator that yields text tokens as they arrive from the LLM."""
    loop = asyncio.get_running_loop()
    # OpenAI's sync streaming iterator must run in a thread so it doesn't block the event loop
    def _sync_stream():
        stream = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a helpful agent."},
                {"role": "user", "content": message},
            ],
            stream=True,
        )
        for chunk in stream:
            delta = chunk.choices[0].delta
            if delta and delta.content:
                yield delta.content
    # Run the sync generator in a thread, yield results back to the async world
    gen = _sync_stream()
    while True:
        token = await loop.run_in_executor(None, next, gen, None)
        if token is None:
            break
        yield token
 # ---------------------------------------------------------------------------
 # Endpoints
 # ---------------------------------------------------------------------------
@router.get("/")
 def root():
    return {"status": "ok"}
@router.post("/chat")
-def chat(req: ChatRequest, client: OpenAI = Depends(get_llm_client)):
+async def chat(req: ChatRequest, client: OpenAI = Depends(get_llm_client)):
    """Streaming chat endpoint — returns Server-Sent Events."""
    async def event_stream():
        async for token in run_agent_stream(client, req.message, req.session_id):
            payload = json.dumps({"token": token, "session_id": req.session_id})
            yield f"data: {payload}\n\n"
        # Signal completion
        yield f"data: {json.dumps({'done': True, 'session_id': req.session_id})}\n\n"
    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",   # Disable nginx buffering if behind a proxy
        },
    )
@router.post("/chat/sync")
 def chat_sync(req: ChatRequest, client: OpenAI = Depends(get_llm_client)):
    """Non-streaming fallback — returns the full response at once."""
    response = run_agent(client, req.message, req.session_id)
    return {"response": response, "session_id": req.session_id}
@@ -44,27 +113,70 @@ def list_models():
                "object": "model",
                "created": 0,
                "owned_by": "local-agent",
-            }
+            },
        ],
    }
@router.post("/chat/completions")
-def chat_completions(
+async def chat_completions(
-    payload: dict = Body(...),
+    req: ChatCompletionRequest,
    client: OpenAI = Depends(get_llm_client),
 ):
-    messages = payload["messages"]
+    """OpenAI-compatible /chat/completions — supports stream=True."""
-    user_message = messages[-1]["content"]
+    user_message = req.messages[-1]["content"]
    response = run_agent(client, user_message)
    if req.stream:
        async def sse_stream():
            async for token in run_agent_stream(client, user_message):
                chunk = {
                    "id": "chatcmpl-local",
                    "object": "chat.completion.chunk",
                    "choices": [
                        {
                            "index": 0,
                            "delta": {"content": token},
                            "finish_reason": None,
                        }
                    ],
                }
                yield f"data: {json.dumps(chunk)}\n\n"
            # Final chunk with finish_reason
            final_chunk = {
                "id": "chatcmpl-local",
                "object": "chat.completion.chunk",
                "choices": [
                    {
                        "index": 0,
                        "delta": {},
                        "finish_reason": "stop",
                    }
                ],
            }
            yield f"data: {json.dumps(final_chunk)}\n\n"
            yield "data: [DONE]\n\n"
        return StreamingResponse(
            sse_stream(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
            },
        )
    # Non-streaming path
    response = run_agent(client, user_message)
    return {
        "id": "chatcmpl-local",
        "object": "chat.completion",
        "created": 0,
        "model": req.model,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": response},
                "finish_reason": "stop",
            }
        ],
    }