Implement streaming support for chat endpoint and add async chat completions
Build and Push Agent API / build (push) Successful in 5s

This commit is contained in:
2026-05-10 18:54:46 +02:00
parent 2677d381ce
commit 54ac77ab51
+119 -7
View File
@@ -1,6 +1,9 @@
from fastapi import APIRouter, Body, Depends
from fastapi.responses import StreamingResponse
from openai import OpenAI
from pydantic import BaseModel
import json
import asyncio
from api.dependencies import get_llm_client
@@ -12,7 +15,18 @@ class ChatRequest(BaseModel):
session_id: str | None = None
class ChatCompletionRequest(BaseModel):
messages: list[dict]
stream: bool = False
model: str = "deepseek-chat"
# ---------------------------------------------------------------------------
# Core helpers
# ---------------------------------------------------------------------------
def run_agent(client: OpenAI, message: str, session_id: str | None = None) -> str:
"""Non-streaming: returns the full response as a single string."""
response = client.chat.completions.create(
model="deepseek-chat",
messages=[
@@ -23,13 +37,68 @@ def run_agent(client: OpenAI, message: str, session_id: str | None = None) -> st
return response.choices[0].message.content
async def run_agent_stream(client: OpenAI, message: str, session_id: str | None = None):
"""Async generator that yields text tokens as they arrive from the LLM."""
loop = asyncio.get_running_loop()
# OpenAI's sync streaming iterator must run in a thread so it doesn't block the event loop
def _sync_stream():
stream = client.chat.completions.create(
model="deepseek-chat",
messages=[
{"role": "system", "content": "You are a helpful agent."},
{"role": "user", "content": message},
],
stream=True,
)
for chunk in stream:
delta = chunk.choices[0].delta
if delta and delta.content:
yield delta.content
# Run the sync generator in a thread, yield results back to the async world
gen = _sync_stream()
while True:
token = await loop.run_in_executor(None, next, gen, None)
if token is None:
break
yield token
# ---------------------------------------------------------------------------
# Endpoints
# ---------------------------------------------------------------------------
@router.get("/")
def root():
return {"status": "ok"}
@router.post("/chat")
def chat(req: ChatRequest, client: OpenAI = Depends(get_llm_client)):
async def chat(req: ChatRequest, client: OpenAI = Depends(get_llm_client)):
"""Streaming chat endpoint — returns Server-Sent Events."""
async def event_stream():
async for token in run_agent_stream(client, req.message, req.session_id):
payload = json.dumps({"token": token, "session_id": req.session_id})
yield f"data: {payload}\n\n"
# Signal completion
yield f"data: {json.dumps({'done': True, 'session_id': req.session_id})}\n\n"
return StreamingResponse(
event_stream(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Accel-Buffering": "no", # Disable nginx buffering if behind a proxy
},
)
@router.post("/chat/sync")
def chat_sync(req: ChatRequest, client: OpenAI = Depends(get_llm_client)):
"""Non-streaming fallback — returns the full response at once."""
response = run_agent(client, req.message, req.session_id)
return {"response": response, "session_id": req.session_id}
@@ -44,27 +113,70 @@ def list_models():
"object": "model",
"created": 0,
"owned_by": "local-agent",
}
},
],
}
@router.post("/chat/completions")
def chat_completions(
payload: dict = Body(...),
async def chat_completions(
req: ChatCompletionRequest,
client: OpenAI = Depends(get_llm_client),
):
messages = payload["messages"]
user_message = messages[-1]["content"]
response = run_agent(client, user_message)
"""OpenAI-compatible /chat/completions — supports stream=True."""
user_message = req.messages[-1]["content"]
if req.stream:
async def sse_stream():
async for token in run_agent_stream(client, user_message):
chunk = {
"id": "chatcmpl-local",
"object": "chat.completion.chunk",
"choices": [
{
"index": 0,
"delta": {"content": token},
"finish_reason": None,
}
],
}
yield f"data: {json.dumps(chunk)}\n\n"
# Final chunk with finish_reason
final_chunk = {
"id": "chatcmpl-local",
"object": "chat.completion.chunk",
"choices": [
{
"index": 0,
"delta": {},
"finish_reason": "stop",
}
],
}
yield f"data: {json.dumps(final_chunk)}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(
sse_stream(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
},
)
# Non-streaming path
response = run_agent(client, user_message)
return {
"id": "chatcmpl-local",
"object": "chat.completion",
"created": 0,
"model": req.model,
"choices": [
{
"index": 0,
"message": {"role": "assistant", "content": response},
"finish_reason": "stop",
}
],
}