feat: initial implementation of data ingestion and streaming API
src/llm/main.py (new file, 74 lines)
@@ -0,0 +1,74 @@
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import requests
import json

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# --- Configuration (Same as before) ---
DB_PATH = "dune_db"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "llama3:8b"
PROMPT_TEMPLATE = """
You are a helpful AI assistant and an expert on the Dune book series.
Use the following pieces of context from the books to answer the user's question.
If you don't know the answer from the context provided, just say that you don't know, don't try to make up an answer.

Context:
{context}

Question:
{question}

Answer:
"""

# --- Pydantic Models (Same as before) ---
class AskRequest(BaseModel):
    question: str

# --- Initialize FastAPI and load resources (Same as before) ---
app = FastAPI()
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs={'trust_remote_code': True})
vector_store = Chroma(persist_directory=DB_PATH, embedding_function=embeddings)
retriever = vector_store.as_retriever(search_kwargs={"k": 5})

# --- NEW: The Streaming Endpoint ---
@app.post("/ask-stream")
async def ask_question_stream(request: AskRequest):
    print(f"🔍 Streaming request for: {request.question}")

    # 1. Retrieve context (this part is still blocking)
    retrieved_docs = retriever.invoke(request.question)
    context = "\n\n---\n\n".join([doc.page_content for doc in retrieved_docs])
    prompt = PROMPT_TEMPLATE.format(context=context, question=request.question)

    # 2. Define the generator for the streaming response
    async def stream_generator():
        try:
            ollama_payload = {
                "model": OLLAMA_MODEL,
                "prompt": prompt,
                "stream": True  # <-- The key change to enable streaming from Ollama
            }
            # Use stream=True to get a streaming response from requests
            with requests.post(OLLAMA_API_URL, json=ollama_payload, stream=True) as response:
                response.raise_for_status()
                # Ollama streams JSON objects separated by newlines
                for line in response.iter_lines():
                    if line:
                        chunk = json.loads(line)
                        # Yield the actual text part of the token
                        yield chunk.get("response", "")
        except requests.RequestException as e:
            print(f"❌ Error communicating with Ollama: {e}")
            yield "Error: Could not connect to the language model."
        except Exception as e:
            print(f"❌ An unexpected error occurred: {e}")
            yield "Error: An unexpected error occurred while generating the answer."

    # 3. Return the generator wrapped in a StreamingResponse
    return StreamingResponse(stream_generator(), media_type="text/plain")
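A minimal sketch of how a client could consume the new /ask-stream endpoint, assuming the app is served locally (the host, port, and example question are illustrative assumptions, not part of this commit):

# Hypothetical client-side check of /ask-stream; host/port are assumptions,
# e.g. after starting the server with `uvicorn src.llm.main:app --port 8000`.
import requests

with requests.post(
    "http://localhost:8000/ask-stream",
    json={"question": "Who is Paul Atreides?"},
    stream=True,
) as response:
    response.raise_for_status()
    # Print tokens as they arrive instead of waiting for the full answer.
    for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
        if chunk:
            print(chunk, end="", flush=True)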