feat: initial implementation of data ingestion and streaming API

2025-09-20 14:26:19 +02:00
commit 5f3c31ec3f
4 changed files with 180 additions and 0 deletions

src/llm/main.py

@@ -0,0 +1,74 @@
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import requests
import json
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# --- Configuration (Same as before) ---
DB_PATH = "dune_db"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "llama3:8b"

PROMPT_TEMPLATE = """
You are a helpful AI assistant and an expert on the Dune book series.
Use the following pieces of context from the books to answer the user's question.
If you don't know the answer from the context provided, just say that you don't know, don't try to make up an answer.
Context:
{context}
Question:
{question}
Answer:
"""


# --- Pydantic Models (Same as before) ---
class AskRequest(BaseModel):
    question: str


# --- Initialize FastAPI and load resources (Same as before) ---
app = FastAPI()
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs={'trust_remote_code': True})
vector_store = Chroma(persist_directory=DB_PATH, embedding_function=embeddings)
retriever = vector_store.as_retriever(search_kwargs={"k": 5})


# --- NEW: The Streaming Endpoint ---
@app.post("/ask-stream")
async def ask_question_stream(request: AskRequest):
    print(f"🔍 Streaming request for: {request.question}")

    # 1. Retrieve context (this part is still blocking)
    retrieved_docs = retriever.invoke(request.question)
    context = "\n\n---\n\n".join([doc.page_content for doc in retrieved_docs])
    prompt = PROMPT_TEMPLATE.format(context=context, question=request.question)

    # 2. Define the generator for the streaming response
    async def stream_generator():
        try:
            ollama_payload = {
                "model": OLLAMA_MODEL,
                "prompt": prompt,
                "stream": True  # <-- The key change to enable streaming from Ollama
            }
            # Use stream=True to get a streaming response from requests
            with requests.post(OLLAMA_API_URL, json=ollama_payload, stream=True) as response:
                response.raise_for_status()
                # Ollama streams JSON objects separated by newlines
                for line in response.iter_lines():
                    if line:
                        chunk = json.loads(line)
                        # Yield the actual text part of the token
                        yield chunk.get("response", "")
        except requests.RequestException as e:
            print(f"❌ Error communicating with Ollama: {e}")
            yield "Error: Could not connect to the language model."
        except Exception as e:
            print(f"❌ An unexpected error occurred: {e}")
            yield "Error: An unexpected error occurred while generating the answer."

    # 3. Return the generator wrapped in a StreamingResponse
    return StreamingResponse(stream_generator(), media_type="text/plain")
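For reference, a minimal client sketch for exercising the new /ask-stream endpoint. It is not part of this commit: the serve command and the localhost:8000 address are assumptions, and the snippet only relies on the request body ({"question": ...}) and the plain-text streaming response defined above.

# client_example.py -- hypothetical helper, not part of this commit.
# Assumes the app is running locally, e.g. `uvicorn main:app --port 8000` from src/llm/.
import requests

API_URL = "http://localhost:8000/ask-stream"  # assumed local dev address


def ask_stream(question: str) -> None:
    """Send a question and print the answer as it streams back."""
    with requests.post(API_URL, json={"question": question}, stream=True) as response:
        response.raise_for_status()
        response.encoding = "utf-8"  # the endpoint streams plain UTF-8 text
        # chunk_size=None yields data as it arrives; decode_unicode=True gives str chunks
        for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
            print(chunk, end="", flush=True)
    print()


if __name__ == "__main__":
    ask_stream("Who are the Fremen?")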