feat: initial implementation of data ingestion and streaming API

2025-09-20 14:26:19 +02:00
commit 5f3c31ec3f
4 changed files with 180 additions and 0 deletions

src/llm/.gitignore

@@ -0,0 +1,4 @@
dune_db/
books/
venv/
__pycache__/

src/llm/ingest.py

@@ -0,0 +1,82 @@
import os
import re
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# --- CONFIGURATION ---
EMBEDDING_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
DATA_PATH = "books"
DB_PATH = "dune_db"
CHUNK_SIZE = 2048
CHUNK_OVERLAP = 256


def main():
    """
    Main function to run the data ingestion process.
    """
    create_vector_store()


def create_vector_store():
    """
    Creates a Chroma vector store from documents in the DATA_PATH directory.
    """
    print("Loading and processing documents...")
    book_files = sorted([f for f in os.listdir(DATA_PATH) if f.endswith(".txt")])

    all_splits = []
    for book_file in book_files:
        try:
            order_str, title_ext = book_file.split('_', 1)
            book_order = int(order_str)
            book_title = os.path.splitext(title_ext)[0].replace('_', ' ')
        except ValueError:
            print(f"Skipping file with unexpected format: {book_file}")
            continue

        print(f"  - Processing Book {book_order}: {book_title}")
        file_path = os.path.join(DATA_PATH, book_file)
        loader = TextLoader(file_path, encoding="utf-8")
        documents = loader.load()

        for doc in documents:
            # Collapse runs of blank lines and trim surrounding whitespace.
            doc.page_content = re.sub(r'\n{3,}', '\n\n', doc.page_content)
            doc.page_content = doc.page_content.strip()
            doc.metadata = {
                "source": book_file,
                "book_title": book_title,
                "book_order": book_order
            }

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
        splits = text_splitter.split_documents(documents)
        all_splits.extend(splits)

    print(f"Created {len(all_splits)} text chunks.")

    print(f"Initializing embedding model: {EMBEDDING_MODEL_NAME}")
    embedding_model = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={'trust_remote_code': True}  # nomic-embed requires trust_remote_code
    )

    print(f"Creating vector store and embedding {len(all_splits)} chunks...")
    vector_store = Chroma.from_documents(
        documents=all_splits,
        embedding=embedding_model,
        persist_directory=DB_PATH
    )

    print("--------------------------------------------------")
    print("Ingestion complete!")
    print(f"Vector store created at: {DB_PATH}")
    print("--------------------------------------------------")


if __name__ == "__main__":
    main()
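The split('_', 1) logic above implies a naming convention for the source texts: every file in books/ is expected to look like <order>_<Title_With_Underscores>.txt, for example 1_Dune.txt, and anything else is skipped. A minimal pre-flight check for that convention, as a sketch (the check_books.py name and the example output are illustrative, not part of this commit):

# check_books.py (hypothetical): verify books/ matches the naming convention ingest.py expects
import os

DATA_PATH = "books"

for name in sorted(os.listdir(DATA_PATH)):
    if not name.endswith(".txt"):
        continue
    try:
        order_str, _ = name.split("_", 1)
        int(order_str)  # the prefix must parse as an integer book order
        print(f"OK:      {name}")
    except ValueError:
        print(f"SKIPPED: {name} (expected <order>_<title>.txt)")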

src/llm/main.py

@@ -0,0 +1,74 @@
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import requests
import json
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# --- Configuration ---
DB_PATH = "dune_db"
# Must match the embedding model used in ingest.py, otherwise the query vectors
# will not line up with the dimensions of the persisted Chroma collection.
EMBEDDING_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "llama3:8b"

PROMPT_TEMPLATE = """
You are a helpful AI assistant and an expert on the Dune book series.
Use the following pieces of context from the books to answer the user's question.
If you don't know the answer from the context provided, just say that you don't know; don't try to make up an answer.

Context:
{context}

Question:
{question}

Answer:
"""


# --- Pydantic models ---
class AskRequest(BaseModel):
    question: str


# --- Initialize FastAPI and load resources ---
app = FastAPI()
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs={'trust_remote_code': True})
vector_store = Chroma(persist_directory=DB_PATH, embedding_function=embeddings)
retriever = vector_store.as_retriever(search_kwargs={"k": 5})


# --- Streaming endpoint ---
@app.post("/ask-stream")
async def ask_question_stream(request: AskRequest):
    print(f"🔍 Streaming request for: {request.question}")

    # 1. Retrieve context (this part is still blocking).
    retrieved_docs = retriever.invoke(request.question)
    context = "\n\n---\n\n".join([doc.page_content for doc in retrieved_docs])
    prompt = PROMPT_TEMPLATE.format(context=context, question=request.question)

    # 2. Define the generator for the streaming response.
    async def stream_generator():
        try:
            ollama_payload = {
                "model": OLLAMA_MODEL,
                "prompt": prompt,
                "stream": True  # ask Ollama to stream tokens as they are generated
            }
            # stream=True makes requests yield the response body incrementally.
            with requests.post(OLLAMA_API_URL, json=ollama_payload, stream=True) as response:
                response.raise_for_status()
                # Ollama streams newline-delimited JSON objects.
                for line in response.iter_lines():
                    if line:
                        chunk = json.loads(line)
                        # Yield only the text part of the token.
                        yield chunk.get("response", "")
        except requests.RequestException as e:
            print(f"❌ Error communicating with Ollama: {e}")
            yield "Error: Could not connect to the language model."
        except Exception as e:
            print(f"❌ An unexpected error occurred: {e}")
            yield "Error: An unexpected error occurred while generating the answer."

    # 3. Return the generator wrapped in a StreamingResponse.
    return StreamingResponse(stream_generator(), media_type="text/plain")
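Because /ask-stream returns plain-text chunks as they are generated, a client has to read the response body incrementally instead of waiting for a complete JSON payload. A minimal client sketch, assuming the app is served locally with uvicorn main:app on port 8000 (the port and the question are illustrative):

# stream_client.py (hypothetical): consume the /ask-stream endpoint chunk by chunk
import requests

resp = requests.post(
    "http://localhost:8000/ask-stream",
    json={"question": "Who are the Bene Gesserit?"},
    stream=True,
)
resp.raise_for_status()

# Print each text chunk as soon as the server sends it.
for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
    print(chunk, end="", flush=True)
print()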

src/llm/requirements.txt

@@ -0,0 +1,20 @@
# --- Core AI Framework ---
langchain
langchain-community
langchain-text-splitters

# --- API & Server ---
fastapi
uvicorn[standard]

# --- Vector Database ---
chromadb

# --- Embedding Model Dependencies ---
sentence-transformers
nomic

# --- Deep Learning Backend for Embeddings ---
torch
einops