From 5f3c31ec3f573b6f396122135935bd4d357e4e94 Mon Sep 17 00:00:00 2001
From: Máté Farkas
Date: Sat, 20 Sep 2025 14:26:19 +0200
Subject: [PATCH] feat: initial implementation of data ingestion and streaming API

---
 src/llm/.gitignore       |  4 ++
 src/llm/ingest.py        | 82 ++++++++++++++++++++++++++++++++++++++++
 src/llm/main.py          | 74 ++++++++++++++++++++++++++++++++++++
 src/llm/requirements.txt | 21 ++++++++++
 4 files changed, 181 insertions(+)
 create mode 100644 src/llm/.gitignore
 create mode 100644 src/llm/ingest.py
 create mode 100644 src/llm/main.py
 create mode 100644 src/llm/requirements.txt

diff --git a/src/llm/.gitignore b/src/llm/.gitignore
new file mode 100644
index 0000000..6dd3ecb
--- /dev/null
+++ b/src/llm/.gitignore
@@ -0,0 +1,4 @@
+dune_db/
+books/
+venv/
+__pycache__/
\ No newline at end of file
diff --git a/src/llm/ingest.py b/src/llm/ingest.py
new file mode 100644
index 0000000..eba16e2
--- /dev/null
+++ b/src/llm/ingest.py
@@ -0,0 +1,82 @@
+import os
+import re
+from langchain_community.document_loaders import TextLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+from langchain_community.embeddings import HuggingFaceEmbeddings
+
+# --- CONFIGURATION ---
+EMBEDDING_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
+DATA_PATH = "books"
+DB_PATH = "dune_db"
+CHUNK_SIZE = 2048
+CHUNK_OVERLAP = 256
+
+def main():
+    """
+    Main function to run the data ingestion process.
+    """
+    create_vector_store()
+
+def create_vector_store():
+    """
+    Creates a Chroma vector store from documents in the DATA_PATH directory.
+    """
+    print("Loading and processing documents...")
+
+    book_files = sorted([f for f in os.listdir(DATA_PATH) if f.endswith(".txt")])
+
+    all_splits = []
+    for book_file in book_files:
+        try:
+            order_str, title_ext = book_file.split('_', 1)
+            book_order = int(order_str)
+            book_title = os.path.splitext(title_ext)[0].replace('_', ' ')
+        except ValueError:
+            print(f"Skipping file with unexpected format: {book_file}")
+            continue
+
+        print(f" - Processing Book {book_order}: {book_title}")
+
+        file_path = os.path.join(DATA_PATH, book_file)
+        loader = TextLoader(file_path, encoding="utf-8")
+        documents = loader.load()
+
+        for doc in documents:
+            doc.page_content = re.sub(r'\n{3,}', '\n\n', doc.page_content)
+            doc.page_content = doc.page_content.strip()
+
+            doc.metadata = {
+                "source": book_file,
+                "book_title": book_title,
+                "book_order": book_order
+            }
+
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
+        splits = text_splitter.split_documents(documents)
+        all_splits.extend(splits)
+
+    print(f"Created {len(all_splits)} text chunks.")
+
+    print(f"Initializing embedding model: {EMBEDDING_MODEL_NAME}")
+    # nomic-embed-text-v1.5 requires trust_remote_code=True; HuggingFaceEmbeddings
+    # only accepts it via model_kwargs, not as a top-level argument.
+    embedding_model = HuggingFaceEmbeddings(
+        model_name=EMBEDDING_MODEL_NAME,
+        model_kwargs={'trust_remote_code': True}
+    )
+
+    print(f"Creating vector store and embedding {len(all_splits)} chunks...")
+    vector_store = Chroma.from_documents(
+        documents=all_splits,
+        embedding=embedding_model,
+        persist_directory=DB_PATH
+    )
+
+    print("--------------------------------------------------")
+    print("Ingestion complete!")
+    print(f"Vector store created at: {DB_PATH}")
+    print("--------------------------------------------------")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/llm/main.py b/src/llm/main.py
new file mode 100644
index 0000000..516348c
--- /dev/null
+++ b/src/llm/main.py
@@ -0,0 +1,74 @@
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+import requests
+import json
+
+from langchain_community.vectorstores import Chroma
+from langchain_community.embeddings import HuggingFaceEmbeddings
+
+# --- Configuration ---
+DB_PATH = "dune_db"
+EMBEDDING_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"  # must match the model used in ingest.py
+OLLAMA_API_URL = "http://localhost:11434/api/generate"
+OLLAMA_MODEL = "llama3:8b"
+PROMPT_TEMPLATE = """
+You are a helpful AI assistant and an expert on the Dune book series.
+Use the following pieces of context from the books to answer the user's question.
+If you don't know the answer from the context provided, just say that you don't know; don't try to make up an answer.
+
+Context:
+{context}
+
+Question:
+{question}
+
+Answer:
+"""
+
+# --- Pydantic Models ---
+class AskRequest(BaseModel):
+    question: str
+
+# --- Initialize FastAPI and load resources ---
+app = FastAPI()
+embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs={'trust_remote_code': True})
+vector_store = Chroma(persist_directory=DB_PATH, embedding_function=embeddings)
+retriever = vector_store.as_retriever(search_kwargs={"k": 5})
+
+# --- Streaming Endpoint ---
+@app.post("/ask-stream")
+async def ask_question_stream(request: AskRequest):
+    print(f"🔍 Streaming request for: {request.question}")
+
+    # 1. Retrieve context (this part is still blocking)
+    retrieved_docs = retriever.invoke(request.question)
+    context = "\n\n---\n\n".join([doc.page_content for doc in retrieved_docs])
+    prompt = PROMPT_TEMPLATE.format(context=context, question=request.question)
+
+    # 2. Define the generator for the streaming response
+    async def stream_generator():
+        try:
+            ollama_payload = {
+                "model": OLLAMA_MODEL,
+                "prompt": prompt,
+                "stream": True  # ask Ollama to stream tokens as they are generated
+            }
+            # Use stream=True to get a streaming response from requests
+            with requests.post(OLLAMA_API_URL, json=ollama_payload, stream=True) as response:
+                response.raise_for_status()
+                # Ollama streams JSON objects separated by newlines
+                for line in response.iter_lines():
+                    if line:
+                        chunk = json.loads(line)
+                        # Yield the actual text part of the token
+                        yield chunk.get("response", "")
+        except requests.RequestException as e:
+            print(f"❌ Error communicating with Ollama: {e}")
+            yield "Error: Could not connect to the language model."
+        except Exception as e:
+            print(f"❌ An unexpected error occurred: {e}")
+            yield "Error: An unexpected error occurred while generating the answer."
+
+    # 3. Return the generator wrapped in a StreamingResponse
+    return StreamingResponse(stream_generator(), media_type="text/plain")
\ No newline at end of file
diff --git a/src/llm/requirements.txt b/src/llm/requirements.txt
new file mode 100644
index 0000000..252facf
--- /dev/null
+++ b/src/llm/requirements.txt
@@ -0,0 +1,21 @@
+# --- Core AI Framework ---
+langchain
+langchain-community
+langchain-text-splitters
+
+# --- API & Server ---
+fastapi
+uvicorn[standard]
+requests
+
+# --- Vector Database ---
+chromadb
+
+# --- Embedding Model Dependencies ---
+sentence-transformers
+nomic
+
+# --- Deep Learning Backend for Embeddings ---
+torch
+
+einops
\ No newline at end of file
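
For reference, a minimal client sketch for exercising the streaming endpoint (not part of the patch). It assumes the vector store has already been built with python ingest.py, that Ollama is serving llama3:8b locally, and that the API was started with something like uvicorn main:app, so the hypothetical BASE_URL below points at uvicorn's default port 8000.

# stream_client.py - minimal consumer of the /ask-stream endpoint (sketch).
import requests

BASE_URL = "http://localhost:8000"  # assumption: uvicorn's default host/port


def ask(question: str) -> None:
    """POST a question and print the answer as it streams back."""
    with requests.post(
        f"{BASE_URL}/ask-stream",
        json={"question": question},  # matches the AskRequest model in main.py
        stream=True,
    ) as response:
        response.raise_for_status()
        # The server streams plain UTF-8 text, so print each decoded chunk as it arrives.
        for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
            if chunk:
                print(chunk, end="", flush=True)
    print()


if __name__ == "__main__":
    ask("Who are the Fremen?")

Because the endpoint yields raw text rather than SSE or JSON lines, any HTTP client that can read the response body incrementally will work the same way.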