import os
import re

from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

# --- CONFIGURATION ---
EMBEDDING_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
DATA_PATH = "books"      # directory holding "<order>_<Title>.txt" files
DB_PATH = "dune_db"      # where the Chroma store is persisted
CHUNK_SIZE = 2048        # characters per chunk
CHUNK_OVERLAP = 256      # character overlap between consecutive chunks


def main():
    """
    Main function to run the data ingestion process.
    """
    create_vector_store()


def _parse_book_filename(book_file):
    """
    Parse a filename of the form ``<order>_<Title_With_Underscores>.txt``.

    Args:
        book_file: bare filename, e.g. ``"1_Dune.txt"``.

    Returns:
        Tuple ``(book_order, book_title)`` — the integer ordering prefix and
        the human-readable title (underscores replaced with spaces).

    Raises:
        ValueError: if the name has no ``_`` separator or the prefix is not
            an integer (same condition the caller uses to skip the file).
    """
    order_str, title_ext = book_file.split('_', 1)
    book_order = int(order_str)  # ValueError here for non-numeric prefixes
    book_title = os.path.splitext(title_ext)[0].replace('_', ' ')
    return book_order, book_title


def create_vector_store():
    """
    Creates a Chroma vector store from documents in the DATA_PATH directory.

    Each ``.txt`` file is loaded, lightly normalized (runs of 3+ newlines
    collapsed to paragraph breaks), tagged with book metadata, split into
    overlapping chunks, embedded with EMBEDDING_MODEL_NAME, and persisted
    to DB_PATH.

    Raises:
        FileNotFoundError: if DATA_PATH does not exist.
    """
    if not os.path.isdir(DATA_PATH):
        # Fail early with a clearer message than the bare os.listdir error
        # (same exception type callers would have seen anyway).
        raise FileNotFoundError(
            f"Data directory '{DATA_PATH}' does not exist; nothing to ingest."
        )

    print("Loading and processing documents...")
    book_files = sorted(f for f in os.listdir(DATA_PATH) if f.endswith(".txt"))

    # The splitter is configuration-only, so build it once outside the loop.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )

    all_splits = []
    for book_file in book_files:
        try:
            book_order, book_title = _parse_book_filename(book_file)
        except ValueError:
            print(f"Skipping file with unexpected format: {book_file}")
            continue

        print(f" - Processing Book {book_order}: {book_title}")
        file_path = os.path.join(DATA_PATH, book_file)
        loader = TextLoader(file_path, encoding="utf-8")
        documents = loader.load()

        for doc in documents:
            # Collapse 3+ consecutive newlines so only paragraph breaks remain.
            doc.page_content = re.sub(r'\n{3,}', '\n\n', doc.page_content).strip()
            doc.metadata = {
                "source": book_file,
                "book_title": book_title,
                "book_order": book_order,
            }

        all_splits.extend(text_splitter.split_documents(documents))

    print(f"Created {len(all_splits)} text chunks.")

    print(f"Initializing embedding model: {EMBEDDING_MODEL_NAME}")
    # nomic-embed models require trust_remote_code; HuggingFaceEmbeddings only
    # accepts it inside model_kwargs, not as a top-level constructor argument.
    embedding_model = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={'trust_remote_code': True},
    )

    print(f"Creating vector store and embedding {len(all_splits)} chunks...")
    Chroma.from_documents(
        documents=all_splits,
        embedding=embedding_model,
        persist_directory=DB_PATH,
    )

    print("--------------------------------------------------")
    print("Ingestion complete!")
    print(f"Vector store created at: {DB_PATH}")
    print("--------------------------------------------------")


if __name__ == "__main__":
    main()