# Ingestion script: load Dune book text files, chunk them, and persist
# a Chroma vector store built with nomic embeddings.
import os
|
|
import re
|
|
from langchain_community.document_loaders import TextLoader
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
from langchain_community.vectorstores import Chroma
|
|
from langchain_community.embeddings import HuggingFaceEmbeddings
|
|
|
|
# --- CONFIGURATION ---

# HuggingFace model used to embed each text chunk.
EMBEDDING_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"

# Directory containing the source "<order>_<title>.txt" book files.
DATA_PATH = "books"

# Directory where the Chroma vector store is persisted.
DB_PATH = "dune_db"

# Character-based splitting parameters for RecursiveCharacterTextSplitter.
CHUNK_SIZE = 2048
CHUNK_OVERLAP = 256
|
def main():
    """Entry point: build the Chroma vector store from the book files."""
    create_vector_store()
|
def create_vector_store():
    """
    Creates a Chroma vector store from documents in the DATA_PATH directory.

    Expects files named "<order>_<title>.txt" (e.g. "1_Dune.txt"): the numeric
    prefix becomes the ``book_order`` metadata and the remainder (underscores
    replaced by spaces) becomes ``book_title``. Files that do not match the
    pattern are skipped with a warning. The resulting store is persisted to
    DB_PATH.
    """
    print("Loading and processing documents...")

    book_files = sorted(f for f in os.listdir(DATA_PATH) if f.endswith(".txt"))

    # Hoisted out of the loop: the splitter is configuration-only and
    # identical for every book.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )

    all_splits = []
    for book_file in book_files:
        try:
            order_str, title_ext = book_file.split('_', 1)
            book_order = int(order_str)
            book_title = os.path.splitext(title_ext)[0].replace('_', ' ')
        except ValueError:
            # Covers both a missing '_' separator and a non-numeric prefix.
            print(f"Skipping file with unexpected format: {book_file}")
            continue

        print(f" - Processing Book {book_order}: {book_title}")

        file_path = os.path.join(DATA_PATH, book_file)
        loader = TextLoader(file_path, encoding="utf-8")
        documents = loader.load()

        for doc in documents:
            # Collapse runs of 3+ newlines to a single paragraph break and
            # trim surrounding whitespace before chunking.
            doc.page_content = re.sub(r'\n{3,}', '\n\n', doc.page_content)
            doc.page_content = doc.page_content.strip()

            doc.metadata = {
                "source": book_file,
                "book_title": book_title,
                "book_order": book_order,
            }

        all_splits.extend(text_splitter.split_documents(documents))

    print(f"Created {len(all_splits)} text chunks.")

    if not all_splits:
        # Nothing to embed: bail out instead of building an empty store.
        print(f"No chunks produced from '{DATA_PATH}'; nothing to ingest.")
        return

    print(f"Initializing embedding model: {EMBEDDING_MODEL_NAME}")
    embedding_model = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        # nomic-embed ships custom model code, so trust_remote_code must be
        # passed through model_kwargs to the underlying sentence-transformer.
        model_kwargs={'trust_remote_code': True}
    )

    print(f"Creating vector store and embedding {len(all_splits)} chunks...")
    # from_documents persists automatically when persist_directory is given;
    # the returned store object is not needed here.
    Chroma.from_documents(
        documents=all_splits,
        embedding=embedding_model,
        persist_directory=DB_PATH
    )

    print("--------------------------------------------------")
    print("Ingestion complete!")
    print(f"Vector store created at: {DB_PATH}")
    print("--------------------------------------------------")
|
|
|
|
if __name__ == "__main__":
    # Run the ingestion pipeline when executed as a script.
    main()