# reachify-ai-service / scripts/load_knowledge.py
# Uploaded by amitbhatt6075 β€” "Complete fresh start - FINAL UPLOAD" (commit 0914e96, 3.36 kB)
import os
import shutil
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import List
# Path configuration: everything is resolved relative to this script's own
# location so the loader works no matter the current working directory.
SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = os.path.dirname(SCRIPTS_DIR) # Go one level up from /scripts to /ai-service
KNOWLEDGE_BASE_DIR = os.path.join(ROOT_DIR, "knowledge_base")
EMBEDDING_MODEL_PATH = os.path.join(ROOT_DIR, "embedding_model")
# The vector DB goes under WRITABLE_DIR (falling back to /tmp) β€” presumably the
# deployment filesystem is read-only outside that location; confirm with the host config.
DB_PATH = os.path.join(os.environ.get("WRITABLE_DIR", "/tmp"), "vector_db_persistent")
def load_documents_with_metadata(directory: str, role: str) -> List[Document]:
    """
    Load every markdown file under ``directory`` and tag each resulting
    document with a ``role`` metadata entry so retrieval can later be
    filtered per audience. Returns an empty list if the folder is absent.
    """
    # Guard clause: a missing role folder is not fatal β€” just yields nothing.
    if not os.path.isdir(directory):
        print(f"⚠️ Warning: Directory '{directory}' not found. Skipping role '{role}'.")
        return []
    docs = DirectoryLoader(
        directory, glob="**/*.md", show_progress=True, loader_cls=TextLoader
    ).load()
    for document in docs:
        # Normalize metadata before tagging: some loaders may leave it unset.
        if document.metadata is None:
            document.metadata = {}
        document.metadata["role"] = role
    print(f" > Loaded {len(docs)} documents for role '{role}'")
    return docs
def load_knowledge_base():
    """
    Rebuild the persistent ChromaDB vector store from the role-specific
    knowledge-base folders (brand / influencer / common). Each document is
    tagged with role metadata, split into overlapping chunks, embedded with
    the local HuggingFace model, and persisted to DB_PATH.
    """
    # Start from a clean slate: any previous database is discarded and rebuilt.
    if os.path.exists(DB_PATH):
        print(f"🧹 Found existing DB at '{DB_PATH}'. Deleting it to rebuild.")
        shutil.rmtree(DB_PATH)

    print("πŸ“š Loading documents with role metadata...")
    corpus = []
    for role in ("brand", "influencer", "common"):
        corpus.extend(load_documents_with_metadata(os.path.join(KNOWLEDGE_BASE_DIR, role), role))

    # Nothing to index β€” bail out rather than build an empty store.
    if not corpus:
        print("⚠️ No documents were found in any sub-folder. Aborting.")
        return
    print(f"πŸ“„ Found {len(corpus)} documents in total.")

    print("πŸ”ͺ Splitting documents into smaller chunks...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    pieces = splitter.split_documents(corpus)
    print(f"Split documents into {len(pieces)} chunks.")

    print(f"🧠 Loading embedding model from: {EMBEDDING_MODEL_PATH}")
    embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_PATH)

    # Chroma writes the store to disk at persist_directory as part of creation.
    print(f"πŸ’Ύ Creating and persisting vector store at: {DB_PATH}")
    Chroma.from_documents(documents=pieces, embedding=embedder, persist_directory=DB_PATH)

    print("\n" + "="*50)
    print("βœ… Knowledge base loaded with role metadata successfully!")
    print(f"Database is stored at: {DB_PATH}")
    print("="*50 + "\n")
# Allow running this module directly as a one-shot ingestion script.
if __name__ == "__main__":
    load_knowledge_base()