|
|
import os |
|
|
import shutil |
|
|
from langchain_community.document_loaders import DirectoryLoader, TextLoader |
|
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
|
from langchain_community.vectorstores import Chroma |
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
from langchain_core.documents import Document |
|
|
from typing import List |
|
|
|
|
|
|
|
|
# Absolute path of the directory that contains this script.
SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))

# Project root: the parent directory of the scripts folder.
ROOT_DIR = os.path.dirname(SCRIPTS_DIR)

# Source markdown documents, organized into role-named sub-folders
# ("brand", "influencer", "common" — see load_knowledge_base()).
KNOWLEDGE_BASE_DIR = os.path.join(ROOT_DIR, "knowledge_base")

# Local embedding model directory, loaded via HuggingFaceEmbeddings.
EMBEDDING_MODEL_PATH = os.path.join(ROOT_DIR, "embedding_model")

# Persistent Chroma store location; WRITABLE_DIR lets deployments
# redirect it somewhere other than /tmp.
DB_PATH = os.path.join(os.environ.get("WRITABLE_DIR", "/tmp"), "vector_db_persistent")
|
|
|
|
|
|
|
|
def load_documents_with_metadata(directory: str, role: str) -> List[Document]:
    """Load every markdown file under *directory* and tag it with a role.

    Args:
        directory: Folder scanned recursively for ``*.md`` files.
        role: Value written into each document's ``metadata["role"]`` so the
            retriever can later filter documents by audience.

    Returns:
        The loaded documents, each carrying the ``role`` metadata key, or an
        empty list when the directory does not exist.
    """
    if not os.path.isdir(directory):
        print(f"β οΈ Warning: Directory '{directory}' not found. Skipping role '{role}'.")
        return []

    docs = DirectoryLoader(
        directory, glob="**/*.md", show_progress=True, loader_cls=TextLoader
    ).load()

    for document in docs:
        # Defensive: guarantee a metadata dict exists before tagging it.
        document.metadata = {} if document.metadata is None else document.metadata
        document.metadata["role"] = role

    print(f" > Loaded {len(docs)} documents for role '{role}'")
    return docs
|
|
|
|
|
def load_knowledge_base():
    """Build the persistent Chroma vector store from the knowledge base.

    Loads markdown documents from the role-specific sub-folders
    (``brand``, ``influencer``, ``common``), tags each with ``role``
    metadata, splits them into overlapping chunks, embeds them with the
    local HuggingFace model, and persists the result at ``DB_PATH``.

    Any existing store at ``DB_PATH`` is deleted first so every run is a
    clean rebuild; returns early (without creating a store) when no
    documents are found.
    """
    # Start from scratch: a stale store would mix old and new chunks.
    if os.path.exists(DB_PATH):
        print(f"π§Ή Found existing DB at '{DB_PATH}'. Deleting it to rebuild.")
        shutil.rmtree(DB_PATH)

    print("π Loading documents with role metadata...")

    brand_docs = load_documents_with_metadata(os.path.join(KNOWLEDGE_BASE_DIR, "brand"), "brand")
    influencer_docs = load_documents_with_metadata(os.path.join(KNOWLEDGE_BASE_DIR, "influencer"), "influencer")
    common_docs = load_documents_with_metadata(os.path.join(KNOWLEDGE_BASE_DIR, "common"), "common")

    all_documents = brand_docs + influencer_docs + common_docs

    if not all_documents:
        print("β οΈ No documents were found in any sub-folder. Aborting.")
        return

    print(f"π Found {len(all_documents)} documents in total.")

    print("πͺ Splitting documents into smaller chunks...")
    # 500-char chunks with 50-char overlap keep context across boundaries.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(all_documents)
    print(f"Split documents into {len(chunks)} chunks.")

    print(f"π§ Loading embedding model from: {EMBEDDING_MODEL_PATH}")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_PATH)

    print(f"πΎ Creating and persisting vector store at: {DB_PATH}")
    # Chroma persists automatically when persist_directory is given; the
    # returned store object is not needed afterwards, so don't bind it.
    Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=DB_PATH)

    print("\n" + "=" * 50)
    # FIX: the original success-message literal was split across two
    # physical lines by a mis-encoded emoji, which is a SyntaxError.
    print("✅ Knowledge base loaded with role metadata successfully!")
    print(f"Database is stored at: {DB_PATH}")
    print("=" * 50 + "\n")
|
|
|
|
|
# Script entry point: rebuild the vector store when run directly
# (`python <this_script>.py`); importing the module has no side effects.
if __name__ == "__main__":
    load_knowledge_base()