|
|
import os |
|
|
import shutil |
|
|
from langchain_community.document_loaders import DirectoryLoader, TextLoader |
|
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
|
from langchain_community.vectorstores import Chroma |
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
from langchain_core.documents import Document |
|
|
from typing import List |
|
|
|
|
|
|
|
|
# Absolute path of the directory that contains this script.
SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))

# Project root: the parent directory of the scripts folder.
ROOT_DIR = os.path.dirname(SCRIPTS_DIR)

# Source markdown documents, organized into role-named sub-folders
# ("brand", "influencer", "common" — see load_knowledge_base()).
KNOWLEDGE_BASE_DIR = os.path.join(ROOT_DIR, "knowledge_base")

# Local embedding model directory, loaded via HuggingFaceEmbeddings.
EMBEDDING_MODEL_PATH = os.path.join(ROOT_DIR, "embedding_model")

# Persistent Chroma store location; WRITABLE_DIR lets deployments
# redirect it somewhere other than /tmp.
DB_PATH = os.path.join(os.environ.get("WRITABLE_DIR", "/tmp"), "vector_db_persistent")
|
|
|
|
|
|
|
|
def load_documents_with_metadata(directory: str, role: str) -> List[Document]:
    """Load every markdown file under *directory* and tag it with a role.

    Args:
        directory: Folder scanned recursively for ``*.md`` files.
        role: Value written into each document's ``metadata["role"]`` so the
            retriever can later filter documents by audience.

    Returns:
        The loaded documents, each carrying the ``role`` metadata key, or an
        empty list when the directory does not exist.
    """
    if not os.path.isdir(directory):
        print(f"β οΈ Warning: Directory '{directory}' not found. Skipping role '{role}'.")
        return []

    docs = DirectoryLoader(
        directory, glob="**/*.md", show_progress=True, loader_cls=TextLoader
    ).load()

    for document in docs:
        # Defensive: guarantee a metadata dict exists before tagging it.
        document.metadata = {} if document.metadata is None else document.metadata
        document.metadata["role"] = role

    print(f" > Loaded {len(docs)} documents for role '{role}'")
    return docs
|
|
|
|
|
def load_knowledge_base():
    """Build the persistent Chroma vector store from the knowledge base.

    Loads markdown documents from the role-specific sub-folders
    (``brand``, ``influencer``, ``common``), tags each with ``role``
    metadata, splits them into overlapping chunks, embeds them with the
    local HuggingFace model, and persists the result at ``DB_PATH``.

    Any existing store at ``DB_PATH`` is deleted first so every run is a
    clean rebuild; returns early (without creating a store) when no
    documents are found.
    """
    # Start from scratch: a stale store would mix old and new chunks.
    if os.path.exists(DB_PATH):
        print(f"π§Ή Found existing DB at '{DB_PATH}'. Deleting it to rebuild.")
        shutil.rmtree(DB_PATH)

    print("π Loading documents with role metadata...")

    brand_docs = load_documents_with_metadata(os.path.join(KNOWLEDGE_BASE_DIR, "brand"), "brand")
    influencer_docs = load_documents_with_metadata(os.path.join(KNOWLEDGE_BASE_DIR, "influencer"), "influencer")
    common_docs = load_documents_with_metadata(os.path.join(KNOWLEDGE_BASE_DIR, "common"), "common")

    all_documents = brand_docs + influencer_docs + common_docs

    if not all_documents:
        print("β οΈ No documents were found in any sub-folder. Aborting.")
        return

    print(f"π Found {len(all_documents)} documents in total.")

    print("πͺ Splitting documents into smaller chunks...")
    # 500-char chunks with 50-char overlap keep context across boundaries.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(all_documents)
    print(f"Split documents into {len(chunks)} chunks.")

    print(f"π§ Loading embedding model from: {EMBEDDING_MODEL_PATH}")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_PATH)

    print(f"πΎ Creating and persisting vector store at: {DB_PATH}")
    # Chroma persists automatically when persist_directory is given; the
    # returned store object is not needed afterwards, so don't bind it.
    Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=DB_PATH)

    print("\n" + "=" * 50)
    # FIX: the original success-message literal was split across two
    # physical lines by a mis-encoded emoji, which is a SyntaxError.
    print("✅ Knowledge base loaded with role metadata successfully!")
    print(f"Database is stored at: {DB_PATH}")
    print("=" * 50 + "\n")
|
|
|
|
|
# Script entry point: rebuild the vector store when run directly
# (`python <this_script>.py`); importing the module has no side effects.
if __name__ == "__main__":
    load_knowledge_base()