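"""Ingest creative-training documents into the "creative_mind" vector store.

Scans data/creative_training (one level above this script) for .txt, .json,
and .csv files, flattens each into text documents with per-file metadata,
and adds them to the VectorStore in batches.
"""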
import os
import sys
import json
import csv

# Make the project root importable so `core.rag.store` resolves when this
# script is run directly.
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from core.rag.store import VectorStore


def ingest_creative_data():
    print("\n🚀 Starting ADVANCED Creative Director Training...")

    store = VectorStore(collection_name="creative_mind")
    data_folder = os.path.join(parent_dir, "data", "creative_training")

    # Parallel lists: one document, one metadata dict, and one unique id per entry.
    documents = []
    metadatas = []
    ids = []

    if not os.path.exists(data_folder):
        print(f"❌ Folder missing: {data_folder}")
        return

    files_found = 0
    for filename in os.listdir(data_folder):
        file_path = os.path.join(data_folder, filename)

        try:
            if filename.endswith(".txt"):
                print(f" 📄 Reading Text: {filename}")
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                # Skip empty files; store each non-empty guide as one document.
                if content.strip():
                    documents.append(content)
                    metadatas.append({"source": filename, "type": "text_guide"})
                    ids.append(f"{filename}_full")
                    files_found += 1

            elif filename.endswith(".json"):
                print(f" ✨ Reading JSON: {filename}")
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                if isinstance(data, list):
                    for idx, item in enumerate(data):
                        # Flatten each entry into searchable text; fall back to
                        # str() for entries that are not dicts, which would
                        # otherwise crash on .get().
                        if isinstance(item, dict):
                            text_repr = f"Strategy: {item.get('title', 'Tip')}\nDetails: {item.get('content', item)}"
                        else:
                            text_repr = str(item)
                        documents.append(text_repr)
                        metadatas.append({"source": filename, "type": "json_entry"})
                        ids.append(f"{filename}_{idx}")
                    # Count the file once, not once per entry, so the final
                    # "Scanned N files" message stays accurate.
                    files_found += 1

            elif filename.endswith(".csv"):
                print(f" 📊 Reading CSV: {filename}")
                # newline="" is the csv module's recommended mode for open().
                with open(file_path, "r", encoding="utf-8", newline="") as f:
                    reader = csv.DictReader(f)
                    for idx, row in enumerate(reader):
                        # Serialise each row as "column: value" lines.
                        text_repr = "\n".join(f"{k}: {v}" for k, v in row.items())
                        documents.append(text_repr)
                        metadatas.append({"source": filename, "type": "csv_row"})
                        ids.append(f"{filename}_{idx}")
                files_found += 1

        except Exception as e:
            print(f" ⚠️ Error processing {filename}: {e}")

    if documents:
        print(f"\n 🧠 Embedding {len(documents)} data points into Vector DB...")

        # Add in batches so no single call to the store gets too large.
        batch_size = 50
        for i in range(0, len(documents), batch_size):
            end = min(i + batch_size, len(documents))
            print(f" - Batch {i} to {end}...")
            store.add_text(documents[i:end], metadatas[i:end], ids[i:end])

        print(f"✅ Training Complete! Scanned {files_found} files.")
    else:
        print("⚠️ No valid data found. Add .txt, .json, or .csv files.")


if __name__ == "__main__":
    ingest_creative_data()
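
# Quick retrieval sanity check after ingestion; a minimal sketch, assuming
# VectorStore also exposes a `query(text, n_results)` method (not shown in
# this file):
#
#   store = VectorStore(collection_name="creative_mind")
#   print(store.query("brand strategy tips", n_results=3))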