Commit 01c71d2
Parent(s): 628b5b4

feat(thunderbird): Add market intelligence module with new model and APIs

Files changed:
- api/main.py +30 -1
- api/thunderbird_routes.py +44 -0
- core/thunderbird_engine.py +66 -0
- models/thunderbird_market_predictor_v1.joblib +3 -0
- requirements.txt +0 -0 (binary)
- scripts/export_thunderbird_training_data.py +141 -0
- training/train_thunderbird_market_predictor.py +90 -0
api/main.py CHANGED

@@ -31,6 +31,7 @@ from core.document_parser import parse_pdf_from_url
 from core.creative_chat import CreativeDirector
 from core.matcher import load_embedding_model
 from core.community_brain import CommunityBrain
+from core.thunderbird_engine import get_external_trends, predict_niche_trends
 
 try:
     from core.rag.store import VectorStore
@@ -1713,4 +1714,32 @@ def summarize_community_thread(request: ThreadSummaryRequest):
         return ThreadSummaryResponse(summary="Summary unavailable.")
 
     summary = _community_brain.summarize_thread(request.comments)
-    return ThreadSummaryResponse(summary=summary)
+    return ThreadSummaryResponse(summary=summary)
+
+
+# =============================================================
+# === ⚡️ PROJECT THUNDERBIRD - MARKET INTELLIGENCE HUB ===
+# =============================================================
+
+@app.post("/thunderbird/get_pulse_data", summary="Get All Data for Market Intelligence 'Pulse' Page")
+def get_pulse_data_endpoint():
+    """
+    This is the main orchestrator endpoint for the /pulse page.
+    It calls all necessary Thunderbird engine functions and combines their data.
+    """
+    print("🚀 API HIT: /thunderbird/get_pulse_data")
+    try:
+        # Call core logic functions in sequence
+        live_trends = get_external_trends()
+        niche_predictions = predict_niche_trends()
+        # Add future AI briefing calls here
+
+        # Combine results into one object for the frontend
+        return {
+            **live_trends,
+            **niche_predictions,
+        }
+    except Exception as e:
+        print(f"❌ API ERROR in /thunderbird/get_pulse_data: {e}")
+        traceback.print_exc()
+        raise HTTPException(status_code=500, detail=str(e))
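Because the endpoint spreads both engine dicts with **, the keys from get_external_trends() and predict_niche_trends() land flat at the top level of a single JSON object. Below is a minimal smoke-test sketch using FastAPI's TestClient; the api.main.app import path is assumed from the file layout, and "trend_predictions" only appears when the joblib model is actually present:

# Hypothetical smoke test; assumes the FastAPI app object is importable
# as api.main.app and that the joblib model file exists (without it,
# predict_niche_trends() contributes an "error" key instead).
from fastapi.testclient import TestClient
from api.main import app  # assumed import path

client = TestClient(app)

def test_pulse_endpoint_shape():
    resp = client.post("/thunderbird/get_pulse_data")
    assert resp.status_code == 200
    data = resp.json()
    # Keys merged flat by the ** spreads in the endpoint body.
    for key in ("news_headlines", "breakout_keyword", "trending_audio", "trend_predictions"):
        assert key in data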
api/thunderbird_routes.py ADDED

@@ -0,0 +1,44 @@
+from fastapi import APIRouter, HTTPException, Depends
+from typing import Dict, Any
+
+# Import the brain functions we just created
+from core.thunderbird_engine import get_external_trends, predict_niche_trends
+
+# FastAPI router for all Thunderbird-related endpoints
+router = APIRouter(
+    prefix="/thunderbird",  # All routes in this file will start with /thunderbird
+    tags=["Thunderbird - Market Intelligence"],  # For Swagger UI documentation
+)
+
+# --- ENDPOINTS ---
+
+@router.post("/get_pulse_data")
+async def get_pulse_data() -> Dict[str, Any]:
+    """
+    This is the main endpoint for the /pulse page.
+    It calls all necessary engine functions and combines their data into a single response.
+    """
+    print("🚀 API HIT: /thunderbird/get_pulse_data")
+    try:
+        # Call our core logic functions
+        live_trends = get_external_trends()
+        niche_predictions = predict_niche_trends()
+        # In the future, we'll add the AI briefing call here as well
+
+        # Combine all results into a single, clean JSON object for the frontend
+        combined_data = {
+            **live_trends,
+            **niche_predictions,
+            # "ai_briefing": ai_briefing_result (for later)
+        }
+
+        print("✅ API SUCCESS: /thunderbird/get_pulse_data")
+        return combined_data
+
+    except Exception as e:
+        print(f"❌ API ERROR in /get_pulse_data: {e}")
+        # In case of an error, send a structured error message to the frontend
+        raise HTTPException(
+            status_code=500,
+            detail=f"An internal error occurred in the Thunderbird engine: {e}"
+        )
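Note that this router declares the same POST /thunderbird/get_pulse_data route that main.py now defines inline, and a router has no effect until it is registered on the app. The diff does not show main.py including it, so the wiring below is an assumption; if both the router and the inline handler were registered, requests would go to whichever route was added first:

# Hypothetical wiring; this commit's diff does not show main.py doing this.
from fastapi import FastAPI
from api.thunderbird_routes import router as thunderbird_router  # assumed import path

app = FastAPI()
app.include_router(thunderbird_router)  # exposes POST /thunderbird/get_pulse_data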
core/thunderbird_engine.py ADDED

@@ -0,0 +1,66 @@
+import os
+import pandas as pd
+import joblib
+import random
+from datetime import datetime
+from newsapi import NewsApiClient
+import feedparser
+
+# --- CONFIGURATION ---
+MODEL_PATH = os.path.join(os.path.dirname(__file__), '..', 'models', 'thunderbird_market_predictor_v1.joblib')
+NEWS_API_KEY = os.getenv("NEWS_API_KEY")
+
+# --- CORE FUNCTIONS ---
+def get_external_trends() -> dict:
+    """Fetches real-time 'live' data from external news APIs and RSS feeds."""
+    print("📡 [Thunderbird Engine] Fetching live external trends...")
+    results = {
+        "news_headlines": [],
+        "breakout_keyword": None,
+        "trending_audio": None
+    }
+
+    if NEWS_API_KEY:
+        try:
+            newsapi = NewsApiClient(api_key=NEWS_API_KEY)
+            top_headlines = newsapi.get_everything(
+                q='("influencer marketing" OR "social media marketing" OR "creator economy")',
+                language='en', sort_by='relevancy', page_size=5
+            )
+            results["news_headlines"] = [{"title": article['title'], "url": article['url']} for article in top_headlines.get('articles', [])]
+            print(f"  - ✅ Found {len(results['news_headlines'])} news articles.")
+        except Exception as e:
+            print(f"  - ⚠️ NewsAPI Error: {e}")
+            results["news_headlines"] = [{"title": "News service currently unavailable.", "url": "#"}]
+
+    # Simulate other trends for now to allow frontend development
+    results["breakout_keyword"] = "AI in Marketing"
+    trending_audios = [{"name": "Espresso - Sabrina Carpenter", "cover_art_url": "https://via.placeholder.com/150"}]
+    results["trending_audio"] = random.choice(trending_audios)
+    print("  - ✅ (Simulated) Found trending keyword and audio.")
+    return results
+
+def predict_niche_trends() -> dict:
+    """Loads our trained ML model to predict future interest in market niches."""
+    print("\n🔮 [Thunderbird Engine] Loading model to predict niche trends...")
+    try:
+        model_pack = joblib.load(MODEL_PATH)
+        model = model_pack['model']
+        encoder = model_pack['encoder']
+        print(f"  - ✅ Model '{os.path.basename(MODEL_PATH)}' loaded successfully.")
+    except FileNotFoundError:
+        print(f"  - ❌ CRITICAL: Model file not found at '{MODEL_PATH}'.")
+        return {"error": "Prediction model not found."}
+
+    print("  - ⚠️ NOTE: Generating SIMULATED trend data as training set is small.")
+    niches = encoder.get_feature_names_out(['niche'])
+    dates = pd.date_range(end=datetime.now(), periods=12, freq='M').strftime('%Y-%m').tolist()
+    predictions = {}
+    for niche_col_name in niches:
+        niche_name = niche_col_name.split('_')[-1]
+        points = [random.randint(40, 60)]
+        for _ in range(11):
+            points.append(max(20, min(100, points[-1] + random.randint(-10, 10))))
+        predictions[niche_name] = [{"date": date, "value": value} for date, value in zip(dates, points)]
+    print(f"  - ✅ (Simulated) Generated trend predictions for niches: {list(predictions.keys())}")
+    return {"trend_predictions": predictions}
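The engine is plain Python, so it can be exercised without the web layer. A sketch of a standalone check, run from the repo root; the key list in the comments is read off the function bodies above, not from a captured run:

# Manual check of the engine outside FastAPI. If the model file is
# missing, predict_niche_trends() contributes {"error": ...} instead
# of "trend_predictions".
from core.thunderbird_engine import get_external_trends, predict_niche_trends

pulse = {**get_external_trends(), **predict_niche_trends()}

# Expected top-level keys per the code above:
#   news_headlines    -> list of {"title", "url"} dicts
#   breakout_keyword  -> str (currently hard-coded)
#   trending_audio    -> {"name", "cover_art_url"}
#   trend_predictions -> {niche: [{"date": "YYYY-MM", "value": int}, ...]}
print(sorted(pulse.keys()))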
models/thunderbird_market_predictor_v1.joblib ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed3d6a91acfe6d33d16ebbe8ef80c77b8df399af3205594ffba29391e9037dac
+size 64706
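This is a Git LFS pointer, not the model itself: the repository tracks only this three-line stub (spec version, SHA-256 oid, byte size), while the roughly 63 KB binary lives in LFS storage and is materialized by "git lfs pull".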
requirements.txt CHANGED

Binary files a/requirements.txt and b/requirements.txt differ
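Git reports a file as binary when it cannot decode its contents as text, which is also why the file stat above shows +0 -0. For a requirements.txt this is usually an encoding quirk (for example, a "pip freeze" redirect on Windows PowerShell writing UTF-16) rather than a genuinely binary file, though the diff itself gives no detail to confirm that here.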
scripts/export_thunderbird_training_data.py ADDED

@@ -0,0 +1,141 @@
+import os
+import pandas as pd
+from datetime import datetime, timedelta
+from dotenv import load_dotenv
+from supabase import create_client, Client
+from pytrends.request import TrendReq
+import time
+import random
+
+# --- CONFIGURATION (No changes) ---
+load_dotenv()
+SUPABASE_URL = os.getenv("SUPABASE_URL")
+SUPABASE_KEY = os.getenv("SUPABASE_SERVICE_KEY")
+if not SUPABASE_URL or not SUPABASE_KEY:
+    raise ValueError("Supabase URL and Service Key must be set.")
+supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
+NICHES_TO_TRACK = ["fashion", "gaming", "fitness", "skincare", "finance"]
+MONTHS_TO_FETCH = 12
+OUTPUT_FILE = os.path.join(os.path.dirname(__file__), '..', 'data', 'thunderbird_market_trends.csv')
+
+# --- get_successful_campaign_counts() --- (No changes needed, this function is correct)
+def get_successful_campaign_counts() -> pd.DataFrame:
+    print("📊 Fetching successful campaign data from Supabase...")
+    end_date = datetime.now()
+    start_date = end_date - timedelta(days=MONTHS_TO_FETCH * 30)
+    try:
+        response = supabase.table('campaigns').select('id, title, description, created_at') \
+            .eq('status', 'completed') \
+            .gte('created_at', start_date.isoformat()) \
+            .lte('created_at', end_date.isoformat()) \
+            .execute()
+        if not response.data:
+            print("⚠️ No campaign data found in the specified date range.")
+            return pd.DataFrame()
+        df = pd.DataFrame(response.data)
+        df['created_at'] = pd.to_datetime(df['created_at'])
+        df['month'] = df['created_at'].dt.to_period('M')
+        def assign_niche(row):
+            text_to_search = f"{row.get('title', '')} {row.get('description', '')}".lower()
+            for niche in NICHES_TO_TRACK:
+                if niche in text_to_search:
+                    return niche
+            return "general"
+        df['niche'] = df.apply(assign_niche, axis=1)
+        monthly_counts = df.groupby(['month', 'niche']).size().reset_index(name='successful_campaigns')
+        print(f"✅ Found and processed {len(df)} successful campaigns.")
+        return monthly_counts
+    except Exception as e:
+        print(f"❌ Error fetching data from Supabase: {e}")
+        return pd.DataFrame()
+
+# --- get_google_trends_data() --- (UPDATED)
+def get_google_trends_data() -> pd.DataFrame:
+    print("\n📈 Fetching historical market interest from Google Trends (Robust Mode)...")
+
+    # Increase retries and backoff for more resilience
+    pytrends = TrendReq(hl='en-US', tz=360, retries=5, backoff_factor=1)
+
+    end_date = datetime.now()
+    start_date = end_date - timedelta(days=MONTHS_TO_FETCH * 30)
+    timeframe = f"{start_date.strftime('%Y-%m-%d')} {end_date.strftime('%Y-%m-%d')}"
+
+    all_trends_df = pd.DataFrame()
+
+    for niche in NICHES_TO_TRACK:
+        print(f"  - Fetching trend data for '{niche}'...")
+        try:
+            pytrends.build_payload([niche], cat=0, timeframe=timeframe, geo='', gprop='')
+            interest_over_time_df = pytrends.interest_over_time()
+
+            if not interest_over_time_df.empty and niche in interest_over_time_df:
+                interest_over_time_df = interest_over_time_df.rename(columns={niche: 'trend_score'})
+                interest_over_time_df['niche'] = niche
+                all_trends_df = pd.concat([all_trends_df, interest_over_time_df[['trend_score', 'niche']]])
+            else:
+                print(f"  - ℹ️ No trend data returned for '{niche}'.")
+
+            # --- THE FIX: LONGER, MORE RANDOM DELAY ---
+            sleep_time = random.uniform(5, 12)  # Wait for 5 to 12 seconds
+            print(f"  - 😴 Sleeping for {sleep_time:.2f} seconds...")
+            time.sleep(sleep_time)
+            # ----------------------------------------
+
+        except Exception as e:
+            # We specifically catch the 429 error text
+            if "response with code 429" in str(e) or "too many 429 error responses" in str(e):
+                print(f"  - 🛑 Hit rate limit hard for '{niche}'. Taking a long 2-minute break...")
+                time.sleep(120)  # Take a long break if we still get blocked
+            else:
+                print(f"  - ⚠️ A non-rate-limit error occurred for '{niche}'. Error: {e}")
+            continue
+
+    if all_trends_df.empty:
+        print("⚠️ Warning: Could not fetch any data from Google Trends. Proceeding without trend scores.")
+        return pd.DataFrame()
+
+    all_trends_df['month'] = all_trends_df.index.to_period('M')
+    monthly_trends = all_trends_df.groupby(['month', 'niche'])['trend_score'].mean().reset_index()
+
+    print(f"✅ Successfully fetched and processed Google Trends data.")
+    return monthly_trends
+
+
+# --- main() function --- (UPDATED)
+def main():
+    """Main function to run the script."""
+    print("--- Starting Project Thunderbird Data Export ---")
+
+    campaign_df = get_successful_campaign_counts()
+
+    if campaign_df.empty:
+        print("\n❌ No campaign data found. Aborting training file creation.")
+        return
+
+    trends_df = get_google_trends_data()
+
+    # --- THE FIX: USE A 'LEFT' MERGE ---
+    if not trends_df.empty:
+        print("\n🔗 Merging campaign success data with market trend data...")
+        training_df = pd.merge(campaign_df, trends_df, on=['month', 'niche'], how='left')
+        # Fill any missing trend scores with 0
+        training_df['trend_score'].fillna(0, inplace=True)
+    else:
+        print("\n⚠️ No trends data was fetched. Creating training file with only campaign data.")
+        training_df = campaign_df
+        training_df['trend_score'] = 0  # Add the column so our model doesn't break
+    # ----------------------------------
+
+    # Convert Period to string for CSV
+    training_df['month'] = training_df['month'].astype(str)
+
+    # Save the final dataframe to a CSV file
+    try:
+        training_df.to_csv(OUTPUT_FILE, index=False)
+        print(f"\n✅ Success! Training data has been saved to:")
+        print(f"  {OUTPUT_FILE}")
+    except Exception as e:
+        print(f"\n❌ Error saving training data to CSV: {e}")
+
+if __name__ == "__main__":
+    main()
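The export produces one row per (month, niche) pair, and the training script below consumes exactly the niche, trend_score, and successful_campaigns columns. A small sketch for sanity-checking the file before training; the example row in the comment is illustrative, not real data:

# Validate the exported CSV against what the trainer expects. The
# required column set follows from the groupby/merge logic in main().
import pandas as pd

df = pd.read_csv("data/thunderbird_market_trends.csv")

required = {"month", "niche", "successful_campaigns", "trend_score"}
missing = required - set(df.columns)
assert not missing, f"CSV is missing columns: {missing}"

# e.g. month=2024-05, niche=fitness, successful_campaigns=3, trend_score=71.25
print(df.head())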
training/train_thunderbird_market_predictor.py ADDED

@@ -0,0 +1,90 @@
+import os
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.metrics import mean_squared_error
+from sklearn.preprocessing import OneHotEncoder
+import joblib
+
+# --- CONFIGURATION ---
+
+# Path to the training data we created in the previous step
+INPUT_FILE = os.path.join(os.path.dirname(__file__), '..', 'data', 'thunderbird_market_trends.csv')
+
+# Path to save the trained model
+MODEL_OUTPUT_FILE = os.path.join(os.path.dirname(__file__), '..', 'models', 'thunderbird_market_predictor_v1.joblib')
+
+# --- MAIN SCRIPT ---
+
+def train_model():
+    """
+    Loads the training data, prepares it for the model, trains the model,
+    and saves the final version to a .joblib file.
+    """
+    print("--- Starting Project Thunderbird Model Training ---")
+
+    # 1. Load Data
+    try:
+        df = pd.read_csv(INPUT_FILE)
+        print(f"✅ Successfully loaded training data from '{INPUT_FILE}'")
+    except FileNotFoundError:
+        print(f"❌ Error: Training data file not found at '{INPUT_FILE}'.")
+        print("  Please run `scripts/export_thunderbird_training_data.py` first.")
+        return
+
+    # 2. Prepare Data (Feature Engineering)
+    print("\n🔧 Preparing data for the model...")
+    # 'month' and 'niche' are categorical. The model needs numbers.
+    # We will use one-hot encoding for the 'niche'.
+    encoder = OneHotEncoder(handle_unknown='ignore')
+    niche_encoded = encoder.fit_transform(df[['niche']]).toarray()
+
+    # Create a new DataFrame with the encoded columns
+    niche_df = pd.DataFrame(niche_encoded, columns=encoder.get_feature_names_out(['niche']))
+
+    # We won't use 'month' directly, as the trend score already has the time component.
+    # Our features are the market trend and the niche type.
+    X = pd.concat([df[['trend_score']], niche_df], axis=1)
+
+    # Our target is to predict how many successful campaigns there will be.
+    y = df['successful_campaigns']
+
+    print(f"✅ Data prepared. Features: {X.columns.tolist()}")
+
+    # 3. Split data for training and testing
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+    print(f"\n📊 Splitting data: {len(X_train)} rows for training, {len(X_test)} rows for testing.")
+
+    # 4. Train the Model
+    print("\n🧠 Training the Gradient Boosting Regressor model...")
+
+    # Gradient Boosting is a good choice for this kind of tabular data.
+    model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
+    model.fit(X_train, y_train)
+
+    print("✅ Model training complete.")
+
+    # 5. Evaluate the Model (optional, but good practice)
+    predictions = model.predict(X_test)
+    mse = mean_squared_error(y_test, predictions)
+    print(f"\n📊 Model evaluation (Mean Squared Error): {mse:.2f}")
+    print("  (Lower is better. A small number means the model's predictions are close to the real values).")
+
+
+    # 6. Save the Trained Model and the Encoder
+    print(f"\n💾 Saving the trained model and encoder...")
+    try:
+        # We need to save BOTH the model AND the encoder, so we can use it for predictions later.
+        model_and_encoder = {
+            'model': model,
+            'encoder': encoder
+        }
+        joblib.dump(model_and_encoder, MODEL_OUTPUT_FILE)
+        print(f"✅ Success! Model has been saved to:")
+        print(f"  {MODEL_OUTPUT_FILE}")
+    except Exception as e:
+        print(f"\n❌ Error saving model file: {e}")
+
+
+if __name__ == "__main__":
+    train_model()
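Bundling the encoder with the model is what makes inference reproducible later: a new niche must be one-hot encoded into exactly the columns the model was trained on. A minimal sketch of consuming the saved bundle; the niche and trend_score values are arbitrary examples:

# Load the bundle written by train_model() and score one (niche, trend) pair.
# The feature layout must match training: trend_score first, then the
# one-hot niche columns produced by the saved encoder.
import joblib
import pandas as pd

pack = joblib.load("models/thunderbird_market_predictor_v1.joblib")
model, encoder = pack["model"], pack["encoder"]

niche_cols = encoder.get_feature_names_out(["niche"])
one_hot = encoder.transform(pd.DataFrame({"niche": ["fitness"]})).toarray()

X = pd.concat(
    [pd.DataFrame({"trend_score": [72.0]}),  # illustrative value
     pd.DataFrame(one_hot, columns=niche_cols)],
    axis=1,
)
print(f"Predicted successful campaigns: {model.predict(X)[0]:.1f}")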