"""Train the Project Thunderbird market-trend model and save it with joblib."""
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import joblib
# --- CONFIGURATION ---
# All paths are resolved relative to this script's directory so the
# script works regardless of the current working directory.
_SCRIPT_DIR = os.path.dirname(__file__)

# Training data produced by the export step.
INPUT_FILE = os.path.join(_SCRIPT_DIR, '..', 'data', 'thunderbird_market_trends.csv')
# Destination for the trained model artifact.
MODEL_OUTPUT_FILE = os.path.join(_SCRIPT_DIR, '..', 'models', 'thunderbird_market_predictor_v1.joblib')
# --- MAIN SCRIPT ---
def train_model():
    """Load the training data, fit the model, and persist it to disk.

    Reads the CSV at ``INPUT_FILE``, one-hot encodes the categorical
    ``niche`` column, trains a GradientBoostingRegressor on
    ``trend_score`` plus the encoded niche to predict
    ``successful_campaigns``, reports test-set MSE, and saves BOTH the
    fitted model and the fitted encoder to ``MODEL_OUTPUT_FILE``.

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    # NOTE(review): the status-emoji prefixes below reconstruct mojibake
    # in the original source — confirm the intended glyphs.
    print("--- Starting Project Thunderbird Model Training ---")

    # 1. Load data. Missing file is an expected condition (export step
    #    not run yet), so handle it with a message instead of a traceback.
    try:
        df = pd.read_csv(INPUT_FILE)
        print(f"✅ Successfully loaded training data from '{INPUT_FILE}'")
    except FileNotFoundError:
        print(f"❌ Error: Training data file not found at '{INPUT_FILE}'.")
        print("   Please run `scripts/export_thunderbird_training_data.py` first.")
        return

    # 2. Prepare data (feature engineering).
    print("\n📊 Preparing data for the model...")
    # 'niche' is categorical; one-hot encode it. handle_unknown='ignore'
    # lets prediction-time inputs contain niches unseen during training.
    encoder = OneHotEncoder(handle_unknown='ignore')
    niche_encoded = encoder.fit_transform(df[['niche']]).toarray()
    # Fix: carry df's index explicitly so the concat below aligns rows
    # even if the CSV load ever yields a non-default index.
    niche_df = pd.DataFrame(
        niche_encoded,
        columns=encoder.get_feature_names_out(['niche']),
        index=df.index,
    )
    # 'month' is intentionally unused: the trend score already carries
    # the time component.
    X = pd.concat([df[['trend_score']], niche_df], axis=1)
    # Target: number of successful campaigns.
    y = df['successful_campaigns']
    print(f"✅ Data prepared. Features: {X.columns.tolist()}")

    # 3. Hold out 20% of rows for evaluation (fixed seed for reproducibility).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"\n🔀 Splitting data: {len(X_train)} rows for training, {len(X_test)} rows for testing.")

    # 4. Train the model. Gradient boosting suits small tabular data.
    print("\n🧠 Training the Gradient Boosting Regressor model...")
    model = GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    )
    model.fit(X_train, y_train)
    print("✅ Model training complete.")

    # 5. Evaluate on the held-out split.
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f"\n📈 Model evaluation (Mean Squared Error): {mse:.2f}")
    print("   (Lower is better. A small number means the model's predictions are close to the real values).")

    # 6. Save the trained model AND the encoder together, so prediction
    #    code can re-encode niches exactly as during training.
    print("\n💾 Saving the trained model and encoder...")
    try:
        # Fix: ensure the output directory exists — joblib.dump raises
        # FileNotFoundError on a fresh checkout without 'models/'.
        os.makedirs(os.path.dirname(MODEL_OUTPUT_FILE), exist_ok=True)
        model_and_encoder = {
            'model': model,
            'encoder': encoder,
        }
        joblib.dump(model_and_encoder, MODEL_OUTPUT_FILE)
        print("✅ Success! Model has been saved to:")
        print(f"   {MODEL_OUTPUT_FILE}")
    except Exception as e:
        # Best-effort save: report and exit rather than crash.
        print(f"\n❌ Error saving model file: {e}")
# Run the training pipeline only when executed as a script.
if __name__ == "__main__":
    train_model()