reachify-ai-service / training /train_thunderbird_market_predictor.py
amitbhatt6075's picture
feat(thunderbird): Add market intelligence module with new model and APIs
01c71d2
raw
history blame
3.59 kB
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import joblib
# --- CONFIGURATION ---
# Both paths are resolved relative to the directory that contains this script,
# stepping up one level to reach the sibling `data` and `models` directories.

# CSV file holding the training rows read by train_model().
INPUT_FILE = os.path.join(
    os.path.dirname(__file__),
    os.pardir,
    'data',
    'thunderbird_market_trends.csv',
)

# Destination of the serialized model/encoder bundle written by train_model().
MODEL_OUTPUT_FILE = os.path.join(
    os.path.dirname(__file__),
    os.pardir,
    'models',
    'thunderbird_market_predictor_v1.joblib',
)
# --- MAIN SCRIPT ---
def train_model():
    """Train the Thunderbird market predictor and save it to disk.

    Reads the CSV at ``INPUT_FILE``, one-hot encodes the ``niche`` column and
    combines it with ``trend_score`` to form the feature matrix, then fits a
    ``GradientBoostingRegressor`` that predicts ``successful_campaigns``.
    The data is split 80/20 so the mean squared error on the held-out rows can
    be printed. Finally the fitted model and the fitted encoder are written to
    ``MODEL_OUTPUT_FILE`` as a single dict with the keys ``'model'`` and
    ``'encoder'``.

    Progress is reported with ``print``. A missing or empty input file,
    missing required columns, too few rows to split, and failures while saving
    are printed as errors and the function returns without raising.

    Returns:
        None.
    """
    print("--- Starting Project Thunderbird Model Training ---")

    # 1. Load Data
    try:
        df = pd.read_csv(INPUT_FILE)
    except FileNotFoundError:
        print(f"❌ Error: Training data file not found at '{INPUT_FILE}'.")
        print(" Please run `scripts/export_thunderbird_training_data.py` first.")
        return
    except pd.errors.EmptyDataError:
        # read_csv raises this when the file exists but has no columns to parse.
        print(f"❌ Error: Training data file at '{INPUT_FILE}' is empty.")
        return
    print(f"βœ… Successfully loaded training data from '{INPUT_FILE}'")

    # Fail with a readable message instead of a KeyError further down.
    required_columns = ('trend_score', 'niche', 'successful_campaigns')
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        print(f"❌ Error: Training data is missing required column(s): {missing_columns}")
        return

    # train_test_split cannot produce a non-empty train AND test set from
    # fewer than two rows.
    if len(df) < 2:
        print(f"❌ Error: Need at least 2 rows of training data, found {len(df)}.")
        return

    # 2. Prepare Data (Feature Engineering)
    print("\nπŸš€ Preparing data for the model...")
    # 'niche' is categorical and the model needs numbers, so it is one-hot
    # encoded. Categories unseen at fit time are ignored at prediction time.
    encoder = OneHotEncoder(handle_unknown='ignore')
    niche_encoded = encoder.fit_transform(df[['niche']]).toarray()
    # Reuse df's index so the column-wise concat below aligns row for row
    # regardless of how df is indexed.
    niche_df = pd.DataFrame(
        niche_encoded,
        columns=encoder.get_feature_names_out(['niche']),
        index=df.index,
    )
    # 'month' is deliberately not used: the features are the market trend
    # score plus the encoded niche.
    X = pd.concat([df[['trend_score']], niche_df], axis=1)
    # Target: the number of successful campaigns.
    y = df['successful_campaigns']
    print(f"βœ… Data prepared. Features: {X.columns.tolist()}")

    # 3. Split data for training and testing (fixed seed for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"\nπŸš€ Splitting data: {len(X_train)} rows for training, {len(X_test)} rows for testing.")

    # 4. Train the Model
    print("\n🧠 Training the Gradient Boosting Regressor model...")
    model = GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    )
    model.fit(X_train, y_train)
    print("βœ… Model training complete.")

    # 5. Evaluate the Model on the held-out rows
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f"\nπŸ“Š Model evaluation (Mean Squared Error): {mse:.2f}")
    print(" (Lower is better. A small number means the model's predictions are close to the real values).")

    # 6. Save the Trained Model and the Encoder
    print("\nπŸ’Ύ Saving the trained model and encoder...")
    # The encoder is stored next to the model because the same fitted encoder
    # is needed to transform 'niche' at prediction time.
    model_and_encoder = {
        'model': model,
        'encoder': encoder,
    }
    try:
        # The target directory may not exist yet (e.g. on a fresh checkout).
        output_dir = os.path.dirname(MODEL_OUTPUT_FILE)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        joblib.dump(model_and_encoder, MODEL_OUTPUT_FILE)
    except Exception as e:
        # Script-level boundary: report the failure instead of a traceback.
        print(f"\n❌ Error saving model file: {e}")
    else:
        print("βœ… Success! Model has been saved to:")
        print(f" {MODEL_OUTPUT_FILE}")
# Script entry point: run the training pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    train_model()