"""Train the Project Thunderbird market predictor and save it with joblib."""

import os

import joblib
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# --- CONFIGURATION ---
# Path to the training data we created in the previous step
INPUT_FILE = os.path.join(
    os.path.dirname(__file__), '..', 'data', 'thunderbird_market_trends.csv'
)
# Path to save the trained model
MODEL_OUTPUT_FILE = os.path.join(
    os.path.dirname(__file__), '..', 'models', 'thunderbird_market_predictor_v1.joblib'
)

# Columns read from the training CSV by train_model().
_REQUIRED_COLUMNS = ('trend_score', 'niche', 'successful_campaigns')


# --- MAIN SCRIPT ---
def train_model():
    """Train the market predictor from INPUT_FILE and save it to MODEL_OUTPUT_FILE.

    Steps:
      1. Load the CSV at ``INPUT_FILE``.
      2. One-hot encode ``niche`` and combine it with ``trend_score`` as features;
         ``successful_campaigns`` is the regression target.
      3. Hold out 20% of the rows for evaluation (fixed ``random_state``).
      4. Fit a ``GradientBoostingRegressor``.
      5. Print the mean squared error on the held-out rows.
      6. Dump ``{'model': ..., 'encoder': ...}`` to ``MODEL_OUTPUT_FILE``.

    Progress and problems are reported with ``print``. If the input file is
    missing, empty, lacks a required column, or has too few rows to split, the
    function prints an error and returns early without training.

    Returns:
        None.
    """
    print("--- Starting Project Thunderbird Model Training ---")

    # 1. Load Data
    try:
        df = pd.read_csv(INPUT_FILE)
    except FileNotFoundError:
        print(f"❌ Error: Training data file not found at '{INPUT_FILE}'.")
        print(" Please run `scripts/export_thunderbird_training_data.py` first.")
        return
    except pd.errors.EmptyDataError:
        print(f"❌ Error: Training data file '{INPUT_FILE}' is empty.")
        return
    else:
        print(f"✅ Successfully loaded training data from '{INPUT_FILE}'")

    # Fail with a readable message instead of a KeyError further down.
    missing_columns = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
    if missing_columns:
        print(f"❌ Error: Training data is missing required column(s): {missing_columns}")
        return

    # train_test_split needs at least one row on each side of the split.
    if len(df) < 2:
        print(f"❌ Error: Need at least 2 rows of training data, found {len(df)}.")
        return

    # 2. Prepare Data (Feature Engineering)
    print("\n🚀 Preparing data for the model...")
    # 'month' and 'niche' are categorical. The model needs numbers.
    # We will use one-hot encoding for the 'niche'.
    encoder = OneHotEncoder(handle_unknown='ignore')
    niche_encoded = encoder.fit_transform(df[['niche']]).toarray()
    # Create a new DataFrame with the encoded columns. Reusing df's index keeps
    # the rows aligned when the frames are concatenated column-wise below.
    niche_df = pd.DataFrame(
        niche_encoded,
        columns=encoder.get_feature_names_out(['niche']),
        index=df.index,
    )

    # We won't use 'month' directly, as the trend score already has the time component.
    # Our features are the market trend and the niche type.
    X = pd.concat([df[['trend_score']], niche_df], axis=1)
    # Our target is to predict how many successful campaigns there will be.
    y = df['successful_campaigns']
    print(f"✅ Data prepared. Features: {X.columns.tolist()}")

    # 3. Split data for training and testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"\n🚀 Splitting data: {len(X_train)} rows for training, {len(X_test)} rows for testing.")

    # 4. Train the Model
    print("\n🧠 Training the Gradient Boosting Regressor model...")
    # Gradient Boosting is a good choice for this kind of tabular data.
    model = GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    )
    model.fit(X_train, y_train)
    print("✅ Model training complete.")

    # 5. Evaluate the Model (optional, but good practice)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f"\n📊 Model evaluation (Mean Squared Error): {mse:.2f}")
    print(" (Lower is better. A small number means the model's predictions are close to the real values).")

    # 6. Save the Trained Model and the Encoder
    print("\n💾 Saving the trained model and encoder...")
    # We need to save BOTH the model AND the encoder, so we can use it for predictions later.
    model_and_encoder = {
        'model': model,
        'encoder': encoder,
    }
    try:
        # The output directory may not exist yet on a fresh checkout.
        os.makedirs(os.path.dirname(MODEL_OUTPUT_FILE), exist_ok=True)
        joblib.dump(model_and_encoder, MODEL_OUTPUT_FILE)
    except Exception as e:  # script boundary: report any save failure, don't crash
        print(f"\n❌ Error saving model file: {e}")
    else:
        print("✅ Success! Model has been saved to:")
        print(f" {MODEL_OUTPUT_FILE}")


if __name__ == "__main__":
    train_model()