"""Train the Project Thunderbird market-trend model and save it with joblib."""
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import joblib
# --- CONFIGURATION ---
# All paths are resolved relative to this script's directory so the
# script works regardless of the current working directory.
_SCRIPT_DIR = os.path.dirname(__file__)

# Training data produced by the export step.
INPUT_FILE = os.path.join(_SCRIPT_DIR, '..', 'data', 'thunderbird_market_trends.csv')
# Destination for the trained model artifact.
MODEL_OUTPUT_FILE = os.path.join(_SCRIPT_DIR, '..', 'models', 'thunderbird_market_predictor_v1.joblib')
# --- MAIN SCRIPT ---
def train_model():
    """Load the training data, fit the model, and persist it to disk.

    Reads the CSV at ``INPUT_FILE``, one-hot encodes the categorical
    ``niche`` column, trains a GradientBoostingRegressor on
    ``trend_score`` plus the encoded niche to predict
    ``successful_campaigns``, reports test-set MSE, and saves BOTH the
    fitted model and the fitted encoder to ``MODEL_OUTPUT_FILE``.

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    # NOTE(review): the status-emoji prefixes below reconstruct mojibake
    # in the original source — confirm the intended glyphs.
    print("--- Starting Project Thunderbird Model Training ---")

    # 1. Load data. Missing file is an expected condition (export step
    #    not run yet), so handle it with a message instead of a traceback.
    try:
        df = pd.read_csv(INPUT_FILE)
        print(f"✅ Successfully loaded training data from '{INPUT_FILE}'")
    except FileNotFoundError:
        print(f"❌ Error: Training data file not found at '{INPUT_FILE}'.")
        print("   Please run `scripts/export_thunderbird_training_data.py` first.")
        return

    # 2. Prepare data (feature engineering).
    print("\n📊 Preparing data for the model...")
    # 'niche' is categorical; one-hot encode it. handle_unknown='ignore'
    # lets prediction-time inputs contain niches unseen during training.
    encoder = OneHotEncoder(handle_unknown='ignore')
    niche_encoded = encoder.fit_transform(df[['niche']]).toarray()
    # Fix: carry df's index explicitly so the concat below aligns rows
    # even if the CSV load ever yields a non-default index.
    niche_df = pd.DataFrame(
        niche_encoded,
        columns=encoder.get_feature_names_out(['niche']),
        index=df.index,
    )
    # 'month' is intentionally unused: the trend score already carries
    # the time component.
    X = pd.concat([df[['trend_score']], niche_df], axis=1)
    # Target: number of successful campaigns.
    y = df['successful_campaigns']
    print(f"✅ Data prepared. Features: {X.columns.tolist()}")

    # 3. Hold out 20% of rows for evaluation (fixed seed for reproducibility).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"\n🔀 Splitting data: {len(X_train)} rows for training, {len(X_test)} rows for testing.")

    # 4. Train the model. Gradient boosting suits small tabular data.
    print("\n🧠 Training the Gradient Boosting Regressor model...")
    model = GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    )
    model.fit(X_train, y_train)
    print("✅ Model training complete.")

    # 5. Evaluate on the held-out split.
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f"\n📈 Model evaluation (Mean Squared Error): {mse:.2f}")
    print("   (Lower is better. A small number means the model's predictions are close to the real values).")

    # 6. Save the trained model AND the encoder together, so prediction
    #    code can re-encode niches exactly as during training.
    print("\n💾 Saving the trained model and encoder...")
    try:
        # Fix: ensure the output directory exists — joblib.dump raises
        # FileNotFoundError on a fresh checkout without 'models/'.
        os.makedirs(os.path.dirname(MODEL_OUTPUT_FILE), exist_ok=True)
        model_and_encoder = {
            'model': model,
            'encoder': encoder,
        }
        joblib.dump(model_and_encoder, MODEL_OUTPUT_FILE)
        print("✅ Success! Model has been saved to:")
        print(f"   {MODEL_OUTPUT_FILE}")
    except Exception as e:
        # Best-effort save: report and exit rather than crash.
        print(f"\n❌ Error saving model file: {e}")
# Run the training pipeline only when executed as a script.
if __name__ == "__main__":
    train_model()