|
|
import os |
|
|
import pandas as pd |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.ensemble import GradientBoostingRegressor |
|
|
from sklearn.metrics import mean_squared_error |
|
|
from sklearn.preprocessing import OneHotEncoder |
|
|
import joblib |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Directory containing this script; both paths below are resolved relative
# to it (one level up, then into `data/` or `models/`).
_SCRIPT_DIR = os.path.dirname(__file__)

# CSV with the training rows read by `train_model`.
INPUT_FILE = os.path.join(_SCRIPT_DIR, '..', 'data', 'thunderbird_market_trends.csv')

# Destination of the serialized model bundle written by `train_model`.
MODEL_OUTPUT_FILE = os.path.join(_SCRIPT_DIR, '..', 'models', 'thunderbird_market_predictor_v1.joblib')
|
|
|
|
|
|
|
|
|
|
|
def train_model():
    """Train the Project Thunderbird market predictor and save it to disk.

    Reads the CSV at ``INPUT_FILE``, one-hot encodes its ``niche`` column,
    joins the result with ``trend_score`` to form the feature matrix, and
    fits a ``GradientBoostingRegressor`` to predict ``successful_campaigns``.
    The fitted model is scored with mean squared error on a held-out 20%
    split, then written together with the fitted encoder to
    ``MODEL_OUTPUT_FILE`` as a dict with the keys ``'model'`` and
    ``'encoder'``.

    Progress and problems are reported with ``print``. The function returns
    ``None`` in every case; it returns early when the input file is missing
    or lacks one of the required columns, and it reports (rather than
    raises) any failure while saving.
    """
    # Status glyphs are spelled as escapes so the source stays pure ASCII and
    # cannot be mis-decoded again.
    # NOTE(review): the glyphs were reconstructed from mis-decoded bytes;
    # three progress lines (preparing / splitting / evaluation) had lost
    # their glyph entirely and are printed without one -- confirm against
    # the original file if the exact symbols matter.
    ok = "\u2705"          # white heavy check mark
    fail = "\u274c"        # cross mark
    brain = "\U0001F9E0"   # brain
    disk = "\U0001F4BE"    # floppy disk

    print("--- Starting Project Thunderbird Model Training ---")

    # --- Load -------------------------------------------------------------
    try:
        df = pd.read_csv(INPUT_FILE)
    except FileNotFoundError:
        print(f"{fail} Error: Training data file not found at '{INPUT_FILE}'.")
        print(" Please run `scripts/export_thunderbird_training_data.py` first.")
        return
    print(f"{ok} Successfully loaded training data from '{INPUT_FILE}'")

    # Fail with a readable message instead of a bare KeyError further down.
    required_columns = ['niche', 'trend_score', 'successful_campaigns']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"{fail} Error: Training data is missing required column(s): {missing_columns}")
        return

    # --- Prepare features -------------------------------------------------
    print("\nPreparing data for the model...")

    # Categories not seen during fitting are encoded as all zeros at
    # prediction time instead of raising.
    encoder = OneHotEncoder(handle_unknown='ignore')
    niche_encoded = encoder.fit_transform(df[['niche']]).toarray()

    # Reuse df's index so the column-wise concat below aligns row for row
    # even if the frame's index is not the default 0..n-1 range.
    niche_df = pd.DataFrame(
        niche_encoded,
        columns=encoder.get_feature_names_out(['niche']),
        index=df.index,
    )

    X = pd.concat([df[['trend_score']], niche_df], axis=1)
    y = df['successful_campaigns']
    print(f"{ok} Data prepared. Features: {X.columns.tolist()}")

    # --- Split ------------------------------------------------------------
    # Fixed random_state keeps the split (and the reported MSE) reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"\nSplitting data: {len(X_train)} rows for training, {len(X_test)} rows for testing.")

    # --- Train ------------------------------------------------------------
    print(f"\n{brain} Training the Gradient Boosting Regressor model...")
    model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    model.fit(X_train, y_train)
    print(f"{ok} Model training complete.")

    # --- Evaluate ---------------------------------------------------------
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f"\nModel evaluation (Mean Squared Error): {mse:.2f}")
    print(" (Lower is better. A small number means the model's predictions are close to the real values).")

    # --- Save -------------------------------------------------------------
    print(f"\n{disk} Saving the trained model and encoder...")
    try:
        # joblib.dump does not create parent directories, so make sure the
        # output directory exists first.
        output_dir = os.path.dirname(MODEL_OUTPUT_FILE)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # The encoder is stored next to the model because predictions need
        # the exact same niche -> column mapping used during training.
        model_and_encoder = {
            'model': model,
            'encoder': encoder,
        }
        joblib.dump(model_and_encoder, MODEL_OUTPUT_FILE)
    except Exception as e:
        # Script-level boundary: report the failure instead of crashing
        # after the (potentially slow) training has already finished.
        print(f"\n{fail} Error saving model file: {e}")
    else:
        print(f"{ok} Success! Model has been saved to:")
        print(f" {MODEL_OUTPUT_FILE}")
|
|
|
|
|
|
|
|
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    train_model()