# File: ai-service/training/train_earning_optimizer.py (FINAL SIMPLIFIED VERSION)
"""Train the earning-optimizer regressor and persist it with its encoder.

Reads ``data/earnings_training_data.csv`` under the service root, derives a
"smart performance score" target from engagement rate and payment-per-follower,
one-hot encodes the categorical columns by hand (no sklearn ``Pipeline``), fits
an XGBoost regressor, and writes both the encoder and the model to ``models/``.
"""
import os
import sys

import joblib
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder

ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(ROOT_DIR)

# Columns the training routine reads from the CSV.
_CATEGORICAL_FEATURES = ['campaign_niche', 'content_format']
_REQUIRED_COLUMNS = (
    'payment_amount',
    'follower_count',
    'engagement_rate',
    *_CATEGORICAL_FEATURES,
)


def _min_max_normalize(series):
    """Scale *series* linearly onto [0, 1]; a constant series maps to 0.5."""
    low = series.min()
    span = series.max() - low
    if span == 0:
        # Every value is identical, so (x - min) / (max - min) would be 0 / 0.
        return pd.Series(0.5, index=series.index)
    return (series - low) / span


def _add_smart_performance_score(df):
    """Return a copy of *df* with the score columns added and bad rows removed.

    Adds ``roi`` (payment per follower), ``norm_engagement``, ``norm_roi`` and
    ``smart_performance_score`` (0.6 * engagement + 0.4 * ROI, both min-max
    normalized). Rows whose ``engagement_rate`` or ``roi`` is missing or
    infinite are dropped, and the index is reset, because such rows would
    produce NaN/inf training labels.
    """
    scored = df.copy()

    # A follower count of zero (or below) would make the ratio inf/NaN and
    # corrupt the min/max used for normalization, so mask it out first.
    followers = scored['follower_count'].where(scored['follower_count'] > 0)
    scored['roi'] = scored['payment_amount'] / followers

    score_inputs = scored[['engagement_rate', 'roi']].replace(
        [float('inf'), float('-inf')], float('nan')
    )
    usable = score_inputs.notna().all(axis=1)
    dropped = int((~usable).sum())
    if dropped:
        print(f"⚠️ Dropping {dropped} row(s) with missing or invalid score inputs.")
    scored = scored.loc[usable].reset_index(drop=True)

    scored['norm_engagement'] = _min_max_normalize(scored['engagement_rate'])
    scored['norm_roi'] = _min_max_normalize(scored['roi'])
    scored['smart_performance_score'] = (
        0.6 * scored['norm_engagement'] + 0.4 * scored['norm_roi']
    )
    return scored


def train_earning_optimizer():
    """Train the earning-optimizer model from the CSV and save the artifacts.

    Side effects: prints progress to stdout and writes
    ``models/earnings_encoder.joblib`` and ``models/earnings_model.joblib``
    under ``ROOT_DIR`` (creating ``models/`` if needed).

    Returns:
        None. The function returns early, after printing a message, when the
        data file is missing or empty, lacks a required column, or contains no
        rows usable for training.
    """
    print("--- Starting Earning Optimizer Model Training (Simplified) ---")
    data_path = os.path.join(ROOT_DIR, 'data', 'earnings_training_data.csv')

    try:
        df = pd.read_csv(data_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print(f"🔴 ERROR: Data file not found or is empty at {data_path}")
        return

    if df.empty:
        print("⚠️ CSV file is empty. Aborting.")
        return

    missing = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
    if missing:
        print(f"🔴 ERROR: Data file is missing required column(s): {', '.join(missing)}")
        return

    print("Creating 'Smart Performance Score'...")
    df = _add_smart_performance_score(df)
    if df.empty:
        print("⚠️ No usable rows left after cleaning. Aborting.")
        return

    print("Preparing data MANUALLY without Pipeline...")
    # 1. Manually encode categorical features. handle_unknown='ignore' makes
    #    the fitted encoder emit all-zero columns for unseen categories.
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded_cats = encoder.fit_transform(df[_CATEGORICAL_FEATURES])
    encoded_df = pd.DataFrame(
        encoded_cats,
        columns=encoder.get_feature_names_out(_CATEGORICAL_FEATURES),
    )

    # 2. Combine with numerical features. Both frames carry a fresh 0..n-1
    #    index so the column-wise concat lines rows up positionally.
    numerical_features = df[['follower_count']].reset_index(drop=True)
    X_final = pd.concat([encoded_df, numerical_features], axis=1)
    y = df['smart_performance_score']

    # joblib.dump does not create parent directories, so make sure it exists.
    models_dir = os.path.join(ROOT_DIR, 'models')
    os.makedirs(models_dir, exist_ok=True)

    # Save the encoder along with the model so inference can reuse it.
    joblib.dump(encoder, os.path.join(models_dir, 'earnings_encoder.joblib'))
    print("--- Encoder saved successfully! ---")

    # Train the model DIRECTLY on the prepared data.
    print("Training the XGBoost model...")
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=100,
        random_state=42,
    )
    model.fit(X_final, y)
    print("--- Model training complete! ---")

    # Save the simple model (not the pipeline).
    model_path = os.path.join(models_dir, 'earnings_model.joblib')
    joblib.dump(model, model_path)
    print(f"--- SIMPLE Model saved successfully to {model_path} ---")


if __name__ == '__main__':
    train_earning_optimizer()