import os
import sys

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Training script: fits a RandomForestRegressor on the sample performance
# data, reports its R^2 score on a held-out split, and saves the fitted model
# with joblib. Exits with status 1 (diagnostics on stderr) when the training
# data is missing, empty, or lacks a required column.
print("Starting model training...")

# --- Paths -------------------------------------------------------------------
# Both locations are resolved relative to the directory containing this script.
_SCRIPT_DIR = os.path.dirname(__file__)
DATA_PATH = os.path.join(_SCRIPT_DIR, '..', 'data', 'sample_performance_training_data.csv')
MODEL_PATH = os.path.join(_SCRIPT_DIR, '..', 'models', 'performance_scorer_v1.joblib')

# Create the output directory up front so joblib.dump() cannot fail on a
# missing folder after training has already been done.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# --- Load data ---------------------------------------------------------------
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"ERROR: Training data file not found at {DATA_PATH}", file=sys.stderr)
    print(
        "Please ensure 'sample_performance_training_data.csv' exists in the 'ai-service/data' directory.",
        file=sys.stderr,
    )
    # Non-zero status so callers (shell scripts, CI) can detect the failure;
    # sys.exit is used because the bare exit() helper is only injected by `site`.
    sys.exit(1)
except pd.errors.EmptyDataError:
    print(f"ERROR: Training data file at {DATA_PATH} is empty.", file=sys.stderr)
    sys.exit(1)

# --- Select features and target ----------------------------------------------
features = ['avg_engagement_rate', 'on_time_submission_rate', 'avg_brand_rating', 'monthly_earnings']
target = 'performance_score'

# Fail with a readable message instead of a bare KeyError when the CSV does
# not contain every column the model needs.
missing_columns = [column for column in features + [target] if column not in df.columns]
if missing_columns:
    print(
        f"ERROR: Training data at {DATA_PATH} is missing required column(s): "
        f"{', '.join(missing_columns)}",
        file=sys.stderr,
    )
    sys.exit(1)

X = df[features]
y = df[target]

# --- Train / test split ------------------------------------------------------
# 80% of the rows are used for training, 20% are held out for evaluation;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Train -------------------------------------------------------------------
# 100 trees; the fixed random_state makes the fitted forest reproducible.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# --- Evaluate ----------------------------------------------------------------
y_pred = model.predict(X_test)
# NOTE: despite its name, this is the R^2 (coefficient of determination) on
# the held-out split, not a classification accuracy. The name is kept so that
# anything reading this module-level variable keeps working.
accuracy = r2_score(y_test, y_pred)
print(f"Model trained successfully! R^2 Score: {accuracy:.2f}")

# --- Persist -----------------------------------------------------------------
joblib.dump(model, MODEL_PATH)

print(f"Model saved to {MODEL_PATH}")