# Match-Prediction / model_training.py
# ...existing code...
import joblib, json
from pathlib import Path
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# ...existing code...
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
try:
    from ingest import metrics_report  # type: ignore
except Exception:
    try:
        from preprocess import metrics_report  # type: ignore
    except Exception:
        # Minimal fallback implementation returning a dict compatible with the
        # existing summary DataFrame usage
        from sklearn.metrics import log_loss, accuracy_score, brier_score_loss, roc_auc_score

        def metrics_report(y_true, y_proba, name="model"):
            """
            Minimal metrics report compatible with the rest of the script.
            Returns a dict with at least the key "model" so it can be placed
            into the summary DataFrame.
            """
            y_true = np.asarray(y_true)
            proba = np.asarray(y_proba)
            # Handle binary vs multiclass probabilities
            try:
                if proba.ndim == 1 or proba.shape[1] == 1:
                    # Binary: a single probability column for the positive class
                    proba_flat = proba.ravel()
                    y_pred = (proba_flat >= 0.5).astype(int)
                    ll = float(log_loss(y_true, proba_flat)) if y_true.size and proba_flat.size else float("nan")
                    try:
                        auc = float(roc_auc_score(y_true, proba_flat))
                    except Exception:
                        auc = float("nan")
                    try:
                        brier = float(brier_score_loss(y_true, proba_flat))
                    except Exception:
                        brier = float("nan")
                else:
                    # Multiclass: one probability column per class
                    y_pred = proba.argmax(axis=1)
                    try:
                        ll = float(log_loss(y_true, proba))
                    except Exception:
                        ll = float("nan")
                    try:
                        auc = float(roc_auc_score(y_true, proba, multi_class="ovr"))
                    except Exception:
                        auc = float("nan")
                    # Multiclass Brier score: mean squared error against one-hot targets
                    try:
                        n_classes = proba.shape[1]
                        one_hot = np.eye(n_classes)[y_true]
                        brier = float(np.mean(np.sum((proba - one_hot) ** 2, axis=1)))
                    except Exception:
                        brier = float("nan")
            except Exception:
                y_pred = np.zeros_like(y_true)
                ll = auc = brier = float("nan")
            acc = float(accuracy_score(y_true, y_pred)) if y_true.size else float("nan")
            return {
                "model": name,
                "accuracy": acc,
                "log_loss": ll,
                "roc_auc": auc,
                "brier": brier,
            }
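# Quick self-check for the fallback (sketch with made-up labels): perfectly
# confident, correct predictions should give accuracy 1.0 and near-zero log loss:
#   metrics_report([0, 1, 2], np.eye(3)[[0, 1, 2]], "sanity")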
def load_processed_data(target_col="target", data_dir=Path("data/processed")):
    """
    Try multiple ways to obtain a processed dataframe:
      1) call a load_processed_data / load_data function from the local ingest or preprocess modules
      2) look for common filenames under data/processed (parquet/csv)
    Returns: df (pd.DataFrame)
    """
    # 1) Try local modules
    try:
        from ingest import load_processed_data as _lp  # type: ignore
        df = _lp()
        if isinstance(df, pd.DataFrame):
            return df
    except Exception:
        pass
    try:
        from preprocess import load_processed_data as _lp2  # type: ignore
        df = _lp2()
        if isinstance(df, pd.DataFrame):
            return df
    except Exception:
        pass
    # 2) Look for common files
    candidates = [
        data_dir / "processed.parquet",
        data_dir / "dataset.parquet",
        data_dir / "processed.csv",
        data_dir / "dataset.csv",
        data_dir / "train.parquet",
        data_dir / "train.csv",
    ]
    for fp in candidates:
        if fp.exists():
            if fp.suffix == ".parquet":
                return pd.read_parquet(fp)
            return pd.read_csv(fp)
    raise FileNotFoundError(
        f"No processed data found. Checked modules ingest/preprocess and files under {data_dir}. "
        "Add a processed dataset or expose load_processed_data() in ingest/preprocess."
    )
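# Usage note (sketch): the loader can also be pointed at another directory,
# e.g. load_processed_data(data_dir=Path("data/alt")); the path is illustrative.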
# Load data and build train/valid/test splits
df = load_processed_data()

# Infer the target column
TARGET = None
for candidate in ("target", "label", "y", "outcome"):
    if candidate in df.columns:
        TARGET = candidate
        break
if TARGET is None:
    raise KeyError("No target column found. Expected one of: target, label, y, outcome")
# If the dataset already includes a 'split' column with values 'train'/'valid'/'test', use it
if "split" in df.columns:
    train_df = df[df["split"] == "train"].drop(columns=["split"])
    valid_df = df[df["split"] == "valid"].drop(columns=["split"])
    test_df = df[df["split"] == "test"].drop(columns=["split"])
else:
    # Create stratified splits: train/valid/test = 64%/16%/20%
    # (0.20 of the remaining 80% gives the 16% validation share)
    train_val, test_df = train_test_split(df, test_size=0.20, stratify=df[TARGET], random_state=42)
    train_df, valid_df = train_test_split(train_val, test_size=0.20, stratify=train_val[TARGET], random_state=42)

# Exclude the target and the split marker (already dropped from the split
# frames above) from the feature columns
X_cols = [c for c in df.columns if c not in (TARGET, "split")]
WINDOW = int(df.attrs.get("WINDOW", 1)) if hasattr(df, "attrs") else 1

X_train = train_df[X_cols]
y_train = train_df[TARGET]
X_valid = valid_df[X_cols]
y_valid = valid_df[TARGET]
X_test = test_df[X_cols]
y_test = test_df[TARGET]
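# Sanity check (assumption): the XGBoost/LightGBM configs below expect integer
# class labels 0..2 (num_class=3, multiclass objectives); remap the target
# first if it is encoded differently.
assert set(np.unique(y_train)) <= {0, 1, 2}, "expected integer class labels 0..2"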
# ...existing code...
# -----------------------------
# 7) XGBoost + Calibration
# -----------------------------
xgb = XGBClassifier(
    n_estimators=6000,
    max_depth=50,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multi:softprob",
    num_class=3,
    reg_lambda=1.0,
    random_state=42,
    tree_method="hist",
)
xgb.fit(X_train, y_train)
proba_xgb_valid = xgb.predict_proba(X_valid)
proba_xgb_test = xgb.predict_proba(X_test)
m_xgb_valid = metrics_report(y_valid, proba_xgb_valid, "xgb_valid")
m_xgb_test = metrics_report(y_test, proba_xgb_test, "xgb_test")
# Isotonic calibration on the validation split (cv="prefit" reuses the already
# fitted model; newer scikit-learn releases deprecate "prefit" in favour of
# wrapping the estimator in FrozenEstimator)
cal_xgb = CalibratedClassifierCV(xgb, method="isotonic", cv="prefit")
cal_xgb.fit(X_valid, y_valid)
proba_xgb_cal_valid = cal_xgb.predict_proba(X_valid)
proba_xgb_cal_test = cal_xgb.predict_proba(X_test)
m_xgb_cal_valid = metrics_report(y_valid, proba_xgb_cal_valid, "xgb_cal_valid")
m_xgb_cal_test = metrics_report(y_test, proba_xgb_cal_test, "xgb_cal_test")
# Platt (optional)
cal_xgb_platt = CalibratedClassifierCV(xgb, method="sigmoid", cv="prefit")
cal_xgb_platt.fit(X_valid, y_valid)
proba_xgb_platt_test = cal_xgb_platt.predict_proba(X_test)
m_xgb_platt_test = metrics_report(y_test, proba_xgb_platt_test, "xgb_platt_test")
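# Note: isotonic calibration is non-parametric and can overfit a small
# validation split; Platt (sigmoid) scaling fits a single logistic curve per
# class and is the safer choice when validation data is scarce.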
# -----------------------------
# 8) LightGBM + Calibration
# -----------------------------
lgbm = LGBMClassifier(
    n_estimators=12000,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multiclass",
    class_weight=None,
    random_state=42,
)
lgbm.fit(X_train, y_train)
proba_lgb_valid = lgbm.predict_proba(X_valid)
proba_lgb_test = lgbm.predict_proba(X_test)
m_lgb_valid = metrics_report(y_valid, proba_lgb_valid, "lgb_valid")
m_lgb_test = metrics_report(y_test, proba_lgb_test, "lgb_test")
cal_lgb = CalibratedClassifierCV(lgbm, method="isotonic", cv="prefit")
cal_lgb.fit(X_valid, y_valid)
proba_lgb_cal_valid = cal_lgb.predict_proba(X_valid)
proba_lgb_cal_test = cal_lgb.predict_proba(X_test)
m_lgb_cal_valid = metrics_report(y_valid, proba_lgb_cal_valid, "lgb_cal_valid")
m_lgb_cal_test = metrics_report(y_test, proba_lgb_cal_test, "lgb_cal_test")
from sklearn.base import BaseEstimator, ClassifierMixin


class PriorProbaPredictor(BaseEstimator, ClassifierMixin):
    """Predict class probabilities equal to the class distribution in the training data."""

    def fit(self, X, y):
        y = np.asarray(y)
        classes, counts = np.unique(y, return_counts=True)
        self.classes_ = classes
        self.class_proba_ = counts / counts.sum()
        return self

    def predict_proba(self, X):
        n = len(X)
        # Return an array of shape (n_samples, n_classes) following classes_ order
        return np.tile(self.class_proba_, (n, 1))

    def predict(self, X):
        proba = self.predict_proba(X)
        # Map the argmax column index back to the actual class label
        return self.classes_[proba.argmax(axis=1)]
odds = PriorProbaPredictor()
odds.fit(X_train, y_train)
proba_odds_valid = odds.predict_proba(X_valid)
proba_odds_test = odds.predict_proba(X_test)
m_odds_valid = metrics_report(y_valid, proba_odds_valid, "odds_valid")
m_odds_test = metrics_report(y_test, proba_odds_test, "odds_test")
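# The prior baseline ignores the features entirely; its log loss is roughly the
# entropy of the class distribution, a floor any informative model should beat.
print("class prior entropy:", float(-np.sum(odds.class_proba_ * np.log(odds.class_proba_))))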
# ...existing code...
# -----------------------------
# 9) Summary table
# -----------------------------
summary = pd.DataFrame([
    m_odds_valid, m_odds_test,
    m_xgb_valid, m_xgb_test, m_xgb_cal_valid, m_xgb_cal_test, m_xgb_platt_test,
    m_lgb_valid, m_lgb_test, m_lgb_cal_valid, m_lgb_cal_test,
]).sort_values("model").reset_index(drop=True)

try:
    from IPython.display import display as _display
    _display(summary)
except Exception:
    print(summary.to_string(index=False))
# Optional: save metrics (make sure the output directory exists first)
Path("evaluation").mkdir(exist_ok=True)
summary.to_csv("./evaluation/baseline_metrics.csv", index=False)
print("Saved: evaluation/baseline_metrics.csv")
Path(".").mkdir(exist_ok=True)
# เลือกโมเดลที่ต้องการใช้ inference (แนะนำตัวที่ calibrated แล้ว)
joblib.dump(cal_xgb, "./model/model_xgb_isotonic.joblib")
joblib.dump(cal_lgb, "./model/model_lgb_isotonic.joblib")
# Store the feature columns and key parameters for inference
with open("feature_columns.json", "w", encoding="utf-8") as f:
    json.dump({"X_cols": X_cols, "WINDOW": int(WINDOW)}, f, ensure_ascii=False, indent=2)
print("Saved: model_xgb_isotonic.joblib, model_lgb_isotonic.joblib, feature_columns.json")