Upload 2 files

- model_training.py +288 -0
- preprocess_data.py +196 -0
model_training.py
ADDED
@@ -0,0 +1,288 @@
# ...existing code...
import joblib, json
from pathlib import Path
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# ...existing code...
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV

try:
    from ingest import metrics_report  # type: ignore
except Exception:
    try:
        from preprocess import metrics_report  # type: ignore
    except Exception:
        # Minimal fallback implementation returning a dict compatible with existing summary DataFrame usage
        from sklearn.metrics import log_loss, accuracy_score, brier_score_loss, roc_auc_score

        def metrics_report(y_true, y_proba, name="model"):
            """
            Minimal metrics report compatible with the rest of the script.
            Returns a dict with at least the key "model" so it can be placed into the summary DataFrame.
            """
            y_true = np.asarray(y_true)
            proba = np.asarray(y_proba)

            # handle binary vs multiclass probabilities
            try:
                if proba.ndim == 1 or proba.shape[1] == 1:
                    # binary
                    proba_flat = proba.ravel()
                    y_pred = (proba_flat >= 0.5).astype(int)
                    ll = float(log_loss(y_true, proba_flat)) if y_true.size and proba_flat.size else float("nan")
                    try:
                        auc = float(roc_auc_score(y_true, proba_flat))
                    except Exception:
                        auc = float("nan")
                    try:
                        brier = float(brier_score_loss(y_true, proba_flat))
                    except Exception:
                        brier = float("nan")
                else:
                    # multiclass
                    y_pred = proba.argmax(axis=1)
                    try:
                        ll = float(log_loss(y_true, proba))
                    except Exception:
                        ll = float("nan")
                    try:
                        auc = float(roc_auc_score(y_true, proba, multi_class="ovr"))
                    except Exception:
                        auc = float("nan")
                    # multiclass Brier as mean squared error against one-hot
                    try:
                        n_classes = proba.shape[1]
                        one_hot = np.eye(n_classes)[y_true]
                        brier = float(np.mean(np.sum((proba - one_hot) ** 2, axis=1)))
                    except Exception:
                        brier = float("nan")
            except Exception:
                y_pred = np.zeros_like(y_true)
                ll = auc = brier = float("nan")

            acc = float(accuracy_score(y_true, y_pred)) if y_true.size else float("nan")

            return {
                "model": name,
                "accuracy": acc,
                "log_loss": ll,
                "roc_auc": auc,
                "brier": brier,
            }

def load_processed_data(target_col="target", data_dir=Path("data/processed")):
    """
    Try multiple ways to obtain a processed dataframe:
    1) call a load_processed_data / load_data function from local ingest or preprocess modules
    2) look for common filenames under data/processed (parquet/csv)
    Returns: df (pd.DataFrame)
    """
    # 1) try local modules
    try:
        from ingest import load_processed_data as _lp  # type: ignore
        df = _lp()
        if isinstance(df, pd.DataFrame):
            return df
    except Exception:
        pass

    try:
        from preprocess import load_processed_data as _lp2  # type: ignore
        df = _lp2()
        if isinstance(df, pd.DataFrame):
            return df
    except Exception:
        pass

    # 2) look for common files
    candidates = [
        data_dir / "processed.parquet",
        data_dir / "dataset.parquet",
        data_dir / "processed.csv",
        data_dir / "dataset.csv",
        data_dir / "train.parquet",
        data_dir / "train.csv",
    ]
    for fp in candidates:
        if fp.exists():
            if fp.suffix == ".parquet":
                return pd.read_parquet(fp)
            else:
                return pd.read_csv(fp)

    raise FileNotFoundError(
        f"No processed data found. Checked modules ingest/preprocess and files under {data_dir}. "
        "Add a processed dataset or expose load_processed_data() in ingest/preprocess."
    )

# Load data and build train/valid/test splits
df = load_processed_data()

# infer target column
TARGET = None
for candidate in ("target", "label", "y", "outcome"):
    if candidate in df.columns:
        TARGET = candidate
        break
if TARGET is None:
    raise KeyError("No target column found. Expected one of: target, label, y, outcome")

# If the dataset already includes a 'split' column with values 'train'/'valid'/'test', use it
if "split" in df.columns:
    train_df = df[df["split"] == "train"].drop(columns=["split"])
    valid_df = df[df["split"] == "valid"].drop(columns=["split"])
    test_df = df[df["split"] == "test"].drop(columns=["split"])
else:
    # create splits: train/valid/test = 64%/16%/20% (approx)
    train_val, test_df = train_test_split(df, test_size=0.20, stratify=df[TARGET], random_state=42)
    train_df, valid_df = train_test_split(train_val, test_size=0.20, stratify=train_val[TARGET], random_state=42)

# exclude the target (and the split marker, if present, since it was dropped above)
X_cols = [c for c in df.columns if c not in (TARGET, "split")]
WINDOW = int(df.attrs.get("WINDOW", 1)) if hasattr(df, "attrs") else 1

X_train = train_df[X_cols]
y_train = train_df[TARGET]
X_valid = valid_df[X_cols]
y_valid = valid_df[TARGET]
X_test = test_df[X_cols]
y_test = test_df[TARGET]

+
# ...existing code...
|
| 154 |
+
xgb = XGBClassifier(
|
| 155 |
+
n_estimators=6000,
|
| 156 |
+
max_depth=50,
|
| 157 |
+
learning_rate=0.05,
|
| 158 |
+
subsample=0.9,
|
| 159 |
+
colsample_bytree=0.9,
|
| 160 |
+
objective="multi:softprob",
|
| 161 |
+
num_class=3,
|
| 162 |
+
reg_lambda=1.0,
|
| 163 |
+
random_state=42,
|
| 164 |
+
tree_method="hist"
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
xgb = XGBClassifier(
|
| 169 |
+
n_estimators=6000,
|
| 170 |
+
max_depth=50,
|
| 171 |
+
learning_rate=0.05,
|
| 172 |
+
subsample=0.9,
|
| 173 |
+
colsample_bytree=0.9,
|
| 174 |
+
objective="multi:softprob",
|
| 175 |
+
num_class=3,
|
| 176 |
+
reg_lambda=1.0,
|
| 177 |
+
random_state=42,
|
| 178 |
+
tree_method="hist"
|
| 179 |
+
)
|
xgb.fit(X_train, y_train)
proba_xgb_valid = xgb.predict_proba(X_valid)
proba_xgb_test = xgb.predict_proba(X_test)

m_xgb_valid = metrics_report(y_valid, proba_xgb_valid, "xgb_valid")
m_xgb_test = metrics_report(y_test, proba_xgb_test, "xgb_test")

# Isotonic calibration
cal_xgb = CalibratedClassifierCV(xgb, method="isotonic", cv="prefit")
cal_xgb.fit(X_valid, y_valid)
proba_xgb_cal_valid = cal_xgb.predict_proba(X_valid)
proba_xgb_cal_test = cal_xgb.predict_proba(X_test)

m_xgb_cal_valid = metrics_report(y_valid, proba_xgb_cal_valid, "xgb_cal_valid")
m_xgb_cal_test = metrics_report(y_test, proba_xgb_cal_test, "xgb_cal_test")

# Platt scaling (optional)
cal_xgb_platt = CalibratedClassifierCV(xgb, method="sigmoid", cv="prefit")
cal_xgb_platt.fit(X_valid, y_valid)
proba_xgb_platt_test = cal_xgb_platt.predict_proba(X_test)
m_xgb_platt_test = metrics_report(y_test, proba_xgb_platt_test, "xgb_platt_test")

# -----------------------------
# 8) LightGBM + Calibration
# -----------------------------
lgbm = LGBMClassifier(
    n_estimators=12000,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multiclass",
    class_weight=None,
    random_state=42
)
lgbm.fit(X_train, y_train)
proba_lgb_valid = lgbm.predict_proba(X_valid)
proba_lgb_test = lgbm.predict_proba(X_test)

m_lgb_valid = metrics_report(y_valid, proba_lgb_valid, "lgb_valid")
m_lgb_test = metrics_report(y_test, proba_lgb_test, "lgb_test")

cal_lgb = CalibratedClassifierCV(lgbm, method="isotonic", cv="prefit")
cal_lgb.fit(X_valid, y_valid)
proba_lgb_cal_valid = cal_lgb.predict_proba(X_valid)
proba_lgb_cal_test = cal_lgb.predict_proba(X_test)

m_lgb_cal_valid = metrics_report(y_valid, proba_lgb_cal_valid, "lgb_cal_valid")
m_lgb_cal_test = metrics_report(y_test, proba_lgb_cal_test, "lgb_cal_test")

from sklearn.base import BaseEstimator, ClassifierMixin

class PriorProbaPredictor(BaseEstimator, ClassifierMixin):
    """Predict class probabilities equal to the class distribution in training data."""
    def fit(self, X, y):
        y = np.asarray(y)
        classes, counts = np.unique(y, return_counts=True)
        self.classes_ = classes
        self.class_proba_ = counts / counts.sum()
        return self

    def predict_proba(self, X):
        n = len(X)
        # return array of shape (n_samples, n_classes) following classes_ order
        return np.tile(self.class_proba_, (n, 1))

    def predict(self, X):
        proba = self.predict_proba(X)
        return proba.argmax(axis=1)

odds = PriorProbaPredictor()
odds.fit(X_train, y_train)
proba_odds_valid = odds.predict_proba(X_valid)
proba_odds_test = odds.predict_proba(X_test)

m_odds_valid = metrics_report(y_valid, proba_odds_valid, "odds_valid")
m_odds_test = metrics_report(y_test, proba_odds_test, "odds_test")
# ...existing code...
# -----------------------------
# 9) Summary table
# -----------------------------
summary = pd.DataFrame([
    m_odds_valid, m_odds_test,
    m_xgb_valid, m_xgb_test, m_xgb_cal_valid, m_xgb_cal_test, m_xgb_platt_test,
    m_lgb_valid, m_lgb_test, m_lgb_cal_valid, m_lgb_cal_test
]).sort_values("model").reset_index(drop=True)

try:
    from IPython.display import display as _display
    _display(summary)
except Exception:
    print(summary.to_string(index=False))


# Optional: save metrics (create the output directories first)
Path("evaluation").mkdir(exist_ok=True)
Path("model").mkdir(exist_ok=True)

summary.to_csv("./evaluation/baseline_metrics.csv", index=False)
print("Saved: baseline_metrics.csv")

# Choose the model to use for inference (the calibrated ones are recommended)
joblib.dump(cal_xgb, "./model/model_xgb_isotonic.joblib")
joblib.dump(cal_lgb, "./model/model_lgb_isotonic.joblib")

# Persist the feature columns and key parameters
with open("feature_columns.json", "w", encoding="utf-8") as f:
    json.dump({"X_cols": X_cols, "WINDOW": int(WINDOW)}, f, ensure_ascii=False, indent=2)

print("Saved: model_xgb_isotonic.joblib, model_lgb_isotonic.joblib, feature_columns.json")
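
Note: for reference, a minimal sketch of how the saved artifacts could be loaded for inference. The input path and DataFrame below are hypothetical; it assumes the feature columns were produced by the same preprocessing pipeline used for training.

# hypothetical inference sketch -- not part of the uploaded files
import joblib, json
import pandas as pd

model = joblib.load("./model/model_xgb_isotonic.joblib")
with open("feature_columns.json", encoding="utf-8") as f:
    meta = json.load(f)  # {"X_cols": [...], "WINDOW": ...}

new_df = pd.read_parquet("data/new_matches.parquet")  # hypothetical: features built by the same pipeline
proba = model.predict_proba(new_df[meta["X_cols"]])   # columns follow model.classes_ (H/D/A per the label_map in preprocess_data.py)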
preprocess_data.py
ADDED
@@ -0,0 +1,196 @@
import numpy as np
import pandas as pd
from collections import defaultdict
from unidecode import unidecode
from sklearn.metrics import log_loss, accuracy_score

def prepare_features(data_raw: pd.DataFrame, window: int = 7, verbose: bool = True):
    """Prepare features from raw EPL data.

    Returns (feat_df, X_cols, WINDOW, base_df)
    - feat_df: DataFrame with features aligned to training
    - X_cols: list of feature column names used for modeling
    - WINDOW: the rolling window used
    - base_df: base match DataFrame with cleaned columns (date, home, away, ftr, ...)
    """
    RENAME = {
        "Date": "date", "Time": "time", "HomeTeam": "home", "AwayTeam": "away",
        "FTHG": "fthg", "FTAG": "ftag", "FTR": "ftr",
        "HTHG": "hthg", "HTAG": "htag", "HTR": "htr",
        "Referee": "ref",
        "HS": "hs", "AS": "as", "HST": "hst", "AST": "ast",
        "HF": "hf", "AF": "af", "HC": "hc", "AC": "ac",
        "HY": "hy", "AY": "ay", "HR": "hr", "AR": "ar",
        # odds (Bet365, William Hill, Pinnacle (PS), VC)
        "B365H": "b365h", "B365D": "b365d", "B365A": "b365a",
        "WHH": "whh", "WHD": "whd", "WHA": "wha",
        "PSH": "psh", "PSD": "psd", "PSA": "psa",
        "VCH": "vch", "VCD": "vcd", "VCA": "vca",
    }
    df = data_raw.rename(columns=RENAME).copy()

    # parse date
    from datetime import datetime

    def parse_date(x):
        for fmt in ("%d/%m/%Y", "%d/%m/%y", "%Y-%m-%d"):
            try:
                return datetime.strptime(str(x), fmt)
            except Exception:
                pass
        return pd.NaT

    df["date"] = df["date"].map(parse_date)
    df = df[~df["date"].isna()].copy()

    # clean team names
    def clean_team(s):
        if pd.isna(s):
            return s
        s = unidecode(str(s)).strip()
        s = " ".join(s.split())
        return s

    df["home"] = df["home"].map(clean_team)
    df["away"] = df["away"].map(clean_team)

    # keep valid rows
    df = df[(df["ftr"].isin(["H", "D", "A"])) & (~df["home"].isna()) & (~df["away"].isna())].copy()
    df.sort_values(["date", "home", "away"], inplace=True, ignore_index=True)

    # target: 0 = home win, 1 = draw, 2 = away win
    label_map = {"H": 0, "D": 1, "A": 2}
    df["y"] = df["ftr"].map(label_map)

# -----------------------------
|
| 62 |
+
# 3) Odds → implied probabilities (normalize overround)
|
| 63 |
+
# -----------------------------
|
| 64 |
+
def implied_probs(row, prefix):
|
| 65 |
+
h,d,a = row.get(prefix+"h"), row.get(prefix+"d"), row.get(prefix+"a")
|
| 66 |
+
if any(pd.isna([h,d,a])): return pd.Series([np.nan,np.nan,np.nan])
|
| 67 |
+
if min(h,d,a) <= 1.0: return pd.Series([np.nan,np.nan,np.nan])
|
| 68 |
+
inv = np.array([1/h, 1/d, 1/a], dtype=float)
|
| 69 |
+
s = inv.sum()
|
| 70 |
+
if s <= 0: return pd.Series([np.nan,np.nan,np.nan])
|
| 71 |
+
return pd.Series(inv / s)
|
| 72 |
+
|
| 73 |
+
for bk in ["b365","wh","ps","vc"]:
|
| 74 |
+
cols_exist = all([(bk+c) in df.columns for c in ["h","d","a"]])
|
| 75 |
+
if cols_exist:
|
| 76 |
+
probs = df.apply(lambda r: implied_probs(r, bk), axis=1, result_type="expand")
|
| 77 |
+
df[[f"p_{bk}_H", f"p_{bk}_D", f"p_{bk}_A"]] = probs
|
| 78 |
+
|
| 79 |
+
prob_cols = [c for c in df.columns if c.startswith("p_") and c[-2:] in ["_H","_D","_A"]]
|
| 80 |
+
def avg_prob(suffix):
|
| 81 |
+
cols = [c for c in prob_cols if c.endswith(suffix)]
|
| 82 |
+
return df[cols].mean(axis=1)
|
| 83 |
+
|
| 84 |
+
df["p_odds_H"] = avg_prob("_H")
|
| 85 |
+
df["p_odds_D"] = avg_prob("_D")
|
| 86 |
+
df["p_odds_A"] = avg_prob("_A")
|
| 87 |
+
|
| 88 |
+
# -----------------------------
|
| 89 |
+
# 4) Leak-free features: rolling form + simple Elo
|
| 90 |
+
# -----------------------------
|
| 91 |
+
def result_points(ftr, is_home):
|
| 92 |
+
if ftr == "D": return 1
|
| 93 |
+
if ftr == "H": return 3 if is_home else 0
|
| 94 |
+
if ftr == "A": return 0 if is_home else 3
|
| 95 |
+
return 0
|
| 96 |
+
|
| 97 |
+
tm_rows = []
|
| 98 |
+
for i, r in df.iterrows():
|
| 99 |
+
# home perspective
|
| 100 |
+
tm_rows.append({
|
| 101 |
+
"match_id": i, "date": r["date"], "team": r["home"], "opp": r["away"], "is_home": 1,
|
| 102 |
+
"gf": r["fthg"], "ga": r["ftag"],
|
| 103 |
+
"shots_f": r.get("hs", np.nan), "shots_a": r.get("as", np.nan),
|
| 104 |
+
"sot_f": r.get("hst", np.nan), "sot_a": r.get("ast", np.nan),
|
| 105 |
+
"corn_f": r.get("hc", np.nan), "corn_a": r.get("ac", np.nan),
|
| 106 |
+
"y_f": r.get("hy", np.nan), "y_a": r.get("ay", np.nan),
|
| 107 |
+
"r_f": r.get("hr", np.nan), "r_a": r.get("ar", np.nan),
|
| 108 |
+
"points": result_points(r["ftr"], True),
|
| 109 |
+
})
|
| 110 |
+
# away perspective
|
| 111 |
+
tm_rows.append({
|
| 112 |
+
"match_id": i, "date": r["date"], "team": r["away"], "opp": r["home"], "is_home": 0,
|
| 113 |
+
"gf": r["ftag"], "ga": r["fthg"],
|
| 114 |
+
"shots_f": r.get("as", np.nan), "shots_a": r.get("hs", np.nan),
|
| 115 |
+
"sot_f": r.get("ast", np.nan), "sot_a": r.get("hst", np.nan),
|
| 116 |
+
"corn_f": r.get("ac", np.nan), "corn_a": r.get("hc", np.nan),
|
| 117 |
+
"y_f": r.get("ay", np.nan), "y_a": r.get("hy", np.nan),
|
| 118 |
+
"r_f": r.get("ar", np.nan), "r_a": r.get("hr", np.nan),
|
| 119 |
+
"points": result_points(r["ftr"], False),
|
| 120 |
+
})
|
| 121 |
+
tm = pd.DataFrame(tm_rows).sort_values(["team","date"]).reset_index(drop=True)
|
| 122 |
+
|
| 123 |
+
WINDOW = int(window)
|
| 124 |
+
agg_cols = ["gf","ga","shots_f","shots_a","sot_f","sot_a","corn_f","corn_a","y_f","r_f","points"]
|
| 125 |
+
for col in agg_cols:
|
| 126 |
+
tm[f"roll_{col}"] = (tm.groupby("team")[col]
|
| 127 |
+
.rolling(WINDOW, min_periods=1).mean()
|
| 128 |
+
.shift(1) # ใช้ข้อมูลก่อนหน้าเท่านั้น
|
| 129 |
+
.reset_index(level=0, drop=True))
|
| 130 |
+
|
    # Simple Elo
    BASE_ELO = 1500.0
    K = 20.0
    HOME_ADV = 60.0

    elo = defaultdict(lambda: BASE_ELO)
    elo_before_home, elo_before_away = [], []

    df_sorted = df.sort_values("date").reset_index(drop=True)
    for i, r in df_sorted.iterrows():
        h, a = r["home"], r["away"]
        eh, ea = elo[h], elo[a]
        elo_before_home.append(eh)
        elo_before_away.append(ea)
        ph = 1.0 / (1.0 + 10 ** (-((eh + HOME_ADV) - ea) / 400))
        if r["ftr"] == "H":
            oh, oa = 1.0, 0.0
        elif r["ftr"] == "D":
            oh, oa = 0.5, 0.5
        else:
            oh, oa = 0.0, 1.0
        elo[h] = eh + K * (oh - ph)
        elo[a] = ea + K * ((1.0 - oh) - (1.0 - ph))

    df_sorted["elo_home"] = elo_before_home
    df_sorted["elo_away"] = elo_before_away
    df_sorted["elo_diff"] = df_sorted["elo_home"] - df_sorted["elo_away"]

+
# Merge rolling features into match rows
|
| 156 |
+
home_tm = tm[tm["is_home"]==1].copy()
|
| 157 |
+
away_tm = tm[tm["is_home"]==0].copy()
|
| 158 |
+
home_feats = home_tm.filter(regex="^roll_").columns.tolist()
|
| 159 |
+
hf = home_tm[["match_id"] + home_feats].rename(columns={c: f"home_{c}" for c in home_feats})
|
| 160 |
+
af = away_tm[["match_id"] + home_feats].rename(columns={c: f"away_{c}" for c in home_feats})
|
| 161 |
+
|
| 162 |
+
feat_df = df_sorted.merge(hf, left_index=True, right_on="match_id", how="left") \
|
| 163 |
+
.merge(af, left_index=True, right_on="match_id", how="left")
|
| 164 |
+
|
| 165 |
+
# Fill odds missing (keep baseline)
|
| 166 |
+
for c in ["p_odds_H","p_odds_D","p_odds_A"]:
|
| 167 |
+
if c in feat_df.columns:
|
| 168 |
+
feat_df[c] = feat_df[c].astype(float).fillna(feat_df[c].mean())
|
| 169 |
+
|
| 170 |
+
role_feats = [f"home_{c}" for c in home_feats] + [f"away_{c}" for c in home_feats]
|
| 171 |
+
elo_feats = ["elo_home","elo_away","elo_diff"]
|
| 172 |
+
odds_feats = ["p_odds_H","p_odds_D","p_odds_A"]
|
| 173 |
+
X_cols = role_feats + elo_feats + odds_feats
|
| 174 |
+
|
| 175 |
+
for c in X_cols:
|
| 176 |
+
if c not in feat_df.columns:
|
| 177 |
+
feat_df[c] = np.nan
|
| 178 |
+
feat_df[c] = feat_df[c].astype(float).fillna(feat_df[c].median())
|
| 179 |
+
|
    # -----------------------------
    # 5) Time-based split (kept for compatibility, but not returned)
    # -----------------------------
    n = len(feat_df)
    idx_train = int(n * 0.70)
    idx_valid = int(n * 0.85)
    if verbose and n > 0:
        dates_train = feat_df["date"].iloc[:idx_train].max()
        dates_valid = (feat_df["date"].iloc[idx_train:idx_valid].min(),
                       feat_df["date"].iloc[idx_train:idx_valid].max())
        dates_test = (feat_df["date"].iloc[idx_valid:].min(),
                      feat_df["date"].iloc[idx_valid:].max())
        print(f"Train up to: {dates_train:%Y-%m-%d}")
        print(f"Valid: {dates_valid[0]:%Y-%m-%d} .. {dates_valid[1]:%Y-%m-%d}")
        print(f"Test : {dates_test[0]:%Y-%m-%d} .. {dates_test[1]:%Y-%m-%d}")

    return feat_df, X_cols, WINDOW, df_sorted
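
Note: for reference, a minimal usage sketch, assuming a raw season CSV with football-data.co.uk style columns (the path below is hypothetical):

# hypothetical usage sketch -- not part of the uploaded files
import pandas as pd
from preprocess_data import prepare_features

raw = pd.read_csv("data/raw/epl.csv")  # hypothetical path; columns like Date, HomeTeam, FTR, B365H, ...
feat_df, X_cols, WINDOW, base_df = prepare_features(raw, window=7, verbose=True)
print(len(feat_df), "matches,", len(X_cols), "feature columns; target is feat_df['y']")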