Teera committed on
Commit ff52fb3 · verified · 1 Parent(s): bb936c6

Upload 2 files

Files changed (2)
  1. model_training.py +288 -0
  2. preprocess_data.py +196 -0
model_training.py ADDED
@@ -0,0 +1,288 @@
# ...existing code...
import joblib, json
from pathlib import Path
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# ...existing code...
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV

try:
    from ingest import metrics_report  # type: ignore
except Exception:
    try:
        from preprocess import metrics_report  # type: ignore
    except Exception:
        # Minimal fallback implementation returning a dict compatible with the
        # existing summary DataFrame usage.
        from sklearn.metrics import log_loss, accuracy_score, brier_score_loss, roc_auc_score

        def metrics_report(y_true, y_proba, name="model"):
            """
            Minimal metrics report compatible with the rest of the script.
            Returns a dict with at least the key "model" so it can be placed into the summary DataFrame.
            """
            y_true = np.asarray(y_true)
            proba = np.asarray(y_proba)

            # handle binary vs multiclass probabilities
            try:
                if proba.ndim == 1 or proba.shape[1] == 1:
                    # binary
                    proba_flat = proba.ravel()
                    y_pred = (proba_flat >= 0.5).astype(int)
                    ll = float(log_loss(y_true, proba_flat)) if y_true.size and proba_flat.size else float("nan")
                    try:
                        auc = float(roc_auc_score(y_true, proba_flat))
                    except Exception:
                        auc = float("nan")
                    try:
                        brier = float(brier_score_loss(y_true, proba_flat))
                    except Exception:
                        brier = float("nan")
                else:
                    # multiclass
                    y_pred = proba.argmax(axis=1)
                    try:
                        ll = float(log_loss(y_true, proba))
                    except Exception:
                        ll = float("nan")
                    try:
                        auc = float(roc_auc_score(y_true, proba, multi_class="ovr"))
                    except Exception:
                        auc = float("nan")
                    # multiclass Brier as mean squared error against one-hot labels
                    try:
                        n_classes = proba.shape[1]
                        one_hot = np.eye(n_classes)[y_true]
                        brier = float(np.mean(np.sum((proba - one_hot) ** 2, axis=1)))
                    except Exception:
                        brier = float("nan")
            except Exception:
                y_pred = np.zeros_like(y_true)
                ll = auc = brier = float("nan")

            acc = float(accuracy_score(y_true, y_pred)) if y_true.size else float("nan")

            return {
                "model": name,
                "accuracy": acc,
                "log_loss": ll,
                "roc_auc": auc,
                "brier": brier
            }

def load_processed_data(target_col="target", data_dir=Path("data/processed")):
    """
    Try multiple ways to obtain a processed dataframe:
    1) call a load_processed_data / load_data function from the local ingest or preprocess modules
    2) look for common filenames under data/processed (parquet/csv)
    Returns: df (pd.DataFrame)
    """
    # 1) try local modules
    try:
        from ingest import load_processed_data as _lp  # type: ignore
        df = _lp()
        if isinstance(df, pd.DataFrame):
            return df
    except Exception:
        pass

    try:
        from preprocess import load_processed_data as _lp2  # type: ignore
        df = _lp2()
        if isinstance(df, pd.DataFrame):
            return df
    except Exception:
        pass

    # 2) look for common files
    candidates = [
        data_dir / "processed.parquet",
        data_dir / "dataset.parquet",
        data_dir / "processed.csv",
        data_dir / "dataset.csv",
        data_dir / "train.parquet",
        data_dir / "train.csv",
    ]
    for fp in candidates:
        if fp.exists():
            if fp.suffix == ".parquet":
                return pd.read_parquet(fp)
            else:
                return pd.read_csv(fp)

    raise FileNotFoundError(
        f"No processed data found. Checked modules ingest/preprocess and files under {data_dir}. "
        "Add a processed dataset or expose load_processed_data() in ingest/preprocess."
    )

# Load data and build train/valid/test splits
df = load_processed_data()

# infer target column
TARGET = None
for candidate in ("target", "label", "y", "outcome"):
    if candidate in df.columns:
        TARGET = candidate
        break
if TARGET is None:
    raise KeyError("No target column found. Expected one of: target, label, y, outcome")

# If the dataset already includes a 'split' column with values 'train'/'valid'/'test', use it
if "split" in df.columns:
    train_df = df[df["split"] == "train"].drop(columns=["split"])
    valid_df = df[df["split"] == "valid"].drop(columns=["split"])
    test_df = df[df["split"] == "test"].drop(columns=["split"])
else:
    # create splits: train/valid/test = 64%/16%/20% (approx.)
    train_val, test_df = train_test_split(df, test_size=0.20, stratify=df[TARGET], random_state=42)
    train_df, valid_df = train_test_split(train_val, test_size=0.20, stratify=train_val[TARGET], random_state=42)

# exclude the target (and the 'split' marker, if present) from the feature columns
X_cols = [c for c in df.columns if c not in (TARGET, "split")]
WINDOW = int(df.attrs.get("WINDOW", 1)) if hasattr(df, "attrs") else 1

X_train = train_df[X_cols]
y_train = train_df[TARGET]
X_valid = valid_df[X_cols]
y_valid = valid_df[TARGET]
X_test = test_df[X_cols]
y_test = test_df[TARGET]

# ...existing code...
xgb = XGBClassifier(
    n_estimators=6000,
    max_depth=50,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multi:softprob",
    num_class=3,
    reg_lambda=1.0,
    random_state=42,
    tree_method="hist"
)
xgb.fit(X_train, y_train)
proba_xgb_valid = xgb.predict_proba(X_valid)
proba_xgb_test = xgb.predict_proba(X_test)

m_xgb_valid = metrics_report(y_valid, proba_xgb_valid, "xgb_valid")
m_xgb_test = metrics_report(y_test, proba_xgb_test, "xgb_test")

# Isotonic calibration on the validation set
# (note: cv="prefit" is deprecated in recent scikit-learn releases in favour of FrozenEstimator)
cal_xgb = CalibratedClassifierCV(xgb, method="isotonic", cv="prefit")
cal_xgb.fit(X_valid, y_valid)
proba_xgb_cal_valid = cal_xgb.predict_proba(X_valid)
proba_xgb_cal_test = cal_xgb.predict_proba(X_test)

m_xgb_cal_valid = metrics_report(y_valid, proba_xgb_cal_valid, "xgb_cal_valid")
m_xgb_cal_test = metrics_report(y_test, proba_xgb_cal_test, "xgb_cal_test")

# Platt scaling (optional)
cal_xgb_platt = CalibratedClassifierCV(xgb, method="sigmoid", cv="prefit")
cal_xgb_platt.fit(X_valid, y_valid)
proba_xgb_platt_test = cal_xgb_platt.predict_proba(X_test)
m_xgb_platt_test = metrics_report(y_test, proba_xgb_platt_test, "xgb_platt_test")

# -----------------------------
# 8) LightGBM + Calibration
# -----------------------------
lgbm = LGBMClassifier(
    n_estimators=12000,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multiclass",
    class_weight=None,
    random_state=42
)
lgbm.fit(X_train, y_train)
proba_lgb_valid = lgbm.predict_proba(X_valid)
proba_lgb_test = lgbm.predict_proba(X_test)

m_lgb_valid = metrics_report(y_valid, proba_lgb_valid, "lgb_valid")
m_lgb_test = metrics_report(y_test, proba_lgb_test, "lgb_test")

cal_lgb = CalibratedClassifierCV(lgbm, method="isotonic", cv="prefit")
cal_lgb.fit(X_valid, y_valid)
proba_lgb_cal_valid = cal_lgb.predict_proba(X_valid)
proba_lgb_cal_test = cal_lgb.predict_proba(X_test)

m_lgb_cal_valid = metrics_report(y_valid, proba_lgb_cal_valid, "lgb_cal_valid")
m_lgb_cal_test = metrics_report(y_test, proba_lgb_cal_test, "lgb_cal_test")

from sklearn.base import BaseEstimator, ClassifierMixin

class PriorProbaPredictor(BaseEstimator, ClassifierMixin):
    """Predict class probabilities equal to the class distribution in the training data."""
    def fit(self, X, y):
        y = np.asarray(y)
        classes, counts = np.unique(y, return_counts=True)
        self.classes_ = classes
        self.class_proba_ = counts / counts.sum()
        return self

    def predict_proba(self, X):
        n = len(X)
        # return an array of shape (n_samples, n_classes) following classes_ order
        return np.tile(self.class_proba_, (n, 1))

    def predict(self, X):
        proba = self.predict_proba(X)
        return proba.argmax(axis=1)

# naive class-prior baseline (reported as "odds_*" in the summary table)
odds = PriorProbaPredictor()
odds.fit(X_train, y_train)
proba_odds_valid = odds.predict_proba(X_valid)
proba_odds_test = odds.predict_proba(X_test)

m_odds_valid = metrics_report(y_valid, proba_odds_valid, "odds_valid")
m_odds_test = metrics_report(y_test, proba_odds_test, "odds_test")
# ...existing code...
# -----------------------------
# 9) Summary table
# -----------------------------
summary = pd.DataFrame([
    m_odds_valid, m_odds_test,
    m_xgb_valid, m_xgb_test, m_xgb_cal_valid, m_xgb_cal_test, m_xgb_platt_test,
    m_lgb_valid, m_lgb_test, m_lgb_cal_valid, m_lgb_cal_test
]).sort_values("model").reset_index(drop=True)

try:
    from IPython.display import display as _display
    _display(summary)
except Exception:
    print(summary.to_string(index=False))

# Optional: save metrics
Path("./evaluation").mkdir(parents=True, exist_ok=True)
summary.to_csv("./evaluation/baseline_metrics.csv", index=False)
print("Saved: ./evaluation/baseline_metrics.csv")

Path("./model").mkdir(parents=True, exist_ok=True)

# Save the models intended for inference (the calibrated ones are recommended)
joblib.dump(cal_xgb, "./model/model_xgb_isotonic.joblib")
joblib.dump(cal_lgb, "./model/model_lgb_isotonic.joblib")

# Store the feature columns and the key preprocessing parameter
with open("feature_columns.json", "w", encoding="utf-8") as f:
    json.dump({"X_cols": X_cols, "WINDOW": int(WINDOW)}, f, ensure_ascii=False, indent=2)

print("Saved: model_xgb_isotonic.joblib, model_lgb_isotonic.joblib, feature_columns.json")
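For reference, a minimal inference sketch against the artifacts saved above. The upcoming-fixtures file and its path are assumptions; the input is expected to already contain the engineered feature columns recorded in feature_columns.json, and the 0/1/2 class order follows the H/D/A encoding used in preprocess_data.py.

# Minimal inference sketch (illustrative; assumes model_training.py has already saved its artifacts).
import json
import joblib
import pandas as pd

model = joblib.load("./model/model_xgb_isotonic.joblib")  # calibrated XGBoost saved above
with open("feature_columns.json", encoding="utf-8") as f:
    meta = json.load(f)
X_cols = meta["X_cols"]

# hypothetical file of upcoming fixtures that already contains the engineered feature columns
new_matches = pd.read_parquet("data/processed/upcoming.parquet")

proba = model.predict_proba(new_matches[X_cols])
pred = pd.DataFrame(proba, columns=["p_H", "p_D", "p_A"])  # assumes the 0/1/2 = H/D/A label encoding
print(pred.head())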
preprocess_data.py ADDED
@@ -0,0 +1,196 @@
import numpy as np
import pandas as pd
from collections import defaultdict
from unidecode import unidecode

def prepare_features(data_raw: pd.DataFrame, window: int = 7, verbose: bool = True):
    """Prepare features from raw EPL data.

    Returns (feat_df, X_cols, WINDOW, base_df)
    - feat_df: DataFrame with features aligned to training
    - X_cols: list of feature column names used for modeling
    - WINDOW: the rolling window used
    - base_df: base match DataFrame with cleaned columns (date, home, away, ftr, ...)
    """
    RENAME = {
        "Date":"date","Time":"time","HomeTeam":"home","AwayTeam":"away",
        "FTHG":"fthg","FTAG":"ftag","FTR":"ftr",
        "HTHG":"hthg","HTAG":"htag","HTR":"htr",
        "Referee":"ref",
        "HS":"hs","AS":"as","HST":"hst","AST":"ast",
        "HF":"hf","AF":"af","HC":"hc","AC":"ac",
        "HY":"hy","AY":"ay","HR":"hr","AR":"ar",
        # odds (Bet365, William Hill, Pinnacle (PS), VC)
        "B365H":"b365h","B365D":"b365d","B365A":"b365a",
        "WHH":"whh","WHD":"whd","WHA":"wha",
        "PSH":"psh","PSD":"psd","PSA":"psa",
        "VCH":"vch","VCD":"vcd","VCA":"vca",
    }
    df = data_raw.rename(columns=RENAME).copy()

    # parse date
    from datetime import datetime
    def parse_date(x):
        for fmt in ("%d/%m/%Y", "%d/%m/%y", "%Y-%m-%d"):
            try:
                return datetime.strptime(str(x), fmt)
            except Exception:
                pass
        return pd.NaT
    df["date"] = df["date"].map(parse_date)
    df = df[~df["date"].isna()].copy()

    # clean team names
    def clean_team(s):
        if pd.isna(s): return s
        s = unidecode(str(s)).strip()
        s = " ".join(s.split())
        return s
    df["home"] = df["home"].map(clean_team)
    df["away"] = df["away"].map(clean_team)

    # keep valid rows
    df = df[(df["ftr"].isin(["H","D","A"])) & (~df["home"].isna()) & (~df["away"].isna())].copy()
    df.sort_values(["date","home","away"], inplace=True, ignore_index=True)

    # target: 0 = home win, 1 = draw, 2 = away win
    label_map = {"H": 0, "D": 1, "A": 2}
    df["y"] = df["ftr"].map(label_map)

    # -----------------------------
    # 3) Odds → implied probabilities (normalize overround)
    # -----------------------------
    def implied_probs(row, prefix):
        h, d, a = row.get(prefix+"h"), row.get(prefix+"d"), row.get(prefix+"a")
        if any(pd.isna([h, d, a])): return pd.Series([np.nan, np.nan, np.nan])
        if min(h, d, a) <= 1.0: return pd.Series([np.nan, np.nan, np.nan])
        inv = np.array([1/h, 1/d, 1/a], dtype=float)
        s = inv.sum()
        if s <= 0: return pd.Series([np.nan, np.nan, np.nan])
        return pd.Series(inv / s)
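    # Illustrative example (not taken from the data): decimal odds 2.10 / 3.40 / 3.60
    # give raw inverses ~0.476 / 0.294 / 0.278, summing to ~1.048 (about a 4.8% overround);
    # dividing by that sum yields normalized implied probabilities of ~0.454 / 0.281 / 0.265.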
    for bk in ["b365", "wh", "ps", "vc"]:
        cols_exist = all([(bk + c) in df.columns for c in ["h", "d", "a"]])
        if cols_exist:
            probs = df.apply(lambda r: implied_probs(r, bk), axis=1, result_type="expand")
            df[[f"p_{bk}_H", f"p_{bk}_D", f"p_{bk}_A"]] = probs

    prob_cols = [c for c in df.columns if c.startswith("p_") and c[-2:] in ["_H","_D","_A"]]
    def avg_prob(suffix):
        cols = [c for c in prob_cols if c.endswith(suffix)]
        return df[cols].mean(axis=1)

    df["p_odds_H"] = avg_prob("_H")
    df["p_odds_D"] = avg_prob("_D")
    df["p_odds_A"] = avg_prob("_A")
    # -----------------------------
    # 4) Leak-free features: rolling form + simple Elo
    # -----------------------------
    def result_points(ftr, is_home):
        if ftr == "D": return 1
        if ftr == "H": return 3 if is_home else 0
        if ftr == "A": return 0 if is_home else 3
        return 0

    tm_rows = []
    for i, r in df.iterrows():
        # home perspective
        tm_rows.append({
            "match_id": i, "date": r["date"], "team": r["home"], "opp": r["away"], "is_home": 1,
            "gf": r["fthg"], "ga": r["ftag"],
            "shots_f": r.get("hs", np.nan), "shots_a": r.get("as", np.nan),
            "sot_f": r.get("hst", np.nan), "sot_a": r.get("ast", np.nan),
            "corn_f": r.get("hc", np.nan), "corn_a": r.get("ac", np.nan),
            "y_f": r.get("hy", np.nan), "y_a": r.get("ay", np.nan),
            "r_f": r.get("hr", np.nan), "r_a": r.get("ar", np.nan),
            "points": result_points(r["ftr"], True),
        })
        # away perspective
        tm_rows.append({
            "match_id": i, "date": r["date"], "team": r["away"], "opp": r["home"], "is_home": 0,
            "gf": r["ftag"], "ga": r["fthg"],
            "shots_f": r.get("as", np.nan), "shots_a": r.get("hs", np.nan),
            "sot_f": r.get("ast", np.nan), "sot_a": r.get("hst", np.nan),
            "corn_f": r.get("ac", np.nan), "corn_a": r.get("hc", np.nan),
            "y_f": r.get("ay", np.nan), "y_a": r.get("hy", np.nan),
            "r_f": r.get("ar", np.nan), "r_a": r.get("hr", np.nan),
            "points": result_points(r["ftr"], False),
        })
    tm = pd.DataFrame(tm_rows).sort_values(["team","date"]).reset_index(drop=True)

    WINDOW = int(window)
    agg_cols = ["gf","ga","shots_f","shots_a","sot_f","sot_a","corn_f","corn_a","y_f","r_f","points"]
    for col in agg_cols:
        # rolling mean over the previous WINDOW matches, shifted within each team
        # so only information available before the match is used (no leakage across teams)
        tm[f"roll_{col}"] = (
            tm.groupby("team")[col]
              .transform(lambda s: s.rolling(WINDOW, min_periods=1).mean().shift(1))
        )

    # Simple Elo rating
    BASE_ELO = 1500.0
    K = 20.0
    HOME_ADV = 60.0

    elo = defaultdict(lambda: BASE_ELO)
    elo_before_home, elo_before_away = [], []

    df_sorted = df.sort_values("date").reset_index(drop=True)
    for i, r in df_sorted.iterrows():
        h, a = r["home"], r["away"]
        eh, ea = elo[h], elo[a]
        elo_before_home.append(eh); elo_before_away.append(ea)
        # expected home score: 1 / (1 + 10^(-((elo_home + HOME_ADV) - elo_away) / 400))
        ph = 1.0/(1.0 + 10**(-((eh+HOME_ADV)-ea)/400))
        if r["ftr"] == "H": oh, oa = 1.0, 0.0
        elif r["ftr"] == "D": oh, oa = 0.5, 0.5
        else: oh, oa = 0.0, 1.0
        elo[h] = eh + K*(oh - ph)
        elo[a] = ea + K*((1.0-oh) - (1.0-ph))

    df_sorted["elo_home"] = elo_before_home
    df_sorted["elo_away"] = elo_before_away
    df_sorted["elo_diff"] = df_sorted["elo_home"] - df_sorted["elo_away"]

    # Merge rolling features into match rows
    home_tm = tm[tm["is_home"]==1].copy()
    away_tm = tm[tm["is_home"]==0].copy()
    home_feats = home_tm.filter(regex="^roll_").columns.tolist()
    hf = home_tm[["match_id"] + home_feats].rename(columns={c: f"home_{c}" for c in home_feats})
    af = away_tm[["match_id"] + home_feats].rename(columns={c: f"away_{c}" for c in home_feats})

    feat_df = df_sorted.merge(hf, left_index=True, right_on="match_id", how="left") \
                       .merge(af, left_index=True, right_on="match_id", how="left")

    # Fill missing odds (keeps the odds baseline usable)
    for c in ["p_odds_H","p_odds_D","p_odds_A"]:
        if c in feat_df.columns:
            feat_df[c] = feat_df[c].astype(float).fillna(feat_df[c].mean())

    role_feats = [f"home_{c}" for c in home_feats] + [f"away_{c}" for c in home_feats]
    elo_feats = ["elo_home","elo_away","elo_diff"]
    odds_feats = ["p_odds_H","p_odds_D","p_odds_A"]
    X_cols = role_feats + elo_feats + odds_feats

    for c in X_cols:
        if c not in feat_df.columns:
            feat_df[c] = np.nan
        feat_df[c] = feat_df[c].astype(float).fillna(feat_df[c].median())

    # -----------------------------
    # 5) Time-based split (kept for compatibility, but not returned)
    # -----------------------------
    n = len(feat_df)
    idx_train = int(n*0.70)
    idx_valid = int(n*0.85)
    if verbose and n > 0:
        dates_train = feat_df["date"].iloc[:idx_train].max()
        dates_valid = (feat_df["date"].iloc[idx_train:idx_valid].min(),
                       feat_df["date"].iloc[idx_train:idx_valid].max())
        dates_test = (feat_df["date"].iloc[idx_valid:].min(),
                      feat_df["date"].iloc[idx_valid:].max())
        print(f"Train up to: {dates_train:%Y-%m-%d}")
        print(f"Valid: {dates_valid[0]:%Y-%m-%d} .. {dates_valid[1]:%Y-%m-%d}")
        print(f"Test : {dates_test[0]:%Y-%m-%d} .. {dates_test[1]:%Y-%m-%d}")

    return feat_df, X_cols, WINDOW, df_sorted
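For context, a minimal usage sketch showing how the two uploaded files could be chained. The raw CSV path data/raw/E0.csv is an assumption (any football-data.co.uk-style export with the columns listed in RENAME should work); the output filename matches one of the candidates that load_processed_data() in model_training.py already checks.

# Minimal usage sketch (illustrative, with an assumed raw-data location).
from pathlib import Path

import pandas as pd

from preprocess_data import prepare_features

raw = pd.read_csv("data/raw/E0.csv")  # hypothetical path to a raw football-data.co.uk CSV
feat_df, X_cols, WINDOW, base_df = prepare_features(raw, window=7)

# keep the model features plus the target column produced by prepare_features
out = feat_df[X_cols + ["y"]].copy()

out_dir = Path("data/processed")
out_dir.mkdir(parents=True, exist_ok=True)
# "processed.parquet" is one of the filenames model_training.py's load_processed_data() looks for
out.to_parquet(out_dir / "processed.parquet")
print(f"Wrote {len(out)} rows with {len(X_cols)} features (WINDOW={WINDOW})")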