# Match-Prediction / preprocess_data.py
# Teera's picture
# Upload 2 files
# ff52fb3 verified
import numpy as np
import pandas as pd
from collections import defaultdict
from unidecode import unidecode
from sklearn.metrics import log_loss, accuracy_score
# Raw football-data.co.uk style column names -> short lower-case names.
_COLUMN_RENAMES = {
    "Date": "date", "Time": "time", "HomeTeam": "home", "AwayTeam": "away",
    "FTHG": "fthg", "FTAG": "ftag", "FTR": "ftr",
    "HTHG": "hthg", "HTAG": "htag", "HTR": "htr",
    "Referee": "ref",
    "HS": "hs", "AS": "as", "HST": "hst", "AST": "ast",
    "HF": "hf", "AF": "af", "HC": "hc", "AC": "ac",
    "HY": "hy", "AY": "ay", "HR": "hr", "AR": "ar",
    # odds (Bet365, William Hill, Pinnacle (PS), VC)
    "B365H": "b365h", "B365D": "b365d", "B365A": "b365a",
    "WHH": "whh", "WHD": "whd", "WHA": "wha",
    "PSH": "psh", "PSD": "psd", "PSA": "psa",
    "VCH": "vch", "VCD": "vcd", "VCA": "vca",
}
_BOOKMAKERS = ("b365", "wh", "ps", "vc")
_DATE_FORMATS = ("%d/%m/%Y", "%d/%m/%y", "%Y-%m-%d")
_LABELS = {"H": 0, "D": 1, "A": 2}
_HOME_POINTS = {"H": 3, "D": 1, "A": 0}
_AWAY_POINTS = {"H": 0, "D": 1, "A": 3}
_ELO_HOME_SCORE = {"H": 1.0, "D": 0.5, "A": 0.0}
# (feature stem, home-side column, away-side column)
_STAT_PAIRS = (
    ("shots", "hs", "as"),
    ("sot", "hst", "ast"),
    ("corn", "hc", "ac"),
    ("y", "hy", "ay"),
    ("r", "hr", "ar"),
)
# Per-team columns that get a rolling mean.
_ROLLING_STATS = [
    "gf", "ga", "shots_f", "shots_a", "sot_f", "sot_a",
    "corn_f", "corn_a", "y_f", "r_f", "points",
]


def _parse_dates(raw_dates):
    """Parse a column of raw date cells into a datetime64 Series (NaT on failure)."""
    from datetime import datetime

    def parse_one(value):
        # Already-parsed values (datetime / Timestamp) are kept instead of being
        # stringified and rejected. NaT is checked first via pd.isna.
        if pd.isna(value):
            return pd.NaT
        if isinstance(value, datetime):
            return pd.Timestamp(value)
        text = str(value).strip()
        for fmt in _DATE_FORMATS:
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue
        return pd.NaT

    return pd.to_datetime(raw_dates.map(parse_one))


def _clean_team(name):
    """Transliterate a team name to ASCII and collapse runs of whitespace."""
    if pd.isna(name):
        return name
    text = str(name)
    if not text.isascii():
        # Only non-ASCII names need transliteration; ASCII passes through as-is.
        text = unidecode(text)
    return " ".join(text.split())


def _implied_probs(matches, bookmaker):
    """Return overround-normalised H/D/A probabilities for one bookmaker.

    Rows with a missing price or any price <= 1.0 yield NaN for all outcomes.
    """
    odds = pd.DataFrame(
        {side: pd.to_numeric(matches[bookmaker + side], errors="coerce")
         for side in ("h", "d", "a")}
    )
    usable = odds.notna().all(axis=1) & (odds.min(axis=1) > 1.0)
    inverse = 1.0 / odds.where(usable)
    probs = inverse.div(inverse.sum(axis=1, skipna=False), axis=0)
    probs.columns = [f"p_{bookmaker}_{o}" for o in ("H", "D", "A")]
    return probs


def _team_perspective(matches, is_home):
    """Build one row per match seen from the home (or away) team's side."""
    def optional(col):
        # Stats columns may be absent from older seasons -> all-NaN column.
        if col in matches.columns:
            return pd.to_numeric(matches[col], errors="coerce").to_numpy(dtype=float)
        return np.full(len(matches), np.nan)

    def required(col):
        return pd.to_numeric(matches[col], errors="coerce").to_numpy(dtype=float)

    team_col, opp_col = ("home", "away") if is_home else ("away", "home")
    gf_col, ga_col = ("fthg", "ftag") if is_home else ("ftag", "fthg")
    columns = {
        "match_id": matches.index.to_numpy(),
        "date": matches["date"].to_numpy(),
        "team": matches[team_col].to_numpy(),
        "opp": matches[opp_col].to_numpy(),
        "is_home": np.full(len(matches), int(is_home)),
        "gf": required(gf_col),
        "ga": required(ga_col),
    }
    for stem, home_col, away_col in _STAT_PAIRS:
        own, other = (home_col, away_col) if is_home else (away_col, home_col)
        columns[f"{stem}_f"] = optional(own)
        columns[f"{stem}_a"] = optional(other)
    points = _HOME_POINTS if is_home else _AWAY_POINTS
    columns["points"] = matches["ftr"].map(points).to_numpy()
    return pd.DataFrame(columns)


def _pre_match_elo(matches, base=1500.0, k=20.0, home_adv=60.0):
    """Run a simple Elo over chronologically ordered matches.

    Returns two lists (home, away) holding each side's rating *before* the match.
    """
    ratings = defaultdict(lambda: base)
    home_before, away_before = [], []
    for home, away, result in zip(matches["home"], matches["away"], matches["ftr"]):
        r_home, r_away = ratings[home], ratings[away]
        home_before.append(r_home)
        away_before.append(r_away)
        p_home = 1.0 / (1.0 + 10 ** (-((r_home + home_adv) - r_away) / 400))
        s_home = _ELO_HOME_SCORE[result]
        ratings[home] = r_home + k * (s_home - p_home)
        ratings[away] = r_away + k * ((1.0 - s_home) - (1.0 - p_home))
    return home_before, away_before


def _fmt_day(stamp):
    """Format a timestamp as YYYY-MM-DD, or 'n/a' for an empty split (NaT)."""
    return "n/a" if pd.isna(stamp) else f"{stamp:%Y-%m-%d}"


def prepare_features(data_raw: pd.DataFrame, window: int = 7, verbose: bool = True) -> tuple:
    """Prepare features from raw EPL data.

    Parameters
    - data_raw: raw match table; needs Date, HomeTeam, AwayTeam, FTHG, FTAG and
      FTR columns. Match statistics and bookmaker odds are used when present.
    - window: number of previous matches per team in the rolling means (>= 1).
    - verbose: print the date ranges of a 70/15/15 chronological split.

    Returns (feat_df, X_cols, WINDOW, base_df)
    - feat_df: DataFrame with features aligned to training
    - X_cols: list of feature column names used for modeling
    - WINDOW: the rolling window used
    - base_df: base match DataFrame with cleaned columns (date, home, away, ftr, ...)
      plus the pre-match Elo columns

    Raises ValueError if ``window`` is smaller than 1.
    """
    WINDOW = int(window)
    if WINDOW < 1:
        raise ValueError("window must be >= 1")

    df = data_raw.rename(columns=_COLUMN_RENAMES).copy()

    # Parse dates and drop rows whose date cannot be understood.
    df["date"] = _parse_dates(df["date"])
    df = df[df["date"].notna()].copy()

    # Clean team names.
    df["home"] = df["home"].map(_clean_team)
    df["away"] = df["away"].map(_clean_team)

    # Keep valid rows, in one canonical chronological order. The resulting
    # 0..n-1 index is the match id used everywhere below.
    valid = df["ftr"].isin(list(_LABELS)) & df["home"].notna() & df["away"].notna()
    df = df[valid].sort_values(["date", "home", "away"], ignore_index=True)

    # Target.
    df["y"] = df["ftr"].map(_LABELS)

    # -----------------------------
    # 3) Odds -> implied probabilities (normalize overround)
    # -----------------------------
    used_bookmakers = []
    for bk in _BOOKMAKERS:
        if all((bk + side) in df.columns for side in ("h", "d", "a")):
            probs = _implied_probs(df, bk)
            for col in probs.columns:
                df[col] = probs[col]
            used_bookmakers.append(bk)
    for outcome in ("H", "D", "A"):
        cols = [f"p_{bk}_{outcome}" for bk in used_bookmakers]
        # Average over the bookmakers available; NaN when none is present.
        df[f"p_odds_{outcome}"] = df[cols].mean(axis=1) if cols else np.nan

    # -----------------------------
    # 4) Leak-free features: rolling form + simple Elo
    # -----------------------------
    tm = pd.concat(
        [_team_perspective(df, True), _team_perspective(df, False)],
        ignore_index=True,
    ).sort_values(["team", "date"]).reset_index(drop=True)

    def prior_mean(series):
        # Shift inside each team's own history so a row only sees that team's
        # earlier matches; a team's first match therefore gets NaN.
        return series.shift(1).rolling(WINDOW, min_periods=1).mean()

    for col in _ROLLING_STATS:
        tm[f"roll_{col}"] = tm.groupby("team")[col].transform(prior_mean)

    # Simple Elo. df is already in chronological order, so it is copied rather
    # than re-sorted (a re-sort could reorder same-day matches and break the
    # match-id alignment).
    base_df = df.copy()
    elo_home, elo_away = _pre_match_elo(base_df)
    base_df["elo_home"] = pd.Series(elo_home, index=base_df.index, dtype=float)
    base_df["elo_away"] = pd.Series(elo_away, index=base_df.index, dtype=float)
    base_df["elo_diff"] = base_df["elo_home"] - base_df["elo_away"]

    # Attach rolling features to match rows, joining on the match index so the
    # home and away sides stay aligned with their match.
    roll_cols = [f"roll_{c}" for c in _ROLLING_STATS]

    def side_features(flag, prefix):
        part = tm.loc[tm["is_home"] == flag, ["match_id"] + roll_cols]
        return part.set_index("match_id").add_prefix(prefix)

    feat_df = base_df.join(side_features(1, "home_")).join(side_features(0, "away_"))

    # Fill missing odds with the column mean (keeps a baseline signal).
    odds_feats = ["p_odds_H", "p_odds_D", "p_odds_A"]
    for c in odds_feats:
        feat_df[c] = feat_df[c].astype(float)
        feat_df[c] = feat_df[c].fillna(feat_df[c].mean())

    role_feats = [f"home_{c}" for c in roll_cols] + [f"away_{c}" for c in roll_cols]
    elo_feats = ["elo_home", "elo_away", "elo_diff"]
    X_cols = role_feats + elo_feats + odds_feats
    # NOTE(review): the median is taken over the whole table, including later
    # matches; kept as in the original behaviour.
    for c in X_cols:
        if c not in feat_df.columns:
            feat_df[c] = np.nan
        feat_df[c] = feat_df[c].astype(float)
        feat_df[c] = feat_df[c].fillna(feat_df[c].median())

    # -----------------------------
    # 5) Time-based split (kept for compatibility, but not returned)
    # -----------------------------
    n = len(feat_df)
    idx_train = int(n * 0.70)
    idx_valid = int(n * 0.85)
    if verbose and n > 0:
        dates = feat_df["date"]
        train, valid_part, test = dates.iloc[:idx_train], dates.iloc[idx_train:idx_valid], dates.iloc[idx_valid:]
        print(f"Train up to: {_fmt_day(train.max())}")
        print(f"Valid: {_fmt_day(valid_part.min())} .. {_fmt_day(valid_part.max())}")
        print(f"Test : {_fmt_day(test.min())} .. {_fmt_day(test.max())}")

    return feat_df, X_cols, WINDOW, base_df