Spaces:
Sleeping
Sleeping
File size: 7,129 Bytes
46b7fa3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
import numpy as np
import pandas as pd
from collections import defaultdict
from datetime import datetime
from typing import Optional, Tuple, Dict, List
from unidecode import unidecode
# --- Team name cleaner (shared) ---
def clean_team(s: str) -> str:
    """Normalise a team name so the same club compares equal across sources.

    Values that ``pd.isna`` reports as missing are handed back untouched.
    Anything else is converted to ``str``, transliterated with ``unidecode``,
    and has its whitespace normalised: leading/trailing whitespace removed
    and every internal run collapsed to one space.
    """
    if not pd.isna(s):
        transliterated = unidecode(str(s))
        # split() with no separator discards leading/trailing whitespace and
        # treats any whitespace run as one delimiter, so re-joining the
        # pieces both trims and collapses in a single step.
        return " ".join(transliterated.split())
    return s
# --- Odds -> implied probabilities (normalized) ---
def implied_from_odds(odds_tuple: Tuple[float, float, float]) -> Optional[Tuple[float, float, float]]:
    """Convert decimal home/draw/away odds into normalised implied probabilities.

    Each price is inverted and the three inverses are rescaled so they sum
    to 1.

    Args:
        odds_tuple: ``(home, draw, away)`` decimal odds.

    Returns:
        ``(p_home, p_draw, p_away)`` summing to 1, or ``None`` when any price
        is NaN, infinite, or not strictly greater than 1.0.
    """
    h, d, a = odds_tuple
    # NaN compares False against everything, so the `<= 1.0` test alone would
    # let a NaN price through and produce NaN probabilities; reject
    # non-finite prices explicitly first.
    if not all(np.isfinite(price) for price in (h, d, a)):
        return None
    if min(h, d, a) <= 1.0:
        return None
    inv = np.array([1.0/h, 1.0/d, 1.0/a], dtype=float)
    # All prices are finite and > 1, so every inverse is positive and the sum
    # cannot be zero.
    p = inv / inv.sum()
    return float(p[0]), float(p[1]), float(p[2])
# --- Elo snapshot from historical df (up to date-1) ---
def compute_elo_snapshot(df_hist: pd.DataFrame, base_elo: float = 1500.0, K: float = 20.0, home_adv: float = 60.0) -> Dict[str, float]:
    """Replay the matches in ``df_hist`` in date order and return final Elo ratings.

    Args:
        df_hist: Match rows with ``date``, ``home``, ``away`` and ``ftr``
            columns; ``ftr`` is compared against ``"H"`` and ``"D"``.
        base_elo: Rating given to a team the first time it appears.
        K: Update step size.
        home_adv: Rating points added to the home side when computing the
            expected result.

    Returns:
        A plain dict mapping every team seen in ``df_hist`` to its rating
        after the last match.
    """
    ratings: Dict[str, float] = {}
    ordered = df_hist.sort_values("date").reset_index(drop=True)
    for _, match in ordered.iterrows():
        home_team, away_team = match["home"], match["away"]
        # setdefault seeds unseen teams with the base rating (home first,
        # then away) and otherwise returns the current rating.
        rating_home = ratings.setdefault(home_team, base_elo)
        rating_away = ratings.setdefault(away_team, base_elo)
        expected_home = 1.0/(1.0 + 10**(-((rating_home+home_adv)-rating_away)/400))
        # Score from the home side's point of view; any result code other
        # than "H" or "D" is scored as an away win.
        result = match["ftr"]
        if result == "H":
            actual_home = 1.0
        elif result == "D":
            actual_home = 0.5
        else:
            actual_home = 0.0
        ratings[home_team] = rating_home + K*(actual_home - expected_home)
        # The away side's actual and expected scores are the complements of
        # the home side's, so the two updates cancel out.
        ratings[away_team] = rating_away + K*((1.0-actual_home) - (1.0-expected_home))
    return ratings
# --- Build rolling features for a single team from history ---
def team_rolling_features(df_hist: pd.DataFrame, team_name: str, window: int = 6):
    """Average a team's most recent per-match stats, separately for home and away games.

    Every row of ``df_hist`` is expanded into two records, one from the home
    side's perspective and one from the away side's. The records for
    ``team_name`` are then split by role and, for each aggregated column, the
    mean of the last ``window`` records (by date) is taken.

    Args:
        df_hist: Match rows with ``date``, ``home``, ``away``, ``fthg``,
            ``ftag`` and ``ftr``. The stat columns ``hs``/``as``,
            ``hst``/``ast``, ``hc``/``ac``, ``hy``/``ay``, ``hr``/``ar`` are
            optional and read as NaN when absent.
        team_name: Team to compute features for, compared for equality with
            the ``home``/``away`` values.
        window: Number of most recent matches per role to average over.

    Returns:
        ``(feats_home, feats_away)``: two dicts keyed ``roll_<column>``. A
        role in which the team has no matches gets NaN for every key.
        ``(None, None)`` when ``df_hist`` yields no records at all.
    """
    # (feature stem, home-side source column, away-side source column)
    stat_sources = (
        ("shots", "hs", "as"),
        ("sot", "hst", "ast"),
        ("corn", "hc", "ac"),
        ("y", "hy", "ay"),
        ("r", "hr", "ar"),
    )

    def perspective(match, playing_home):
        """Return one match as seen by the home side or by the away side."""
        own, opp = (1, 2) if playing_home else (2, 1)
        record = {
            "date": match["date"],
            "team": match["home"] if playing_home else match["away"],
            "is_home": 1 if playing_home else 0,
            "gf": match["fthg"] if playing_home else match["ftag"],
            "ga": match["ftag"] if playing_home else match["fthg"],
        }
        for source in stat_sources:
            record[f"{source[0]}_f"] = match.get(source[own], np.nan)
            record[f"{source[0]}_a"] = match.get(source[opp], np.nan)
        winning_code = "H" if playing_home else "A"
        outcome = match["ftr"]
        if outcome == winning_code:
            record["points"] = 3
        elif outcome == "D":
            record["points"] = 1
        else:
            record["points"] = 0
        return record

    long_rows: List[dict] = []
    for _, match in df_hist.iterrows():
        long_rows.append(perspective(match, True))
        long_rows.append(perspective(match, False))

    long_df = pd.DataFrame(long_rows)
    if long_df.empty:
        return None, None
    long_df = long_df.sort_values(["team", "date"]).reset_index(drop=True)

    # Opponent cards (y_a, r_a) are recorded above but not aggregated.
    feature_cols = ["gf", "ga", "shots_f", "shots_a", "sot_f", "sot_a", "corn_f", "corn_a", "y_f", "r_f", "points"]
    team_rows = long_df[long_df["team"] == team_name]

    def rolling_means(role_flag):
        """Mean of the last ``window`` records in one role, or all-NaN if none."""
        role_rows = team_rows[team_rows["is_home"] == role_flag]
        if role_rows.empty:
            return {f"roll_{col}": np.nan for col in feature_cols}
        return {f"roll_{col}": role_rows[col].tail(window).mean() for col in feature_cols}

    return rolling_means(1), rolling_means(0)
# --- Build full feature vector for a fixture ---
def build_features_for_fixture(
    home_team: str,
    away_team: str,
    date_str: str,
    df_all: pd.DataFrame,
    X_cols: List[str],
    window: int = 6,
    odds_tuple: Optional[Tuple[float, float, float]] = None,
    feat_df_for_medians: Optional[pd.DataFrame] = None,
):
    """Build the single-row feature matrix for one fixture.

    Only matches dated strictly before ``date_str`` are used. The row
    combines the home team's rolling home-role stats, the away team's rolling
    away-role stats, both Elo ratings and their difference, and
    home/draw/away probabilities. The probabilities come from ``odds_tuple``
    when given; otherwise they are derived from the Elo difference and the
    historical draw rate.

    Args:
        home_team: Home team name; normalised with ``clean_team``.
        away_team: Away team name; normalised with ``clean_team``.
        date_str: Match date formatted ``YYYY-MM-DD``.
        df_all: All known matches. NOTE(review): ``date`` must be comparable
            with a ``datetime`` and team names are presumably already
            normalised the same way as ``clean_team`` -- confirm with caller.
        X_cols: Feature names, in the column order of the returned row.
            Names this function does not produce are treated as missing.
        window: Number of recent matches per role for the rolling means.
        odds_tuple: Optional ``(home, draw, away)`` decimal odds.
        feat_df_for_medians: Optional frame whose per-column medians fill
            missing features.

    Returns:
        ``(x, context)``: ``x`` is a float array of shape ``(1, len(X_cols))``
        and ``context`` holds the probabilities and Elo ratings used.

    Raises:
        ValueError: If ``date_str`` does not match ``%Y-%m-%d``, no history
            precedes the match date, or ``odds_tuple`` is rejected by
            ``implied_from_odds``.
    """
    home = clean_team(home_team)
    away = clean_team(away_team)
    match_date = datetime.strptime(date_str, "%Y-%m-%d")
    # Strictly-before filter keeps the fixture's own date out of its features.
    df_hist = df_all[df_all["date"] < match_date].copy()
    if df_hist.empty:
        raise ValueError("No historical data found before match date. Try a later date or load more seasons.")

    # Elo snapshot; teams absent from the history get the base rating.
    elo_map = compute_elo_snapshot(df_hist)
    elo_home = float(elo_map.get(home, 1500.0))
    elo_away = float(elo_map.get(away, 1500.0))
    elo_diff = elo_home - elo_away

    # Rolling features by role: home team at home, away team away.
    roll_cols = ["gf", "ga", "shots_f", "shots_a", "sot_f", "sot_a", "corn_f", "corn_a", "y_f", "r_f", "points"]
    feats_home_homeRole, _ = team_rolling_features(df_hist, home, window)
    _, feats_away_awayRole = team_rolling_features(df_hist, away, window)
    if feats_home_homeRole is None:
        feats_home_homeRole = {f"roll_{c}": np.nan for c in roll_cols}
    if feats_away_awayRole is None:
        feats_away_awayRole = {f"roll_{c}": np.nan for c in roll_cols}

    # Odds -> probs
    if odds_tuple is not None:
        probs = implied_from_odds(odds_tuple)
        if probs is None:
            raise ValueError("Invalid odds provided. Use decimal odds > 1.0")
        pH, pD, pA = probs
    else:
        # Proxy from Elo + average draw rate; 0.25 is used when the observed
        # draw rate is NaN or zero.
        draws = (df_hist["ftr"] == "D").mean()
        draws = float(draws) if not np.isnan(draws) and draws > 0 else 0.25
        elo_scale = 400.0
        ph_nodraw = 1.0/(1.0 + 10**(-(elo_diff)/elo_scale))
        pa_nodraw = 1.0 - ph_nodraw
        pH = ph_nodraw * (1.0 - draws)
        pA = pa_nodraw * (1.0 - draws)
        pD = draws
        s = pH + pD + pA
        pH, pD, pA = pH/s, pD/s, pA/s

    # Assemble features in X_cols order
    feat_row: Dict[str, float] = {}
    for name, value in feats_home_homeRole.items():
        feat_row[f"home_{name}"] = value
    for name, value in feats_away_awayRole.items():
        feat_row[f"away_{name}"] = value
    feat_row["elo_home"] = elo_home
    feat_row["elo_away"] = elo_away
    feat_row["elo_diff"] = elo_diff
    feat_row["p_odds_H"] = float(pH)
    feat_row["p_odds_D"] = float(pD)
    feat_row["p_odds_A"] = float(pA)
    x_vals = [feat_row.get(c, np.nan) for c in X_cols]
    x = np.array(x_vals, dtype=float).reshape(1, -1)

    # Impute only the entries that are actually missing. The fill is the
    # reference median when one exists, else 0.0. A NaN median (e.g. an
    # all-NaN reference column) also falls back to 0.0 so that no NaN is
    # written back into the feature row.
    for j, c in enumerate(X_cols):
        if not np.isnan(x[0, j]):
            continue
        fill = 0.0
        if feat_df_for_medians is not None and c in feat_df_for_medians.columns:
            median = float(feat_df_for_medians[c].median())
            if not np.isnan(median):
                fill = median
        x[0, j] = fill

    context = {
        "p_odds_H": pH,
        "p_odds_D": pD,
        "p_odds_A": pA,
        "elo_home": elo_home,
        "elo_away": elo_away,
    }
    return x, context
|