# Match-Prediction / ingest_data.py
# Teera's picture
# Upload 8 files
# 46b7fa3 verified
import io
from collections import defaultdict
from datetime import datetime
import numpy as np
import pandas as pd
import requests
from unidecode import unidecode
from tqdm import tqdm
# -----------------------------
# 1) Download EPL seasons
# -----------------------------
# URL template for a single season's EPL CSV. The `{scode}` placeholder is
# replaced with the four-digit season code (e.g. '9394') built by season_code().
BASE_URL = "https://www.football-data.co.uk/mmz4281/{scode}/E0.csv"
def season_code(y1: int) -> str:
    """Return the four-digit season code for the season starting in year *y1*.

    The code is the last two digits of the starting year followed by the last
    two digits of the following year, each zero-padded:
    1993 -> '9394', 1999 -> '9900', 2024 -> '2425'.

    Args:
        y1: Calendar year in which the season starts.

    Returns:
        A four-character string suitable for the ``{scode}`` URL placeholder.
    """
    # Both halves are computed arithmetically so each is always exactly two
    # digits (string slicing would leave a single-digit year unpadded).
    return f"{y1 % 100:02d}{(y1 + 1) % 100:02d}"
def try_read_csv_bytes(content):
    """Parse raw CSV bytes into a DataFrame, trying UTF-8 and then Latin-1.

    Args:
        content: The CSV file contents as bytes.

    Returns:
        The parsed DataFrame from the first encoding that succeeds, or an
        empty DataFrame if the bytes cannot be parsed under any of them.
    """
    # "ISO-8859-1" is just another name for "latin-1", so retrying under that
    # alias could never succeed where latin-1 had already failed.
    for enc in ("utf-8", "latin-1"):
        try:
            return pd.read_csv(io.BytesIO(content), encoding=enc)
        except ValueError:
            # Covers UnicodeDecodeError as well as pandas' ParserError and
            # EmptyDataError (all ValueError subclasses). Anything else is a
            # programming error and should surface rather than be swallowed.
            continue
    return pd.DataFrame()
def fetch_season(y1, verbose=True):
    """Download one season's CSV and return it as a DataFrame.

    Args:
        y1: Calendar year in which the season starts (e.g. 1993 for 1993-94).
        verbose: When true, print a line explaining why a season was skipped.

    Returns:
        The parsed season with ``SeasonFirstYear``, ``Season`` and
        ``SeasonCode`` columns added, or an empty DataFrame when the season
        could not be downloaded or parsed.
    """
    scode = season_code(y1)
    url = BASE_URL.format(scode=scode)
    try:
        r = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # A timeout or connection failure on one season must not abort the
        # whole multi-season download; treat it like an unavailable season.
        if verbose:
            print(f"[skip] {y1}-{(y1+1)%100:02d} ({scode}) request failed: {exc}")
        return pd.DataFrame()
    # Non-200 responses and bodies shorter than 100 bytes are both treated as
    # "season not available".
    if r.status_code != 200 or len(r.content) < 100:
        if verbose:
            print(f"[skip] {y1}-{(y1+1)%100:02d} ({scode}) not available")
        return pd.DataFrame()
    df = try_read_csv_bytes(r.content)
    if df.empty:
        if verbose:
            print(f"[warn] parse error {scode}")
        return pd.DataFrame()
    # Tag every row with its season so frames can be concatenated later.
    df["SeasonFirstYear"] = y1
    df["Season"] = f"{y1}-{str(y1+1)[-2:]}"
    df["SeasonCode"] = scode
    return df
def load_epl_data(start_y1: int = 1993, end_y1: int | None = None, verbose: bool = False) -> pd.DataFrame:
    """Fetch every EPL season in a year range and stack them into one DataFrame.

    Args:
        start_y1: Starting year of the first season to fetch (inclusive).
        end_y1: Starting year of the last season to fetch (inclusive). When
            omitted, the current calendar year plus one is used.
        verbose: Forwarded to ``fetch_season`` to enable per-season logs.

    Returns:
        All successfully fetched seasons concatenated with a fresh index.

    Raises:
        RuntimeError: If no season in the range produced any data.
    """
    last_year = datetime.now().year + 1 if end_y1 is None else end_y1

    seasons = []
    for year in tqdm(range(start_y1, last_year + 1)):
        season_df = fetch_season(year, verbose=verbose)
        if season_df.empty:
            # Unavailable or unparseable season: nothing to keep.
            continue
        seasons.append(season_df)

    if not seasons:
        raise RuntimeError("No data downloaded. Try expanding start_y1 backward or check internet.")
    return pd.concat(seasons, ignore_index=True)
if __name__ == "__main__":
    # Demo run: fetch everything from 1993-94 up to the season that would
    # start next calendar year, then list the seasons that were found.
    latest_start_year = datetime.now().year + 1
    data_raw = load_epl_data(start_y1=1993, end_y1=latest_start_year, verbose=True)
    seasons_loaded = sorted(data_raw["Season"].unique())
    print("Seasons loaded:", seasons_loaded)