# NOTE: the lines originally here were web-viewer residue, not program text:
# "Spaces: Sleeping", "File size: 2,333 Bytes", revision "46b7fa3",
# and a 1-67 line-number gutter.
import io
from collections import defaultdict
from datetime import datetime
import numpy as np
import pandas as pd
import requests
from unidecode import unidecode
from tqdm import tqdm
# -----------------------------
# 1) Download EPL seasons
# -----------------------------
# URL template for one season's E0.csv on football-data.co.uk; the {scode}
# placeholder is filled with the four-character code built by season_code().
BASE_URL = "https://www.football-data.co.uk/mmz4281/{scode}/E0.csv"
def season_code(y1):
    """Return the season code for the season that starts in year *y1*.

    The code is the last two digits of the starting year followed by the
    zero-padded last two digits of the following year,
    e.g. 1993 -> '9394' and 2024 -> '2425'.
    """
    start_part = str(y1)[-2:]
    end_part = (y1 + 1) % 100
    return start_part + f"{end_part:02d}"
def try_read_csv_bytes(content):
    """Parse raw CSV bytes into a DataFrame, trying several text encodings.

    UTF-8 is attempted first, then Latin-1. The first encoding for which
    ``pd.read_csv`` succeeds wins.

    Args:
        content: Raw bytes of a CSV document.

    Returns:
        The parsed DataFrame, or an empty DataFrame when every attempt fails
        to decode or parse.
    """
    # "ISO-8859-1" is an alias of "latin-1", so listing both only repeated
    # the identical attempt; two candidates are enough.
    for enc in ("utf-8", "latin-1"):
        try:
            return pd.read_csv(io.BytesIO(content), encoding=enc)
        except (UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError):
            # Expected decode/parse failures: move on to the next encoding.
            # Anything else is unexpected and is allowed to propagate.
            continue
    return pd.DataFrame()
def fetch_season(y1, verbose=True):
    """Download one season's CSV and tag its rows with season identifiers.

    Args:
        y1: Starting year of the season (e.g. 1993 for 1993-94).
        verbose: When true, print a line whenever the season is skipped or
            cannot be parsed.

    Returns:
        The season's DataFrame with ``SeasonFirstYear``, ``Season`` and
        ``SeasonCode`` columns added, or an empty DataFrame when the
        response is not HTTP 200, is shorter than 100 bytes, or parses to
        an empty frame.

    Exceptions raised by ``requests.get`` are not caught here.
    """
    scode = season_code(y1)
    response = requests.get(BASE_URL.format(scode=scode), timeout=30)

    # A non-200 status or a near-empty body both count as "no such season".
    unavailable = response.status_code != 200 or len(response.content) < 100
    if unavailable:
        if verbose:
            print(f"[skip] {y1}-{(y1+1)%100:02d} ({scode}) not available")
        return pd.DataFrame()

    season_df = try_read_csv_bytes(response.content)
    if season_df.empty:
        if verbose:
            print(f"[warn] parse error {scode}")
        return pd.DataFrame()

    # Tag every row so seasons remain distinguishable after concatenation.
    tags = (
        ("SeasonFirstYear", y1),
        ("Season", f"{y1}-{str(y1+1)[-2:]}"),
        ("SeasonCode", scode),
    )
    for column, value in tags:
        season_df[column] = value
    return season_df
def load_epl_data(start_y1: int = 1993, end_y1: int | None = None, verbose: bool = False) -> pd.DataFrame:
    """Fetch every requested EPL season and stack them into one DataFrame.

    Args:
        start_y1: Starting year of the first season to fetch (inclusive),
            e.g. 1993.
        end_y1: Starting year of the last season to fetch (inclusive).
            When omitted, the current calendar year + 1 is used so the
            latest season is included.
        verbose: Forwarded to ``fetch_season`` to print per-season logs.

    Returns:
        All non-empty season frames concatenated with a fresh index.

    Raises:
        RuntimeError: If no season yielded any data.
    """
    last_y1 = datetime.now().year + 1 if end_y1 is None else end_y1

    seasons = []
    for first_year in tqdm(range(start_y1, last_y1 + 1)):
        season_df = fetch_season(first_year, verbose=verbose)
        if season_df.empty:
            continue
        seasons.append(season_df)

    # Only non-empty frames are collected, so an empty list is the sole way
    # the combined result could be empty.
    if not seasons:
        raise RuntimeError("No data downloaded. Try expanding start_y1 backward or check internet.")
    return pd.concat(seasons, ignore_index=True)
if __name__ == "__main__":
    # Example usage when running this file directly: download every season
    # from 1993 up to the one starting next calendar year, then list the
    # season labels that were actually loaded.
    data_raw = load_epl_data(start_y1=1993, end_y1=datetime.now().year + 1, verbose=True)
    print("Seasons loaded:", sorted(data_raw["Season"].unique()))