File size: 2,333 Bytes
46b7fa3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import io
from collections import defaultdict
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from unidecode import unidecode
from tqdm import tqdm


# -----------------------------
# 1) Download EPL seasons
# -----------------------------
# URL template for a single season's CSV file. The `{scode}` placeholder is
# filled in by fetch_season() with the four-character code produced by
# season_code(), e.g. "9394" for the season starting in 1993.
BASE_URL = "https://www.football-data.co.uk/mmz4281/{scode}/E0.csv"

def season_code(y1):
    """Return the season code for the season that starts in year ``y1``.

    The code is the last two characters of ``str(y1)`` followed by the
    zero-padded last two digits of the following year, for example
    1993 -> '9394', 1999 -> '9900', 2024 -> '2425'.
    """
    first_half = str(y1)[-2:]
    second_half = format((y1 + 1) % 100, "02d")
    return first_half + second_half

def try_read_csv_bytes(content):
    """Parse raw CSV bytes into a DataFrame, falling back to a second encoding.

    UTF-8 is tried first, then Latin-1. "ISO-8859-1" is an alias of the same
    codec as "latin-1", so it is not attempted separately: it could never
    succeed where "latin-1" had just failed.

    Args:
        content: Raw bytes of a CSV file.

    Returns:
        The DataFrame produced by the first encoding that parses, or an empty
        DataFrame if every attempt fails.
    """
    for enc in ("utf-8", "latin-1"):
        try:
            return pd.read_csv(io.BytesIO(content), encoding=enc)
        except Exception:
            # Deliberately best-effort: any failure (undecodable bytes,
            # malformed or empty CSV) simply moves on to the next encoding.
            # Callers detect total failure via the empty DataFrame below.
            continue
    return pd.DataFrame()

def fetch_season(y1, verbose=True):
    """Download and parse the CSV for the season starting in year ``y1``.

    Args:
        y1: Starting year of the season (e.g. 1993 for 1993-94).
        verbose: When true, print a one-line message whenever the season is
            skipped or its file cannot be parsed.

    Returns:
        The season's DataFrame with ``SeasonFirstYear``, ``Season`` and
        ``SeasonCode`` columns added. An empty DataFrame is returned when the
        request fails, the response is not HTTP 200 or is shorter than 100
        bytes, or parsing produces no data.
    """
    scode = season_code(y1)
    url = BASE_URL.format(scode=scode)
    try:
        r = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # A network failure for one season is handled like an unavailable
        # season, so a caller looping over many seasons is not aborted by a
        # single timeout or connection error.
        if verbose:
            print(f"[skip] {y1}-{(y1+1)%100:02d} ({scode}) request failed: {exc}")
        return pd.DataFrame()
    # Non-200 responses and very short bodies are treated as "no file here".
    if r.status_code != 200 or len(r.content) < 100:
        if verbose:
            print(f"[skip] {y1}-{(y1+1)%100:02d} ({scode}) not available")
        return pd.DataFrame()
    df = try_read_csv_bytes(r.content)
    if df.empty:
        if verbose:
            print(f"[warn] parse error {scode}")
        return pd.DataFrame()
    # Tag every row with its season so frames can be concatenated later.
    df["SeasonFirstYear"] = y1
    df["Season"] = f"{y1}-{str(y1+1)[-2:]}"
    df["SeasonCode"] = scode
    return df

def load_epl_data(start_y1: int = 1993, end_y1: int | None = None, verbose: bool = False) -> pd.DataFrame:
    """Fetch each season in a range of starting years and stack them together.

    Args:
        start_y1: Starting year of the first season to fetch (inclusive).
        end_y1: Starting year of the last season to fetch (inclusive). When
            omitted, the current calendar year plus one is used.
        verbose: Forwarded to ``fetch_season`` to enable per-season messages.

    Returns:
        All non-empty season frames concatenated under a fresh integer index.

    Raises:
        RuntimeError: If no season produced any data.
    """
    last_year = datetime.now().year + 1 if end_y1 is None else end_y1

    season_frames = []
    for first_year in tqdm(range(start_y1, last_year + 1)):
        season_df = fetch_season(first_year, verbose=verbose)
        if season_df.empty:
            continue
        season_frames.append(season_df)

    if not season_frames:
        raise RuntimeError("No data downloaded. Try expanding start_y1 backward or check internet.")
    return pd.concat(season_frames, ignore_index=True)

if __name__ == "__main__":
    # Demo run when executed as a script: load every season from 1993
    # onward and list the season labels that were actually retrieved.
    data_raw = load_epl_data(
        start_y1=1993,
        end_y1=datetime.now().year + 1,
        verbose=True,
    )
    loaded_seasons = sorted(data_raw["Season"].unique())
    print("Seasons loaded:", loaded_seasons)