Spaces:

aqibtahir
/

cookie-classifier-api

Sleeping

App Files Files Community

aqibtahir committed on Oct 30, 2025

Commit

12c511c

verified ·

1 Parent(s): 05ef637

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +264 -0

app.py ADDED Viewed

	@@ -0,0 +1,264 @@

+"""
+FastAPI Serverless API for Cookie Classification
+Deploy this to Hugging Face Spaces for FREE serverless inference!
+"""
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from typing import List, Optional
+from huggingface_hub import hf_hub_download
+import joblib
+import numpy as np
+import re
+import pandas as pd
+from scipy.sparse import hstack, csr_matrix
+import os
+# Initialize FastAPI
+app = FastAPI(
+    title="Cookie Classifier API",
+    description="Classify web cookies into privacy categories: Strictly Necessary, Functionality, Analytics, Advertising/Tracking",
+    version="1.0.0"
+)
+# Enable CORS for frontend access
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # In production, specify your frontend domain
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Class mapping
+CLASS_NAMES = {
+    0: "Strictly Necessary",
+    1: "Functionality",
+    2: "Analytics",
+    3: "Advertising/Tracking"
+}
+# Tracker tokens
+TRACKER_TOKENS = {
+    "ga", "gid", "utm", "ad", "ads", "pixel", "trk", "track", "fbp", "fbc",
+    "gclid", "sess", "session", "id", "uuid", "cid", "cmp", "campaign",
+    "click", "impress"
+}
+# Global model storage
+model = None
+tfidf_word = None
+tfidf_char = None
+def extract_name_features(s: str):
+    """Extract engineered features from cookie name"""
+    if not isinstance(s, str):
+        s = ""
+    lower = s.lower()
+    L = len(s)
+    digits = sum(ch.isdigit() for ch in s)
+    alphas = sum(ch.isalpha() for ch in s)
+    underscores = lower.count("_")
+    dashes = lower.count("-")
+    dots = lower.count(".")
+    prefix3 = lower[:3] if L >= 3 else lower
+    suffix3 = lower[-3:] if L >= 3 else lower
+    tokens = re.split(r"[^a-z0-9]+", lower)
+    tokens = [t for t in tokens if t]
+    uniq_tokens = len(set(tokens))
+    token_len_mean = np.mean([len(t) for t in tokens]) if tokens else 0.0
+    has_tracker = int(any(t in TRACKER_TOKENS for t in tokens))
+    camel = int(bool(re.search(r"[a-z][A-Z]", s)))
+    snake = int("_" in s)
+    has_hex = int(bool(re.search(r"\b[0-9a-f]{8,}\b", lower)))
+    return {
+        "len": L, "digits": digits, "alphas": alphas, "underscores": underscores,
+        "dashes": dashes, "dots": dots, "prefix3": prefix3, "suffix3": suffix3,
+        "uniq_tokens": uniq_tokens, "token_len_mean": float(token_len_mean),
+        "has_tracker_token": has_tracker, "camelCase": camel, "snake_case": snake,
+        "has_hex": has_hex
+    }
+def build_name_features(series):
+    """Build name features DataFrame"""
+    X = pd.DataFrame([extract_name_features(x) for x in series.fillna("")])
+    for col in ["prefix3", "suffix3"]:
+        top = X[col].value_counts().head(30).index
+        X[col] = X[col].where(X[col].isin(top), "__other__")
+    X = pd.get_dummies(X, columns=["prefix3", "suffix3"], drop_first=True)
+    return X
+def preprocess_cookie(cookie_name: str):
+    """Complete preprocessing for a single cookie name"""
+    series = pd.Series([cookie_name])
+    # TF-IDF features
+    Xw = tfidf_word.transform(series.fillna("").astype(str))
+    Xc = tfidf_char.transform(series.fillna("").astype(str))
+    Xtf = hstack([Xw, Xc])
+    # Name features
+    Xname = build_name_features(series)
+    Xname = Xname.select_dtypes(include=[np.number]).astype("float64")
+    # Combine
+    X_combined = hstack([Xtf, csr_matrix(Xname.values)])
+    return X_combined
+@app.on_event("startup")
+async def load_model():
+    """Load model and vectorizers on startup"""
+    global model, tfidf_word, tfidf_char
+    print("Loading model from Hugging Face...")
+    # Download model
+    model_path = hf_hub_download(
+        repo_id="aqibtahir/cookie-classifier-lr-tfidf",
+        filename="LR_TFIDF+NAME.joblib"
+    )
+    model = joblib.load(model_path)
+    # Try to load vectorizers (they should be in the same directory or uploaded separately)
+    try:
+        tfidf_word_path = hf_hub_download(
+            repo_id="aqibtahir/cookie-classifier-lr-tfidf",
+            filename="tfidf_word.joblib"
+        )
+        tfidf_char_path = hf_hub_download(
+            repo_id="aqibtahir/cookie-classifier-lr-tfidf",
+            filename="tfidf_char.joblib"
+        )
+        tfidf_word = joblib.load(tfidf_word_path)
+        tfidf_char = joblib.load(tfidf_char_path)
+        print("✓ Model and vectorizers loaded successfully!")
+    except Exception as e:
+        print(f"⚠️ Warning: Could not load vectorizers: {e}")
+        print("API will work with limited functionality")
+# Request/Response models
+class CookieRequest(BaseModel):
+    # Request body accepted by POST /predict.
+    # cookie_name: the cookie name to classify (passed to preprocess_cookie).
+    cookie_name: str
+class BatchCookieRequest(BaseModel):
+    # Request body accepted by POST /predict/batch.
+    # cookie_names: cookie names to classify; each one is processed
+    # independently, in the order given. No size limit is enforced here.
+    cookie_names: List[str]
+class PredictionResponse(BaseModel):
+    # Response body returned by POST /predict.
+    # cookie_name: the name that was classified, echoed from the request.
+    cookie_name: str
+    # category: human-readable label looked up in CLASS_NAMES.
+    category: str
+    # class_id: integer class predicted by the model.
+    class_id: int
+    # confidence: pseudo-probability for the predicted class, or None when
+    # it could not be computed.
+    confidence: Optional[float] = None
+@app.get("/")
+async def root():
+    """Health check and API info"""
+    return {
+        "status": "online",
+        "model": "Cookie Classifier - Linear Regression",
+        "categories": list(CLASS_NAMES.values()),
+        "endpoints": {
+            "predict": "/predict",
+            "batch": "/predict/batch",
+            "docs": "/docs"
+        }
+    }
+@app.post("/predict", response_model=PredictionResponse)
+async def predict(request: CookieRequest):
+    """
+    Predict cookie category for a single cookie name
+    Example:
+    ```
+    POST /predict
+    {"cookie_name": "_ga"}
+    ```
+    """
+    if not model:
+        raise HTTPException(status_code=503, detail="Model not loaded")
+    if not tfidf_word or not tfidf_char:
+        raise HTTPException(
+            status_code=503,
+            detail="Vectorizers not available. Please upload tfidf_word.joblib and tfidf_char.joblib to the model repository"
+        )
+    try:
+        # Preprocess and predict
+        features = preprocess_cookie(request.cookie_name)
+        prediction = model.predict(features)[0]
+        class_id = int(prediction)
+        # Get confidence if available
+        confidence = None
+        try:
+            decision = model.decision_function(features)[0]
+            # Normalize decision scores to pseudo-probabilities
+            scores = np.exp(decision) / np.exp(decision).sum()
+            confidence = float(scores[class_id])
+        except:
+            pass
+        return PredictionResponse(
+            cookie_name=request.cookie_name,
+            category=CLASS_NAMES[class_id],
+            class_id=class_id,
+            confidence=confidence
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")
+@app.post("/predict/batch")
+async def predict_batch(request: BatchCookieRequest):
+    """
+    Predict categories for multiple cookie names
+    Example:
+    ```
+    POST /predict/batch
+    {"cookie_names": ["_ga", "sessionid", "utm_campaign"]}
+    ```
+    """
+    if not model:
+        raise HTTPException(status_code=503, detail="Model not loaded")
+    if not tfidf_word or not tfidf_char:
+        raise HTTPException(
+            status_code=503,
+            detail="Vectorizers not available"
+        )
+    try:
+        results = []
+        for cookie_name in request.cookie_names:
+            features = preprocess_cookie(cookie_name)
+            prediction = model.predict(features)[0]
+            class_id = int(prediction)
+            confidence = None
+            try:
+                decision = model.decision_function(features)[0]
+                scores = np.exp(decision) / np.exp(decision).sum()
+                confidence = float(scores[class_id])
+            except:
+                pass
+            results.append({
+                "cookie_name": cookie_name,
+                "category": CLASS_NAMES[class_id],
+                "class_id": class_id,
+                "confidence": confidence
+            })
+        return {"predictions": results}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Batch prediction error: {str(e)}")
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)