import os
import pickle
import subprocess
import sys

import gradio as gr
import joblib
import pandas as pd
import spacy
import torch
from arabert.preprocess import ArabertPreprocessor
from scipy.sparse import hstack
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 📂 FOLDER CONFIGURATION
# Both paths are relative, so they resolve against the process's working directory.
# Folder the hybrid assets are read from: hybrid_xgb.pkl, tfidf_vectorizer.pkl
# and linguistic_feature_columns.pkl.
# NOTE(review): the name says "ENGLISH" but the UI presents this model as an
# Arabic detector — confirm which is intended.
PATH_XGB_ENGLISH = "Models/Hybrid_XGBoost_model/"
# Folder handed to from_pretrained() for the fine-tuned tokenizer and model.
PATH_BERT_ARABIC = "Models/Fine_tuned_model/"

# Startup banner, printed before any model is loaded.
print("⏳ Starting AI Detection App...")

# ==========================================
# ✅ LOAD SPACY (for linguistic features)
# ==========================================
# get_features() uses this pipeline for tokenisation, POS tags and sentences.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raised OSError; treat it as "model not installed" and fetch it.
    print("Downloading Spacy model...")
    # Run the download with the interpreter that is executing this app rather
    # than whatever "python" happens to be first on PATH, so the model lands in
    # the right environment. check=True surfaces a failed download immediately
    # instead of letting the retry below fail with a less helpful error.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

# ==========================================
# ✅ LOAD HYBRID MODEL (XGBoost + TFIDF + Feature Columns)
# ==========================================
def load_pickle(path: str):
    """Return the object deserialized from the pickle file at *path*.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(path, mode="rb") as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

# Load the three hybrid-model assets. If anything fails, report it and leave
# all three globals as None so predict_XGBoost can detect the missing model.
try:
    print(f"Loading Hybrid XGBoost assets from {PATH_XGB_ENGLISH}...")

    # Resolve every asset file name against the hybrid model folder.
    xgb_path, tfidf_path, cols_path = (
        os.path.join(PATH_XGB_ENGLISH, asset_name)
        for asset_name in (
            "hybrid_xgb.pkl",
            "tfidf_vectorizer.pkl",
            "linguistic_feature_columns.pkl",
        )
    )

    # The classifier and the feature-column list are read with pickle;
    # the TF-IDF vectorizer is read with joblib.
    xgb_model = load_pickle(xgb_path)
    tfidf_vectorizer = joblib.load(tfidf_path)
    feature_columns = load_pickle(cols_path)

    print("✅ Hybrid Model Loaded Successfully")

except Exception as load_error:
    print(f"❌ Error loading Hybrid Model: {load_error}")
    # Reset everything, including assets that loaded before the failure.
    xgb_model = tfidf_vectorizer = feature_columns = None

# ==========================================
# ✅ LOAD ARABIC MODEL (Fine-tuned AraBERT)
# ==========================================
# Inference device; it is "cpu" whether or not the load below succeeds.
device = "cpu"

# Load the tokenizer, classifier and AraBERT preprocessor. On any failure,
# report it and leave all three globals as None so predict_arabic can tell.
try:
    print(f"Loading BERT from {PATH_BERT_ARABIC}...")

    tokenizer_bert = AutoTokenizer.from_pretrained(PATH_BERT_ARABIC)
    model_bert = AutoModelForSequenceClassification.from_pretrained(PATH_BERT_ARABIC)
    model_bert = model_bert.to(device)

    arabert_prep = ArabertPreprocessor(model_name="aubmindlab/bert-base-arabertv02")

    print("✅ Fine tuned Model Loaded Successfully")

except Exception as load_error:
    print(f"❌ Error loading Model: {load_error}")
    print("⚠️  Did you forget to upload the 'pytorch_model.bin' or 'model.safetensors' file?")
    model_bert = tokenizer_bert = arabert_prep = None

# ==========================================
# 🧠 FEATURE EXTRACTION & PREDICTION
# ==========================================
def get_features(text: str) -> pd.DataFrame:
    """Build the one-row linguistic feature frame for *text* using spaCy.

    POS ratios, ``TTR_ratio`` and ``avg_sentence_len`` are divided by the
    number of non-punctuation tokens (forced to 1 when there are none).
    ``avg_word_len`` and the distinct-token count behind ``TTR_ratio`` run over
    every token, punctuation included. ``UNKNOWN_ratio`` is always 0.

    Returns:
        A single-row DataFrame whose columns are selected, in order, by the
        module-level ``feature_columns``.
    """
    doc = nlp(text)

    token_total = len(doc)
    # "or 1" keeps the divisions below defined for punctuation-only/empty text.
    word_count = sum(1 for token in doc if not token.is_punct) or 1
    pos_counts = doc.count_by(spacy.attrs.POS)
    sentence_total = len(list(doc.sents))

    # Feature name -> spaCy POS symbol whose count feeds that ratio.
    ratio_symbols = {
        "NOUN_ratio": spacy.symbols.NOUN,
        "VERB_ratio": spacy.symbols.VERB,
        "PART_ratio": spacy.symbols.PART,
        "ADJ_ratio": spacy.symbols.ADJ,
        "NUM_ratio": spacy.symbols.NUM,
        "PRON_ratio": spacy.symbols.PRON,
        "DET_ratio": spacy.symbols.DET,
        "PUNC_ratio": spacy.symbols.PUNCT,
    }
    features = {
        feature_name: pos_counts.get(symbol, 0) / word_count
        for feature_name, symbol in ratio_symbols.items()
    }

    if token_total > 0:
        features["avg_word_len"] = sum(len(token.text) for token in doc) / token_total
    else:
        features["avg_word_len"] = 0

    features["word_count"] = word_count
    features["TTR_ratio"] = len({token.text.lower() for token in doc}) / word_count

    if sentence_total > 0:
        features["avg_sentence_len"] = word_count / sentence_total
    else:
        features["avg_sentence_len"] = 0

    features["UNKNOWN_ratio"] = 0

    # IMPORTANT: selecting by feature_columns restores the training-time order.
    return pd.DataFrame([features])[feature_columns]


def predict_XGBoost(text: str):
    """Score *text* with the hybrid TF-IDF + linguistic-feature XGBoost model.

    Args:
        text: Raw text from the UI textbox.

    Returns:
        On success, a dict mapping the two display labels to probabilities:
        ``predict_proba`` index 0 goes under the human label and index 1 under
        the AI label. Otherwise a plain message string (model not loaded, blank
        input, or a prediction failure).
    """
    if xgb_model is None or tfidf_vectorizer is None or feature_columns is None:
        # Return a plain string like the other non-result branches. The output
        # component is a gr.Label, whose dict form maps labels to float
        # confidences, so a dict carrying a message string is not valid there.
        return "Error: Hybrid model files missing or failed to load."

    if not text or not text.strip():
        return "Please enter text."

    try:
        tfidf_data = tfidf_vectorizer.transform([text])
        ling_data = get_features(text)
        # TF-IDF columns first, then the linguistic feature columns.
        full_data = hstack([tfidf_data, ling_data])

        probs = xgb_model.predict_proba(full_data)[0]
        return {"👤 نص بشري": float(probs[0]), "🤖 ذكاء اصطناعي": float(probs[1])}

    except Exception as e:
        # UI boundary: show the failure in the label instead of raising.
        return f"Prediction Error: {e}"


def predict_arabic(text: str):
    """Score *text* with the fine-tuned AraBERT sequence classifier.

    The text is run through the AraBERT preprocessor, tokenized (truncated and
    padded to 256 tokens) and classified; a softmax over the logits gives the
    class probabilities.

    Args:
        text: Raw text from the UI textbox.

    Returns:
        On success, a dict mapping the two display labels to probabilities:
        class index 0 goes under the human label and index 1 under the AI
        label. Otherwise a plain message string (model not loaded, blank input,
        or an inference failure).
    """
    if model_bert is None or tokenizer_bert is None or arabert_prep is None:
        # Return a plain string like the other non-result branches. The output
        # component is a gr.Label, whose dict form maps labels to float
        # confidences, so a dict carrying a message string is not valid there.
        return "Error: Arabic model files missing (check pytorch_model.bin / model.safetensors)."

    if not text or not text.strip():
        return "الرجاء إدخال نص."

    try:
        prep_text = arabert_prep.preprocess(text)

        inputs = tokenizer_bert(
            prep_text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=256,
        ).to(device)

        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = model_bert(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=1)

        prob_human = float(probs[0][0])
        prob_ai = float(probs[0][1])

        return {"👤 نص بشري": prob_human, "🤖 ذكاء اصطناعي": prob_ai}

    except Exception as e:
        # UI boundary: show the failure in the label instead of raising.
        return f"Error: {e}"

# ==========================================
# 🎨 GRADIO UI (TABS)
# ==========================================
def _detector_tab(tab_title, heading, blurb, input_label, placeholder, output_label, handler):
    """Create one detector tab and return its (textbox, label, button).

    Must be called inside an open ``gr.Tabs()`` context. Components are created
    in the order: heading, blurb, input/output row, button.
    """
    with gr.TabItem(tab_title):
        gr.Markdown(heading)
        gr.Markdown(blurb)

        with gr.Row():
            text_box = gr.Textbox(lines=5, label=input_label, placeholder=placeholder, rtl=True)
            result_label = gr.Label(label=output_label)

        run_button = gr.Button("حلل النص", variant="primary")
        run_button.click(handler, inputs=text_box, outputs=result_label)

    return text_box, result_label, run_button


with gr.Blocks(title="AI Text Detector", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 Arabic AI Generated Text Detector")
    gr.Markdown("Detect if text was written by a Human or AI in Arabic.")

    with gr.Tabs():

        # --- TAB 1: Hybrid ---
        arr_input, arr_output, eng_btn = _detector_tab(
            tab_title="Hybrid Model Detector",
            heading="### Hybrid XGBoost Model (Higher accuracy)",
            blurb="This model uses hybrid approaches to detect Arabic AI generated text.",
            input_label="Text",
            placeholder="اكتب النص هنا...",
            output_label="الإحتمالية",
            handler=predict_XGBoost,
        )

        # --- TAB 2: ARABIC ---
        ar_input, ar_output, ar_btn = _detector_tab(
            tab_title="AraBERT Detector",
            heading="### Fine-Tuned AraBERT Model",
            blurb="Fine-tuned AraBERT model using our own dataset to detect Arabic AI generated text.",
            input_label="النص العربي",
            placeholder="ضع النص هنا...",
            output_label="النتيجة",
            handler=predict_arabic,
        )

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()