Spaces:

LordTenson
/

saudi-eou-detection

Sleeping

File size: 6,793 Bytes

a2d71c5

"""
Hugging Face Space - Arabic EOU Detection Demo
File: app.py

Builds an interactive Gradio web demo for the Arabic end-of-utterance (EOU) detection model.
"""

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ============================================================================
# LOAD MODEL
# ============================================================================

MODEL_NAME = "LordTenson/Saudi-EOU"  # Replace with your model name


def _load_checkpoint(name, target_device=None):
    """Load the tokenizer and classifier stored under *name*.

    The model is moved to *target_device* and switched to eval mode. When
    *target_device* is None, CUDA is picked if available, otherwise CPU.

    Returns:
        ``(tokenizer, model, device)``
    """
    tok = AutoTokenizer.from_pretrained(name)
    net = AutoModelForSequenceClassification.from_pretrained(name)
    if target_device is None:
        target_device = "cuda" if torch.cuda.is_available() else "cpu"
    net.to(target_device)
    net.eval()
    return tok, net, target_device


print("Loading model...")
try:
    # First choice: the checkpoint named by MODEL_NAME.
    tokenizer, model, device = _load_checkpoint(MODEL_NAME)
    print(f"✅ Model loaded on {device}")
except Exception as e:
    # Any failure above switches to the local directory, always on CPU.
    print(f"Error loading model: {e}")
    print("Falling back to local model...")
    MODEL_NAME = "./arabert_eou_final"
    tokenizer, model, device = _load_checkpoint(MODEL_NAME, "cpu")

# ============================================================================
# PREDICTION FUNCTION
# ============================================================================

def predict_eou(text, threshold=0.5):
    """
    Predict whether a piece of text is an end-of-utterance (EOU).

    Args:
        text: Arabic text to analyze. ``None``, empty, or whitespace-only
            input returns early without running the model.
        threshold: Minimum EOU probability (inclusive) for the text to be
            reported as end of turn.

    Returns:
        A tuple ``(message, eou_prob, not_eou_prob)``: a Markdown status
        string, then the softmax probability at logit index 1 (treated here
        as EOU) and at logit index 0 (treated here as not-EOU). For blank
        input the message asks for text and both probabilities are 0.0.
    """
    if not text or not text.strip():
        return "❌ Please enter some text", 0.0, 0.0

    # Tokenize a single example; inputs longer than 256 tokens are truncated.
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=256,
        padding=True,
    )

    # Tensors must live on the same device as the model.
    encoded = {key: tensor.to(device) for key, tensor in encoded.items()}

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**encoded).logits
        probs = torch.softmax(logits, dim=1)

    # Batch of one: row 0 holds the two class probabilities.
    not_eou_prob = probs[0][0].item()
    eou_prob = probs[0][1].item()

    # The threshold is inclusive: a probability equal to it counts as EOU.
    if eou_prob >= threshold:
        result = "✅ **END OF TURN** - Speaker has finished"
    else:
        result = "⏳ **CONTINUE** - Speaker is still talking"

    return result, eou_prob, not_eou_prob

# ============================================================================
# GRADIO INTERFACE
# ============================================================================

def create_demo():
    """Build the Gradio Blocks interface for the EOU demo.

    The layout has a text box and threshold slider on the left, the
    prediction message and the two class probabilities on the right, a set
    of clickable examples, and a collapsible model-information panel. Both
    the Analyze button and pressing Enter in the text box run
    ``predict_eou``.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    # A bare repo id is not a valid URL and would render as a relative link,
    # so prefix it with the Hub address. A local checkpoint path has no Hub
    # page, so show it as plain text instead of a dead link.
    if MODEL_NAME.startswith((".", "/")):
        model_link = f"Local checkpoint `{MODEL_NAME}`"
    else:
        model_link = f"[Model on Hugging Face](https://huggingface.co/{MODEL_NAME})"

    with gr.Blocks(title="Arabic EOU Detection", theme=gr.themes.Soft()) as demo:

        # Header
        gr.Markdown("""
        # 🎤 Arabic End-of-Utterance Detection

        This model detects whether a speaker has finished their turn in Arabic conversations.
        Fine-tuned AraBERT model on Saudi dialect conversations.

        **Use Case**: Real-time voice agents, conversation systems, live transcription
        """)

        with gr.Row():
            with gr.Column(scale=2):
                # Input
                text_input = gr.Textbox(
                    label="Enter Arabic Text",
                    placeholder="مثال: السلام عليكم كيف حالك",
                    lines=3,
                    rtl=True,  # Right-to-left for Arabic
                )

                threshold_slider = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    step=0.05,
                    label="Detection Threshold",
                    info="Lower = more sensitive, Higher = less sensitive",
                )

                submit_btn = gr.Button("🔍 Analyze", variant="primary")

            with gr.Column(scale=1):
                # Output
                result_output = gr.Markdown(label="Prediction")

                with gr.Row():
                    eou_prob = gr.Number(label="EOU Probability", precision=3)
                    not_eou_prob = gr.Number(label="Not-EOU Probability", precision=3)

        # Every entry point into predict_eou shares the same wiring.
        prediction_inputs = [text_input, threshold_slider]
        prediction_outputs = [result_output, eou_prob, not_eou_prob]

        # Examples
        gr.Markdown("### 📝 Try These Examples:")

        gr.Examples(
            examples=[
                ["السلام عليكم كيف حالك", 0.5],
                ["أنا رايح", 0.5],
                ["شكراً لك والله", 0.5],
                ["يعني مثلاً", 0.5],
                ["تمام فهمت عليك", 0.5],
                ["أبي أقول لك", 0.5],
                ["والله ما أدري كيف", 0.5],
                ["خلاص انتهينا من الموضوع", 0.5],
            ],
            inputs=prediction_inputs,
            outputs=prediction_outputs,
            fn=predict_eou,
            cache_examples=False,
        )

        # Model Info
        # TODO: replace the placeholder dataset and GitHub links below with
        # real URLs; as written they render as dead links.
        with gr.Accordion("ℹ️ Model Information", open=False):
            gr.Markdown(f"""
            ### Model Details
            - **Base Model**: aubmindlab/bert-base-arabertv2
            - **Fine-tuned on**: Saudi Arabic dialect conversations
            - **Accuracy**: 62%
            - **F1 Score**: 0.62 (balanced)
            - **Latency**: ~45ms average

            ### How It Works
            1. The model analyzes Arabic text
            2. Predicts probability of turn completion
            3. If probability ≥ threshold → Turn ends
            4. Used in real-time voice agents for natural conversations

            ### Classes
            - **EOU (End-of-Utterance)**: Speaker has finished their turn
            - **Not-EOU**: Speaker is continuing, more words expected

            ### Links
            - 🤗 {model_link}
            - 📊 [Dataset](your-dataset-link)
            - 💻 [GitHub Repository](your-github-link)
            """)

        # Connect interface: the button click and Enter in the text box both
        # trigger the same prediction.
        for register in (submit_btn.click, text_input.submit):
            register(
                fn=predict_eou,
                inputs=prediction_inputs,
                outputs=prediction_outputs,
            )

    return demo

# ============================================================================
# LAUNCH
# ============================================================================

if __name__ == "__main__":
    # Runs only when this file is executed as a script (not on import):
    # build the interface, then call launch() on it.
    demo = create_demo()
    demo.launch()