import os
import platform

from pydub import AudioSegment, silence

if platform.system() == "Windows":
    ffmpeg_path = r"C:\Program Files\ffmpeg\bin"
    AudioSegment.converter = os.path.join(ffmpeg_path, "ffmpeg.exe")
else:
    AudioSegment.converter = "ffmpeg"  # On Linux (Hugging Face), use system-installed ffmpeg

import torch
import yt_dlp
import subprocess
import soundfile as sf
from transformers import (
    pipeline,
    Wav2Vec2ForSequenceClassification,
    Wav2Vec2FeatureExtractor
)
import gradio as gr
import datetime
from sklearn.cluster import KMeans
import numpy as np

# ✅ Whisper ASR
print("Loading Whisper...")
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# ✅ Accent model
print("Loading Accent Classifier (ylacombe)...")
accent_model_name = "ylacombe/accent-classifier"
accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(accent_model_name)
accent_extractor = Wav2Vec2FeatureExtractor.from_pretrained(accent_model_name)
accent_labels = accent_model.config.id2label

# ✅ Download video
def download_video(url, output_path="video.mp4"):
    print("📥 Downloading video...")
    # Clean up any files left over from a previous run
    for f in ["video.mp4", "audio.wav", "trimmed.wav"]:
        if os.path.exists(f):
            os.remove(f)
    ydl_opts = {"outtmpl": output_path, "format": "bestaudio/best", "quiet": True}
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

# ✅ Extract audio
def extract_audio(input_file="video.mp4", output_file="audio.wav"):
    print("🎧 Extracting audio...")
    # Convert to 16 kHz mono 16-bit PCM WAV
    subprocess.run([
        AudioSegment.converter, "-i", input_file,
        "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
        output_file, "-y"
    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# ✅ Trim silence
def trim_silence(input_audio="audio.wav", output_audio="trimmed.wav"):
    print("🔇 Trimming silence...")
    sound = AudioSegment.from_wav(input_audio)
    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
    if not chunks:
        return input_audio
    combined = AudioSegment.empty()
    for chunk in chunks:
        combined += chunk
    combined.export(output_audio, format="wav")
    return output_audio

# ✅ Transcription
def transcribe_audio(audio_path):
    print("📝 Transcribing...")
    result = transcriber(audio_path, return_timestamps=True)
    return result["text"]

# ✅ Real accent classification
def detect_accent(wav_path):
    print("🌍 Classifying accent...")
    speech, sr = sf.read(wav_path)
    inputs = accent_extractor(speech, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = accent_model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=-1)
    top = torch.argmax(probs, dim=-1).item()
    return accent_labels[top], round(probs[0][top].item() * 100, 2)

# ✅ Speaker estimate
def estimate_speakers(audio_path):
    print("👥 Estimating speakers...")
    sound = AudioSegment.from_wav(audio_path)
    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
    if len(chunks) < 2:
        return 1
    durations = np.array([[len(c)] for c in chunks])
    km = KMeans(n_clusters=min(3, len(chunks)), random_state=0).fit(durations)
    return len(set(km.labels_))

# ✅ Fluency score
def estimate_fluency(original_audio, trimmed_audio):
    orig = AudioSegment.from_wav(original_audio)
    trim = AudioSegment.from_wav(trimmed_audio)
    return round(len(trim) / len(orig) * 100, 2)

# ✅ Report generation
def generate_report(transcript, speaker_count, fluency_score, accent, confidence):
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    content = f"""
📝 AcuSpeak Report — {timestamp}

📌 Estimated Number of Speakers: {speaker_count}
🗣️ Fluency Score: {fluency_score}%
🌍 Detected Accent: {accent} ({confidence}% confidence)

📄 Transcript:
{transcript}
"""
    path = "assets/acuspeak_report.txt"
    os.makedirs("assets", exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content.strip())
    return path

# ✅ Main logic
def process_url(url):
    try:
        download_video(url)
        extract_audio()
        trimmed = trim_silence()
        transcript = transcribe_audio(trimmed)
        speaker_count = estimate_speakers("audio.wav")
        fluency_score = estimate_fluency("audio.wav", trimmed)
        accent, confidence = detect_accent(trimmed)
        report = generate_report(transcript, speaker_count, fluency_score, accent, confidence)
        return transcript, speaker_count, f"{fluency_score}%", f"{accent} ({confidence}%)", trimmed, report
    except Exception as e:
        print(f"❌ Error: {e}")
        return f"❌ Error: {str(e)}", 0, "0%", "Unknown", None, None

# ✅ Gradio UI
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🎙️ AcuSpeak — Accent, Fluency & Speaker Analysis Tool")
    url_input = gr.Text(label="🔗 Video URL")
    submit_btn = gr.Button("Analyze")
    transcript_box = gr.Textbox(label="🗒️ Transcript", lines=5)
    speaker_output = gr.Number(label="👥 Estimated Speakers")
    fluency_output = gr.Text(label="🧠 Fluency Score")
    accent_output = gr.Text(label="🌍 Detected Accent")
    audio_player = gr.Audio(label="🎧 Trimmed Audio", type="filepath")
    report_file = gr.File(label="📥 Download Report")

    submit_btn.click(
        fn=process_url,
        inputs=[url_input],
        outputs=[transcript_box, speaker_output, fluency_output, accent_output, audio_player, report_file]
    )

demo.launch()
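
# ---------------------------------------------------------------------------
# Deployment note (a sketch, not part of the app logic): the imports above
# imply roughly the pip dependencies listed below. Exact package versions are
# not specified here and are an assumption; pin them as your environment
# requires.
#
#   requirements.txt (assumed):
#       gradio
#       transformers
#       torch
#       yt-dlp
#       pydub
#       soundfile
#       scikit-learn
#       numpy
#
# ffmpeg itself must be installed on the system, since extract_audio() shells
# out to it; on a Hugging Face Space this is typically done with a
# packages.txt containing the single line "ffmpeg".
# ---------------------------------------------------------------------------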