import os
import platform

from pydub import AudioSegment, silence

if platform.system() == "Windows":
    ffmpeg_path = r"C:\Program Files\ffmpeg\bin"
    AudioSegment.converter = os.path.join(ffmpeg_path, "ffmpeg.exe")
else:
    AudioSegment.converter = "ffmpeg"  # On Linux (Hugging Face), use system-installed ffmpeg

import torch
import yt_dlp
import subprocess
import soundfile as sf
from transformers import (
    pipeline,
    Wav2Vec2ForSequenceClassification,
    Wav2Vec2FeatureExtractor
)
import gradio as gr
import datetime
from sklearn.cluster import KMeans
import numpy as np

# ✅ Whisper ASR
print("Loading Whisper...")
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# ✅ Accent model
print("Loading Accent Classifier (ylacombe)...")
accent_model_name = "ylacombe/accent-classifier"
accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(accent_model_name)
accent_extractor = Wav2Vec2FeatureExtractor.from_pretrained(accent_model_name)
accent_labels = accent_model.config.id2label

# ✅ Download video
def download_video(url, output_path="video.mp4"):
    print("📥 Downloading video...")
    # Clean up any files left over from a previous run
    for f in ["video.mp4", "audio.wav", "trimmed.wav"]:
        if os.path.exists(f):
            os.remove(f)
    ydl_opts = {"outtmpl": output_path, "format": "bestaudio/best", "quiet": True}
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

# ✅ Extract audio
def extract_audio(input_file="video.mp4", output_file="audio.wav"):
    print("🎧 Extracting audio...")
    # Convert to 16 kHz mono 16-bit PCM WAV
    subprocess.run([
        AudioSegment.converter, "-i", input_file,
        "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
        output_file, "-y"
    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# ✅ Trim silence
def trim_silence(input_audio="audio.wav", output_audio="trimmed.wav"):
    print("🔇 Trimming silence...")
    sound = AudioSegment.from_wav(input_audio)
    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
    if not chunks:
        return input_audio
    combined = AudioSegment.empty()
    for chunk in chunks:
        combined += chunk
    combined.export(output_audio, format="wav")
    return output_audio

# ✅ Transcription
def transcribe_audio(audio_path):
    print("📝 Transcribing...")
    result = transcriber(audio_path, return_timestamps=True)
    return result["text"]

# ✅ Real accent classification
def detect_accent(wav_path):
    print("🌍 Classifying accent...")
    speech, sr = sf.read(wav_path)
    inputs = accent_extractor(speech, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = accent_model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=-1)
    top = torch.argmax(probs, dim=-1).item()
    return accent_labels[top], round(probs[0][top].item() * 100, 2)

# ✅ Speaker estimate
def estimate_speakers(audio_path):
    print("👥 Estimating speakers...")
    sound = AudioSegment.from_wav(audio_path)
    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
    if len(chunks) < 2:
        return 1
    durations = np.array([[len(c)] for c in chunks])
    km = KMeans(n_clusters=min(3, len(chunks)), random_state=0).fit(durations)
    return len(set(km.labels_))

# ✅ Fluency score
def estimate_fluency(original_audio, trimmed_audio):
    orig = AudioSegment.from_wav(original_audio)
    trim = AudioSegment.from_wav(trimmed_audio)
    return round(len(trim) / len(orig) * 100, 2)

# ✅ Report generation
def generate_report(transcript, speaker_count, fluency_score, accent, confidence):
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    content = f"""
📝 AcuSpeak Report — {timestamp}

📌 Estimated Number of Speakers: {speaker_count}
🗣️ Fluency Score: {fluency_score}%
🌍 Detected Accent: {accent} ({confidence}% confidence)

📄 Transcript:
{transcript}
"""
    path = "assets/acuspeak_report.txt"
    os.makedirs("assets", exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content.strip())
    return path

# ✅ Main logic
def process_url(url):
    try:
        download_video(url)
        extract_audio()
        trimmed = trim_silence()
        transcript = transcribe_audio(trimmed)
        speaker_count = estimate_speakers("audio.wav")
        fluency_score = estimate_fluency("audio.wav", trimmed)
        accent, confidence = detect_accent(trimmed)
        report = generate_report(transcript, speaker_count, fluency_score, accent, confidence)
        return transcript, speaker_count, f"{fluency_score}%", f"{accent} ({confidence}%)", trimmed, report
    except Exception as e:
        print(f"❌ Error: {e}")
        return f"❌ Error: {str(e)}", 0, "0%", "Unknown", None, None

# ✅ Gradio UI
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🎙️ AcuSpeak — Accent, Fluency & Speaker Analysis Tool")
    url_input = gr.Text(label="🔗 Video URL")
    submit_btn = gr.Button("Analyze")
    transcript_box = gr.Textbox(label="🗒️ Transcript", lines=5)
    speaker_output = gr.Number(label="👥 Estimated Speakers")
    fluency_output = gr.Text(label="🧠 Fluency Score")
    accent_output = gr.Text(label="🌍 Detected Accent")
    audio_player = gr.Audio(label="🎧 Trimmed Audio", type="filepath")
    report_file = gr.File(label="📥 Download Report")

    submit_btn.click(
        fn=process_url,
        inputs=[url_input],
        outputs=[transcript_box, speaker_output, fluency_output, accent_output, audio_player, report_file]
    )

demo.launch()
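
# ---------------------------------------------------------------------------
# Deployment note (a sketch, not part of the app logic): the imports above
# imply roughly the pip dependencies listed below. Exact package versions are
# not specified here and are an assumption; pin them as your environment
# requires.
#
#   requirements.txt (assumed):
#       gradio
#       transformers
#       torch
#       yt-dlp
#       pydub
#       soundfile
#       scikit-learn
#       numpy
#
# ffmpeg itself must be installed on the system, since extract_audio() shells
# out to it; on a Hugging Face Space this is typically done with a
# packages.txt containing the single line "ffmpeg".
# ---------------------------------------------------------------------------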