import os
import platform

from pydub import AudioSegment, silence

if platform.system() == "Windows":
    ffmpeg_path = r"C:\Program Files\ffmpeg\bin"
    AudioSegment.converter = os.path.join(ffmpeg_path, "ffmpeg.exe")
else:
    AudioSegment.converter = "ffmpeg"  # On Linux (Hugging Face), use the system-installed ffmpeg
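
# Optional startup check (an addition, not in the original script): fail fast if
# the configured ffmpeg binary is unreachable, instead of erroring mid-pipeline.
# shutil.which is stdlib and also accepts absolute paths, so it covers both
# branches above.
import shutil
if shutil.which(AudioSegment.converter) is None:
    raise RuntimeError(f"ffmpeg not found at '{AudioSegment.converter}'")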

import torch
import yt_dlp
import subprocess
import soundfile as sf
from transformers import (
    pipeline,
    Wav2Vec2ForSequenceClassification,
    Wav2Vec2FeatureExtractor,
)
import gradio as gr
import datetime
from sklearn.cluster import KMeans
import numpy as np

# ✅ Whisper ASR
print("Loading Whisper...")
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# ✅ Accent model
print("Loading Accent Classifier (ylacombe)...")
accent_model_name = "ylacombe/accent-classifier"
accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(accent_model_name)
accent_extractor = Wav2Vec2FeatureExtractor.from_pretrained(accent_model_name)
accent_labels = accent_model.config.id2label
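
# Small visibility aid (an addition): log the model's label set at startup so a
# mismatch with the labels the UI expects is obvious immediately.
print(f"Accent labels: {sorted(accent_labels.values())}")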

# ✅ Download video
def download_video(url, output_path="video.mp4"):
    print("📥 Downloading video...")
    # Clear leftovers from a previous run so stale files are never analyzed.
    for f in ["video.mp4", "audio.wav", "trimmed.wav"]:
        if os.path.exists(f):
            os.remove(f)
    # "bestaudio/best" prefers an audio-only stream; the .mp4 name is only a
    # filename, and ffmpeg probes the actual container when reading it back.
    ydl_opts = {"outtmpl": output_path, "format": "bestaudio/best", "quiet": True}
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

# ✅ Extract audio
def extract_audio(input_file="video.mp4", output_file="audio.wav"):
    print("🎧 Extracting audio...")
    # 16 kHz mono PCM is the format the wav2vec2 extractor expects downstream.
    # check=True raises on ffmpeg failure instead of failing silently (output is
    # sent to DEVNULL, so a bad exit code would otherwise go unnoticed).
    subprocess.run([
        AudioSegment.converter, "-i", input_file, "-vn",
        "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", output_file, "-y"
    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)

# ✅ Trim silence
def trim_silence(input_audio="audio.wav", output_audio="trimmed.wav"):
    print("🔇 Trimming silence...")
    sound = AudioSegment.from_wav(input_audio)
    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
    if not chunks:
        return input_audio
    combined = AudioSegment.empty()
    for chunk in chunks:
        combined += chunk
    combined.export(output_audio, format="wav")
    return output_audio

# ✅ Transcription
def transcribe_audio(audio_path):
    print("📝 Transcribing...")
    result = transcriber(audio_path, return_timestamps=True)
    return result["text"]
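
# Extension sketch (an addition, assuming the transformers ASR pipeline's
# documented timestamped output: with return_timestamps=True the result carries
# a "chunks" list of {"timestamp": (start, end), "text": ...} segments). Not
# wired into the UI; it just shows how per-segment timestamps could be surfaced
# in the report.
def transcribe_with_segments(audio_path):
    result = transcriber(audio_path, return_timestamps=True)
    lines = []
    for seg in result.get("chunks", []):
        start, end = seg["timestamp"]
        end = end if end is not None else start  # the final segment may lack an end time
        lines.append(f"[{start:.1f}s–{end:.1f}s] {seg['text'].strip()}")
    return result["text"], "\n".join(lines)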

# ✅ Real accent classification
def detect_accent(wav_path):
    print("🌍 Classifying accent...")
    speech, sr = sf.read(wav_path)
    if speech.ndim > 1:  # defensive: collapse stereo to mono (the pipeline itself produces mono)
        speech = speech.mean(axis=1)
    inputs = accent_extractor(speech, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = accent_model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=-1)
    top = torch.argmax(probs, dim=-1).item()
    return accent_labels[top], round(probs[0][top].item() * 100, 2)
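
# A minimal sketch (an addition, not the original method): detect_accent runs one
# forward pass over the whole file, which can exhaust memory on long recordings.
# Scoring fixed-length windows and averaging the class probabilities bounds the
# cost; the 10 s window and 1 s minimum are illustrative choices.
def detect_accent_windowed(wav_path, window_s=10.0):
    speech, sr = sf.read(wav_path)
    if speech.ndim > 1:
        speech = speech.mean(axis=1)
    win = int(window_s * sr)
    probs = []
    for start in range(0, len(speech), win):
        chunk = speech[start:start + win]
        if len(chunk) < sr:  # skip fragments shorter than one second
            continue
        inputs = accent_extractor(chunk, sampling_rate=sr, return_tensors="pt", padding=True)
        with torch.no_grad():
            logits = accent_model(**inputs).logits
        probs.append(torch.nn.functional.softmax(logits, dim=-1))
    if not probs:  # clip too short to window; fall back to the single-pass version
        return detect_accent(wav_path)
    mean_probs = torch.cat(probs).mean(dim=0)
    top = torch.argmax(mean_probs).item()
    return accent_labels[top], round(mean_probs[top].item() * 100, 2)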

# ✅ Speaker estimate
def estimate_speakers(audio_path):
    print("👥 Estimating speakers...")
    # Crude heuristic, not diarization: cluster the durations of pause-separated
    # chunks. Since KMeans assigns every cluster at least one point, this
    # effectively returns min(3, len(chunks)) whenever there are 2+ chunks.
    sound = AudioSegment.from_wav(audio_path)
    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
    if len(chunks) < 2:
        return 1
    durations = np.array([[len(c)] for c in chunks])
    km = KMeans(n_clusters=min(3, len(chunks)), random_state=0).fit(durations)
    return len(set(km.labels_))
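
# Alternative sketch (an addition, not the original method): clustering two cheap
# per-chunk features (duration, loudness) and picking k by silhouette score gives
# a slightly less arbitrary estimate than fixing k = 3. It is still not real
# speaker diarization, and the feature choice and k range are assumptions.
from sklearn.metrics import silhouette_score

def estimate_speakers_silhouette(audio_path, max_k=3):
    sound = AudioSegment.from_wav(audio_path)
    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
    if len(chunks) < 3:
        return 1
    feats = np.array([[len(c), c.dBFS] for c in chunks])
    best_k, best_score = 1, -1.0
    for k in range(2, min(max_k, len(chunks) - 1) + 1):
        labels = KMeans(n_clusters=k, random_state=0).fit_predict(feats)
        score = silhouette_score(feats, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k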

# ✅ Fluency score
def estimate_fluency(original_audio, trimmed_audio):
    # Score = percentage of the original duration that survives silence trimming,
    # i.e. the fraction of the clip that is speech rather than pauses.
    orig = AudioSegment.from_wav(original_audio)
    trim = AudioSegment.from_wav(trimmed_audio)
    return round(len(trim) / len(orig) * 100, 2)
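
# Worked example (illustrative numbers): a 60 s clip that shrinks to 48 s after
# trimming scores round(48_000 / 60_000 * 100, 2) == 80.0, i.e. 80% of the
# recording was speech rather than silence.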

# ✅ Report generation
def generate_report(transcript, speaker_count, fluency_score, accent, confidence):
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    content = f"""
📝 AcuSpeak Report — {timestamp}
📌 Estimated Number of Speakers: {speaker_count}
🗣️ Fluency Score: {fluency_score}%
🌍 Detected Accent: {accent} ({confidence}% confidence)
📄 Transcript:
{transcript}
"""
    path = "assets/acuspeak_report.txt"
    os.makedirs("assets", exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content.strip())
    return path

# ✅ Main logic
def process_url(url):
    try:
        download_video(url)
        extract_audio()
        trimmed = trim_silence()
        transcript = transcribe_audio(trimmed)
        speaker_count = estimate_speakers("audio.wav")
        fluency_score = estimate_fluency("audio.wav", trimmed)
        accent, confidence = detect_accent(trimmed)
        report = generate_report(transcript, speaker_count, fluency_score, accent, confidence)
        return transcript, speaker_count, f"{fluency_score}%", f"{accent} ({confidence}%)", trimmed, report
    except Exception as e:
        print(f"❌ Error: {e}")
        return f"❌ Error: {str(e)}", 0, "0%", "Unknown", None, None

# ✅ Gradio UI
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🎙️ AcuSpeak — Accent, Fluency & Speaker Analysis Tool")
    url_input = gr.Text(label="🔗 Video URL")
    submit_btn = gr.Button("Analyze")
    transcript_box = gr.Textbox(label="🗒️ Transcript", lines=5)
    speaker_output = gr.Number(label="👥 Estimated Speakers")
    fluency_output = gr.Text(label="🧠 Fluency Score")
    accent_output = gr.Text(label="🌍 Detected Accent")
    audio_player = gr.Audio(label="🎧 Trimmed Audio", type="filepath")
    report_file = gr.File(label="📥 Download Report")
    submit_btn.click(
        fn=process_url,
        inputs=[url_input],
        outputs=[transcript_box, speaker_output, fluency_output, accent_output, audio_player, report_file],
    )

demo.launch()