Draxgabe committed on
Commit 468e24d · verified · 1 Parent(s): 0d0b41e

Update app.py

Files changed (1)
  1. app.py +153 -151
app.py CHANGED
@@ -1,151 +1,153 @@
-import os
-# ✅ FFmpeg setup
-ffmpeg_path = r"C:\Program Files\ffmpeg\bin"
-if ffmpeg_path not in os.environ["PATH"]:
-    os.environ["PATH"] = ffmpeg_path + os.pathsep + os.environ["PATH"]
-import torch
-import yt_dlp
-import subprocess
-from pydub import AudioSegment, silence
-import soundfile as sf
-from transformers import (
-    pipeline,
-    Wav2Vec2ForSequenceClassification,
-    Wav2Vec2FeatureExtractor
-)
-import gradio as gr
-import datetime
-from sklearn.cluster import KMeans
-import numpy as np
-
-
-AudioSegment.converter = os.path.join(ffmpeg_path, "ffmpeg.exe")
-
-# ✅ Whisper ASR
-print("Loading Whisper...")
-transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")
-
-# Accent model
-print("Loading Accent Classifier (ylacombe)...")
-accent_model_name = "ylacombe/accent-classifier"
-accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(accent_model_name)
-accent_extractor = Wav2Vec2FeatureExtractor.from_pretrained(accent_model_name)
-accent_labels = accent_model.config.id2label
-
-# Download video
-def download_video(url, output_path="video.mp4"):
-    print("📥 Downloading video...")
-    for f in ["video.mp4", "audio.wav", "trimmed.wav"]:
-        if os.path.exists(f):
-            os.remove(f)
-    ydl_opts = {"outtmpl": output_path, "format": "bestaudio/best", "quiet": True}
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        ydl.download([url])
-
-# ✅ Extract audio
-def extract_audio(input_file="video.mp4", output_file="audio.wav"):
-    print("🎧 Extracting audio...")
-    subprocess.run([
-        AudioSegment.converter, "-i", input_file, "-vn",
-        "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", output_file, "-y"
-    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-
-# Trim silence
-def trim_silence(input_audio="audio.wav", output_audio="trimmed.wav"):
-    print("🔇 Trimming silence...")
-    sound = AudioSegment.from_wav(input_audio)
-    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
-    if not chunks:
-        return input_audio
-    combined = AudioSegment.empty()
-    for chunk in chunks:
-        combined += chunk
-    combined.export(output_audio, format="wav")
-    return output_audio
-
-# ✅ Transcription
-def transcribe_audio(audio_path):
-    print("📝 Transcribing...")
-    result = transcriber(audio_path, return_timestamps=True)
-    return result["text"]
-
-# ✅ Real accent classification
-def detect_accent(wav_path):
-    print("🌍 Classifying accent...")
-    speech, sr = sf.read(wav_path)
-    inputs = accent_extractor(speech, sampling_rate=sr, return_tensors="pt", padding=True)
-    with torch.no_grad():
-        logits = accent_model(**inputs).logits
-    probs = torch.nn.functional.softmax(logits, dim=-1)
-    top = torch.argmax(probs, dim=-1).item()
-    return accent_labels[top], round(probs[0][top].item() * 100, 2)
-
-# Speaker estimate
-def estimate_speakers(audio_path):
-    print("👥 Estimating speakers...")
-    sound = AudioSegment.from_wav(audio_path)
-    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
-    if len(chunks) < 2:
-        return 1
-    durations = np.array([[len(c)] for c in chunks])
-    km = KMeans(n_clusters=min(3, len(chunks)), random_state=0).fit(durations)
-    return len(set(km.labels_))
-
-# ✅ Fluency score
-def estimate_fluency(original_audio, trimmed_audio):
-    orig = AudioSegment.from_wav(original_audio)
-    trim = AudioSegment.from_wav(trimmed_audio)
-    return round(len(trim) / len(orig) * 100, 2)
-
-# Report generation
-def generate_report(transcript, speaker_count, fluency_score, accent, confidence):
-    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    content = f"""
-📝 AcuSpeak Report — {timestamp}
-
-📌 Estimated Number of Speakers: {speaker_count}
-🗣️ Fluency Score: {fluency_score}%
-🌍 Detected Accent: {accent} ({confidence}% confidence)
-
-📄 Transcript:
-{transcript}
-"""
-    path = "assets/acuspeak_report.txt"
-    os.makedirs("assets", exist_ok=True)
-    with open(path, "w", encoding="utf-8") as f:
-        f.write(content.strip())
-    return path
-
-# ✅ Main logic
-def process_url(url):
-    try:
-        download_video(url)
-        extract_audio()
-        trimmed = trim_silence()
-        transcript = transcribe_audio(trimmed)
-        speaker_count = estimate_speakers("audio.wav")
-        fluency_score = estimate_fluency("audio.wav", trimmed)
-        accent, confidence = detect_accent(trimmed)
-        report = generate_report(transcript, speaker_count, fluency_score, accent, confidence)
-        return transcript, speaker_count, f"{fluency_score}%", f"{accent} ({confidence}%)", trimmed, report
-    except Exception as e:
-        print(f" Error: {e}")
-        return f"❌ Error: {str(e)}", 0, "0%", "Unknown", None, None
-
-# Gradio UI
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## 🎙️ AcuSpeak — Accent, Fluency & Speaker Analysis Tool")
-    url_input = gr.Text(label="🔗 Video URL")
-    submit_btn = gr.Button("Analyze")
-
-    transcript_box = gr.Textbox(label="🗒️ Transcript", lines=5)
-    speaker_output = gr.Number(label="👥 Estimated Speakers")
-    fluency_output = gr.Text(label="🧠 Fluency Score")
-    accent_output = gr.Text(label="🌍 Detected Accent")
-    audio_player = gr.Audio(label="🎧 Trimmed Audio", type="filepath")
-    report_file = gr.File(label="📥 Download Report")
-
-    submit_btn.click(fn=process_url, inputs=[url_input],
-                     outputs=[transcript_box, speaker_output, fluency_output, accent_output, audio_player, report_file])
-
-demo.launch()
+import os
+import platform
+from pydub import AudioSegment, silence
+if platform.system() == "Windows":
+    ffmpeg_path = r"C:\Program Files\ffmpeg\bin"
+    AudioSegment.converter = os.path.join(ffmpeg_path, "ffmpeg.exe")
+else:
+    AudioSegment.converter = "ffmpeg"  # On Linux (Hugging Face), use system-installed ffmpeg
+
+import torch
+import yt_dlp
+import subprocess
+
+import soundfile as sf
+from transformers import (
+    pipeline,
+    Wav2Vec2ForSequenceClassification,
+    Wav2Vec2FeatureExtractor
+)
+import gradio as gr
+import datetime
+from sklearn.cluster import KMeans
+import numpy as np
+
+
+# Whisper ASR
+print("Loading Whisper...")
+transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")
+
+# Accent model
+print("Loading Accent Classifier (ylacombe)...")
+accent_model_name = "ylacombe/accent-classifier"
+accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(accent_model_name)
+accent_extractor = Wav2Vec2FeatureExtractor.from_pretrained(accent_model_name)
+accent_labels = accent_model.config.id2label
+
+# Download video
+def download_video(url, output_path="video.mp4"):
+    print("📥 Downloading video...")
+    for f in ["video.mp4", "audio.wav", "trimmed.wav"]:
+        if os.path.exists(f):
+            os.remove(f)
+    ydl_opts = {"outtmpl": output_path, "format": "bestaudio/best", "quiet": True}
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        ydl.download([url])
+
+# Extract audio
+def extract_audio(input_file="video.mp4", output_file="audio.wav"):
+    print("🎧 Extracting audio...")
+    subprocess.run([
+        AudioSegment.converter, "-i", input_file, "-vn",
+        "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", output_file, "-y"
+    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+# Trim silence
+def trim_silence(input_audio="audio.wav", output_audio="trimmed.wav"):
+    print("🔇 Trimming silence...")
+    sound = AudioSegment.from_wav(input_audio)
+    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
+    if not chunks:
+        return input_audio
+    combined = AudioSegment.empty()
+    for chunk in chunks:
+        combined += chunk
+    combined.export(output_audio, format="wav")
+    return output_audio
+
+# ✅ Transcription
+def transcribe_audio(audio_path):
+    print("📝 Transcribing...")
+    result = transcriber(audio_path, return_timestamps=True)
+    return result["text"]
+
+# Real accent classification
+def detect_accent(wav_path):
+    print("🌍 Classifying accent...")
+    speech, sr = sf.read(wav_path)
+    inputs = accent_extractor(speech, sampling_rate=sr, return_tensors="pt", padding=True)
+    with torch.no_grad():
+        logits = accent_model(**inputs).logits
+    probs = torch.nn.functional.softmax(logits, dim=-1)
+    top = torch.argmax(probs, dim=-1).item()
+    return accent_labels[top], round(probs[0][top].item() * 100, 2)
+
+# Speaker estimate
+def estimate_speakers(audio_path):
+    print("👥 Estimating speakers...")
+    sound = AudioSegment.from_wav(audio_path)
+    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
+    if len(chunks) < 2:
+        return 1
+    durations = np.array([[len(c)] for c in chunks])
+    km = KMeans(n_clusters=min(3, len(chunks)), random_state=0).fit(durations)
+    return len(set(km.labels_))
+
+# Fluency score
+def estimate_fluency(original_audio, trimmed_audio):
+    orig = AudioSegment.from_wav(original_audio)
+    trim = AudioSegment.from_wav(trimmed_audio)
+    return round(len(trim) / len(orig) * 100, 2)
+
+# Report generation
+def generate_report(transcript, speaker_count, fluency_score, accent, confidence):
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    content = f"""
+📝 AcuSpeak Report {timestamp}
+
+📌 Estimated Number of Speakers: {speaker_count}
+🗣️ Fluency Score: {fluency_score}%
+🌍 Detected Accent: {accent} ({confidence}% confidence)
+
+📄 Transcript:
+{transcript}
+"""
+    path = "assets/acuspeak_report.txt"
+    os.makedirs("assets", exist_ok=True)
+    with open(path, "w", encoding="utf-8") as f:
+        f.write(content.strip())
+    return path
+
+# ✅ Main logic
+def process_url(url):
+    try:
+        download_video(url)
+        extract_audio()
+        trimmed = trim_silence()
+        transcript = transcribe_audio(trimmed)
+        speaker_count = estimate_speakers("audio.wav")
+        fluency_score = estimate_fluency("audio.wav", trimmed)
+        accent, confidence = detect_accent(trimmed)
+        report = generate_report(transcript, speaker_count, fluency_score, accent, confidence)
+        return transcript, speaker_count, f"{fluency_score}%", f"{accent} ({confidence}%)", trimmed, report
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        return f"❌ Error: {str(e)}", 0, "0%", "Unknown", None, None
+
+# Gradio UI
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## 🎙️ AcuSpeak — Accent, Fluency & Speaker Analysis Tool")
+    url_input = gr.Text(label="🔗 Video URL")
+    submit_btn = gr.Button("Analyze")
+
+    transcript_box = gr.Textbox(label="🗒️ Transcript", lines=5)
+    speaker_output = gr.Number(label="👥 Estimated Speakers")
+    fluency_output = gr.Text(label="🧠 Fluency Score")
+    accent_output = gr.Text(label="🌍 Detected Accent")
+    audio_player = gr.Audio(label="🎧 Trimmed Audio", type="filepath")
+    report_file = gr.File(label="📥 Download Report")
+
+    submit_btn.click(fn=process_url, inputs=[url_input],
+                     outputs=[transcript_box, speaker_output, fluency_output, accent_output, audio_player, report_file])
+
+demo.launch()