Spaces:

Draxgabe
/

acuspeak-demo

Sleeping

App Files Files Community

Draxgabe committed on May 21

Commit

1cb290b

verified ·

1 Parent(s): 596e5fa

Upload 4 files

Browse files

Files changed (4) hide show

.gitignore +32 -0
Readme.MD +32 -0
app.py +151 -0
requirements.txt +6 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,32 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*.so
+# Virtual environments
+venv/
+env/
+ENV/
+*.egg-info/
+# Audio and output files
+*.wav
+*.mp3
+*.mp4
+# Report files
+assets/
+*.txt
+# Keep the dependency list tracked even though *.txt is ignored
+!requirements.txt
+# OS-specific files
+.DS_Store
+Thumbs.db
+# Jupyter notebooks checkpoints
+.ipynb_checkpoints/
+# VS Code settings
+.vscode/
+# Environment variables
+.env

Readme.MD ADDED Viewed

	@@ -0,0 +1,32 @@

+# 🎙️ AcuSpeak - English Accent, Fluency, and Speaker Analysis Tool
+**AcuSpeak** is a simple but powerful web app that:
+- Accepts a public video URL (e.g. YouTube, MP4)
+- Extracts and processes audio using FFmpeg
+- Transcribes speech with Whisper
+- Classifies English accents using Hugging Face's `ylacombe/accent-classifier`
+- Estimates number of speakers and speaking fluency
+- Generates a downloadable text report
+This project is intended for evaluating spoken English in hiring and screening use cases.
+---
+## 🚀 Features
+- 🎧 **Audio Extraction & Trimming**
+- 📝 **Whisper-based Transcription**
+- 🌍 **Accent Classification** with a real model
+- 👥 **Speaker Count Estimation**
+- 🧠 **Fluency Scoring**
+- 📄 **Downloadable Report**
+- 🖥️ **Gradio UI** (runs locally or on Hugging Face Spaces)
+---
+## 📦 Requirements
+Install dependencies with:
+```bash
+pip install -r requirements.txt
+```

app.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import os
+# ✅ FFmpeg setup
+ffmpeg_path = r"C:\Program Files\ffmpeg\bin"
+if ffmpeg_path not in os.environ["PATH"]:
+    os.environ["PATH"] = ffmpeg_path + os.pathsep + os.environ["PATH"]
+import torch
+import yt_dlp
+import subprocess
+from pydub import AudioSegment, silence
+import soundfile as sf
+from transformers import (
+    pipeline,
+    Wav2Vec2ForSequenceClassification,
+    Wav2Vec2FeatureExtractor
+)
+import gradio as gr
+import datetime
+from sklearn.cluster import KMeans
+import numpy as np
+AudioSegment.converter = os.path.join(ffmpeg_path, "ffmpeg.exe")
+# ✅ Whisper ASR
+print("Loading Whisper...")
+transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")
+# ✅ Accent model
+print("Loading Accent Classifier (ylacombe)...")
+accent_model_name = "ylacombe/accent-classifier"
+accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(accent_model_name)
+accent_extractor = Wav2Vec2FeatureExtractor.from_pretrained(accent_model_name)
+accent_labels = accent_model.config.id2label
+# ✅ Download video
+def download_video(url, output_path="video.mp4"):
+    print("📥 Downloading video...")
+    for f in ["video.mp4", "audio.wav", "trimmed.wav"]:
+        if os.path.exists(f):
+            os.remove(f)
+    ydl_opts = {"outtmpl": output_path, "format": "bestaudio/best", "quiet": True}
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        ydl.download([url])
+# ✅ Extract audio
+def extract_audio(input_file="video.mp4", output_file="audio.wav"):
+    print("🎧 Extracting audio...")
+    subprocess.run([
+        AudioSegment.converter, "-i", input_file, "-vn",
+        "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", output_file, "-y"
+    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+# ✅ Trim silence
+def trim_silence(input_audio="audio.wav", output_audio="trimmed.wav"):
+    print("🔇 Trimming silence...")
+    sound = AudioSegment.from_wav(input_audio)
+    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
+    if not chunks:
+        return input_audio
+    combined = AudioSegment.empty()
+    for chunk in chunks:
+        combined += chunk
+    combined.export(output_audio, format="wav")
+    return output_audio
+# ✅ Transcription
+def transcribe_audio(audio_path):
+    print("📝 Transcribing...")
+    result = transcriber(audio_path, return_timestamps=True)
+    return result["text"]
+# ✅ Real accent classification
+def detect_accent(wav_path):
+    print("🌍 Classifying accent...")
+    speech, sr = sf.read(wav_path)
+    inputs = accent_extractor(speech, sampling_rate=sr, return_tensors="pt", padding=True)
+    with torch.no_grad():
+        logits = accent_model(**inputs).logits
+        probs = torch.nn.functional.softmax(logits, dim=-1)
+        top = torch.argmax(probs, dim=-1).item()
+        return accent_labels[top], round(probs[0][top].item() * 100, 2)
+# ✅ Speaker estimate
+def estimate_speakers(audio_path):
+    print("👥 Estimating speakers...")
+    sound = AudioSegment.from_wav(audio_path)
+    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
+    if len(chunks) < 2:
+        return 1
+    durations = np.array([[len(c)] for c in chunks])
+    km = KMeans(n_clusters=min(3, len(chunks)), random_state=0).fit(durations)
+    return len(set(km.labels_))
+# ✅ Fluency score
+def estimate_fluency(original_audio, trimmed_audio):
+    orig = AudioSegment.from_wav(original_audio)
+    trim = AudioSegment.from_wav(trimmed_audio)
+    return round(len(trim) / len(orig) * 100, 2)
+# ✅ Report generation
+def generate_report(transcript, speaker_count, fluency_score, accent, confidence):
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    content = f"""
+📝 AcuSpeak Report — {timestamp}
+📌 Estimated Number of Speakers: {speaker_count}
+🗣️ Fluency Score: {fluency_score}%
+🌍 Detected Accent: {accent} ({confidence}% confidence)
+📄 Transcript:
+{transcript}
+"""
+    path = "assets/acuspeak_report.txt"
+    os.makedirs("assets", exist_ok=True)
+    with open(path, "w", encoding="utf-8") as f:
+        f.write(content.strip())
+    return path
+# ✅ Main logic
+def process_url(url):
+    try:
+        download_video(url)
+        extract_audio()
+        trimmed = trim_silence()
+        transcript = transcribe_audio(trimmed)
+        speaker_count = estimate_speakers("audio.wav")
+        fluency_score = estimate_fluency("audio.wav", trimmed)
+        accent, confidence = detect_accent(trimmed)
+        report = generate_report(transcript, speaker_count, fluency_score, accent, confidence)
+        return transcript, speaker_count, f"{fluency_score}%", f"{accent} ({confidence}%)", trimmed, report
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        return f"❌ Error: {str(e)}", 0, "0%", "Unknown", None, None
+# ✅ Gradio UI
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## 🎙️ AcuSpeak — Accent, Fluency & Speaker Analysis Tool")
+    url_input = gr.Text(label="🔗 Video URL")
+    submit_btn = gr.Button("Analyze")
+    transcript_box = gr.Textbox(label="🗒️ Transcript", lines=5)
+    speaker_output = gr.Number(label="👥 Estimated Speakers")
+    fluency_output = gr.Text(label="🧠 Fluency Score")
+    accent_output = gr.Text(label="🌍 Detected Accent")
+    audio_player = gr.Audio(label="🎧 Trimmed Audio", type="filepath")
+    report_file = gr.File(label="📥 Download Report")
+    submit_btn.click(fn=process_url, inputs=[url_input],
+                     outputs=[transcript_box, speaker_output, fluency_output, accent_output, audio_player, report_file])
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio
+transformers
+pydub
+yt-dlp
+soundfile
+scikit-learn
+torch
+numpy