Draxgabe committed
Commit 1cb290b · verified · 1 Parent(s): 596e5fa

Upload 4 files

Files changed (4)
  1. .gitignore +33 -0
  2. Readme.MD +41 -0
  3. app.py +153 -0
  4. requirements.txt +7 -0
.gitignore ADDED
@@ -0,0 +1,33 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *.so
+
+ # Virtual environments
+ venv/
+ env/
+ ENV/
+ *.egg-info/
+
+ # Audio and output files
+ *.wav
+ *.mp3
+ *.mp4
+
+ # Report files
+ assets/
+ *.txt
+ !requirements.txt
+
+ # OS-specific files
+ .DS_Store
+ Thumbs.db
+
+ # Jupyter notebook checkpoints
+ .ipynb_checkpoints/
+
+ # VS Code settings
+ .vscode/
+
+ # Environment variables
+ .env
Readme.MD ADDED
@@ -0,0 +1,41 @@
+ # 🎙️ AcuSpeak - English Accent, Fluency, and Speaker Analysis Tool
+
+ **AcuSpeak** is a simple but powerful web app that:
+ - Accepts a public video URL (e.g. YouTube, MP4)
+ - Extracts and processes audio using FFmpeg
+ - Transcribes speech with Whisper
+ - Classifies English accents using Hugging Face's `ylacombe/accent-classifier`
+ - Estimates the number of speakers and speaking fluency
+ - Generates a downloadable text report
+
+ This project is intended for evaluating spoken English in hiring and screening use cases.
+
+ ---
+
+ ## 🚀 Features
+
+ - 🎧 **Audio Extraction & Trimming**
+ - 📝 **Whisper-based Transcription**
+ - 🌍 **Accent Classification** with a real model
+ - 👥 **Speaker Count Estimation**
+ - 🧠 **Fluency Scoring**
+ - 📄 **Downloadable Report**
+ - 🖥️ **Gradio UI** (runs locally or on Hugging Face Spaces)
+
+ ---
+
+ ## 📦 Requirements
+
+ FFmpeg must be installed and available on your PATH; requirements.txt covers only the Python packages.
+
+ Install dependencies with:
+
+ ```bash
+ pip install -r requirements.txt
+ ```
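+
+ Then launch the Gradio app (app.py starts a local server via `demo.launch()`):
+
+ ```bash
+ python app.py
+ ```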
app.py ADDED
@@ -0,0 +1,153 @@
+ import os
+ import shutil
+ # ✅ FFmpeg setup (the hard-coded path only exists on Windows; skipped elsewhere)
+ ffmpeg_path = r"C:\Program Files\ffmpeg\bin"
+ if os.path.isdir(ffmpeg_path) and ffmpeg_path not in os.environ["PATH"]:
+     os.environ["PATH"] = ffmpeg_path + os.pathsep + os.environ["PATH"]
+ import torch
+ import yt_dlp
+ import subprocess
+ from pydub import AudioSegment, silence
+ import soundfile as sf
+ from transformers import (
+     pipeline,
+     Wav2Vec2ForSequenceClassification,
+     Wav2Vec2FeatureExtractor
+ )
+ import gradio as gr
+ import datetime
+ from sklearn.cluster import KMeans
+ import numpy as np
+
+
+ # Prefer a system ffmpeg (Linux / Spaces); fall back to the Windows binary
+ AudioSegment.converter = shutil.which("ffmpeg") or os.path.join(ffmpeg_path, "ffmpeg.exe")
+
+ # ✅ Whisper ASR
+ print("Loading Whisper...")
+ transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")
+
+ # ✅ Accent model
+ print("Loading Accent Classifier (ylacombe)...")
+ accent_model_name = "ylacombe/accent-classifier"
+ accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(accent_model_name)
+ accent_extractor = Wav2Vec2FeatureExtractor.from_pretrained(accent_model_name)
+ accent_labels = accent_model.config.id2label
+
+ # ✅ Download video
+ def download_video(url, output_path="video.mp4"):
+     print("📥 Downloading video...")
+     for f in ["video.mp4", "audio.wav", "trimmed.wav"]:  # clear artifacts from a previous run
+         if os.path.exists(f):
+             os.remove(f)
+     ydl_opts = {"outtmpl": output_path, "format": "bestaudio/best", "quiet": True}
+     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+         ydl.download([url])
+
+ # ✅ Extract audio as 16 kHz mono PCM, the input format Whisper and wav2vec2 expect
+ def extract_audio(input_file="video.mp4", output_file="audio.wav"):
+     print("🎧 Extracting audio...")
+     subprocess.run([
+         AudioSegment.converter, "-i", input_file, "-vn",
+         "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", output_file, "-y"
+     ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+ # ✅ Trim silence
+ def trim_silence(input_audio="audio.wav", output_audio="trimmed.wav"):
+     print("🔇 Trimming silence...")
+     sound = AudioSegment.from_wav(input_audio)
+     chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
+     if not chunks:
+         return input_audio
+     combined = AudioSegment.empty()
+     for chunk in chunks:
+         combined += chunk
+     combined.export(output_audio, format="wav")
+     return output_audio
+
+ # ✅ Transcription (return_timestamps=True lets the pipeline handle audio longer than 30 s)
+ def transcribe_audio(audio_path):
+     print("📝 Transcribing...")
+     result = transcriber(audio_path, return_timestamps=True)
+     return result["text"]
+
+ # ✅ Real accent classification
+ def detect_accent(wav_path):
+     print("🌍 Classifying accent...")
+     speech, sr = sf.read(wav_path)
+     inputs = accent_extractor(speech, sampling_rate=sr, return_tensors="pt", padding=True)
+     with torch.no_grad():
+         logits = accent_model(**inputs).logits
+     probs = torch.nn.functional.softmax(logits, dim=-1)
+     top = torch.argmax(probs, dim=-1).item()
+     return accent_labels[top], round(probs[0][top].item() * 100, 2)
+
+ # ✅ Speaker estimate — a rough heuristic that clusters pause-separated chunks by duration
+ def estimate_speakers(audio_path):
+     print("👥 Estimating speakers...")
+     sound = AudioSegment.from_wav(audio_path)
+     chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
+     if len(chunks) < 2:
+         return 1
+     durations = np.array([[len(c)] for c in chunks])
+     km = KMeans(n_clusters=min(3, len(chunks)), random_state=0).fit(durations)
+     return len(set(km.labels_))
+
+ # ✅ Fluency score — the share of the original audio that remains after silence trimming
+ def estimate_fluency(original_audio, trimmed_audio):
+     orig = AudioSegment.from_wav(original_audio)
+     trim = AudioSegment.from_wav(trimmed_audio)
+     return round(len(trim) / len(orig) * 100, 2)
+
+ # ✅ Report generation
+ def generate_report(transcript, speaker_count, fluency_score, accent, confidence):
+     timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+     content = f"""
+ 📝 AcuSpeak Report — {timestamp}
+
+ 📌 Estimated Number of Speakers: {speaker_count}
+ 🗣️ Fluency Score: {fluency_score}%
+ 🌍 Detected Accent: {accent} ({confidence}% confidence)
+
+ 📄 Transcript:
+ {transcript}
+ """
+     path = "assets/acuspeak_report.txt"
+     os.makedirs("assets", exist_ok=True)
+     with open(path, "w", encoding="utf-8") as f:
+         f.write(content.strip())
+     return path
+
+ # ✅ Main logic
+ def process_url(url):
+     try:
+         download_video(url)
+         extract_audio()
+         trimmed = trim_silence()
+         transcript = transcribe_audio(trimmed)
+         speaker_count = estimate_speakers("audio.wav")
+         fluency_score = estimate_fluency("audio.wav", trimmed)
+         accent, confidence = detect_accent(trimmed)
+         report = generate_report(transcript, speaker_count, fluency_score, accent, confidence)
+         return transcript, speaker_count, f"{fluency_score}%", f"{accent} ({confidence}%)", trimmed, report
+     except Exception as e:
+         print(f"❌ Error: {e}")
+         return f"❌ Error: {str(e)}", 0, "0%", "Unknown", None, None
+
+ # ✅ Gradio UI
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("## 🎙️ AcuSpeak — Accent, Fluency & Speaker Analysis Tool")
+     url_input = gr.Text(label="🔗 Video URL")
+     submit_btn = gr.Button("Analyze")
+
+     transcript_box = gr.Textbox(label="🗒️ Transcript", lines=5)
+     speaker_output = gr.Number(label="👥 Estimated Speakers")
+     fluency_output = gr.Text(label="🧠 Fluency Score")
+     accent_output = gr.Text(label="🌍 Detected Accent")
+     audio_player = gr.Audio(label="🎧 Trimmed Audio", type="filepath")
+     report_file = gr.File(label="📥 Download Report")
+
+     submit_btn.click(fn=process_url, inputs=[url_input],
+                      outputs=[transcript_box, speaker_output, fluency_output, accent_output, audio_player, report_file])
+
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio
+ transformers
+ torch  # imported directly by app.py; not pulled in automatically by transformers
+ pydub
+ yt-dlp
+ soundfile
+ scikit-learn