Draxgabe committed on
Commit 468e24d · verified · 1 Parent(s): 0d0b41e

Update app.py

Files changed (1)
  1. app.py +153 -151
app.py CHANGED
@@ -1,151 +1,153 @@
-import os
-# ✅ FFmpeg setup
-ffmpeg_path = r"C:\Program Files\ffmpeg\bin"
-if ffmpeg_path not in os.environ["PATH"]:
-    os.environ["PATH"] = ffmpeg_path + os.pathsep + os.environ["PATH"]
-import torch
-import yt_dlp
-import subprocess
-from pydub import AudioSegment, silence
-import soundfile as sf
-from transformers import (
-    pipeline,
-    Wav2Vec2ForSequenceClassification,
-    Wav2Vec2FeatureExtractor
-)
-import gradio as gr
-import datetime
-from sklearn.cluster import KMeans
-import numpy as np
-
-
-AudioSegment.converter = os.path.join(ffmpeg_path, "ffmpeg.exe")
-
-# ✅ Whisper ASR
-print("Loading Whisper...")
-transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")
-
-# Accent model
-print("Loading Accent Classifier (ylacombe)...")
-accent_model_name = "ylacombe/accent-classifier"
-accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(accent_model_name)
-accent_extractor = Wav2Vec2FeatureExtractor.from_pretrained(accent_model_name)
-accent_labels = accent_model.config.id2label
-
-# Download video
-def download_video(url, output_path="video.mp4"):
-    print("📥 Downloading video...")
-    for f in ["video.mp4", "audio.wav", "trimmed.wav"]:
-        if os.path.exists(f):
-            os.remove(f)
-    ydl_opts = {"outtmpl": output_path, "format": "bestaudio/best", "quiet": True}
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        ydl.download([url])
-
-# ✅ Extract audio
-def extract_audio(input_file="video.mp4", output_file="audio.wav"):
-    print("🎧 Extracting audio...")
-    subprocess.run([
-        AudioSegment.converter, "-i", input_file, "-vn",
-        "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", output_file, "-y"
-    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-
-# Trim silence
-def trim_silence(input_audio="audio.wav", output_audio="trimmed.wav"):
-    print("🔇 Trimming silence...")
-    sound = AudioSegment.from_wav(input_audio)
-    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
-    if not chunks:
-        return input_audio
-    combined = AudioSegment.empty()
-    for chunk in chunks:
-        combined += chunk
-    combined.export(output_audio, format="wav")
-    return output_audio
-
-# ✅ Transcription
-def transcribe_audio(audio_path):
-    print("📝 Transcribing...")
-    result = transcriber(audio_path, return_timestamps=True)
-    return result["text"]
-
-# ✅ Real accent classification
-def detect_accent(wav_path):
-    print("🌍 Classifying accent...")
-    speech, sr = sf.read(wav_path)
-    inputs = accent_extractor(speech, sampling_rate=sr, return_tensors="pt", padding=True)
-    with torch.no_grad():
-        logits = accent_model(**inputs).logits
-    probs = torch.nn.functional.softmax(logits, dim=-1)
-    top = torch.argmax(probs, dim=-1).item()
-    return accent_labels[top], round(probs[0][top].item() * 100, 2)
-
-# Speaker estimate
-def estimate_speakers(audio_path):
-    print("👥 Estimating speakers...")
-    sound = AudioSegment.from_wav(audio_path)
-    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
-    if len(chunks) < 2:
-        return 1
-    durations = np.array([[len(c)] for c in chunks])
-    km = KMeans(n_clusters=min(3, len(chunks)), random_state=0).fit(durations)
-    return len(set(km.labels_))
-
-# ✅ Fluency score
-def estimate_fluency(original_audio, trimmed_audio):
-    orig = AudioSegment.from_wav(original_audio)
-    trim = AudioSegment.from_wav(trimmed_audio)
-    return round(len(trim) / len(orig) * 100, 2)
-
-# Report generation
-def generate_report(transcript, speaker_count, fluency_score, accent, confidence):
-    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    content = f"""
-📝 AcuSpeak Report — {timestamp}
-
-📌 Estimated Number of Speakers: {speaker_count}
-🗣️ Fluency Score: {fluency_score}%
-🌍 Detected Accent: {accent} ({confidence}% confidence)
-
-📄 Transcript:
-{transcript}
-"""
-    path = "assets/acuspeak_report.txt"
-    os.makedirs("assets", exist_ok=True)
-    with open(path, "w", encoding="utf-8") as f:
-        f.write(content.strip())
-    return path
-
-# ✅ Main logic
-def process_url(url):
-    try:
-        download_video(url)
-        extract_audio()
-        trimmed = trim_silence()
-        transcript = transcribe_audio(trimmed)
-        speaker_count = estimate_speakers("audio.wav")
-        fluency_score = estimate_fluency("audio.wav", trimmed)
-        accent, confidence = detect_accent(trimmed)
-        report = generate_report(transcript, speaker_count, fluency_score, accent, confidence)
-        return transcript, speaker_count, f"{fluency_score}%", f"{accent} ({confidence}%)", trimmed, report
-    except Exception as e:
-        print(f" Error: {e}")
-        return f"❌ Error: {str(e)}", 0, "0%", "Unknown", None, None
-
-# Gradio UI
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## 🎙️ AcuSpeak — Accent, Fluency & Speaker Analysis Tool")
-    url_input = gr.Text(label="🔗 Video URL")
-    submit_btn = gr.Button("Analyze")
-
-    transcript_box = gr.Textbox(label="🗒️ Transcript", lines=5)
-    speaker_output = gr.Number(label="👥 Estimated Speakers")
-    fluency_output = gr.Text(label="🧠 Fluency Score")
-    accent_output = gr.Text(label="🌍 Detected Accent")
-    audio_player = gr.Audio(label="🎧 Trimmed Audio", type="filepath")
-    report_file = gr.File(label="📥 Download Report")
-
-    submit_btn.click(fn=process_url, inputs=[url_input],
-                     outputs=[transcript_box, speaker_output, fluency_output, accent_output, audio_player, report_file])
-
-demo.launch()
+import os
+import platform
+from pydub import AudioSegment, silence
+if platform.system() == "Windows":
+    ffmpeg_path = r"C:\Program Files\ffmpeg\bin"
+    AudioSegment.converter = os.path.join(ffmpeg_path, "ffmpeg.exe")
+else:
+    AudioSegment.converter = "ffmpeg"  # On Linux (Hugging Face), use system-installed ffmpeg
+
+import torch
+import yt_dlp
+import subprocess
+
+import soundfile as sf
+from transformers import (
+    pipeline,
+    Wav2Vec2ForSequenceClassification,
+    Wav2Vec2FeatureExtractor
+)
+import gradio as gr
+import datetime
+from sklearn.cluster import KMeans
+import numpy as np
+
+
+# Whisper ASR
+print("Loading Whisper...")
+transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")
+
+# Accent model
+print("Loading Accent Classifier (ylacombe)...")
+accent_model_name = "ylacombe/accent-classifier"
+accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(accent_model_name)
+accent_extractor = Wav2Vec2FeatureExtractor.from_pretrained(accent_model_name)
+accent_labels = accent_model.config.id2label
+
+# Download video
+def download_video(url, output_path="video.mp4"):
+    print("📥 Downloading video...")
+    for f in ["video.mp4", "audio.wav", "trimmed.wav"]:
+        if os.path.exists(f):
+            os.remove(f)
+    ydl_opts = {"outtmpl": output_path, "format": "bestaudio/best", "quiet": True}
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        ydl.download([url])
+
+# Extract audio
+def extract_audio(input_file="video.mp4", output_file="audio.wav"):
+    print("🎧 Extracting audio...")
+    subprocess.run([
+        AudioSegment.converter, "-i", input_file, "-vn",
+        "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", output_file, "-y"
+    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+# Trim silence
+def trim_silence(input_audio="audio.wav", output_audio="trimmed.wav"):
+    print("🔇 Trimming silence...")
+    sound = AudioSegment.from_wav(input_audio)
+    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
+    if not chunks:
+        return input_audio
+    combined = AudioSegment.empty()
+    for chunk in chunks:
+        combined += chunk
+    combined.export(output_audio, format="wav")
+    return output_audio
+
+# ✅ Transcription
+def transcribe_audio(audio_path):
+    print("📝 Transcribing...")
+    result = transcriber(audio_path, return_timestamps=True)
+    return result["text"]
+
+# Real accent classification
+def detect_accent(wav_path):
+    print("🌍 Classifying accent...")
+    speech, sr = sf.read(wav_path)
+    inputs = accent_extractor(speech, sampling_rate=sr, return_tensors="pt", padding=True)
+    with torch.no_grad():
+        logits = accent_model(**inputs).logits
+    probs = torch.nn.functional.softmax(logits, dim=-1)
+    top = torch.argmax(probs, dim=-1).item()
+    return accent_labels[top], round(probs[0][top].item() * 100, 2)
+
+# Speaker estimate
+def estimate_speakers(audio_path):
+    print("👥 Estimating speakers...")
+    sound = AudioSegment.from_wav(audio_path)
+    chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
+    if len(chunks) < 2:
+        return 1
+    durations = np.array([[len(c)] for c in chunks])
+    km = KMeans(n_clusters=min(3, len(chunks)), random_state=0).fit(durations)
+    return len(set(km.labels_))
+
+# Fluency score
+def estimate_fluency(original_audio, trimmed_audio):
+    orig = AudioSegment.from_wav(original_audio)
+    trim = AudioSegment.from_wav(trimmed_audio)
+    return round(len(trim) / len(orig) * 100, 2)
+
+# Report generation
+def generate_report(transcript, speaker_count, fluency_score, accent, confidence):
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    content = f"""
+📝 AcuSpeak Report {timestamp}
+
+📌 Estimated Number of Speakers: {speaker_count}
+🗣️ Fluency Score: {fluency_score}%
+🌍 Detected Accent: {accent} ({confidence}% confidence)
+
+📄 Transcript:
+{transcript}
+"""
+    path = "assets/acuspeak_report.txt"
+    os.makedirs("assets", exist_ok=True)
+    with open(path, "w", encoding="utf-8") as f:
+        f.write(content.strip())
+    return path
+
+# ✅ Main logic
+def process_url(url):
+    try:
+        download_video(url)
+        extract_audio()
+        trimmed = trim_silence()
+        transcript = transcribe_audio(trimmed)
+        speaker_count = estimate_speakers("audio.wav")
+        fluency_score = estimate_fluency("audio.wav", trimmed)
+        accent, confidence = detect_accent(trimmed)
+        report = generate_report(transcript, speaker_count, fluency_score, accent, confidence)
+        return transcript, speaker_count, f"{fluency_score}%", f"{accent} ({confidence}%)", trimmed, report
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        return f"❌ Error: {str(e)}", 0, "0%", "Unknown", None, None
+
+# Gradio UI
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## 🎙️ AcuSpeak — Accent, Fluency & Speaker Analysis Tool")
+    url_input = gr.Text(label="🔗 Video URL")
+    submit_btn = gr.Button("Analyze")
+
+    transcript_box = gr.Textbox(label="🗒️ Transcript", lines=5)
+    speaker_output = gr.Number(label="👥 Estimated Speakers")
+    fluency_output = gr.Text(label="🧠 Fluency Score")
+    accent_output = gr.Text(label="🌍 Detected Accent")
+    audio_player = gr.Audio(label="🎧 Trimmed Audio", type="filepath")
+    report_file = gr.File(label="📥 Download Report")
+
+    submit_btn.click(fn=process_url, inputs=[url_input],
+                     outputs=[transcript_box, speaker_output, fluency_output, accent_output, audio_player, report_file])
+
+demo.launch()