# acuspeak-demo / app.py
import os
import platform
from pydub import AudioSegment, silence
if platform.system() == "Windows":
    ffmpeg_path = r"C:\Program Files\ffmpeg\bin"
    AudioSegment.converter = os.path.join(ffmpeg_path, "ffmpeg.exe")
else:
    AudioSegment.converter = "ffmpeg"  # On Linux (Hugging Face), use system-installed ffmpeg
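# Note: on a Hugging Face Space, ffmpeg is assumed to be provided by the environment
# (e.g. declared in packages.txt); nothing in this file installs it.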
import torch
import yt_dlp
import subprocess
import soundfile as sf
from transformers import (
    pipeline,
    Wav2Vec2ForSequenceClassification,
    Wav2Vec2FeatureExtractor
)
import gradio as gr
import datetime
from sklearn.cluster import KMeans
import numpy as np
# ✅ Whisper ASR
print("Loading Whisper...")
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
# ✅ Accent model
print("Loading Accent Classifier (ylacombe)...")
accent_model_name = "ylacombe/accent-classifier"
accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(accent_model_name)
accent_extractor = Wav2Vec2FeatureExtractor.from_pretrained(accent_model_name)
accent_labels = accent_model.config.id2label
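# id2label maps class indices to human-readable accent names; detect_accent()
# below uses it to turn the argmax index into a label.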
# ✅ Download video
def download_video(url, output_path="video.mp4"):
print("📥 Downloading video...")
for f in ["video.mp4", "audio.wav", "trimmed.wav"]:
if os.path.exists(f):
os.remove(f)
ydl_opts = {"outtmpl": output_path, "format": "bestaudio/best", "quiet": True}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
ydl.download([url])
# ✅ Extract audio
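# Invokes ffmpeg (via the path configured above): -vn drops the video stream,
# pcm_s16le writes 16-bit PCM, -ar 16000 resamples to 16 kHz and -ac 1 downmixes
# to mono — the input format Whisper expects and the rate wav2vec2-based
# classifiers are typically trained on.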
def extract_audio(input_file="video.mp4", output_file="audio.wav"):
print("🎧 Extracting audio...")
subprocess.run([
AudioSegment.converter, "-i", input_file, "-vn",
"-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", output_file, "-y"
], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
# ✅ Trim silence
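# split_on_silence() cuts the recording wherever the level stays below -45 dBFS
# for at least 400 ms, then the non-silent chunks are concatenated back together.
# These thresholds are heuristics and may need tuning for noisy recordings.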
def trim_silence(input_audio="audio.wav", output_audio="trimmed.wav"):
print("🔇 Trimming silence...")
sound = AudioSegment.from_wav(input_audio)
chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
if not chunks:
return input_audio
combined = AudioSegment.empty()
for chunk in chunks:
combined += chunk
combined.export(output_audio, format="wav")
return output_audio
# ✅ Transcription
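# return_timestamps=True asks the ASR pipeline to return segment timestamps along
# with the transcription; only the plain "text" field is used here.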
def transcribe_audio(audio_path):
print("📝 Transcribing...")
result = transcriber(audio_path, return_timestamps=True)
return result["text"]
# ✅ Real accent classification
def detect_accent(wav_path):
print("🌍 Classifying accent...")
speech, sr = sf.read(wav_path)
inputs = accent_extractor(speech, sampling_rate=sr, return_tensors="pt", padding=True)
with torch.no_grad():
logits = accent_model(**inputs).logits
probs = torch.nn.functional.softmax(logits, dim=-1)
top = torch.argmax(probs, dim=-1).item()
return accent_labels[top], round(probs[0][top].item() * 100, 2)
# ✅ Speaker estimate
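# Rough heuristic, not real diarization: speech chunks are clustered by duration
# with KMeans (at most 3 clusters), and the number of distinct clusters is
# reported as the speaker count.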
def estimate_speakers(audio_path):
print("👥 Estimating speakers...")
sound = AudioSegment.from_wav(audio_path)
chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
if len(chunks) < 2:
return 1
durations = np.array([[len(c)] for c in chunks])
km = KMeans(n_clusters=min(3, len(chunks)), random_state=0).fit(durations)
return len(set(km.labels_))
# ✅ Fluency score
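# Fluency is approximated as the share of the recording that is actual speech:
# duration after silence trimming divided by the original duration, as a percentage.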
def estimate_fluency(original_audio, trimmed_audio):
    orig = AudioSegment.from_wav(original_audio)
    trim = AudioSegment.from_wav(trimmed_audio)
    return round(len(trim) / len(orig) * 100, 2)

# ✅ Report generation
def generate_report(transcript, speaker_count, fluency_score, accent, confidence):
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    content = f"""
📝 AcuSpeak Report — {timestamp}
📌 Estimated Number of Speakers: {speaker_count}
🗣️ Fluency Score: {fluency_score}%
🌍 Detected Accent: {accent} ({confidence}% confidence)
📄 Transcript:
{transcript}
"""
    path = "assets/acuspeak_report.txt"
    os.makedirs("assets", exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content.strip())
    return path

# ✅ Main logic
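# End-to-end pipeline: download -> extract audio -> trim silence -> transcribe ->
# estimate speakers -> score fluency -> classify accent -> write report.
# On any failure, placeholder values are returned so the UI still renders.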
def process_url(url):
    try:
        download_video(url)
        extract_audio()
        trimmed = trim_silence()
        transcript = transcribe_audio(trimmed)
        speaker_count = estimate_speakers("audio.wav")
        fluency_score = estimate_fluency("audio.wav", trimmed)
        accent, confidence = detect_accent(trimmed)
        report = generate_report(transcript, speaker_count, fluency_score, accent, confidence)
        return transcript, speaker_count, f"{fluency_score}%", f"{accent} ({confidence}%)", trimmed, report
    except Exception as e:
        print(f"❌ Error: {e}")
        return f"❌ Error: {str(e)}", 0, "0%", "Unknown", None, None

# ✅ Gradio UI
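# The six return values of process_url() map one-to-one onto the six output
# components wired up in submit_btn.click() below.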
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🎙️ AcuSpeak — Accent, Fluency & Speaker Analysis Tool")
    url_input = gr.Text(label="🔗 Video URL")
    submit_btn = gr.Button("Analyze")
    transcript_box = gr.Textbox(label="🗒️ Transcript", lines=5)
    speaker_output = gr.Number(label="👥 Estimated Speakers")
    fluency_output = gr.Text(label="🧠 Fluency Score")
    accent_output = gr.Text(label="🌍 Detected Accent")
    audio_player = gr.Audio(label="🎧 Trimmed Audio", type="filepath")
    report_file = gr.File(label="📥 Download Report")
    submit_btn.click(fn=process_url, inputs=[url_input],
                     outputs=[transcript_box, speaker_output, fluency_output, accent_output, audio_player, report_file])

demo.launch()
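# To run locally (a sketch, assuming ffmpeg is on PATH and the imports above are
# installed from PyPI):
#   pip install torch transformers gradio yt-dlp pydub soundfile scikit-learn numpy
#   python app.py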