# acuspeak-demo / app.py
import os
import platform
from pydub import AudioSegment, silence
if platform.system() == "Windows":
    ffmpeg_path = r"C:\Program Files\ffmpeg\bin"
    AudioSegment.converter = os.path.join(ffmpeg_path, "ffmpeg.exe")
else:
    AudioSegment.converter = "ffmpeg"  # On Linux (Hugging Face), use system-installed ffmpeg
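# Note: on a Hugging Face Space, ffmpeg is assumed to be provided by the environment
# (e.g. declared in packages.txt); nothing in this file installs it.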
import torch
import yt_dlp
import subprocess
import soundfile as sf
from transformers import (
    pipeline,
    Wav2Vec2ForSequenceClassification,
    Wav2Vec2FeatureExtractor
)
import gradio as gr
import datetime
from sklearn.cluster import KMeans
import numpy as np
# ✅ Whisper ASR
print("Loading Whisper...")
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
# ✅ Accent model
print("Loading Accent Classifier (ylacombe)...")
accent_model_name = "ylacombe/accent-classifier"
accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(accent_model_name)
accent_extractor = Wav2Vec2FeatureExtractor.from_pretrained(accent_model_name)
accent_labels = accent_model.config.id2label
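# id2label maps class indices to human-readable accent names; detect_accent()
# below uses it to turn the argmax index into a label.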
# ✅ Download video
def download_video(url, output_path="video.mp4"):
print("📥 Downloading video...")
for f in ["video.mp4", "audio.wav", "trimmed.wav"]:
if os.path.exists(f):
os.remove(f)
ydl_opts = {"outtmpl": output_path, "format": "bestaudio/best", "quiet": True}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
ydl.download([url])
# ✅ Extract audio
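# Invokes ffmpeg (via the path configured above): -vn drops the video stream,
# pcm_s16le writes 16-bit PCM, -ar 16000 resamples to 16 kHz and -ac 1 downmixes
# to mono — the input format Whisper expects and the rate wav2vec2-based
# classifiers are typically trained on.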
def extract_audio(input_file="video.mp4", output_file="audio.wav"):
print("🎧 Extracting audio...")
subprocess.run([
AudioSegment.converter, "-i", input_file, "-vn",
"-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", output_file, "-y"
], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
# ✅ Trim silence
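# split_on_silence() cuts the recording wherever the level stays below -45 dBFS
# for at least 400 ms, then the non-silent chunks are concatenated back together.
# These thresholds are heuristics and may need tuning for noisy recordings.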
def trim_silence(input_audio="audio.wav", output_audio="trimmed.wav"):
print("🔇 Trimming silence...")
sound = AudioSegment.from_wav(input_audio)
chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
if not chunks:
return input_audio
combined = AudioSegment.empty()
for chunk in chunks:
combined += chunk
combined.export(output_audio, format="wav")
return output_audio
# ✅ Transcription
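# return_timestamps=True asks the ASR pipeline to return segment timestamps along
# with the transcription; only the plain "text" field is used here.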
def transcribe_audio(audio_path):
print("📝 Transcribing...")
result = transcriber(audio_path, return_timestamps=True)
return result["text"]
# ✅ Real accent classification
def detect_accent(wav_path):
print("🌍 Classifying accent...")
speech, sr = sf.read(wav_path)
inputs = accent_extractor(speech, sampling_rate=sr, return_tensors="pt", padding=True)
with torch.no_grad():
logits = accent_model(**inputs).logits
probs = torch.nn.functional.softmax(logits, dim=-1)
top = torch.argmax(probs, dim=-1).item()
return accent_labels[top], round(probs[0][top].item() * 100, 2)
# ✅ Speaker estimate
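# Rough heuristic, not real diarization: speech chunks are clustered by duration
# with KMeans (at most 3 clusters), and the number of distinct clusters is
# reported as the speaker count.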
def estimate_speakers(audio_path):
print("👥 Estimating speakers...")
sound = AudioSegment.from_wav(audio_path)
chunks = silence.split_on_silence(sound, silence_thresh=-45, min_silence_len=400)
if len(chunks) < 2:
return 1
durations = np.array([[len(c)] for c in chunks])
km = KMeans(n_clusters=min(3, len(chunks)), random_state=0).fit(durations)
return len(set(km.labels_))
# ✅ Fluency score
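# Fluency is approximated as the share of the recording that is actual speech:
# duration after silence trimming divided by the original duration, as a percentage.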
def estimate_fluency(original_audio, trimmed_audio):
    orig = AudioSegment.from_wav(original_audio)
    trim = AudioSegment.from_wav(trimmed_audio)
    return round(len(trim) / len(orig) * 100, 2)

# ✅ Report generation
def generate_report(transcript, speaker_count, fluency_score, accent, confidence):
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    content = f"""
📝 AcuSpeak Report — {timestamp}
📌 Estimated Number of Speakers: {speaker_count}
🗣️ Fluency Score: {fluency_score}%
🌍 Detected Accent: {accent} ({confidence}% confidence)
📄 Transcript:
{transcript}
"""
    path = "assets/acuspeak_report.txt"
    os.makedirs("assets", exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content.strip())
    return path

# ✅ Main logic
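# End-to-end pipeline: download -> extract audio -> trim silence -> transcribe ->
# estimate speakers -> score fluency -> classify accent -> write report.
# On any failure, placeholder values are returned so the UI still renders.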
def process_url(url):
    try:
        download_video(url)
        extract_audio()
        trimmed = trim_silence()
        transcript = transcribe_audio(trimmed)
        speaker_count = estimate_speakers("audio.wav")
        fluency_score = estimate_fluency("audio.wav", trimmed)
        accent, confidence = detect_accent(trimmed)
        report = generate_report(transcript, speaker_count, fluency_score, accent, confidence)
        return transcript, speaker_count, f"{fluency_score}%", f"{accent} ({confidence}%)", trimmed, report
    except Exception as e:
        print(f"❌ Error: {e}")
        return f"❌ Error: {str(e)}", 0, "0%", "Unknown", None, None

# ✅ Gradio UI
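# The six return values of process_url() map one-to-one onto the six output
# components wired up in submit_btn.click() below.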
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🎙️ AcuSpeak — Accent, Fluency & Speaker Analysis Tool")
    url_input = gr.Text(label="🔗 Video URL")
    submit_btn = gr.Button("Analyze")
    transcript_box = gr.Textbox(label="🗒️ Transcript", lines=5)
    speaker_output = gr.Number(label="👥 Estimated Speakers")
    fluency_output = gr.Text(label="🧠 Fluency Score")
    accent_output = gr.Text(label="🌍 Detected Accent")
    audio_player = gr.Audio(label="🎧 Trimmed Audio", type="filepath")
    report_file = gr.File(label="📥 Download Report")
    submit_btn.click(fn=process_url, inputs=[url_input],
                     outputs=[transcript_box, speaker_output, fluency_output, accent_output, audio_player, report_file])

demo.launch()
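# To run locally (a sketch, assuming ffmpeg is on PATH and the imports above are
# installed from PyPI):
#   pip install torch transformers gradio yt-dlp pydub soundfile scikit-learn numpy
#   python app.py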