Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,9 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os, asyncio, json, tempfile, websockets, pdfplumber
|
| 2 |
import gradio as gr
|
| 3 |
import openai
|
| 4 |
from dotenv import load_dotenv
|
| 5 |
import numpy as np
|
| 6 |
import wave
|
|
|
|
|
|
|
| 7 |
|
| 8 |
# โโโ 0. ์ด๊ธฐํ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 9 |
load_dotenv()
|
|
@@ -11,6 +25,19 @@ openai.api_key = os.getenv("OPENAI_API_KEY")
|
|
| 11 |
if not openai.api_key:
|
| 12 |
raise RuntimeError("OPENAI_API_KEY ๊ฐ .env ์ ์์ต๋๋ค!")
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
LANG = ["Korean","English","Japanese","Chinese",
|
| 15 |
"Thai","Russian","Vietnamese","Spanish","French"]
|
| 16 |
VOICE = {l: ("nova" if l in ["Korean","Japanese","Chinese"] else "alloy")
|
|
@@ -63,27 +90,121 @@ def translate_pdf(file, src, tgt):
|
|
| 63 |
return text, asyncio.run(gpt_translate(text, src, tgt))
|
| 64 |
|
| 65 |
# โโโ 2-1. ์ค๋์ค ๋ฒ์ญ (ํญ1์ฉ) โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
async def translate_audio_async(file, src, tgt):
|
| 67 |
-
if not file: return "โ ๏ธ
|
| 68 |
|
| 69 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
# STT: Whisper API ์ฌ์ฉ
|
|
|
|
| 71 |
client = get_client()
|
| 72 |
-
with open(
|
| 73 |
transcript = await client.audio.transcriptions.create(
|
| 74 |
model="whisper-1",
|
| 75 |
file=audio_file,
|
| 76 |
language=src[:2].lower() # ์ธ์ด ์ฝ๋ ๊ฐ์ํ
|
| 77 |
)
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
orig_text = transcript.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
trans_text = await gpt_translate(orig_text, src, tgt)
|
|
|
|
|
|
|
|
|
|
| 81 |
audio_path = await gpt_tts(trans_text, tgt)
|
| 82 |
|
| 83 |
return orig_text, trans_text, audio_path
|
| 84 |
except Exception as e:
|
| 85 |
print(f"์ค๋์ค ๋ฒ์ญ ์ค๋ฅ: {e}")
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
def translate_audio(file, src, tgt):
|
| 89 |
return asyncio.run(translate_audio_async(file, src, tgt))
|
|
@@ -306,19 +427,83 @@ def realtime_four_sync(audio, src, state):
|
|
| 306 |
state["Thai"], state["Russian"], state)
|
| 307 |
|
| 308 |
# โโโ 5. UI โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 309 |
-
with gr.Blocks(title="SMARTok Demo") as demo:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
with gr.Tabs():
|
| 311 |
# ํญ 1 โ ์ค๋์ค ๋ฒ์ญ
|
| 312 |
-
with gr.TabItem("๐๏ธ
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
|
| 321 |
-
btn1.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
|
| 323 |
# ํญ 2 โ PDF ๋ฒ์ญ
|
| 324 |
with gr.TabItem("๐ PDF"):
|
|
|
|
| 1 |
+
# SMARTok Demo - real-time multilingual translation system
|
| 2 |
+
#
|
| 3 |
+
# Required packages:
|
| 4 |
+
# pip install gradio openai python-dotenv pdfplumber numpy websockets
|
| 5 |
+
#
|
| 6 |
+
# Optional packages (video processing):
|
| 7 |
+
# - Install ffmpeg: sudo apt-get install ffmpeg (Linux) / brew install ffmpeg (Mac)
|
| 8 |
+
# - or: pip install moviepy
|
| 9 |
+
#
|
| 10 |
+
# Environment variables:
|
| 11 |
+
# OPENAI_API_KEY must be set in the .env file
|
| 12 |
+
|
| 13 |
import os, asyncio, json, tempfile, websockets, pdfplumber
|
| 14 |
import gradio as gr
|
| 15 |
import openai
|
| 16 |
from dotenv import load_dotenv
|
| 17 |
import numpy as np
|
| 18 |
import wave
|
| 19 |
+
import subprocess
|
| 20 |
+
import mimetypes
|
| 21 |
|
| 22 |
# --- 0. Initialization -------------------------------------------
|
| 23 |
load_dotenv()
|
|
|
|
| 25 |
if not openai.api_key:
|
| 26 |
raise RuntimeError("OPENAI_API_KEY ๊ฐ .env ์ ์์ต๋๋ค!")
|
| 27 |
|
| 28 |
+
# Check whether ffmpeg is installed
|
| 29 |
+
def check_ffmpeg(executable="ffmpeg"):
    """Report whether an ffmpeg binary can be executed.

    Runs ``<executable> -version`` and treats a zero exit status as success.

    Args:
        executable: Name or path of the binary to probe. Defaults to
            ``"ffmpeg"``, resolved through ``PATH``.

    Returns:
        True when the command ran and exited with status 0, otherwise False.
    """
    try:
        subprocess.run([executable, '-version'], capture_output=True, check=True)
    except (OSError, subprocess.SubprocessError):
        # OSError: the binary is missing or cannot be executed.
        # SubprocessError: it ran but exited non-zero (CalledProcessError).
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return False
    return True
|
| 35 |
+
|
| 36 |
+
# Probe for ffmpeg once at import time; later code branches on this flag.
HAS_FFMPEG = check_ffmpeg()
if not HAS_FFMPEG:
    # Warn on stdout and show the install commands; the app keeps running.
    print(
        "โ ๏ธ ffmpeg๊ฐ ์ค์น๋์ด ์์ง ์์ต๋๋ค. ๋น๋์ค ์ฒ๋ฆฌ๊ฐ ์ ํ๋ ์ ์์ต๋๋ค.",
        "์ค์น ๋ฐฉ๋ฒ: sudo apt-get install ffmpeg (Linux) / brew install ffmpeg (Mac)",
        sep="\n",
    )
|
| 40 |
+
|
| 41 |
LANG = ["Korean","English","Japanese","Chinese",
|
| 42 |
"Thai","Russian","Vietnamese","Spanish","French"]
|
| 43 |
VOICE = {l: ("nova" if l in ["Korean","Japanese","Chinese"] else "alloy")
|
|
|
|
| 90 |
return text, asyncio.run(gpt_translate(text, src, tgt))
|
| 91 |
|
| 92 |
# --- 2-1. Audio translation (for tab 1) ---------------------------
|
| 93 |
+
def extract_audio_from_video(video_path):
    """Extract the audio track of a video file into a temporary WAV file.

    Tries ffmpeg first (when ``HAS_FFMPEG`` is true) and falls back to
    moviepy when ffmpeg is unavailable or exits with a non-zero status.
    Both paths request 16 kHz, 16-bit PCM output; the ffmpeg path also
    downmixes to mono.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Path of the newly created ``.wav`` file. The file is not deleted
        here; the caller owns it.

    Raises:
        Exception: If neither ffmpeg nor moviepy could produce the audio.
            The temporary file is removed before the error propagates.
    """
    audio_output = None
    try:
        # Reserve a temp file name, then close our handle so the external
        # tool can write to the path.
        audio_output = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        audio_output.close()

        # Method 1: ffmpeg
        if HAS_FFMPEG:
            cmd = [
                'ffmpeg',
                '-i', video_path,
                '-vn',                   # drop the video stream
                '-acodec', 'pcm_s16le',  # 16-bit PCM (WAV)
                '-ar', '16000',          # 16 kHz sample rate
                '-ac', '1',              # mono
                '-y',                    # overwrite the output file
                audio_output.name,
            ]

            # errors="replace": ffmpeg may echo metadata that is not valid in
            # the locale encoding; that must not abort the extraction.
            result = subprocess.run(
                cmd, capture_output=True, text=True, errors="replace"
            )

            if result.returncode == 0:
                return audio_output.name
            # Fall through to moviepy after logging the ffmpeg failure.
            print(f"ffmpeg ์ค๋ฅ: {result.stderr}")

        # Method 2: moviepy
        try:
            # NOTE(review): `moviepy.editor` and the `verbose` argument look
            # specific to moviepy 1.x - confirm against the installed version.
            from moviepy.editor import VideoFileClip
            print("moviepy๋ฅผ ์ฌ์ฉํ์ฌ ์ค๋์ค ์ถ์ถ ์ค...")
            video = VideoFileClip(video_path)
            try:
                if video.audio is None:
                    # Fail with a clear reason instead of an AttributeError.
                    raise ValueError("no audio track found in video")
                video.audio.write_audiofile(
                    audio_output.name,
                    fps=16000,
                    nbytes=2,
                    codec='pcm_s16le',
                    verbose=False,
                    logger=None,
                )
            finally:
                # Always release the clip, even when writing fails.
                video.close()
            return audio_output.name
        except ImportError as e:
            raise Exception(
                "๋น๋์ค ์ฒ๋ฆฌ๋ฅผ ์ํด ffmpeg ๋๋ moviepy๊ฐ ํ์ํฉ๋๋ค.\n"
                "์ค์น: pip install moviepy ๋๋ ffmpeg ์ค์น"
            ) from e
        except Exception as e:
            raise Exception(f"moviepy ์ค๋ฅ: {str(e)}") from e

    except Exception:
        # Remove the temp file on failure, then re-raise with the original
        # traceback intact.
        if audio_output and os.path.exists(audio_output.name):
            os.unlink(audio_output.name)
        raise
|
| 149 |
+
|
| 150 |
async def translate_audio_async(file, src, tgt):
    """Transcribe an audio/video file, translate the text, and synthesize speech.

    Video inputs (detected from the file name's MIME type) first have their
    audio extracted with ``extract_audio_from_video``. The audio is then
    transcribed with the ``whisper-1`` model, translated with
    ``gpt_translate`` and voiced with ``gpt_tts``.

    Args:
        file: Path of the uploaded or recorded media file; falsy when no
            input was given.
        src: Source language name (e.g. ``"Korean"``).
        tgt: Target language name.

    Returns:
        A 3-tuple ``(original_text, translated_text, audio_path)``. When
        there is no input or no speech is recognized, the first item is a
        warning and the others are ``""`` and ``None``. On any exception the
        tuple is ``(error label, error message, None)``; nothing is raised.
    """
    if not file:
        return "โ ๏ธ ์ค๋์ค/๋น๋์ค ์๋ก๋ ํ์", "", None

    # ISO-639-1 codes for the transcription API's `language` field. Taking
    # the first two letters of the English name is wrong for Chinese ("ch")
    # and Spanish ("sp").
    lang_codes = {
        "Korean": "ko", "English": "en", "Japanese": "ja", "Chinese": "zh",
        "Thai": "th", "Russian": "ru", "Vietnamese": "vi", "Spanish": "es",
        "French": "fr",
    }

    temp_audio_path = None
    try:
        # Determine the file type from its name.
        mime_type, _ = mimetypes.guess_type(file)
        audio_file_path = file

        # Video input: extract the audio track to a temporary WAV first.
        if mime_type and mime_type.startswith('video/'):
            print(f"๋น๋์ค ํ์ผ ๊ฐ์ง: {mime_type}")
            print(f"ํ์ผ ํฌ๊ธฐ: {os.path.getsize(file) / 1024 / 1024:.1f} MB")
            print("๋น๋์ค์์ ์ค๋์ค ์ถ์ถ ์ค... (์๊ฐ์ด ๊ฑธ๋ฆด ์ ์์ต๋๋ค)")
            temp_audio_path = extract_audio_from_video(file)
            audio_file_path = temp_audio_path
            print("์ค๋์ค ์ถ์ถ ์๋ฃ!")

        # STT via the Whisper API.
        print("์์ฑ ์ธ์ ์ค...")
        client = get_client()
        with open(audio_file_path, 'rb') as audio_file:
            transcript = await client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                # Names missing from the table keep the previous behavior.
                language=lang_codes.get(src, src[:2].lower()),
            )

        orig_text = transcript.text
        if not orig_text.strip():
            return "โ ๏ธ ์์ฑ์ด ๊ฐ์ง๋์ง ์์์ต๋๋ค", "", None

        print(f"์ธ์๋ ํ์คํธ: {orig_text[:50]}...")

        # Translation
        print(f"{src} โ {tgt} ๋ฒ์ญ ์ค...")
        trans_text = await gpt_translate(orig_text, src, tgt)

        # TTS
        print("์์ฑ ํฉ์ฑ ์ค...")
        audio_path = await gpt_tts(trans_text, tgt)

        return orig_text, trans_text, audio_path
    except Exception as e:
        print(f"์ค๋์ค ๋ฒ์ญ ์ค๋ฅ: {e}")

        error_msg = str(e)
        if "ffmpeg" in error_msg.lower():
            error_msg += "\n\n๐ก ํด๊ฒฐ ๋ฐฉ๋ฒ:\n1. ffmpeg ์ค์น: sudo apt-get install ffmpeg\n2. ๋๋ pip install moviepy"

        return "โ ๏ธ ๋ฒ์ญ ์ค ์ค๋ฅ ๋ฐ์", error_msg, None
    finally:
        # Single cleanup point for the extracted audio; runs on every exit path.
        if temp_audio_path and os.path.exists(temp_audio_path):
            os.unlink(temp_audio_path)
|
| 208 |
|
| 209 |
def translate_audio(file, src, tgt):
    """Synchronous wrapper: run ``translate_audio_async`` to completion.

    Returns whatever the coroutine returns for the same arguments.
    """
    pending = translate_audio_async(file, src, tgt)
    return asyncio.run(pending)
|
|
|
|
| 427 |
state["Thai"], state["Russian"], state)
|
| 428 |
|
| 429 |
# --- 5. UI -------------------------------------------------------
|
| 430 |
+
with gr.Blocks(title="SMARTok Demo", theme=gr.themes.Soft()) as demo:
|
| 431 |
+
gr.Markdown(
|
| 432 |
+
"""
|
| 433 |
+
# ๐ SMARTok ์ค์๊ฐ ๋ฒ์ญ ์์คํ
|
| 434 |
+
|
| 435 |
+
๋ค๊ตญ์ด ์ค์๊ฐ ๋ฒ์ญ์ ์ง์ํ๋ ํตํฉ ๋ฒ์ญ ํ๋ซํผ
|
| 436 |
+
"""
|
| 437 |
+
)
|
| 438 |
+
|
| 439 |
with gr.Tabs():
|
| 440 |
# ํญ 1 โ ์ค๋์ค ๋ฒ์ญ
|
| 441 |
+
with gr.TabItem("๐๏ธ ์ค๋์ค/๋น๋์ค"):
|
| 442 |
+
gr.Markdown("### ๐ ์ค๋์ค/๋น๋์ค ํ์ผ ๋ฒ์ญ")
|
| 443 |
+
|
| 444 |
+
with gr.Row():
|
| 445 |
+
src1 = gr.Dropdown(LANG, value="Korean", label="์
๋ ฅ ์ธ์ด")
|
| 446 |
+
tgt1 = gr.Dropdown(LANG, value="English", label="์ถ๋ ฅ ์ธ์ด")
|
| 447 |
+
|
| 448 |
+
with gr.Tabs():
|
| 449 |
+
with gr.TabItem("๐ ํ์ผ ์
๋ก๋"):
|
| 450 |
+
# ํ์ผ ์
๋ก๋ - ์ค๋์ค์ ๋น๋์ค ๋ชจ๋ ์ง์
|
| 451 |
+
aud1_file = gr.File(
|
| 452 |
+
label="์ค๋์ค/๋น๋์ค ํ์ผ ์
๋ก๋",
|
| 453 |
+
file_types=[".mp3", ".wav", ".m4a", ".flac", ".ogg", ".opus",
|
| 454 |
+
".mp4", ".avi", ".mov", ".mkv", ".webm", ".flv"],
|
| 455 |
+
type="filepath"
|
| 456 |
+
)
|
| 457 |
+
gr.Markdown(
|
| 458 |
+
"๐ **์ง์ ํ์**\n"
|
| 459 |
+
"- ์ค๋์ค: MP3, WAV, M4A, FLAC, OGG, OPUS\n"
|
| 460 |
+
"- ๋น๋์ค: MP4, AVI, MOV, MKV, WebM, FLV\n\n"
|
| 461 |
+
"โ ๏ธ **์ฃผ์์ฌํญ**\n"
|
| 462 |
+
"- ๋น๋์ค ํ์ผ์ ์ค๋์ค ์ถ์ถ ์๊ฐ์ด ํ์ํฉ๋๋ค\n"
|
| 463 |
+
"- ๋์ฉ๋ ํ์ผ์ ์ฒ๋ฆฌ ์๊ฐ์ด ์ค๋ ๊ฑธ๋ฆด ์ ์์ต๋๋ค"
|
| 464 |
+
)
|
| 465 |
+
|
| 466 |
+
with gr.TabItem("๐ค ๋ง์ดํฌ ๋
น์"):
|
| 467 |
+
aud1_mic = gr.Audio(
|
| 468 |
+
sources=["microphone"],
|
| 469 |
+
type="filepath",
|
| 470 |
+
label="๋ง์ดํฌ ๋
น์"
|
| 471 |
+
)
|
| 472 |
+
gr.Markdown("๐ก **ํ**: ๋
น์ ํ '์ ์ง' ๋ฒํผ์ ๋๋ฌ์ฃผ์ธ์")
|
| 473 |
|
| 474 |
+
btn1 = gr.Button("๐ ๋ฒ์ญ ์์", variant="primary", size="lg")
|
| 475 |
+
|
| 476 |
+
# ์งํ ์ํ ํ์
|
| 477 |
+
status1 = gr.Textbox(label="์งํ ์ํ", value="๋๊ธฐ ์ค...", interactive=False)
|
| 478 |
+
|
| 479 |
+
with gr.Row():
|
| 480 |
+
with gr.Column():
|
| 481 |
+
o1 = gr.Textbox(label="๐ ์๋ฌธ", lines=6)
|
| 482 |
+
with gr.Column():
|
| 483 |
+
t1 = gr.Textbox(label="๐ ๋ฒ์ญ", lines=6)
|
| 484 |
+
|
| 485 |
+
a1 = gr.Audio(label="๐ ๋ฒ์ญ๋ ์์ฑ (TTS)", type="filepath", autoplay=True)
|
| 486 |
+
|
| 487 |
+
# ํ์ผ๏ฟฝ๏ฟฝ๏ฟฝ๋ ๋ง์ดํฌ ์ค ํ์ฑํ๋ ์
๋ ฅ ์ฌ์ฉ
|
| 488 |
+
def translate_with_status(file_input, mic_input, src, tgt):
|
| 489 |
+
active_input = file_input if file_input else mic_input
|
| 490 |
+
if not active_input:
|
| 491 |
+
return "โ ๏ธ ํ์ผ์ ์
๋ก๋ํ๊ฑฐ๋ ๋
น์์ ํด์ฃผ์ธ์", "", None
|
| 492 |
+
|
| 493 |
+
# ์ํ ์
๋ฐ์ดํธ๋ ๋๊ธฐ ํจ์์์ ์ฒ๋ฆฌ
|
| 494 |
+
return translate_audio(active_input, src, tgt)
|
| 495 |
+
|
| 496 |
+
btn1.click(
|
| 497 |
+
lambda: "์ฒ๋ฆฌ ์ค... ์ ์๋ง ๊ธฐ๋ค๋ ค์ฃผ์ธ์ โณ",
|
| 498 |
+
outputs=status1
|
| 499 |
+
).then(
|
| 500 |
+
translate_with_status,
|
| 501 |
+
[aud1_file, aud1_mic, src1, tgt1],
|
| 502 |
+
[o1, t1, a1]
|
| 503 |
+
).then(
|
| 504 |
+
lambda: "โ
์๋ฃ!",
|
| 505 |
+
outputs=status1
|
| 506 |
+
)
|
| 507 |
|
| 508 |
# ํญ 2 โ PDF ๋ฒ์ญ
|
| 509 |
with gr.TabItem("๐ PDF"):
|