Spaces:

datbkpro
/

voicebot

Running

App Files Files Community

datbkpro committed on Oct 15

Commit

cd6507a

1 Parent(s): f66eb30

Deploy Gradio VoiceBot lên Hugging Face

Browse files

Files changed (2) hide show

app.py +1211 -0
requirements.txt +11 -0

app.py ADDED Viewed

	@@ -0,0 +1,1211 @@

+import gradio as gr
+import groq
+import os
+import io
+import numpy as np
+import soundfile as sf
+from PIL import Image
+from dotenv import load_dotenv
+import pandas as pd
+import json
+from typing import List, Dict
+from sentence_transformers import SentenceTransformer
+import faiss
+import time
+import re
+from gtts import gTTS
+import edge_tts
+import asyncio
+# load_dotenv() is commented out, so the key is read only from the process
+# environment (no .env file is loaded here).
+# //load_dotenv()
+api_key = os.getenv("GROQ_API_KEY")
+# Equivalent lookup, kept for reference:
+# api_key = os.environ.get("GROQ_API_KEY")
+# Fail fast at import time when the key is missing or empty.
+if not api_key:
+    raise ValueError("Please set the GROQ_API_KEY environment variable.")
+# Groq client used by the transcription and chat-completion calls in this file.
+client = groq.Client(api_key=api_key)
+# Load the Vietnamese sentence-embedding model once, at import time.
+print("🔄 Đang tải mô hình embedding tiếng Việt...")
+try:
+    vietnamese_embedder = SentenceTransformer('keepitreal/vietnamese-sbert')
+    print("✅ Đã tải mô hình embedding tiếng Việt")
+except Exception as e:
+    print(f"❌ Lỗi tải mô hình embedding: {e}")
+    # Leave the embedder unset; EnhancedRAGSystem checks for None before encoding.
+    vietnamese_embedder = None
+# Enhanced RAG system with Vietnamese embeddings
+class EnhancedRAGSystem:
+    def __init__(self):
+        self.documents = []
+        self.metadatas = []
+        self.embeddings = None
+        self.index = None
+        self.dimension = 384  # Dimension for Vietnamese SBERT
+        print("✅ Đã khởi tạo Enhanced RAG system với embedding tiếng Việt")
+        # Initialize sample nutrition data in Vietnamese
+        self._initialize_sample_data()
+    def _initialize_sample_data(self):
+        """Khởi tạo dữ liệu dinh dưỡng mẫu bằng tiếng Việt"""
+        nutrition_data = [
+            "Chế độ ăn Địa Trung Hải giàu rau củ, trái cây, ngũ cốc nguyên hạt và dầu olive tốt cho tim mạch",
+            "Protein từ thịt gà, cá hồi và đậu phụ giúp xây dựng cơ bắp và duy trì sức khỏe",
+            "Trái cây họ cam quýt như cam, bưởi cung cấp vitamin C tăng cường hệ miễn dịch",
+            "Rau xanh như cải bó xôi, bông cải xanh chứa nhiều chất xơ và vitamin K",
+            "Cá hồi giàu omega-3 tốt cho não bộ và sức khỏe tim mạch",
+            "Các loại hạt như hạnh nhân, óc chó cung cấp chất béo lành mạnh và protein",
+            "Sữa chua Hy Lạp chứa probiotic tốt cho hệ tiêu hóa và giàu protein",
+            "Gạo lứt và yến mạch là nguồn carbohydrate phức tạp cung cấp năng lượng lâu dài"
+        ]
+        self.add_documents(nutrition_data, [{"type": "nutrition", "source": "sample", "language": "vi"}] * len(nutrition_data))
+    def add_documents(self, documents: List[str], metadatas: List[Dict] = None):
+        """Thêm documents vào database với embedding"""
+        if not documents:
+            return
+        # Generate embeddings for new documents
+        if vietnamese_embedder is not None:
+            try:
+                new_embeddings = vietnamese_embedder.encode(documents)
+                if self.embeddings is None:
+                    self.embeddings = new_embeddings
+                else:
+                    self.embeddings = np.vstack([self.embeddings, new_embeddings])
+                # Update FAISS index
+                self._update_faiss_index()
+            except Exception as e:
+                print(f"❌ Lỗi tạo embedding: {e}")
+        self.documents.extend(documents)
+        self.metadatas.extend(metadatas or [{}] * len(documents))
+        print(f"✅ Đã thêm {len(documents)} documents vào RAG database với embedding")
+    def _update_faiss_index(self):
+        """Cập nhật FAISS index với embeddings hiện tại"""
+        if self.embeddings is None or len(self.embeddings) == 0:
+            return
+        try:
+            dimension = self.embeddings.shape[1]
+            self.index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity
+            self.index.add(self.embeddings.astype(np.float32))
+        except Exception as e:
+            print(f"❌ Lỗi cập nhật FAISS index: {e}")
+    def semantic_search(self, query: str, top_k: int = 3) -> List[Dict]:
+        """Tìm kiếm ngữ nghĩa sử dụng embedding tiếng Việt"""
+        if not self.documents or self.index is None:
+            return self._fallback_keyword_search(query, top_k)
+        try:
+            # Encode query using Vietnamese embedder
+            query_embedding = vietnamese_embedder.encode([query])
+            # Search in FAISS index
+            similarities, indices = self.index.search(query_embedding.astype(np.float32), min(top_k, len(self.documents)))
+            results = []
+            for i, (similarity, idx) in enumerate(zip(similarities[0], indices[0])):
+                if idx < len(self.documents):
+                    results.append({
+                        'id': str(idx),
+                        'text': self.documents[idx],
+                        'similarity': float(similarity),
+                        'metadata': self.metadatas[idx] if idx < len(self.metadatas) else {}
+                    })
+            return results
+        except Exception as e:
+            print(f"❌ Lỗi tìm kiếm ngữ nghĩa: {e}")
+            return self._fallback_keyword_search(query, top_k)
+    def _fallback_keyword_search(self, query: str, top_k: int = 3) -> List[Dict]:
+        """Tìm kiếm dự phòng dựa trên từ khóa"""
+        query_lower = query.lower()
+        results = []
+        for i, doc in enumerate(self.documents):
+            score = 0
+            for word in query_lower.split():
+                if word in doc.lower():
+                    score += 1
+            if score > 0:
+                results.append({
+                    'id': str(i),
+                    'text': doc,
+                    'similarity': min(score / 5, 1.0),
+                    'metadata': self.metadatas[i] if i < len(self.metadatas) else {}
+                })
+        results.sort(key=lambda x: x['similarity'], reverse=True)
+        return results[:top_k]
+    def get_collection_stats(self) -> Dict:
+        """Lấy thống kê collection"""
+        return {
+            'count': len(self.documents),
+            'embedding_count': len(self.embeddings) if self.embeddings is not None else 0,
+            'name': 'enhanced_rag_vi',
+            'status': 'active',
+            'has_embeddings': self.embeddings is not None
+        }
+class WikipediaProcessor:
+    def __init__(self):
+        self.supported_formats = ['.txt', '.csv', '.json']
+    def process_uploaded_file(self, file_path: str) -> List[str]:
+        """Xử lý file Wikipedia uploaded"""
+        file_ext = os.path.splitext(file_path)[1].lower()
+        try:
+            if file_ext == '.txt':
+                return self._process_txt_file(file_path)
+            elif file_ext == '.csv':
+                return self._process_csv_file(file_path)
+            elif file_ext == '.json':
+                return self._process_json_file(file_path)
+            else:
+                raise ValueError(f"Định dạng file không được hỗ trợ: {file_ext}")
+        except Exception as e:
+            raise Exception(f"Lỗi xử lý file: {str(e)}")
+    def _process_txt_file(self, file_path: str) -> List[str]:
+        """Xử lý file text"""
+        with open(file_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        paragraphs = [p.strip() for p in content.split('\n\n') if p.strip() and len(p.strip()) > 20]
+        return paragraphs
+    def _process_csv_file(self, file_path: str) -> List[str]:
+        """Xử lý file CSV"""
+        try:
+            df = pd.read_csv(file_path)
+            documents = []
+            for _, row in df.iterrows():
+                doc_parts = []
+                for col in df.columns:
+                    if pd.notna(row[col]) and str(row[col]).strip():
+                        doc_parts.append(f"{col}: {row[col]}")
+                if doc_parts:
+                    documents.append(" | ".join(doc_parts))
+            return documents
+        except Exception as e:
+            raise Exception(f"Lỗi đọc CSV: {str(e)}")
+    def _process_json_file(self, file_path: str) -> List[str]:
+        """Xử lý file JSON"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            documents = []
+            def extract_text(obj, current_path=""):
+                if isinstance(obj, dict):
+                    for key, value in obj.items():
+                        extract_text(value, f"{current_path}.{key}" if current_path else key)
+                elif isinstance(obj, list):
+                    for item in obj:
+                        extract_text(item, current_path)
+                elif isinstance(obj, str) and len(obj.strip()) > 10:
+                    documents.append(f"{current_path}: {obj.strip()}")
+            extract_text(data)
+            return documents
+        except Exception as e:
+            raise Exception(f"Lỗi đọc JSON: {str(e)}")
+# Enhanced TTS Service with multiple providers and chunking
+class EnhancedTTSService:
+    def __init__(self):
+        self.supported_languages = {
+            'vi': 'vi',  # Vietnamese
+            'en': 'en',  # English
+            'fr': 'fr',  # French
+            'es': 'es',  # Spanish
+            'de': 'de',  # German
+            'ja': 'ja',  # Japanese
+            'ko': 'ko',  # Korean
+            'zh': 'zh'   # Chinese
+        }
+        self.max_chunk_length = 200  # Maximum characters per TTS request
+    def detect_language(self, text: str) -> str:
+        """Đơn giản phát hiện ngôn ngữ dựa trên ký tự"""
+        vietnamese_chars = set('àáâãèéêìíòóôõùúýăđĩũơưạảấầẩẫậắằẳẵặẹẻẽếềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹ')
+        if any(char in vietnamese_chars for char in text.lower()):
+            return 'vi'
+        # Simple detection for other languages
+        elif any(char in text for char in 'あいうえお'):  # Japanese
+            return 'ja'
+        elif any(char in text for char in '你好'):  # Chinese
+            return 'zh'
+        elif any(char in text for char in '안녕'):  # Korean
+            return 'ko'
+        else:
+            return 'en'  # Default to English
+    def split_text_into_chunks(self, text: str, max_length: int = None) -> List[str]:
+        """Chia văn bản thành các đoạn nhỏ cho TTS"""
+        if max_length is None:
+            max_length = self.max_chunk_length
+        # Split by sentences first
+        sentences = re.split(r'[.!?]+', text)
+        chunks = []
+        current_chunk = ""
+        for sentence in sentences:
+            sentence = sentence.strip()
+            if not sentence:
+                continue
+            # If sentence is too long, split by commas
+            if len(sentence) > max_length:
+                parts = re.split(r'[,;:]', sentence)
+                for part in parts:
+                    part = part.strip()
+                    if not part:
+                        continue
+                    if len(current_chunk) + len(part) + 2 <= max_length:
+                        if current_chunk:
+                            current_chunk += ". " + part
+                        else:
+                            current_chunk = part
+                    else:
+                        if current_chunk:
+                            chunks.append(current_chunk)
+                        current_chunk = part
+            else:
+                if len(current_chunk) + len(sentence) + 2 <= max_length:
+                    if current_chunk:
+                        current_chunk += ". " + sentence
+                    else:
+                        current_chunk = sentence
+                else:
+                    if current_chunk:
+                        chunks.append(current_chunk)
+                    current_chunk = sentence
+        if current_chunk:
+            chunks.append(current_chunk)
+        return chunks
+    def text_to_speech_gtts(self, text: str, language: str = 'vi') -> bytes:
+        """Sử dụng gTTS (Google Text-to-Speech) library"""
+        try:
+            # Split long text into chunks
+            chunks = self.split_text_into_chunks(text)
+            audio_chunks = []
+            for chunk in chunks:
+                if not chunk.strip():
+                    continue
+                tts = gTTS(text=chunk, lang=language, slow=False)
+                audio_buffer = io.BytesIO()
+                tts.write_to_fp(audio_buffer)
+                audio_buffer.seek(0)
+                audio_chunks.append(audio_buffer.read())
+                # Small delay between requests
+                time.sleep(0.1)
+            # Combine all audio chunks
+            if audio_chunks:
+                return b''.join(audio_chunks)
+            return None
+        except Exception as e:
+            print(f"❌ Lỗi gTTS: {e}")
+            return None
+    async def text_to_speech_edgetts(self, text: str, voice: str = 'vi-VN-NamMinhNeural') -> bytes:
+        """Sử dụng Edge-TTS (Microsoft Edge) - async version"""
+        try:
+            communicate = edge_tts.Communicate(text, voice)
+            audio_buffer = io.BytesIO()
+            async for chunk in communicate.stream():
+                if chunk["type"] == "audio":
+                    audio_buffer.write(chunk["data"])
+            audio_buffer.seek(0)
+            return audio_buffer.read()
+        except Exception as e:
+            print(f"❌ Lỗi Edge-TTS: {e}")
+            return None
+    def text_to_speech_edgetts_sync(self, text: str, voice: str = 'vi-VN-NamMinhNeural') -> bytes:
+        """Sync wrapper for Edge-TTS"""
+        try:
+            return asyncio.run(self.text_to_speech_edgetts(text, voice))
+        except Exception as e:
+            print(f"❌ Lỗi Edge-TTS sync: {e}")
+            return None
+    def text_to_speech_fallback(self, text: str, language: str = 'vi') -> bytes:
+        """Fallback TTS using simple method"""
+        try:
+            # Use gTTS as fallback
+            return self.text_to_speech_gtts(text, language)
+        except Exception as e:
+            print(f"❌ Lỗi fallback TTS: {e}")
+            return None
+    def text_to_speech(self, text: str, language: str = None, provider: str = "auto") -> bytes:
+        """Chuyển văn b���n thành giọng nói với nhiều nhà cung cấp"""
+        if not text or len(text.strip()) == 0:
+            return None
+        if language is None:
+            language = self.detect_language(text)
+        # Clean and prepare text
+        text = self.clean_text(text)
+        try:
+            if provider == "auto" or provider == "gtts":
+                print(f"🔊 Đang sử dụng gTTS cho văn bản {len(text)} ký tự...")
+                audio_bytes = self.text_to_speech_gtts(text, language)
+                if audio_bytes:
+                    return audio_bytes
+            if provider == "auto" or provider == "edgetts":
+                print(f"🔊 Đang thử Edge-TTS cho văn bản {len(text)} ký tự...")
+                voice_map = {
+                    'vi': 'vi-VN-NamMinhNeural',
+                    'en': 'en-US-AriaNeural',
+                    'fr': 'fr-FR-DeniseNeural',
+                    'es': 'es-ES-ElviraNeural',
+                    'de': 'de-DE-KatjaNeural',
+                    'ja': 'ja-JP-NanamiNeural',
+                    'ko': 'ko-KR-SunHiNeural',
+                    'zh': 'zh-CN-XiaoxiaoNeural'
+                }
+                voice = voice_map.get(language, 'vi-VN-NamMinhNeural')
+                audio_bytes = self.text_to_speech_edgetts_sync(text, voice)
+                if audio_bytes:
+                    return audio_bytes
+            # Final fallback
+            print(f"🔊 Đang sử dụng fallback TTS...")
+            return self.text_to_speech_fallback(text, language)
+        except Exception as e:
+            print(f"❌ Lỗi TTS tổng hợp: {e}")
+            return None
+    def clean_text(self, text: str) -> str:
+        """Làm sạch văn bản trước khi chuyển thành giọng nói"""
+        # Remove URLs
+        text = re.sub(r'http\S+', '', text)
+        # Remove special characters but keep Vietnamese diacritics
+        text = re.sub(r'[^\w\sàáâãèéêìíòóôõùúýăđĩũơưạảấầẩẫậắằẳẵặẹẻẽếềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹ.,!?;:()-]', '', text)
+        # Replace multiple spaces with single space
+        text = re.sub(r'\s+', ' ', text)
+        # Remove extra whitespace
+        text = text.strip()
+        return text
+    def save_audio_to_file(self, audio_bytes: bytes, filename: str = None) -> str:
+        """Lưu audio bytes thành file tạm thời"""
+        if audio_bytes is None:
+            return None
+        if filename is None:
+            filename = f"tts_output_{int(time.time())}.mp3"
+        temp_dir = "temp_audio"
+        os.makedirs(temp_dir, exist_ok=True)
+        filepath = os.path.join(temp_dir, filename)
+        with open(filepath, 'wb') as f:
+            f.write(audio_bytes)
+        return filepath
+# Module-level singletons shared by all handler functions below.
+# Constructing EnhancedRAGSystem also loads its built-in sample documents.
+rag_system = EnhancedRAGSystem()
+wikipedia_processor = WikipediaProcessor()
+tts_service = EnhancedTTSService()
+# Audio utility functions
+def numpy_to_mp3(audio_array: np.ndarray, sampling_rate: int = 24000) -> bytes:
+    """Convert numpy array to MP3 bytes"""
+    buffer = io.BytesIO()
+    sf.write(buffer, audio_array, sampling_rate, format='mp3')
+    buffer.seek(0)
+    return buffer.read()
+def transcribe_audio(audio):
+    """Chuyển đổi audio thành văn bản và tạo phản hồi với TTS"""
+    if audio is None:
+        return "No audio provided.", "", None
+    sr, y = audio
+    # Convert to mono if stereo
+    if y.ndim > 1:
+        y = y.mean(axis=1)
+    # Normalize audio
+    y = y.astype(np.float32)
+    y /= np.max(np.abs(y))
+    # Write audio to buffer
+    buffer = io.BytesIO()
+    sf.write(buffer, y, sr, format='wav')
+    buffer.seek(0)
+    try:
+        # Use Whisper model for transcription
+        completion = client.audio.transcriptions.create(
+            model="whisper-large-v3-turbo",
+            file=("audio.wav", buffer),
+            response_format="text"
+        )
+        transcription = completion
+    except Exception as e:
+        transcription = f"Error in transcription: {str(e)}"
+    response = generate_response_with_rag(transcription)
+    # Generate TTS audio for response
+    tts_audio = None
+    if response and not response.startswith("Error"):
+        tts_bytes = tts_service.text_to_speech(response, 'vi')
+        if tts_bytes:
+            tts_audio_path = tts_service.save_audio_to_file(tts_bytes)
+            tts_audio = tts_audio_path
+    return transcription, response, tts_audio
+def generate_response_with_rag(user_input):
+    """Tạo phản hồi sử dụng RAG với embedding tiếng Việt"""
+    if not user_input or user_input.startswith("Error"):
+        return "No valid input available. Please try again."
+    try:
+        # Tìm kiếm thông tin liên quan từ RAG với embedding tiếng Việt
+        rag_results = rag_system.semantic_search(user_input, top_k=3)
+        # Tạo context từ RAG results
+        context_text = ""
+        if rag_results:
+            context_text = "\n".join([f"- {doc['text']}" for doc in rag_results])
+        # System prompt với RAG context
+        system_prompt = """Bạn là trợ lý AI thông minh chuyên về tiếng Việt. Hãy sử dụng thông tin từ cơ sở kiến thức được cung cấp để trả lời câu hỏi một cách chính xác và hữu ích bằng tiếng Việt.
+Thông tin tham khảo từ cơ sở kiến thức:
+{context}
+Nếu thông tin từ cơ sở kiến thức không đủ để trả lời, hãy dựa vào kiến thức chung của bạn. Luôn trả lời bằng tiếng Việt tự nhiên và dễ hiểu."""
+        messages = [
+            {"role": "system", "content": system_prompt.format(context=context_text)},
+            {"role": "user", "content": user_input}
+        ]
+        # Use Llama 3.3 70B model for text generation với RAG context
+        completion = client.chat.completions.create(
+            model="llama-3.3-70b-versatile",
+            messages=messages,
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        return f"Error in response generation: {str(e)}"
+def analyze_image_with_description(image, user_description):
+    """Phân tích hình ảnh kết hợp với mô tả từ người dùng"""
+    if image is None:
+        return "No image uploaded."
+    try:
+        if user_description:
+            prompt = f"""Người dùng tải lên một hình ảnh và mô tả: "{user_description}"
+Dựa trên mô tả này, hãy phân tích chi tiết bằng tiếng Việt:
+1. Mô tả những gì có trong hình ảnh
+2. Nếu là thức ăn: ước tính dinh dưỡng (calo, protein, carbs, chất béo, chất xơ)
+3. Nếu là cảnh quan/con người: mô tả chi tiết và ý nghĩa
+4. Đưa ra nhận xét hoặc lời khuyên liên quan"""
+        else:
+            prompt = """Hãy mô tả chi tiết bằng tiếng Việt những gì bạn nghĩ có thể có trong hình ảnh này. Nếu là thức ăn, hãy ước tính giá trị dinh dưỡng. Nếu là cảnh quan hoặc con người, hãy mô tả chi tiết."""
+        chat_completion = client.chat.completions.create(
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt
+                }
+            ],
+            model="llama-3.3-70b-versatile",
+        )
+        description = chat_completion.choices[0].message.content
+    except Exception as e:
+        description = f"Error in image analysis: {str(e)}"
+    return description
+def respond(message, chat_history):
+    """Xử lý chat với TTS output"""
+    if chat_history is None:
+        chat_history = []
+    # Prepare the message history for the API
+    messages = []
+    for user_msg, assistant_msg in chat_history:
+        messages.append({"role": "user", "content": user_msg})
+        messages.append({"role": "assistant", "content": assistant_msg})
+    messages.append({"role": "user", "content": message})
+    try:
+        # Sử dụng RAG để tìm kiếm thông tin liên quan
+        rag_results = rag_system.semantic_search(message, top_k=2)
+        # Thêm context từ RAG vào system prompt
+        context_text = ""
+        if rag_results:
+            context_text = "\nThông tin tham khảo:\n" + "\n".join([f"- {doc['text']}" for doc in rag_results])
+        system_message = {
+            "role": "system",
+            "content": f"Bạn là trợ lý AI hữu ích chuyên về tiếng Việt. Sử dụng thông tin từ cơ sở kiến thức khi có liên quan. Luôn trả lời bằng tiếng Việt tự nhiên.{context_text}"
+        }
+        # Chèn system message vào đầu
+        messages_with_context = [system_message] + messages
+        # Use Llama 3.3 70B model for generating assistant response
+        completion = client.chat.completions.create(
+            model="llama-3.3-70b-versatile",
+            messages=messages_with_context,
+        )
+        assistant_message = completion.choices[0].message.content
+        chat_history.append((message, assistant_message))
+        # Generate TTS audio for the response
+        tts_audio_path = None
+        if assistant_message and not assistant_message.startswith("Error"):
+            tts_bytes = tts_service.text_to_speech(assistant_message, 'vi')
+            if tts_bytes:
+                tts_audio_path = tts_service.save_audio_to_file(tts_bytes)
+    except Exception as e:
+        assistant_message = f"Error: {str(e)}"
+        chat_history.append((message, assistant_message))
+        tts_audio_path = None
+    return "", chat_history, chat_history, tts_audio_path
+def upload_wikipedia_file(file):
+    """Xử lý upload file Wikipedia"""
+    if file is None:
+        return "Vui lòng chọn file để upload"
+    try:
+        documents = wikipedia_processor.process_uploaded_file(file.name)
+        if not documents:
+            return "Không tìm thấy dữ liệu nào trong file."
+        # Thêm metadata
+        metadatas = [{"source": "wikipedia", "type": "knowledge", "file": os.path.basename(file.name), "language": "vi"} for _ in documents]
+        rag_system.add_documents(documents, metadatas)
+        stats = rag_system.get_collection_stats()
+        return f"✅ Đã thêm {len(documents)} documents Wikipedia vào RAG database. Tổng số documents: {stats['count']}, Embeddings: {stats['embedding_count']}"
+    except Exception as e:
+        return f"❌ Lỗi xử lý file Wikipedia: {str(e)}"
+def get_rag_stats():
+    """Lấy thống kê RAG database"""
+    stats = rag_system.get_collection_stats()
+    return f"📊 Thống kê RAG Database:\n- Tổng documents: {stats['count']}\n- Embeddings: {stats['embedding_count']}\n- Trạng thái: {stats['status']}\n- Hỗ trợ embedding: {stats['has_embeddings']}"
+def search_rag_database(query):
+    """Tìm kiếm trong RAG database để debug"""
+    if not query.strip():
+        return []
+    results = rag_system.semantic_search(query, top_k=5)
+    return results
+def clear_chat_history(chat_history):
+    """Xóa lịch sử chat"""
+    return [], []
+# Enhanced Streaming Voice AI Functions with TTS
+def generate_streaming_response(audio_file):
+    """Generate response for streaming voice AI với TTS"""
+    if audio_file is None:
+        return None, "No audio provided", None
+    try:
+        # Transcribe audio using Whisper
+        with open(audio_file, "rb") as f:
+            transcription = client.audio.transcriptions.create(
+                model="whisper-large-v3-turbo",
+                file=f,
+                response_format="text"
+            )
+        # Generate response using RAG với embedding tiếng Việt
+        rag_results = rag_system.semantic_search(transcription, top_k=2)
+        context_text = ""
+        if rag_results:
+            context_text = "\nThông tin tham khảo:\n" + "\n".join([f"- {doc['text']}" for doc in rag_results])
+        system_prompt = f"""Bạn là trợ lý AI thông minh và thân thiện chuyên về tiếng Việt. Hãy trả lời câu hỏi một cách tự nhiên và hữu ích bằng tiếng Việt. Sử dụng thông tin từ cơ sở kiến thức khi có liên quan.{context_text}"""
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": transcription}
+        ]
+        completion = client.chat.completions.create(
+            model="llama-3.3-70b-versatile",
+            messages=messages,
+            max_tokens=150
+        )
+        response = completion.choices[0].message.content
+        # Generate TTS audio
+        tts_audio_bytes = tts_service.text_to_speech(response, 'vi')
+        if tts_audio_bytes:
+            # Save to temporary file for audio output
+            temp_audio_file = tts_service.save_audio_to_file(tts_audio_bytes)
+            return response, response, temp_audio_file
+        return response, response, None
+    except Exception as e:
+        error_msg = f"Error in streaming response: {str(e)}"
+        return None, error_msg, None
+def read_streaming_response(answer):
+    """Read response aloud using TTS"""
+    if not answer:
+        return answer, None
+    try:
+        tts_audio_bytes = tts_service.text_to_speech(answer, 'vi')
+        if tts_audio_bytes:
+            temp_audio_file = tts_service.save_audio_to_file(tts_audio_bytes)
+            return answer, temp_audio_file
+    except Exception as e:
+        print(f"❌ Lỗi TTS: {e}")
+    return answer, None
+# Enhanced Magic 8 Ball Functions with Vietnamese responses
+def generate_magic_8_ball_response(audio_file):
+    """Generate Magic 8 Ball response for audio input với tiếng Việt"""
+    if audio_file is None:
+        return None, "No audio provided", None
+    try:
+        # Transcribe audio using Whisper
+        with open(audio_file, "rb") as f:
+            transcription = client.audio.transcriptions.create(
+                model="whisper-large-v3-turbo",
+                file=f,
+                response_format="text"
+            )
+        # Magic 8 Ball system prompt in Vietnamese
+        messages = [
+            {
+                "role": "system",
+                "content": (
+                )
+            },
+            {
+                "role": "user",
+                "content": f"Quả cầu pha lê xin hãy trả lời câu hỏi này - {transcription}"
+            }
+        ]
+        completion = client.chat.completions.create(
+            model="llama-3.3-70b-versatile",
+            messages=messages,
+            max_tokens=64,
+            temperature=0.8
+        )
+        response = completion.choices[0].message.content
+        # Clean up response
+        response = response.replace("Magic 8 Ball", "").replace("Quả cầu pha lê", "").replace(":", "").strip()
+        # Generate TTS audio
+        tts_audio_bytes = tts_service.text_to_speech(response, 'vi')
+        if tts_audio_bytes:
+            temp_audio_file = tts_service.save_audio_to_file(tts_audio_bytes)
+            return response, response, temp_audio_file
+        return response, response, None
+    except Exception as e:
+        error_msg = f"Error in Magic 8 Ball response: {str(e)}"
+        return None, error_msg, None
+def read_magic_8_ball_response(answer):
+    """Read Magic 8 Ball response aloud using TTS"""
+    if not answer:
+        return answer, None
+    try:
+        tts_audio_bytes = tts_service.text_to_speech(answer, 'vi')
+        if tts_audio_bytes:
+            temp_audio_file = tts_service.save_audio_to_file(tts_audio_bytes)
+            return answer, temp_audio_file
+    except Exception as e:
+        print(f"❌ Lỗi TTS: {e}")
+    return answer, None
+# Text-to-Speech standalone function
+def text_to_speech_standalone(text, language, tts_provider):
+    """Chức năng TTS độc lập"""
+    if not text:
+        return None
+    try:
+        tts_audio_bytes = tts_service.text_to_speech(text, language, tts_provider)
+        if tts_audio_bytes:
+            temp_audio_file = tts_service.save_audio_to_file(tts_audio_bytes)
+            return temp_audio_file
+    except Exception as e:
+        print(f"❌ Lỗi TTS: {e}")
+    return None
+# Custom CSS (kept unchanged)
+custom_css = """
+.gradio-container {
+    background-color: #1f1f1f;
+}
+.gr-markdown, .gr-markdown * {
+    color: #ffffff !important;
+}
+.gr-textbox, .gr-textbox * {
+    color: #ffffff !important;
+}
+.gr-label, .gr-label * {
+    color: #ffffff !important;
+}
+.gr-chatbot, .gr-chatbot * {
+    color: #ffffff !important;
+}
+.gr-json, .gr-json * {
+    color: #ffffff !important;
+}
+.gr-box, .gr-block, .panel, .tab-item {
+    background-color: #2d2d2d !important;
+    border-color: #444444 !important;
+}
+input, textarea, select {
+    color: #ffffff !important;
+    background-color: #2d2d2d !important;
+    border-color: #444444 !important;
+}
+::placeholder {
+    color: #aaaaaa !important;
+}
+.message {
+    color: #ffffff !important;
+}
+.user-message, .bot-message {
+    color: #ffffff !important;
+}
+h1, h2, h3, h4, h5, h6 {
+    color: #ffffff !important;
+}
+.tab-nav {
+    color: #ffffff !important;
+}
+.tab-item {
+    color: #ffffff !important;
+}
+.gr-button {
+    color: #ffffff !important;
+    background-color: #f55036 !important;
+    border-color: #f55036 !important;
+}
+.gr-button-secondary {
+    color: #ffffff !important;
+    background-color: #666666 !important;
+    border-color: #666666 !important;
+}
+.gr-file, .gr-file * {
+    color: #ffffff !important;
+}
+.gr-json {
+    background-color: #2d2d2d !important;
+}
+.gr-audio, .gr-audio * {
+    color: #ffffff !important;
+}
+.gr-image, .gr-image * {
+    color: #ffffff !important;
+}
+.form, .form * {
+    color: #ffffff !important;
+}
+.block, .block * {
+    color: #ffffff !important;
+}
+div[data-testid="block"] {
+    background-color: #2d2d2d !important;
+    color: #ffffff !important;
+}
+.gr-component, .gr-component * {
+    color: #ffffff !important;
+}
+#groq-badge {
+    position: fixed;
+    bottom: 20px;
+    right: 20px;
+    z-index: 1000;
+    color: #ffffff !important;
+}
+.streaming-voice-container {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    padding: 20px;
+    border-radius: 10px;
+    margin: 10px 0;
+}
+.magic-8-ball-container {
+    background: linear-gradient(135deg, #1a2a6c 0%, #b21f1f 50%, #fdbb2d 100%);
+    padding: 20px;
+    border-radius: 15px;
+    margin: 10px 0;
+    text-align: center;
+    border: 3px solid #ffffff;
+}
+.magic-8-ball-title {
+    font-size: 2.5em !important;
+    font-weight: bold !important;
+    text-shadow: 2px 2px 4px rgba(0,0,0,0.5);
+    margin-bottom: 10px !important;
+}
+.magic-8-ball-subtitle {
+    font-size: 1.2em !important;
+    opacity: 0.9;
+    margin-bottom: 20px !important;
+}
+.tts-container {
+    background: linear-gradient(135deg, #00b09b 0%, #96c93d 100%);
+    padding: 20px;
+    border-radius: 10px;
+    margin: 10px 0;
+}
+"""
+with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="orange", neutral_hue="slate")) as demo:  # Top-level app: every tab below is declared inside this context and the result is bound to `demo`
+    gr.Markdown("# 🎙️ Groq x Gradio Multi-Modal với RAG Wikipedia & TTS Tiếng Việt")
+    gr.Markdown("**Ứng dụng đa chức năng với Llama 3.3, Whisper, RAG embedding tiếng Việt**")
+    with gr.Tab("🎙️ Audio"):  # Tab: one-shot audio -> transcript text, AI reply text and spoken reply
+        gr.Markdown("## Nói chuyện với AI (có TTS)")
+        with gr.Row():
+            audio_input = gr.Audio(type="numpy", label="Nói hoặc tải lên file âm thanh")  # type="numpy": the handler gets decoded audio data rather than a file path
+        with gr.Row():
+            transcription_output = gr.Textbox(
+                label="Bản ghi âm",
+                lines=5,
+                interactive=True,
+                placeholder="Bản ghi âm sẽ hiển thị ở đây..."
+            )
+            response_output = gr.Textbox(
+                label="Phản hồi AI",
+                lines=5,
+                interactive=True,
+                placeholder="Phản hồi của AI sẽ hiển thị ở đây..."
+            )
+        with gr.Row():
+            tts_audio_output = gr.Audio(
+                label="Phản hồi bằng giọng nói",
+                interactive=False
+            )
+        process_button = gr.Button("Xử lý", variant="primary")
+        process_button.click(  # transcribe_audio is defined outside this section; it must return one value per output listed below
+            transcribe_audio,
+            inputs=audio_input,
+            outputs=[transcription_output, response_output, tts_audio_output]
+        )
+    with gr.Tab("🔊 Streaming Voice"):  # Tab: microphone question -> text answer plus auto-played spoken answer
+        gr.Markdown("## 🎤 Trò chuyện giọng nói thời gian thực với TTS")
+        gr.Markdown("Nói chuyện tự nhiên với AI - Câu hỏi của bạn sẽ được chuyển thành văn bản và AI sẽ trả lời bằng giọng nói tiếng Việt")
+        with gr.Group():
+            with gr.Row():
+                audio_out = gr.Audio(
+                    label="Câu trả lời bằng giọng nói",
+                    autoplay=True,
+                    format="mp3"
+                )
+                answer_text = gr.Textbox(
+                    label="Câu trả lời văn bản",
+                    lines=5,
+                    placeholder="Câu trả lời văn bản sẽ hiển thị ở đây..."
+                )
+                streaming_state = gr.State()  # Non-visual value handed from the first handler to the chained .then() step
+            with gr.Row():
+                audio_in = gr.Audio(
+                    label="Nói câu hỏi của bạn",
+                    sources="microphone",
+                    type="filepath",
+                    format="wav"
+                )
+        audio_in.stop_recording(  # Triggered when the microphone recording is stopped
+            generate_streaming_response,
+            inputs=[audio_in],
+            outputs=[streaming_state, answer_text, audio_out]
+        ).then(  # Chained step: runs after generate_streaming_response and takes streaming_state as its input
+            fn=read_streaming_response,
+            inputs=[streaming_state],
+            outputs=[answer_text, audio_out]
+        )
+    with gr.Tab("🎱 Magic 8 Ball"):  # Tab: same two-stage microphone flow as Streaming Voice, wired to the Magic 8 Ball handlers
+        gr.HTML(  # Static banner styled by the .magic-8-ball-* rules in custom_css
+            """
+            <div class="magic-8-ball-container">
+                <h1 class="magic-8-ball-title">Magic 8 Ball 🎱</h1>
+                <h3 class="magic-8-ball-subtitle">Hỏi một câu hỏi và nhận trí tuệ từ quả cầu thần kỳ bằng tiếng Việt</h3>
+                <p class="magic-8-ball-subtitle">Powered by Groq & Whisper & TTS</p>
+            </div>
+            """
+        )
+        with gr.Group():
+            with gr.Row():
+                magic_audio_out = gr.Audio(
+                    label="Câu trả lời bằng giọng nói",
+                    autoplay=True,
+                    format="mp3"
+                )
+                magic_answer = gr.Textbox(
+                    label="Câu trả lời",
+                    lines=3,
+                    placeholder="Câu trả lời thần kỳ sẽ hiển thị ở đây..."
+                )
+                magic_state = gr.State()  # Non-visual value handed from the first handler to the chained .then() step
+            with gr.Row():
+                magic_audio_in = gr.Audio(
+                    label="Nói câu hỏi của bạn",
+                    sources="microphone",
+                    type="filepath",
+                    format="wav"
+                )
+        magic_audio_in.stop_recording(  # Triggered when the microphone recording is stopped
+            generate_magic_8_ball_response,
+            inputs=[magic_audio_in],
+            outputs=[magic_state, magic_answer, magic_audio_out]
+        ).then(  # Chained step: runs after generate_magic_8_ball_response and takes magic_state as its input
+            fn=read_magic_8_ball_response,
+            inputs=[magic_state],
+            outputs=[magic_answer, magic_audio_out]
+        )
+    with gr.Tab("🔊 Text-to-Speech"):  # Tab: standalone text -> speech with selectable language and provider
+        gr.Markdown("## 🎵 Chuyển văn bản thành giọng nói nâng cao")
+        gr.Markdown("Nhập văn bản và chọn ngôn ngữ để chuyển thành giọng nói")
+        with gr.Group():
+            with gr.Row():
+                tts_text_input = gr.Textbox(
+                    label="Văn bản cần chuyển thành giọng nói",
+                    lines=4,
+                    placeholder="Nhập văn bản tại đây..."
+                )
+            with gr.Row():
+                tts_language = gr.Dropdown(
+                    choices=["vi", "en", "fr", "es", "de", "ja", "ko", "zh"],
+                    value="vi",
+                    label="Ngôn ngữ"
+                )
+                tts_provider = gr.Dropdown(
+                    choices=["auto", "gtts", "edgetts"],
+                    value="auto",
+                    label="Nhà cung cấp TTS"
+                )
+            with gr.Row():
+                tts_output_audio = gr.Audio(
+                    label="Kết quả giọng nói",
+                    interactive=False
+                )
+            tts_button = gr.Button("🔊 Chuyển thành giọng nói", variant="primary")
+        tts_button.click(  # Passes the text, language code and provider choice to text_to_speech_standalone
+            text_to_speech_standalone,
+            inputs=[tts_text_input, tts_language, tts_provider],
+            outputs=[tts_output_audio]
+        )
+    with gr.Tab("🖼️ Image"):  # Tab: image plus optional user description -> textual analysis
+        gr.Markdown("## Phân tích hình ảnh")
+        with gr.Row():
+            image_input = gr.Image(type="numpy", label="Tải lên hình ảnh")
+        with gr.Row():
+            image_description = gr.Textbox(
+                label="Mô tả hình ảnh của bạn (tùy chọn)",
+                placeholder="Mô tả ngắn về hình ảnh để AI phân tích chính xác hơn..."
+            )
+        with gr.Row():
+            image_output = gr.Textbox(label="Kết quả phân tích")
+        analyze_button = gr.Button("Phân tích hình ảnh", variant="primary")
+        analyze_button.click(  # analyze_image_with_description is defined outside this section
+            analyze_image_with_description,
+            inputs=[image_input, image_description],
+            outputs=[image_output]
+        )
+    with gr.Tab("💬 Chat"):  # Tab: text chat with a spoken version of the reply
+        gr.Markdown("## Trò chuyện với AI Assistant (có TTS)")
+        chatbot = gr.Chatbot()
+        state = gr.State([])  # Starts as an empty list (presumably the chat history; confirm in respond)
+        with gr.Row():
+            user_input = gr.Textbox(
+                show_label=False,
+                placeholder="Nhập tin nhắn của bạn ở đây...",
+                container=False,
+                scale=4
+            )
+            send_button = gr.Button("Gửi", variant="primary", scale=1)
+            clear_button = gr.Button("Xóa Chat", variant="secondary", scale=1)
+        with gr.Row():
+            chat_tts_output = gr.Audio(
+                label="Phản hồi bằng giọng nói",
+                interactive=False
+            )
+        send_button.click(  # Only the button triggers respond; no submit handler is attached to the textbox in this block
+            respond,
+            inputs=[user_input, state],
+            outputs=[user_input, chatbot, state, chat_tts_output],  # user_input is also an output, so respond can rewrite the textbox (presumably to clear it; confirm)
+        )
+        clear_button.click(  # clear_chat_history returns new values for both the chatbot display and the state
+            clear_chat_history,
+            inputs=[state],
+            outputs=[chatbot, state]
+        )
+    with gr.Tab("📚 RAG Wikipedia"):  # Tab: upload documents, show database statistics and run searches against the RAG store
+        gr.Markdown("## Quản lý kiến thức với Wikipedia và Embedding Tiếng Việt")
+        with gr.Row():
+            with gr.Column(scale=1):  # Left column: upload, stats and search controls
+                gr.Markdown("### 📤 Upload dữ liệu Wikipedia")
+                file_upload = gr.File(
+                    label="Tải lên file Wikipedia",
+                    file_types=['.txt', '.csv', '.json'],
+                    file_count="single"
+                )
+                upload_btn = gr.Button("📤 Upload Data", variant="primary")
+                upload_status = gr.Textbox(label="Trạng thái Upload", interactive=False)
+                gr.Markdown("### 📊 Thống kê Database")
+                stats_btn = gr.Button("📊 Database Stats", variant="secondary")
+                stats_display = gr.Textbox(label="Thống kê", interactive=False)
+                gr.Markdown("### 🔍 Tìm kiếm Database")
+                search_query = gr.Textbox(
+                    label="Tìm kiếm trong database",
+                    placeholder="Nhập từ khóa để tìm kiếm..."
+                )
+                search_btn = gr.Button("🔍 Tìm kiếm", variant="secondary")
+            with gr.Column(scale=2):  # Right column (twice as wide): search results shown as JSON
+                gr.Markdown("### 📋 Kết quả tìm kiếm RAG")
+                rag_results = gr.JSON(
+                    label="Tài liệu tham khảo tìm được"
+                )
+        upload_btn.click(  # upload_wikipedia_file is defined outside this section; its result is shown in upload_status
+            upload_wikipedia_file,
+            inputs=[file_upload],
+            outputs=[upload_status]
+        )
+        stats_btn.click(  # Takes no inputs; the handler's return value is shown in stats_display
+            get_rag_stats,
+            inputs=[],
+            outputs=[stats_display]
+        )
+        search_btn.click(  # Sends the query text to search_rag_database and renders the result in the JSON viewer
+            search_rag_database,
+            inputs=[search_query],
+            outputs=[rag_results]
+        )
+    gr.HTML("""
+    <div id="groq-badge">
+        <div style="color: #ffffff !important; font-weight: bold; background-color: #f55036; padding: 8px 12px; border-radius: 5px;">
+            POWERED BY DAT | VIETNAMESE EMBEDDING & ENHANCED TTS
+        </div>
+    </div>
+    """)  # Badge pinned to the bottom-right corner by the #groq-badge rule in custom_css
+if __name__ == "__main__":
+    demo.launch(share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+gradio
+numpy
+soundfile
+Pillow
+pandas
+sentence-transformers
+faiss-cpu
+edge-tts
+gtts
+groq