from llama_cpp import Llama
import gradio as gr

# Load the model from the Hugging Face repo (downloads on startup, uses runtime storage)
llm = Llama.from_pretrained(
    repo_id="QuantFactory/DarkIdol-Llama-3.1-8B-Instruct-1.2-Uncensored-GGUF",
    filename="DarkIdol-Llama-3.1-8B-Instruct-1.2-Uncensored.Q6_K.gguf",  # Q6_K for good quality; change if needed
    n_ctx=2048,       # Context window sized for longer conversations
    n_gpu_layers=-1,  # Offload all layers to GPU if you upgrade hardware
)

def chat(user_input, history):
    # Rebuild the running conversation so the model actually keeps context across
    # turns (otherwise the 2048-token context window goes unused); history arrives
    # as OpenAI-style role/content dicts because of type="messages" below.
    messages = [{"role": m["role"], "content": m["content"]} for m in history]
    messages.append({"role": "user", "content": user_input})
    output = llm.create_chat_completion(messages=messages, max_tokens=256, temperature=0.8)
    return output["choices"][0]["message"]["content"]

demo = gr.ChatInterface(fn=chat, type="messages", title="Q6 Uncensored Voice Agent")
demo.launch()
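
# Setup note (an assumption about the environment, not part of the original script):
# Llama.from_pretrained fetches the GGUF from the Hugging Face Hub, which requires
# the huggingface_hub package alongside llama-cpp-python and gradio. A minimal
# install, assuming a CPU-only runtime, would be:
#
#   pip install llama-cpp-python gradio huggingface_hub
#
# With a CPU-only build of llama.cpp, n_gpu_layers=-1 should be harmless: all
# layers simply stay on the CPU until a GPU backend is available.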