from llama_cpp import Llama
import gradio as gr

# Load the model from the Hugging Face repo (downloads on startup, uses runtime storage)
llm = Llama.from_pretrained(
    repo_id="QuantFactory/DarkIdol-Llama-3.1-8B-Instruct-1.2-Uncensored-GGUF",
    filename="DarkIdol-Llama-3.1-8B-Instruct-1.2-Uncensored.Q6_K.gguf",  # Q6_K for good quality; change if needed
    n_ctx=2048,       # Context window sized for longer conversations
    n_gpu_layers=-1,  # Offload all layers to GPU if you upgrade hardware
)

def chat(user_input, history):
    # Rebuild the running conversation so the model actually keeps context across
    # turns (otherwise the 2048-token context window goes unused); history arrives
    # as OpenAI-style role/content dicts because of type="messages" below.
    messages = [{"role": m["role"], "content": m["content"]} for m in history]
    messages.append({"role": "user", "content": user_input})
    output = llm.create_chat_completion(messages=messages, max_tokens=256, temperature=0.8)
    return output["choices"][0]["message"]["content"]

demo = gr.ChatInterface(fn=chat, type="messages", title="Q6 Uncensored Voice Agent")
demo.launch()
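
# Setup note (an assumption about the environment, not part of the original script):
# Llama.from_pretrained fetches the GGUF from the Hugging Face Hub, which requires
# the huggingface_hub package alongside llama-cpp-python and gradio. A minimal
# install, assuming a CPU-only runtime, would be:
#
#   pip install llama-cpp-python gradio huggingface_hub
#
# With a CPU-only build of llama.cpp, n_gpu_layers=-1 should be harmless: all
# layers simply stay on the CPU until a GPU backend is available.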