Update app.py
app.py CHANGED
@@ -493,34 +493,86 @@
 # # share=True gives you a public link automatically
 # demo.launch(share=True)

-
 import os
 import gradio as gr
-from llama_cpp import Llama

-#
-
-FILENAME = os.getenv("MODEL_FILE", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
-
+# ---- llama.cpp backend (fast CPU) ----
+from llama_cpp import Llama
+
+# ---- to list files in a repo and pick a GGUF automatically ----
+from huggingface_hub import list_repo_files
+
+# ----------------- Config -----------------
+# You can override these via Space "Settings → Variables"
+# If MODEL_REPO is set, it's tried first; otherwise we try the CANDIDATE_REPOS below.
+MODEL_REPO = os.getenv("MODEL_REPO", "").strip() or None
+
+# Known small GGUF chat repos (fast & lightweight). We'll try them in order.
+CANDIDATE_REPOS = [
+    MODEL_REPO,  # user-preferred first (may be None)
+    "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
+    "Qwen/Qwen2-0.5B-Instruct-GGUF",
+    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+    "bartowski/Qwen2.5-0.5B-Instruct-GGUF",
+]
+
+# Best-to-worst file name patterns to prefer when multiple GGUFs are present.
+PREFERRED_PATTERNS = [
+    "q4_k_m.gguf", "Q4_K_M.gguf",
+    "q4_0.gguf", "Q4_0.gguf",
+    "q5_k_m.gguf", "Q5_K_M.gguf",
+    ".gguf",  # catch-all
+]
+
+# Runtime knobs
 N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
 CTX = int(os.getenv("CTX", "2048"))

-
+SYSTEM_DEFAULT = (
+    "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
+    "and politely decline other questions."
+)
+
+# --------------- GGUF Picker ---------------
+def pick_repo_and_file():
+    """Return (repo_id, gguf_filename) by scanning candidate repos for a preferred GGUF."""
+    tried = []
+    for repo in [r for r in CANDIDATE_REPOS if r]:  # drop None
+        try:
+            files = list_repo_files(repo)
+        except Exception:
+            tried.append(f"{repo} (list failed)")
+            continue
+        ggufs = [f for f in files if f.lower().endswith(".gguf")]
+        if not ggufs:
+            tried.append(f"{repo} (no .gguf)")
+            continue
+        # pick by pattern preference
+        for pat in PREFERRED_PATTERNS:
+            for f in ggufs:
+                if pat in f:
+                    return repo, f
+    tried_str = " | ".join(tried) if tried else "(none)"
+    raise RuntimeError(
+        "No GGUF file found in any candidate repo.\n"
+        f"Tried: {tried_str}\n"
+        "Tip: set MODEL_REPO to a GGUF repo like 'Qwen/Qwen2.5-0.5B-Instruct-GGUF' "
+        "or 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF'."
+    )
+
+REPO_ID, FILENAME = pick_repo_and_file()
+print(f"🔧 Loading GGUF from {REPO_ID}/{FILENAME} | threads={N_THREADS}, ctx={CTX}")
+
 llm = Llama.from_pretrained(
     repo_id=REPO_ID,
     filename=FILENAME,
     n_ctx=CTX,
     n_threads=N_THREADS,
-    n_gpu_layers=0,
+    n_gpu_layers=0,  # CPU only
     logits_all=False,
     verbose=False,
 )

-SYSTEM_DEFAULT = (
-    "You are a Chatbot who only answers spiritual questions based on Indian scriptures e.g. Bhagwadgita"
-    "and politely decline other questions."
-)
-
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     sysmsg = system_message or SYSTEM_DEFAULT
     msgs = [{"role": "system", "content": sysmsg}]