rajeshlion committed · verified
Commit b91179c · 1 Parent(s): 4b180dd

Update app.py

Files changed (1)
  1. app.py +64 -12
app.py CHANGED
@@ -493,34 +493,86 @@
 # # share=True gives you a public link automatically
 # demo.launch(share=True)
 
-
 import os
 import gradio as gr
-from llama_cpp import Llama
 
+# ---- llama.cpp backend (fast CPU) ----
+from llama_cpp import Llama
-# Small, fast, chat-tuned GGUF (≈0.5B params, 4-bit quant)
-REPO_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
-FILENAME = os.getenv("MODEL_FILE", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
 
+# ---- to list files in a repo and pick a GGUF automatically ----
+from huggingface_hub import list_repo_files
+
+# ----------------- Config -----------------
+# You can override these via Space "Settings → Variables".
+# If MODEL_REPO is set, it's tried first; otherwise we try the CANDIDATE_REPOS below.
+MODEL_REPO = os.getenv("MODEL_REPO", "").strip() or None
+
+# Known small GGUF chat repos (fast & lightweight). We'll try them in order.
+CANDIDATE_REPOS = [
+    MODEL_REPO,  # user-preferred first (may be None)
+    "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
+    "Qwen/Qwen2-0.5B-Instruct-GGUF",
+    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+    "bartowski/Qwen2.5-0.5B-Instruct-GGUF",
+]
+
+# Best-to-worst file-name patterns to prefer when multiple GGUFs are present.
+PREFERRED_PATTERNS = [
+    "q4_k_m.gguf", "Q4_K_M.gguf",
+    "q4_0.gguf", "Q4_0.gguf",
+    "q5_k_m.gguf", "Q5_K_M.gguf",
+    ".gguf",  # catch-all
+]
+
+# Runtime knobs
 N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
 CTX = int(os.getenv("CTX", "2048"))
 
-print(f"🔧 Loading {REPO_ID}/{FILENAME} with {N_THREADS} threads, ctx={CTX}")
+SYSTEM_DEFAULT = (
+    "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
+    "and politely declines other questions."
+)
+
+# --------------- GGUF Picker ---------------
+def pick_repo_and_file():
+    """Return (repo_id, gguf_filename) by scanning candidate repos for a preferred GGUF."""
+    tried = []
+    for repo in [r for r in CANDIDATE_REPOS if r]:  # drop None
+        try:
+            files = list_repo_files(repo)
+        except Exception:
+            tried.append(f"{repo} (list failed)")
+            continue
+        ggufs = [f for f in files if f.lower().endswith(".gguf")]
+        if not ggufs:
+            tried.append(f"{repo} (no .gguf)")
+            continue
+        # pick by pattern preference
+        for pat in PREFERRED_PATTERNS:
+            for f in ggufs:
+                if pat in f:
+                    return repo, f
+    tried_str = " | ".join(tried) if tried else "(none)"
+    raise RuntimeError(
+        "No GGUF file found in any candidate repo.\n"
+        f"Tried: {tried_str}\n"
+        "Tip: set MODEL_REPO to a GGUF repo like 'Qwen/Qwen2.5-0.5B-Instruct-GGUF' "
+        "or 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF'."
+    )
+
+REPO_ID, FILENAME = pick_repo_and_file()
+print(f"🔧 Loading GGUF from {REPO_ID}/{FILENAME} | threads={N_THREADS}, ctx={CTX}")
+
 llm = Llama.from_pretrained(
     repo_id=REPO_ID,
     filename=FILENAME,
     n_ctx=CTX,
     n_threads=N_THREADS,
-    n_gpu_layers=0,  # CPU only
+    n_gpu_layers=0,  # CPU only
     logits_all=False,
     verbose=False,
 )
 
-SYSTEM_DEFAULT = (
-    "You are a Chatbot who only answers spiritual questions based on Indian scriptures e.g. Bhagwadgita"
-    "and politely decline other questions."
-)
-
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     sysmsg = system_message or SYSTEM_DEFAULT
     msgs = [{"role": "system", "content": sysmsg}]
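
The pick_repo_and_file helper added in this commit depends only on huggingface_hub's list_repo_files, so its selection logic can be sanity-checked outside the Space. A minimal sketch, assuming network access to the Hub; the repo queried is just the first hard-coded candidate, and the printed list depends on what that repo currently hosts:

from huggingface_hub import list_repo_files

# Mirror the picker's filtering step against one candidate repo.
files = list_repo_files("Qwen/Qwen2.5-0.5B-Instruct-GGUF")
ggufs = [f for f in files if f.lower().endswith(".gguf")]

# The first file containing "q4_k_m.gguf" would win, per PREFERRED_PATTERNS;
# the ".gguf" catch-all guarantees a match whenever any GGUF exists.
print(ggufs)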
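
The hunk ends right where respond seeds msgs with the system prompt, so the rest of the handler is outside this diff. For context, a plausible continuation, assuming llama-cpp-python's standard create_chat_completion(..., stream=True) chat API and Gradio's (user, bot) history tuples; this is a hypothetical sketch, not the file's actual code:

def respond(message, history, system_message, max_tokens, temperature, top_p):
    sysmsg = system_message or SYSTEM_DEFAULT
    msgs = [{"role": "system", "content": sysmsg}]
    # Replay prior turns; gr.ChatInterface passes history as (user, bot) pairs.
    for user, bot in history or []:
        msgs.append({"role": "user", "content": user})
        if bot:
            msgs.append({"role": "assistant", "content": bot})
    msgs.append({"role": "user", "content": message})

    # Stream partial text back to the UI as tokens arrive (OpenAI-style chunks).
    out = ""
    for chunk in llm.create_chat_completion(
        messages=msgs,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    ):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            out += delta["content"]
            yield out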