Update app.py
app.py CHANGED
@@ -493,34 +493,86 @@
 # # share=True gives you a public link automatically
 # demo.launch(share=True)

-
 import os
 import gradio as gr
-from llama_cpp import Llama

-#
-
-FILENAME = os.getenv("MODEL_FILE", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
-
+# ---- llama.cpp backend (fast CPU) ----
+from llama_cpp import Llama
+
+# ---- to list files in a repo and pick a GGUF automatically ----
+from huggingface_hub import list_repo_files
+
+# ----------------- Config -----------------
+# You can override these via Space "Settings → Variables"
+# If MODEL_REPO is set, it's tried first; otherwise we try the CANDIDATE_REPOS below.
+MODEL_REPO = os.getenv("MODEL_REPO", "").strip() or None
+
+# Known small GGUF chat repos (fast & lightweight). We'll try them in order.
+CANDIDATE_REPOS = [
+    MODEL_REPO,  # user-preferred first (may be None)
+    "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
+    "Qwen/Qwen2-0.5B-Instruct-GGUF",
+    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+    "bartowski/Qwen2.5-0.5B-Instruct-GGUF",
+]
+
+# Best-to-worst file name patterns to prefer when multiple GGUFs are present.
+PREFERRED_PATTERNS = [
+    "q4_k_m.gguf", "Q4_K_M.gguf",
+    "q4_0.gguf", "Q4_0.gguf",
+    "q5_k_m.gguf", "Q5_K_M.gguf",
+    ".gguf",  # catch-all
+]
+
+# Runtime knobs
 N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
 CTX = int(os.getenv("CTX", "2048"))

-
+SYSTEM_DEFAULT = (
+    "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
+    "and politely decline other questions."
+)
+
+# --------------- GGUF Picker ---------------
+def pick_repo_and_file():
+    """Return (repo_id, gguf_filename) by scanning candidate repos for a preferred GGUF."""
+    tried = []
+    for repo in [r for r in CANDIDATE_REPOS if r]:  # drop None
+        try:
+            files = list_repo_files(repo)
+        except Exception:
+            tried.append(f"{repo} (list failed)")
+            continue
+        ggufs = [f for f in files if f.lower().endswith(".gguf")]
+        if not ggufs:
+            tried.append(f"{repo} (no .gguf)")
+            continue
+        # pick by pattern preference
+        for pat in PREFERRED_PATTERNS:
+            for f in ggufs:
+                if pat in f:
+                    return repo, f
+    tried_str = " | ".join(tried) if tried else "(none)"
+    raise RuntimeError(
+        "No GGUF file found in any candidate repo.\n"
+        f"Tried: {tried_str}\n"
+        "Tip: set MODEL_REPO to a GGUF repo like 'Qwen/Qwen2.5-0.5B-Instruct-GGUF' "
+        "or 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF'."
+    )
+
+REPO_ID, FILENAME = pick_repo_and_file()
+print(f"🔧 Loading GGUF from {REPO_ID}/{FILENAME} | threads={N_THREADS}, ctx={CTX}")
+
 llm = Llama.from_pretrained(
     repo_id=REPO_ID,
     filename=FILENAME,
     n_ctx=CTX,
     n_threads=N_THREADS,
-    n_gpu_layers=0,
+    n_gpu_layers=0,  # CPU only
     logits_all=False,
     verbose=False,
 )

-SYSTEM_DEFAULT = (
-    "You are a Chatbot who only answers spiritual questions based on Indian scriptures e.g. Bhagwadgita"
-    "and politely decline other questions."
-)
-
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     sysmsg = system_message or SYSTEM_DEFAULT
     msgs = [{"role": "system", "content": sysmsg}]