import gc
import os
import sys
import time

import gradio as gr
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

# --- System settings ---
SYSTEM_TITLE = "花蓮慈濟醫院公文輔助判決系統"
FILE_PATH = 'data.csv'
INDEX_FILE = 'corpus_embeddings.pt'

# Login credentials, format: ("username", "password").
# NOTE(review): a real password is committed in source control here. Consider
# loading it from an environment variable or a secrets store and rotating it.
LOGIN_DATA = ("admin", "htch15583")


def _read_csv_with_fallback(path, encodings=('cp950', 'big5', 'utf-8-sig')):
    """Read *path* as CSV, trying each encoding in order.

    CP950 is tried first. A ``UnicodeDecodeError`` moves on to the next
    encoding; any other failure is reported and ends the attempt. An empty
    DataFrame is returned when nothing could be read, so the caller can
    degrade gracefully instead of crashing.
    """
    for encoding in encodings:
        try:
            return pd.read_csv(path, encoding=encoding)
        except UnicodeDecodeError:
            continue
        except Exception as e:
            print(f"❌ 讀取 {path} 失敗: {e}")
            return pd.DataFrame()
    print(f"❌ 無法以 {', '.join(encodings)} 解碼 {path}")
    return pd.DataFrame()


def _pick_column(columns, target, keywords, exclude=()):
    """Return the column to use as *target*, or ``None`` if none qualifies.

    An exact match on *target* wins; otherwise the first column containing
    any of *keywords* is used. Picking a single column avoids the duplicate
    column names that renaming every match would produce.
    """
    if target in columns and target not in exclude:
        return target
    for col in columns:
        if col not in exclude and any(key in col for key in keywords):
            return col
    return None


# --- 1. Load data ---
print("🚀 正在啟動系統...")

if not os.path.exists(FILE_PATH):
    print(f"❌ 錯誤:找不到 {FILE_PATH}")
    sys.exit(1)

df = _read_csv_with_fallback(FILE_PATH)

# --- 2. Clean data ---
corpus = []
total_records = 0

if df.empty:
    print("❌ 資料表是空的!")
else:
    # Strip surrounding whitespace and any BOM from the header names.
    df.columns = [str(c).strip().replace('\ufeff', '') for c in df.columns]

    # Map the source headers onto the two canonical column names.
    subject_col = _pick_column(df.columns, '主旨', ('主旨', '內容'))
    dept_col = _pick_column(
        df.columns, '收文窗口', ('窗口', '單位'), exclude=(subject_col,)
    )

    if subject_col is None or dept_col is None:
        # Without both columns there is nothing to search; report it and
        # continue with an empty table rather than raising KeyError.
        print(f"❌ 錯誤:找不到必要欄位(主旨 / 收文窗口),現有欄位: {list(df.columns)}")
        df = pd.DataFrame()
    else:
        df = df.rename(columns={subject_col: '主旨', dept_col: '收文窗口'})
        # Drop missing values BEFORE casting to str: astype(str) turns NaN
        # into the literal 'nan', after which dropna() finds nothing to drop.
        # NOTE: if an index cache built from a different row set exists,
        # delete INDEX_FILE so that it is rebuilt against these rows.
        df = (
            df.dropna(subset=['主旨', '收文窗口'])
            .astype({'主旨': str, '收文窗口': str})
            .reset_index(drop=True)
        )
        corpus = df['主旨'].tolist()
        total_records = len(corpus)
        print(f"📊 載入全量資料: {total_records} 筆")
# --- 3. Load the model and build the index ---
model = None
try:
    print("🧠 正在載入模型 (BAAI/bge-small-zh-v1.5)...")
    model = SentenceTransformer('BAAI/bge-small-zh-v1.5')
except Exception as e:
    print(f"❌ 模型載入失敗: {e}")

corpus_embeddings = None

if total_records > 0 and model is not None:
    # Reuse the cached index when one exists.
    if os.path.exists(INDEX_FILE):
        print("⚡ 偵測到快取檔案,正在秒速載入...")
        try:
            # NOTE(review): torch.load unpickles the file; only load cache
            # files that this application wrote itself.
            corpus_embeddings = torch.load(INDEX_FILE)
        except Exception as e:
            print(f"❌ 快取檔案損壞,將重新計算。錯誤: {e}")
            corpus_embeddings = None

        if corpus_embeddings is not None:
            # A cache built from a different data set would pair scores with
            # the wrong rows, so it must have exactly one row per record.
            if (
                isinstance(corpus_embeddings, torch.Tensor)
                and corpus_embeddings.dim() == 2
                and corpus_embeddings.shape[0] == total_records
            ):
                print("✅ 索引載入完成!")
            else:
                print("⚠️ 快取索引與目前資料筆數不符,將重新計算。")
                corpus_embeddings = None

    # No usable cache: compute the embeddings in chunks.
    if corpus_embeddings is None:
        print("🔥 開始計算索引 (需時約 2-4 分鐘,請耐心等候)...")
        chunk_size = 500
        embeddings_chunks = []
        try:
            for i in range(0, total_records, chunk_size):
                batch = corpus[i : i + chunk_size]
                batch_emb = model.encode(
                    batch, convert_to_tensor=True, show_progress_bar=False
                )
                embeddings_chunks.append(batch_emb)
                print(f" -> 已處理 {min(i + chunk_size, total_records)} / {total_records} 筆...")
                gc.collect()
            corpus_embeddings = torch.cat(embeddings_chunks)
        except Exception as e:
            print(f"❌ 索引計算失敗: {e}")
            corpus_embeddings = None
        else:
            # Persist to disk so the next start-up is fast. A failed write
            # must not throw away the embeddings that were just computed.
            try:
                torch.save(corpus_embeddings, INDEX_FILE)
                print("✅ 索引計算並儲存完成!")
            except Exception as e:
                print(f"⚠️ 索引已計算完成,但無法寫入快取檔: {e}")


# --- 4. Search ---
def _confidence_label(score_val):
    """Map a cosine-similarity score to the label shown next to it."""
    if score_val > 0.7:
        return "⭐⭐⭐ 極高"
    if score_val > 0.55:
        return "⭐⭐ 高"
    return "⭐ 參考"


def search_department(query):
    """Return a text report of the records most similar to *query*.

    The query is encoded with the module-level ``model`` and compared by
    cosine similarity against ``corpus_embeddings``. Up to three of the
    highest-scoring rows of ``df`` are listed with their receiving unit,
    subject, score and confidence label.

    Args:
        query: Subject text entered by the user. ``None``, empty and
            whitespace-only values yield a prompt message.

    Returns:
        The formatted report, or a short message when the index is
        unavailable or the query is blank.
    """
    if corpus_embeddings is None:
        return "⚠️ 系統初始化失敗,請檢查 Logs。"
    if not query or not query.strip():
        return "請輸入公文主旨..."

    query_embedding = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    # Bound k by the number of scores actually computed.
    top_k = min(3, len(cos_scores))
    top_results = torch.topk(cos_scores, k=top_k)

    parts = ["🔍 分析結果:\n", "=" * 30, "\n"]
    for score, idx in zip(top_results.values, top_results.indices):
        idx = idx.item()
        # Skip any index that has no corresponding row in the table.
        if idx >= len(df):
            continue
        row = df.iloc[idx]
        score_val = score.item()
        confidence = _confidence_label(score_val)
        parts.append(f"【推薦單位】:{row['收文窗口']}\n")
        parts.append(f" - 歷史案例:{row['主旨']}\n")
        parts.append(f" - 相似度:{score_val:.4f} ({confidence})\n")
        parts.append("-" * 20 + "\n")
    return "".join(parts)
# --- 5. User interface (login required) ---
# The status indicator reflects whether the embedding index is available.
_status_text = '🟢 系統正常' if corpus_embeddings is not None else '🔴 異常'
_description = f"系統狀態:{_status_text}\n資料庫收錄:{total_records} 筆歷史資料"

_query_box = gr.Textbox(lines=3, placeholder="請輸入公文主旨...")
_result_box = gr.Textbox(lines=12, label="AI 判決建議")
_example_queries = [
    ["檢送本署彙整人工生殖機構之捐贈生殖細胞使用情形"],
    ["函轉衛生局關於流感疫苗接種計畫"],
]

iface = gr.Interface(
    fn=search_department,
    inputs=_query_box,
    outputs=_result_box,
    title=SYSTEM_TITLE,
    description=_description,
    examples=_example_queries,
)

if __name__ == "__main__":
    # Launch with username/password authentication enabled.
    iface.launch(auth=LOGIN_DATA)