Spaces:
Running
Running
/**
 * Tokens excluded from keyword matching: Chinese function words,
 * generic academic filler ("research", "analysis", ...) and
 * enumeration markers commonly found in titles.
 */
const stopwords = new Set([
  "的", "与", "及", "和", "或", "对", "及其", "相关",
  "研究", "分析", "探索", "问题", "关系", "视角", "基于", "视域",
  "语境", "视野", "比较", "一", "二", "三", "四", "五",
  "论", "探", "略", "述", "试论", "研究者",
]);
// Global mutable application state shared by load/analyze/render steps.
const state = {
  records: [], // loaded rows: { year, tokens, title, source }
  tokenDocFreq: new Map(), // token -> number of records containing it
  yearMin: null, // earliest year observed across loaded records
  yearMax: null, // latest year observed across loaded records
  loaded: false, // set true once a dataset finished loading
  lastResults: [], // ranked candidate tokens from the latest analyze()
  lastInputs: [], // normalized input keywords of the latest analyze()
  matchedRecords: 0, // records hit by at least one input keyword
};
/** Cached DOM element handles, resolved once at startup. */
const els = (() => {
  const byId = (id) => document.getElementById(id);
  return {
    projectFile: byId("projectFile"),
    vectorsFile: byId("vectorsFile"),
    useTokens: byId("useTokens"),
    usePhrases: byId("usePhrases"),
    useVectors: byId("useVectors"),
    loadBtn: byId("loadBtn"),
    sampleBtn: byId("sampleBtn"),
    loadStatus: byId("loadStatus"),
    loadProgress: byId("loadProgress"),
    dataStats: byId("dataStats"),
    keywordInput: byId("keywordInput"),
    resultLimit: byId("resultLimit"),
    minCo: byId("minCo"),
    sortBy: byId("sortBy"),
    analyzeBtn: byId("analyzeBtn"),
    exportBtn: byId("exportBtn"),
    queryStatus: byId("queryStatus"),
    resultList: byId("resultList"),
    resultSummary: byId("resultSummary"),
    llmProvider: byId("llmProvider"),
    llmKey: byId("llmKey"),
    llmModel: byId("llmModel"),
    llmBtn: byId("llmBtn"),
    llmStatus: byId("llmStatus"),
    llmOutput: byId("llmOutput"),
  };
})();
// Models selectable per provider; also serves as the allow-list that
// callLLM() enforces before issuing a request.
const MODEL_OPTIONS = {
  openrouter: [
    "openai/gpt-5.2",
    "openai/gpt-4o-mini",
    "anthropic/claude-sonnet-4.5",
    "anthropic/claude-opus-4.5",
  ],
  deepseek: ["deepseek-chat"],
};
/**
 * Rebuild the model <select> so it lists only the models allowed for
 * the currently chosen provider.
 */
function updateModelOptions() {
  const available = MODEL_OPTIONS[els.llmProvider.value] || [];
  els.llmModel.innerHTML = "";
  available.forEach((modelName) => {
    const option = document.createElement("option");
    option.value = modelName;
    option.textContent = modelName;
    els.llmModel.appendChild(option);
  });
}
/**
 * Canonicalize a raw token for matching.
 * @param {*} token - raw token value (any type; coerced to string)
 * @returns {?string} the trimmed token, or null when it is falsy,
 *   empty after trimming, shorter than two characters, or a stopword.
 */
function normalizeToken(token) {
  if (!token) return null;
  const trimmed = String(token).trim();
  // An empty trimmed string also fails the length guard below.
  if (trimmed.length < 2 || stopwords.has(trimmed)) return null;
  return trimmed;
}
/**
 * Refresh the three stat widgets (record count, vocabulary size,
 * year range) from the current state.
 * Assumes the dataStats panel contains three ".stat__value" nodes.
 */
function updateStats() {
  const slots = els.dataStats.querySelectorAll(".stat__value");
  slots[0].textContent = state.records.length.toLocaleString();
  slots[1].textContent = state.tokenDocFreq.size.toLocaleString();
  const hasRange = state.yearMin && state.yearMax;
  slots[2].textContent = hasRange ? `${state.yearMin} - ${state.yearMax}` : "-";
}
/**
 * Display a status message in the given element.
 * @param {Element} target - element whose textContent is replaced
 * @param {string} message - text to show
 */
function setStatus(target, message) {
  target.textContent = message;
}
/**
 * Set the load progress bar width.
 * @param {number} percent - progress in [0, 100]
 */
function setProgress(percent) {
  const width = `${percent.toFixed(1)}%`;
  els.loadProgress.style.width = width;
}
/**
 * Parse JSONL text synchronously, invoking onRow for each valid line.
 * Blank and malformed lines are skipped silently (best-effort ingest).
 * @param {string} text - newline-separated JSON objects
 * @param {(obj: any) => void} onRow - callback per parsed object
 */
function parseJsonlText(text, onRow) {
  text.split("\n").forEach((rawLine) => {
    const line = rawLine.trim();
    if (!line) return;
    try {
      onRow(JSON.parse(line));
    } catch (err) {
      // Malformed JSON line: skipped on purpose.
    }
  });
}
/**
 * Stream-parse a JSONL File/Blob, invoking onRow for each valid JSON
 * line. Blank and malformed lines are skipped silently (best-effort).
 *
 * Fixes over the previous version:
 * - the TextDecoder is flushed after the stream ends, so a trailing
 *   multi-byte UTF-8 sequence buffered by streaming decode is no
 *   longer silently dropped;
 * - the reader lock is released in a finally block, so an exception
 *   thrown by onRow cannot leave the stream permanently locked.
 *
 * @param {Blob} file - source whose .stream() yields UTF-8 bytes
 * @param {(obj: any) => void} onRow - callback per parsed object
 * @param {(ratio: number) => void} [onProgress] - progress in [0, 1]
 */
async function parseJsonlFile(file, onRow, onProgress) {
  const decoder = new TextDecoder("utf-8");
  const reader = file.stream().getReader();
  let buffer = "";
  let loaded = 0;
  // Parse one line, ignoring blanks and malformed JSON.
  const emit = (line) => {
    const trimmed = line.trim();
    if (!trimmed) return;
    try {
      onRow(JSON.parse(trimmed));
    } catch (err) {
      // Skip malformed line.
    }
  };
  try {
    while (true) {
      const { value, done } = await reader.read();
      if (done) break;
      loaded += value.length;
      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split("\n");
      // The last piece may be an incomplete line; keep it buffered.
      buffer = lines.pop();
      for (const line of lines) emit(line);
      if (onProgress) onProgress(loaded / file.size);
    }
    // Flush any bytes the streaming decoder is still holding, then
    // process the final (unterminated) line.
    buffer += decoder.decode();
    emit(buffer);
  } finally {
    reader.releaseLock();
  }
  if (onProgress) onProgress(1);
}
/**
 * Normalize, dedupe and store one record's tokens, updating the global
 * document-frequency map and the observed year range. Records with no
 * surviving tokens after normalization are dropped.
 * @param {Array} tokens - raw token candidates
 * @param {number} year - record year (falsy values skip range update)
 * @param {string} title - record title
 * @param {string} source - provenance tag ("project"/"vectors"/"sample")
 */
function addRecord(tokens, year, title, source) {
  if (!tokens || tokens.length === 0) return;
  const kept = new Set();
  for (const raw of tokens) {
    const norm = normalizeToken(raw);
    if (norm) kept.add(norm);
  }
  if (kept.size === 0) return;
  state.records.push({ year, tokens: [...kept], title, source });
  kept.forEach((t) => {
    state.tokenDocFreq.set(t, (state.tokenDocFreq.get(t) || 0) + 1);
  });
  if (year) {
    // Falsy years (0/NaN) are deliberately excluded from the range.
    if (!state.yearMin || year < state.yearMin) state.yearMin = year;
    if (!state.yearMax || year > state.yearMax) state.yearMax = year;
  }
}
/**
 * Load the user-selected JSONL files (project records and/or vector
 * rows) into state, resetting previously loaded data first. The
 * progress bar is split: first half for the project file, second half
 * for the vectors file.
 */
async function loadData() {
  // Reset previously loaded data.
  state.records = [];
  state.tokenDocFreq.clear();
  state.yearMin = null;
  state.yearMax = null;
  state.loaded = false;
  setProgress(0);
  setStatus(els.loadStatus, "加载中...");

  const useTokens = els.useTokens.checked;
  const usePhrases = els.usePhrases.checked;
  const useVectors = els.useVectors.checked;
  const projectFile = els.projectFile.files[0];
  const vectorsFile = els.vectorsFile.files[0];

  if (!projectFile && !vectorsFile) {
    setStatus(els.loadStatus, "请至少选择一个 jsonl 文件");
    return;
  }

  if (projectFile) {
    const ingestProjectRow = (row) => {
      const year = Number(row.year);
      const nlp = row.nlp || {};
      const tokens = [];
      if (useTokens && Array.isArray(nlp.tokens)) tokens.push(...nlp.tokens);
      if (usePhrases && Array.isArray(nlp.phrases)) tokens.push(...nlp.phrases);
      addRecord(tokens, year, row.title, "project");
    };
    await parseJsonlFile(projectFile, ingestProjectRow, (ratio) =>
      setProgress(ratio * 50)
    );
  }

  if (vectorsFile && useVectors) {
    const ingestVectorRow = (row) => {
      const year = Number(row.year);
      const tokens = String(row.name_tokens || "").split(/\s+/);
      addRecord(tokens, year, row.name, "vectors");
    };
    await parseJsonlFile(vectorsFile, ingestVectorRow, (ratio) =>
      setProgress(50 + ratio * 50)
    );
  }

  state.loaded = true;
  updateStats();
  setStatus(els.loadStatus, "加载完成");
  setProgress(100);
}
/**
 * Fetch and load the bundled sample JSONL dataset, resetting any
 * previously loaded data first. Requires at least one of tokens /
 * phrases to be enabled.
 */
async function loadSample() {
  // Reset previously loaded data.
  state.records = [];
  state.tokenDocFreq.clear();
  state.yearMin = null;
  state.yearMax = null;
  state.loaded = false;
  setProgress(0);
  setStatus(els.loadStatus, "加载示例数据...");

  const useTokens = els.useTokens.checked;
  const usePhrases = els.usePhrases.checked;
  if (!useTokens && !usePhrases) {
    setStatus(els.loadStatus, "请至少勾选 tokens 或 phrases");
    return;
  }

  try {
    const resp = await fetch("sample_project.jsonl");
    if (!resp.ok) throw new Error("示例数据加载失败");
    const text = await resp.text();
    parseJsonlText(text, (row) => {
      const nlp = row.nlp || {};
      const tokens = [];
      if (useTokens && Array.isArray(nlp.tokens)) tokens.push(...nlp.tokens);
      if (usePhrases && Array.isArray(nlp.phrases)) tokens.push(...nlp.phrases);
      addRecord(tokens, Number(row.year), row.title, "sample");
    });
    state.loaded = true;
    updateStats();
    setProgress(100);
    setStatus(els.loadStatus, "示例数据加载完成");
  } catch (err) {
    setStatus(els.loadStatus, err.message);
  }
}
/**
 * Split free-form user input into unique, normalized keywords.
 * Accepts whitespace plus common ASCII/CJK list separators.
 *
 * Fix: the separator class now also matches the fullwidth comma ","
 * (U+FF0C), which Chinese IMEs produce by default; previously it was
 * left glued to the adjacent keyword and the combined string failed
 * to match anything.
 *
 * @param {string} text - raw textarea/input content
 * @returns {string[]} unique normalized keywords, in first-seen order
 */
function parseInputKeywords(text) {
  const raw = text
    .replace(/[,,、;;]+/g, " ")
    .split(/\s+/)
    .map((t) => t.trim())
    .filter(Boolean);
  const uniq = new Set();
  for (const item of raw) {
    const norm = normalizeToken(item);
    if (norm) uniq.add(norm);
  }
  return Array.from(uniq);
}
/**
 * Run the co-occurrence analysis for the keywords typed in the input
 * box, store the ranked candidates in state.lastResults and render.
 *
 * For each candidate token t (over records containing at least one
 * input keyword):
 *   coCount  - matched records that also contain t
 *   df       - document frequency of t in the whole corpus
 *   coverage - distinct input keywords t co-occurred with
 *   score    = (coCount / df) * (coverage / |inputs|) * ln(1 + coCount)
 */
function analyze() {
  if (!state.loaded) {
    setStatus(els.queryStatus, "请先加载数据");
    return;
  }
  const inputKeywords = parseInputKeywords(els.keywordInput.value);
  if (inputKeywords.length === 0) {
    setStatus(els.queryStatus, "请输入至少一个关键词");
    return;
  }
  const inputSet = new Set(inputKeywords);
  // candidate token -> number of matched records containing it
  const candidateCo = new Map();
  // candidate token -> set of input keywords it co-occurred with
  const candidateHitInputs = new Map();
  // input keyword -> number of records containing it (for the status line)
  const inputCounts = new Map();
  inputKeywords.forEach((k) => inputCounts.set(k, 0));
  let matchedRecords = 0;
  for (const record of state.records) {
    // Which input keywords does this record contain?
    const hits = [];
    for (const t of record.tokens) {
      if (inputSet.has(t)) {
        hits.push(t);
      }
    }
    if (hits.length === 0) continue;
    matchedRecords += 1;
    hits.forEach((h) => inputCounts.set(h, (inputCounts.get(h) || 0) + 1));
    // Every non-input token of a matched record becomes a candidate.
    const uniq = new Set(record.tokens);
    for (const t of uniq) {
      if (inputSet.has(t)) continue;
      candidateCo.set(t, (candidateCo.get(t) || 0) + 1);
      if (!candidateHitInputs.has(t)) {
        candidateHitInputs.set(t, new Set());
      }
      const hitSet = candidateHitInputs.get(t);
      hits.forEach((h) => hitSet.add(h));
    }
  }
  const results = [];
  for (const [token, coCount] of candidateCo.entries()) {
    // Fall back to coCount so coRate stays <= 1 even if the token is
    // somehow missing from the global frequency map.
    const df = state.tokenDocFreq.get(token) || coCount;
    const coverage = candidateHitInputs.get(token)?.size || 0;
    const coRate = df ? coCount / df : 0;
    const score =
      (coRate * (coverage / inputKeywords.length)) *
      Math.log(1 + coCount);
    results.push({
      token,
      coCount,
      df,
      coverage,
      score,
      coRate,
    });
  }
  results.sort((a, b) => b.score - a.score);
  state.lastResults = results;
  state.lastInputs = inputKeywords;
  state.matchedRecords = matchedRecords;
  renderResults();
  // Status line: each input keyword annotated with its hit count.
  const inputStats = inputKeywords
    .map((k) => `${k}(${inputCounts.get(k) || 0})`)
    .join("、");
  setStatus(
    els.queryStatus,
    `输入关键词:${inputStats};命中记录 ${matchedRecords.toLocaleString()} 条`
  );
}
/**
 * Minimal HTML escaping for untrusted text interpolated into an
 * innerHTML template.
 * @param {*} value - text to escape (coerced to string)
 * @returns {string} HTML-safe text
 */
function escapeHtml(value) {
  return String(value)
    .replaceAll("&", "&amp;")
    .replaceAll("<", "&lt;")
    .replaceAll(">", "&gt;")
    .replaceAll('"', "&quot;");
}
/**
 * Render the filtered, sorted and truncated candidate list into the
 * result panel, with a per-item intensity bar scaled to the max score.
 *
 * Fixes over the previous version:
 * - tokens originate from user-supplied data files and are now
 *   HTML-escaped before interpolation into innerHTML (injection risk);
 * - an unknown sortBy value now falls back to score ordering instead
 *   of calling sort(undefined).
 */
function renderResults() {
  const limit = Number(els.resultLimit.value) || 80;
  const minCo = Number(els.minCo.value) || 1;
  const sortBy = els.sortBy.value;
  let results = state.lastResults.filter((r) => r.coCount >= minCo);
  const sortMap = {
    score: (a, b) => b.score - a.score,
    coCount: (a, b) => b.coCount - a.coCount,
    coverage: (a, b) => b.coverage - a.coverage,
    coRate: (a, b) => b.coRate - a.coRate,
  };
  results.sort(sortMap[sortBy] || sortMap.score);
  results = results.slice(0, limit);
  // Epsilon floor keeps the division below safe when all scores are 0.
  const maxScore = results.reduce((max, r) => Math.max(max, r.score), 0.0001);
  els.resultList.innerHTML = "";
  for (const r of results) {
    const item = document.createElement("div");
    item.className = "result-item";
    const intensity = Math.min(1, r.score / maxScore);
    const hue = 20 + (1 - intensity) * 80;
    item.innerHTML = `
      <div class="result-item__kw">${escapeHtml(r.token)}</div>
      <div class="bar"><span style="width:${(intensity * 100).toFixed(
        1
      )}%;background:linear-gradient(90deg, hsl(${hue},70%,45%), hsl(${
        hue + 20
      },80%,60%));"></span></div>
      <div class="meta">
        <span>适配:${r.score.toFixed(3)}</span>
        <span>共现:${r.coCount}</span>
        <span>覆盖:${r.coverage}/${state.lastInputs.length}</span>
      </div>
    `;
    els.resultList.appendChild(item);
  }
  els.resultSummary.textContent = `共找到 ${state.lastResults.length.toLocaleString()} 个候选搭配词,当前展示 ${results.length} 个。`;
}
/**
 * Download the current results as a UTF-8 CSV file via a temporary
 * object URL and synthetic anchor click.
 *
 * Fixes over the previous version:
 * - fields are RFC 4180-quoted when they contain commas, quotes or
 *   newlines (keyword tokens are user data and may contain any of
 *   these), with embedded quotes doubled;
 * - a UTF-8 BOM is prepended so Excel detects the encoding and
 *   renders CJK text correctly.
 */
function exportCsv() {
  if (!state.lastResults.length) return;
  // Quote a CSV cell only when it contains a delimiter/quote/newline.
  const cell = (value) => {
    const s = String(value);
    return /[",\n\r]/.test(s) ? `"${s.replaceAll('"', '""')}"` : s;
  };
  const rows = [
    ["keyword", "score", "co_count", "df", "coverage", "co_rate"],
  ];
  for (const r of state.lastResults) {
    rows.push([
      r.token,
      r.score.toFixed(6),
      r.coCount,
      r.df,
      r.coverage,
      r.coRate.toFixed(6),
    ]);
  }
  const csv = rows.map((row) => row.map(cell).join(",")).join("\n");
  const blob = new Blob(["\uFEFF", csv], { type: "text/csv;charset=utf-8;" });
  const url = URL.createObjectURL(blob);
  const a = document.createElement("a");
  a.href = url;
  a.download = "keyword_pairs.csv";
  document.body.appendChild(a);
  a.click();
  document.body.removeChild(a);
  URL.revokeObjectURL(url);
}
/**
 * Assemble the Chinese-language prompt sent to the LLM: input
 * keywords, corpus metadata, the top-30 ranked candidate tokens, and
 * a fixed block of task instructions.
 * @returns {string} the complete user-role prompt
 */
function buildPrompt() {
  // Only the 30 highest-ranked candidates are surfaced to the model.
  const top = state.lastResults.slice(0, 30);
  const list = top
    .map(
      (r, idx) =>
        `${idx + 1}. ${r.token}(适配:${r.score.toFixed(
          3
        )}, 共现:${r.coCount}, 覆盖:${r.coverage}/${state.lastInputs.length})`
    )
    .join("\n");
  const yearRange =
    state.yearMin && state.yearMax ? `${state.yearMin}-${state.yearMax}` : "未知";
  return `你是研究选题助手,请以数据库统计为主,结合搭配关键词进行整合与拓展。请用中文输出。\n\n` +
    `输入关键词:${state.lastInputs.join("、")}\n` +
    `数据库年份范围:${yearRange}\n` +
    `命中记录数:${state.matchedRecords}\n\n` +
    `数据库高分搭配关键词(按适配度排序,优先使用):\n${list}\n\n` +
    `任务要求:\n` +
    `1) 将搭配关键词做 3-6 组主题化整合,每组给出一句聚合说明。\n` +
    `2) 给出 6-10 个拟申报选题建议,每条包含题目 + 1 句摘要。\n` +
    `3) 给出综合评价(创新性/可行性/政策相关性/学术价值,各 1-5 分)并简述理由。\n` +
    `4) 额外给出不超过 10 个“LLM 扩展关键词”,标注为扩展来源。\n` +
    `5) 提醒 2-3 条可能风险或选题陷阱。`;
}
/**
 * Send the generated prompt to the selected chat-completions endpoint
 * (OpenRouter or DeepSeek) and display the first choice's content in
 * the output panel. Requires analyze() results, an API key, and a
 * model present in MODEL_OPTIONS for the chosen provider.
 */
async function callLLM() {
  if (!state.lastResults.length) {
    setStatus(els.llmStatus, "请先生成搭配词");
    return;
  }
  const provider = els.llmProvider.value;
  const key = els.llmKey.value.trim();
  const model = els.llmModel.value.trim();
  if (!key || !model) {
    setStatus(els.llmStatus, "请填写 API Key 和模型");
    return;
  }
  let url = "";
  const headers = {
    "Content-Type": "application/json",
    Authorization: `Bearer ${key}`,
  };
  if (provider === "openrouter") {
    url = "https://openrouter.ai/api/v1/chat/completions";
    // OpenRouter attribution headers; pages served from file:// report
    // origin "null", so fall back to a localhost placeholder.
    const origin = typeof location !== "undefined" ? location.origin : "";
    headers["HTTP-Referer"] = origin && origin !== "null" ? origin : "http://localhost";
    headers["X-Title"] = "Local Keyword Matcher";
  } else {
    url = "https://api.deepseek.com/chat/completions";
  }
  // Refuse models outside the hard-coded allow-list.
  const allowed = MODEL_OPTIONS[provider] || [];
  if (!allowed.includes(model)) {
    setStatus(els.llmStatus, "模型不在允许列表");
    return;
  }
  const body = {
    model,
    messages: [
      { role: "system", content: "你是严谨的研究选题助手。" },
      { role: "user", content: buildPrompt() },
    ],
    temperature: 0.7,
  };
  setStatus(els.llmStatus, "请求中...");
  els.llmOutput.textContent = "生成中,请稍候...";
  try {
    const resp = await fetch(url, {
      method: "POST",
      headers,
      body: JSON.stringify(body),
    });
    if (!resp.ok) {
      // Surface the provider's error payload when available.
      const text = await resp.text();
      throw new Error(text || "请求失败");
    }
    const data = await resp.json();
    const content = data.choices?.[0]?.message?.content;
    els.llmOutput.textContent = content || "未返回内容";
    setStatus(els.llmStatus, "完成");
  } catch (err) {
    els.llmOutput.textContent = `错误:${err.message}`;
    setStatus(els.llmStatus, "请求失败");
  }
}
// Wire UI events once at startup.
const uiBindings = [
  [els.loadBtn, "click", loadData],
  [els.sampleBtn, "click", loadSample],
  [els.analyzeBtn, "click", analyze],
  [els.exportBtn, "click", exportCsv],
  [els.sortBy, "change", renderResults],
  [els.resultLimit, "change", renderResults],
  [els.minCo, "change", renderResults],
  [els.llmBtn, "click", callLLM],
  [els.llmProvider, "change", updateModelOptions],
];
for (const [el, eventName, handler] of uiBindings) {
  el.addEventListener(eventName, handler);
}
// Populate the model dropdown for the initially selected provider.
updateModelOptions();