/**
 * Keyword collocation explorer (browser script).
 *
 * Loads JSONL records, indexes their keyword tokens, and for a set of input
 * keywords ranks the other tokens that co-occur with them. Results can be
 * rendered, exported as CSV, or sent to an LLM chat-completions endpoint.
 */

// Tokens excluded from indexing and from user input (see normalizeToken).
const stopwords = new Set([
  "的",
  "与",
  "及",
  "和",
  "或",
  "对",
  "及其",
  "相关",
  "研究",
  "分析",
  "探索",
  "问题",
  "关系",
  "视角",
  "基于",
  "视域",
  "语境",
  "视野",
  "比较",
  "一",
  "二",
  "三",
  "四",
  "五",
  "论",
  "探",
  "略",
  "述",
  "试论",
  "研究者",
]);

// Mutable application state shared by the loaders, the analyzer and the UI.
const state = {
  records: [], // { year, tokens, title, source } objects created by addRecord
  tokenDocFreq: new Map(), // token -> number of records containing it
  yearMin: null,
  yearMax: null,
  loaded: false,
  lastResults: [], // output of the most recent analyze() call
  lastInputs: [], // normalized input keywords of that call
  matchedRecords: 0, // records containing at least one input keyword
};

// The DOM is only touched when a document exists, so the pure helpers in this
// file can also be loaded in a non-browser environment.
const hasDom = typeof document !== "undefined";

// Every key of `els` is also the id of the element it refers to.
const ELEMENT_IDS = [
  "projectFile",
  "vectorsFile",
  "useTokens",
  "usePhrases",
  "useVectors",
  "loadBtn",
  "sampleBtn",
  "loadStatus",
  "loadProgress",
  "dataStats",
  "keywordInput",
  "resultLimit",
  "minCo",
  "sortBy",
  "analyzeBtn",
  "exportBtn",
  "queryStatus",
  "resultList",
  "resultSummary",
  "llmProvider",
  "llmKey",
  "llmModel",
  "llmBtn",
  "llmStatus",
  "llmOutput",
];

const els = {};
if (hasDom) {
  for (const id of ELEMENT_IDS) {
    els[id] = document.getElementById(id);
  }
}

// Models that may be requested from each provider.
const MODEL_OPTIONS = {
  openrouter: [
    "openai/gpt-5.2",
    "openai/gpt-4o-mini",
    "anthropic/claude-sonnet-4.5",
    "anthropic/claude-opus-4.5",
  ],
  deepseek: ["deepseek-chat"],
};

/**
 * Looks up an own property of a plain-object table, ignoring inherited keys
 * such as "constructor".
 * @param {object} table
 * @param {string} key
 * @param {*} fallback - Returned when `key` is not an own property.
 * @returns {*}
 */
function ownOrDefault(table, key, fallback) {
  return Object.prototype.hasOwnProperty.call(table, key)
    ? table[key]
    : fallback;
}

/**
 * @param {string} provider
 * @returns {string[]} Models allowed for `provider` (empty when unknown).
 */
function getAllowedModels(provider) {
  return ownOrDefault(MODEL_OPTIONS, provider, []);
}

/** Rebuilds the model <select> options for the currently chosen provider. */
function updateModelOptions() {
  const options = getAllowedModels(els.llmProvider.value);
  els.llmModel.innerHTML = "";
  for (const name of options) {
    const opt = document.createElement("option");
    opt.value = name;
    opt.textContent = name;
    els.llmModel.appendChild(opt);
  }
}

/**
 * Normalizes a raw token.
 * @param {*} token
 * @returns {string|null} The trimmed token, or null when it is empty, shorter
 *   than two characters, or a stopword.
 */
function normalizeToken(token) {
  if (!token) return null;
  const t = String(token).trim();
  if (!t) return null;
  if (t.length < 2) return null;
  if (stopwords.has(t)) return null;
  return t;
}

/** Writes record count, distinct token count and year range into the stats panel. */
function updateStats() {
  const values = els.dataStats.querySelectorAll(".stat__value");
  values[0].textContent = state.records.length.toLocaleString();
  values[1].textContent = state.tokenDocFreq.size.toLocaleString();
  if (state.yearMin && state.yearMax) {
    values[2].textContent = `${state.yearMin} - ${state.yearMax}`;
  } else {
    values[2].textContent = "-";
  }
}

/**
 * Sets the text of a status element.
 * @param {Element} el
 * @param {string} text
 */
function setStatus(el, text) {
  el.textContent = text;
}

/**
 * Sets the width of the load progress bar.
 * @param {number} percent - Clamped to [0, 100]; non-finite values count as 0.
 */
function setProgress(percent) {
  const safe = Number.isFinite(percent)
    ? Math.min(100, Math.max(0, percent))
    : 0;
  els.loadProgress.style.width = `${safe.toFixed(1)}%`;
}

/**
 * Parses one JSONL line and passes the value to `onRow`.
 * Blank lines are ignored. Loading is best-effort: a line that fails to parse,
 * or whose row handler throws, is skipped rather than aborting the whole load.
 * @param {string} line
 * @param {(row: *) => void} onRow
 */
function emitJsonLine(line, onRow) {
  const trimmed = line.trim();
  if (!trimmed) return;
  try {
    onRow(JSON.parse(trimmed));
  } catch (err) {
    // Deliberately skipped: one bad row must not abort the load.
  }
}

/**
 * Parses JSONL text that is already fully in memory.
 * @param {string} text
 * @param {(row: *) => void} onRow - Called once per successfully parsed line.
 */
function parseJsonlText(text, onRow) {
  for (const line of text.split("\n")) {
    emitJsonLine(line, onRow);
  }
}

/**
 * Streams a JSONL file chunk by chunk and parses each complete line.
 * @param {Blob} file - Anything exposing `stream()` and `size`.
 * @param {(row: *) => void} onRow - Called once per successfully parsed line.
 * @param {(ratio: number) => void} [onProgress] - Receives the fraction of
 *   bytes read after each chunk, and 1 when parsing has finished.
 * @returns {Promise<void>} Rejects if reading the stream fails.
 */
async function parseJsonlFile(file, onRow, onProgress) {
  const decoder = new TextDecoder("utf-8");
  const reader = file.stream().getReader();
  let buffer = "";
  let loaded = 0;
  while (true) {
    const { value, done } = await reader.read();
    if (done) break;
    loaded += value.length;
    // stream: true keeps a multi-byte character that straddles two chunks intact.
    buffer += decoder.decode(value, { stream: true });
    const lines = buffer.split("\n");
    // The last piece may be an incomplete line; carry it into the next chunk.
    buffer = lines.pop();
    for (const line of lines) {
      emitJsonLine(line, onRow);
    }
    if (onProgress) {
      onProgress(file.size ? loaded / file.size : 1);
    }
  }
  // Flush any bytes the decoder is still holding, then handle the final line.
  buffer += decoder.decode();
  emitJsonLine(buffer, onRow);
  if (onProgress) {
    onProgress(1);
  }
}

/**
 * Adds one record to the index, updating document frequencies and year range.
 * Tokens are normalized and de-duplicated; a record with no usable token is
 * dropped.
 * @param {Array<*>} tokens
 * @param {number} year - Falsy values (0, NaN) do not affect the year range.
 * @param {string} title
 * @param {string} source - Label of the dataset the record came from.
 */
function addRecord(tokens, year, title, source) {
  if (!tokens || tokens.length === 0) return;
  const uniq = new Set();
  for (const token of tokens) {
    const norm = normalizeToken(token);
    if (norm) uniq.add(norm);
  }
  if (uniq.size === 0) return;
  state.records.push({ year, tokens: Array.from(uniq), title, source });
  for (const t of uniq) {
    state.tokenDocFreq.set(t, (state.tokenDocFreq.get(t) || 0) + 1);
  }
  if (year) {
    if (!state.yearMin || year < state.yearMin) state.yearMin = year;
    if (!state.yearMax || year > state.yearMax) state.yearMax = year;
  }
}

/** Clears the loaded dataset so that a new load starts from an empty index. */
function resetDataset() {
  state.records = [];
  state.tokenDocFreq.clear();
  state.yearMin = null;
  state.yearMax = null;
  state.loaded = false;
}

/**
 * Collects the raw tokens of a project row from `nlp.tokens` / `nlp.phrases`.
 * @param {object} obj - Parsed JSONL row.
 * @param {boolean} useTokens
 * @param {boolean} usePhrases
 * @returns {Array<*>}
 */
function collectProjectTokens(obj, useTokens, usePhrases) {
  const nlp = obj.nlp || {};
  const tokens = [];
  if (useTokens && Array.isArray(nlp.tokens)) {
    tokens.push(...nlp.tokens);
  }
  if (usePhrases && Array.isArray(nlp.phrases)) {
    tokens.push(...nlp.phrases);
  }
  return tokens;
}

/**
 * Loads the user-selected project and/or vectors JSONL files into `state`.
 * A read failure is reported in the load status instead of being left as an
 * unhandled rejection.
 */
async function loadData() {
  resetDataset();
  setProgress(0);
  setStatus(els.loadStatus, "加载中...");
  const useTokens = els.useTokens.checked;
  const usePhrases = els.usePhrases.checked;
  const useVectors = els.useVectors.checked;
  const projectFile = els.projectFile.files[0];
  const vectorsFile = els.vectorsFile.files[0];
  if (!projectFile && !vectorsFile) {
    setStatus(els.loadStatus, "请至少选择一个 jsonl 文件");
    return;
  }
  try {
    // Each file owns half of the progress bar.
    if (projectFile) {
      await parseJsonlFile(
        projectFile,
        (obj) => {
          const tokens = collectProjectTokens(obj, useTokens, usePhrases);
          addRecord(tokens, Number(obj.year), obj.title, "project");
        },
        (ratio) => setProgress(ratio * 50)
      );
    }
    if (vectorsFile && useVectors) {
      await parseJsonlFile(
        vectorsFile,
        (obj) => {
          const tokens = String(obj.name_tokens || "").split(/\s+/);
          addRecord(tokens, Number(obj.year), obj.name, "vectors");
        },
        (ratio) => setProgress(50 + ratio * 50)
      );
    }
  } catch (err) {
    setStatus(els.loadStatus, `加载失败:${err.message}`);
    return;
  }
  state.loaded = true;
  updateStats();
  setStatus(els.loadStatus, "加载完成");
  setProgress(100);
}

/** Fetches `sample_project.jsonl` and loads it into `state`. */
async function loadSample() {
  resetDataset();
  setProgress(0);
  setStatus(els.loadStatus, "加载示例数据...");
  const useTokens = els.useTokens.checked;
  const usePhrases = els.usePhrases.checked;
  if (!useTokens && !usePhrases) {
    setStatus(els.loadStatus, "请至少勾选 tokens 或 phrases");
    return;
  }
  try {
    const resp = await fetch("sample_project.jsonl");
    if (!resp.ok) {
      throw new Error("示例数据加载失败");
    }
    const text = await resp.text();
    parseJsonlText(text, (obj) => {
      const tokens = collectProjectTokens(obj, useTokens, usePhrases);
      addRecord(tokens, Number(obj.year), obj.title, "sample");
    });
    state.loaded = true;
    updateStats();
    setProgress(100);
    setStatus(els.loadStatus, "示例数据加载完成");
  } catch (err) {
    setStatus(els.loadStatus, err.message);
  }
}

/**
 * Splits free-form keyword input into unique normalized keywords.
 * Separators: whitespace, ASCII and full-width commas and semicolons, and the
 * ideographic comma "、".
 * @param {string} text
 * @returns {string[]} Keywords in first-seen order.
 */
function parseInputKeywords(text) {
  const uniq = new Set();
  // \uFF0C and \uFF1B are the full-width comma and semicolon.
  const pieces = text.replace(/[,\uFF0C、;\uFF1B]+/g, " ").split(/\s+/);
  for (const piece of pieces) {
    const norm = normalizeToken(piece);
    if (norm) uniq.add(norm);
  }
  return Array.from(uniq);
}

/**
 * Ranks the tokens that co-occur with the input keywords.
 *
 * For each candidate token:
 *   coCount  = records containing the candidate and at least one input keyword
 *   df       = records containing the candidate at all (falls back to coCount)
 *   coverage = distinct input keywords the candidate co-occurred with
 *   coRate   = coCount / df
 *   score    = coRate * (coverage / inputKeywords.length) * ln(1 + coCount)
 *
 * @param {Array<{tokens: string[]}>} records
 * @param {Map<string, number>} tokenDocFreq
 * @param {string[]} inputKeywords
 * @returns {{results: object[], matchedRecords: number,
 *   inputCounts: Map<string, number>}} `results` is sorted by descending score.
 */
function computeCollocations(records, tokenDocFreq, inputKeywords) {
  const inputSet = new Set(inputKeywords);
  const coCounts = new Map();
  const hitInputsByCandidate = new Map();
  const inputCounts = new Map(inputKeywords.map((k) => [k, 0]));
  let matchedRecords = 0;

  for (const record of records) {
    const tokens = new Set(record.tokens);
    const hits = [...tokens].filter((t) => inputSet.has(t));
    if (hits.length === 0) continue;
    matchedRecords += 1;
    for (const hit of hits) {
      inputCounts.set(hit, (inputCounts.get(hit) || 0) + 1);
    }
    for (const token of tokens) {
      if (inputSet.has(token)) continue;
      coCounts.set(token, (coCounts.get(token) || 0) + 1);
      let hitInputs = hitInputsByCandidate.get(token);
      if (!hitInputs) {
        hitInputs = new Set();
        hitInputsByCandidate.set(token, hitInputs);
      }
      for (const hit of hits) {
        hitInputs.add(hit);
      }
    }
  }

  const results = [];
  for (const [token, coCount] of coCounts.entries()) {
    const df = tokenDocFreq.get(token) || coCount;
    const coverage = hitInputsByCandidate.get(token)?.size || 0;
    const coRate = df ? coCount / df : 0;
    const score =
      coRate * (coverage / inputKeywords.length) * Math.log(1 + coCount);
    results.push({ token, coCount, df, coverage, score, coRate });
  }
  results.sort((a, b) => b.score - a.score);
  return { results, matchedRecords, inputCounts };
}

/** Runs the collocation analysis for the keywords in the input box and renders it. */
function analyze() {
  if (!state.loaded) {
    setStatus(els.queryStatus, "请先加载数据");
    return;
  }
  const inputKeywords = parseInputKeywords(els.keywordInput.value);
  if (inputKeywords.length === 0) {
    setStatus(els.queryStatus, "请输入至少一个关键词");
    return;
  }
  const { results, matchedRecords, inputCounts } = computeCollocations(
    state.records,
    state.tokenDocFreq,
    inputKeywords
  );
  state.lastResults = results;
  state.lastInputs = inputKeywords;
  state.matchedRecords = matchedRecords;
  renderResults();
  const inputStats = inputKeywords
    .map((k) => `${k}(${inputCounts.get(k) || 0})`)
    .join("、");
  setStatus(
    els.queryStatus,
    `输入关键词:${inputStats};命中记录 ${matchedRecords.toLocaleString()} 条`
  );
}

// Descending comparators selectable through the "sortBy" control.
const RESULT_COMPARATORS = {
  score: (a, b) => b.score - a.score,
  coCount: (a, b) => b.coCount - a.coCount,
  coverage: (a, b) => b.coverage - a.coverage,
  coRate: (a, b) => b.coRate - a.coRate,
};

/**
 * Renders `state.lastResults` using the current limit, minimum co-occurrence
 * and sort controls. `state.lastResults` itself is not modified.
 */
function renderResults() {
  const limit = Number(els.resultLimit.value) || 80;
  const minCo = Number(els.minCo.value) || 1;
  // An unrecognized sort key falls back to sorting by score.
  const comparator = ownOrDefault(
    RESULT_COMPARATORS,
    els.sortBy.value,
    RESULT_COMPARATORS.score
  );
  const results = state.lastResults
    .filter((r) => r.coCount >= minCo) // filter() copies, so sort() is safe
    .sort(comparator)
    .slice(0, limit);

  els.resultList.innerHTML = "";
  for (const r of results) {
    const item = document.createElement("div");
    item.className = "result-item";
    // Tokens come from user-supplied files, so they are inserted as text and
    // never parsed as HTML.
    item.textContent = `
${r.token}
适配:${r.score.toFixed(3)} 共现:${r.coCount} 覆盖:${r.coverage}/${state.lastInputs.length}
`;
    els.resultList.appendChild(item);
  }
  els.resultSummary.textContent = `共找到 ${state.lastResults.length.toLocaleString()} 个候选搭配词,当前展示 ${results.length} 个。`;
}

/**
 * Escapes one CSV field: a value containing a comma, double quote or line
 * break is wrapped in double quotes with embedded quotes doubled.
 * @param {*} value
 * @returns {string}
 */
function csvEscape(value) {
  const text = String(value);
  if (!/[",\r\n]/.test(text)) return text;
  return `"${text.replace(/"/g, '""')}"`;
}

/**
 * Serializes analysis results as CSV text with a header row.
 * @param {object[]} results - Entries shaped like those in `state.lastResults`.
 * @returns {string}
 */
function buildCsv(results) {
  const rows = [["keyword", "score", "co_count", "df", "coverage", "co_rate"]];
  for (const r of results) {
    rows.push([
      r.token,
      r.score.toFixed(6),
      r.coCount,
      r.df,
      r.coverage,
      r.coRate.toFixed(6),
    ]);
  }
  return rows.map((row) => row.map(csvEscape).join(",")).join("\n");
}

/** Downloads all results of the last analysis as `keyword_pairs.csv`. */
function exportCsv() {
  if (!state.lastResults.length) return;
  const csv = buildCsv(state.lastResults);
  const blob = new Blob([csv], { type: "text/csv;charset=utf-8;" });
  const url = URL.createObjectURL(blob);
  const a = document.createElement("a");
  a.href = url;
  a.download = "keyword_pairs.csv";
  document.body.appendChild(a);
  a.click();
  document.body.removeChild(a);
  URL.revokeObjectURL(url);
}

/**
 * Builds the user prompt for the LLM from the top 30 results of the last
 * analysis.
 * @returns {string}
 */
function buildPrompt() {
  const list = state.lastResults
    .slice(0, 30)
    .map(
      (r, idx) =>
        `${idx + 1}. ${r.token}(适配:${r.score.toFixed(3)}, 共现:${r.coCount}, 覆盖:${r.coverage}/${state.lastInputs.length})`
    )
    .join("\n");
  const yearRange =
    state.yearMin && state.yearMax
      ? `${state.yearMin}-${state.yearMax}`
      : "未知";
  // Empty entries produce the blank lines between prompt sections.
  return [
    "你是研究选题助手,请以数据库统计为主,结合搭配关键词进行整合与拓展。请用中文输出。",
    "",
    `输入关键词:${state.lastInputs.join("、")}`,
    `数据库年份范围:${yearRange}`,
    `命中记录数:${state.matchedRecords}`,
    "",
    "数据库高分搭配关键词(按适配度排序,优先使用):",
    list,
    "",
    "任务要求:",
    "1) 将搭配关键词做 3-6 组主题化整合,每组给出一句聚合说明。",
    "2) 给出 6-10 个拟申报选题建议,每条包含题目 + 1 句摘要。",
    "3) 给出综合评价(创新性/可行性/政策相关性/学术价值,各 1-5 分)并简述理由。",
    "4) 额外给出不超过 10 个“LLM 扩展关键词”,标注为扩展来源。",
    "5) 提醒 2-3 条可能风险或选题陷阱。",
  ].join("\n");
}

/**
 * Sends the prompt to the selected provider's chat-completions endpoint and
 * shows the reply, or the error message, in the output element.
 */
async function callLLM() {
  if (!state.lastResults.length) {
    setStatus(els.llmStatus, "请先生成搭配词");
    return;
  }
  const provider = els.llmProvider.value;
  const key = els.llmKey.value.trim();
  const model = els.llmModel.value.trim();
  if (!key || !model) {
    setStatus(els.llmStatus, "请填写 API Key 和模型");
    return;
  }
  if (!getAllowedModels(provider).includes(model)) {
    setStatus(els.llmStatus, "模型不在允许列表");
    return;
  }

  const headers = {
    "Content-Type": "application/json",
    Authorization: `Bearer ${key}`,
  };
  let url;
  if (provider === "openrouter") {
    url = "https://openrouter.ai/api/v1/chat/completions";
    // location.origin is the string "null" for opaque origins such as file://.
    const origin = typeof location !== "undefined" ? location.origin : "";
    headers["HTTP-Referer"] =
      origin && origin !== "null" ? origin : "http://localhost";
    headers["X-Title"] = "Local Keyword Matcher";
  } else {
    url = "https://api.deepseek.com/chat/completions";
  }

  const body = {
    model,
    messages: [
      { role: "system", content: "你是严谨的研究选题助手。" },
      { role: "user", content: buildPrompt() },
    ],
    temperature: 0.7,
  };
  setStatus(els.llmStatus, "请求中...");
  els.llmOutput.textContent = "生成中,请稍候...";
  try {
    const resp = await fetch(url, {
      method: "POST",
      headers,
      body: JSON.stringify(body),
    });
    if (!resp.ok) {
      const text = await resp.text();
      throw new Error(text || "请求失败");
    }
    const data = await resp.json();
    const content = data.choices?.[0]?.message?.content;
    els.llmOutput.textContent = content || "未返回内容";
    setStatus(els.llmStatus, "完成");
  } catch (err) {
    els.llmOutput.textContent = `错误:${err.message}`;
    setStatus(els.llmStatus, "请求失败");
  }
}

// Wire up the UI only when running in a page.
if (hasDom) {
  els.loadBtn.addEventListener("click", loadData);
  els.sampleBtn.addEventListener("click", loadSample);
  els.analyzeBtn.addEventListener("click", analyze);
  els.exportBtn.addEventListener("click", exportCsv);
  els.sortBy.addEventListener("change", renderResults);
  els.resultLimit.addEventListener("change", renderResults);
  els.minCo.addEventListener("change", renderResults);
  els.llmBtn.addEventListener("click", callLLM);
  els.llmProvider.addEventListener("change", updateModelOptions);
  updateModelOptions();
}