// Stopword set: Chinese function words and generic research boilerplate
// filtered out of both corpus tokens and user-entered query keywords.
const stopwords = new Set([
  "的", "与", "及", "和", "或", "对", "及其", "相关", "研究", "分析",
  "探索", "问题", "关系", "视角", "基于", "视域", "语境", "视野", "比较",
  "一", "二", "三", "四", "五", "论", "探", "略", "述", "试论", "研究者",
]);

// Single mutable application state shared by loaders, analysis and rendering.
const state = {
  records: [],              // loaded documents: { year, tokens, title, source }
  tokenDocFreq: new Map(),  // token -> number of records containing it
  yearMin: null,            // earliest year seen (null until first record with a year)
  yearMax: null,            // latest year seen
  loaded: false,            // true once a load completed successfully
  lastResults: [],          // last analyze() output (unfiltered, score-sorted)
  lastInputs: [],           // normalized input keywords of the last analyze()
  matchedRecords: 0,        // records hit by at least one input keyword
};

// Cached DOM element handles, looked up once at startup.
const els = {
  projectFile: document.getElementById("projectFile"),
  vectorsFile: document.getElementById("vectorsFile"),
  useTokens: document.getElementById("useTokens"),
  usePhrases: document.getElementById("usePhrases"),
  useVectors: document.getElementById("useVectors"),
  loadBtn: document.getElementById("loadBtn"),
  sampleBtn: document.getElementById("sampleBtn"),
  loadStatus: document.getElementById("loadStatus"),
  loadProgress: document.getElementById("loadProgress"),
  dataStats: document.getElementById("dataStats"),
  keywordInput: document.getElementById("keywordInput"),
  resultLimit: document.getElementById("resultLimit"),
  minCo: document.getElementById("minCo"),
  sortBy: document.getElementById("sortBy"),
  analyzeBtn: document.getElementById("analyzeBtn"),
  exportBtn: document.getElementById("exportBtn"),
  queryStatus: document.getElementById("queryStatus"),
  resultList: document.getElementById("resultList"),
  resultSummary: document.getElementById("resultSummary"),
  llmProvider: document.getElementById("llmProvider"),
  llmKey: document.getElementById("llmKey"),
  llmModel: document.getElementById("llmModel"),
  llmBtn: document.getElementById("llmBtn"),
  llmStatus: document.getElementById("llmStatus"),
  llmOutput: document.getElementById("llmOutput"),
};

// Model choices offered per LLM provider in the model <select>.
const MODEL_OPTIONS = {
  openrouter: [
    "openai/gpt-5.2",
    "openai/gpt-4o-mini",
    "anthropic/claude-sonnet-4.5",
    "anthropic/claude-opus-4.5",
  ],
  deepseek: ["deepseek-chat"],
};

// Rebuild the model <select> options to match the currently chosen provider.
function updateModelOptions() {
  const provider = els.llmProvider.value;
  const options = MODEL_OPTIONS[provider] || [];
  els.llmModel.innerHTML = "";
  for (const name of options) {
    const opt = document.createElement("option");
    opt.value = name;
    opt.textContent = name;
    els.llmModel.appendChild(opt);
  }
}

// Normalize a raw token: trim, drop empties, drop single characters and
// stopwords. Returns the cleaned string, or null when the token is rejected.
function normalizeToken(token) {
  if (!token) return null;
  const t = String(token).trim();
  if (!t) return null;
  if (t.length < 2) return null;
  if (stopwords.has(t)) return null;
  return t;
}

// Refresh the three stat readouts (record count, vocabulary size, year span).
// NOTE(review): relies on exactly three ".stat__value" nodes existing inside
// #dataStats, in this order — verify against the HTML.
function updateStats() {
  const values = els.dataStats.querySelectorAll(".stat__value");
  values[0].textContent = state.records.length.toLocaleString();
  values[1].textContent = state.tokenDocFreq.size.toLocaleString();
  if (state.yearMin && state.yearMax) {
    values[2].textContent = `${state.yearMin} - ${state.yearMax}`;
  } else {
    values[2].textContent = "-";
  }
}

// Set a status element's text (textContent, so it is XSS-safe).
function setStatus(el, text) {
  el.textContent = text;
}

// Set the load progress bar width; `percent` is 0..100.
function setProgress(percent) {
  els.loadProgress.style.width = `${percent.toFixed(1)}%`;
}

// Parse JSONL held fully in memory: one JSON object per non-empty line,
// invoking `onRow` for each. Malformed lines are skipped silently.
function parseJsonlText(text, onRow) {
  const lines = text.split("\n");
  for (const line of lines) {
    const trimmed = line.trim();
    if (!trimmed) continue;
    try {
      onRow(JSON.parse(trimmed));
    } catch (err) {
      // Skip malformed line.
    }
  }
}

// Stream-parse a JSONL File without loading it wholly into memory.
// Decodes chunks incrementally, splits on "\n", keeps the trailing partial
// line in `buffer`, and reports progress as loadedBytes / file.size.
// NOTE(review): the final flushing `decoder.decode()` (no args) is never
// called after the stream ends; a multi-byte UTF-8 sequence split exactly at
// the last chunk boundary could be dropped from the final line — confirm
// whether this matters for the data files used.
async function parseJsonlFile(file, onRow, onProgress) {
  const decoder = new TextDecoder("utf-8");
  const reader = file.stream().getReader();
  let buffer = "";
  let loaded = 0;
  while (true) {
    const { value, done } = await reader.read();
    if (done) break;
    loaded += value.length;
    buffer += decoder.decode(value, { stream: true });
    let lines = buffer.split("\n");
    // Last element may be an incomplete line; carry it over to the next chunk.
    buffer = lines.pop();
    for (const line of lines) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      try {
        const obj = JSON.parse(trimmed);
        onRow(obj);
      } catch (err) {
        // Skip malformed line.
      }
    }
    if (onProgress) {
      onProgress(loaded / file.size);
    }
  }
  // Flush whatever remained after the final newline.
  if (buffer.trim()) {
    try {
      onRow(JSON.parse(buffer));
    } catch (err) {
      // Ignore last line parse error.
    }
  }
  if (onProgress) {
    onProgress(1);
  }
}

// Add one record to the in-memory index: dedupe + normalize its tokens,
// update per-token document frequencies, and widen the year range.
// Records whose tokens all normalize away are dropped entirely.
function addRecord(tokens, year, title, source) {
  if (!tokens || tokens.length === 0) return;
  const uniq = new Set();
  for (const token of tokens) {
    const norm = normalizeToken(token);
    if (norm) uniq.add(norm);
  }
  if (uniq.size === 0) return;
  const tokenArr = Array.from(uniq);
  state.records.push({ year, tokens: tokenArr, title, source });
  for (const t of uniq) {
    state.tokenDocFreq.set(t, (state.tokenDocFreq.get(t) || 0) + 1);
  }
  // Falsy years (NaN from Number(), 0, undefined) are excluded from the span.
  if (year) {
    if (!state.yearMin || year < state.yearMin) state.yearMin = year;
    if (!state.yearMax || year > state.yearMax) state.yearMax = year;
  }
}

// Load user-selected JSONL files (project records and/or vector names),
// resetting all index state first. Progress: project file maps to 0–50%,
// vectors file to 50–100%.
// NOTE(review): unlike loadSample(), there is no guard for the case where
// neither "tokens" nor "phrases" is checked — the project file would then
// contribute no tokens at all; confirm whether that is intended.
async function loadData() {
  state.records = [];
  state.tokenDocFreq.clear();
  state.yearMin = null;
  state.yearMax = null;
  state.loaded = false;
  setProgress(0);
  setStatus(els.loadStatus, "加载中...");
  const useTokens = els.useTokens.checked;
  const usePhrases = els.usePhrases.checked;
  const useVectors = els.useVectors.checked;
  const projectFile = els.projectFile.files[0];
  const vectorsFile = els.vectorsFile.files[0];
  if (!projectFile && !vectorsFile) {
    setStatus(els.loadStatus, "请至少选择一个 jsonl 文件");
    return;
  }
  if (projectFile) {
    await parseJsonlFile(
      projectFile,
      (obj) => {
        const year = Number(obj.year);
        const nlp = obj.nlp || {};
        const tokens = [];
        if (useTokens && Array.isArray(nlp.tokens)) {
          tokens.push(...nlp.tokens);
        }
        if (usePhrases && Array.isArray(nlp.phrases)) {
          tokens.push(...nlp.phrases);
        }
        addRecord(tokens, year, obj.title, "project");
      },
      (ratio) => setProgress(ratio * 50)
    );
  }
  if (vectorsFile && useVectors) {
    await parseJsonlFile(
      vectorsFile,
      (obj) => {
        const year = Number(obj.year);
        // name_tokens is presumably a whitespace-separated token string —
        // verify against the vectors file schema.
        const tokens = String(obj.name_tokens || "").split(/\s+/);
        addRecord(tokens, year, obj.name, "vectors");
      },
      (ratio) => setProgress(50 + ratio * 50)
    );
  }
  state.loaded = true;
  updateStats();
  setStatus(els.loadStatus, "加载完成");
  setProgress(100);
}

// Load the bundled sample dataset over fetch() instead of a local file.
// Same indexing path as loadData(), but requires at least one of the
// tokens/phrases checkboxes and reports errors through the status line.
async function loadSample() {
  state.records = [];
  state.tokenDocFreq.clear();
  state.yearMin = null;
  state.yearMax = null;
  state.loaded = false;
  setProgress(0);
  setStatus(els.loadStatus, "加载示例数据...");
  const useTokens = els.useTokens.checked;
  const usePhrases = els.usePhrases.checked;
  if (!useTokens && !usePhrases) {
    setStatus(els.loadStatus, "请至少勾选 tokens 或 phrases");
    return;
  }
  try {
    const resp = await fetch("sample_project.jsonl");
    if (!resp.ok) {
      throw new Error("示例数据加载失败");
    }
    const text = await resp.text();
    parseJsonlText(text, (obj) => {
      const year = Number(obj.year);
      const nlp = obj.nlp || {};
      const tokens = [];
      if (useTokens && Array.isArray(nlp.tokens)) {
        tokens.push(...nlp.tokens);
      }
      if (usePhrases && Array.isArray(nlp.phrases)) {
        tokens.push(...nlp.phrases);
      }
      addRecord(tokens, year, obj.title, "sample");
    });
    state.loaded = true;
    updateStats();
    setProgress(100);
    setStatus(els.loadStatus, "示例数据加载完成");
  } catch (err) {
    setStatus(els.loadStatus, err.message);
  }
}

// Split the free-text keyword box on commas/semicolons/whitespace
// (both ASCII and fullwidth), then normalize and dedupe the pieces.
// Returns the surviving keywords in first-seen order.
function parseInputKeywords(text) {
  const raw = text
    .replace(/[,、;;]+/g, " ")
    .replace(/\s+/g, " ")
    .split(" ")
    .map((t) => t.trim())
    .filter(Boolean);
  const uniq = new Set();
  for (const item of raw) {
    const norm = normalizeToken(item);
    if (norm) uniq.add(norm);
  }
  return Array.from(uniq);
}

// Core co-occurrence analysis. For every record containing at least one input
// keyword, count each non-input token's co-occurrence (candidateCo), track
// which input keywords it co-occurred with (candidateHitInputs), and count
// per-input hits. Each candidate is scored as
//   (coCount / df) * (coverage / #inputs) * ln(1 + coCount)
// i.e. co-occurrence rate, weighted by input coverage, damped-boosted by
// absolute count. Results are stored in state and rendered.
function analyze() {
  if (!state.loaded) {
    setStatus(els.queryStatus, "请先加载数据");
    return;
  }
  const inputKeywords = parseInputKeywords(els.keywordInput.value);
  if (inputKeywords.length === 0) {
    setStatus(els.queryStatus, "请输入至少一个关键词");
    return;
  }
  const inputSet = new Set(inputKeywords);
  const candidateCo = new Map();        // candidate token -> co-occurring record count
  const candidateHitInputs = new Map(); // candidate token -> Set of input keywords it met
  const inputCounts = new Map();        // input keyword -> number of records containing it
  inputKeywords.forEach((k) => inputCounts.set(k, 0));
  let matchedRecords = 0;
  for (const record of state.records) {
    const hits = [];
    // record.tokens is already deduplicated by addRecord(), so each input
    // keyword appears in `hits` at most once per record.
    for (const t of record.tokens) {
      if (inputSet.has(t)) {
        hits.push(t);
      }
    }
    if (hits.length === 0) continue;
    matchedRecords += 1;
    hits.forEach((h) => inputCounts.set(h, (inputCounts.get(h) || 0) + 1));
    const uniq = new Set(record.tokens);
    for (const t of uniq) {
      if (inputSet.has(t)) continue;
      candidateCo.set(t, (candidateCo.get(t) || 0) + 1);
      if (!candidateHitInputs.has(t)) {
        candidateHitInputs.set(t, new Set());
      }
      const hitSet = candidateHitInputs.get(t);
      hits.forEach((h) => hitSet.add(h));
    }
  }
  const results = [];
  for (const [token, coCount] of candidateCo.entries()) {
    const df = state.tokenDocFreq.get(token) || coCount;
    const coverage = candidateHitInputs.get(token)?.size || 0;
    const coRate = df ? coCount / df : 0;
    const score =
      (coRate * (coverage / inputKeywords.length)) * Math.log(1 + coCount);
    results.push({
      token,
      coCount,
      df,
      coverage,
      score,
      coRate,
    });
  }
  results.sort((a, b) => b.score - a.score);
  state.lastResults = results;
  state.lastInputs = inputKeywords;
  state.matchedRecords = matchedRecords;
  renderResults();
  const inputStats = inputKeywords
    .map((k) => `${k}(${inputCounts.get(k) || 0})`)
    .join("、");
  setStatus(
    els.queryStatus,
    `输入关键词:${inputStats};命中记录 ${matchedRecords.toLocaleString()} 条`
  );
}

// Render the current analysis results into #resultList, applying the
// min-co-occurrence filter, the chosen sort order, and the result limit.
// Score is mapped to a color hue relative to the best visible score.
// NOTE(review): an unrecognized sortBy value yields sortMap[sortBy] ===
// undefined, making sort() fall back to lexicographic comparison of the
// stringified objects — confirm the <select> can only hold the four keys.
function renderResults() {
  const limit = Number(els.resultLimit.value) || 80;
  const minCo = Number(els.minCo.value) || 1;
  const sortBy = els.sortBy.value;
  let results = state.lastResults.filter((r) => r.coCount >= minCo);
  const sortMap = {
    score: (a, b) => b.score - a.score,
    coCount: (a, b) => b.coCount - a.coCount,
    coverage: (a, b) => b.coverage - a.coverage,
    coRate: (a, b) => b.coRate - a.coRate,
  };
  results.sort(sortMap[sortBy]);
  results = results.slice(0, limit);
  // 0.0001 floor avoids division by zero when every score is 0.
  const maxScore = results.reduce((max, r) => Math.max(max, r.score), 0.0001);
  els.resultList.innerHTML = "";
  for (const r of results) {
    const item = document.createElement("div");
    item.className = "result-item";
    const intensity = Math.min(1, r.score / maxScore);
    const hue = 20 + (1 - intensity) * 80;
    item.innerHTML = `