LingTitles / app.js
merlinyang's picture
Upload 5 files
3561023 verified
/**
 * Tokens that `normalizeToken` rejects outright.
 *
 * The entries are mostly connectives and generic academic-title vocabulary.
 * Single-character entries are also rejected by the length check in
 * `normalizeToken`; they are kept here so the list stays self-describing.
 */
const stopwords = new Set([
  // Connectives and particles.
  "的", "与", "及", "和", "或", "对", "及其",
  // Generic title vocabulary.
  "相关", "研究", "分析", "探索", "问题", "关系", "视角",
  "基于", "视域", "语境", "视野", "比较",
  // Numerals.
  "一", "二", "三", "四", "五",
  // Short title verbs and leftovers.
  "论", "探", "略", "述", "试论", "研究者",
]);
/**
 * Mutable application state shared by the loaders, the analyzer and the
 * LLM prompt builder.
 */
const state = {
  // Loaded corpus: one entry per record, pushed by `addRecord`.
  records: [],
  // token -> number of loaded records that contain it.
  tokenDocFreq: new Map(),
  // Smallest / largest truthy year seen while loading (null until one is seen).
  yearMin: null,
  yearMax: null,
  // True once a load has finished; `analyze` refuses to run before that.
  loaded: false,
  // Output of the most recent `analyze` run.
  lastResults: [],
  lastInputs: [],
  matchedRecords: 0,
};
/**
 * DOM element references, looked up once by id.
 * Every property name is identical to the id of the element it holds.
 */
const els = Object.fromEntries(
  [
    // Data loading controls.
    "projectFile",
    "vectorsFile",
    "useTokens",
    "usePhrases",
    "useVectors",
    "loadBtn",
    "sampleBtn",
    "loadStatus",
    "loadProgress",
    "dataStats",
    // Query controls and result output.
    "keywordInput",
    "resultLimit",
    "minCo",
    "sortBy",
    "analyzeBtn",
    "exportBtn",
    "queryStatus",
    "resultList",
    "resultSummary",
    // LLM panel.
    "llmProvider",
    "llmKey",
    "llmModel",
    "llmBtn",
    "llmStatus",
    "llmOutput",
  ].map((id) => [id, document.getElementById(id)])
);
/**
 * Model identifiers offered for each LLM provider.
 * `updateModelOptions` fills the model dropdown from this table and
 * `callLLM` rejects any model that is not listed for the chosen provider.
 */
const MODEL_OPTIONS = {
  openrouter: [
    "openai/gpt-5.2",
    "openai/gpt-4o-mini",
    "anthropic/claude-sonnet-4.5",
    "anthropic/claude-opus-4.5",
  ],
  deepseek: [
    "deepseek-chat",
  ],
};
/**
 * Rebuilds the model dropdown so it lists the models of the currently
 * selected provider. An unknown provider leaves the dropdown empty.
 */
function updateModelOptions() {
  const select = els.llmModel;
  const models = MODEL_OPTIONS[els.llmProvider.value] || [];

  // Drop the previous provider's options before adding the new ones.
  select.innerHTML = "";
  models.forEach((model) => {
    const option = document.createElement("option");
    option.value = model;
    option.textContent = model;
    select.appendChild(option);
  });
}
/**
 * Cleans a raw token and decides whether it is worth keeping.
 *
 * @param {*} token - Raw token; coerced to a string and trimmed.
 * @returns {string|null} The trimmed token, or null when the input is falsy,
 *   shorter than two UTF-16 code units after trimming, or a stopword.
 */
function normalizeToken(token) {
  const text = token ? String(token).trim() : "";
  const isUsable = text.length >= 2 && !stopwords.has(text);
  return isUsable ? text : null;
}
/**
 * Writes the corpus statistics (record count, distinct token count, year
 * range) into the first three `.stat__value` elements of the stats panel.
 */
function updateStats() {
  const [recordsEl, vocabEl, yearsEl] =
    els.dataStats.querySelectorAll(".stat__value");
  const { yearMin, yearMax } = state;

  recordsEl.textContent = state.records.length.toLocaleString();
  vocabEl.textContent = state.tokenDocFreq.size.toLocaleString();
  // Show a dash until both ends of the year range are known.
  yearsEl.textContent = yearMin && yearMax ? `${yearMin} - ${yearMax}` : "-";
}
/**
 * Replaces the text shown by a status element.
 *
 * @param {{textContent: string}} el - Target element.
 * @param {string} text - Message to display.
 */
function setStatus(el, text) {
  el.textContent = text;
}
/**
 * Sets the width of the load progress bar.
 *
 * @param {number} percent - Progress as a percentage; rendered with one
 *   decimal place.
 */
function setProgress(percent) {
  const width = percent.toFixed(1);
  els.loadProgress.style.width = `${width}%`;
}
/**
 * Parses JSON Lines text that is already fully in memory.
 *
 * Blank lines are ignored. A line is skipped silently when it is not valid
 * JSON or when `onRow` throws for it.
 *
 * @param {string} text - The JSONL content.
 * @param {(row: *) => void} onRow - Called once per successfully parsed line.
 */
function parseJsonlText(text, onRow) {
  text
    .split("\n")
    .map((rawLine) => rawLine.trim())
    .filter((line) => line !== "")
    .forEach((line) => {
      try {
        onRow(JSON.parse(line));
      } catch (err) {
        // Best effort: skip the malformed line.
      }
    });
}
/**
 * Streams a JSON Lines file and hands each parsed row to `onRow`.
 *
 * The file is decoded as UTF-8 chunk by chunk. Text after the last newline
 * of a chunk is carried over to the next chunk, so rows and multi-byte
 * characters may span chunk boundaries. Lines that fail to parse, or for
 * which `onRow` throws, are skipped silently.
 *
 * @param {{size: number, stream: Function}} file - Object exposing `size`
 *   and a `stream()` whose reader yields byte chunks (e.g. a File/Blob).
 * @param {(row: *) => void} onRow - Called once per parsed row.
 * @param {(ratio: number) => void} [onProgress] - Called after every chunk
 *   with bytesRead / file.size, and once more with 1 at the end.
 * @returns {Promise<void>}
 */
async function parseJsonlFile(file, onRow, onProgress) {
  const decoder = new TextDecoder("utf-8");
  const reader = file.stream().getReader();

  const report = (ratio) => {
    if (onProgress) {
      onProgress(ratio);
    }
  };
  const emit = (jsonText) => {
    try {
      onRow(JSON.parse(jsonText));
    } catch (err) {
      // Best effort: skip the malformed line.
    }
  };

  let pending = "";
  let bytesRead = 0;
  for (
    let chunk = await reader.read();
    !chunk.done;
    chunk = await reader.read()
  ) {
    bytesRead += chunk.value.length;
    const pieces = (pending + decoder.decode(chunk.value, { stream: true }))
      .split("\n");
    // The last piece may be an incomplete line; keep it for the next chunk.
    pending = pieces.pop();
    for (const piece of pieces) {
      const line = piece.trim();
      if (line) {
        emit(line);
      }
    }
    report(bytesRead / file.size);
  }

  // A file without a trailing newline leaves its final row in `pending`.
  if (pending.trim()) {
    emit(pending);
  }
  report(1);
}
/**
 * Adds one record to the in-memory corpus.
 *
 * Tokens are normalized and de-duplicated; the record is dropped when no
 * usable token remains. Each surviving token's document frequency is
 * incremented once, and the known year range is widened when `year` is
 * truthy (so NaN and 0 do not affect it).
 *
 * @param {Iterable<*>} tokens - Raw tokens of the record.
 * @param {number} year - Record year; may be NaN.
 * @param {*} title - Stored as-is on the record.
 * @param {string} source - Label of the data source, stored as-is.
 */
function addRecord(tokens, year, title, source) {
  if (!tokens || tokens.length === 0) return;

  const kept = new Set(
    Array.from(tokens, (raw) => normalizeToken(raw)).filter(Boolean)
  );
  if (kept.size === 0) return;

  state.records.push({ year, tokens: [...kept], title, source });
  kept.forEach((token) => {
    const seen = state.tokenDocFreq.get(token) || 0;
    state.tokenDocFreq.set(token, seen + 1);
  });

  if (!year) return;
  if (!state.yearMin || year < state.yearMin) {
    state.yearMin = year;
  }
  if (!state.yearMax || year > state.yearMax) {
    state.yearMax = year;
  }
}
/**
 * Loads the user-selected JSONL files into the corpus.
 *
 * Resets the corpus first, then reads the project file (using `nlp.tokens`
 * and/or `nlp.phrases`, depending on the checkboxes) and, when enabled, the
 * vectors file (whitespace-separated `name_tokens`). The progress bar uses
 * 0-50% for the project file and 50-100% for the vectors file.
 *
 * A failure while reading a file is reported in the load status instead of
 * escaping as an unhandled rejection; `state.loaded` then stays false so
 * `analyze` keeps refusing to run.
 *
 * @returns {Promise<void>}
 */
async function loadData() {
  state.records = [];
  state.tokenDocFreq.clear();
  state.yearMin = null;
  state.yearMax = null;
  state.loaded = false;
  setProgress(0);
  setStatus(els.loadStatus, "加载中...");

  const useTokens = els.useTokens.checked;
  const usePhrases = els.usePhrases.checked;
  const useVectors = els.useVectors.checked;
  const projectFile = els.projectFile.files[0];
  const vectorsFile = els.vectorsFile.files[0];
  if (!projectFile && !vectorsFile) {
    setStatus(els.loadStatus, "请至少选择一个 jsonl 文件");
    return;
  }

  try {
    if (projectFile) {
      await parseJsonlFile(
        projectFile,
        (obj) => {
          const year = Number(obj.year);
          const nlp = obj.nlp || {};
          const tokens = [];
          if (useTokens && Array.isArray(nlp.tokens)) {
            tokens.push(...nlp.tokens);
          }
          if (usePhrases && Array.isArray(nlp.phrases)) {
            tokens.push(...nlp.phrases);
          }
          addRecord(tokens, year, obj.title, "project");
        },
        (ratio) => setProgress(ratio * 50)
      );
    }
    if (vectorsFile && useVectors) {
      await parseJsonlFile(
        vectorsFile,
        (obj) => {
          const year = Number(obj.year);
          const tokens = String(obj.name_tokens || "").split(/\s+/);
          addRecord(tokens, year, obj.name, "vectors");
        },
        (ratio) => setProgress(50 + ratio * 50)
      );
    }
  } catch (err) {
    // Reading can reject (e.g. the file became unreadable); surface it to
    // the user rather than leaving the status stuck on "loading".
    const reason = err instanceof Error ? err.message : String(err);
    setStatus(els.loadStatus, `加载失败:${reason}`);
    return;
  }

  state.loaded = true;
  updateStats();
  setStatus(els.loadStatus, "加载完成");
  setProgress(100);
}
/**
 * Loads the bundled sample corpus (`sample_project.jsonl`) in place of
 * user-selected files.
 *
 * Resets the corpus, requires at least one of the tokens/phrases
 * checkboxes, then fetches and parses the sample. Fetch failures are
 * reported in the load status.
 *
 * @returns {Promise<void>}
 */
async function loadSample() {
  state.records = [];
  state.tokenDocFreq.clear();
  state.yearMin = null;
  state.yearMax = null;
  state.loaded = false;
  setProgress(0);
  setStatus(els.loadStatus, "加载示例数据...");

  const useTokens = els.useTokens.checked;
  const usePhrases = els.usePhrases.checked;
  if (!(useTokens || usePhrases)) {
    setStatus(els.loadStatus, "请至少勾选 tokens 或 phrases");
    return;
  }

  // Collects the enabled token lists of one row and stores the record.
  const ingestRow = (row) => {
    const nlp = row.nlp || {};
    const collected = [];
    if (useTokens && Array.isArray(nlp.tokens)) {
      collected.push(...nlp.tokens);
    }
    if (usePhrases && Array.isArray(nlp.phrases)) {
      collected.push(...nlp.phrases);
    }
    addRecord(collected, Number(row.year), row.title, "sample");
  };

  try {
    const response = await fetch("sample_project.jsonl");
    if (!response.ok) {
      throw new Error("示例数据加载失败");
    }
    parseJsonlText(await response.text(), ingestRow);
    state.loaded = true;
    updateStats();
    setProgress(100);
    setStatus(els.loadStatus, "示例数据加载完成");
  } catch (err) {
    setStatus(els.loadStatus, err.message);
  }
}
/**
 * Splits the free-text keyword box into normalized, de-duplicated keywords.
 *
 * Keywords may be separated by whitespace, commas, semicolons or the
 * ideographic comma. Both the ASCII and the full-width forms of comma and
 * semicolon are accepted, because a Chinese input method produces the
 * full-width ones by default.
 *
 * @param {string} text - Raw input text; null/undefined is treated as empty.
 * @returns {string[]} Unique keywords in first-seen order, after
 *   `normalizeToken` filtering.
 */
function parseInputKeywords(text) {
  // \uFF0C = full-width comma, \u3001 = ideographic comma,
  // \uFF1B = full-width semicolon. Written as escapes so the pattern
  // survives any punctuation normalisation of this file.
  const separators = /[\s,\uFF0C\u3001;\uFF1B]+/;
  const uniq = new Set();
  for (const piece of String(text ?? "").split(separators)) {
    // normalizeToken also discards the empty strings split() can produce.
    const norm = normalizeToken(piece);
    if (norm) uniq.add(norm);
  }
  return Array.from(uniq);
}
/**
 * Finds tokens that co-occur with the entered keywords and scores them.
 *
 * For every record containing at least one input keyword, each other token
 * of that record gets one co-occurrence count and remembers which input
 * keywords it was seen with. Per candidate:
 *   df       = records containing the candidate (falls back to coCount)
 *   coRate   = coCount / df
 *   coverage = number of distinct input keywords it co-occurred with
 *   score    = coRate * (coverage / inputCount) * ln(1 + coCount)
 *
 * Results (sorted by score, descending) are stored on `state`, rendered,
 * and a per-keyword hit summary is written to the query status.
 */
function analyze() {
  if (!state.loaded) {
    setStatus(els.queryStatus, "请先加载数据");
    return;
  }
  const inputKeywords = parseInputKeywords(els.keywordInput.value);
  if (inputKeywords.length === 0) {
    setStatus(els.queryStatus, "请输入至少一个关键词");
    return;
  }

  const wanted = new Set(inputKeywords);
  const inputCounts = new Map(inputKeywords.map((keyword) => [keyword, 0]));
  const coCounts = new Map();
  const hitInputsByToken = new Map();
  let matchedRecords = 0;

  for (const { tokens } of state.records) {
    const hits = tokens.filter((token) => wanted.has(token));
    if (hits.length === 0) continue;

    matchedRecords += 1;
    for (const hit of hits) {
      inputCounts.set(hit, (inputCounts.get(hit) || 0) + 1);
    }

    for (const token of new Set(tokens)) {
      // Input keywords are never suggested as their own collocates.
      if (wanted.has(token)) continue;
      coCounts.set(token, (coCounts.get(token) || 0) + 1);
      let seenWith = hitInputsByToken.get(token);
      if (!seenWith) {
        seenWith = new Set();
        hitInputsByToken.set(token, seenWith);
      }
      for (const hit of hits) {
        seenWith.add(hit);
      }
    }
  }

  const results = Array.from(coCounts, ([token, coCount]) => {
    const df = state.tokenDocFreq.get(token) || coCount;
    const coverage = hitInputsByToken.get(token)?.size || 0;
    const coRate = df ? coCount / df : 0;
    const score =
      coRate * (coverage / inputKeywords.length) * Math.log(1 + coCount);
    return { token, coCount, df, coverage, score, coRate };
  });
  results.sort((a, b) => b.score - a.score);

  state.lastResults = results;
  state.lastInputs = inputKeywords;
  state.matchedRecords = matchedRecords;
  renderResults();

  const inputStats = inputKeywords
    .map((keyword) => `${keyword}(${inputCounts.get(keyword) || 0})`)
    .join("、");
  setStatus(
    els.queryStatus,
    `输入关键词:${inputStats};命中记录 ${matchedRecords.toLocaleString()} 条`
  );
}
/**
 * Renders the most recent analysis results into the result list.
 *
 * Applies the minimum co-occurrence filter, the selected sort order and the
 * display limit, then draws one row per candidate with a bar whose length
 * and colour reflect its score relative to the best displayed score.
 *
 * Rows are built with DOM APIs and `textContent` rather than an HTML
 * string: tokens come from user-supplied data files and must never be
 * interpreted as markup.
 */
function renderResults() {
  const limit = Number(els.resultLimit.value) || 80;
  const minCo = Number(els.minCo.value) || 1;
  const sortMap = {
    score: (a, b) => b.score - a.score,
    coCount: (a, b) => b.coCount - a.coCount,
    coverage: (a, b) => b.coverage - a.coverage,
    coRate: (a, b) => b.coRate - a.coRate,
  };
  // Fall back to score order if the dropdown holds an unknown key.
  const sortBy = els.sortBy.value;
  const compare = Object.hasOwn(sortMap, sortBy)
    ? sortMap[sortBy]
    : sortMap.score;

  // filter() copies, so sorting here never reorders state.lastResults.
  const results = state.lastResults
    .filter((r) => r.coCount >= minCo)
    .sort(compare)
    .slice(0, limit);
  // The small floor avoids dividing by zero when every score is 0.
  const maxScore = results.reduce((max, r) => Math.max(max, r.score), 0.0001);

  // Creates an element with a class name and optional plain-text content.
  const make = (tag, className, text) => {
    const node = document.createElement(tag);
    if (className) node.className = className;
    if (text !== undefined) node.textContent = text;
    return node;
  };

  els.resultList.innerHTML = "";
  for (const r of results) {
    const intensity = Math.min(1, r.score / maxScore);
    const hue = 20 + (1 - intensity) * 80;

    const fill = make("span");
    fill.style.width = `${(intensity * 100).toFixed(1)}%`;
    fill.style.background =
      `linear-gradient(90deg, hsl(${hue},70%,45%), hsl(${hue + 20},80%,60%))`;
    const bar = make("div", "bar");
    bar.appendChild(fill);

    const meta = make("div", "meta");
    meta.append(
      make("span", "", `适配:${r.score.toFixed(3)}`),
      make("span", "", `共现:${r.coCount}`),
      make("span", "", `覆盖:${r.coverage}/${state.lastInputs.length}`)
    );

    const item = make("div", "result-item");
    item.append(make("div", "result-item__kw", r.token), bar, meta);
    els.resultList.appendChild(item);
  }
  els.resultSummary.textContent = `共找到 ${state.lastResults.length.toLocaleString()} 个候选搭配词,当前展示 ${results.length} 个。`;
}
/**
 * Downloads the most recent analysis results as `keyword_pairs.csv`.
 *
 * Does nothing when there are no results. Fields that contain a comma, a
 * double quote or a line break are quoted, with embedded quotes doubled,
 * so a keyword containing such characters cannot break the row layout.
 */
function exportCsv() {
  if (!state.lastResults.length) return;

  // Quote a field only when it needs it, so plain values stay unchanged.
  const toCsvField = (value) => {
    const text = String(value);
    return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
  };

  const rows = [
    ["keyword", "score", "co_count", "df", "coverage", "co_rate"],
  ];
  for (const r of state.lastResults) {
    rows.push([
      r.token,
      r.score.toFixed(6),
      r.coCount,
      r.df,
      r.coverage,
      r.coRate.toFixed(6),
    ]);
  }
  const csv = rows.map((row) => row.map(toCsvField).join(",")).join("\n");

  // Trigger the download through a temporary link element.
  const blob = new Blob([csv], { type: "text/csv;charset=utf-8;" });
  const url = URL.createObjectURL(blob);
  const a = document.createElement("a");
  a.href = url;
  a.download = "keyword_pairs.csv";
  document.body.appendChild(a);
  a.click();
  document.body.removeChild(a);
  URL.revokeObjectURL(url);
}
/**
 * Builds the user prompt sent to the LLM.
 *
 * The prompt lists the input keywords, the corpus year range, the number of
 * matched records and the first 30 stored results, followed by the fixed
 * task instructions.
 *
 * @returns {string} The prompt text.
 */
function buildPrompt() {
  const inputCount = state.lastInputs.length;
  const rankedLines = state.lastResults.slice(0, 30).map((r, idx) => {
    const rank = idx + 1;
    const score = r.score.toFixed(3);
    return `${rank}. ${r.token}(适配:${score}, 共现:${r.coCount}, 覆盖:${r.coverage}/${inputCount})`;
  });
  const list = rankedLines.join("\n");

  let yearRange = "未知";
  if (state.yearMin && state.yearMax) {
    yearRange = `${state.yearMin}-${state.yearMax}`;
  }

  // Empty entries produce the blank lines between prompt sections.
  return [
    "你是研究选题助手,请以数据库统计为主,结合搭配关键词进行整合与拓展。请用中文输出。",
    "",
    `输入关键词:${state.lastInputs.join("、")}`,
    `数据库年份范围:${yearRange}`,
    `命中记录数:${state.matchedRecords}`,
    "",
    "数据库高分搭配关键词(按适配度排序,优先使用):",
    list,
    "",
    "任务要求:",
    "1) 将搭配关键词做 3-6 组主题化整合,每组给出一句聚合说明。",
    "2) 给出 6-10 个拟申报选题建议,每条包含题目 + 1 句摘要。",
    "3) 给出综合评价(创新性/可行性/政策相关性/学术价值,各 1-5 分)并简述理由。",
    "4) 额外给出不超过 10 个“LLM 扩展关键词”,标注为扩展来源。",
    "5) 提醒 2-3 条可能风险或选题陷阱。",
  ].join("\n");
}
/**
 * Sends the generated prompt to the selected chat-completions endpoint and
 * shows the reply.
 *
 * Requires existing analysis results, an API key, and a model listed in
 * `MODEL_OPTIONS` for the chosen provider. Any provider other than
 * "openrouter" is sent to the DeepSeek endpoint. Request failures are
 * written to the output area.
 *
 * @returns {Promise<void>}
 */
async function callLLM() {
  if (!state.lastResults.length) {
    setStatus(els.llmStatus, "请先生成搭配词");
    return;
  }

  const provider = els.llmProvider.value;
  const key = els.llmKey.value.trim();
  const model = els.llmModel.value.trim();
  if (!key || !model) {
    setStatus(els.llmStatus, "请填写 API Key 和模型");
    return;
  }

  const isOpenRouter = provider === "openrouter";
  const url = isOpenRouter
    ? "https://openrouter.ai/api/v1/chat/completions"
    : "https://api.deepseek.com/chat/completions";
  const headers = {
    "Content-Type": "application/json",
    Authorization: `Bearer ${key}`,
  };
  if (isOpenRouter) {
    // Pages opened from file:// report the origin "null"; use a placeholder.
    const origin = typeof location !== "undefined" ? location.origin : "";
    const hasUsableOrigin = origin && origin !== "null";
    headers["HTTP-Referer"] = hasUsableOrigin ? origin : "http://localhost";
    headers["X-Title"] = "Local Keyword Matcher";
  }

  const allowedModels = MODEL_OPTIONS[provider] || [];
  if (!allowedModels.includes(model)) {
    setStatus(els.llmStatus, "模型不在允许列表");
    return;
  }

  const payload = {
    model,
    messages: [
      { role: "system", content: "你是严谨的研究选题助手。" },
      { role: "user", content: buildPrompt() },
    ],
    temperature: 0.7,
  };

  setStatus(els.llmStatus, "请求中...");
  els.llmOutput.textContent = "生成中,请稍候...";
  try {
    const response = await fetch(url, {
      method: "POST",
      headers,
      body: JSON.stringify(payload),
    });
    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(errorText || "请求失败");
    }
    const data = await response.json();
    const reply = data.choices?.[0]?.message?.content;
    els.llmOutput.textContent = reply || "未返回内容";
    setStatus(els.llmStatus, "完成");
  } catch (err) {
    els.llmOutput.textContent = `错误:${err.message}`;
    setStatus(els.llmStatus, "请求失败");
  }
}
// Wire the UI controls to their handlers, in registration order.
const eventBindings = [
  [els.loadBtn, "click", loadData],
  [els.sampleBtn, "click", loadSample],
  [els.analyzeBtn, "click", analyze],
  [els.exportBtn, "click", exportCsv],
  // Changing a display option re-renders the stored results.
  [els.sortBy, "change", renderResults],
  [els.resultLimit, "change", renderResults],
  [els.minCo, "change", renderResults],
  [els.llmBtn, "click", callLLM],
  [els.llmProvider, "change", updateModelOptions],
];
for (const [target, eventName, handler] of eventBindings) {
  target.addEventListener(eventName, handler);
}

// Populate the model dropdown for the initially selected provider.
updateModelOptions();