Spaces:

merlinyang
/

LingTitles

Running

App Files Files Community

LingTitles / app.js

merlinyang's picture

Upload 5 files

3561023 verified about 2 months ago

history blame contribute delete

15.9 kB

	/**
	 * Tokens that are discarded by normalizeToken() before indexing or querying.
	 * The set holds 30 entries; the grouping below is only for readability.
	 */
	const stopwords = new Set([
		// Particles, conjunctions and prepositions.
		"的", "与", "及", "和", "或", "对", "及其",
		// Generic words that show up in many titles.
		"相关", "研究", "分析", "探索", "问题", "关系",
		"视角", "基于", "视域", "语境", "视野", "比较",
		// Numerals.
		"一", "二", "三", "四", "五",
		// Short title verbs and their compounds.
		"论", "探", "略", "述", "试论", "研究者",
	]);

	// Mutable application state shared by the loaders, the analyzer and the LLM helper.
	const state = {
	// One entry per accepted record: { year, tokens, title, source } (pushed by addRecord).
	records: [],
	// token -> number of records that contain it (incremented by addRecord).
	tokenDocFreq: new Map(),
	// Smallest / largest truthy year seen by addRecord; null until one is seen.
	yearMin: null,
	yearMax: null,
	// Set to true by loadData()/loadSample() once loading has finished.
	loaded: false,
	// Output of the most recent analyze() call: result rows, the parsed input
	// keywords, and the number of records that contained at least one of them.
	lastResults: [],
	lastInputs: [],
	matchedRecords: 0,
	};

	/**
	 * Cached DOM references. Each property is named after the element id it was
	 * looked up with, so the table is generated from the id list.
	 */
	const els = Object.fromEntries(
		[
			"projectFile",
			"vectorsFile",
			"useTokens",
			"usePhrases",
			"useVectors",
			"loadBtn",
			"sampleBtn",
			"loadStatus",
			"loadProgress",
			"dataStats",
			"keywordInput",
			"resultLimit",
			"minCo",
			"sortBy",
			"analyzeBtn",
			"exportBtn",
			"queryStatus",
			"resultList",
			"resultSummary",
			"llmProvider",
			"llmKey",
			"llmModel",
			"llmBtn",
			"llmStatus",
			"llmOutput",
		].map((id) => [id, document.getElementById(id)])
	);

	// Model names offered per LLM provider. updateModelOptions() fills the model
	// <select> from this table and callLLM() rejects any model not listed here.
	const MODEL_OPTIONS = {
	openrouter: [
	"openai/gpt-5.2",
	"openai/gpt-4o-mini",
	"anthropic/claude-sonnet-4.5",
	"anthropic/claude-opus-4.5",
	],
	deepseek: ["deepseek-chat"],
	};

	/**
	 * Rebuilds the model <select> so it lists the models of the currently
	 * selected provider. An unknown provider leaves the list empty.
	 */
	function updateModelOptions() {
		const provider = els.llmProvider.value;
		const options = MODEL_OPTIONS[provider] || [];
		// Drop the previous provider's entries before adding the new ones.
		els.llmModel.innerHTML = "";
		for (const name of options) {
			const opt = document.createElement("option");
			opt.value = name;
			opt.textContent = name;
			els.llmModel.appendChild(opt);
		}
	}

	/**
	 * Cleans a raw token.
	 *
	 * @param {*} token - Raw value; falsy values are rejected outright.
	 * @returns {string|null} The trimmed string, or null when it is empty,
	 *   shorter than two characters, or listed in `stopwords`.
	 */
	function normalizeToken(token) {
		const cleaned = token ? String(token).trim() : "";
		const usable = cleaned.length >= 2 && !stopwords.has(cleaned);
		return usable ? cleaned : null;
	}

	/**
	 * Writes the dataset summary into the first three `.stat__value` elements
	 * inside the stats panel: record count, distinct token count, year range.
	 * The year range shows "-" unless both bounds are set.
	 */
	function updateStats() {
		const cells = els.dataStats.querySelectorAll(".stat__value");
		const hasYears = Boolean(state.yearMin && state.yearMax);
		const yearText = hasYears ? `${state.yearMin} - ${state.yearMax}` : "-";

		cells[0].textContent = state.records.length.toLocaleString();
		cells[1].textContent = state.tokenDocFreq.size.toLocaleString();
		cells[2].textContent = yearText;
	}

	// Shows `text` in the status element `el` by assigning its textContent.
	function setStatus(el, text) {
	el.textContent = text;
	}

	/**
	 * Sets the width of the load progress bar.
	 *
	 * @param {number} percent - Progress value; rendered with one decimal
	 *   followed by "%".
	 */
	function setProgress(percent) {
		const width = `${percent.toFixed(1)}%`;
		els.loadProgress.style.width = width;
	}

	/**
	 * Parses JSON Lines text that is already fully in memory.
	 *
	 * @param {string} text - Newline-separated JSON documents.
	 * @param {(row: any) => void} onRow - Called once per successfully parsed line.
	 *
	 * Blank lines are ignored. A line that fails to parse, or whose callback
	 * throws, is skipped without reporting the error.
	 */
	function parseJsonlText(text, onRow) {
		text
			.split("\n")
			.map((rawLine) => rawLine.trim())
			.filter((candidate) => candidate.length > 0)
			.forEach((candidate) => {
				try {
					onRow(JSON.parse(candidate));
				} catch (err) {
					// Skip malformed line.
				}
			});
	}

	/**
	 * Streams a JSON Lines file and hands every parsed row to `onRow`.
	 *
	 * @param {{ size: number, stream: () => ReadableStream }} file - File/Blob-like
	 *   object; only `size` and `stream()` are used.
	 * @param {(row: any) => void} onRow - Called once per successfully parsed line.
	 * @param {(ratio: number) => void} [onProgress] - Called after each chunk with
	 *   bytesRead / file.size, and once more with 1 when the file is finished.
	 * @returns {Promise<void>}
	 *
	 * Blank lines are ignored. A line that fails to parse, or whose callback
	 * throws, is skipped without reporting the error.
	 */
	async function parseJsonlFile(file, onRow, onProgress) {
		const decoder = new TextDecoder("utf-8");
		const reader = file.stream().getReader();

		// Shared by complete lines and the trailing remainder.
		const emitLine = (line) => {
			const trimmed = line.trim();
			if (!trimmed) return;
			try {
				onRow(JSON.parse(trimmed));
			} catch (err) {
				// Skip malformed line.
			}
		};

		let pending = "";
		let bytesRead = 0;
		while (true) {
			const { value, done } = await reader.read();
			if (done) break;
			bytesRead += value.length;
			// stream: true keeps a multi-byte character that is split across two
			// chunks buffered inside the decoder instead of corrupting it.
			pending += decoder.decode(value, { stream: true });
			const lines = pending.split("\n");
			// The last piece may be an incomplete line; carry it to the next chunk.
			pending = lines.pop();
			lines.forEach(emitLine);
			if (onProgress) {
				onProgress(bytesRead / file.size);
			}
		}

		// Flush the decoder: bytes of a truncated trailing character surface as
		// U+FFFD, so a cut-off last line is treated as malformed rather than
		// being parsed in silently shortened form.
		pending += decoder.decode();
		emitLine(pending);

		if (onProgress) {
			onProgress(1);
		}
	}

	/**
	 * Adds one record to the in-memory index.
	 *
	 * @param {Array<*>} tokens - Raw tokens; each goes through normalizeToken()
	 *   and duplicates are collapsed.
	 * @param {number} year - Record year; a falsy value (0, NaN, null) does not
	 *   affect the tracked year range.
	 * @param {string} title - Stored on the record as-is.
	 * @param {string} source - Stored on the record as-is.
	 *
	 * Records that end up with no usable token are dropped entirely.
	 */
	function addRecord(tokens, year, title, source) {
		if (!tokens || tokens.length === 0) return;

		const uniq = new Set();
		for (const token of tokens) {
			const norm = normalizeToken(token);
			if (norm) uniq.add(norm);
		}
		if (uniq.size === 0) return;

		state.records.push({ year, tokens: Array.from(uniq), title, source });

		// Document frequency: each record counts once per distinct token.
		for (const t of uniq) {
			state.tokenDocFreq.set(t, (state.tokenDocFreq.get(t) || 0) + 1);
		}

		if (year) {
			if (!state.yearMin || year < state.yearMin) state.yearMin = year;
			if (!state.yearMax || year > state.yearMax) state.yearMax = year;
		}
	}

	/**
	 * Loads the user-selected JSONL files into `state`.
	 *
	 * The project file contributes `nlp.tokens` and/or `nlp.phrases` depending on
	 * the checkboxes; the vectors file (only when its checkbox is ticked)
	 * contributes the whitespace-separated `name_tokens` field. Progress is
	 * reported as 0-50% for the project file and 50-100% for the vectors file.
	 *
	 * A read failure is shown in the load status line and leaves
	 * `state.loaded` false.
	 *
	 * @returns {Promise<void>}
	 */
	async function loadData() {
		state.records = [];
		state.tokenDocFreq.clear();
		state.yearMin = null;
		state.yearMax = null;
		state.loaded = false;
		setProgress(0);
		setStatus(els.loadStatus, "加载中...");

		const useTokens = els.useTokens.checked;
		const usePhrases = els.usePhrases.checked;
		const useVectors = els.useVectors.checked;

		const projectFile = els.projectFile.files[0];
		const vectorsFile = els.vectorsFile.files[0];

		if (!projectFile && !vectorsFile) {
			setStatus(els.loadStatus, "请至少选择一个 jsonl 文件");
			return;
		}

		try {
			if (projectFile) {
				await parseJsonlFile(
					projectFile,
					(obj) => {
						const year = Number(obj.year);
						const nlp = obj.nlp || {};
						const tokens = [];
						if (useTokens && Array.isArray(nlp.tokens)) {
							tokens.push(...nlp.tokens);
						}
						if (usePhrases && Array.isArray(nlp.phrases)) {
							tokens.push(...nlp.phrases);
						}
						addRecord(tokens, year, obj.title, "project");
					},
					(ratio) => setProgress(ratio * 50)
				);
			}

			if (vectorsFile && useVectors) {
				await parseJsonlFile(
					vectorsFile,
					(obj) => {
						const year = Number(obj.year);
						const tokens = String(obj.name_tokens || "").split(/\s+/);
						addRecord(tokens, year, obj.name, "vectors");
					},
					(ratio) => setProgress(50 + ratio * 50)
				);
			}
		} catch (err) {
			// Same reporting style as loadSample(): surface the message instead of
			// leaving the status stuck on "loading" with an unhandled rejection.
			setStatus(els.loadStatus, (err && err.message) || String(err));
			return;
		}

		state.loaded = true;
		updateStats();
		setStatus(els.loadStatus, "加载完成");
		setProgress(100);
	}

	/**
	 * Loads the bundled `sample_project.jsonl` into `state`.
	 *
	 * At least one of the tokens/phrases checkboxes must be ticked. Fetch and
	 * HTTP failures are reported in the load status line.
	 *
	 * @returns {Promise<void>}
	 */
	async function loadSample() {
		state.records = [];
		state.tokenDocFreq.clear();
		state.yearMin = null;
		state.yearMax = null;
		state.loaded = false;
		setProgress(0);
		setStatus(els.loadStatus, "加载示例数据...");

		const useTokens = els.useTokens.checked;
		const usePhrases = els.usePhrases.checked;
		if (!useTokens && !usePhrases) {
			setStatus(els.loadStatus, "请至少勾选 tokens 或 phrases");
			return;
		}

		try {
			const resp = await fetch("sample_project.jsonl");
			if (!resp.ok) {
				throw new Error("示例数据加载失败");
			}
			const text = await resp.text();
			parseJsonlText(text, (obj) => {
				const year = Number(obj.year);
				const nlp = obj.nlp || {};
				const tokens = [];
				if (useTokens && Array.isArray(nlp.tokens)) {
					tokens.push(...nlp.tokens);
				}
				if (usePhrases && Array.isArray(nlp.phrases)) {
					tokens.push(...nlp.phrases);
				}
				addRecord(tokens, year, obj.title, "sample");
			});
			state.loaded = true;
			updateStats();
			setProgress(100);
			setStatus(els.loadStatus, "示例数据加载完成");
		} catch (err) {
			setStatus(els.loadStatus, err.message);
		}
	}

	/**
	 * Splits the free-text keyword box into distinct, normalized keywords.
	 *
	 * Keywords may be separated by any whitespace or by the punctuation
	 * `,` `，` `、` `；` `;`. Each piece goes through normalizeToken(), so stopwords
	 * and single-character pieces are dropped. First-seen order is kept.
	 *
	 * @param {string} text - Raw user input; null/undefined is treated as empty.
	 * @returns {string[]} Unique keywords.
	 */
	function parseInputKeywords(text) {
		const uniq = new Set();
		// The ASCII comma is accepted alongside the full-width one, so "a,b" is
		// two keywords rather than one that can never match.
		for (const piece of String(text ?? "").split(/[\s,，、；;]+/)) {
			const norm = normalizeToken(piece);
			if (norm) uniq.add(norm);
		}
		return Array.from(uniq);
	}

	/**
	 * Finds tokens that co-occur with the entered keywords and scores them.
	 *
	 * For every record containing at least one input keyword, each other token
	 * in that record is a candidate. Per candidate:
	 *   coCount  - matched records that contain it
	 *   df       - records containing it overall (falls back to coCount)
	 *   coverage - number of distinct input keywords it appeared with
	 *   coRate   - coCount / df
	 *   score    - coRate * (coverage / inputCount) * ln(1 + coCount)
	 *
	 * Results are stored in `state.lastResults` sorted by score (descending),
	 * then rendered; the query status line lists per-keyword hit counts.
	 */
	function analyze() {
		if (!state.loaded) {
			setStatus(els.queryStatus, "请先加载数据");
			return;
		}
		const inputKeywords = parseInputKeywords(els.keywordInput.value);
		if (inputKeywords.length === 0) {
			setStatus(els.queryStatus, "请输入至少一个关键词");
			return;
		}

		const inputSet = new Set(inputKeywords);
		const candidateCo = new Map();
		const candidateHitInputs = new Map();
		const inputCounts = new Map();
		inputKeywords.forEach((k) => inputCounts.set(k, 0));
		let matchedRecords = 0;

		for (const record of state.records) {
			const hits = record.tokens.filter((t) => inputSet.has(t));
			if (hits.length === 0) continue;

			matchedRecords += 1;
			hits.forEach((h) => inputCounts.set(h, (inputCounts.get(h) || 0) + 1));

			for (const t of new Set(record.tokens)) {
				if (inputSet.has(t)) continue;
				candidateCo.set(t, (candidateCo.get(t) || 0) + 1);
				if (!candidateHitInputs.has(t)) {
					candidateHitInputs.set(t, new Set());
				}
				const hitSet = candidateHitInputs.get(t);
				hits.forEach((h) => hitSet.add(h));
			}
		}

		const results = [];
		for (const [token, coCount] of candidateCo.entries()) {
			const df = state.tokenDocFreq.get(token) || coCount;
			const coverage = candidateHitInputs.get(token)?.size || 0;
			const coRate = df ? coCount / df : 0;
			const score =
				(coRate * (coverage / inputKeywords.length)) *
				Math.log(1 + coCount);
			results.push({ token, coCount, df, coverage, score, coRate });
		}

		results.sort((a, b) => b.score - a.score);
		state.lastResults = results;
		state.lastInputs = inputKeywords;
		state.matchedRecords = matchedRecords;

		renderResults();

		const inputStats = inputKeywords
			.map((k) => `${k}(${inputCounts.get(k) || 0})`)
			.join("、");
		setStatus(
			els.queryStatus,
			`输入关键词：${inputStats}；命中记录 ${matchedRecords.toLocaleString()} 条`
		);
	}

	/**
	 * Renders `state.lastResults` into the result list.
	 *
	 * Rows with fewer than `minCo` co-occurrences are hidden, the rest are sorted
	 * by the selected column (descending) and cut to `resultLimit` rows. Bar
	 * width and hue are scaled relative to the best score among the rows shown.
	 * `state.lastResults` itself is not reordered.
	 */
	function renderResults() {
		const limit = Number(els.resultLimit.value) || 80;
		const minCo = Number(els.minCo.value) || 1;

		const comparators = new Map([
			["score", (a, b) => b.score - a.score],
			["coCount", (a, b) => b.coCount - a.coCount],
			["coverage", (a, b) => b.coverage - a.coverage],
			["coRate", (a, b) => b.coRate - a.coRate],
		]);
		// An unrecognized sort key falls back to score ordering.
		const compare = comparators.get(els.sortBy.value) || comparators.get("score");

		// Tokens come from user-supplied files, so they must not reach innerHTML raw.
		const htmlEntities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
		const escapeHtml = (value) =>
			String(value).replace(/[&<>"']/g, (ch) => htmlEntities[ch]);

		const results = state.lastResults
			.filter((r) => r.coCount >= minCo)
			.sort(compare)
			.slice(0, limit);

		// The 0.0001 floor avoids dividing by zero when every score is 0.
		const maxScore = results.reduce((max, r) => Math.max(max, r.score), 0.0001);

		els.resultList.innerHTML = "";
		for (const r of results) {
			const item = document.createElement("div");
			item.className = "result-item";
			const intensity = Math.min(1, r.score / maxScore);
			const hue = 20 + (1 - intensity) * 80;
			const barStyle =
				`width:${(intensity * 100).toFixed(1)}%;` +
				`background:linear-gradient(90deg, hsl(${hue},70%,45%), hsl(${hue + 20},80%,60%));`;
			item.innerHTML = `
	<div class="result-item__kw">${escapeHtml(r.token)}</div>
	<div class="bar"><span style="${barStyle}"></span></div>
	<div class="meta">
	<span>适配:${r.score.toFixed(3)}</span>
	<span>共现:${r.coCount}</span>
	<span>覆盖:${r.coverage}/${state.lastInputs.length}</span>
	</div>
	`;
			els.resultList.appendChild(item);
		}

		els.resultSummary.textContent = `共找到 ${state.lastResults.length.toLocaleString()} 个候选搭配词，当前展示 ${results.length} 个。`;
	}

	/**
	 * Downloads all of `state.lastResults` as `keyword_pairs.csv`.
	 *
	 * Columns: keyword, score, co_count, df, coverage, co_rate. Does nothing
	 * when there are no results. Fields containing a comma, double quote or
	 * line break are quoted, with embedded quotes doubled.
	 */
	function exportCsv() {
		if (!state.lastResults.length) return;

		// Without quoting, a keyword containing "," would shift every later column.
		const escapeField = (value) => {
			const text = String(value);
			return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
		};

		const rows = [
			["keyword", "score", "co_count", "df", "coverage", "co_rate"],
		];
		for (const r of state.lastResults) {
			rows.push([
				r.token,
				r.score.toFixed(6),
				r.coCount,
				r.df,
				r.coverage,
				r.coRate.toFixed(6),
			]);
		}
		const csv = rows.map((row) => row.map(escapeField).join(",")).join("\n");

		const blob = new Blob([csv], { type: "text/csv;charset=utf-8;" });
		const url = URL.createObjectURL(blob);
		// A temporary anchor is clicked programmatically to start the download.
		const a = document.createElement("a");
		a.href = url;
		a.download = "keyword_pairs.csv";
		document.body.appendChild(a);
		a.click();
		document.body.removeChild(a);
		URL.revokeObjectURL(url);
	}

	/**
	 * Assembles the user prompt sent to the LLM from the latest analysis: the
	 * input keywords, the dataset year range ("未知" unless both bounds are set),
	 * the matched-record count, and the first 30 entries of `state.lastResults`.
	 *
	 * @returns {string} Prompt text.
	 */
	function buildPrompt() {
		const inputCount = state.lastInputs.length;
		const rankedLines = [];
		state.lastResults.slice(0, 30).forEach((entry, position) => {
			const fit = entry.score.toFixed(3);
			rankedLines.push(
				`${position + 1}. ${entry.token}（适配:${fit}, 共现:${entry.coCount}, 覆盖:${entry.coverage}/${inputCount}）`
			);
		});

		let yearRange = "未知";
		if (state.yearMin && state.yearMax) {
			yearRange = `${state.yearMin}-${state.yearMax}`;
		}

		// One array element per output line; empty strings are blank lines.
		return [
			`你是研究选题助手，请以数据库统计为主，结合搭配关键词进行整合与拓展。请用中文输出。`,
			``,
			`输入关键词：${state.lastInputs.join("、")}`,
			`数据库年份范围：${yearRange}`,
			`命中记录数：${state.matchedRecords}`,
			``,
			`数据库高分搭配关键词（按适配度排序，优先使用）：`,
			rankedLines.join("\n"),
			``,
			`任务要求：`,
			`1) 将搭配关键词做 3-6 组主题化整合，每组给出一句聚合说明。`,
			`2) 给出 6-10 个拟申报选题建议，每条包含题目 + 1 句摘要。`,
			`3) 给出综合评价（创新性/可行性/政策相关性/学术价值，各 1-5 分）并简述理由。`,
			`4) 额外给出不超过 10 个“LLM 扩展关键词”，标注为扩展来源。`,
			`5) 提醒 2-3 条可能风险或选题陷阱。`,
		].join("\n");
	}

	/**
	 * Sends the prompt from buildPrompt() to the selected chat-completions
	 * endpoint and shows the reply in the output panel.
	 *
	 * Requires existing analysis results, an API key, and a model listed in
	 * MODEL_OPTIONS for the selected provider. The "openrouter" provider uses
	 * the OpenRouter endpoint with HTTP-Referer/X-Title headers; any other
	 * provider value uses the DeepSeek endpoint. Failures are shown in the
	 * output panel and the status line.
	 *
	 * @returns {Promise<void>}
	 */
	async function callLLM() {
		if (!state.lastResults.length) {
			setStatus(els.llmStatus, "请先生成搭配词");
			return;
		}
		const provider = els.llmProvider.value;
		const key = els.llmKey.value.trim();
		const model = els.llmModel.value.trim();

		if (!key || !model) {
			setStatus(els.llmStatus, "请填写 API Key 和模型");
			return;
		}

		// Validate before assembling anything that carries the API key.
		const allowed = MODEL_OPTIONS[provider] || [];
		if (!allowed.includes(model)) {
			setStatus(els.llmStatus, "模型不在允许列表");
			return;
		}

		let url = "";
		const headers = {
			"Content-Type": "application/json",
			Authorization: `Bearer ${key}`,
		};
		if (provider === "openrouter") {
			url = "https://openrouter.ai/api/v1/chat/completions";
			// Pages opened from file:// report the origin string "null"; fall back
			// to localhost in that case.
			const origin = typeof location !== "undefined" ? location.origin : "";
			headers["HTTP-Referer"] = origin && origin !== "null" ? origin : "http://localhost";
			headers["X-Title"] = "Local Keyword Matcher";
		} else {
			url = "https://api.deepseek.com/chat/completions";
		}

		const body = {
			model,
			messages: [
				{ role: "system", content: "你是严谨的研究选题助手。" },
				{ role: "user", content: buildPrompt() },
			],
			temperature: 0.7,
		};

		setStatus(els.llmStatus, "请求中...");
		els.llmOutput.textContent = "生成中，请稍候...";

		try {
			const resp = await fetch(url, {
				method: "POST",
				headers,
				body: JSON.stringify(body),
			});
			if (!resp.ok) {
				const text = await resp.text();
				throw new Error(text || "请求失败");
			}
			const data = await resp.json();
			const content = data.choices?.[0]?.message?.content;
			els.llmOutput.textContent = content || "未返回内容";
			setStatus(els.llmStatus, "完成");
		} catch (err) {
			els.llmOutput.textContent = `错误：${err.message}`;
			setStatus(els.llmStatus, "请求失败");
		}
	}

	// Wire UI controls to their handlers: [element, event name, handler].
	// Changing any display option re-renders the existing results.
	[
		[els.loadBtn, "click", loadData],
		[els.sampleBtn, "click", loadSample],
		[els.analyzeBtn, "click", analyze],
		[els.exportBtn, "click", exportCsv],
		[els.sortBy, "change", renderResults],
		[els.resultLimit, "change", renderResults],
		[els.minCo, "change", renderResults],
		[els.llmBtn, "click", callLLM],
		[els.llmProvider, "change", updateModelOptions],
	].forEach(([target, eventName, handler]) => {
		target.addEventListener(eventName, handler);
	});

	// Populate the model list for the provider selected at startup.
	updateModelOptions();