new_recommendation / test_crossref_sorting.py
wujian123's picture
Upload all project files
3c6b551
#!/usr/bin/env python3
"""
测试Crossref API的排序机制
验证排序是否真的按引用量进行
"""
import requests
import json
def test_crossref_sorting():
    """Compare Crossref's native result order with a citation-count ordering.

    For each of the ``published``, ``relevance`` and ``deposited`` sort
    options, ten works matching "machine learning" are requested from the
    Crossref ``/works`` endpoint.  Their citation counts are printed in the
    order the API returned them and again after a local descending sort by
    citation count, and the two sequences are compared.  Simple statistics
    over the non-zero counts are printed last.

    Nothing is returned; all output goes to stdout.  Any exception raised
    while handling one sort option is printed and the next option is tried.
    """
    endpoint = "https://api.crossref.org/works"
    request_headers = {
        'User-Agent': 'Academic-Reviewer-System/1.0 (mailto:[email protected])'
    }
    search_terms = "machine learning"

    def citations_of(work):
        # A work without the field is treated as having zero citations.
        return work.get('is-referenced-by-count', 0)

    def print_ranked(works):
        """Print one numbered line per work; return the counts in that order."""
        counts = []
        for position, work in enumerate(works, 1):
            titles = work.get('title')
            label = titles[0] if titles else 'N/A'
            if len(label) > 40:
                # Keep each output line short.
                label = label[:40] + "..."
            count = citations_of(work)
            counts.append(count)
            print(f" {position}. {label} (引用: {count})")
        return counts

    print("=== 测试Crossref API排序机制 ===")
    print(f"查询: {search_terms}")
    print("=" * 60)

    for sort_key in ("published", "relevance", "deposited"):
        print(f"\n--- 排序方式: {sort_key} ---")
        request_params = {
            "query": search_terms,
            "rows": 10,
            "sort": sort_key,
            "order": "desc",
            "select": "DOI,title,author,container-title,published-print,published-online,is-referenced-by-count"
        }
        try:
            reply = requests.get(
                endpoint,
                params=request_params,
                headers=request_headers,
                timeout=30,
            )
            reply.raise_for_status()
            works = reply.json().get("message", {}).get("items", [])
            print(f"返回 {len(works)} 个结果")

            # Citation counts in the order the API returned the works.
            print("原始排序的引用量:")
            api_order_counts = print_ranked(works)

            # The same works, re-ranked locally by citation count (descending).
            reordered = list(works)
            reordered.sort(key=citations_of, reverse=True)
            print("\n按引用量重新排序:")
            citation_order_counts = print_ranked(reordered)

            # Side-by-side comparison of the two count sequences.
            print("\n排序差异分析:")
            print(f"原始排序引用量: {api_order_counts}")
            print(f"按引用量排序: {citation_order_counts}")
            if api_order_counts != citation_order_counts:
                print("❌ 原始排序与按引用量排序不同")
                print(" 说明Crossref API没有按引用量排序")
            else:
                print("✅ 原始排序与按引用量排序相同")

            # Distribution summary over works that have at least one citation.
            cited = [n for n in api_order_counts if n > 0]
            print(f"非零引用量数量: {len(cited)}/{len(api_order_counts)}")
            if cited:
                mean_citations = sum(cited) / len(cited)
                print(f"平均引用量: {mean_citations:.2f}")
                print(f"最大引用量: {max(cited)}")
        except Exception as err:
            # Report the failure and carry on with the next sort option.
            print(f"错误: {str(err)}")
        print("-" * 50)
def _citation_count(item):
    """Return the work's ``is-referenced-by-count``, treating missing or null as 0."""
    return item.get('is-referenced-by-count') or 0


def _publication_year(item):
    """Return the first ``date-parts`` entry of the work's publication date.

    ``published-print`` is preferred over ``published-online``.  ``'N/A'`` is
    returned when neither date is present, when ``date-parts`` is missing,
    null or empty, or when its first element is empty or starts with null.
    """
    pub_date = item.get('published-print') or item.get('published-online') or {}
    date_parts = pub_date.get('date-parts') or []
    first_date = date_parts[0] if date_parts else None
    if not first_date or first_date[0] is None:
        return 'N/A'
    return first_date[0]


def test_high_citation_query():
    """Show the most-cited works among the top relevance hits for several topics.

    For each query in a fixed list, five relevance-sorted works are requested
    from the Crossref ``/works`` endpoint.  They are re-ranked locally by
    citation count; the top three are printed with citation count and
    publication year, followed by a one-line summary (result count, number of
    works with citations, mean citations over those works).

    Nothing is returned; all output goes to stdout.  Any exception raised
    while handling one query is printed and the next query is tried.
    """
    url = "https://api.crossref.org/works"
    headers = {
        'User-Agent': 'Academic-Reviewer-System/1.0 (mailto:[email protected])'
    }
    # Topics chosen because they are likely to return highly cited works.
    high_citation_queries = [
        "CRISPR",
        "machine learning",
        "deep learning",
        "artificial intelligence"
    ]
    print("\n=== 测试高引用量查询 ===")
    print("=" * 60)
    for query in high_citation_queries:
        print(f"\n--- 查询: {query} ---")
        params = {
            "query": query,
            "rows": 5,
            "sort": "relevance",  # rank by relevance on the server side
            "order": "desc",
            "select": "DOI,title,author,container-title,published-print,published-online,is-referenced-by-count"
        }
        try:
            response = requests.get(url, params=params, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()
            items = data.get("message", {}).get("items", [])

            # Re-rank locally by citation count, most cited first.
            sorted_items = sorted(items, key=_citation_count, reverse=True)
            print("按引用量排序的前3个结果:")
            for i, item in enumerate(sorted_items[:3], 1):
                titles = item.get('title')
                title = titles[0] if titles else 'N/A'
                if len(title) > 50:
                    title = title[:50] + "..."
                cited_count = _citation_count(item)
                # Resolved defensively so one record with an odd date shape
                # cannot abort the rest of this query's output.
                pub_year = _publication_year(item)
                print(f" {i}. {title}")
                print(f" 引用量: {cited_count}, 年份: {pub_year}")

            # Summary over all returned works, not just the top three.
            citations = [_citation_count(item) for item in items]
            non_zero = [c for c in citations if c > 0]
            print(f" 总结果: {len(items)}, 有引用量: {len(non_zero)}, 平均引用量: {sum(non_zero) / len(non_zero) if non_zero else 0:.2f}")
        except Exception as e:
            # Report the failure and carry on with the next query.
            print(f"错误: {str(e)}")
if __name__ == "__main__":
    # Script entry point: run the sort-order comparison first, then the
    # high-citation query check.  Both hit the live Crossref API.
    for run_check in (test_crossref_sorting, test_high_citation_query):
        run_check()