import asyncio
import json
import os
import time
from typing import Any, Dict, Optional

import aiohttp
from pydantic import BaseModel, HttpUrl
| |
|
class NBCNewsAPITest:
    """Async client for exercising a crawl-service HTTP API.

    Intended to be used as an async context manager so the underlying
    aiohttp session is always closed::

        async with NBCNewsAPITest() as api:
            task_id = await api.submit_crawl({...})
    """

    def __init__(self, base_url: str = "http://localhost:8000"):
        # Base URL of the API server under test (no trailing slash expected).
        self.base_url = base_url
        # Created lazily in __aenter__ so instances can be constructed
        # outside a running event loop.
        self.session: Optional[aiohttp.ClientSession] = None

    async def __aenter__(self) -> "NBCNewsAPITest":
        self.session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        if self.session:
            await self.session.close()

    async def submit_crawl(self, request_data: Dict[str, Any]) -> str:
        """POST a crawl request and return the server-assigned task id.

        Raises:
            aiohttp.ClientResponseError: on a non-2xx response. Failing fast
                here beats the original behavior of surfacing an HTTP error
                later as a confusing ``KeyError: 'task_id'``.
        """
        async with self.session.post(f"{self.base_url}/crawl", json=request_data) as response:
            response.raise_for_status()
            result = await response.json()
            return result["task_id"]

    async def get_task_status(self, task_id: str) -> Dict[str, Any]:
        """Fetch the current status document for *task_id*."""
        async with self.session.get(f"{self.base_url}/task/{task_id}") as response:
            return await response.json()

    async def wait_for_task(self, task_id: str, timeout: int = 300, poll_interval: int = 2) -> Dict[str, Any]:
        """Poll the task until it reaches a terminal state.

        Returns the final status document (its ``status`` field is either
        "completed" or "failed"). Raises TimeoutError if neither terminal
        state is reached within *timeout* seconds.
        """
        start_time = time.time()
        while True:
            if time.time() - start_time > timeout:
                raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")

            status = await self.get_task_status(task_id)
            if status["status"] in ("completed", "failed"):
                return status

            await asyncio.sleep(poll_interval)

    async def check_health(self) -> Dict[str, Any]:
        """Return the server's /health payload."""
        async with self.session.get(f"{self.base_url}/health") as response:
            return await response.json()
| |
|
async def test_basic_crawl():
    """Submit a single-URL crawl and verify it completes successfully."""
    print("\n=== Testing Basic Crawl ===")
    async with NBCNewsAPITest() as api:
        payload = {"urls": "https://www.nbcnews.com/business", "priority": 10}
        task_id = await api.submit_crawl(payload)
        outcome = await api.wait_for_task(task_id)
        print(f"Basic crawl result length: {len(outcome['result']['markdown'])}")
        assert outcome["status"] == "completed"
        assert "result" in outcome
        assert outcome["result"]["success"]
|
async def test_js_execution():
    """Crawl with injected JS (clicks 'Load More') and a wait_for selector."""
    print("\n=== Testing JS Execution ===")
    load_more_script = "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
    async with NBCNewsAPITest() as api:
        payload = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 8,
            "js_code": [load_more_script],
            # Wait until at least 10 article cards are present before scraping.
            "wait_for": "article.tease-card:nth-child(10)",
            "crawler_params": {"headless": True},
        }
        task_id = await api.submit_crawl(payload)
        outcome = await api.wait_for_task(task_id)
        print(f"JS execution result length: {len(outcome['result']['markdown'])}")
        assert outcome["status"] == "completed"
        assert outcome["result"]["success"]
|
async def test_css_selector():
    """Crawl while restricting extraction to a single CSS selector."""
    print("\n=== Testing CSS Selector ===")
    async with NBCNewsAPITest() as api:
        payload = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 7,
            "css_selector": ".wide-tease-item__description",
        }
        task_id = await api.submit_crawl(payload)
        outcome = await api.wait_for_task(task_id)
        print(f"CSS selector result length: {len(outcome['result']['markdown'])}")
        assert outcome["status"] == "completed"
        assert outcome["result"]["success"]
|
async def test_structured_extraction():
    """Extract title/description/link per article via a JSON-CSS schema."""
    print("\n=== Testing Structured Extraction ===")
    article_fields = [
        {"name": "title", "selector": "h2", "type": "text"},
        {"name": "description", "selector": ".tease-card__description", "type": "text"},
        {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"},
    ]
    schema = {
        "name": "NBC News Articles",
        "baseSelector": "article.tease-card",
        "fields": article_fields,
    }
    async with NBCNewsAPITest() as api:
        payload = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 9,
            "extraction_config": {"type": "json_css", "params": {"schema": schema}},
        }
        task_id = await api.submit_crawl(payload)
        outcome = await api.wait_for_task(task_id)
        # extracted_content arrives as a JSON string; decode before asserting.
        articles = json.loads(outcome["result"]["extracted_content"])
        print(f"Extracted {len(articles)} articles")
        assert outcome["status"] == "completed"
        assert outcome["result"]["success"]
        assert len(articles) > 0
|
async def test_batch_crawl():
    """Submit three URLs in one request and expect one result per URL."""
    print("\n=== Testing Batch Crawl ===")
    targets = [
        "https://www.nbcnews.com/business",
        "https://www.nbcnews.com/business/consumer",
        "https://www.nbcnews.com/business/economy",
    ]
    async with NBCNewsAPITest() as api:
        payload = {
            "urls": targets,
            "priority": 6,
            "crawler_params": {"headless": True},
        }
        task_id = await api.submit_crawl(payload)
        outcome = await api.wait_for_task(task_id)
        print(f"Batch crawl completed, got {len(outcome['results'])} results")
        assert outcome["status"] == "completed"
        assert "results" in outcome
        assert len(outcome["results"]) == 3
|
async def test_llm_extraction():
    """Extract structured article info via an LLM provider.

    Bug fixed: the provider is ``openai/gpt-4o-mini`` but the key was read
    only from ``OLLAMA_API_KEY``. We now prefer ``OPENAI_API_KEY`` and keep
    ``OLLAMA_API_KEY`` as a fallback for backward compatibility.
    NOTE(review): the banner still says "with Ollama" — presumably left over
    from an earlier provider; confirm which backend is intended.
    """
    print("\n=== Testing LLM Extraction with Ollama ===")
    async with NBCNewsAPITest() as api:
        # JSON Schema describing the structure the LLM must return.
        schema = {
            "type": "object",
            "properties": {
                "article_title": {
                    "type": "string",
                    "description": "The main title of the news article"
                },
                "summary": {
                    "type": "string",
                    "description": "A brief summary of the article content"
                },
                "main_topics": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Main topics or themes discussed in the article"
                }
            },
            "required": ["article_title", "summary", "main_topics"]
        }

        request = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 8,
            "extraction_config": {
                "type": "llm",
                "params": {
                    "provider": "openai/gpt-4o-mini",
                    # Match the key to the provider; fall back to the old
                    # env var so existing setups keep working.
                    "api_key": os.getenv("OPENAI_API_KEY") or os.getenv("OLLAMA_API_KEY"),
                    "schema": schema,
                    "extraction_type": "schema",
                    "instruction": """Extract the main article information including title, a brief summary, and main topics discussed.
                    Focus on the primary business news article on the page."""
                }
            },
            "crawler_params": {
                "headless": True,
                # Keep even tiny text nodes so the LLM sees the full page.
                "word_count_threshold": 1
            }
        }

        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)

        if result["status"] == "completed":
            extracted = json.loads(result["result"]["extracted_content"])
            print(f"Extracted article analysis:")
            print(json.dumps(extracted, indent=2))

        assert result["status"] == "completed"
        assert result["result"]["success"]
|
async def test_screenshot():
    """Request a screenshot alongside the crawl and verify it is present."""
    print("\n=== Testing Screenshot ===")
    async with NBCNewsAPITest() as api:
        payload = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 5,
            "screenshot": True,
            "crawler_params": {"headless": True},
        }
        task_id = await api.submit_crawl(payload)
        outcome = await api.wait_for_task(task_id)
        print("Screenshot captured:", bool(outcome["result"]["screenshot"]))
        assert outcome["status"] == "completed"
        assert outcome["result"]["success"]
        assert outcome["result"]["screenshot"] is not None
|
async def test_priority_handling():
    """Submit a low- then a high-priority task and wait for both to finish."""
    print("\n=== Testing Priority Handling ===")
    async with NBCNewsAPITest() as api:
        low_task_id = await api.submit_crawl({
            "urls": "https://www.nbcnews.com/business",
            "priority": 1,
            "crawler_params": {"headless": True},
        })

        high_task_id = await api.submit_crawl({
            "urls": "https://www.nbcnews.com/business/consumer",
            "priority": 10,
            "crawler_params": {"headless": True},
        })

        # Await the high-priority task first; the queue should favor it.
        high_status = await api.wait_for_task(high_task_id)
        low_status = await api.wait_for_task(low_task_id)

        print("Both tasks completed")
        assert high_status["status"] == "completed"
        assert low_status["status"] == "completed"
|
async def main():
    """Check server health, then run the currently enabled test(s)."""
    try:
        async with NBCNewsAPITest() as api:
            health = await api.check_health()
            print("Server health:", health)

        # Only the LLM-extraction test is enabled at the moment; the other
        # test_* coroutines in this module can be awaited here as needed.
        await test_llm_extraction()
    except Exception as e:
        print(f"Test failed: {str(e)}")
        raise
|
# Script entry point: run the async test driver on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())