import os, sys
import time
import warnings
from enum import Enum
from colorama import init, Fore, Back, Style
from pathlib import Path
from typing import Optional, List, Union
import json
import asyncio

from contextlib import asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult
from .async_database import async_db_manager
from .chunking_strategy import *
from .content_filter_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
from .content_scraping_strategy import WebScrapingStrategy
from .async_logger import AsyncLogger
from .async_configs import BrowserConfig, CrawlerRunConfig
from .config import (
    MIN_WORD_THRESHOLD,
    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
    URL_LOG_SHORTEN_LENGTH
)
from .utils import (
    sanitize_input_encode,
    InvalidCSSSelectorError,
    format_html,
    fast_format_html,
    create_box_message,
    get_error_context
)

from urllib.parse import urlparse
import random
from .__version__ import __version__ as crawl4ai_version


class AsyncWebCrawler:
    """
    Asynchronous web crawler with flexible caching capabilities.

    There are two ways to use the crawler:

    1. Using context manager (recommended for simple cases):
        ```python
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com")
        ```

    2. Using explicit lifecycle management (recommended for long-running applications):
        ```python
        crawler = AsyncWebCrawler()
        await crawler.start()

        # Use the crawler multiple times
        result1 = await crawler.arun(url="https://example.com")
        result2 = await crawler.arun(url="https://another.com")

        await crawler.close()
        ```

    Migration Guide:
        Old way (deprecated):
            crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True)

        New way (recommended):
            browser_config = BrowserConfig(browser_type="chromium", headless=True)
            crawler = AsyncWebCrawler(config=browser_config)

    Attributes:
        browser_config (BrowserConfig): Configuration object for browser settings.
        crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
        logger (AsyncLogger): Logger instance for recording events and errors.
        always_bypass_cache (bool): Whether to always bypass cache.
        crawl4ai_folder (str): Directory for storing cache.
        base_directory (str): Base directory for storing cache.
        ready (bool): Whether the crawler is ready for use.

    Methods:
        start(): Start the crawler explicitly without using context manager.
        close(): Close the crawler explicitly without using context manager.
        arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
        awarmup(): Perform warmup sequence.
        arun_many(): Run the crawler for multiple sources.
        aprocess_html(): Process HTML content.

    Typical Usage:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com")
            print(result.markdown)

    Using configuration:
        browser_config = BrowserConfig(browser_type="chromium", headless=True)
        async with AsyncWebCrawler(config=browser_config) as crawler:
            crawler_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS
            )
            result = await crawler.arun(url="https://example.com", config=crawler_config)
            print(result.markdown)
    """
    _domain_last_hit = {}

    def __init__(
        self,
        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
        config: Optional[BrowserConfig] = None,
        always_bypass_cache: bool = False,
        always_by_pass_cache: Optional[bool] = None,
        base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
        thread_safe: bool = False,
        **kwargs,
    ):
        """
        Initialize the AsyncWebCrawler.

        Args:
            crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy
            config: Configuration object for browser settings. If None, will be created from kwargs
            always_bypass_cache: Whether to always bypass cache (new parameter)
            always_by_pass_cache: Deprecated, use always_bypass_cache instead
            base_directory: Base directory for storing cache
            thread_safe: Whether to use thread-safe operations
            **kwargs: Additional arguments for backwards compatibility
        """
        # Handle browser configuration: an explicit config object takes precedence
        browser_config = config
        if browser_config is not None:
            if any(k in kwargs for k in ["browser_type", "headless", "viewport_width", "viewport_height"]):
                # self.logger does not exist yet at this point, so use the warnings module
                warnings.warn(
                    "Both browser_config and legacy browser parameters provided. "
                    "browser_config will take precedence.",
                    stacklevel=2,
                )
        else:
            # Create browser config from kwargs for backwards compatibility
            browser_config = BrowserConfig.from_kwargs(kwargs)

        self.browser_config = browser_config

        # Initialize logger first since other components may need it
        self.logger = AsyncLogger(
            log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
            verbose=self.browser_config.verbose,
            tag_width=10
        )

        # Initialize crawler strategy, forwarding only the strategy-related kwargs
        params = {
            k: v for k, v in kwargs.items() if k in ["browser_config", "logger"]
        }
        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
            browser_config=browser_config,
            logger=self.logger,
            **params
        )

        # If the crawler strategy doesn't have a logger, attach the crawler's logger
        if not self.crawler_strategy.logger:
            self.crawler_strategy.logger = self.logger

        # Handle deprecated cache parameter
        if always_by_pass_cache is not None:
            if kwargs.get("warning", True):
                warnings.warn(
                    "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. "
                    "Use 'always_bypass_cache' instead. "
                    "Pass warning=False to suppress this warning.",
                    DeprecationWarning,
                    stacklevel=2
                )
            self.always_bypass_cache = always_by_pass_cache
        else:
            self.always_bypass_cache = always_bypass_cache

        # Thread safety setup
        self._lock = asyncio.Lock() if thread_safe else None

        # Set up cache directories
        self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
        os.makedirs(self.crawl4ai_folder, exist_ok=True)
        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)

        self.ready = False

    async def start(self):
        """
        Start the crawler explicitly without using context manager.
        This is equivalent to using 'async with' but gives more control over the lifecycle.

        This method will:
        1. Initialize the browser and context
        2. Perform warmup sequence
        3. Return the crawler instance for method chaining

        Returns:
            AsyncWebCrawler: The initialized crawler instance
        """
        await self.crawler_strategy.__aenter__()
        await self.awarmup()
        return self

    async def close(self):
        """
        Close the crawler explicitly without using context manager.
        This should be called when you're done with the crawler if you used start().

        This method will:
        1. Clean up browser resources
        2. Close any open pages and contexts
        """
        await self.crawler_strategy.__aexit__(None, None, None)

    async def __aenter__(self):
        return await self.start()

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def awarmup(self):
        """
        Initialize the crawler with warm-up sequence.

        This method:
        1. Logs initialization info
        2. Marks the crawler as ready
        """
        self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
        self.ready = True

    @asynccontextmanager
    async def nullcontext(self):
        """Async null context manager."""
        yield

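    # Note: arun() wraps its body in "async with self._lock or self.nullcontext()".
    # When the crawler is created with thread_safe=True, self._lock is an asyncio.Lock
    # and calls are serialised; otherwise nullcontext() above makes the guard a no-op
    # so multiple crawls may run concurrently.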
    async def arun(
        self,
        url: str,
        config: Optional[CrawlerRunConfig] = None,
        # Legacy parameters maintained for backwards compatibility
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        content_filter: RelevantContentFilter = None,
        cache_mode: Optional[CacheMode] = None,
        # Deprecated cache parameters
        bypass_cache: bool = False,
        disable_cache: bool = False,
        no_cache_read: bool = False,
        no_cache_write: bool = False,
        # Other legacy parameters
        css_selector: str = None,
        screenshot: bool = False,
        pdf: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    ) -> CrawlResult:
        """
        Runs the crawler for a single source: URL (web, local file, or raw HTML).

        Migration Guide:
            Old way (deprecated):
                result = await crawler.arun(
                    url="https://example.com",
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )

            New way (recommended):
                config = CrawlerRunConfig(
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
                result = await crawler.arun(url="https://example.com", config=config)

        Args:
            url: The URL to crawl (http://, https://, file://, or raw:)
            config: Configuration object controlling crawl behavior
            [other parameters maintained for backwards compatibility]

        Returns:
            CrawlResult: The result of crawling and processing
        """
        crawler_config = config
        if not isinstance(url, str) or not url:
            raise ValueError("Invalid URL, make sure the URL is a non-empty string")

        async with self._lock or self.nullcontext():
            try:
                # Handle configuration: an explicit config object takes precedence
                if crawler_config is not None:
                    config = crawler_config
                else:
                    # Build a config object from the legacy keyword arguments
                    config_kwargs = {
                        "word_count_threshold": word_count_threshold,
                        "extraction_strategy": extraction_strategy,
                        "chunking_strategy": chunking_strategy,
                        "content_filter": content_filter,
                        "cache_mode": cache_mode,
                        "bypass_cache": bypass_cache,
                        "disable_cache": disable_cache,
                        "no_cache_read": no_cache_read,
                        "no_cache_write": no_cache_write,
                        "css_selector": css_selector,
                        "screenshot": screenshot,
                        "pdf": pdf,
                        "verbose": verbose,
                        **kwargs
                    }
                    config = CrawlerRunConfig.from_kwargs(config_kwargs)

                # Handle deprecated cache parameters
                if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
                    if kwargs.get("warning", True):
                        warnings.warn(
                            "Cache control boolean flags are deprecated and will be removed in version 0.5.0. "
                            "Use 'cache_mode' parameter instead.",
                            DeprecationWarning,
                            stacklevel=2
                        )

                    # Convert legacy parameters if cache_mode not provided
                    if config.cache_mode is None:
                        config.cache_mode = _legacy_to_cache_mode(
                            disable_cache=disable_cache,
                            bypass_cache=bypass_cache,
                            no_cache_read=no_cache_read,
                            no_cache_write=no_cache_write
                        )

                # Default to ENABLED if no cache mode was specified
                if config.cache_mode is None:
                    config.cache_mode = CacheMode.ENABLED

                # Create cache context
                cache_context = CacheContext(url, config.cache_mode, self.always_bypass_cache)

                # Initialize processing variables
                async_response: AsyncCrawlResponse = None
                cached_result: CrawlResult = None
                screenshot_data = None
                pdf_data = None
                extracted_content = None
                start_time = time.perf_counter()

                # Try to get cached result if appropriate
                if cache_context.should_read():
                    cached_result = await async_db_manager.aget_cached_url(url)

                if cached_result:
                    html = sanitize_input_encode(cached_result.html)
                    extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
                    extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content

                    screenshot_data = cached_result.screenshot
                    pdf_data = cached_result.pdf
                    # If a screenshot or PDF is requested but missing from the cache,
                    # discard the cached result and fetch fresh content
                    if (config.screenshot and not screenshot_data) or (config.pdf and not pdf_data):
                        cached_result = None

                    self.logger.url_status(
                        url=cache_context.display_url,
                        success=bool(html),
                        timing=time.perf_counter() - start_time,
                        tag="FETCH"
                    )

                # Fetch fresh content if needed
                if not cached_result or not html:
                    t1 = time.perf_counter()

                    if user_agent:
                        self.crawler_strategy.update_user_agent(user_agent)

                    # Pass the entire config object to the crawl strategy
                    async_response = await self.crawler_strategy.crawl(
                        url,
                        config=config
                    )

                    html = sanitize_input_encode(async_response.html)
                    screenshot_data = async_response.screenshot
                    pdf_data = async_response.pdf_data

                    t2 = time.perf_counter()
                    self.logger.url_status(
                        url=cache_context.display_url,
                        success=bool(html),
                        timing=t2 - t1,
                        tag="FETCH"
                    )

                    # Process the HTML content
                    crawl_result = await self.aprocess_html(
                        url=url,
                        html=html,
                        extracted_content=extracted_content,
                        config=config,
                        screenshot=screenshot_data,
                        pdf_data=pdf_data,
                        verbose=config.verbose,
                        is_raw_html=True if url.startswith("raw:") else False,
                        **kwargs
                    )

                    crawl_result.status_code = async_response.status_code
                    crawl_result.response_headers = async_response.response_headers
                    crawl_result.downloaded_files = async_response.downloaded_files
                    crawl_result.ssl_certificate = async_response.ssl_certificate

                    crawl_result.success = bool(html)
                    crawl_result.session_id = getattr(config, 'session_id', None)

                    self.logger.success(
                        message="{url:.50}... | Status: {status} | Total: {timing}",
                        tag="COMPLETE",
                        params={
                            "url": cache_context.display_url,
                            "status": crawl_result.success,
                            "timing": f"{time.perf_counter() - start_time:.2f}s"
                        },
                        colors={
                            "status": Fore.GREEN if crawl_result.success else Fore.RED,
                            "timing": Fore.YELLOW
                        }
                    )

                    # Update cache if appropriate
                    if cache_context.should_write() and not bool(cached_result):
                        await async_db_manager.acache_url(crawl_result)

                    return crawl_result

                else:
                    # Serve the cached result
                    self.logger.success(
                        message="{url:.50}... | Status: {status} | Total: {timing}",
                        tag="COMPLETE",
                        params={
                            "url": cache_context.display_url,
                            "status": True,
                            "timing": f"{time.perf_counter() - start_time:.2f}s"
                        },
                        colors={
                            "status": Fore.GREEN,
                            "timing": Fore.YELLOW
                        }
                    )

                    cached_result.success = bool(html)
                    cached_result.session_id = getattr(config, 'session_id', None)
                    return cached_result

            except Exception as e:
                error_context = get_error_context(sys.exc_info())

                error_message = (
                    f"Unexpected error in arun at line {error_context['line_no']} "
                    f"in {error_context['function']} ({error_context['filename']}):\n"
                    f"Error: {str(e)}\n\n"
                    f"Code context:\n{error_context['code_context']}"
                )

                self.logger.error_status(
                    url=url,
                    error=create_box_message(error_message, type="error"),
                    tag="ERROR"
                )

                return CrawlResult(
                    url=url,
                    html="",
                    success=False,
                    error_message=error_message
                )

    async def aprocess_html(
        self,
        url: str,
        html: str,
        extracted_content: str,
        config: CrawlerRunConfig,
        screenshot: str,
        pdf_data: str,
        verbose: bool,
        **kwargs,
    ) -> CrawlResult:
        """
        Process HTML content using the provided configuration.

        Args:
            url: The URL being processed
            html: Raw HTML content
            extracted_content: Previously extracted content (if any)
            config: Configuration object controlling processing behavior
            screenshot: Screenshot data (if any)
            pdf_data: PDF data (if any)
            verbose: Whether to enable verbose logging
            **kwargs: Additional parameters for backwards compatibility

        Returns:
            CrawlResult: Processed result containing extracted and formatted content
        """
        try:
            _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
            t1 = time.perf_counter()

            # Initialize the scraping strategy
            scraping_strategy = WebScrapingStrategy(logger=self.logger)

            # Build scraping parameters from the config, then add any extra kwargs
            # that are not already present
            params = {k: v for k, v in config.to_dict().items() if k not in ["url"]}
            params.update({k: v for k, v in kwargs.items() if k not in params.keys()})

            result = scraping_strategy.scrap(
                url,
                html,
                **params,
            )

            if result is None:
                raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")

        except InvalidCSSSelectorError as e:
            raise ValueError(str(e))
        except Exception as e:
            raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")

        # Extract results from the scraping strategy output
        cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
        fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
        fit_html = sanitize_input_encode(result.get("fit_html", ""))
        media = result.get("media", [])
        links = result.get("links", [])
        metadata = result.get("metadata", {})

        # Generate markdown from the cleaned HTML
        markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()

        markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
            cleaned_html=cleaned_html,
            base_url=url,
        )
        markdown_v2 = markdown_result
        markdown = sanitize_input_encode(markdown_result.raw_markdown)

        # Log scraping completion
        self.logger.info(
            message="Processed {url:.50}... | Time: {timing}ms",
            tag="SCRAPE",
            params={
                "url": _url,
                "timing": int((time.perf_counter() - t1) * 1000)
            }
        )

        # Handle content extraction if needed
        if (extracted_content is None and
                config.extraction_strategy and
                config.chunking_strategy and
                not isinstance(config.extraction_strategy, NoExtractionStrategy)):

            t1 = time.perf_counter()

            # Choose the content format expected by the extraction strategy
            content_format = config.extraction_strategy.input_format
            if content_format == "fit_markdown" and not markdown_result.fit_markdown:
                self.logger.warning(
                    message="Fit markdown requested but not available. Falling back to raw markdown.",
                    tag="EXTRACT",
                    params={"url": _url}
                )
                content_format = "markdown"

            content = {
                "markdown": markdown,
                "html": html,
                "fit_markdown": markdown_result.fit_markdown
            }.get(content_format, markdown)

            # Use IdentityChunking for HTML input, otherwise the configured chunking strategy
            chunking = IdentityChunking() if content_format == "html" else config.chunking_strategy
            sections = chunking.chunk(content)
            extracted_content = config.extraction_strategy.run(url, sections)
            extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)

            # Log extraction completion
            self.logger.info(
                message="Completed for {url:.50}... | Time: {timing}s",
                tag="EXTRACT",
                params={
                    "url": _url,
                    "timing": time.perf_counter() - t1
                }
            )

        # Handle screenshot and PDF data
        screenshot_data = None if not screenshot else screenshot
        pdf_data = None if not pdf_data else pdf_data

        # Apply HTML formatting if requested
        if config.prettiify:
            cleaned_html = fast_format_html(cleaned_html)

        # Return the complete crawl result
        return CrawlResult(
            url=url,
            html=html,
            cleaned_html=cleaned_html,
            markdown_v2=markdown_v2,
            markdown=markdown,
            fit_markdown=fit_markdown,
            fit_html=fit_html,
            media=media,
            links=links,
            metadata=metadata,
            screenshot=screenshot_data,
            pdf=pdf_data,
            extracted_content=extracted_content,
            success=True,
            error_message="",
        )

    async def arun_many(
        self,
        urls: List[str],
        config: Optional[CrawlerRunConfig] = None,
        # Legacy parameters maintained for backwards compatibility
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        content_filter: RelevantContentFilter = None,
        cache_mode: Optional[CacheMode] = None,
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        pdf: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    ) -> List[CrawlResult]:
        """
        Runs the crawler for multiple URLs concurrently.

        Migration Guide:
            Old way (deprecated):
                results = await crawler.arun_many(
                    urls,
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )

            New way (recommended):
                config = CrawlerRunConfig(
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
                results = await crawler.arun_many(urls, config=config)

        Args:
            urls: List of URLs to crawl
            config: Configuration object controlling crawl behavior for all URLs
            [other parameters maintained for backwards compatibility]

        Returns:
            List[CrawlResult]: Results for each URL
        """
        crawler_config = config
        # Handle configuration: an explicit config object takes precedence over legacy
        # parameters (compare against defaults so the warning only fires when a legacy
        # value was actually supplied)
        if crawler_config is not None:
            if any([
                word_count_threshold != MIN_WORD_THRESHOLD,
                extraction_strategy is not None,
                not isinstance(chunking_strategy, RegexChunking),
                content_filter is not None,
                cache_mode is not None,
                css_selector is not None,
                screenshot,
                pdf,
            ]):
                self.logger.warning(
                    message="Both config and legacy parameters provided. The config object will take precedence.",
                    tag="WARNING"
                )
            config = crawler_config
        else:
            # Build a config object from the legacy keyword arguments
            config_kwargs = {
                "word_count_threshold": word_count_threshold,
                "extraction_strategy": extraction_strategy,
                "chunking_strategy": chunking_strategy,
                "content_filter": content_filter,
                "cache_mode": cache_mode,
                "bypass_cache": bypass_cache,
                "css_selector": css_selector,
                "screenshot": screenshot,
                "pdf": pdf,
                "verbose": verbose,
                **kwargs
            }
            config = CrawlerRunConfig.from_kwargs(config_kwargs)

        if bypass_cache:
            if kwargs.get("warning", True):
                warnings.warn(
                    "'bypass_cache' is deprecated and will be removed in version 0.5.0. "
                    "Use 'cache_mode=CacheMode.BYPASS' instead. "
                    "Pass warning=False to suppress this warning.",
                    DeprecationWarning,
                    stacklevel=2
                )
            if config.cache_mode is None:
                config.cache_mode = CacheMode.BYPASS

        # Limit the number of concurrent crawls
        semaphore_count = config.semaphore_count or 5
        semaphore = asyncio.Semaphore(semaphore_count)

        async def crawl_with_semaphore(url):
            # Track the domain so a polite delay can be applied between hits to the same host
            domain = urlparse(url).netloc
            current_time = time.time()

            self.logger.debug(
                message="Started task for {url:.50}...",
                tag="PARALLEL",
                params={"url": url}
            )

            # Get delay settings from config
            mean_delay = config.mean_delay
            max_range = config.max_range

            # Apply rate limiting if this domain was hit recently
            if domain in self._domain_last_hit:
                time_since_last = current_time - self._domain_last_hit[domain]
                if time_since_last < mean_delay:
                    delay = mean_delay + random.uniform(0, max_range)
                    await asyncio.sleep(delay)

            self._domain_last_hit[domain] = current_time

            async with semaphore:
                return await self.arun(
                    url,
                    config=config,
                    user_agent=user_agent
                )

        # Log the start of concurrent crawling
        self.logger.info(
            message="Starting concurrent crawling for {count} URLs...",
            tag="INIT",
            params={"count": len(urls)}
        )

        # Execute all crawl tasks concurrently
        start_time = time.perf_counter()
        tasks = [crawl_with_semaphore(url) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        end_time = time.perf_counter()

        # Log completion
        self.logger.success(
            message="Concurrent crawling completed for {count} URLs | Total time: {timing}",
            tag="COMPLETE",
            params={
                "count": len(urls),
                "timing": f"{end_time - start_time:.2f}s"
            },
            colors={
                "timing": Fore.YELLOW
            }
        )

        # Exceptions are returned as their string representation rather than raised
        return [result if not isinstance(result, Exception) else str(result) for result in results]

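    # Tuning note (illustrative, based on the config attributes read above): for example,
    # a CrawlerRunConfig with semaphore_count=3, mean_delay=1.0 and max_range=0.5 would
    # be expected to crawl at most 3 pages concurrently and wait roughly 1.0-1.5 s
    # between successive hits to the same domain.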
    async def aclear_cache(self):
        """Clear the cache database."""
        await async_db_manager.cleanup()

    async def aflush_cache(self):
        """Flush the cache database."""
        await async_db_manager.aflush_db()

    async def aget_cache_size(self):
        """Get the total number of cached items."""
        return await async_db_manager.aget_total_count()
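

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the library API).
# Assumes Playwright browsers are installed and network access is available;
# the target URL is just an example.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    async def _demo():
        browser_config = BrowserConfig(browser_type="chromium", headless=True)
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                url="https://example.com",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
            )
            # Print a short preview of the generated markdown, or the error if the crawl failed
            print(result.markdown[:500] if result.markdown else result.error_message)

    asyncio.run(_demo())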