import os, sys
import time
import warnings
from enum import Enum
from colorama import init, Fore, Back, Style
from pathlib import Path
from typing import Optional, List, Union
import json
import asyncio

from contextlib import asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult
from .async_database import async_db_manager
from .chunking_strategy import *
from .content_filter_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
from .content_scraping_strategy import WebScrapingStrategy
from .async_logger import AsyncLogger
from .async_configs import BrowserConfig, CrawlerRunConfig
from .config import (
    MIN_WORD_THRESHOLD,
    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
    URL_LOG_SHORTEN_LENGTH
)
from .utils import (
    sanitize_input_encode,
    InvalidCSSSelectorError,
    format_html,
    fast_format_html,
    create_box_message,
    get_error_context
)

from urllib.parse import urlparse
import random
from .__version__ import __version__ as crawl4ai_version


class AsyncWebCrawler:
    """
    Asynchronous web crawler with flexible caching capabilities.

    There are two ways to use the crawler:

    1. Using context manager (recommended for simple cases):
        ```python
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com")
        ```

    2. Using explicit lifecycle management (recommended for long-running applications):
        ```python
        crawler = AsyncWebCrawler()
        await crawler.start()

        # Use the crawler multiple times
        result1 = await crawler.arun(url="https://example.com")
        result2 = await crawler.arun(url="https://another.com")

        await crawler.close()
        ```

    Migration Guide:
        Old way (deprecated):
            crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True)

        New way (recommended):
            browser_config = BrowserConfig(browser_type="chromium", headless=True)
            crawler = AsyncWebCrawler(config=browser_config)

    Attributes:
        browser_config (BrowserConfig): Configuration object for browser settings.
        crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
        logger (AsyncLogger): Logger instance for recording events and errors.
        always_bypass_cache (bool): Whether to always bypass cache.
        crawl4ai_folder (str): Directory for storing cache.
        base_directory (str): Base directory for storing cache.
        ready (bool): Whether the crawler is ready for use.

    Methods:
        start(): Start the crawler explicitly without using context manager.
        close(): Close the crawler explicitly without using context manager.
        arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
        awarmup(): Perform warmup sequence.
        arun_many(): Run the crawler for multiple sources.
        aprocess_html(): Process HTML content.

    Typical Usage:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com")
            print(result.markdown)

    Using configuration:
        browser_config = BrowserConfig(browser_type="chromium", headless=True)
        async with AsyncWebCrawler(config=browser_config) as crawler:
            crawler_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS
            )
            result = await crawler.arun(url="https://example.com", config=crawler_config)
            print(result.markdown)
    """
    _domain_last_hit = {}

    def __init__(
        self,
        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
        config: Optional[BrowserConfig] = None,
        always_bypass_cache: bool = False,
        always_by_pass_cache: Optional[bool] = None,
        base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
        thread_safe: bool = False,
        **kwargs,
    ):
        """
        Initialize the AsyncWebCrawler.

        Args:
            crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy
            config: Configuration object for browser settings. If None, will be created from kwargs
            always_bypass_cache: Whether to always bypass cache (new parameter)
            always_by_pass_cache: Deprecated, use always_bypass_cache instead
            base_directory: Base directory for storing cache
            thread_safe: Whether to use thread-safe operations
            **kwargs: Additional arguments for backwards compatibility
        """
        # Handle browser configuration: an explicit config object takes precedence
        browser_config = config
        if browser_config is not None:
            if any(k in kwargs for k in ["browser_type", "headless", "viewport_width", "viewport_height"]):
                # self.logger does not exist yet at this point, so use the warnings module
                warnings.warn(
                    "Both browser_config and legacy browser parameters provided. "
                    "browser_config will take precedence.",
                    stacklevel=2,
                )
        else:
            # Create browser config from kwargs for backwards compatibility
            browser_config = BrowserConfig.from_kwargs(kwargs)

        self.browser_config = browser_config

        # Initialize logger first since other components may need it
        self.logger = AsyncLogger(
            log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
            verbose=self.browser_config.verbose,
            tag_width=10
        )

        # Initialize crawler strategy, forwarding only the strategy-related kwargs
        params = {
            k: v for k, v in kwargs.items() if k in ["browser_config", "logger"]
        }
        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
            browser_config=browser_config,
            logger=self.logger,
            **params
        )

        # If the crawler strategy doesn't have a logger, attach the crawler's logger
        if not self.crawler_strategy.logger:
            self.crawler_strategy.logger = self.logger

        # Handle deprecated cache parameter
        if always_by_pass_cache is not None:
            if kwargs.get("warning", True):
                warnings.warn(
                    "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. "
                    "Use 'always_bypass_cache' instead. "
                    "Pass warning=False to suppress this warning.",
                    DeprecationWarning,
                    stacklevel=2
                )
            self.always_bypass_cache = always_by_pass_cache
        else:
            self.always_bypass_cache = always_bypass_cache

        # Thread safety setup
        self._lock = asyncio.Lock() if thread_safe else None

        # Set up cache directories
        self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
        os.makedirs(self.crawl4ai_folder, exist_ok=True)
        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)

        self.ready = False

    async def start(self):
        """
        Start the crawler explicitly without using context manager.
        This is equivalent to using 'async with' but gives more control over the lifecycle.

        This method will:
        1. Initialize the browser and context
        2. Perform warmup sequence
        3. Return the crawler instance for method chaining

        Returns:
            AsyncWebCrawler: The initialized crawler instance
        """
        await self.crawler_strategy.__aenter__()
        await self.awarmup()
        return self

    async def close(self):
        """
        Close the crawler explicitly without using context manager.
        This should be called when you're done with the crawler if you used start().

        This method will:
        1. Clean up browser resources
        2. Close any open pages and contexts
        """
        await self.crawler_strategy.__aexit__(None, None, None)

    async def __aenter__(self):
        return await self.start()

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def awarmup(self):
        """
        Initialize the crawler with warm-up sequence.

        This method:
        1. Logs initialization info
        2. Marks the crawler as ready
        """
        self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
        self.ready = True

    @asynccontextmanager
    async def nullcontext(self):
        """Async null context manager."""
        yield

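    # Note: arun() wraps its body in "async with self._lock or self.nullcontext()".
    # When the crawler is created with thread_safe=True, self._lock is an asyncio.Lock
    # and calls are serialised; otherwise nullcontext() above makes the guard a no-op
    # so multiple crawls may run concurrently.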
    async def arun(
        self,
        url: str,
        config: Optional[CrawlerRunConfig] = None,
        # Legacy parameters maintained for backwards compatibility
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        content_filter: RelevantContentFilter = None,
        cache_mode: Optional[CacheMode] = None,
        # Deprecated cache parameters
        bypass_cache: bool = False,
        disable_cache: bool = False,
        no_cache_read: bool = False,
        no_cache_write: bool = False,
        # Other legacy parameters
        css_selector: str = None,
        screenshot: bool = False,
        pdf: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    ) -> CrawlResult:
        """
        Runs the crawler for a single source: URL (web, local file, or raw HTML).

        Migration Guide:
            Old way (deprecated):
                result = await crawler.arun(
                    url="https://example.com",
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )

            New way (recommended):
                config = CrawlerRunConfig(
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
                result = await crawler.arun(url="https://example.com", config=config)

        Args:
            url: The URL to crawl (http://, https://, file://, or raw:)
            config: Configuration object controlling crawl behavior
            [other parameters maintained for backwards compatibility]

        Returns:
            CrawlResult: The result of crawling and processing
        """
        crawler_config = config
        if not isinstance(url, str) or not url:
            raise ValueError("Invalid URL, make sure the URL is a non-empty string")

        async with self._lock or self.nullcontext():
            try:
                # Handle configuration: an explicit config object takes precedence
                if crawler_config is not None:
                    config = crawler_config
                else:
                    # Build a config object from the legacy keyword arguments
                    config_kwargs = {
                        "word_count_threshold": word_count_threshold,
                        "extraction_strategy": extraction_strategy,
                        "chunking_strategy": chunking_strategy,
                        "content_filter": content_filter,
                        "cache_mode": cache_mode,
                        "bypass_cache": bypass_cache,
                        "disable_cache": disable_cache,
                        "no_cache_read": no_cache_read,
                        "no_cache_write": no_cache_write,
                        "css_selector": css_selector,
                        "screenshot": screenshot,
                        "pdf": pdf,
                        "verbose": verbose,
                        **kwargs
                    }
                    config = CrawlerRunConfig.from_kwargs(config_kwargs)

                # Handle deprecated cache parameters
                if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
                    if kwargs.get("warning", True):
                        warnings.warn(
                            "Cache control boolean flags are deprecated and will be removed in version 0.5.0. "
                            "Use 'cache_mode' parameter instead.",
                            DeprecationWarning,
                            stacklevel=2
                        )

                    # Convert legacy parameters if cache_mode not provided
                    if config.cache_mode is None:
                        config.cache_mode = _legacy_to_cache_mode(
                            disable_cache=disable_cache,
                            bypass_cache=bypass_cache,
                            no_cache_read=no_cache_read,
                            no_cache_write=no_cache_write
                        )

                # Default to ENABLED if no cache mode was specified
                if config.cache_mode is None:
                    config.cache_mode = CacheMode.ENABLED

                # Create cache context
                cache_context = CacheContext(url, config.cache_mode, self.always_bypass_cache)

                # Initialize processing variables
                async_response: AsyncCrawlResponse = None
                cached_result: CrawlResult = None
                screenshot_data = None
                pdf_data = None
                extracted_content = None
                start_time = time.perf_counter()

                # Try to get cached result if appropriate
                if cache_context.should_read():
                    cached_result = await async_db_manager.aget_cached_url(url)

                if cached_result:
                    html = sanitize_input_encode(cached_result.html)
                    extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
                    extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content

                    screenshot_data = cached_result.screenshot
                    pdf_data = cached_result.pdf
                    # If a screenshot or PDF is requested but missing from the cache,
                    # discard the cached result and fetch fresh content
                    if (config.screenshot and not screenshot_data) or (config.pdf and not pdf_data):
                        cached_result = None

                    self.logger.url_status(
                        url=cache_context.display_url,
                        success=bool(html),
                        timing=time.perf_counter() - start_time,
                        tag="FETCH"
                    )

                # Fetch fresh content if needed
                if not cached_result or not html:
                    t1 = time.perf_counter()

                    if user_agent:
                        self.crawler_strategy.update_user_agent(user_agent)

                    # Pass the entire config object to the crawl strategy
                    async_response = await self.crawler_strategy.crawl(
                        url,
                        config=config
                    )

                    html = sanitize_input_encode(async_response.html)
                    screenshot_data = async_response.screenshot
                    pdf_data = async_response.pdf_data

                    t2 = time.perf_counter()
                    self.logger.url_status(
                        url=cache_context.display_url,
                        success=bool(html),
                        timing=t2 - t1,
                        tag="FETCH"
                    )

                    # Process the HTML content
                    crawl_result = await self.aprocess_html(
                        url=url,
                        html=html,
                        extracted_content=extracted_content,
                        config=config,
                        screenshot=screenshot_data,
                        pdf_data=pdf_data,
                        verbose=config.verbose,
                        is_raw_html=True if url.startswith("raw:") else False,
                        **kwargs
                    )

                    crawl_result.status_code = async_response.status_code
                    crawl_result.response_headers = async_response.response_headers
                    crawl_result.downloaded_files = async_response.downloaded_files
                    crawl_result.ssl_certificate = async_response.ssl_certificate

                    crawl_result.success = bool(html)
                    crawl_result.session_id = getattr(config, 'session_id', None)

                    self.logger.success(
                        message="{url:.50}... | Status: {status} | Total: {timing}",
                        tag="COMPLETE",
                        params={
                            "url": cache_context.display_url,
                            "status": crawl_result.success,
                            "timing": f"{time.perf_counter() - start_time:.2f}s"
                        },
                        colors={
                            "status": Fore.GREEN if crawl_result.success else Fore.RED,
                            "timing": Fore.YELLOW
                        }
                    )

                    # Update cache if appropriate
                    if cache_context.should_write() and not bool(cached_result):
                        await async_db_manager.acache_url(crawl_result)

                    return crawl_result

                else:
                    # Serve the cached result
                    self.logger.success(
                        message="{url:.50}... | Status: {status} | Total: {timing}",
                        tag="COMPLETE",
                        params={
                            "url": cache_context.display_url,
                            "status": True,
                            "timing": f"{time.perf_counter() - start_time:.2f}s"
                        },
                        colors={
                            "status": Fore.GREEN,
                            "timing": Fore.YELLOW
                        }
                    )

                    cached_result.success = bool(html)
                    cached_result.session_id = getattr(config, 'session_id', None)
                    return cached_result

            except Exception as e:
                error_context = get_error_context(sys.exc_info())

                error_message = (
                    f"Unexpected error in arun at line {error_context['line_no']} "
                    f"in {error_context['function']} ({error_context['filename']}):\n"
                    f"Error: {str(e)}\n\n"
                    f"Code context:\n{error_context['code_context']}"
                )

                self.logger.error_status(
                    url=url,
                    error=create_box_message(error_message, type="error"),
                    tag="ERROR"
                )

                return CrawlResult(
                    url=url,
                    html="",
                    success=False,
                    error_message=error_message
                )

    async def aprocess_html(
        self,
        url: str,
        html: str,
        extracted_content: str,
        config: CrawlerRunConfig,
        screenshot: str,
        pdf_data: str,
        verbose: bool,
        **kwargs,
    ) -> CrawlResult:
        """
        Process HTML content using the provided configuration.

        Args:
            url: The URL being processed
            html: Raw HTML content
            extracted_content: Previously extracted content (if any)
            config: Configuration object controlling processing behavior
            screenshot: Screenshot data (if any)
            pdf_data: PDF data (if any)
            verbose: Whether to enable verbose logging
            **kwargs: Additional parameters for backwards compatibility

        Returns:
            CrawlResult: Processed result containing extracted and formatted content
        """
        try:
            _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
            t1 = time.perf_counter()

            # Initialize the scraping strategy
            scraping_strategy = WebScrapingStrategy(logger=self.logger)

            # Build scraping parameters from the config, then add any extra kwargs
            # that are not already present
            params = {k: v for k, v in config.to_dict().items() if k not in ["url"]}
            params.update({k: v for k, v in kwargs.items() if k not in params.keys()})

            result = scraping_strategy.scrap(
                url,
                html,
                **params,
            )

            if result is None:
                raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")

        except InvalidCSSSelectorError as e:
            raise ValueError(str(e))
        except Exception as e:
            raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")

        # Extract results from the scraping strategy output
        cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
        fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
        fit_html = sanitize_input_encode(result.get("fit_html", ""))
        media = result.get("media", [])
        links = result.get("links", [])
        metadata = result.get("metadata", {})

        # Generate markdown from the cleaned HTML
        markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()

        markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
            cleaned_html=cleaned_html,
            base_url=url,
        )
        markdown_v2 = markdown_result
        markdown = sanitize_input_encode(markdown_result.raw_markdown)

        # Log scraping completion
        self.logger.info(
            message="Processed {url:.50}... | Time: {timing}ms",
            tag="SCRAPE",
            params={
                "url": _url,
                "timing": int((time.perf_counter() - t1) * 1000)
            }
        )

        # Handle content extraction if needed
        if (extracted_content is None and
                config.extraction_strategy and
                config.chunking_strategy and
                not isinstance(config.extraction_strategy, NoExtractionStrategy)):

            t1 = time.perf_counter()

            # Choose the content format expected by the extraction strategy
            content_format = config.extraction_strategy.input_format
            if content_format == "fit_markdown" and not markdown_result.fit_markdown:
                self.logger.warning(
                    message="Fit markdown requested but not available. Falling back to raw markdown.",
                    tag="EXTRACT",
                    params={"url": _url}
                )
                content_format = "markdown"

            content = {
                "markdown": markdown,
                "html": html,
                "fit_markdown": markdown_result.fit_markdown
            }.get(content_format, markdown)

            # Use IdentityChunking for HTML input, otherwise the configured chunking strategy
            chunking = IdentityChunking() if content_format == "html" else config.chunking_strategy
            sections = chunking.chunk(content)
            extracted_content = config.extraction_strategy.run(url, sections)
            extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)

            # Log extraction completion
            self.logger.info(
                message="Completed for {url:.50}... | Time: {timing}s",
                tag="EXTRACT",
                params={
                    "url": _url,
                    "timing": time.perf_counter() - t1
                }
            )

        # Handle screenshot and PDF data
        screenshot_data = None if not screenshot else screenshot
        pdf_data = None if not pdf_data else pdf_data

        # Apply HTML formatting if requested
        if config.prettiify:
            cleaned_html = fast_format_html(cleaned_html)

        # Return the complete crawl result
        return CrawlResult(
            url=url,
            html=html,
            cleaned_html=cleaned_html,
            markdown_v2=markdown_v2,
            markdown=markdown,
            fit_markdown=fit_markdown,
            fit_html=fit_html,
            media=media,
            links=links,
            metadata=metadata,
            screenshot=screenshot_data,
            pdf=pdf_data,
            extracted_content=extracted_content,
            success=True,
            error_message="",
        )

    async def arun_many(
        self,
        urls: List[str],
        config: Optional[CrawlerRunConfig] = None,
        # Legacy parameters maintained for backwards compatibility
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        content_filter: RelevantContentFilter = None,
        cache_mode: Optional[CacheMode] = None,
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        pdf: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    ) -> List[CrawlResult]:
        """
        Runs the crawler for multiple URLs concurrently.

        Migration Guide:
            Old way (deprecated):
                results = await crawler.arun_many(
                    urls,
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )

            New way (recommended):
                config = CrawlerRunConfig(
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
                results = await crawler.arun_many(urls, config=config)

        Args:
            urls: List of URLs to crawl
            config: Configuration object controlling crawl behavior for all URLs
            [other parameters maintained for backwards compatibility]

        Returns:
            List[CrawlResult]: Results for each URL
        """
        crawler_config = config
        # Handle configuration: an explicit config object takes precedence over legacy
        # parameters (compare against defaults so the warning only fires when a legacy
        # value was actually supplied)
        if crawler_config is not None:
            if any([
                word_count_threshold != MIN_WORD_THRESHOLD,
                extraction_strategy is not None,
                not isinstance(chunking_strategy, RegexChunking),
                content_filter is not None,
                cache_mode is not None,
                css_selector is not None,
                screenshot,
                pdf,
            ]):
                self.logger.warning(
                    message="Both config and legacy parameters provided. The config object will take precedence.",
                    tag="WARNING"
                )
            config = crawler_config
        else:
            # Build a config object from the legacy keyword arguments
            config_kwargs = {
                "word_count_threshold": word_count_threshold,
                "extraction_strategy": extraction_strategy,
                "chunking_strategy": chunking_strategy,
                "content_filter": content_filter,
                "cache_mode": cache_mode,
                "bypass_cache": bypass_cache,
                "css_selector": css_selector,
                "screenshot": screenshot,
                "pdf": pdf,
                "verbose": verbose,
                **kwargs
            }
            config = CrawlerRunConfig.from_kwargs(config_kwargs)

        if bypass_cache:
            if kwargs.get("warning", True):
                warnings.warn(
                    "'bypass_cache' is deprecated and will be removed in version 0.5.0. "
                    "Use 'cache_mode=CacheMode.BYPASS' instead. "
                    "Pass warning=False to suppress this warning.",
                    DeprecationWarning,
                    stacklevel=2
                )
            if config.cache_mode is None:
                config.cache_mode = CacheMode.BYPASS

        # Limit the number of concurrent crawls
        semaphore_count = config.semaphore_count or 5
        semaphore = asyncio.Semaphore(semaphore_count)

        async def crawl_with_semaphore(url):
            # Track the domain so a polite delay can be applied between hits to the same host
            domain = urlparse(url).netloc
            current_time = time.time()

            self.logger.debug(
                message="Started task for {url:.50}...",
                tag="PARALLEL",
                params={"url": url}
            )

            # Get delay settings from config
            mean_delay = config.mean_delay
            max_range = config.max_range

            # Apply rate limiting if this domain was hit recently
            if domain in self._domain_last_hit:
                time_since_last = current_time - self._domain_last_hit[domain]
                if time_since_last < mean_delay:
                    delay = mean_delay + random.uniform(0, max_range)
                    await asyncio.sleep(delay)

            self._domain_last_hit[domain] = current_time

            async with semaphore:
                return await self.arun(
                    url,
                    config=config,
                    user_agent=user_agent
                )

        # Log the start of concurrent crawling
        self.logger.info(
            message="Starting concurrent crawling for {count} URLs...",
            tag="INIT",
            params={"count": len(urls)}
        )

        # Execute all crawl tasks concurrently
        start_time = time.perf_counter()
        tasks = [crawl_with_semaphore(url) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        end_time = time.perf_counter()

        # Log completion
        self.logger.success(
            message="Concurrent crawling completed for {count} URLs | Total time: {timing}",
            tag="COMPLETE",
            params={
                "count": len(urls),
                "timing": f"{end_time - start_time:.2f}s"
            },
            colors={
                "timing": Fore.YELLOW
            }
        )

        # Exceptions are returned as their string representation rather than raised
        return [result if not isinstance(result, Exception) else str(result) for result in results]

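    # Tuning note (illustrative, based on the config attributes read above): for example,
    # a CrawlerRunConfig with semaphore_count=3, mean_delay=1.0 and max_range=0.5 would
    # be expected to crawl at most 3 pages concurrently and wait roughly 1.0-1.5 s
    # between successive hits to the same domain.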
    async def aclear_cache(self):
        """Clear the cache database."""
        await async_db_manager.cleanup()

    async def aflush_cache(self):
        """Flush the cache database."""
        await async_db_manager.aflush_db()

    async def aget_cache_size(self):
        """Get the total number of cached items."""
        return await async_db_manager.aget_total_count()
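

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the library API).
# Assumes Playwright browsers are installed and network access is available;
# the target URL is just an example.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    async def _demo():
        browser_config = BrowserConfig(browser_type="chromium", headless=True)
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                url="https://example.com",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
            )
            # Print a short preview of the generated markdown, or the error if the crawl failed
            print(result.markdown[:500] if result.markdown else result.error_message)

    asyncio.run(_demo())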