| | """ |
| | Crawl4AI v0.4.24 Feature Walkthrough |
| | =================================== |
| | |
| | This script demonstrates the new features introduced in Crawl4AI v0.4.24. |
| | Each section includes detailed examples and explanations of the new capabilities. |
| | """ |
| |
|
| | import asyncio |
| | import os |
| | import json |
| | import re |
| | from typing import List, Optional, Dict, Any |
| | from pydantic import BaseModel, Field |
| | from crawl4ai import ( |
| | AsyncWebCrawler, |
| | BrowserConfig, |
| | CrawlerRunConfig, |
| | CacheMode, |
| | LLMExtractionStrategy, |
| | JsonCssExtractionStrategy |
| | ) |
| | from crawl4ai.content_filter_strategy import RelevantContentFilter |
| | from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator |
| | from bs4 import BeautifulSoup |
| |
|
| | |
| | SAMPLE_HTML = """ |
| | <div class="article-list"> |
| | <article class="post" data-category="tech" data-author="john"> |
| | <h2 class="title"><a href="/post-1">First Post</a></h2> |
| | <div class="meta"> |
| | <a href="/author/john" class="author">John Doe</a> |
| | <span class="date">2023-12-31</span> |
| | </div> |
| | <div class="content"> |
| | <p>First post content...</p> |
| | <a href="/read-more-1" class="read-more">Read More</a> |
| | </div> |
| | </article> |
| | <article class="post" data-category="science" data-author="jane"> |
| | <h2 class="title"><a href="/post-2">Second Post</a></h2> |
| | <div class="meta"> |
| | <a href="/author/jane" class="author">Jane Smith</a> |
| | <span class="date">2023-12-30</span> |
| | </div> |
| | <div class="content"> |
| | <p>Second post content...</p> |
| | <a href="/read-more-2" class="read-more">Read More</a> |
| | </div> |
| | </article> |
| | </div> |
| | """ |
| |
|
| | async def demo_ssl_features(): |
| | """ |
| | Enhanced SSL & Security Features Demo |
| | ----------------------------------- |
| | |
| | This example demonstrates the new SSL certificate handling and security features: |
| | 1. Custom certificate paths |
| | 2. SSL verification options |
| | 3. HTTPS error handling |
| | 4. Certificate validation configurations |
| | |
| | These features are particularly useful when: |
| | - Working with self-signed certificates |
| | - Dealing with corporate proxies |
| | - Handling mixed content websites |
| | - Managing different SSL security levels |
| | """ |
| | print("\n1. Enhanced SSL & Security Demo") |
| | print("--------------------------------") |
| |
|
| | browser_config = BrowserConfig() |
| |
|
| | run_config = CrawlerRunConfig( |
| | cache_mode=CacheMode.BYPASS, |
| | fetch_ssl_certificate=True |
| | ) |
| |
|
| | async with AsyncWebCrawler(config=browser_config) as crawler: |
| | result = await crawler.arun( |
| | url="https://example.com", |
| | config=run_config |
| | ) |
| | print(f"SSL Crawl Success: {result.success}") |
| | result.ssl_certificate.to_json( |
| | os.path.join(os.getcwd(), "ssl_certificate.json") |
| | ) |
| | if not result.success: |
| | print(f"SSL Error: {result.error_message}") |
| |
|
| | async def demo_content_filtering(): |
| | """ |
| | Smart Content Filtering Demo |
| | ---------------------- |
| | |
| | Demonstrates advanced content filtering capabilities: |
| | 1. Custom filter to identify and extract specific content |
| | 2. Integration with markdown generation |
| | 3. Flexible pruning rules |
| | """ |
| | print("\n2. Smart Content Filtering Demo") |
| | print("--------------------------------") |
| |
|
| | |
| | class CustomNewsFilter(RelevantContentFilter): |
| | def __init__(self): |
| | super().__init__() |
| | |
| | self.negative_patterns = re.compile( |
| | r'nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending', |
| | re.I |
| | ) |
| | self.min_word_count = 30 |
| |
|
| | def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]: |
| | """ |
| | Implements news-specific content filtering logic. |
| | |
| | Args: |
| | html (str): HTML content to be filtered |
| | min_word_threshold (int, optional): Minimum word count threshold |
| | |
| | Returns: |
| | List[str]: List of filtered HTML content blocks |
| | """ |
| | if not html or not isinstance(html, str): |
| | return [] |
| | |
| | soup = BeautifulSoup(html, 'lxml') |
| | if not soup.body: |
| | soup = BeautifulSoup(f'<body>{html}</body>', 'lxml') |
| | |
| | body = soup.find('body') |
| | |
| | |
| | chunks = self.extract_text_chunks(body, min_word_threshold or self.min_word_count) |
| | |
| | |
| | filtered_chunks = [] |
| | for _, text, tag_type, element in chunks: |
| | |
| | if self.is_excluded(element): |
| | continue |
| | |
| | |
| | if tag_type == 'header': |
| | filtered_chunks.append(self.clean_element(element)) |
| | continue |
| | |
| | |
| | text = element.get_text(strip=True) |
| | if len(text.split()) >= (min_word_threshold or self.min_word_count): |
| | |
| | links_text = ' '.join(a.get_text(strip=True) for a in element.find_all('a')) |
| | link_density = len(links_text) / len(text) if text else 1 |
| | |
| | |
| | if link_density < 0.5: |
| | filtered_chunks.append(self.clean_element(element)) |
| | |
| | return filtered_chunks |
| |
|
| | |
| | markdown_gen = DefaultMarkdownGenerator( |
| | content_filter=CustomNewsFilter() |
| | ) |
| |
|
| | run_config = CrawlerRunConfig( |
| | markdown_generator=markdown_gen, |
| | cache_mode=CacheMode.BYPASS |
| | ) |
| |
|
| | async with AsyncWebCrawler() as crawler: |
| | result = await crawler.arun( |
| | url="https://news.ycombinator.com", |
| | config=run_config |
| | ) |
| | print("Filtered Content Sample:") |
| | print(result.markdown[:500]) |
| |
|
| | async def demo_json_extraction(): |
| | """ |
| | Improved JSON Extraction Demo |
| | --------------------------- |
| | |
| | Demonstrates the enhanced JSON extraction capabilities: |
| | 1. Base element attributes extraction |
| | 2. Complex nested structures |
| | 3. Multiple extraction patterns |
| | |
| | Key features shown: |
| | - Extracting attributes from base elements (href, data-* attributes) |
| | - Processing repeated patterns |
| | - Handling optional fields |
| | """ |
| | print("\n3. Improved JSON Extraction Demo") |
| | print("--------------------------------") |
| |
|
| | |
| | json_strategy = JsonCssExtractionStrategy( |
| | schema={ |
| | "name": "Blog Posts", |
| | "baseSelector": "div.article-list", |
| | "baseFields": [ |
| | {"name": "list_id", "type": "attribute", "attribute": "data-list-id"}, |
| | {"name": "category", "type": "attribute", "attribute": "data-category"} |
| | ], |
| | "fields": [ |
| | { |
| | "name": "posts", |
| | "selector": "article.post", |
| | "type": "nested_list", |
| | "baseFields": [ |
| | {"name": "post_id", "type": "attribute", "attribute": "data-post-id"}, |
| | {"name": "author_id", "type": "attribute", "attribute": "data-author"} |
| | ], |
| | "fields": [ |
| | { |
| | "name": "title", |
| | "selector": "h2.title a", |
| | "type": "text", |
| | "baseFields": [ |
| | {"name": "url", "type": "attribute", "attribute": "href"} |
| | ] |
| | }, |
| | { |
| | "name": "author", |
| | "selector": "div.meta a.author", |
| | "type": "text", |
| | "baseFields": [ |
| | {"name": "profile_url", "type": "attribute", "attribute": "href"} |
| | ] |
| | }, |
| | { |
| | "name": "date", |
| | "selector": "span.date", |
| | "type": "text" |
| | }, |
| | { |
| | "name": "read_more", |
| | "selector": "a.read-more", |
| | "type": "nested", |
| | "fields": [ |
| | {"name": "text", "type": "text"}, |
| | {"name": "url", "type": "attribute", "attribute": "href"} |
| | ] |
| | } |
| | ] |
| | } |
| | ] |
| | } |
| | ) |
| |
|
| | |
| | run_config = CrawlerRunConfig( |
| | extraction_strategy=json_strategy, |
| | cache_mode=CacheMode.BYPASS |
| | ) |
| |
|
| | async with AsyncWebCrawler() as crawler: |
| | result = await crawler.arun( |
| | url="raw:" + SAMPLE_HTML, |
| | config=run_config |
| | ) |
| | print("Extracted Content:") |
| | print(result.extracted_content) |
| |
|
| | async def demo_input_formats(): |
| | """ |
| | Input Format Handling Demo |
| | ---------------------- |
| | |
| | Demonstrates how LLM extraction can work with different input formats: |
| | 1. Markdown (default) - Good for simple text extraction |
| | 2. HTML - Better when you need structure and attributes |
| | |
| | This example shows how HTML input can be beneficial when: |
| | - You need to understand the DOM structure |
| | - You want to extract both visible text and HTML attributes |
| | - The content has complex layouts like tables or forms |
| | """ |
| | print("\n4. Input Format Handling Demo") |
| | print("---------------------------") |
| |
|
| | |
| | dummy_html = """ |
| | <div class="job-posting" data-post-id="12345"> |
| | <header class="job-header"> |
| | <h1 class="job-title">Senior AI/ML Engineer</h1> |
| | <div class="job-meta"> |
| | <span class="department">AI Research Division</span> |
| | <span class="location" data-remote="hybrid">San Francisco (Hybrid)</span> |
| | </div> |
| | <div class="salary-info" data-currency="USD"> |
| | <span class="range">$150,000 - $220,000</span> |
| | <span class="period">per year</span> |
| | </div> |
| | </header> |
| | |
| | <section class="requirements"> |
| | <div class="technical-skills"> |
| | <h3>Technical Requirements</h3> |
| | <ul class="required-skills"> |
| | <li class="skill required" data-priority="must-have"> |
| | 5+ years experience in Machine Learning |
| | </li> |
| | <li class="skill required" data-priority="must-have"> |
| | Proficiency in Python and PyTorch/TensorFlow |
| | </li> |
| | <li class="skill preferred" data-priority="nice-to-have"> |
| | Experience with distributed training systems |
| | </li> |
| | </ul> |
| | </div> |
| | |
| | <div class="soft-skills"> |
| | <h3>Professional Skills</h3> |
| | <ul class="required-skills"> |
| | <li class="skill required" data-priority="must-have"> |
| | Strong problem-solving abilities |
| | </li> |
| | <li class="skill preferred" data-priority="nice-to-have"> |
| | Experience leading technical teams |
| | </li> |
| | </ul> |
| | </div> |
| | </section> |
| | |
| | <section class="timeline"> |
| | <time class="deadline" datetime="2024-02-28"> |
| | Application Deadline: February 28, 2024 |
| | </time> |
| | </section> |
| | |
| | <footer class="contact-section"> |
| | <div class="hiring-manager"> |
| | <h4>Hiring Manager</h4> |
| | <div class="contact-info"> |
| | <span class="name">Dr. Sarah Chen</span> |
| | <span class="title">Director of AI Research</span> |
| | <span class="email">ai.hiring@example.com</span> |
| | </div> |
| | </div> |
| | <div class="team-info"> |
| | <p>Join our team of 50+ researchers working on cutting-edge AI applications</p> |
| | </div> |
| | </footer> |
| | </div> |
| | """ |
| | |
| | |
| | url = f"raw://{dummy_html}" |
| |
|
| | from pydantic import BaseModel, Field |
| | from typing import List, Optional |
| |
|
| | |
| | class JobRequirement(BaseModel): |
| | category: str = Field(description="Category of the requirement (e.g., Technical, Soft Skills)") |
| | items: List[str] = Field(description="List of specific requirements in this category") |
| | priority: str = Field(description="Priority level (Required/Preferred) based on the HTML class or context") |
| |
|
| | class JobPosting(BaseModel): |
| | title: str = Field(description="Job title") |
| | department: str = Field(description="Department or team") |
| | location: str = Field(description="Job location, including remote options") |
| | salary_range: Optional[str] = Field(description="Salary range if specified") |
| | requirements: List[JobRequirement] = Field(description="Categorized job requirements") |
| | application_deadline: Optional[str] = Field(description="Application deadline if specified") |
| | contact_info: Optional[dict] = Field(description="Contact information from footer or contact section") |
| |
|
| | |
| | markdown_strategy = LLMExtractionStrategy( |
| | provider="openai/gpt-4o", |
| | api_token=os.getenv("OPENAI_API_KEY"), |
| | schema=JobPosting.model_json_schema(), |
| | extraction_type="schema", |
| | instruction=""" |
| | Extract job posting details into structured data. Focus on the visible text content |
| | and organize requirements into categories. |
| | """, |
| | input_format="markdown" |
| | ) |
| |
|
| | |
| | html_strategy = LLMExtractionStrategy( |
| | provider="openai/gpt-4", |
| | api_token=os.getenv("OPENAI_API_KEY"), |
| | schema=JobPosting.model_json_schema(), |
| | extraction_type="schema", |
| | instruction=""" |
| | Extract job posting details, using HTML structure to: |
| | 1. Identify requirement priorities from CSS classes (e.g., 'required' vs 'preferred') |
| | 2. Extract contact info from the page footer or dedicated contact section |
| | 3. Parse salary information from specially formatted elements |
| | 4. Determine application deadline from timestamp or date elements |
| | |
| | Use HTML attributes and classes to enhance extraction accuracy. |
| | """, |
| | input_format="html" |
| | ) |
| |
|
| | async with AsyncWebCrawler() as crawler: |
| | |
| | markdown_config = CrawlerRunConfig( |
| | extraction_strategy=markdown_strategy |
| | ) |
| | markdown_result = await crawler.arun( |
| | url=url, |
| | config=markdown_config |
| | ) |
| | print("\nMarkdown-based Extraction Result:") |
| | items = json.loads(markdown_result.extracted_content) |
| | print(json.dumps(items, indent=2)) |
| |
|
| | |
| | html_config = CrawlerRunConfig( |
| | extraction_strategy=html_strategy |
| | ) |
| | html_result = await crawler.arun( |
| | url=url, |
| | config=html_config |
| | ) |
| | print("\nHTML-based Extraction Result:") |
| | items = json.loads(html_result.extracted_content) |
| | print(json.dumps(items, indent=2)) |
| |
|
| | |
| | async def main(): |
| | print("Crawl4AI v0.4.24 Feature Walkthrough") |
| | print("====================================") |
| |
|
| | |
| | await demo_ssl_features() |
| | await demo_content_filtering() |
| | await demo_json_extraction() |
| | |
| |
|
| | if __name__ == "__main__": |
| | asyncio.run(main()) |
| |
|