# Source: openclaw-backups/skills/aidotnet-web-scraper/scripts/crawl_dynamic.py
# (file-viewer listing: 255 lines, 8.6 KiB, Python)

#!/usr/bin/env python3
"""Crawl JavaScript-rendered (dynamic) web pages using Crawl4AI.
Uses a headless Chromium browser to render pages that require JavaScript,
then extracts clean Markdown content. Use this when fetch_page.py returns
empty or incomplete content (SPAs, React/Vue apps, etc.).
Dependencies: pip install crawl4ai && crawl4ai-setup
"""
import sys
import argparse
import asyncio
def setup_encoding():
    """Switch stdout/stderr to UTF-8 on Windows; do nothing elsewhere.

    The streams are reconfigured in place when they support it. If that
    fails, both streams are replaced by line-buffered UTF-8 text wrappers
    around their underlying binary buffers. Characters that cannot be
    encoded are replaced rather than raising.
    """
    if sys.platform != "win32":
        return

    import io

    utf8_options = {"encoding": "utf-8", "errors": "replace"}
    try:
        sys.stdout.reconfigure(**utf8_options)
        sys.stderr.reconfigure(**utf8_options)
    except (AttributeError, io.UnsupportedOperation):
        # The stream objects cannot be reconfigured: wrap the raw buffers.
        sys.stdout = io.TextIOWrapper(
            sys.stdout.buffer, line_buffering=True, **utf8_options
        )
        sys.stderr = io.TextIOWrapper(
            sys.stderr.buffer, line_buffering=True, **utf8_options
        )
def check_dependencies():
    """Exit with status 1 and install instructions if crawl4ai is missing.

    Returns normally (``None``) when ``import crawl4ai`` succeeds. The
    instructions are written to stderr.
    """
    try:
        import crawl4ai  # noqa: F401
    except ImportError:
        install_help = (
            "Error: crawl4ai not installed.",
            "Install with:",
            " pip install crawl4ai",
            " crawl4ai-setup",
        )
        print("\n".join(install_help), file=sys.stderr)
        sys.exit(1)
async def crawl_page(url, wait_seconds=3, css_selector=None, scroll=False):
    """Render *url* in a headless browser via Crawl4AI and return Markdown.

    Args:
        url: URL of the page to render.
        wait_seconds: Extra delay, in seconds, applied before the HTML is
            captured. ``None`` or a value <= 0 disables the extra delay.
        css_selector: Optional CSS selector to wait for before extracting.
        scroll: When true, the crawl is delegated to ``crawl_with_scroll``
            so the page is scrolled before extraction. This flag used to be
            accepted and silently ignored.

    Returns:
        A ``(data, error)`` tuple. On success ``data`` is a dict with the
        keys ``title``, ``url``, ``markdown`` and ``status_code`` and
        ``error`` is ``None``. On failure ``data`` is ``None`` and ``error``
        is the crawler's error message (``"Unknown error"`` if it gave none).
    """
    if scroll:
        # Honour the flag instead of dropping it on the floor.
        return await crawl_with_scroll(url, wait_seconds, css_selector)

    # Imported here so that importing this module does not require crawl4ai.
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
    from crawl4ai.content_filter_strategy import PruningContentFilter
    from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

    browser_conf = BrowserConfig(
        headless=True,
        verbose=False,
    )
    md_generator = DefaultMarkdownGenerator(
        content_filter=PruningContentFilter(threshold=0.4, threshold_type="fixed")
    )
    run_conf = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=md_generator,
        page_timeout=60000,  # 60s
        wait_until="networkidle",
    )

    # Optional extra settle time; tolerate None as "no extra delay".
    if wait_seconds is not None and wait_seconds > 0:
        run_conf.delay_before_return_html = wait_seconds

    # Optional wait for a specific element to appear.
    if css_selector:
        run_conf.wait_for = f"css:{css_selector}"

    async with AsyncWebCrawler(config=browser_conf) as crawler:
        result = await crawler.arun(url=url, config=run_conf)

    if not result.success:
        return None, result.error_message or "Unknown error"

    # Prefer the filtered ("fit") Markdown, then the raw Markdown, and
    # finally a plain string result.
    markdown_result = result.markdown
    md = ""
    if markdown_result:
        md = (
            getattr(markdown_result, "fit_markdown", None)
            or getattr(markdown_result, "raw_markdown", None)
            or (markdown_result if isinstance(markdown_result, str) else "")
        )

    # A missing metadata dict, a missing key, or a None title all become "".
    metadata = getattr(result, "metadata", None) or {}
    title = metadata.get("title") or ""

    return {
        "title": title,
        "url": result.url or url,
        "markdown": md,
        "status_code": getattr(result, "status_code", None),
    }, None
async def crawl_with_scroll(url, wait_seconds=3, css_selector=None):
    """Render *url*, scroll it to the bottom, and return Markdown.

    Works like a normal dynamic crawl, but first runs a JavaScript snippet
    in the page that scrolls down in steps so lazily loaded content has a
    chance to appear.

    Args:
        url: URL of the page to render.
        wait_seconds: Extra delay, in seconds, applied before the HTML is
            captured. ``None`` or a value <= 0 disables the extra delay.
        css_selector: Optional CSS selector to wait for before extracting.

    Returns:
        A ``(data, error)`` tuple. On success ``data`` is a dict with the
        keys ``title``, ``url``, ``markdown`` and ``status_code`` and
        ``error`` is ``None``. On failure ``data`` is ``None`` and ``error``
        is the crawler's error message (``"Unknown error"`` if it gave none).
    """
    # Imported here so that importing this module does not require crawl4ai.
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
    from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
    from crawl4ai.content_filter_strategy import PruningContentFilter

    browser_conf = BrowserConfig(
        headless=True,
        verbose=False,
    )

    # Scroll 500px every 300ms until the accumulated distance reaches the
    # (re-measured) page height, or give up after 15 seconds.
    #
    # The snippet is a top-level awaited statement on purpose. The previous
    # version was a bare `async () => {...}` arrow function that nothing
    # ever called, so no scrolling took place.
    # NOTE(review): relies on Crawl4AI running js_code inside an async
    # wrapper (so top-level `await` is legal) - confirm against the
    # installed Crawl4AI version.
    scroll_js = """
    await new Promise((resolve) => {
        let totalHeight = 0;
        const distance = 500;
        const timer = setInterval(() => {
            const scrollHeight = document.body.scrollHeight;
            window.scrollBy(0, distance);
            totalHeight += distance;
            if (totalHeight >= scrollHeight) {
                clearInterval(timer);
                resolve();
            }
        }, 300);
        // Safety timeout
        setTimeout(() => { clearInterval(timer); resolve(); }, 15000);
    });
    """

    md_generator = DefaultMarkdownGenerator(
        content_filter=PruningContentFilter(threshold=0.4, threshold_type="fixed")
    )
    run_conf = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=md_generator,
        page_timeout=60000,  # 60s
        js_code=scroll_js,
        wait_until="networkidle",
    )

    # Optional extra settle time; tolerate None as "no extra delay".
    if wait_seconds is not None and wait_seconds > 0:
        run_conf.delay_before_return_html = wait_seconds

    # Optional wait for a specific element to appear.
    if css_selector:
        run_conf.wait_for = f"css:{css_selector}"

    async with AsyncWebCrawler(config=browser_conf) as crawler:
        result = await crawler.arun(url=url, config=run_conf)

    if not result.success:
        return None, result.error_message or "Unknown error"

    # Prefer the filtered ("fit") Markdown, then the raw Markdown, and
    # finally a plain string result.
    markdown_result = result.markdown
    md = ""
    if markdown_result:
        md = (
            getattr(markdown_result, "fit_markdown", None)
            or getattr(markdown_result, "raw_markdown", None)
            or (markdown_result if isinstance(markdown_result, str) else "")
        )

    # A missing metadata dict, a missing key, or a None title all become "".
    metadata = getattr(result, "metadata", None) or {}
    title = metadata.get("title") or ""

    return {
        "title": title,
        "url": result.url or url,
        "markdown": md,
        "status_code": getattr(result, "status_code", None),
    }, None
def main():
    """Command-line entry point: crawl one URL and print it as Markdown.

    The rendered document (title heading, source URL, status code, a
    separator and the page Markdown) goes to stdout; progress and error
    messages go to stderr.

    Exit status:
        0 - success, or the crawl succeeded but produced no content.
        1 - crawl4ai is not installed, or the crawl failed.
        2 - invalid command-line arguments (reported by argparse).
    """
    setup_encoding()
    check_dependencies()

    parser = argparse.ArgumentParser(
        description="Crawl JavaScript-rendered pages with headless browser"
    )
    parser.add_argument("url", help="URL to crawl")
    parser.add_argument("--wait", type=int, default=3,
                        help="Seconds to wait after page load (default: 3)")
    parser.add_argument("--selector", type=str, default=None,
                        help="CSS selector to wait for before extracting")
    parser.add_argument("--scroll", action="store_true",
                        help="Scroll to bottom to trigger lazy loading")
    parser.add_argument("--save", type=str, default=None,
                        help="Also save output to this file path")
    parser.add_argument("--max-length", type=int, default=None,
                        help="Truncate output to N characters")
    args = parser.parse_args()

    # A negative limit would slice characters off the END of the output and
    # print a meaningless "truncated at -N" note, so reject it up front.
    if args.max_length is not None and args.max_length < 0:
        parser.error("--max-length must be a non-negative integer")

    url = args.url.strip()
    # URL schemes are case-insensitive: "HTTP://host" already has a scheme
    # and must not be turned into "https://HTTP://host".
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    print(f"Crawling (dynamic): {url}", file=sys.stderr)
    print(f"Options: wait={args.wait}s, selector={args.selector}, scroll={args.scroll}", file=sys.stderr)

    # Pick the crawl routine, then run it to completion on a fresh event loop.
    crawl = crawl_with_scroll if args.scroll else crawl_page
    data, error = asyncio.run(crawl(url, args.wait, args.selector))

    if error:
        print(f"Error: crawl failed: {error}", file=sys.stderr)
        sys.exit(1)

    if not data or not data["markdown"]:
        # An empty page is reported but is not treated as a failure.
        print("Warning: no content extracted from page", file=sys.stderr)
        print("[No content could be extracted from this page]")
        sys.exit(0)

    # Build output
    parts = []
    if data["title"]:
        parts.append(f"# {data['title']}\n")
    parts.append(f"**Source**: {data['url']}")
    if data.get("status_code"):
        parts.append(f"**Status**: {data['status_code']}")
    parts.append("\n---\n")
    parts.append(data["markdown"])
    output = "\n".join(parts)

    # Truncate if requested (a limit of 0 means "no truncation"). The note
    # reports the length of the full, untruncated output.
    if args.max_length and len(output) > args.max_length:
        output = output[:args.max_length] + f"\n\n[... truncated at {args.max_length} characters, total {len(output)}]"

    print(output)
    content_len = len(data["markdown"])
    print(f"\nExtracted: {content_len} characters (dynamic crawl)", file=sys.stderr)

    # Saving is best-effort: the output has already been printed, so a
    # failure here is reported without changing the exit status.
    if args.save:
        try:
            with open(args.save, "w", encoding="utf-8") as f:
                f.write(output)
            print(f"Saved to: {args.save}", file=sys.stderr)
        except Exception as e:
            print(f"Error saving file: {e}", file=sys.stderr)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()