openclaw-backups/skills/aidotnet-web-scraper/scripts/fetch_page.py

#!/usr/bin/env python3
"""Fetch a web page and extract readable content as clean Markdown.

Uses requests + BeautifulSoup + readability-lxml + html2text for lightweight,
fast extraction without a headless browser. Works well for articles, docs,
blogs, wikis, and most static websites.

Dependencies: pip install requests beautifulsoup4 readability-lxml html2text
"""

import sys
import argparse


def setup_encoding():
    """Force UTF-8 console output on Windows; do nothing on other platforms.

    The standard streams are reconfigured in place when they support it.
    If either stream cannot be reconfigured, both ``sys.stdout`` and
    ``sys.stderr`` are replaced by line-buffered UTF-8 wrappers around their
    underlying binary buffers. Characters that cannot be encoded are replaced
    rather than raising.
    """
    if sys.platform != "win32":
        return

    import io

    utf8_options = {"encoding": "utf-8", "errors": "replace"}
    try:
        for stream in (sys.stdout, sys.stderr):
            stream.reconfigure(**utf8_options)
    except (AttributeError, io.UnsupportedOperation):
        # reconfigure() is missing or unsupported on this stream object:
        # fall back to wrapping the raw buffers of both streams.
        sys.stdout = io.TextIOWrapper(
            sys.stdout.buffer, line_buffering=True, **utf8_options
        )
        sys.stderr = io.TextIOWrapper(
            sys.stderr.buffer, line_buffering=True, **utf8_options
        )


def check_dependencies():
    """Verify that the third-party packages used by this script are importable.

    Returns normally when everything is available. Otherwise the pip names of
    the missing packages and an install hint are written to stderr and the
    process exits with status 1.
    """
    from importlib import import_module

    # (importable module, attribute it must expose or None, pip package name)
    requirements = (
        ("requests", None, "requests"),
        ("bs4", "BeautifulSoup", "beautifulsoup4"),
        ("readability", "Document", "readability-lxml"),
        ("html2text", None, "html2text"),
    )

    missing = []
    for module_name, attribute, package in requirements:
        try:
            module = import_module(module_name)
        except ImportError:
            missing.append(package)
            continue
        # A same-named but different distribution may import fine yet lack
        # the class this script needs; treat that as missing as well.
        if attribute is not None and not hasattr(module, attribute):
            missing.append(package)

    if not missing:
        return

    print(f"Error: missing dependencies: {', '.join(missing)}", file=sys.stderr)
    print(f"Install with: pip install {' '.join(missing)}", file=sys.stderr)
    sys.exit(1)


def fetch_url(url, timeout=30):
    """Fetch a URL with browser-like headers and return its decoded body.

    Args:
        url: Absolute URL to request.
        timeout: Timeout in seconds passed to ``requests.get`` (default 30).

    Returns:
        A ``(text, final_url, status_code)`` tuple, where ``final_url`` is the
        URL after any redirects have been followed.

    Any failure (timeout, connection error, HTTP error status or other
    exception) is reported on stderr and terminates the process with exit
    status 1; this function does not raise to its caller.
    """
    import requests

    # NOTE: Accept-Encoding is deliberately NOT overridden here. requests
    # advertises only the content codings it can actually decode; Brotli
    # ("br") is included only when the optional brotli/brotlicffi package is
    # installed. Hard-coding "br" without that package makes servers send a
    # body that cannot be decompressed, which then decodes to garbage.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
    }

    try:
        resp = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
        resp.raise_for_status()

        # When the reported encoding is anything other than UTF-8, prefer the
        # encoding detected from the body itself and fall back to the
        # reported one only if detection yields nothing.
        if resp.encoding and resp.encoding.lower() != 'utf-8':
            resp.encoding = resp.apparent_encoding or resp.encoding

        return resp.text, resp.url, resp.status_code
    except requests.exceptions.Timeout:
        print(f"Error: request timed out after {timeout}s", file=sys.stderr)
        sys.exit(1)
    except requests.exceptions.ConnectionError as e:
        print(f"Error: connection failed: {e}", file=sys.stderr)
        sys.exit(1)
    except requests.exceptions.HTTPError as e:
        print(f"Error: HTTP {e.response.status_code}: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Last-resort boundary for this CLI: report and exit instead of
        # showing a traceback.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


def extract_with_readability(html, url):
    """Run readability-lxml over the page and return its main article.

    Args:
        html: Full HTML source of the page.
        url: URL of the page, handed to readability's ``Document``.

    Returns:
        A ``(title, content_html)`` tuple: the document's short title and the
        HTML of the extracted article summary.
    """
    from readability import Document

    article = Document(html, url=url)
    # Tuple elements are evaluated left to right: title first, then summary.
    return article.short_title(), article.summary()


def extract_with_selector(html, selector):
    """Return the HTML of every element matching a CSS selector.

    Args:
        html: Full HTML source of the page.
        selector: CSS selector passed to BeautifulSoup's ``select``.

    Returns:
        The serialized matching elements joined by newlines, in match order,
        or ``None`` when nothing matches.
    """
    from bs4 import BeautifulSoup

    matches = BeautifulSoup(html, "html.parser").select(selector)
    if matches:
        return "\n".join(str(match) for match in matches)
    return None


def html_to_markdown(html, base_url=None):
    """Render an HTML fragment or document as Markdown using html2text.

    Args:
        html: HTML source to convert.
        base_url: Optional base URL assigned to the converter's ``baseurl``
            setting; left at the converter default when falsy.

    Returns:
        The Markdown text with runs of three or more newlines collapsed to a
        single blank line and surrounding whitespace stripped.
    """
    import html2text
    import re

    options = {
        "body_width": 0,  # 0 disables line wrapping
        "ignore_images": False,
        "ignore_links": False,
        "ignore_emphasis": False,
        "protect_links": True,
        "unicode_snob": True,
        "mark_code": True,
        "wrap_links": False,
        "single_line_break": False,
    }

    renderer = html2text.HTML2Text()
    for option, value in options.items():
        setattr(renderer, option, value)
    if base_url:
        renderer.baseurl = base_url

    markdown = renderer.handle(html)

    # Squeeze excessive vertical whitespace down to one blank line.
    collapsed = re.sub(r'\n{3,}', '\n\n', markdown)
    return collapsed.strip()


def extract_metadata(html):
    """Extract page metadata (title, description, Open Graph, author, date).

    Args:
        html: Full HTML source of the page.

    Returns:
        A dict that may contain the keys ``title``, ``description``,
        ``og_title``, ``og_description``, ``og_type``, ``og_site_name``,
        ``author`` and ``published``. A key is present only when the page
        supplies a non-blank value for it, so an empty ``<title>`` or a
        whitespace-only ``content`` attribute never yields an empty entry.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "html.parser")
    meta = {}

    def meta_content(attr_name, attr_value):
        """Return the stripped ``content`` of the first matching <meta>, or ''."""
        tag = soup.find("meta", attrs={attr_name: attr_value})
        content = tag.get("content") if tag is not None else None
        return content.strip() if isinstance(content, str) else ""

    # Title
    title_tag = soup.find("title")
    if title_tag is not None:
        title_text = title_tag.get_text(strip=True)
        if title_text:
            meta["title"] = title_text

    # Meta description
    description = meta_content("name", "description")
    if description:
        meta["description"] = description

    # Open Graph tags, stored as og_title, og_description, ...
    for prop in ("og:title", "og:description", "og:type", "og:site_name"):
        value = meta_content("property", prop)
        if value:
            meta[prop.replace("og:", "og_")] = value

    # Author
    author = meta_content("name", "author")
    if author:
        meta["author"] = author

    # Published date: the first key (in priority order) with a value wins.
    # "datePublished" is a schema.org microdata name that pages normally
    # expose through ``itemprop``, so that attribute is consulted as well.
    for key in ("article:published_time", "datePublished", "date"):
        published = (
            meta_content("property", key)
            or meta_content("name", key)
            or meta_content("itemprop", key)
        )
        if published:
            meta["published"] = published
            break

    return meta


def _build_arg_parser():
    """Create the command-line argument parser for the page fetcher."""
    parser = argparse.ArgumentParser(
        description="Fetch a web page and extract content as Markdown"
    )
    parser.add_argument("url", help="URL to fetch")
    parser.add_argument("--raw", action="store_true",
                        help="Output full page Markdown (no readability extraction)")
    parser.add_argument("--selector", type=str, default=None,
                        help="CSS selector to extract specific elements")
    parser.add_argument("--save", type=str, default=None,
                        help="Also save output to this file path")
    parser.add_argument("--max-length", type=int, default=None,
                        help="Truncate output to N characters")
    parser.add_argument("--timeout", type=int, default=30,
                        help="Request timeout in seconds (default: 30)")
    parser.add_argument("--no-metadata", action="store_true",
                        help="Skip metadata header in output")
    return parser


def _normalize_url(raw_url):
    """Trim whitespace and default to https:// when no HTTP(S) scheme is given."""
    url = raw_url.strip()
    # URL schemes are case-insensitive, so "HTTP://host" already has a scheme
    # and must not get a second one prepended.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    return url


def _build_output(title, meta, content_md, final_url, include_metadata):
    """Assemble the final Markdown: optional metadata header plus content."""
    parts = []

    if include_metadata and meta:
        # Fall back to "Untitled" also when a title key exists but is empty.
        heading = title or meta.get("title") or "Untitled"
        parts.append(f"# {heading}")
        parts.append(f"\n**Source**: {final_url}")
        for label, key in (
            ("Author", "author"),
            ("Published", "published"),
            ("Description", "description"),
        ):
            if meta.get(key):
                parts.append(f"**{label}**: {meta[key]}")
        parts.append("\n---\n")
    elif include_metadata and title:
        parts.append(f"# {title}\n")

    parts.append(content_md)
    return "\n".join(parts)


def main():
    """Command-line entry point: fetch a page and print it as Markdown.

    Progress and diagnostics go to stderr; the Markdown goes to stdout and,
    with ``--save``, additionally to a file. Extraction mode is chosen in
    this order: ``--selector`` (CSS selection), ``--raw`` (whole page),
    otherwise readability-based article extraction.

    Exits with status 0 after reporting when ``--selector`` matches nothing,
    and with argparse's usage error when ``--max-length`` is negative. A
    ``--max-length`` of 0 means "no truncation". A failure to write the
    ``--save`` file is reported on stderr but does not change the exit code.
    """
    setup_encoding()

    # Parse arguments before checking dependencies so that --help and usage
    # errors work even when the third-party packages are not installed.
    parser = _build_arg_parser()
    args = parser.parse_args()
    if args.max_length is not None and args.max_length < 0:
        # A negative limit would silently slice from the end of the output.
        parser.error("--max-length must be a non-negative integer")

    check_dependencies()

    url = _normalize_url(args.url)
    print(f"Fetching: {url}", file=sys.stderr)

    # Fetch
    html, final_url, status = fetch_url(url, timeout=args.timeout)
    # html is decoded text, so its length is a character count, not bytes.
    print(f"Status: {status}, Size: {len(html)} characters", file=sys.stderr)

    if final_url != url:
        print(f"Redirected to: {final_url}", file=sys.stderr)

    # Extract metadata
    include_metadata = not args.no_metadata
    meta = extract_metadata(html) if include_metadata else {}

    # Extract content
    if args.selector:
        # CSS selector mode
        selected_html = extract_with_selector(html, args.selector)
        if not selected_html:
            print(f"Warning: no elements matched selector '{args.selector}'", file=sys.stderr)
            print(f"[No elements matched CSS selector: {args.selector}]")
            sys.exit(0)
        title = meta.get("title", "")
        content_md = html_to_markdown(selected_html, base_url=final_url)
    elif args.raw:
        # Raw full-page mode
        title = meta.get("title", "")
        content_md = html_to_markdown(html, base_url=final_url)
    else:
        # Readability extraction mode (default)
        title, article_html = extract_with_readability(html, final_url)
        content_md = html_to_markdown(article_html, base_url=final_url)

    output = _build_output(title, meta, content_md, final_url, include_metadata)

    # Truncate if requested
    if args.max_length and len(output) > args.max_length:
        total_length = len(output)
        output = (
            output[:args.max_length]
            + f"\n\n[... truncated at {args.max_length} characters, total {total_length}]"
        )

    # Print to stdout
    print(output)
    print(f"\nExtracted: {len(content_md)} characters", file=sys.stderr)

    # Save to file if requested (best effort: the output was already printed)
    if args.save:
        try:
            with open(args.save, "w", encoding="utf-8") as f:
                f.write(output)
            print(f"Saved to: {args.save}", file=sys.stderr)
        except Exception as e:
            print(f"Error saving file: {e}", file=sys.stderr)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()