openclaw-backups/skills/aidotnet-web-scraper/scripts/extract_links.py

#!/usr/bin/env python3
"""Extract and categorize all links from a web page.

Fetches the page and extracts all <a> tags, categorizing them as
internal, external, or resource links. Useful for site navigation
and discovery before deeper scraping.

Dependencies: pip install requests beautifulsoup4
"""

import sys
import argparse
import json
import re
from urllib.parse import urlparse, urljoin


def setup_encoding():
    """Force UTF-8 console output on Windows; do nothing on other platforms.

    Both standard streams are first reconfigured in place. If either stream
    cannot be reconfigured (no ``reconfigure`` attribute, or the operation
    is unsupported), both are replaced with line-buffered UTF-8 wrappers
    around their underlying binary buffers. Undecodable characters are
    replaced rather than raising.
    """
    if sys.platform != "win32":
        return

    import io

    utf8_options = {"encoding": "utf-8", "errors": "replace"}
    try:
        for stream in (sys.stdout, sys.stderr):
            stream.reconfigure(**utf8_options)
    except (AttributeError, io.UnsupportedOperation):
        # Fallback path: wrap the raw buffers ourselves.
        sys.stdout = io.TextIOWrapper(
            sys.stdout.buffer, line_buffering=True, **utf8_options
        )
        sys.stderr = io.TextIOWrapper(
            sys.stderr.buffer, line_buffering=True, **utf8_options
        )


def check_dependencies():
    """Verify that the third-party packages this script needs can be imported.

    Returns None when both ``requests`` and ``bs4.BeautifulSoup`` import
    cleanly. Otherwise prints the missing pip package names and an install
    hint to stderr, then exits the process with status 1.
    """
    absent = []

    try:
        import requests  # noqa: F401
    except ImportError:
        absent.append("requests")

    try:
        from bs4 import BeautifulSoup  # noqa: F401
    except ImportError:
        absent.append("beautifulsoup4")

    if not absent:
        return

    print(
        f"Error: missing dependencies: {', '.join(absent)}",
        f"Install with: pip install {' '.join(absent)}",
        sep="\n",
        file=sys.stderr,
    )
    sys.exit(1)


# File extensions that mark a link as a downloadable/static resource rather
# than a navigable page. Matched case-insensitively against the URL path.
RESOURCE_EXTENSIONS = {
    # Documents
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
    # Archives
    '.zip', '.rar', '.tar', '.gz', '.7z',
    # Images
    '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico',
    # Audio / video
    '.mp3', '.mp4', '.avi', '.mov', '.webm',
    # Stylesheets, scripts, fonts
    '.css', '.js', '.woff', '.woff2', '.ttf', '.eot',
}


def _bare_host(netloc):
    """Return the lowercase host of *netloc* without userinfo, port, or trailing dot."""
    try:
        host = urlparse("//" + netloc).hostname
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets): compare verbatim.
        host = None
    return (host or netloc.lower()).rstrip(".")


def _is_ip_address(host):
    """Return True when *host* is a literal IPv4 or IPv6 address."""
    import ipaddress

    try:
        ipaddress.ip_address(host)
    except ValueError:
        return False
    return True


def classify_link(href, base_domain):
    """Classify a link as internal, external, or resource.

    Args:
        href: The link URL (absolute, or relative with no host part).
        base_domain: Network location of the page the link came from; a
            port or userinfo component, if present, is ignored.

    Returns:
        ``"resource"`` if the URL path ends with one of
        ``RESOURCE_EXTENSIONS`` (checked first, regardless of host);
        ``"internal"`` if the link has no host, has the same host as
        ``base_domain``, or shares its last two domain labels with it;
        ``"external"`` otherwise.

    Note:
        The two-label comparison is a heuristic: it also matches unrelated
        sites under a multi-part public suffix such as ``co.uk``.
    """
    parsed = urlparse(href)

    # Check for resource files (str.endswith accepts a tuple of suffixes)
    if parsed.path.lower().endswith(tuple(RESOURCE_EXTENSIONS)):
        return "resource"

    # Compare hosts only, so a differing port or userinfo on the same host
    # does not make a link look external.
    link_host = _bare_host(parsed.netloc)
    base_host = _bare_host(base_domain)
    if not link_host or link_host == base_host:
        return "internal"

    # IP literals have no registrable domain; comparing their last two
    # octets would wrongly group unrelated addresses (10.0.1.1 / 192.168.1.1).
    if _is_ip_address(link_host) or _is_ip_address(base_host):
        return "external"

    # Check for common CDN / same-org subdomains
    base_parts = base_host.split(".")
    link_parts = link_host.split(".")
    if len(base_parts) >= 2 and len(link_parts) >= 2:
        if base_parts[-2:] == link_parts[-2:]:
            return "internal"

    return "external"


def extract_links(html, base_url):
    """Extract all unique links from the ``<a href>`` tags of an HTML document.

    Args:
        html: HTML markup to parse.
        base_url: URL the markup was fetched from; relative hrefs are
            resolved against it and its network location is used to
            classify each link.

    Returns:
        A list of dicts in document order, one per distinct resolved URL,
        with keys ``"url"`` (absolute URL), ``"text"`` (whitespace-normalized
        link text, truncated to 100 characters plus ``"..."``), and
        ``"type"`` (the result of ``classify_link``).
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "html.parser")
    base_domain = urlparse(base_url).netloc.lower()
    links = []
    seen = set()
    skipped_prefixes = ("#", "javascript:", "mailto:", "tel:")

    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"].strip()

        # Skip anchors, javascript:, mailto:, tel:. URL schemes are
        # case-insensitive, so match against a lowercased copy; otherwise
        # e.g. "JavaScript:void(0)" would be reported as an internal link.
        if not href or href.lower().startswith(skipped_prefixes):
            continue

        # Resolve relative URLs
        full_url = urljoin(base_url, href)

        # Deduplicate on the resolved URL (first occurrence wins)
        if full_url in seen:
            continue
        seen.add(full_url)

        # Extract link text
        text = a_tag.get_text(strip=True) or ""
        text = re.sub(r'\s+', ' ', text)  # normalize whitespace
        if len(text) > 100:
            text = text[:100] + "..."

        link_type = classify_link(full_url, base_domain)

        links.append({
            "url": full_url,
            "text": text,
            "type": link_type,
        })

    return links


def format_markdown(links, url, filter_pattern=None, external_only=False):
    """Render link records as a Markdown report grouped by link type.

    Args:
        links: List of dicts with ``"url"``, ``"text"`` and ``"type"`` keys.
        url: Page URL shown in the report heading.
        filter_pattern: Optional regex; only links whose URL matches it
            (case-insensitively) are kept. An invalid pattern prints a
            warning to stderr and leaves the list unfiltered.
        external_only: When true, keep only links of type ``"external"``.

    Returns:
        The report as a single string: a title, a totals line, then one
        bulleted section per non-empty link type.
    """
    selected = links
    if external_only:
        selected = [entry for entry in selected if entry["type"] == "external"]

    if filter_pattern:
        try:
            url_regex = re.compile(filter_pattern, re.IGNORECASE)
        except re.error as e:
            print(f"Warning: invalid regex pattern '{filter_pattern}': {e}", file=sys.stderr)
        else:
            selected = [entry for entry in selected if url_regex.search(entry["url"])]

    # Section order is fixed: internal, external, resource.
    sections = (
        ("internal", "## Internal Links\n"),
        ("external", "## External Links\n"),
        ("resource", "## Resource Links\n"),
    )
    grouped = {
        kind: [entry for entry in selected if entry["type"] == kind]
        for kind, _heading in sections
    }

    lines = [
        f"# Links from {url}\n",
        f"Total: **{len(selected)}** links ({len(grouped['internal'])} internal, "
        f"{len(grouped['external'])} external, {len(grouped['resource'])} resource)\n",
    ]

    for kind, heading in sections:
        bucket = grouped[kind]
        if not bucket:
            continue
        lines.append(heading)
        for entry in bucket:
            suffix = f" — {entry['text']}" if entry['text'] else ""
            lines.append(f"- {entry['url']}{suffix}")
        lines.append("")  # blank line after each section

    return "\n".join(lines)


def main():
    """Command-line entry point: fetch a page and print its links.

    Parses the CLI arguments, downloads the page with ``requests``
    (following redirects), extracts its links, and prints them to stdout as
    Markdown or, with ``--json``, as a JSON array. Progress and error
    messages go to stderr. Exits with status 1 if a dependency is missing
    or the HTTP request fails.
    """
    setup_encoding()
    check_dependencies()

    parser = argparse.ArgumentParser(
        description="Extract and categorize links from a web page"
    )
    parser.add_argument("url", help="URL to extract links from")
    parser.add_argument("--filter", type=str, default=None,
                        help="Regex pattern to filter URLs")
    parser.add_argument("--external-only", action="store_true",
                        help="Only show external links")
    parser.add_argument("--json", action="store_true",
                        help="Output as JSON instead of Markdown")
    parser.add_argument("--timeout", type=int, default=30,
                        help="Request timeout in seconds (default: 30)")

    args = parser.parse_args()

    # Imported here so check_dependencies() can report a missing package first.
    import requests

    url = args.url.strip()
    # Schemes are case-insensitive: do not prepend https:// to "HTTP://host".
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    print(f"Extracting links from: {url}", file=sys.stderr)

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        ),
    }

    try:
        resp = requests.get(url, headers=headers, timeout=args.timeout, allow_redirects=True)
        resp.raise_for_status()
        # When the declared encoding is not UTF-8, prefer the encoding
        # detected from the body, keeping the declared one as a fallback.
        if resp.encoding and resp.encoding.lower() != 'utf-8':
            resp.encoding = resp.apparent_encoding or resp.encoding
        html = resp.text
        # Use the post-redirect URL as the base for resolving relative links.
        final_url = resp.url
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    links = extract_links(html, final_url)
    print(f"Found {len(links)} unique links", file=sys.stderr)

    if args.json:
        # Apply filters for JSON output too
        filtered = links
        if args.external_only:
            filtered = [lk for lk in filtered if lk["type"] == "external"]
        if args.filter:
            try:
                pattern = re.compile(args.filter, re.IGNORECASE)
            except re.error as e:
                # Warn instead of silently ignoring the filter, matching
                # the Markdown output path.
                print(f"Warning: invalid regex pattern '{args.filter}': {e}", file=sys.stderr)
            else:
                filtered = [lk for lk in filtered if pattern.search(lk["url"])]
        print(json.dumps(filtered, indent=2, ensure_ascii=False))
    else:
        print(format_markdown(links, final_url, args.filter, args.external_only))


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()