Files
openclaw-backups/skills/aidotnet-web-scraper/scripts/extract_links.py

237 lines
7.5 KiB
Python

#!/usr/bin/env python3
"""Extract and categorize all links from a web page.
Fetches the page and extracts all <a> tags, categorizing them as
internal, external, or resource links. Useful for site navigation
and discovery before deeper scraping.
Dependencies: pip install requests beautifulsoup4
"""
import sys
import argparse
import json
import re
from urllib.parse import urlparse, urljoin
def setup_encoding():
    """Switch stdout/stderr to UTF-8 when running on the Windows console.

    On ``win32`` both standard streams are reconfigured to UTF-8 with
    ``errors='replace'``. If a stream cannot be reconfigured, both streams
    are re-wrapped around their underlying binary buffers instead. Other
    platforms are left untouched.
    """
    if sys.platform != "win32":
        return

    import io

    try:
        for stream in (sys.stdout, sys.stderr):
            stream.reconfigure(encoding='utf-8', errors='replace')
    except (AttributeError, io.UnsupportedOperation):
        # No usable reconfigure(): wrap the raw buffers in new text streams.
        wrapper_options = {
            "encoding": 'utf-8',
            "errors": 'replace',
            "line_buffering": True,
        }
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, **wrapper_options)
        sys.stderr = io.TextIOWrapper(sys.stderr.buffer, **wrapper_options)
def check_dependencies():
    """Verify that the third-party packages this script needs are importable.

    Probes ``requests`` and ``bs4``. If either import fails, the missing pip
    package names and an install hint are written to stderr and the process
    exits with status 1. Returns ``None`` when everything is available.
    """
    unavailable = []

    try:
        import requests  # noqa: F401
    except ImportError:
        unavailable.append("requests")

    try:
        from bs4 import BeautifulSoup  # noqa: F401
    except ImportError:
        # The import name is ``bs4`` but the pip package is ``beautifulsoup4``.
        unavailable.append("beautifulsoup4")

    if not unavailable:
        return

    print(f"Error: missing dependencies: {', '.join(unavailable)}", file=sys.stderr)
    print(f"Install with: pip install {' '.join(unavailable)}", file=sys.stderr)
    sys.exit(1)
# Lowercase file extensions (with leading dot) whose presence at the end of a
# URL path marks the link as a downloadable/static asset rather than a page.
RESOURCE_EXTENSIONS = {
    # Documents
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
    # Archives
    '.zip', '.rar', '.tar', '.gz', '.7z',
    # Images
    '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico',
    # Audio / video
    '.mp3', '.mp4', '.avi', '.mov', '.webm',
    # Stylesheets, scripts and fonts
    '.css', '.js', '.woff', '.woff2', '.ttf', '.eot',
}


def _netloc_host(netloc):
    """Return the lowercase hostname of *netloc*, without userinfo or port."""
    if not netloc:
        return ""
    try:
        return urlparse("//" + netloc).hostname or ""
    except ValueError:
        # Malformed netloc (e.g. unbalanced IPv6 bracket): compare it verbatim.
        return netloc.lower()


def classify_link(href, base_domain):
    """Classify a link as ``"resource"``, ``"internal"`` or ``"external"``.

    Args:
        href: The link URL, absolute or relative.
        base_domain: Network location of the page the link came from, e.g.
            ``"example.com"`` or ``"example.com:8080"``. Any port or
            ``user@`` prefix is ignored when comparing hosts.

    Returns:
        ``"resource"`` if the URL path ends with one of
        ``RESOURCE_EXTENSIONS`` (case-insensitive); ``"internal"`` if the
        link has no host, has the same host as *base_domain*, or shares its
        last two domain labels with it; ``"external"`` otherwise.
    """
    parsed = urlparse(href)

    # str.endswith accepts a tuple, so one call covers every extension.
    if parsed.path.lower().endswith(tuple(RESOURCE_EXTENSIONS)):
        return "resource"

    # Compare hostnames rather than raw netlocs so that a port or userinfo
    # on either side does not make the same host look like a different one.
    link_host = parsed.hostname or ""
    base_host = _netloc_host(base_domain)
    if not link_host or link_host == base_host:
        return "internal"

    base_labels = base_host.split(".")
    link_labels = link_host.split(".")

    # A numeric final label means a dotted IPv4 address, where sharing the
    # last two octets says nothing about ownership.
    if base_labels[-1].isdigit() or link_labels[-1].isdigit():
        return "external"

    # Treat sibling subdomains (www.example.com / cdn.example.com) as the
    # same site. NOTE: only the last two labels are compared, so hosts under
    # a multi-label suffix such as "co.uk" are over-matched.
    if len(base_labels) >= 2 and len(link_labels) >= 2:
        if base_labels[-2:] == link_labels[-2:]:
            return "internal"

    return "external"
def extract_links(html, base_url):
    """Extract the unique, classified links found in ``<a href>`` tags.

    Args:
        html: HTML source of the page.
        base_url: URL the page was fetched from; relative hrefs are resolved
            against it and its host decides internal vs. external.

    Returns:
        A list of dicts in document order, each with the keys ``"url"``
        (absolute URL), ``"text"`` (whitespace-normalized anchor text, cut
        to 100 characters plus ``"..."`` when longer) and ``"type"`` (the
        result of ``classify_link``). Each absolute URL appears once.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "html.parser")
    base_domain = urlparse(base_url).netloc.lower()

    # In-page anchors and non-navigable schemes are not links to follow.
    skipped_prefixes = ("#", "javascript:", "mailto:", "tel:")

    links = []
    seen = set()
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"].strip()

        # URL schemes are case-insensitive ("JavaScript:", "MAILTO:"), so the
        # prefix test is done on a lowercased copy.
        if not href or href.lower().startswith(skipped_prefixes):
            continue

        # Resolve relative URLs. A malformed href (e.g. an unbalanced IPv6
        # bracket) makes urljoin raise ValueError; skip that one link rather
        # than abort the whole extraction.
        try:
            full_url = urljoin(base_url, href)
        except ValueError:
            continue

        # Deduplicate on the resolved URL.
        if full_url in seen:
            continue
        seen.add(full_url)

        text = a_tag.get_text(strip=True) or ""
        text = re.sub(r'\s+', ' ', text)  # normalize whitespace
        if len(text) > 100:
            text = text[:100] + "..."

        links.append({
            "url": full_url,
            "text": text,
            "type": classify_link(full_url, base_domain),
        })
    return links
def format_markdown(links, url, filter_pattern=None, external_only=False):
    """Render extracted links as a Markdown report grouped by link type.

    Args:
        links: List of dicts with ``"url"``, ``"text"`` and ``"type"`` keys.
        url: Page URL shown in the report heading.
        filter_pattern: Optional regex; only links whose URL matches it
            (case-insensitive search) are kept. An invalid pattern produces
            a warning on stderr and is otherwise ignored.
        external_only: When true, keep only links of type ``"external"``.

    Returns:
        The Markdown text: a heading, a totals line, then one bullet list
        per non-empty group (internal, external, resource).
    """
    filtered = links
    if external_only:
        filtered = [link for link in filtered if link["type"] == "external"]
    if filter_pattern:
        try:
            pattern = re.compile(filter_pattern, re.IGNORECASE)
        except re.error as e:
            print(f"Warning: invalid regex pattern '{filter_pattern}': {e}", file=sys.stderr)
        else:
            filtered = [link for link in filtered if pattern.search(link["url"])]

    # Group by type, keeping the original link order inside each group.
    groups = {"internal": [], "external": [], "resource": []}
    for link in filtered:
        if link["type"] in groups:
            groups[link["type"]].append(link)
    internal = groups["internal"]
    external = groups["external"]
    resources = groups["resource"]

    parts = [f"# Links from {url}\n"]
    parts.append(
        f"Total: **{len(filtered)}** links ({len(internal)} internal, "
        f"{len(external)} external, {len(resources)} resource)\n"
    )

    sections = (
        ("## Internal Links\n", internal),
        ("## External Links\n", external),
        ("## Resource Links\n", resources),
    )
    for heading, group in sections:
        if not group:
            continue
        parts.append(heading)
        for lk in group:
            # Separate the URL from its anchor text; without the separator
            # the two run together into one unreadable token.
            label = f" - {lk['text']}" if lk['text'] else ""
            parts.append(f"- {lk['url']}{label}")
        parts.append("")
    return "\n".join(parts)
def main():
    """Command-line entry point: fetch a page and print its links.

    Parses the CLI arguments, downloads the page with ``requests``, extracts
    and classifies its links, then prints them to stdout as Markdown (the
    default) or JSON (``--json``). Progress and error messages go to stderr.
    Exits with status 1 if the request fails.
    """
    setup_encoding()
    check_dependencies()

    parser = argparse.ArgumentParser(
        description="Extract and categorize links from a web page"
    )
    parser.add_argument("url", help="URL to extract links from")
    parser.add_argument("--filter", type=str, default=None,
                        help="Regex pattern to filter URLs")
    parser.add_argument("--external-only", action="store_true",
                        help="Only show external links")
    parser.add_argument("--json", action="store_true",
                        help="Output as JSON instead of Markdown")
    parser.add_argument("--timeout", type=int, default=30,
                        help="Request timeout in seconds (default: 30)")
    args = parser.parse_args()

    # Imported after check_dependencies() so a missing package is reported
    # with an install hint instead of a raw ImportError.
    import requests

    url = args.url.strip()
    # URL schemes are case-insensitive; without lower() an input such as
    # "HTTP://host" would be given a second scheme prefix.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    print(f"Extracting links from: {url}", file=sys.stderr)

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        ),
    }

    try:
        resp = requests.get(url, headers=headers, timeout=args.timeout, allow_redirects=True)
        resp.raise_for_status()
        # When the declared charset is not UTF-8, prefer the encoding
        # detected from the body over the header value.
        if resp.encoding and resp.encoding.lower() != 'utf-8':
            resp.encoding = resp.apparent_encoding or resp.encoding
        html = resp.text
        # Use the post-redirect URL so relative links resolve correctly.
        final_url = resp.url
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    links = extract_links(html, final_url)
    print(f"Found {len(links)} unique links", file=sys.stderr)

    if args.json:
        # Apply the same filters the Markdown formatter applies.
        filtered = links
        if args.external_only:
            filtered = [lk for lk in filtered if lk["type"] == "external"]
        if args.filter:
            try:
                pattern = re.compile(args.filter, re.IGNORECASE)
            except re.error as e:
                # Warn, as the Markdown path does, instead of silently
                # emitting unfiltered output.
                print(f"Warning: invalid regex pattern '{args.filter}': {e}", file=sys.stderr)
            else:
                filtered = [lk for lk in filtered if pattern.search(lk["url"])]
        print(json.dumps(filtered, indent=2, ensure_ascii=False))
    else:
        print(format_markdown(links, final_url, args.filter, args.external_only))


if __name__ == "__main__":
    main()