#!/usr/bin/env python3
"""Extract and categorize all links from a web page.

Fetches the page and extracts all <a> tags, categorizing them as
internal, external, or resource links. Useful for site navigation
and discovery before deeper scraping.

Dependencies: pip install requests beautifulsoup4
"""
|
|
|
|
import sys
|
|
import argparse
|
|
import json
|
|
import re
|
|
from urllib.parse import urlparse, urljoin
|
|
|
|
|
|
def setup_encoding():
    """Switch stdout/stderr to UTF-8 on Windows; do nothing on other platforms.

    The streams are first reconfigured in place. If that is not possible
    (the stream has no ``reconfigure`` method, or the operation is
    unsupported), both streams are replaced by line-buffered UTF-8 text
    wrappers around their underlying binary buffers. Characters that cannot
    be encoded are replaced instead of raising.
    """
    if sys.platform != "win32":
        return

    import io

    utf8_options = {"encoding": "utf-8", "errors": "replace"}
    try:
        sys.stdout.reconfigure(**utf8_options)
        sys.stderr.reconfigure(**utf8_options)
    except (AttributeError, io.UnsupportedOperation):
        # Fall back to re-wrapping the raw byte streams.
        sys.stdout = io.TextIOWrapper(
            sys.stdout.buffer, line_buffering=True, **utf8_options
        )
        sys.stderr = io.TextIOWrapper(
            sys.stderr.buffer, line_buffering=True, **utf8_options
        )
|
|
|
|
|
|
def check_dependencies():
    """Verify that ``requests`` and ``beautifulsoup4`` can be imported.

    Returns ``None`` when both imports succeed. Otherwise prints the names of
    the missing packages and a matching ``pip install`` command to stderr and
    terminates the process with exit status 1.
    """
    absent = []

    try:
        import requests  # noqa: F401
    except ImportError:
        absent.append("requests")

    try:
        from bs4 import BeautifulSoup  # noqa: F401
    except ImportError:
        absent.append("beautifulsoup4")

    if not absent:
        return

    print(f"Error: missing dependencies: {', '.join(absent)}", file=sys.stderr)
    print(f"Install with: pip install {' '.join(absent)}", file=sys.stderr)
    sys.exit(1)
|
|
|
|
|
|
# Lower-case file extensions (with leading dot). A link whose URL path ends
# with one of these is reported as "resource" regardless of its host.
RESOURCE_EXTENSIONS = {
    # Documents
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
    # Archives
    '.zip', '.rar', '.tar', '.gz', '.7z',
    # Images
    '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico',
    # Audio / video
    '.mp3', '.mp4', '.avi', '.mov', '.webm',
    # Stylesheets, scripts and fonts
    '.css', '.js', '.woff', '.woff2', '.ttf', '.eot',
}


def _netloc_host(netloc):
    """Return the lower-case host of *netloc* without userinfo, port or IPv6 brackets."""
    try:
        return urlparse("//" + netloc).hostname or ""
    except ValueError:
        # Unparseable authority: fall back to a plain case-insensitive compare.
        return netloc.lower()


def _is_ip_literal(host):
    """Return True when *host* is an IPv4 or IPv6 address rather than a DNS name."""
    import ipaddress

    try:
        ipaddress.ip_address(host)
    except ValueError:
        return False
    return True


def classify_link(href, base_domain):
    """Classify a link as ``"internal"``, ``"external"`` or ``"resource"``.

    Args:
        href: The link URL (absolute, or relative with no host part).
        base_domain: Network location of the page the link was found on,
            e.g. ``"www.example.com"`` or ``"example.com:8080"``.

    Returns:
        ``"resource"`` if the URL path ends with one of
        ``RESOURCE_EXTENSIONS`` (checked first, so it wins over the host
        checks); ``"internal"`` if the link has no host, has the same host as
        *base_domain*, or shares its last two DNS labels with it; otherwise
        ``"external"``.

    Hosts are compared without port or userinfo, so the same host on another
    port is internal. IP addresses only match exactly: the "last two labels"
    rule is meaningless for them and used to make ``10.0.1.1`` internal to
    ``192.168.1.1``.
    """
    parsed = urlparse(href)

    # Resource check first: a file link is a resource wherever it is hosted.
    if parsed.path.lower().endswith(tuple(RESOURCE_EXTENSIONS)):
        return "resource"

    link_host = parsed.hostname or ""
    base_host = _netloc_host(base_domain)
    if not link_host or link_host == base_host:
        return "internal"

    # The exact-match test above already failed, so differing IPs are external.
    if _is_ip_literal(link_host) or _is_ip_literal(base_host):
        return "external"

    # Same-organisation heuristic: identical last two labels (cdn.example.com
    # vs www.example.com).
    # NOTE(review): this does not consult the Public Suffix List, so two
    # unrelated sites under a multi-label suffix such as "co.uk" still compare
    # as internal.
    base_labels = base_host.split(".")
    link_labels = link_host.split(".")
    if (
        len(base_labels) >= 2
        and len(link_labels) >= 2
        and base_labels[-2:] == link_labels[-2:]
    ):
        return "internal"

    return "external"
|
|
|
|
|
|
def extract_links(html, base_url):
    """Extract the unique, classified links of all ``<a href>`` tags in *html*.

    Args:
        html: HTML source of the page.
        base_url: URL the page was fetched from. Its network location decides
            what counts as internal, and relative links are resolved against
            it unless the document contains a ``<base href>`` element.

    Returns:
        A list of dicts in document order, one per distinct absolute URL,
        with the keys ``"url"`` (absolute URL), ``"text"`` (whitespace-
        normalized link text, truncated to 100 characters plus ``"..."``)
        and ``"type"`` (the result of ``classify_link``).

    Empty hrefs, fragment-only links and ``javascript:``/``mailto:``/``tel:``
    links (matched case-insensitively) are skipped, as are hrefs that cannot
    be parsed as URLs.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "html.parser")
    base_domain = urlparse(base_url).netloc.lower()

    # A <base href> element changes what relative links resolve against; the
    # page's own domain still decides what is internal.
    resolve_base = base_url
    base_tag = soup.find("base", href=True)
    if base_tag is not None:
        try:
            resolve_base = urljoin(base_url, base_tag["href"].strip())
        except ValueError:
            resolve_base = base_url

    skipped_prefixes = ("#", "javascript:", "mailto:", "tel:")
    links = []
    seen = set()

    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"].strip()

        # URL schemes are case-insensitive, so compare in lower case.
        if not href or href.lower().startswith(skipped_prefixes):
            continue

        try:
            full_url = urljoin(resolve_base, href)
        except ValueError:
            # Malformed href (e.g. unbalanced IPv6 bracket): skip this link
            # instead of aborting the whole extraction.
            continue

        if full_url in seen:
            continue
        seen.add(full_url)

        # Collapse whitespace on the raw text. get_text(strip=True) would
        # join the text of nested tags with no separator ("Read" + "more").
        text = re.sub(r'\s+', ' ', a_tag.get_text()).strip()
        if len(text) > 100:
            text = text[:100] + "..."

        links.append({
            "url": full_url,
            "text": text,
            "type": classify_link(full_url, base_domain),
        })

    return links
|
|
|
|
|
|
def format_markdown(links, url, filter_pattern=None, external_only=False):
    """Render *links* as a Markdown report grouped by link type.

    Args:
        links: List of dicts with the keys ``"url"``, ``"text"`` and ``"type"``.
        url: Page URL shown in the report heading.
        filter_pattern: Optional regular expression; only links whose URL
            matches it (case-insensitively) are kept. An invalid pattern
            prints a warning to stderr and is otherwise ignored.
        external_only: Keep only links of type ``"external"``.

    Returns:
        The report as a single string: a heading, a totals line, then one
        bullet list per non-empty group (internal, external, resource).
    """
    selected = links
    if external_only:
        selected = [entry for entry in selected if entry["type"] == "external"]
    if filter_pattern:
        try:
            url_regex = re.compile(filter_pattern, re.IGNORECASE)
            selected = [entry for entry in selected if url_regex.search(entry["url"])]
        except re.error as e:
            print(f"Warning: invalid regex pattern '{filter_pattern}': {e}", file=sys.stderr)

    # (section heading, link type) in output order.
    section_specs = (
        ("## Internal Links\n", "internal"),
        ("## External Links\n", "external"),
        ("## Resource Links\n", "resource"),
    )
    sections = [
        (heading, [entry for entry in selected if entry["type"] == kind])
        for heading, kind in section_specs
    ]
    n_internal, n_external, n_resource = (len(members) for _, members in sections)

    lines = [
        f"# Links from {url}\n",
        f"Total: **{len(selected)}** links ({n_internal} internal, {n_external} external, {n_resource} resource)\n",
    ]
    for heading, members in sections:
        if not members:
            continue
        lines.append(heading)
        lines.extend(
            f"- {entry['url']}" + (f" — {entry['text']}" if entry['text'] else "")
            for entry in members
        )
        lines.append("")

    return "\n".join(lines)
|
|
|
|
|
|
def main():
    """Command-line entry point: fetch a page and print its links.

    Parses the arguments, downloads the page with ``requests`` (following
    redirects), extracts and classifies its links, and prints them to stdout
    as Markdown or, with ``--json``, as a JSON array. Progress and warnings
    go to stderr. Exits with status 1 when the request fails and with the
    argparse usage error status when ``--timeout`` is not positive.
    """
    setup_encoding()
    check_dependencies()

    parser = argparse.ArgumentParser(
        description="Extract and categorize links from a web page"
    )
    parser.add_argument("url", help="URL to extract links from")
    parser.add_argument("--filter", type=str, default=None,
                        help="Regex pattern to filter URLs")
    parser.add_argument("--external-only", action="store_true",
                        help="Only show external links")
    parser.add_argument("--json", action="store_true",
                        help="Output as JSON instead of Markdown")
    parser.add_argument("--timeout", type=int, default=30,
                        help="Request timeout in seconds (default: 30)")

    args = parser.parse_args()

    # A non-positive timeout is rejected deep inside the HTTP stack with a
    # ValueError that the request error handler below does not catch.
    if args.timeout <= 0:
        parser.error("--timeout must be a positive number of seconds")

    # Imported only after check_dependencies() has confirmed it is available.
    import requests

    url = args.url.strip()
    # URL schemes are case-insensitive: "HTTP://host" already has one.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    print(f"Extracting links from: {url}", file=sys.stderr)

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        ),
    }

    try:
        resp = requests.get(url, headers=headers, timeout=args.timeout, allow_redirects=True)
        resp.raise_for_status()
        # When the reported encoding is not UTF-8, prefer the encoding
        # detected from the body and keep the reported one as fallback.
        if resp.encoding and resp.encoding.lower() != 'utf-8':
            resp.encoding = resp.apparent_encoding or resp.encoding
        html = resp.text
        # URL after redirects: relative links must resolve against it.
        final_url = resp.url
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    links = extract_links(html, final_url)
    print(f"Found {len(links)} unique links", file=sys.stderr)

    if args.json:
        # Same filters as the Markdown output.
        filtered = links
        if args.external_only:
            filtered = [lk for lk in filtered if lk["type"] == "external"]
        if args.filter:
            try:
                pattern = re.compile(args.filter, re.IGNORECASE)
            except re.error as e:
                # Best effort, as in the Markdown path: warn and continue
                # unfiltered rather than silently ignoring the bad pattern.
                print(f"Warning: invalid regex pattern '{args.filter}': {e}", file=sys.stderr)
            else:
                filtered = [lk for lk in filtered if pattern.search(lk["url"])]
        print(json.dumps(filtered, indent=2, ensure_ascii=False))
    else:
        print(format_markdown(links, final_url, args.filter, args.external_only))


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|