#!/usr/bin/env python3
"""Fetch a web page and extract readable content as clean Markdown.

Uses requests + BeautifulSoup + readability-lxml + html2text for lightweight,
fast extraction without a headless browser. Works well for articles, docs,
blogs, wikis, and most static websites.

Dependencies: pip install requests beautifulsoup4 readability-lxml html2text
"""
|
|
|
|
import sys
|
|
import argparse
|
|
|
|
|
|
def setup_encoding():
    """Force UTF-8 console output on Windows; do nothing on other platforms.

    The standard streams are reconfigured in place when they support it.
    Otherwise both are replaced by line-buffered UTF-8 wrappers around their
    underlying binary buffers. Unencodable characters are replaced rather
    than raising.
    """
    if sys.platform != "win32":
        return

    import io

    utf8_options = {"encoding": "utf-8", "errors": "replace"}
    try:
        sys.stdout.reconfigure(**utf8_options)
        sys.stderr.reconfigure(**utf8_options)
    except (AttributeError, io.UnsupportedOperation):
        # Streams without reconfigure() support: re-wrap the raw buffers.
        sys.stdout = io.TextIOWrapper(
            sys.stdout.buffer, line_buffering=True, **utf8_options
        )
        sys.stderr = io.TextIOWrapper(
            sys.stderr.buffer, line_buffering=True, **utf8_options
        )
|
|
|
|
|
|
def check_dependencies():
    """Verify that the third-party packages this script needs are importable.

    Every requirement is probed in turn and each one that fails to import is
    collected. If any are missing, the list and a matching ``pip install``
    command are printed to stderr and the process exits with status 1.
    Returns None when everything is available.
    """
    # (module to import, name that must be importable from it, pip package)
    requirements = (
        ("requests", None, "requests"),
        ("bs4", "BeautifulSoup", "beautifulsoup4"),
        ("readability", "Document", "readability-lxml"),
        ("html2text", None, "html2text"),
    )

    absent = []
    for module_name, needed_name, pip_name in requirements:
        try:
            module = __import__(
                module_name, fromlist=[needed_name] if needed_name else []
            )
            # Mirror ``from module import name``: a module that imports but
            # lacks the expected name counts as missing too.
            if needed_name is not None and not hasattr(module, needed_name):
                raise ImportError(
                    f"cannot import name {needed_name!r} from {module_name!r}"
                )
        except ImportError:
            absent.append(pip_name)

    if not absent:
        return

    print(f"Error: missing dependencies: {', '.join(absent)}", file=sys.stderr)
    print(f"Install with: pip install {' '.join(absent)}", file=sys.stderr)
    sys.exit(1)
|
|
|
|
|
|
def fetch_url(url, timeout=30):
    """Fetch *url* with browser-like headers and return the decoded body.

    Args:
        url: Absolute http(s) URL to request.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        A ``(text, final_url, status_code)`` tuple, where ``final_url`` is the
        response URL after any redirects have been followed.

    On a timeout, connection failure, HTTP error status, or any other
    exception, an ``Error: ...`` line is printed to stderr and the process
    exits with status 1 instead of raising.
    """
    import requests

    # Accept-Encoding is deliberately NOT set here. requests fills it in with
    # the encodings it can actually decode in this environment. Hard-coding
    # "br" invited Brotli-compressed bodies, which come back undecoded
    # (garbled text) when no Brotli library is installed.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
    }

    try:
        resp = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
        resp.raise_for_status()

        # When the reported charset is not UTF-8, prefer the encoding detected
        # from the body itself; keep the reported one if detection yields nothing.
        if resp.encoding and resp.encoding.lower() != 'utf-8':
            resp.encoding = resp.apparent_encoding or resp.encoding

        return resp.text, resp.url, resp.status_code
    except requests.exceptions.Timeout:
        print(f"Error: request timed out after {timeout}s", file=sys.stderr)
        sys.exit(1)
    except requests.exceptions.ConnectionError as e:
        print(f"Error: connection failed: {e}", file=sys.stderr)
        sys.exit(1)
    except requests.exceptions.HTTPError as e:
        print(f"Error: HTTP {e.response.status_code}: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary for this CLI: report anything else and stop.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
|
|
|
|
|
def extract_with_readability(html, url):
    """Run readability-lxml over *html* and return the main article.

    Args:
        html: Full page HTML.
        url: Page URL, passed through to readability's ``Document``.

    Returns:
        A ``(title, content_html)`` tuple: the document's short title and the
        summary HTML produced by readability.
    """
    from readability import Document

    article = Document(html, url=url)
    return article.short_title(), article.summary()
|
|
|
|
|
|
def extract_with_selector(html, selector):
    """Return the HTML of every element matching a CSS selector.

    Args:
        html: Full page HTML.
        selector: CSS selector evaluated with BeautifulSoup's ``select``.

    Returns:
        The serialized matches joined by newlines, or None when nothing matches.
    """
    from bs4 import BeautifulSoup

    matches = BeautifulSoup(html, "html.parser").select(selector)
    if matches:
        # Serialize each match and stitch them together in document order.
        return "\n".join(str(match) for match in matches)
    return None
|
|
|
|
|
|
def html_to_markdown(html, base_url=None):
    """Render *html* as Markdown using html2text.

    Args:
        html: HTML fragment or document to convert.
        base_url: Optional URL assigned to the converter's ``baseurl``.

    Returns:
        The Markdown text with runs of three or more newlines collapsed to a
        single blank line and surrounding whitespace stripped.
    """
    import re

    import html2text

    settings = {
        "body_width": 0,  # Don't wrap lines
        "ignore_images": False,
        "ignore_links": False,
        "ignore_emphasis": False,
        "protect_links": True,
        "unicode_snob": True,
        "mark_code": True,
        "wrap_links": False,
        "single_line_break": False,
    }

    renderer = html2text.HTML2Text()
    for option, value in settings.items():
        setattr(renderer, option, value)

    if base_url:
        renderer.baseurl = base_url

    markdown = renderer.handle(html)

    # Collapse excessive blank lines before trimming the ends.
    return re.sub(r'\n{3,}', '\n\n', markdown).strip()
|
|
|
|
|
|
def extract_metadata(html):
    """Collect page metadata from the ``<title>`` and ``<meta>`` tags.

    Args:
        html: Full page HTML.

    Returns:
        A dict holding whichever of these keys were found: ``title``,
        ``description``, ``og_title``, ``og_description``, ``og_type``,
        ``og_site_name``, ``author`` and ``published``. Values are stripped of
        surrounding whitespace.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "html.parser")

    def raw_content(tag):
        """Return the tag's ``content`` attribute, or None if the tag is absent."""
        return tag.get("content") if tag else None

    info = {}

    # <title> element
    title_el = soup.find("title")
    if title_el:
        info["title"] = title_el.get_text(strip=True)

    # <meta name="description">
    description = raw_content(soup.find("meta", attrs={"name": "description"}))
    if description:
        info["description"] = description.strip()

    # Open Graph properties, stored under og_<name> keys
    for og_name in ("title", "description", "type", "site_name"):
        og_value = raw_content(soup.find("meta", attrs={"property": f"og:{og_name}"}))
        if og_value:
            info[f"og_{og_name}"] = og_value.strip()

    # <meta name="author">
    author = raw_content(soup.find("meta", attrs={"name": "author"}))
    if author:
        info["author"] = author.strip()

    # Publication date: the first candidate that carries content wins.
    for date_key in ("article:published_time", "datePublished", "date"):
        date_el = (
            soup.find("meta", attrs={"property": date_key})
            or soup.find("meta", attrs={"name": date_key})
        )
        published = raw_content(date_el)
        if published:
            info["published"] = published.strip()
            break

    return info
|
|
|
|
|
|
def _normalize_url(raw_url):
    """Return *raw_url* stripped, with ``https://`` prepended if it has no http(s) scheme."""
    url = raw_url.strip()
    # URI schemes are case-insensitive, so "HTTP://host" already has a scheme
    # and must not be turned into "https://HTTP://host".
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    return url


def _build_output(title, meta, final_url, content_md, include_metadata):
    """Assemble the final Markdown: optional metadata header followed by the content."""
    parts = []

    if include_metadata and meta:
        parts.append(f"# {title or meta.get('title', 'Untitled')}")
        parts.append(f"\n**Source**: {final_url}")
        if meta.get("author"):
            parts.append(f"**Author**: {meta['author']}")
        if meta.get("published"):
            parts.append(f"**Published**: {meta['published']}")
        if meta.get("description"):
            parts.append(f"**Description**: {meta['description']}")
        parts.append("\n---\n")
    elif title and include_metadata:
        # No metadata was found, but an extracted title still makes a heading.
        parts.append(f"# {title}\n")

    parts.append(content_md)
    return "\n".join(parts)


def _truncate(output, max_length):
    """Cut *output* to *max_length* characters and append a notice; 0/None means no limit."""
    if max_length and len(output) > max_length:
        return (
            output[:max_length]
            + f"\n\n[... truncated at {max_length} characters, total {len(output)}]"
        )
    return output


def main():
    """Command-line entry point: fetch a URL and print its content as Markdown.

    Parses the command line, fetches the page, and converts it using one of
    three modes: ``--selector`` (CSS selection, takes precedence), ``--raw``
    (whole page), or readability extraction (the default). The Markdown goes
    to stdout, progress and diagnostics go to stderr, and the output is also
    written to the ``--save`` path when given.

    Exits with status 1 when ``--max-length`` is negative, and with status 0
    after printing a placeholder when ``--selector`` matches nothing.
    """
    setup_encoding()
    check_dependencies()

    parser = argparse.ArgumentParser(
        description="Fetch a web page and extract content as Markdown"
    )
    parser.add_argument("url", help="URL to fetch")
    parser.add_argument("--raw", action="store_true",
                        help="Output full page Markdown (no readability extraction)")
    parser.add_argument("--selector", type=str, default=None,
                        help="CSS selector to extract specific elements")
    parser.add_argument("--save", type=str, default=None,
                        help="Also save output to this file path")
    parser.add_argument("--max-length", type=int, default=None,
                        help="Truncate output to N characters")
    parser.add_argument("--timeout", type=int, default=30,
                        help="Request timeout in seconds (default: 30)")
    parser.add_argument("--no-metadata", action="store_true",
                        help="Skip metadata header in output")

    args = parser.parse_args()

    # A negative limit would slice from the end of the output and report a
    # nonsensical truncation point, so reject it before doing any work.
    if args.max_length is not None and args.max_length < 0:
        print("Error: --max-length must be a non-negative integer", file=sys.stderr)
        sys.exit(1)

    url = _normalize_url(args.url)
    print(f"Fetching: {url}", file=sys.stderr)

    # Fetch
    html, final_url, status = fetch_url(url, timeout=args.timeout)
    # len(html) counts decoded characters, not bytes on the wire.
    print(f"Status: {status}, Size: {len(html)} characters", file=sys.stderr)

    if final_url != url:
        print(f"Redirected to: {final_url}", file=sys.stderr)

    # Extract metadata
    include_metadata = not args.no_metadata
    meta = extract_metadata(html) if include_metadata else {}

    # Extract content
    if args.selector:
        # CSS selector mode
        selected_html = extract_with_selector(html, args.selector)
        if not selected_html:
            print(f"Warning: no elements matched selector '{args.selector}'", file=sys.stderr)
            print(f"[No elements matched CSS selector: {args.selector}]")
            sys.exit(0)
        title = meta.get("title", "")
        content_md = html_to_markdown(selected_html, base_url=final_url)
    elif args.raw:
        # Raw full-page mode
        title = meta.get("title", "")
        content_md = html_to_markdown(html, base_url=final_url)
    else:
        # Readability extraction mode (default)
        title, article_html = extract_with_readability(html, final_url)
        content_md = html_to_markdown(article_html, base_url=final_url)

    output = _build_output(title, meta, final_url, content_md, include_metadata)
    output = _truncate(output, args.max_length)

    # Print to stdout
    print(output)
    print(f"\nExtracted: {len(content_md)} characters", file=sys.stderr)

    # Save to file if requested; a failed save is reported but is not fatal.
    if args.save:
        try:
            with open(args.save, "w", encoding="utf-8") as f:
                f.write(output)
            print(f"Saved to: {args.save}", file=sys.stderr)
        except Exception as e:
            print(f"Error saving file: {e}", file=sys.stderr)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|