AI Newsletter Digest improvements: fixed QP soft line break decoding, URL extraction, and content cleaning
This commit is contained in:
284
skills/aidotnet-web-scraper/scripts/fetch_page.py
Normal file
284
skills/aidotnet-web-scraper/scripts/fetch_page.py
Normal file
@@ -0,0 +1,284 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Fetch a web page and extract readable content as clean Markdown.
|
||||
|
||||
Uses requests + BeautifulSoup + readability-lxml + html2text for lightweight,
|
||||
fast extraction without a headless browser. Works well for articles, docs,
|
||||
blogs, wikis, and most static websites.
|
||||
|
||||
Dependencies: pip install requests beautifulsoup4 readability-lxml html2text
|
||||
"""
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
|
||||
def setup_encoding():
    """Force UTF-8 text output on the Windows console.

    Does nothing unless ``sys.platform`` is ``"win32"``. On Windows, stdout
    and stderr are reconfigured in place; if that is not supported, both
    streams are replaced by line-buffered UTF-8 wrappers around their binary
    buffers. Characters that cannot be encoded are replaced, not raised.
    """
    if sys.platform != "win32":
        return

    import io

    utf8 = {"encoding": "utf-8", "errors": "replace"}
    try:
        sys.stdout.reconfigure(**utf8)
        sys.stderr.reconfigure(**utf8)
    except (AttributeError, io.UnsupportedOperation):
        # The stream cannot be reconfigured in place, so rewrap both raw
        # buffers instead.
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, line_buffering=True, **utf8)
        sys.stderr = io.TextIOWrapper(sys.stderr.buffer, line_buffering=True, **utf8)
|
||||
|
||||
|
||||
def check_dependencies():
    """Exit with status 1 unless every required third-party package imports.

    The probes run in a fixed order: requests, bs4 (must expose
    ``BeautifulSoup``), readability (must expose ``Document``) and html2text.
    If any probe fails, the pip names of the missing packages and a matching
    ``pip install`` command are printed to stderr before exiting.
    """
    import importlib

    # (import name, attribute that must be present or None, pip package name)
    probes = (
        ("requests", None, "requests"),
        ("bs4", "BeautifulSoup", "beautifulsoup4"),
        ("readability", "Document", "readability-lxml"),
        ("html2text", None, "html2text"),
    )

    missing = []
    for module_name, attribute, package in probes:
        try:
            module = importlib.import_module(module_name)
            if attribute is not None and not hasattr(module, attribute):
                raise ImportError(f"cannot import name {attribute!r} from {module_name!r}")
        except ImportError:
            missing.append(package)

    if not missing:
        return

    print(f"Error: missing dependencies: {', '.join(missing)}", file=sys.stderr)
    print(f"Install with: pip install {' '.join(missing)}", file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
|
||||
def fetch_url(url, timeout=30):
    """Download *url* with browser-like headers and return its decoded text.

    Args:
        url: Absolute http(s) URL to request.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        A ``(text, final_url, status_code)`` tuple. ``final_url`` is the URL
        of the response after redirects have been followed.

    Any failure (timeout, connection error, HTTP error status, or anything
    else raised while fetching/decoding) is reported on stderr as
    ``Error: ...`` and terminates the process with exit status 1.
    """
    import requests

    # Accept-Encoding is deliberately NOT set here. requests already sends an
    # Accept-Encoding header listing only the codecs it can decode in this
    # environment. Hard-coding "br" asks servers for brotli even when no
    # brotli decoder is installed, which yields an undecodable body.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
    }

    try:
        resp = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
        resp.raise_for_status()

        # When the reported encoding is something other than UTF-8, prefer the
        # encoding detected from the body and keep the reported one only as a
        # fallback.
        if resp.encoding and resp.encoding.lower() != 'utf-8':
            resp.encoding = resp.apparent_encoding or resp.encoding

        return resp.text, resp.url, resp.status_code
    except requests.exceptions.Timeout:
        print(f"Error: request timed out after {timeout}s", file=sys.stderr)
        sys.exit(1)
    except requests.exceptions.ConnectionError as e:
        print(f"Error: connection failed: {e}", file=sys.stderr)
        sys.exit(1)
    except requests.exceptions.HTTPError as e:
        print(f"Error: HTTP {e.response.status_code}: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary of a CLI script: report and exit rather than
        # dumping a traceback.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
def extract_with_readability(html, url):
    """Run readability-lxml over *html* and return ``(title, content_html)``.

    ``title`` comes from ``Document.short_title()`` and ``content_html`` from
    ``Document.summary()``; *url* is passed through to ``Document``.
    """
    from readability import Document

    article = Document(html, url=url)
    return article.short_title(), article.summary()
|
||||
|
||||
|
||||
def extract_with_selector(html, selector):
    """Return the HTML of every element matching a CSS selector.

    The document is parsed with BeautifulSoup's ``html.parser``. The
    serialized matches are joined with newlines in document order; ``None``
    is returned when nothing matches.
    """
    from bs4 import BeautifulSoup

    matches = BeautifulSoup(html, "html.parser").select(selector)
    if matches:
        return "\n".join(str(node) for node in matches)
    return None
|
||||
|
||||
|
||||
def html_to_markdown(html, base_url=None):
    """Render *html* as Markdown using html2text.

    Links, images and emphasis are kept, lines are not wrapped, and runs of
    three or more newlines in the result are collapsed to a single blank
    line. When *base_url* is truthy it is set as the converter's ``baseurl``.
    The returned text has leading and trailing whitespace stripped.
    """
    import re

    import html2text

    options = {
        "body_width": 0,  # 0 disables line wrapping
        "ignore_images": False,
        "ignore_links": False,
        "ignore_emphasis": False,
        "protect_links": True,
        "unicode_snob": True,
        "mark_code": True,
        "wrap_links": False,
        "single_line_break": False,
    }

    converter = html2text.HTML2Text()
    for option, value in options.items():
        setattr(converter, option, value)
    if base_url:
        converter.baseurl = base_url

    markdown = converter.handle(html)

    # Squeeze excessive vertical whitespace down to one blank line.
    return re.sub(r'\n{3,}', '\n\n', markdown).strip()
|
||||
|
||||
|
||||
def extract_metadata(html):
    """Collect descriptive metadata from an HTML document.

    Returns a dict that may contain, in this order: ``title`` (text of the
    ``<title>`` tag), ``description``, ``og_title`` / ``og_description`` /
    ``og_type`` / ``og_site_name``, ``author`` and ``published``. A key is
    present only when the corresponding tag exists and, for ``<meta>`` tags,
    has a non-empty ``content`` attribute.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "html.parser")

    def content_of(tag):
        """Return the stripped ``content`` of *tag*, or None if unusable."""
        if tag and tag.get("content"):
            return tag["content"].strip()
        return None

    found = {}

    page_title = soup.find("title")
    if page_title:
        found["title"] = page_title.get_text(strip=True)

    description = content_of(soup.find("meta", attrs={"name": "description"}))
    if description is not None:
        found["description"] = description

    # Open Graph properties are stored under og_<name> keys.
    for og_property in ("og:title", "og:description", "og:type", "og:site_name"):
        og_value = content_of(soup.find("meta", attrs={"property": og_property}))
        if og_value is not None:
            found[og_property.replace("og:", "og_")] = og_value

    author = content_of(soup.find("meta", attrs={"name": "author"}))
    if author is not None:
        found["author"] = author

    # First usable date wins; each candidate is looked up as a property first
    # and as a name only when no property tag exists at all.
    for date_key in ("article:published_time", "datePublished", "date"):
        candidate = (
            soup.find("meta", attrs={"property": date_key})
            or soup.find("meta", attrs={"name": date_key})
        )
        published = content_of(candidate)
        if published is not None:
            found["published"] = published
            break

    return found
|
||||
|
||||
|
||||
def _build_arg_parser():
    """Create the command-line parser for the page-fetch script."""
    parser = argparse.ArgumentParser(
        description="Fetch a web page and extract content as Markdown"
    )
    parser.add_argument("url", help="URL to fetch")
    parser.add_argument("--raw", action="store_true",
                        help="Output full page Markdown (no readability extraction)")
    parser.add_argument("--selector", type=str, default=None,
                        help="CSS selector to extract specific elements")
    parser.add_argument("--save", type=str, default=None,
                        help="Also save output to this file path")
    parser.add_argument("--max-length", type=int, default=None,
                        help="Truncate output to N characters")
    parser.add_argument("--timeout", type=int, default=30,
                        help="Request timeout in seconds (default: 30)")
    parser.add_argument("--no-metadata", action="store_true",
                        help="Skip metadata header in output")
    return parser


def _normalize_url(raw_url):
    """Strip whitespace and prepend ``https://`` when no http(s) scheme is given."""
    url = raw_url.strip()
    # URL schemes are case-insensitive, so "HTTP://host" must not be turned
    # into "https://HTTP://host".
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    return url


def _build_output(title, meta, final_url, content_md, include_metadata):
    """Join the optional metadata header and the Markdown body into one string."""
    parts = []

    if include_metadata and meta:
        parts.append(f"# {title or meta.get('title', 'Untitled')}")
        parts.append(f"\n**Source**: {final_url}")
        if meta.get("author"):
            parts.append(f"**Author**: {meta['author']}")
        if meta.get("published"):
            parts.append(f"**Published**: {meta['published']}")
        if meta.get("description"):
            parts.append(f"**Description**: {meta['description']}")
        parts.append("\n---\n")
    elif title and include_metadata:
        # No metadata was found, but a title is available.
        parts.append(f"# {title}\n")

    parts.append(content_md)
    return "\n".join(parts)


def _truncate(output, max_length):
    """Cut *output* to *max_length* characters and append a truncation notice.

    ``None``, zero and negative limits mean "no truncation"; a negative limit
    must not be used as a slice bound because it would drop text from the end.
    """
    if max_length is None or max_length <= 0 or len(output) <= max_length:
        return output
    return output[:max_length] + f"\n\n[... truncated at {max_length} characters, total {len(output)}]"


def main():
    """Command-line entry point: fetch a URL and print it as Markdown.

    Steps:
        1. Prepare console encoding and verify dependencies.
        2. Parse arguments and normalize the URL.
        3. Fetch the page; progress and diagnostics go to stderr.
        4. Extract content in one of three modes: ``--selector`` (CSS
           selector), ``--raw`` (whole page) or readability (default).
        5. Print the optional metadata header plus content to stdout,
           truncated to ``--max-length`` if given.
        6. Optionally write the same output to ``--save``.

    Exits with status 0 after printing a placeholder when ``--selector``
    matches nothing. A failure to save is reported on stderr but does not
    change the exit status.
    """
    setup_encoding()
    check_dependencies()

    args = _build_arg_parser().parse_args()
    include_metadata = not args.no_metadata

    url = _normalize_url(args.url)
    print(f"Fetching: {url}", file=sys.stderr)

    # Fetch
    html, final_url, status = fetch_url(url, timeout=args.timeout)
    # html is decoded text, so its length is a character count, not bytes.
    print(f"Status: {status}, Size: {len(html)} characters", file=sys.stderr)

    if final_url != url:
        print(f"Redirected to: {final_url}", file=sys.stderr)

    # Extract metadata
    meta = extract_metadata(html) if include_metadata else {}

    # Extract content
    if args.selector:
        # CSS selector mode
        selected_html = extract_with_selector(html, args.selector)
        if not selected_html:
            print(f"Warning: no elements matched selector '{args.selector}'", file=sys.stderr)
            print(f"[No elements matched CSS selector: {args.selector}]")
            sys.exit(0)
        title = meta.get("title", "")
        content_md = html_to_markdown(selected_html, base_url=final_url)
    elif args.raw:
        # Raw full-page mode
        title = meta.get("title", "")
        content_md = html_to_markdown(html, base_url=final_url)
    else:
        # Readability extraction mode (default)
        title, article_html = extract_with_readability(html, final_url)
        content_md = html_to_markdown(article_html, base_url=final_url)

    output = _build_output(title, meta, final_url, content_md, include_metadata)
    output = _truncate(output, args.max_length)

    # Print to stdout
    print(output)
    print(f"\nExtracted: {len(content_md)} characters", file=sys.stderr)

    # Save to file if requested
    if args.save:
        try:
            with open(args.save, "w", encoding="utf-8") as f:
                f.write(output)
            print(f"Saved to: {args.save}", file=sys.stderr)
        except Exception as e:
            # Saving is best-effort: the content has already been printed.
            print(f"Error saving file: {e}", file=sys.stderr)
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user