#!/usr/bin/env python3
"""Fetch a web page and extract readable content as clean Markdown.

Uses requests + BeautifulSoup + readability-lxml + html2text for lightweight,
fast extraction without a headless browser. Works well for articles, docs,
blogs, wikis, and most static websites.

Dependencies:
    pip install requests beautifulsoup4 readability-lxml html2text
"""

import sys
import argparse
import re

# Three or more consecutive newlines, i.e. two or more blank lines in a row.
_EXCESS_BLANK_LINES = re.compile(r'\n{3,}')

# Schemes that mark a URL as already absolute for the purposes of this tool.
_URL_SCHEMES = ("http://", "https://")


def setup_encoding():
    """Force UTF-8 on stdout/stderr when running on Windows.

    Does nothing on other platforms. ``reconfigure`` is tried first; if the
    stream does not support it, the stream is replaced with a UTF-8
    ``TextIOWrapper`` around its underlying buffer. Characters that cannot be
    encoded are replaced rather than raising.
    """
    if sys.platform != "win32":
        return

    import io

    try:
        sys.stdout.reconfigure(encoding='utf-8', errors='replace')
        sys.stderr.reconfigure(encoding='utf-8', errors='replace')
    except (AttributeError, io.UnsupportedOperation):
        sys.stdout = io.TextIOWrapper(
            sys.stdout.buffer, encoding='utf-8', errors='replace',
            line_buffering=True
        )
        sys.stderr = io.TextIOWrapper(
            sys.stderr.buffer, encoding='utf-8', errors='replace',
            line_buffering=True
        )


def check_dependencies():
    """Verify the third-party packages are importable; exit(1) if any is not.

    The names reported are the pip distribution names, so the printed
    ``pip install`` command can be copied verbatim.
    """
    missing = []
    try:
        import requests  # noqa: F401
    except ImportError:
        missing.append("requests")
    try:
        from bs4 import BeautifulSoup  # noqa: F401
    except ImportError:
        missing.append("beautifulsoup4")
    try:
        # Importing the class (not just the module) also fails when a
        # different distribution provides a "readability" module without it.
        from readability import Document  # noqa: F401
    except ImportError:
        missing.append("readability-lxml")
    try:
        import html2text  # noqa: F401
    except ImportError:
        missing.append("html2text")

    if missing:
        print(f"Error: missing dependencies: {', '.join(missing)}", file=sys.stderr)
        print(f"Install with: pip install {' '.join(missing)}", file=sys.stderr)
        sys.exit(1)


def fetch_url(url, timeout=30):
    """Download *url* and return ``(text, final_url, status_code)``.

    Args:
        url: Absolute http(s) URL to request.
        timeout: Request timeout in seconds, passed to ``requests.get``.

    Returns:
        A tuple of the decoded response body, the URL after redirects, and
        the HTTP status code.

    Any request failure (timeout, connection error, HTTP error status, or
    other exception) is reported on stderr and terminates the process with
    exit status 1; this function never raises to its caller.
    """
    import requests

    # No explicit Accept-Encoding: requests advertises only the codings it is
    # able to decode. Hard-coding "br" here would invite Brotli responses that
    # cannot be decompressed unless an optional Brotli package is installed.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
    }

    try:
        resp = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
        resp.raise_for_status()

        # When the declared encoding is anything other than UTF-8, prefer the
        # encoding guessed from the body, keeping the declared one as fallback.
        if resp.encoding and resp.encoding.lower() != 'utf-8':
            resp.encoding = resp.apparent_encoding or resp.encoding

        return resp.text, resp.url, resp.status_code
    except requests.exceptions.Timeout:
        print(f"Error: request timed out after {timeout}s", file=sys.stderr)
        sys.exit(1)
    except requests.exceptions.ConnectionError as e:
        print(f"Error: connection failed: {e}", file=sys.stderr)
        sys.exit(1)
    except requests.exceptions.HTTPError as e:
        print(f"Error: HTTP {e.response.status_code}: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level CLI boundary: report anything else and exit cleanly.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


def extract_with_readability(html, url):
    """Return ``(title, content_html)`` for the main article via readability-lxml.

    Args:
        html: Full page HTML.
        url: Page URL, handed to readability's ``Document``.
    """
    from readability import Document

    doc = Document(html, url=url)
    title = doc.short_title()
    content_html = doc.summary()
    return title, content_html


def extract_with_selector(html, selector):
    """Return the HTML of every element matching a CSS selector.

    Args:
        html: Full page HTML.
        selector: CSS selector passed to BeautifulSoup's ``select``.

    Returns:
        The matching elements serialized and joined with newlines, or
        ``None`` when nothing matches.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "html.parser")
    elements = soup.select(selector)
    if not elements:
        return None
    return "\n".join(str(el) for el in elements)


def html_to_markdown(html, base_url=None):
    """Convert an HTML string to Markdown with html2text.

    Args:
        html: HTML to convert.
        base_url: If given, set as the converter's ``baseurl``.

    Returns:
        The Markdown text, with runs of blank lines collapsed to a single
        blank line and surrounding whitespace stripped.
    """
    import html2text

    converter = html2text.HTML2Text()
    converter.body_width = 0  # Don't wrap lines
    converter.ignore_images = False
    converter.ignore_links = False
    converter.ignore_emphasis = False
    converter.protect_links = True
    converter.unicode_snob = True
    converter.mark_code = True
    converter.wrap_links = False
    converter.single_line_break = False
    if base_url:
        converter.baseurl = base_url

    md = converter.handle(html)
    md = _EXCESS_BLANK_LINES.sub('\n\n', md)
    return md.strip()


def extract_metadata(html):
    """Collect page metadata from ``<title>`` and ``<meta>`` tags.

    Returns:
        A dict that may contain ``title``, ``description``, ``og_title``,
        ``og_description``, ``og_type``, ``og_site_name``, ``author`` and
        ``published``. Keys are present only when the corresponding tag
        exists with non-empty content.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "html.parser")
    meta = {}

    title_tag = soup.find("title")
    if title_tag:
        meta["title"] = title_tag.get_text(strip=True)

    desc_tag = soup.find("meta", attrs={"name": "description"})
    if desc_tag and desc_tag.get("content"):
        meta["description"] = desc_tag["content"].strip()

    # Open Graph tags are stored under og_* keys (colon replaced).
    for prop in ["og:title", "og:description", "og:type", "og:site_name"]:
        tag = soup.find("meta", attrs={"property": prop})
        if tag and tag.get("content"):
            meta[prop.replace("og:", "og_")] = tag["content"].strip()

    author_tag = soup.find("meta", attrs={"name": "author"})
    if author_tag and author_tag.get("content"):
        meta["author"] = author_tag["content"].strip()

    # First candidate with content wins; each is looked up as a `property`
    # attribute first and as a `name` attribute second.
    for attr in ["article:published_time", "datePublished", "date"]:
        date_tag = (
            soup.find("meta", attrs={"property": attr})
            or soup.find("meta", attrs={"name": attr})
        )
        if date_tag and date_tag.get("content"):
            meta["published"] = date_tag["content"].strip()
            break

    return meta


def build_parser():
    """Create the command-line argument parser for this tool."""
    parser = argparse.ArgumentParser(
        description="Fetch a web page and extract content as Markdown"
    )
    parser.add_argument("url", help="URL to fetch")
    parser.add_argument("--raw", action="store_true",
                        help="Output full page Markdown (no readability extraction)")
    parser.add_argument("--selector", type=str, default=None,
                        help="CSS selector to extract specific elements")
    parser.add_argument("--save", type=str, default=None,
                        help="Also save output to this file path")
    parser.add_argument("--max-length", type=int, default=None,
                        help="Truncate output to N characters")
    parser.add_argument("--timeout", type=int, default=30,
                        help="Request timeout in seconds (default: 30)")
    parser.add_argument("--no-metadata", action="store_true",
                        help="Skip metadata header in output")
    return parser


def normalize_url(url):
    """Strip whitespace and prepend ``https://`` when no http(s) scheme is present.

    The scheme check is case-insensitive, so ``HTTP://host`` is left as is
    instead of being turned into ``https://HTTP://host``.
    """
    url = url.strip()
    if not url.lower().startswith(_URL_SCHEMES):
        url = "https://" + url
    return url


def build_output(title, meta, content_md, source_url, include_metadata=True):
    """Assemble the final Markdown document.

    Args:
        title: Extracted title; may be empty.
        meta: Metadata dict as produced by ``extract_metadata``; may be empty.
        content_md: Markdown body.
        source_url: URL shown on the "Source" line of the header.
        include_metadata: When false, only ``content_md`` is returned.

    Returns:
        The body, preceded by a metadata header when metadata is available,
        or by a bare title heading when only a title is known.
    """
    parts = []
    if include_metadata and meta:
        parts.append(f"# {title or meta.get('title', 'Untitled')}")
        parts.append(f"\n**Source**: {source_url}")
        if meta.get("author"):
            parts.append(f"**Author**: {meta['author']}")
        if meta.get("published"):
            parts.append(f"**Published**: {meta['published']}")
        if meta.get("description"):
            parts.append(f"**Description**: {meta['description']}")
        parts.append("\n---\n")
    elif title and include_metadata:
        parts.append(f"# {title}\n")

    parts.append(content_md)
    return "\n".join(parts)


def truncate_output(text, max_length):
    """Cut *text* to *max_length* characters and append a truncation note.

    ``None``, ``0`` and negative limits mean "no limit"; text that already
    fits is returned unchanged. The note reports the limit and the original
    length, and is appended after the cut (so the result is longer than
    *max_length*).
    """
    if not max_length or max_length < 0 or len(text) <= max_length:
        return text
    return (
        text[:max_length]
        + f"\n\n[... truncated at {max_length} characters, total {len(text)}]"
    )


def main():
    """Command-line entry point: fetch, extract, convert, print, optionally save.

    Progress and diagnostics go to stderr; only the Markdown result goes to
    stdout.
    """
    setup_encoding()
    check_dependencies()

    parser = build_parser()
    args = parser.parse_args()
    if args.max_length is not None and args.max_length < 0:
        # A negative limit would slice from the end of the output.
        parser.error("--max-length must not be negative")

    url = normalize_url(args.url)
    print(f"Fetching: {url}", file=sys.stderr)

    html, final_url, status = fetch_url(url, timeout=args.timeout)
    # html is decoded text, so its length is a character count, not bytes.
    print(f"Status: {status}, Size: {len(html)} characters", file=sys.stderr)
    if final_url != url:
        print(f"Redirected to: {final_url}", file=sys.stderr)

    include_metadata = not args.no_metadata
    meta = extract_metadata(html) if include_metadata else {}

    if args.selector:
        # CSS selector mode
        selected_html = extract_with_selector(html, args.selector)
        if not selected_html:
            print(f"Warning: no elements matched selector '{args.selector}'", file=sys.stderr)
            print(f"[No elements matched CSS selector: {args.selector}]")
            sys.exit(0)
        title = meta.get("title", "")
        content_md = html_to_markdown(selected_html, base_url=final_url)
    elif args.raw:
        # Raw full-page mode
        title = meta.get("title", "")
        content_md = html_to_markdown(html, base_url=final_url)
    else:
        # Readability extraction mode (default)
        try:
            title, article_html = extract_with_readability(html, final_url)
        except Exception as e:
            # The parser can fail on empty or malformed documents; degrade to
            # converting the whole page instead of ending with a traceback.
            print(
                f"Warning: readability extraction failed ({e}); "
                "falling back to full page",
                file=sys.stderr,
            )
            title, article_html = meta.get("title", ""), html
        content_md = html_to_markdown(article_html, base_url=final_url)

    output = build_output(
        title, meta, content_md, final_url, include_metadata=include_metadata
    )
    output = truncate_output(output, args.max_length)

    print(output)

    content_length = len(content_md)
    print(f"\nExtracted: {content_length} characters", file=sys.stderr)

    if args.save:
        try:
            with open(args.save, "w", encoding="utf-8") as f:
                f.write(output)
            print(f"Saved to: {args.save}", file=sys.stderr)
        except (OSError, ValueError) as e:
            # Saving is best-effort: the result was already printed to stdout.
            print(f"Error saving file: {e}", file=sys.stderr)


if __name__ == "__main__":
    main()