Add content capture bookmarklet for paywalled sites

- New "Content Capture" bookmarklet sends page HTML directly
- Works for paywalled sites (Economist, NYT, etc.) when logged in
- Works for Cloudflare-protected sites
- Added POST handler to /api/save for HTML content
- Added extractFromHtml() for processing captured content
- Improved 403 error message with bookmarklet suggestion
- Updated bookmarklet page with both options

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-05-24 22:01:41 +08:00 · 2026-01-23 09:14:09 +00:00
parent 464f93a6aa
commit 1022b1ddca
3 changed files with 283 additions and 34 deletions
--- a/src/lib/utils/extract.ts
+++ b/src/lib/utils/extract.ts
@@ -13,14 +13,30 @@ export interface ExtractedArticle {
 }

 export async function extractArticle(url: string): Promise<ExtractedArticle> {
-  // Fetch the page
+  // Fetch the page with browser-like headers to avoid bot detection
  const response = await fetch(url, {
    headers: {
-      "User-Agent": "Mozilla/5.0 (compatible; ReadLater/1.0)",
+      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+      "Accept-Language": "en-US,en;q=0.9",
+      "Accept-Encoding": "gzip, deflate, br",
+      "Cache-Control": "no-cache",
+      "Pragma": "no-cache",
+      "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+      "Sec-Ch-Ua-Mobile": "?0",
+      "Sec-Ch-Ua-Platform": '"macOS"',
+      "Sec-Fetch-Dest": "document",
+      "Sec-Fetch-Mode": "navigate",
+      "Sec-Fetch-Site": "none",
+      "Sec-Fetch-User": "?1",
+      "Upgrade-Insecure-Requests": "1",
    },
  });

  if (!response.ok) {
+    if (response.status === 403) {
+      throw new Error(`This site blocks automated access (403 Forbidden). Try using the bookmarklet from the article page instead - it can capture content your browser can see.`);
+    }
    throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
  }

@@ -60,3 +76,45 @@ export async function extractArticle(url: string): Promise<ExtractedArticle> {
    wordCount,
  };
 }
+
+// Build an ExtractedArticle from caller-supplied HTML (e.g. captured by the bookmarklet) instead of fetching `url`; throws an Error if Readability finds no article.
+export async function extractFromHtml(
+  html: string,
+  url: string,
+  fallbackTitle?: string
+): Promise<ExtractedArticle> {
+  const dom = new JSDOM(html, { url }); // `url` is handed to JSDOM as the document's URL option
+  const document = dom.window.document;
+
+  // Run Readability over the parsed document; a falsy parse() result is treated as an extraction failure below
+  const reader = new Readability(document);
+  const article = reader.parse();
+
+  if (!article) {
+    throw new Error("Could not extract article content from provided HTML");
+  }
+
+  // Lead image: the `content` attribute of the og:image meta tag, or null when the tag (or attribute) is absent
+  let leadImage: string | null = null;
+  const ogImage = document.querySelector('meta[property="og:image"]');
+  if (ogImage) {
+    leadImage = ogImage.getAttribute("content");
+  }
+
+  const textContent = article.textContent || "";
+  const content = article.content || "";
+
+  // Word count = number of non-empty tokens after splitting the extracted text on whitespace
+  const wordCount = textContent.split(/\s+/).filter(Boolean).length;
+
+  return {
+    title: article.title || fallbackTitle || "Untitled", // Readability title, then caller's fallback, then "Untitled"
+    author: article.byline || null,
+    siteName: article.siteName || new URL(url).hostname, // falls back to the hostname of `url`
+    excerpt: article.excerpt || null,
+    content,
+    textContent,
+    leadImage,
+    wordCount,
+  };
+}