mirror of
https://github.com/Tony0410/readlater.git
synced 2026-05-24 22:01:41 +08:00
- New "Content Capture" bookmarklet sends page HTML directly
- Works for paywalled sites (Economist, NYT, etc.) when logged in
- Works for Cloudflare-protected sites
- Added POST handler to /api/save for HTML content
- Added extractFromHtml() for processing captured content
- Improved 403 error message with bookmarklet suggestion
- Updated bookmarklet page with both options

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
121 lines
3.6 KiB
TypeScript
121 lines
3.6 KiB
TypeScript
import { Readability } from "@mozilla/readability";
|
|
import { JSDOM } from "jsdom";
|
|
|
|
/**
 * Normalised result of running Readability over a page, as produced by the
 * extractor functions in this module.
 */
export interface ExtractedArticle {
  /** Article title; the extractors substitute a fallback such as "Untitled" when none is found. */
  title: string;
  /** Byline reported by Readability, or `null` when it reports none. */
  author: string | null;
  /** Site name reported by Readability; the extractors fall back to the URL's hostname. */
  siteName: string | null;
  /** Excerpt reported by Readability, or `null` when it reports none. */
  excerpt: string | null;
  /** Readability's `content` value for the article (empty string when missing). */
  content: string;
  /** Readability's `textContent` value for the article (empty string when missing). */
  textContent: string;
  /** Value taken from the page's `og:image` meta tag, or `null` when the tag is absent. */
  leadImage: string | null;
  /** Number of whitespace-separated tokens in `textContent`. */
  wordCount: number;
}
|
|
|
|
/**
 * Downloads the page at `url` and extracts its main article with Readability.
 *
 * @param url - Absolute URL of the page to fetch. It is also handed to jsdom as
 *   the document URL and used as the hostname fallback for `siteName`.
 * @returns The extracted article, including a whitespace-based word count and
 *   the page's `og:image` (resolved to an absolute URL) when present.
 * @throws Error when the HTTP response is not OK (403 gets a dedicated message
 *   pointing the user at the bookmarklet) or when Readability finds no article.
 */
export async function extractArticle(url: string): Promise<ExtractedArticle> {
  // Fetch the page with browser-like headers to avoid bot detection
  const response = await fetch(url, {
    headers: {
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.9",
      "Accept-Encoding": "gzip, deflate, br",
      "Cache-Control": "no-cache",
      "Pragma": "no-cache",
      "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
      "Sec-Ch-Ua-Mobile": "?0",
      "Sec-Ch-Ua-Platform": '"macOS"',
      "Sec-Fetch-Dest": "document",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-User": "?1",
      "Upgrade-Insecure-Requests": "1",
    },
  });

  if (!response.ok) {
    if (response.status === 403) {
      throw new Error(`This site blocks automated access (403 Forbidden). Try using the bookmarklet from the article page instead - it can capture content your browser can see.`);
    }
    throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
  }

  const html = await response.text();
  const dom = new JSDOM(html, { url });
  const document = dom.window.document;

  // Read the lead image before parsing: Readability.parse() mutates the
  // document it is given, so metadata is safest taken from the pristine DOM.
  let leadImage: string | null = null;
  const ogImage = document.querySelector('meta[property="og:image"]');
  if (ogImage) {
    const rawImage = (ogImage.getAttribute("content") || "").trim();
    if (rawImage) {
      try {
        // og:image may be relative ("/img/a.png") or protocol-relative;
        // resolve it against the document base so it is usable off-site.
        leadImage = new URL(rawImage, document.baseURI).href;
      } catch {
        // Unparseable value: hand back what the page declared rather than drop it.
        leadImage = rawImage;
      }
    }
  }

  // Extract using Readability
  const article = new Readability(document).parse();
  if (!article) {
    throw new Error("Could not extract article content");
  }

  const textContent = article.textContent || "";
  const content = article.content || "";

  // Word count = number of non-empty whitespace-separated tokens.
  const wordCount = textContent.split(/\s+/).filter(Boolean).length;

  return {
    // `||` (not `??`) is deliberate: an empty string should also trigger the fallback.
    title: article.title || "Untitled",
    author: article.byline || null,
    siteName: article.siteName || new URL(url).hostname,
    excerpt: article.excerpt || null,
    content,
    textContent,
    leadImage,
    wordCount,
  };
}
|
|
|
|
/**
 * Extracts an article from HTML that was captured elsewhere (for the
 * bookmarklet with content capture), without fetching anything.
 *
 * @param html - Full HTML source of the page.
 * @param url - URL the HTML came from. It is handed to jsdom as the document
 *   URL and used as the hostname fallback for `siteName`.
 * @param fallbackTitle - Title to use when Readability reports none; if this is
 *   also missing or empty the title becomes "Untitled".
 * @returns The extracted article, including a whitespace-based word count and
 *   the page's `og:image` (resolved to an absolute URL) when present.
 * @throws Error when Readability cannot find article content in `html`.
 */
export async function extractFromHtml(
  html: string,
  url: string,
  fallbackTitle?: string
): Promise<ExtractedArticle> {
  const dom = new JSDOM(html, { url });
  const document = dom.window.document;

  // Read the lead image before parsing: Readability.parse() mutates the
  // document it is given, so metadata is safest taken from the pristine DOM.
  let leadImage: string | null = null;
  const ogImage = document.querySelector('meta[property="og:image"]');
  if (ogImage) {
    const rawImage = (ogImage.getAttribute("content") || "").trim();
    if (rawImage) {
      try {
        // og:image may be relative ("/img/a.png") or protocol-relative;
        // resolve it against the document base so it is usable off-site.
        leadImage = new URL(rawImage, document.baseURI).href;
      } catch {
        // Unparseable value: hand back what the page declared rather than drop it.
        leadImage = rawImage;
      }
    }
  }

  // Extract using Readability
  const article = new Readability(document).parse();
  if (!article) {
    throw new Error("Could not extract article content from provided HTML");
  }

  const textContent = article.textContent || "";
  const content = article.content || "";

  // Word count = number of non-empty whitespace-separated tokens.
  const wordCount = textContent.split(/\s+/).filter(Boolean).length;

  return {
    // `||` (not `??`) is deliberate: empty strings should fall through to the next candidate.
    title: article.title || fallbackTitle || "Untitled",
    author: article.byline || null,
    siteName: article.siteName || new URL(url).hostname,
    excerpt: article.excerpt || null,
    content,
    textContent,
    leadImage,
    wordCount,
  };
}
|