import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom";

/** Normalized result of running Readability over a page's HTML. */
export interface ExtractedArticle {
  /** Article title; falls back to a caller-supplied title or "Untitled". */
  title: string;
  /** Readability's byline, or null when none was found. */
  author: string | null;
  /** Readability's site name, or the URL's hostname when none was found. */
  siteName: string | null;
  /** Readability's excerpt, or null when none was found. */
  excerpt: string | null;
  /** Article HTML as produced by Readability ("" when absent). */
  content: string;
  /** Article plain text as produced by Readability ("" when absent). */
  textContent: string;
  /** Absolute URL taken from the page's `og:image` meta tag, or null. */
  leadImage: string | null;
  /** Number of whitespace-separated tokens in `textContent`. */
  wordCount: number;
}

// Browser-like request headers, sent to avoid bot detection.
const BROWSER_HEADERS: Record<string, string> = {
  "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Accept":
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
  "Accept-Encoding": "gzip, deflate, br",
  "Cache-Control": "no-cache",
  "Pragma": "no-cache",
  "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
  "Sec-Ch-Ua-Mobile": "?0",
  "Sec-Ch-Ua-Platform": '"macOS"',
  "Sec-Fetch-Dest": "document",
  "Sec-Fetch-Mode": "navigate",
  "Sec-Fetch-Site": "none",
  "Sec-Fetch-User": "?1",
  "Upgrade-Insecure-Requests": "1",
};

/**
 * Shared extraction pipeline: parse `html` with JSDOM, run Readability,
 * then collect the lead image and word count.
 *
 * @param html - Raw HTML of the page.
 * @param url - Page URL; used as the JSDOM document URL, as the base for
 *   resolving a relative `og:image`, and as the source of the hostname
 *   fallback for `siteName`.
 * @param failureMessage - Message of the Error thrown when Readability
 *   returns no article.
 * @param fallbackTitle - Title to use when Readability finds none.
 * @throws Error when Readability cannot extract an article.
 */
function buildExtractedArticle(
  html: string,
  url: string,
  failureMessage: string,
  fallbackTitle?: string
): ExtractedArticle {
  const dom = new JSDOM(html, { url });
  const document = dom.window.document;

  // Extract using Readability
  const article = new Readability(document).parse();
  if (!article) {
    throw new Error(failureMessage);
  }

  // Try to find the lead image. The meta content may be empty or a relative
  // path, so blank values become null and everything else is resolved
  // against the page URL.
  let leadImage: string | null = null;
  const rawImage = document
    .querySelector('meta[property="og:image"]')
    ?.getAttribute("content")
    ?.trim();
  if (rawImage) {
    try {
      leadImage = new URL(rawImage, url).href;
    } catch {
      // Unparseable value: keep what the page declared rather than drop it.
      leadImage = rawImage;
    }
  }

  const textContent = article.textContent || "";
  const content = article.content || "";

  // Calculate word count; filter(Boolean) drops the empty tokens produced by
  // leading/trailing whitespace.
  const wordCount = textContent.split(/\s+/).filter(Boolean).length;

  return {
    // `||` (not `??`) on purpose: an empty-string title should also fall back.
    title: article.title || fallbackTitle || "Untitled",
    author: article.byline || null,
    siteName: article.siteName || new URL(url).hostname,
    excerpt: article.excerpt || null,
    content,
    textContent,
    leadImage,
    wordCount,
  };
}

/**
 * Fetch `url` with browser-like headers and extract its main article.
 *
 * @param url - Absolute URL of the page to fetch.
 * @returns The extracted article.
 * @throws Error when the response is not OK (with a dedicated message for
 *   403) or when Readability cannot extract an article.
 */
export async function extractArticle(url: string): Promise<ExtractedArticle> {
  // Fetch the page with browser-like headers to avoid bot detection
  const response = await fetch(url, { headers: BROWSER_HEADERS });

  if (!response.ok) {
    if (response.status === 403) {
      throw new Error(
        "This site blocks automated access (403 Forbidden). \n" +
          "Try using the bookmarklet from the article page instead - it can capture content your browser can see."
      );
    }
    throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
  }

  const html = await response.text();
  return buildExtractedArticle(html, url, "Could not extract article content");
}

/**
 * Extract an article from provided HTML content (for the bookmarklet with
 * content capture). Performs no network request.
 *
 * @param html - HTML captured from the page.
 * @param url - URL the HTML was captured from.
 * @param fallbackTitle - Title to use when Readability finds none.
 * @returns The extracted article.
 * @throws Error when Readability cannot extract an article.
 */
export async function extractFromHtml(
  html: string,
  url: string,
  fallbackTitle?: string
): Promise<ExtractedArticle> {
  return buildExtractedArticle(
    html,
    url,
    "Could not extract article content from provided HTML",
    fallbackTitle
  );
}