mirror of
https://github.com/Tony0410/readlater.git
synced 2026-05-24 13:52:03 +08:00
- Follow redirects to resolve shortened URLs (c.gle, etc.) - Extract actual article URL from Google News redirect pages - Fetch and extract the real article content instead of the redirect page Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
290 lines
9.2 KiB
TypeScript
290 lines
9.2 KiB
TypeScript
import { Readability } from "@mozilla/readability";
|
|
import { JSDOM, VirtualConsole } from "jsdom";
|
|
|
|
/**
 * Builds a JSDOM virtual console that discards "error" and "warn" events.
 *
 * The listeners are deliberate no-ops: pages using modern CSS (custom
 * properties, etc.) make JSDOM report parsing problems that have no effect
 * on Readability, which only inspects DOM structure.
 *
 * @returns A VirtualConsole with silent "error" and "warn" handlers attached.
 */
function createVirtualConsole() {
  // One shared no-op handler is enough; nothing emitted here is ever reported.
  const swallow = (): void => {};

  return new VirtualConsole().on("error", swallow).on("warn", swallow);
}
|
|
|
|
/**
 * Removes CSS from raw HTML before it is parsed by JSDOM.
 *
 * Readability only needs the DOM structure, so `<style>` elements and inline
 * `style` attributes are dropped to avoid JSDOM CSS parsing errors.
 *
 * @param html Raw HTML markup.
 * @returns The markup without `<style>` blocks or quoted `style="..."` /
 *          `style='...'` attributes; everything else is left untouched.
 */
function stripStyles(html: string): string {
  return (
    html
      // Drop <style> elements together with their contents. The \b keeps
      // longer tag names that merely start with "style" out of the match.
      .replace(/<style\b[^>]*>[\s\S]*?<\/style\s*>/gi, "")
      // Drop inline style attributes. Each quote style is matched against its
      // own terminator so a value such as style="font-family: 'Arial'" is
      // removed whole instead of being cut at the first inner quote.
      .replace(/\s+style\s*=\s*(?:"[^"]*"|'[^']*')/gi, "")
  );
}
|
|
|
|
/**
 * Normalized result produced by the article extractors in this module.
 */
export interface ExtractedArticle {
  // --- Identity / attribution ---

  /** Article title; the extractors fall back to a placeholder when none is found. */
  title: string;
  /** Byline reported by Readability, or null when absent. */
  author: string | null;
  /** Site name reported by Readability, otherwise the hostname of the article URL. */
  siteName: string | null;
  /** Publication date parsed from page metadata, or null when none could be parsed. */
  publishedAt: Date | null;

  // --- Body ---

  /** Short summary of the article, or null when unavailable. */
  excerpt: string | null;
  /** Extracted article body as an HTML string. */
  content: string;
  /** Plain-text version of the article body. */
  textContent: string;
  /** Number of whitespace-separated tokens in `textContent`. */
  wordCount: number;

  // --- Media ---

  /** Value of the page's `og:image` meta tag, or null when the tag is missing. */
  leadImage: string | null;
}
|
|
|
|
/**
 * Best-effort recovery of the publisher's article URL from the HTML of a
 * Google News redirect page.
 *
 * Strategies are tried in order and the first hit wins:
 *  1. the `data-n-au` attribute (article URL),
 *  2. `<link rel="canonical">` when it does not point at news.google.com,
 *  3. `<meta property="og:url">` when it does not point at news.google.com,
 *  4. the first non-Google http(s) URL embedded in a `jsdata` attribute.
 *
 * @param html Raw HTML of the Google News page.
 * @returns The article URL, or null when no strategy matches.
 */
function extractGoogleNewsUrl(html: string): string | null {
  // Attribute values scraped from raw markup are still entity-encoded; an
  // un-decoded "&amp;" would corrupt the query string of the fetched URL.
  const decodeEntities = (value: string): string => value.replace(/&amp;/gi, "&");

  // Reads a quoted attribute from the text of a single tag, or null if absent.
  const readAttribute = (tagText: string, name: string): string | null => {
    const attr = tagText.match(
      new RegExp(`\\s${name}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`, "i")
    );
    return attr ? attr[1] ?? attr[2] ?? null : null;
  };

  // Returns `valueAttr` of the first <tag> whose `keyAttr` equals `keyValue`
  // (case-insensitive) and that carries a non-empty `valueAttr`. Attribute
  // order inside the tag does not matter.
  const findTagValue = (
    tag: string,
    keyAttr: string,
    keyValue: string,
    valueAttr: string
  ): string | null => {
    const tagPattern = new RegExp(`<${tag}\\b[^>]*>`, "gi");
    let tagMatch: RegExpExecArray | null;
    while ((tagMatch = tagPattern.exec(html)) !== null) {
      const key = readAttribute(tagMatch[0], keyAttr);
      if (key === null || key.toLowerCase() !== keyValue) continue;
      const value = readAttribute(tagMatch[0], valueAttr);
      if (value) return decodeEntities(value);
    }
    return null;
  };

  // 1. data-n-au attribute (article URL)
  const dataMatch = html.match(/data-n-au="([^"]+)"/);
  if (dataMatch) {
    const raw = decodeEntities(dataMatch[1]);
    try {
      return decodeURIComponent(raw);
    } catch {
      // Malformed percent-escape: the raw value is still the best candidate.
      return raw;
    }
  }

  // 2. canonical link that is not Google News itself
  const canonical = findTagValue("link", "rel", "canonical", "href");
  if (canonical && !canonical.includes("news.google.com")) {
    return canonical;
  }

  // 3. og:url that is not Google News itself
  const ogUrl = findTagValue("meta", "property", "og:url", "content");
  if (ogUrl && !ogUrl.includes("news.google.com")) {
    return ogUrl;
  }

  // 4. article link inside jsdata. The URL is captured directly so that a
  // news.google.com URL appearing earlier in the same attribute is never
  // returned by mistake.
  const jsMatch = html.match(
    /jsdata="[^"]*?(https?:\/\/(?!news\.google\.com)[^"&\s]+)/
  );
  if (jsMatch) return jsMatch[1];

  return null;
}
|
|
|
|
/** Desktop Chrome user-agent string sent with every outbound request. */
const BROWSER_USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

/** Escapes a string for safe interpolation into a double-quoted HTML attribute. */
function escapeHtmlAttribute(value: string): string {
  return value
    .replace(/&/g, "&amp;")
    .replace(/"/g, "&quot;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");
}

/** True when `candidate` parses as a URL hosted on news.google.com (or a subdomain of it). */
function isGoogleNewsUrl(candidate: string): boolean {
  try {
    const { hostname } = new URL(candidate);
    return hostname === "news.google.com" || hostname.endsWith(".news.google.com");
  } catch {
    return false;
  }
}

/** Builds the placeholder article returned when the site answers 401/403. */
function buildBlockedArticle(url: string): ExtractedArticle {
  const hostname = new URL(url).hostname.replace(/^www\./, "");
  // The URL is caller-supplied, so it is escaped before being placed in markup.
  const safeHref = escapeHtmlAttribute(url);
  return {
    title: `Article from ${hostname}`,
    author: null,
    siteName: hostname,
    excerpt: "This site blocked automated access. Use 'Open original' to read, or the Content Capture bookmarklet to save the full article.",
    content: `<p>This site blocked automated access. <a href="${safeHref}" target="_blank">Open original article</a> to read.</p><p>Tip: Use the Content Capture bookmarklet from the article page to save the full content.</p>`,
    textContent: "This site blocked automated access. Open original article to read.",
    leadImage: null,
    wordCount: 0,
    publishedAt: null,
  };
}

/**
 * Fetches a web page and extracts its readable article content.
 *
 * Steps:
 *  1. Resolve shortened/redirecting URLs with a HEAD request (best effort).
 *  2. GET the page with browser-like headers to avoid bot detection.
 *  3. If the page is a Google News redirect page, try to locate and fetch the
 *     real article instead.
 *  4. Strip CSS, parse with JSDOM and run Readability.
 *  5. Collect the lead image and publish date from page metadata.
 *
 * @param url Address of the article (may be a short link or redirect).
 * @returns The extracted article. When the site answers 401 or 403 a
 *          placeholder article pointing back at `url` is returned instead.
 * @throws Error when the page request fails outright, when it returns a
 *         non-OK status other than 401/403, or when Readability cannot
 *         extract any article from the page.
 */
export async function extractArticle(url: string): Promise<ExtractedArticle> {
  // Follow redirects up front to get the final URL of shortened links.
  let resolvedUrl = url;
  try {
    const headResponse = await fetch(url, {
      method: "HEAD",
      redirect: "follow",
      headers: {
        "User-Agent": BROWSER_USER_AGENT,
      },
    });
    if (headResponse.url) {
      resolvedUrl = headResponse.url;
    }
  } catch {
    // Deliberate best effort: if HEAD fails, continue with the original URL.
  }

  // Fetch the page with browser-like headers to avoid bot detection.
  const response = await fetch(resolvedUrl, {
    headers: {
      "User-Agent": BROWSER_USER_AGENT,
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.9",
      "Accept-Encoding": "gzip, deflate, br",
      "Cache-Control": "no-cache",
      "Pragma": "no-cache",
      "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
      "Sec-Ch-Ua-Mobile": "?0",
      "Sec-Ch-Ua-Platform": '"macOS"',
      "Sec-Fetch-Dest": "document",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-User": "?1",
      "Upgrade-Insecure-Requests": "1",
    },
  });

  if (!response.ok) {
    // On 401/403 (blocked), return a minimal article with just URL info.
    if (response.status === 403 || response.status === 401) {
      return buildBlockedArticle(url);
    }
    throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
  }

  let html = await response.text();
  // The GET may have followed redirects the HEAD request did not reveal (or
  // the HEAD may have failed), so prefer the URL the body actually came from.
  let finalUrl = response.url || resolvedUrl;

  // If we landed on Google News, try to swap in the actual article. The check
  // is on the hostname so that a URL merely mentioning "news.google.com" in
  // its path or query is not mistaken for a Google News page.
  if (isGoogleNewsUrl(finalUrl)) {
    const realUrl = extractGoogleNewsUrl(html);
    if (realUrl) {
      try {
        const articleResponse = await fetch(realUrl, {
          headers: {
            "User-Agent": BROWSER_USER_AGENT,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
          },
        });
        if (articleResponse.ok) {
          html = await articleResponse.text();
          finalUrl = articleResponse.url || realUrl;
        }
      } catch {
        // Same fallback as a non-OK answer: an unusable or unreachable
        // extracted URL leaves us with the page that was already fetched.
      }
    }
  }

  const dom = new JSDOM(stripStyles(html), {
    url: finalUrl,
    virtualConsole: createVirtualConsole(),
  });
  const document = dom.window.document;

  // Extract using Readability. Metadata is queried only afterwards, matching
  // the order the rest of this function has always used.
  const article = new Readability(document).parse();
  if (!article) {
    throw new Error("Could not extract article content");
  }

  // Try to find lead image.
  const ogImage = document.querySelector('meta[property="og:image"]');
  const leadImage: string | null = ogImage ? ogImage.getAttribute("content") : null;

  // Try to find publish date from various meta tags; first parseable one wins.
  const dateSelectors = [
    'meta[property="article:published_time"]',
    'meta[name="article:published_time"]',
    'meta[property="og:published_time"]',
    'meta[name="pubdate"]',
    'meta[name="publishdate"]',
    'meta[name="date"]',
    'meta[itemprop="datePublished"]',
    'time[datetime]',
    'time[pubdate]',
  ];

  let publishedAt: Date | null = null;
  for (const selector of dateSelectors) {
    const el = document.querySelector(selector);
    if (!el) continue;
    const dateStr = el.getAttribute("content") || el.getAttribute("datetime");
    if (!dateStr) continue;
    const parsed = new Date(dateStr);
    if (!isNaN(parsed.getTime())) {
      publishedAt = parsed;
      break;
    }
  }

  const textContent = article.textContent || "";
  const content = article.content || "";

  // Word count = number of whitespace-separated, non-empty tokens.
  const wordCount = textContent.split(/\s+/).filter(Boolean).length;

  return {
    title: article.title || "Untitled",
    author: article.byline || null,
    siteName: article.siteName || new URL(finalUrl).hostname,
    excerpt: article.excerpt || null,
    content,
    textContent,
    leadImage,
    wordCount,
    publishedAt,
  };
}
|
|
|
|
/**
 * Extracts an article from HTML that was captured elsewhere (for example by
 * the bookmarklet with content capture) instead of being fetched here.
 *
 * @param html Captured page markup.
 * @param url Address the markup came from; used as the JSDOM document URL and
 *            as the source of the fallback site name.
 * @param fallbackTitle Title to use when Readability does not report one.
 * @returns The extracted article.
 * @throws Error when Readability cannot extract an article from the markup.
 */
export async function extractFromHtml(
  html: string,
  url: string,
  fallbackTitle?: string
): Promise<ExtractedArticle> {
  const { document } = new JSDOM(stripStyles(html), {
    url,
    virtualConsole: createVirtualConsole(),
  }).window;

  // Run Readability first; the metadata lookups below happen after parsing.
  const parsedArticle = new Readability(document).parse();
  if (!parsedArticle) {
    throw new Error("Could not extract article content from provided HTML");
  }

  // Lead image comes from the og:image meta tag, when present.
  const ogImageTag = document.querySelector('meta[property="og:image"]');
  const leadImage: string | null = ogImageTag
    ? ogImageTag.getAttribute("content")
    : null;

  // Publish date: walk the known metadata locations in priority order and
  // keep the first value that parses to a valid date.
  const publishDateSelectors = [
    'meta[property="article:published_time"]',
    'meta[name="article:published_time"]',
    'meta[property="og:published_time"]',
    'meta[name="pubdate"]',
    'meta[name="publishdate"]',
    'meta[name="date"]',
    'meta[itemprop="datePublished"]',
    'time[datetime]',
    'time[pubdate]',
  ];

  let publishedAt: Date | null = null;
  for (const selector of publishDateSelectors) {
    const node = document.querySelector(selector);
    if (!node) continue;

    const rawDate = node.getAttribute("content") || node.getAttribute("datetime");
    if (!rawDate) continue;

    const candidate = new Date(rawDate);
    if (Number.isNaN(candidate.getTime())) continue;

    publishedAt = candidate;
    break;
  }

  const bodyText = parsedArticle.textContent || "";
  const bodyHtml = parsedArticle.content || "";

  // Count runs of non-whitespace characters, i.e. whitespace-separated words.
  const words = bodyText.match(/\S+/g);
  const wordCount = words ? words.length : 0;

  const extracted: ExtractedArticle = {
    title: parsedArticle.title || fallbackTitle || "Untitled",
    author: parsedArticle.byline || null,
    siteName: parsedArticle.siteName || new URL(url).hostname,
    excerpt: parsedArticle.excerpt || null,
    content: bodyHtml,
    textContent: bodyText,
    leadImage,
    wordCount,
    publishedAt,
  };
  return extracted;
}
|