mirror of
https://github.com/Tony0410/readlater.git
synced 2026-05-24 22:01:41 +08:00
Handle email digest redirect URLs and Google News links
- Follow redirects to resolve shortened URLs (c.gle, etc.)
- Extract actual article URL from Google News redirect pages
- Fetch and extract the real article content instead of the redirect page

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -36,9 +36,55 @@ export interface ExtractedArticle {
|
|||||||
publishedAt: Date | null;
|
publishedAt: Date | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Try to extract the actual article URL from the HTML of a Google News
 * redirect page.
 *
 * Sources are probed in priority order and the first usable one wins:
 *   1. a `data-n-au` attribute (article URL),
 *   2. a `<link rel="canonical">` whose href is not on news.google.com,
 *   3. a `<meta property="og:url">` whose content is not on news.google.com,
 *   4. the first non-Google http(s) URL embedded in a `jsdata` attribute.
 *
 * @param html Raw HTML of the fetched page.
 * @returns The article URL, or `null` when none of the sources yields one.
 */
function extractGoogleNewsUrl(html: string): string | null {
  // Attribute values in serialized HTML carry "&" as a character reference;
  // undo that so query strings survive when the URL is fetched later.
  const unescapeAmp = (value: string): string =>
    value.replace(/&(?:amp|#38|#x26);/gi, '&');

  // Return `attrPattern`'s first capture from the first tag matched by
  // `tagPattern` that carries the attribute, regardless of attribute order.
  const readTagAttr = (tagPattern: RegExp, attrPattern: RegExp): string | null => {
    for (const tag of html.matchAll(tagPattern)) {
      const value = tag[0].match(attrPattern)?.[1];
      if (value) return unescapeAmp(value);
    }
    return null;
  };

  // Google News embeds the real URL in various ways.
  // Look for data-n-au attribute (article URL).
  const rawDataUrl = html.match(/data-n-au="([^"]+)"/)?.[1];
  if (rawDataUrl) {
    const unescaped = unescapeAmp(rawDataUrl);
    try {
      return decodeURIComponent(unescaped);
    } catch {
      // Malformed percent-encoding (e.g. a literal "%") makes
      // decodeURIComponent throw URIError; fall back to the raw value
      // rather than aborting the whole extraction.
      return unescaped;
    }
  }

  // Look for canonical link.
  const canonicalUrl = readTagAttr(
    /<link\b[^>]*\srel=["']canonical["'][^>]*>/gi,
    /\shref=["']([^"']+)["']/i,
  );
  if (canonicalUrl && !canonicalUrl.includes('news.google.com')) {
    return canonicalUrl;
  }

  // Look for og:url that's not Google News.
  const ogUrl = readTagAttr(
    /<meta\b[^>]*\sproperty=["']og:url["'][^>]*>/gi,
    /\scontent=["']([^"']+)["']/i,
  );
  if (ogUrl && !ogUrl.includes('news.google.com')) {
    return ogUrl;
  }

  // Look for article link in jsdata or similar.
  const jsMatch = html.match(/jsdata="[^"]*https?:\/\/(?!news\.google\.com)[^"&\s]+/);
  if (jsMatch) {
    const urlMatch = jsMatch[0].match(/https?:\/\/[^"&\s]+/);
    if (urlMatch) return urlMatch[0];
  }

  return null;
}
|
||||||
|
|
||||||
export async function extractArticle(url: string): Promise<ExtractedArticle> {
|
export async function extractArticle(url: string): Promise<ExtractedArticle> {
|
||||||
|
// Resolve shortened/redirect URLs first
|
||||||
|
let resolvedUrl = url;
|
||||||
|
|
||||||
|
// Follow redirects to get final URL
|
||||||
|
try {
|
||||||
|
const headResponse = await fetch(url, {
|
||||||
|
method: 'HEAD',
|
||||||
|
redirect: 'follow',
|
||||||
|
headers: {
|
||||||
|
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||||
|
},
|
||||||
|
});
|
||||||
|
resolvedUrl = headResponse.url;
|
||||||
|
} catch {
|
||||||
|
// If HEAD fails, continue with original URL
|
||||||
|
}
|
||||||
|
|
||||||
// Fetch the page with browser-like headers to avoid bot detection
|
// Fetch the page with browser-like headers to avoid bot detection
|
||||||
const response = await fetch(url, {
|
const response = await fetch(resolvedUrl, {
|
||||||
headers: {
|
headers: {
|
||||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
||||||
@@ -76,10 +122,31 @@ export async function extractArticle(url: string): Promise<ExtractedArticle> {
|
|||||||
throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
|
throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const html = await response.text();
|
let html = await response.text();
|
||||||
|
let finalUrl = resolvedUrl;
|
||||||
|
|
||||||
|
// Check if we landed on Google News - need to extract actual article URL
|
||||||
|
if (resolvedUrl.includes('news.google.com')) {
|
||||||
|
const realUrl = extractGoogleNewsUrl(html);
|
||||||
|
if (realUrl) {
|
||||||
|
// Fetch the actual article
|
||||||
|
const articleResponse = await fetch(realUrl, {
|
||||||
|
headers: {
|
||||||
|
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||||
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
||||||
|
"Accept-Language": "en-US,en;q=0.9",
|
||||||
|
},
|
||||||
|
});
|
||||||
|
if (articleResponse.ok) {
|
||||||
|
html = await articleResponse.text();
|
||||||
|
finalUrl = realUrl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const cleanedHtml = stripStyles(html);
|
const cleanedHtml = stripStyles(html);
|
||||||
const dom = new JSDOM(cleanedHtml, {
|
const dom = new JSDOM(cleanedHtml, {
|
||||||
url,
|
url: finalUrl,
|
||||||
virtualConsole: createVirtualConsole(),
|
virtualConsole: createVirtualConsole(),
|
||||||
});
|
});
|
||||||
const document = dom.window.document;
|
const document = dom.window.document;
|
||||||
@@ -136,7 +203,7 @@ export async function extractArticle(url: string): Promise<ExtractedArticle> {
|
|||||||
return {
|
return {
|
||||||
title: article.title || "Untitled",
|
title: article.title || "Untitled",
|
||||||
author: article.byline || null,
|
author: article.byline || null,
|
||||||
siteName: article.siteName || new URL(url).hostname,
|
siteName: article.siteName || new URL(finalUrl).hostname,
|
||||||
excerpt: article.excerpt || null,
|
excerpt: article.excerpt || null,
|
||||||
content,
|
content,
|
||||||
textContent,
|
textContent,
|
||||||
|
|||||||
Reference in New Issue
Block a user