From 9e7e2cc199a95659c68d9fc72ed98073bbec26b0 Mon Sep 17 00:00:00 2001
From: Gemini Agent
Date: Wed, 28 Jan 2026 04:41:35 +0000
Subject: [PATCH] Handle email digest redirect URLs and Google News links

- Follow redirects to resolve shortened URLs (c.gle, etc.)
- Extract actual article URL from Google News redirect pages
- Fetch and extract the real article content instead of the redirect page

Co-Authored-By: Claude Opus 4.5
---
 src/lib/utils/extract.ts | 75 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 71 insertions(+), 4 deletions(-)

diff --git a/src/lib/utils/extract.ts b/src/lib/utils/extract.ts
index b60188f..b725bb9 100644
--- a/src/lib/utils/extract.ts
+++ b/src/lib/utils/extract.ts
@@ -36,9 +36,55 @@ export interface ExtractedArticle {
   publishedAt: Date | null;
 }
 
+// Try to extract actual article URL from Google News redirect page
+function extractGoogleNewsUrl(html: string): string | null {
+  // Google News embeds the real URL in various ways
+  // Look for data-n-au attribute (article URL)
+  const dataMatch = html.match(/data-n-au="([^"]+)"/);
+  if (dataMatch) return decodeURIComponent(dataMatch[1]);
+
+  // Look for canonical link
+  const canonicalMatch = html.match(/<link[^>]+rel=["']canonical["'][^>]+href=["']([^"']+)["']/i);
+  if (canonicalMatch && !canonicalMatch[1].includes('news.google.com')) {
+    return canonicalMatch[1];
+  }
+
+  // Look for og:url that's not Google News
+  const ogMatch = html.match(/<meta[^>]+property=["']og:url["'][^>]+content=["']([^"']+)["']/i);
+  if (ogMatch && !ogMatch[1].includes('news.google.com')) {
+    return ogMatch[1];
+  }
+
+  // Look for article link in jsdata or similar
+  const jsMatch = html.match(/jsdata="[^"]*https?:\/\/(?!news\.google\.com)[^"&\s]+/);
+  if (jsMatch) {
+    const urlMatch = jsMatch[0].match(/https?:\/\/[^"&\s]+/);
+    if (urlMatch) return urlMatch[0];
+  }
+
+  return null;
+}
+
 export async function extractArticle(url: string): Promise<ExtractedArticle> {
+  // Resolve shortened/redirect URLs first
+  let resolvedUrl = url;
+
+  // Follow redirects to get final URL
+  try {
+    const headResponse = await fetch(url, {
+      method: 'HEAD',
+      redirect: 'follow',
+      headers: {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+      },
+    });
+    resolvedUrl = headResponse.url;
+  } catch {
+    // If HEAD fails, continue with original URL
+  }
+
   // Fetch the page with browser-like headers to avoid bot detection
-  const response = await fetch(url, {
+  const response = await fetch(resolvedUrl, {
     headers: {
       "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
       "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
@@ -76,10 +122,31 @@ export async function extractArticle(url: string): Promise<ExtractedArticle> {
     throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
   }
 
-  const html = await response.text();
+  let html = await response.text();
+  let finalUrl = resolvedUrl;
+
+  // Check if we landed on Google News - need to extract actual article URL
+  if (resolvedUrl.includes('news.google.com')) {
+    const realUrl = extractGoogleNewsUrl(html);
+    if (realUrl) {
+      // Fetch the actual article
+      const articleResponse = await fetch(realUrl, {
+        headers: {
+          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+          "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+          "Accept-Language": "en-US,en;q=0.9",
+        },
+      });
+      if (articleResponse.ok) {
+        html = await articleResponse.text();
+        finalUrl = realUrl;
+      }
+    }
+  }
+
   const cleanedHtml = stripStyles(html);
   const dom = new JSDOM(cleanedHtml, {
-    url,
+    url: finalUrl,
     virtualConsole: createVirtualConsole(),
   });
   const document = dom.window.document;
@@ -136,7 +203,7 @@ export async function extractArticle(url: string): Promise<ExtractedArticle> {
   return {
     title: article.title || "Untitled",
     author: article.byline || null,
-    siteName: article.siteName || new URL(url).hostname,
+    siteName: article.siteName || new URL(finalUrl).hostname,
     excerpt: article.excerpt || null,
     content,
     textContent,