From 9e7e2cc199a95659c68d9fc72ed98073bbec26b0 Mon Sep 17 00:00:00 2001
From: Gemini Agent <gemini-agent@homelab.local>
Date: Wed, 28 Jan 2026 04:41:35 +0000
Subject: [PATCH] Handle email digest redirect URLs and Google News links

- Follow redirects to resolve shortened URLs (c.gle, etc.)
- Extract actual article URL from Google News redirect pages
- Fetch and extract the real article content instead of the redirect page

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 src/lib/utils/extract.ts | 75 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 71 insertions(+), 4 deletions(-)
diff --git a/src/lib/utils/extract.ts b/src/lib/utils/extract.ts
index b60188f..b725bb9 100644
--- a/src/lib/utils/extract.ts
+++ b/src/lib/utils/extract.ts
@@ -36,9 +36,55 @@ export interface ExtractedArticle {
   publishedAt: Date | null;
 }
 
+/**
+ * Best-effort extraction of the real article URL from a Google News redirect page.
+ * Tries, in order: the `data-n-au` attribute, a canonical `<link>`, an `og:url`
+ * `<meta>` (both skipped when they point at news.google.com), then the first
+ * non-Google URL inside a `jsdata` attribute.
+ *
+ * @param html - Raw HTML of the redirect page.
+ * @returns The first URL found, or `null` when no strategy matches.
+ */
+function extractGoogleNewsUrl(html: string): string | null {
+  const dataMatch = html.match(/data-n-au="([^"]+)"/);
+  if (dataMatch) {
+    try {
+      return decodeURIComponent(dataMatch[1]);
+    } catch {
+      return dataMatch[1]; // malformed %-escape: fall back to the raw attribute value
+    }
+  }
+
+  const canonicalMatch = html.match(/<link[^>]+rel=["']canonical["'][^>]+href=["']([^"']+)["']/i);
+  if (canonicalMatch && !canonicalMatch[1].includes('news.google.com')) return canonicalMatch[1];
+  const ogMatch = html.match(/<meta[^>]+property=["']og:url["'][^>]+content=["']([^"']+)["']/i);
+  if (ogMatch && !ogMatch[1].includes('news.google.com')) return ogMatch[1];
+
+  // Capture the URL itself; re-scanning the match could pick up an earlier news.google.com link.
+  const jsMatch = html.match(/jsdata="[^"]*?(https?:\/\/(?!news\.google\.com)[^"&\s]+)/);
+  return jsMatch ? jsMatch[1] : null;
+}
+
 export async function extractArticle(url: string): Promise<ExtractedArticle> {
+  // Resolve shortened/redirect URLs first
+  let resolvedUrl = url;
+
+  // Follow redirects to get final URL
+  try {
+    const headResponse = await fetch(url, {
+      method: 'HEAD',
+      redirect: 'follow',
+      headers: {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+      },
+    });
+    resolvedUrl = headResponse.url;
+  } catch {
+    // If HEAD fails, continue with original URL
+  }
+
   // Fetch the page with browser-like headers to avoid bot detection
-  const response = await fetch(url, {
+  const response = await fetch(resolvedUrl, {
     headers: {
       "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
       "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
@@ -76,10 +122,31 @@ export async function extractArticle(url: string): Promise<ExtractedArticle> {
     throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
   }
 
-  const html = await response.text();
+  let html = await response.text();
+  let finalUrl = resolvedUrl;
+
+  // Check if we landed on Google News - need to extract actual article URL
+  if (resolvedUrl.includes('news.google.com')) {
+    const realUrl = extractGoogleNewsUrl(html);
+    if (realUrl) {
+      // Fetch the actual article
+      const articleResponse = await fetch(realUrl, {
+        headers: {
+          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+          "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+          "Accept-Language": "en-US,en;q=0.9",
+        },
+      });
+      if (articleResponse.ok) {
+        html = await articleResponse.text();
+        finalUrl = realUrl;
+      }
+    }
+  }
+
   const cleanedHtml = stripStyles(html);
   const dom = new JSDOM(cleanedHtml, {
-    url,
+    url: finalUrl,
     virtualConsole: createVirtualConsole(),
   });
   const document = dom.window.document;
@@ -136,7 +203,7 @@ export async function extractArticle(url: string): Promise<ExtractedArticle> {
   return {
     title: article.title || "Untitled",
     author: article.byline || null,
-    siteName: article.siteName || new URL(url).hostname,
+    siteName: article.siteName || new URL(finalUrl).hostname,
     excerpt: article.excerpt || null,
     content,
     textContent,