Handle email digest redirect URLs and Google News links

- Follow redirects to resolve shortened URLs (c.gle, etc.)
- Extract actual article URL from Google News redirect pages
- Fetch and extract the real article content instead of the redirect page

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-05-24 22:01:41 +08:00 · 2026-01-28 04:41:35 +00:00
1 changed file with 71 additions and 4 deletions
--- a/src/lib/utils/extract.ts
+++ b/src/lib/utils/extract.ts
@@ -36,9 +36,55 @@ export interface ExtractedArticle {
  publishedAt: Date | null;
 }

+/**
+ * Extracts the real article URL from the HTML of a Google News redirect page.
+ * @param html - Markup of the fetched Google News page.
+ * @returns The embedded article URL, or null when no known marker is present.
+ */
+function extractGoogleNewsUrl(html: string): string | null {
+  // data-n-au holds the article URL. decodeURIComponent throws URIError on a
+  // stray '%', so fall back to the raw attribute value instead of failing.
+  const dataMatch = html.match(/data-n-au="([^"]+)"/);
+  if (dataMatch) {
+    try {
+      return decodeURIComponent(dataMatch[1]);
+    } catch {
+      return dataMatch[1];
+    }
+  }
+
+  // Canonical link, then og:url; either is skipped while it still points at Google News.
+  const canonicalMatch = html.match(/<link[^>]+rel=["']canonical["'][^>]+href=["']([^"']+)["']/i);
+  if (canonicalMatch && !canonicalMatch[1].includes('news.google.com')) return canonicalMatch[1];
+  const ogMatch = html.match(/<meta[^>]+property=["']og:url["'][^>]+content=["']([^"']+)["']/i);
+  if (ogMatch && !ogMatch[1].includes('news.google.com')) return ogMatch[1];
+
+  // Last resort: first non-Google URL inside a jsdata attribute. Capturing it directly
+  // keeps an earlier news.google.com URL in the same attribute from being returned.
+  const jsMatch = html.match(/jsdata="[^"]*?(https?:\/\/(?!news\.google\.com)[^"&\s]+)/);
+  return jsMatch ? jsMatch[1] : null;
+}
+
 export async function extractArticle(url: string): Promise<ExtractedArticle> {
+  // Resolve shortened/redirect URLs first
+  let resolvedUrl = url;
+
+  // Follow redirects to get final URL
+  try {
+    const headResponse = await fetch(url, {
+      method: 'HEAD',
+      redirect: 'follow',
+      headers: {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+      },
+    });
+    resolvedUrl = headResponse.url;
+  } catch {
+    // If HEAD fails, continue with original URL
+  }
+
  // Fetch the page with browser-like headers to avoid bot detection
-  const response = await fetch(url, {
+  const response = await fetch(resolvedUrl, {
    headers: {
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
@@ -76,10 +122,31 @@ export async function extractArticle(url: string): Promise<ExtractedArticle> {
    throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
  }

-  const html = await response.text();
+  let html = await response.text();
+  let finalUrl = resolvedUrl;
+
+  // Check if we landed on Google News - need to extract actual article URL
+  if (resolvedUrl.includes('news.google.com')) {
+    const realUrl = extractGoogleNewsUrl(html);
+    if (realUrl) {
+      // Fetch the actual article
+      const articleResponse = await fetch(realUrl, {
+        headers: {
+          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+          "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+          "Accept-Language": "en-US,en;q=0.9",
+        },
+      });
+      if (articleResponse.ok) {
+        html = await articleResponse.text();
+        finalUrl = realUrl;
+      }
+    }
+  }
+
  const cleanedHtml = stripStyles(html);
  const dom = new JSDOM(cleanedHtml, {
-    url,
+    url: finalUrl,
    virtualConsole: createVirtualConsole(),
  });
  const document = dom.window.document;
@@ -136,7 +203,7 @@ export async function extractArticle(url: string): Promise<ExtractedArticle> {
  return {
    title: article.title || "Untitled",
    author: article.byline || null,
-    siteName: article.siteName || new URL(url).hostname,
+    siteName: article.siteName || new URL(finalUrl).hostname,
    excerpt: article.excerpt || null,
    content,
    textContent,