diff --git a/src/lib/utils/extract.ts b/src/lib/utils/extract.ts
index b60188f..b725bb9 100644
--- a/src/lib/utils/extract.ts
+++ b/src/lib/utils/extract.ts
@@ -36,9 +36,55 @@ export interface ExtractedArticle {
publishedAt: Date | null;
}
+/**
+ * Try to extract the actual article URL from a Google News redirect page.
+ *
+ * Strategies are tried in order and the first hit wins:
+ *   1. the `data-n-au` attribute (percent-decoded when possible),
+ *   2. a `<link rel="canonical">` href that does not contain news.google.com,
+ *   3. a `<meta property="og:url">` content that does not contain news.google.com,
+ *   4. the first non-Google http(s) URL embedded in a `jsdata` attribute.
+ *
+ * @param html - Raw HTML of the fetched page.
+ * @returns The extracted URL, or `null` when no strategy matches.
+ */
+function extractGoogleNewsUrl(html: string): string | null {
+  // Look for data-n-au attribute (article URL)
+  const dataMatch = html.match(/data-n-au="([^"]+)"/);
+  if (dataMatch) {
+    try {
+      return decodeURIComponent(dataMatch[1]);
+    } catch {
+      // decodeURIComponent throws URIError on a malformed escape (e.g. a bare
+      // "%"); fall back to the raw attribute value instead of aborting.
+      return dataMatch[1];
+    }
+  }
+
+  // Look for canonical link. The pattern expects `rel` to appear before `href`.
+  const canonicalMatch = html.match(/<link[^>]+rel=["']canonical["'][^>]+href=["']([^"']+)["']/i);
+  if (canonicalMatch && !canonicalMatch[1].includes('news.google.com')) {
+    return canonicalMatch[1];
+  }
+
+  // Look for og:url that's not Google News. Expects `property` before `content`.
+  const ogMatch = html.match(/<meta[^>]+property=["']og:url["'][^>]+content=["']([^"']+)["']/i);
+  if (ogMatch && !ogMatch[1].includes('news.google.com')) {
+    return ogMatch[1];
+  }
+
+  // Look for article link in jsdata or similar. The lazy prefix plus capture
+  // group selects the first non-Google URL even when a news.google.com URL
+  // appears earlier inside the same attribute value.
+  const jsMatch = html.match(/jsdata="[^"]*?(https?:\/\/(?!news\.google\.com)[^"&\s]+)/);
+  if (jsMatch) return jsMatch[1];
+
+  return null;
+}
+
export async function extractArticle(url: string): Promise {
+ // Resolve shortened/redirect URLs first
+ let resolvedUrl = url;
+
+ // Follow redirects to get final URL
+ try {
+ const headResponse = await fetch(url, {
+ method: 'HEAD',
+ redirect: 'follow',
+ headers: {
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ },
+ });
+ resolvedUrl = headResponse.url;
+ } catch {
+ // If HEAD fails, continue with original URL
+ }
+
// Fetch the page with browser-like headers to avoid bot detection
- const response = await fetch(url, {
+ const response = await fetch(resolvedUrl, {
headers: {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
@@ -76,10 +122,31 @@ export async function extractArticle(url: string): Promise {
throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
}
- const html = await response.text();
+ let html = await response.text();
+ let finalUrl = resolvedUrl;
+
+ // Check if we landed on Google News - need to extract actual article URL
+ if (resolvedUrl.includes('news.google.com')) {
+ const realUrl = extractGoogleNewsUrl(html);
+ if (realUrl) {
+ // Fetch the actual article
+ const articleResponse = await fetch(realUrl, {
+ headers: {
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+ "Accept-Language": "en-US,en;q=0.9",
+ },
+ });
+ if (articleResponse.ok) {
+ html = await articleResponse.text();
+ finalUrl = realUrl;
+ }
+ }
+ }
+
const cleanedHtml = stripStyles(html);
const dom = new JSDOM(cleanedHtml, {
- url,
+ url: finalUrl,
virtualConsole: createVirtualConsole(),
});
const document = dom.window.document;
@@ -136,7 +203,7 @@ export async function extractArticle(url: string): Promise {
return {
title: article.title || "Untitled",
author: article.byline || null,
- siteName: article.siteName || new URL(url).hostname,
+ siteName: article.siteName || new URL(finalUrl).hostname,
excerpt: article.excerpt || null,
content,
textContent,