// readlater/src/lib/utils/extract.ts

import { Readability } from "@mozilla/readability";
import { JSDOM, VirtualConsole } from "jsdom";

/**
 * Build a jsdom VirtualConsole with no-op listeners attached to its "error"
 * and "warn" events.
 *
 * The intent is to keep page-level noise (for example complaints about modern
 * CSS that jsdom cannot parse) from surfacing; none of it matters to
 * Readability.
 *
 * @returns A fresh VirtualConsole with both listeners registered.
 */
function createVirtualConsole() {
  const discard = () => {
    // Deliberately empty: the event is dropped.
  };
  return new VirtualConsole().on("error", discard).on("warn", discard);
}

/**
 * Remove `<style>` elements and inline `style="..."` attributes from an HTML
 * string before it is handed to JSDOM.
 *
 * Readability only needs the DOM structure, so dropping CSS avoids JSDOM CSS
 * parsing errors without affecting extraction. This is a regex pass, not an
 * HTML parser: it also removes ` style="..."` sequences that appear in text.
 *
 * @param html Raw HTML markup.
 * @returns The markup with style elements and quoted style attributes removed.
 */
function stripStyles(html: string): string {
  // `\b` keeps look-alike tags such as <styles> intact; `\s*` tolerates "</style >".
  const withoutStyleTags = html.replace(/<style\b[^>]*>[\s\S]*?<\/style\s*>/gi, "");
  // Match each quote style separately so a value containing the *other* quote
  // character (e.g. style="font-family: 'Arial'") is removed in full instead
  // of being cut off at the first inner quote.
  return withoutStyleTags.replace(/\s+style\s*=\s*(?:"[^"]*"|'[^']*')/gi, "");
}

/**
 * Result of running article extraction on a page, as returned by
 * `extractArticle` and `extractFromHtml`.
 */
export interface ExtractedArticle {
  /** Title reported by Readability; falls back to a caller-supplied title or "Untitled". */
  title: string;
  /** Readability byline, or null when none was found. */
  author: string | null;
  /** Site name reported by Readability, otherwise the hostname of the article URL. */
  siteName: string | null;
  /** Readability excerpt, or null when none was found. */
  excerpt: string | null;
  /** Extracted article body as an HTML string. */
  content: string;
  /** Plain-text version of the article body. */
  textContent: string;
  /** Value of the page's `og:image` meta tag, or null when absent. */
  leadImage: string | null;
  /** Number of whitespace-separated tokens in `textContent`. */
  wordCount: number;
  /** First parseable date found in the page's publish-date meta/time tags, or null. */
  publishedAt: Date | null;
}

/**
 * Try to recover the real article URL from the HTML of a Google News
 * redirect page.
 *
 * Sources are probed in this order, and the first usable one wins:
 *   1. a `data-n-au` attribute (URI-decoded when possible),
 *   2. `<link rel="canonical">`, unless it points at news.google.com,
 *   3. `<meta property="og:url">`, unless it points at news.google.com,
 *   4. the first non-Google http(s) URL inside a `jsdata` attribute.
 *
 * @param html HTML of the Google News page.
 * @returns The article URL, or null when none of the sources yields one.
 */
function extractGoogleNewsUrl(html: string): string | null {
  const dataMatch = html.match(/data-n-au="([^"]+)"/);
  if (dataMatch) {
    const raw = dataMatch[1];
    try {
      return decodeURIComponent(raw);
    } catch {
      // A malformed percent-escape makes decodeURIComponent throw a URIError;
      // the undecoded value is still the best candidate available.
      return raw;
    }
  }

  // Read one quoted attribute from a single tag's source text.
  const attributeValue = (tag: string, name: string): string | null => {
    const m = tag.match(new RegExp(`\\s${name}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`, "i"));
    return m ? (m[1] ?? m[2] ?? null) : null;
  };

  // Return `valueAttr` of the first <tagName> whose `keyAttr` equals `keyValue`.
  // Attributes are looked up individually, so their order inside the tag does
  // not matter (href may come before rel, content before property).
  const findTagValue = (
    tagName: string,
    keyAttr: string,
    keyValue: string,
    valueAttr: string
  ): string | null => {
    const tagPattern = new RegExp(`<${tagName}\\b[^>]*>`, "gi");
    let tagMatch: RegExpExecArray | null;
    while ((tagMatch = tagPattern.exec(html)) !== null) {
      const tag = tagMatch[0];
      if (attributeValue(tag, keyAttr)?.toLowerCase() === keyValue) {
        const value = attributeValue(tag, valueAttr);
        if (value) return value;
      }
    }
    return null;
  };

  const canonical = findTagValue("link", "rel", "canonical", "href");
  if (canonical && !canonical.includes("news.google.com")) {
    return canonical;
  }

  const ogUrl = findTagValue("meta", "property", "og:url", "content");
  if (ogUrl && !ogUrl.includes("news.google.com")) {
    return ogUrl;
  }

  // Capture the non-Google URL directly. Re-scanning the whole match for
  // "the first URL" could hand back a news.google.com link that merely
  // precedes the real one inside the same jsdata attribute.
  const jsMatch = html.match(/jsdata="[^"]*?(https?:\/\/(?!news\.google\.com)[^"&\s]+)/);
  if (jsMatch) return jsMatch[1];

  return null;
}

// Desktop Chrome User-Agent sent with every outbound request.
const BROWSER_USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

// Selectors probed, in priority order, when looking for a publication date.
const PUBLISHED_DATE_SELECTORS: readonly string[] = [
  'meta[property="article:published_time"]',
  'meta[name="article:published_time"]',
  'meta[property="og:published_time"]',
  'meta[name="pubdate"]',
  'meta[name="publishdate"]',
  'meta[name="date"]',
  'meta[itemprop="datePublished"]',
  'time[datetime]',
  'time[pubdate]',
];

/** Escape a string so it can be interpolated into HTML text or a quoted attribute. */
function escapeHtml(value: string): string {
  return value
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/** True when `url` parses and its host is news.google.com or a subdomain of it. */
function isGoogleNewsUrl(url: string): boolean {
  try {
    const { hostname } = new URL(url);
    return hostname === "news.google.com" || hostname.endsWith(".news.google.com");
  } catch {
    return false;
  }
}

/** Return the first parseable date found via PUBLISHED_DATE_SELECTORS, or null. */
function findPublishedDate(document: Document): Date | null {
  for (const selector of PUBLISHED_DATE_SELECTORS) {
    const el = document.querySelector(selector);
    const dateStr = el?.getAttribute("content") || el?.getAttribute("datetime");
    if (!dateStr) continue;
    const parsed = new Date(dateStr);
    if (!Number.isNaN(parsed.getTime())) return parsed;
  }
  return null;
}

/** Placeholder article returned when the site answers 401/403 to the fetch. */
function buildBlockedArticle(url: string): ExtractedArticle {
  const hostname = new URL(url).hostname.replace(/^www\./, "");
  // The URL is caller-supplied; escape it so it cannot break out of the href
  // attribute and inject markup into the stored article HTML.
  const safeUrl = escapeHtml(url);
  return {
    title: `Article from ${hostname}`,
    author: null,
    siteName: hostname,
    excerpt: "This site blocked automated access. Use 'Open original' to read, or the Content Capture bookmarklet to save the full article.",
    content: `<p>This site blocked automated access. <a href="${safeUrl}" target="_blank" rel="noopener noreferrer">Open original article</a> to read.</p><p>Tip: Use the Content Capture bookmarklet from the article page to save the full content.</p>`,
    textContent: "This site blocked automated access. Open original article to read.",
    leadImage: null,
    wordCount: 0,
    publishedAt: null,
  };
}

/**
 * Parse HTML with JSDOM + Readability and assemble an ExtractedArticle.
 * Shared by `extractArticle` and `extractFromHtml`.
 *
 * @param html Page markup.
 * @param url Absolute URL of the page; used as the JSDOM document URL and as the siteName fallback.
 * @param failureMessage Message of the Error thrown when Readability yields nothing.
 * @param fallbackTitle Title to use when Readability finds none.
 * @throws Error with `failureMessage` when Readability cannot extract an article.
 */
function parseArticleHtml(
  html: string,
  url: string,
  failureMessage: string,
  fallbackTitle?: string
): ExtractedArticle {
  const dom = new JSDOM(stripStyles(html), {
    url,
    virtualConsole: createVirtualConsole(),
  });
  const document = dom.window.document;

  // Read page metadata BEFORE running Readability: its parse() modifies the
  // DOM it is given (see the Readability README), so elements such as
  // <time datetime> inside the article may no longer be in the document
  // afterwards.
  const leadImage =
    document.querySelector('meta[property="og:image"]')?.getAttribute("content") || null;
  const publishedAt = findPublishedDate(document);

  const article = new Readability(document).parse();
  if (!article) {
    throw new Error(failureMessage);
  }

  const textContent = article.textContent || "";
  const content = article.content || "";

  return {
    title: article.title || fallbackTitle || "Untitled",
    author: article.byline || null,
    siteName: article.siteName || new URL(url).hostname,
    excerpt: article.excerpt || null,
    content,
    textContent,
    leadImage,
    wordCount: textContent.split(/\s+/).filter(Boolean).length,
    publishedAt,
  };
}

/**
 * Fetch a URL and extract its main article.
 *
 * Steps: resolve redirects with a best-effort HEAD request, GET the page with
 * browser-like headers, unwrap Google News redirect pages to the real article,
 * then parse the HTML with Readability.
 *
 * @param url Absolute URL of the page to save.
 * @returns The extracted article. When the site answers 401 or 403, a
 *   placeholder article pointing at the original URL is returned instead.
 * @throws Error when the page fetch fails with any other non-OK status, or
 *   when Readability cannot extract an article. Network errors from the main
 *   fetch propagate unchanged.
 */
export async function extractArticle(url: string): Promise<ExtractedArticle> {
  let resolvedUrl = url;

  // Follow redirects (shortened links etc.) to learn the final URL.
  try {
    const headResponse = await fetch(url, {
      method: "HEAD",
      redirect: "follow",
      headers: { "User-Agent": BROWSER_USER_AGENT },
    });
    // Keep the original URL if the response does not report one.
    resolvedUrl = headResponse.url || url;
  } catch {
    // HEAD is best-effort; continue with the original URL.
  }

  // Fetch the page with browser-like headers to avoid bot detection.
  const response = await fetch(resolvedUrl, {
    headers: {
      "User-Agent": BROWSER_USER_AGENT,
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.9",
      "Accept-Encoding": "gzip, deflate, br",
      "Cache-Control": "no-cache",
      "Pragma": "no-cache",
      "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
      "Sec-Ch-Ua-Mobile": "?0",
      "Sec-Ch-Ua-Platform": '"macOS"',
      "Sec-Fetch-Dest": "document",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-User": "?1",
      "Upgrade-Insecure-Requests": "1",
    },
  });

  if (!response.ok) {
    // On 401/403 (blocked), return a minimal article with just URL info.
    if (response.status === 403 || response.status === 401) {
      return buildBlockedArticle(url);
    }
    throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
  }

  let html = await response.text();
  let finalUrl = resolvedUrl;

  // Landed on Google News: try to swap in the actual article. The check is on
  // the hostname so an unrelated URL that merely mentions news.google.com in
  // its path or query is not treated as a redirect page.
  if (isGoogleNewsUrl(resolvedUrl)) {
    const realUrl = extractGoogleNewsUrl(html);
    if (realUrl) {
      try {
        const articleResponse = await fetch(realUrl, {
          headers: {
            "User-Agent": BROWSER_USER_AGENT,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
          },
        });
        if (articleResponse.ok) {
          html = await articleResponse.text();
          finalUrl = realUrl;
        }
      } catch {
        // Same fallback as a non-OK response: keep the Google News HTML.
      }
    }
  }

  return parseArticleHtml(html, finalUrl, "Could not extract article content");
}

/**
 * Extract an article from HTML supplied by the caller (used by the
 * bookmarklet with content capture), without fetching anything.
 *
 * @param html Captured page markup.
 * @param url Absolute URL the markup came from.
 * @param fallbackTitle Title to use when Readability finds none.
 * @throws Error when Readability cannot extract an article.
 */
export async function extractFromHtml(
  html: string,
  url: string,
  fallbackTitle?: string
): Promise<ExtractedArticle> {
  return parseArticleHtml(
    html,
    url,
    "Could not extract article content from provided HTML",
    fallbackTitle
  );
}