// readlater/src/lib/utils/extract.ts

import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom";

/**
 * Result of running Readability over a page, as produced by
 * `extractArticle` and `extractFromHtml` in this module.
 */
export interface ExtractedArticle {
  /** Article title; the extract functions substitute a fallback (ultimately "Untitled") when Readability finds none. */
  title: string;
  /** Readability's `byline`, or null when it is empty or missing. */
  author: string | null;
  /** Readability's `siteName`; the extract functions fall back to the page URL's hostname. */
  siteName: string | null;
  /** Readability's `excerpt`, or null when it is empty or missing. */
  excerpt: string | null;
  /** Readability's `content` field (the processed article markup); empty string when missing. */
  content: string;
  /** Readability's `textContent` field (the article as plain text); empty string when missing. */
  textContent: string;
  /** Lead image URL derived from the page's `og:image` meta tag, or null when the tag is absent. */
  leadImage: string | null;
  /** Number of whitespace-separated tokens in `textContent`. */
  wordCount: number;
}

/**
 * Downloads a web page and extracts its main article with Readability.
 *
 * @param url - Address of the page to fetch.
 * @returns The extracted article fields. `leadImage` is an absolute URL built
 *   from the page's `og:image` meta tag, or null when the tag is missing,
 *   empty, or unparseable.
 * @throws Error when the response status is not OK (with a bookmarklet hint
 *   for 403), or when Readability cannot extract an article from the page.
 */
export async function extractArticle(url: string): Promise<ExtractedArticle> {
  // Fetch the page with browser-like headers to avoid bot detection
  const response = await fetch(url, {
    headers: {
      "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.9",
      "Accept-Encoding": "gzip, deflate, br",
      "Cache-Control": "no-cache",
      "Pragma": "no-cache",
      "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
      "Sec-Ch-Ua-Mobile": "?0",
      "Sec-Ch-Ua-Platform": '"macOS"',
      "Sec-Fetch-Dest": "document",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-User": "?1",
      "Upgrade-Insecure-Requests": "1",
    },
  });

  if (!response.ok) {
    if (response.status === 403) {
      throw new Error(`This site blocks automated access (403 Forbidden). Try using the bookmarklet from the article page instead - it can capture content your browser can see.`);
    }
    throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
  }

  // fetch() follows redirects by default, so the document may live at a
  // different address than the one requested. Use the final address for
  // relative-link resolution and for the hostname fallback.
  const pageUrl = response.url || url;

  // NOTE(review): text() always decodes the body as UTF-8; pages served in
  // another charset may come out garbled. Left as-is to preserve behavior.
  const html = await response.text();
  const dom = new JSDOM(html, { url: pageUrl });

  try {
    const document = dom.window.document;

    // Read the lead image BEFORE parsing: Readability's parse() mutates the
    // document it is given and may remove elements.
    const ogImageContent = document
      .querySelector('meta[property="og:image"]')
      ?.getAttribute("content")
      ?.trim();
    let leadImage: string | null = null;
    if (ogImageContent) {
      try {
        // Resolves relative and protocol-relative values to an absolute URL.
        leadImage = new URL(ogImageContent, pageUrl).href;
      } catch {
        // Unparseable image URL: treat it as "no lead image".
        leadImage = null;
      }
    }

    // Extract using Readability
    const article = new Readability(document).parse();
    if (!article) {
      throw new Error("Could not extract article content");
    }

    const textContent = article.textContent || "";
    const content = article.content || "";

    // Word count = number of non-empty whitespace-separated tokens
    const wordCount = textContent.split(/\s+/).filter(Boolean).length;

    return {
      title: article.title || "Untitled",
      author: article.byline || null,
      siteName: article.siteName || new URL(pageUrl).hostname,
      excerpt: article.excerpt || null,
      content,
      textContent,
      leadImage,
      wordCount,
    };
  } finally {
    // Release the jsdom window on both success and failure paths.
    dom.window.close();
  }
}

/**
 * Extracts an article from HTML the caller already has (for the bookmarklet
 * with content capture). No network request is made.
 *
 * @param html - Markup of the page.
 * @param url - Address the markup came from; used as the document URL, as the
 *   base for resolving a relative lead image, and for the hostname fallback
 *   of `siteName`.
 * @param fallbackTitle - Title used when Readability does not find one.
 * @returns The extracted article fields. `leadImage` is an absolute URL built
 *   from the page's `og:image` meta tag, or null when the tag is missing,
 *   empty, or unparseable.
 * @throws Error when Readability cannot extract an article from the markup.
 */
export async function extractFromHtml(
  html: string,
  url: string,
  fallbackTitle?: string
): Promise<ExtractedArticle> {
  const dom = new JSDOM(html, { url });

  try {
    const document = dom.window.document;

    // Read the lead image BEFORE parsing: Readability's parse() mutates the
    // document it is given and may remove elements.
    const ogImageContent = document
      .querySelector('meta[property="og:image"]')
      ?.getAttribute("content")
      ?.trim();
    let leadImage: string | null = null;
    if (ogImageContent) {
      try {
        // Resolves relative and protocol-relative values to an absolute URL.
        leadImage = new URL(ogImageContent, url).href;
      } catch {
        // Unparseable image URL: treat it as "no lead image".
        leadImage = null;
      }
    }

    // Extract using Readability
    const article = new Readability(document).parse();
    if (!article) {
      throw new Error("Could not extract article content from provided HTML");
    }

    const textContent = article.textContent || "";
    const content = article.content || "";

    // Word count = number of non-empty whitespace-separated tokens
    const wordCount = textContent.split(/\s+/).filter(Boolean).length;

    return {
      title: article.title || fallbackTitle || "Untitled",
      author: article.byline || null,
      siteName: article.siteName || new URL(url).hostname,
      excerpt: article.excerpt || null,
      content,
      textContent,
      leadImage,
      wordCount,
    };
  } finally {
    // Release the jsdom window on both success and failure paths.
    dom.window.close();
  }
}