// news-reader-actions-test/services/geminiService.ts

import { GoogleGenAI, Modality } from '@google/genai';
import { VoiceName } from '../types';
import { normalizeUrl } from '../utils/url';

/**
 * Builds a Gemini SDK client from the `VITE_API_KEY` build-time env variable.
 *
 * @returns A new `GoogleGenAI` instance on every call (nothing is cached here).
 * @throws Error when `VITE_API_KEY` is unset or empty.
 */
const getAiClient = () => {
  const configuredKey = import.meta.env.VITE_API_KEY;

  if (configuredKey) {
    return new GoogleGenAI({ apiKey: configuredKey });
  }

  // Fail with setup instructions rather than letting the SDK reject later.
  throw new Error(
    "Gemini API key is missing. Set VITE_API_KEY in your .env.local file (e.g., VITE_API_KEY=your_key_here)."
  );
};

/**
 * Ordered list of CORS-proxy URL builders.
 * Each entry turns a target page URL into the request URL for one public
 * proxy service; the list is meant to be tried front to back so that a later
 * proxy can cover for an earlier one that is down or blocked.
 */
const PROXY_PROVIDERS = [
  // AllOrigins: percent-encoded target goes into the `url` query parameter.
  function allOrigins(target: string): string {
    return 'https://api.allorigins.win/raw?url=' + encodeURIComponent(target);
  },

  // CodeTabs: percent-encoded target goes into the `quest` query parameter.
  function codeTabs(target: string): string {
    return 'https://api.codetabs.com/v1/proxy?quest=' + encodeURIComponent(target);
  },

  // CORSProxy.io: the percent-encoded target is the entire query string.
  function corsProxyIo(target: string): string {
    return 'https://corsproxy.io/?' + encodeURIComponent(target);
  },

  // ThingProxy: the target is appended to the path as-is (not percent-encoded).
  function thingProxy(target: string): string {
    return 'https://thingproxy.freeboard.io/fetch/' + target;
  },
];

/**
 * "Reader mode" style pre-processor for fetched pages.
 *
 * Parses `rawHtml` as an HTML document, deletes every element whose tag is in
 * the strip list below, and returns the remaining `<body>` markup.
 *
 * @param rawHtml Page source as a string.
 * @returns The cleaned body markup; the unmodified input if the parsed
 *          document has no body or if parsing/cleaning throws.
 */
function cleanAndMinifyHtml(rawHtml: string): string {
  // Technical / non-text tags: they add prompt size but no article text.
  // Joined into one selector list so a single query finds all of them.
  // NOTE(review): `form` is stripped together with its descendants — confirm
  // that no target site wraps its article inside a <form>.
  const strippedSelector = [
    'script', 'style', 'noscript', 'iframe', 'svg', 'link', 'meta',
    'button', 'input', 'form', 'img', 'picture', 'video',
  ].join(',');

  try {
    const parsed = new DOMParser().parseFromString(rawHtml, 'text/html');

    parsed.querySelectorAll(strippedSelector).forEach(node => node.remove());

    // Deliberately no removal of <nav>/<footer> and no class-based heuristics
    // (e.g. ad containers): per the original design notes, that kind of
    // filtering emptied pages with unusual structure, so picking the article
    // out of the remaining markup is left to the model that consumes it.
    if (!parsed.body) {
      return rawHtml;
    }
    return parsed.body.innerHTML;
  } catch (e) {
    console.warn("HTML cleaning failed, using raw string", e);
    return rawHtml;
  }
}

/**
 * Downloads the page source for `inputUrl` through the proxies in
 * `PROXY_PROVIDERS`, trying each in order until one returns usable content.
 *
 * A response counts as usable when the HTTP status is OK and the body is
 * longer than 100 characters. Each attempt — headers AND body download — is
 * limited to 15 seconds.
 *
 * @param inputUrl Article URL; passed through `normalizeUrl` before use.
 * @returns The response body of the first proxy that succeeds.
 * @throws The error from the last failed attempt, or a generic Error if no
 *         attempt recorded one.
 */
async function fetchRawHtml(inputUrl: string): Promise<string> {
  const PROXY_TIMEOUT_MS = 15000;
  const url = normalizeUrl(inputUrl);
  let lastError: unknown;

  for (const provider of PROXY_PROVIDERS) {
    let proxyUrl = '';
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), PROXY_TIMEOUT_MS);

    try {
      proxyUrl = provider(url);
      console.log(`Fetching via proxy: ${proxyUrl}`);

      // We purposely do NOT add complex headers here.
      // Adding headers like 'X-Requested-With' often triggers a CORS Preflight (OPTIONS) request,
      // which many simple free proxies do not handle correctly, causing "Load failed".
      const response = await fetch(proxyUrl, {
        signal: controller.signal,
      });

      if (!response.ok) {
        throw new Error(`Proxy returned status ${response.status}`);
      }

      // The abort timer is still armed here, so a proxy that sends headers
      // and then stalls on the body is cut off instead of hanging forever.
      const text = await response.text();

      // Simple validation to ensure we got something resembling HTML/Text
      if (text && text.length > 100) {
        return text;
      }
      throw new Error("Response too short, likely blocked or empty.");
    } catch (e) {
      console.warn(`Proxy attempt failed for ${proxyUrl}:`, e);
      lastError = e;
    } finally {
      // Runs on success, failure and early return alike, so no timer outlives
      // its attempt.
      clearTimeout(timeoutId);
    }
  }

  throw lastError || new Error("Unable to access article content via proxies.");
}

/**
 * Asks Gemini to pull the article title and body out of a page's HTML.
 *
 * The HTML is first reduced with `cleanAndMinifyHtml`, embedded in a prompt
 * that requests delimiter-wrapped output, sent to the model, and the reply is
 * handed to `parseResponse`.
 *
 * @param html Page source as fetched.
 * @param url  Source URL; only interpolated into the prompt.
 * @returns The extracted `{ title, text }`.
 * @throws Error when the cleaned markup is shorter than 100 characters; errors
 *         from client creation, the API call and `parseResponse` propagate.
 */
async function parseHtmlWithGemini(html: string, url: string): Promise<{ title: string; text: string }> {
  // Client is created first, so a missing API key is reported before any cleaning work.
  const client = getAiClient();

  const minifiedHtml = cleanAndMinifyHtml(html);
  const hasEnoughMarkup = minifiedHtml.length >= 100;

  if (!hasEnoughMarkup) {
    throw new Error("Content appears to be empty after cleaning. The site might require JavaScript to render.");
  }

  // The delimiters requested here are the ones `parseResponse` searches for.
  const extractionPrompt = `
    SOURCE URL: ${url}

    TASK:
    I have provided the HTML source of a webpage.
    Your job is to act as a dumb "Text Extractor" tool.
    Extract the TITLE and the FULL BODY TEXT of the main article.

    CRITICAL RULES:
    1. VERBATIM: Do NOT rewrite, summarize, or fix the text. Output it exactly as written in the HTML.
    2. FULL TEXT: Do NOT stop early. Process the entire HTML to find the end of the article.
    3. CLEANING: Exclude ads, navigation, "read more" links, and comments.
    4. FORMATTING: Keep the paragraphs intact.
    5. FAILURE: If the HTML contains a CAPTCHA, Login Screen, or Paywall message instead of an article, return the text "PAYWALL_DETECTED".

    Output Format:
    ===TITLE_START===
    (Headline)
    ===TITLE_END===
    ===TEXT_START===
    (Paragraph 1)

    (Paragraph 2)

    ...

    (Final Paragraph)
    ===TEXT_END===

    HTML CONTENT:
    ${minifiedHtml}
  `;

  // Temperature 0 requests the least random sampling for this extraction task.
  const generationConfig = { temperature: 0.0 };

  const completion = await client.models.generateContent({
    model: 'gemini-2.5-flash',
    contents: extractionPrompt,
    config: generationConfig,
  });

  return parseResponse(completion.text ?? "");
}

/**
 * Turns the model's delimiter-formatted reply into `{ title, text }`.
 *
 * Expected layout:
 *   ===TITLE_START=== … ===TITLE_END===
 *   ===TEXT_START===  … ===TEXT_END===
 *
 * Tolerated deviations:
 *  - No text markers at all: a reply longer than 200 characters is returned
 *    whole under the placeholder title "Extracted Content".
 *  - `===TEXT_START===` present but `===TEXT_END===` missing (e.g. the output
 *    was cut off or the closing marker was omitted): everything after the
 *    start marker is used as the text instead of discarding the reply.
 *
 * @param rawText Raw text returned by the model.
 * @returns Trimmed title (possibly empty) and article text.
 * @throws Error if the reply contains "PAYWALL_DETECTED", or if no text of at
 *         least 50 characters can be recovered.
 */
function parseResponse(rawText: string): { title: string; text: string } {
  const TEXT_START = "===TEXT_START===";
  const TEXT_END = "===TEXT_END===";
  const extractionFailed = () =>
    new Error("Could not extract article text. The page structure might be too complex or empty.");

  if (rawText.includes("PAYWALL_DETECTED")) {
    throw new Error("This article is behind a paywall or anti-bot protection and cannot be accessed directly.");
  }

  const startIndex = rawText.indexOf(TEXT_START);

  if (startIndex === -1) {
    // The model ignored the output format. Accept the reply as-is only when
    // it is long enough to plausibly be an article.
    if (rawText.length > 200) {
      return { title: "Extracted Content", text: rawText };
    }
    throw extractionFailed();
  }

  const titleMatch = rawText.match(/===TITLE_START===([\s\S]*?)===TITLE_END===/);
  const title = titleMatch?.[1]?.trim() ?? "";

  // Body runs from just after the start marker to the first end marker that
  // follows it, or to the end of the reply when the end marker is missing.
  const bodyStart = startIndex + TEXT_START.length;
  const endIndex = rawText.indexOf(TEXT_END, bodyStart);
  const text = (endIndex === -1 ? rawText.slice(bodyStart) : rawText.slice(bodyStart, endIndex)).trim();

  if (text.length < 50) {
    throw extractionFailed();
  }

  return { title, text };
}

/**
 * Main extraction entry point: fetches a page through the CORS proxies and
 * has Gemini extract the article from it.
 *
 * @param url Article URL as entered by the user.
 * @returns The extracted `{ title, text }`.
 * @throws Error carrying the underlying failure's message, or
 *         "Failed to access article directly." when the failure has no usable
 *         message. There is deliberately no fallback to another source.
 */
export const extractArticleContent = async (url: string): Promise<{ title: string; text: string }> => {
  console.log("Attempting to extract:", url);

  try {
    // 1. Fetch Raw HTML via Proxy
    const html = await fetchRawHtml(url);

    // 2. Parse with Gemini
    console.log("HTML fetched (" + html.length + " chars). Parsing...");
    return await parseHtmlWithGemini(html, url);

  } catch (error: unknown) {
    console.error("Extraction failed:", error);

    // Anything can be thrown, so narrow before reading `.message`: a nullish
    // or message-less value must not make this handler itself throw.
    let message = "";
    if (typeof error === "object" && error !== null && "message" in error) {
      const candidate = (error as { message?: unknown }).message;
      if (typeof candidate === "string") {
        message = candidate;
      }
    }

    // We intentionally DO NOT fall back to Google Search here, as per user request.
    // We want to fail if we can't get the direct content.
    throw new Error(message || "Failed to access article directly.");
  }
};

/**
 * Requests spoken audio for `text` from the Gemini TTS preview model.
 *
 * @param text  Text to be spoken.
 * @param voice Name of the prebuilt voice to use.
 * @returns The inline audio payload of the first part of the first candidate
 *          (presumably base64-encoded — confirm against the SDK docs).
 * @throws Error when the response carries no such payload; errors from client
 *         creation and the API call propagate.
 */
export const generateSpeechFromText = async (text: string, voice: VoiceName): Promise<string> => {
  const client = getAiClient();

  const speechConfig = {
    voiceConfig: {
      prebuiltVoiceConfig: { voiceName: voice },
    },
  };

  const ttsResult = await client.models.generateContent({
    model: 'gemini-2.5-flash-preview-tts',
    contents: { parts: [{ text }] },
    config: {
      responseModalities: [Modality.AUDIO],
      speechConfig,
    },
  });

  // Only the first part of the first candidate is inspected.
  const leadingPart = ttsResult.candidates?.[0]?.content?.parts?.[0];
  const audioPayload = leadingPart?.inlineData?.data;

  if (audioPayload) {
    return audioPayload;
  }

  throw new Error("No audio data received from model");
};