import { GoogleGenAI, Modality } from '@google/genai';
import { VoiceName } from '../types';

/** Time budget for a single proxy attempt (request plus body download). */
const PROXY_TIMEOUT_MS = 10000;

/**
 * Creates a Gemini client from the `API_KEY` environment variable.
 *
 * @throws Error if `process.env.API_KEY` is unset or empty.
 */
const getAiClient = (): GoogleGenAI => {
  const apiKey = process.env.API_KEY;
  if (!apiKey) {
    throw new Error("API Key is missing");
  }
  return new GoogleGenAI({ apiKey });
};

/**
 * Ensures a URL carries an explicit protocol.
 * Proxies often fail if 'http/https' is missing.
 *
 * - `http://` / `https://` prefixes are recognised case-insensitively and
 *   returned unchanged (apart from trimming).
 * - Protocol-relative URLs (`//host/path`) are upgraded to `https:`.
 * - Anything else gets `https://` prepended.
 *
 * @param url Raw user-supplied URL; surrounding whitespace is ignored.
 * @returns The trimmed URL with a protocol.
 * @throws Error if the URL is empty after trimming.
 */
const normalizeUrl = (url: string): string => {
  const cleanUrl = url.trim();
  if (cleanUrl.length === 0) {
    throw new Error("URL is empty");
  }
  if (/^https?:\/\//i.test(cleanUrl)) {
    return cleanUrl;
  }
  if (cleanUrl.startsWith('//')) {
    return `https:${cleanUrl}`;
  }
  return `https://${cleanUrl}`;
};

/**
 * List of CORS proxies to try in order.
 * This improves reliability if one service is down or blocked.
 * Each entry maps the target URL to the proxy request URL.
 */
const PROXY_PROVIDERS: ReadonlyArray<(url: string) => string> = [
  // AllOrigins: Generally the most reliable for raw text
  (url) => `https://api.allorigins.win/raw?url=${encodeURIComponent(url)}`,
  // CodeTabs: Good fallback, handles redirects well
  (url) => `https://api.codetabs.com/v1/proxy?quest=${encodeURIComponent(url)}`,
  // CORSProxy.io: Fast but sometimes has strict CORS headers
  (url) => `https://corsproxy.io/?${encodeURIComponent(url)}`,
  // ThingProxy: Another fallback (takes the target URL unencoded in the path)
  (url) => `https://thingproxy.freeboard.io/fetch/${url}`,
];

/** Removes every element matching `selector` from `doc`. */
function removeAll(doc: Document, selector: string): void {
  doc.querySelectorAll(selector).forEach((el) => el.remove());
}

/**
 * Cleans raw HTML by removing scripts, styles, and non-content elements.
 * This acts like a dedicated "Reader Mode" pre-processor.
 *
 * @param rawHtml Full HTML source of the page.
 * @returns The inner HTML of the first `<article>` (or, failing that, `<main>`)
 *   holding more than 200 characters of text; otherwise the cleaned `<body>`.
 *   If parsing/cleaning throws, the input is returned unchanged.
 */
function cleanAndMinifyHtml(rawHtml: string): string {
  try {
    const doc = new DOMParser().parseFromString(rawHtml, 'text/html');

    // 1. Remove heavy technical tags
    const technicalTags = [
      'script', 'style', 'noscript', 'iframe', 'svg', 'link', 'meta',
      'button', 'input', 'form', 'img', 'picture', 'video',
    ];
    // 2. Remove semantic layout tags that are usually clutter
    const layoutTags = ['nav', 'footer', 'aside', 'header'];
    for (const tag of [...technicalTags, ...layoutTags]) {
      removeAll(doc, tag);
    }

    // 3. Remove common ad/social/cookie containers by class/id heuristics.
    // NOTE: `*=` is a substring match, so these are deliberately broad.
    const junkSelectors = [
      '[class*="ad-"]', '[id*="ad-"]',
      '[class*="cookie"]', '[id*="cookie"]',
      '[class*="newsletter"]', '[id*="newsletter"]',
      '[class*="social"]', '[class*="share"]',
      '[class*="comment"]', '[id*="comment"]',
      '[class*="recommended"]', '[class*="related"]',
    ];
    for (const selector of junkSelectors) {
      try {
        removeAll(doc, selector);
      } catch {
        // Ignore invalid selector errors
      }
    }

    // 4. Return the cleanest possible content.
    // If there is a specific article tag, it's usually the best bet.
    for (const containerTag of ['article', 'main']) {
      const container = doc.querySelector(containerTag);
      if (container && container.textContent && container.textContent.length > 200) {
        return container.innerHTML;
      }
    }

    // Fallback: Return the cleaned body
    return doc.body.innerHTML;
  } catch (e) {
    console.warn("HTML cleaning failed, using raw string", e);
    return rawHtml;
  }
}

/**
 * Fetches raw HTML using a rotation of proxies.
 *
 * Providers are tried in order; the first response that is OK and longer than
 * 100 characters wins. Each attempt is aborted after `PROXY_TIMEOUT_MS`, and
 * the timeout also covers downloading the body.
 *
 * @param inputUrl Target page URL (protocol optional).
 * @returns The response body text from the first proxy that succeeds.
 * @throws The error from the last failed proxy attempt.
 */
async function fetchRawHtml(inputUrl: string): Promise<string> {
  const url = normalizeUrl(inputUrl);
  let lastError: unknown;

  for (const provider of PROXY_PROVIDERS) {
    let proxyUrl = '';
    let timeoutId: ReturnType<typeof setTimeout> | undefined;
    try {
      proxyUrl = provider(url);
      console.log(`Fetching via proxy: ${proxyUrl}`);

      const controller = new AbortController();
      timeoutId = setTimeout(() => controller.abort(), PROXY_TIMEOUT_MS);

      // We purposely do NOT add complex headers here.
      // Adding headers like 'X-Requested-With' often triggers a CORS Preflight (OPTIONS) request,
      // which many simple free proxies do not handle correctly, causing "Load failed".
      const response = await fetch(proxyUrl, { signal: controller.signal });
      if (!response.ok) {
        throw new Error(`Proxy returned status ${response.status}`);
      }

      // The abort signal is still armed here, so a stalled body read times out too.
      const text = await response.text();

      // Simple validation to ensure we got something resembling HTML/Text
      if (!text || text.length <= 100) {
        throw new Error("Response too short, likely blocked or empty.");
      }
      return text;
    } catch (e) {
      console.warn(`Proxy attempt failed for ${proxyUrl}:`, e);
      lastError = e;
    } finally {
      // Always release the timer, on success and on every failure path.
      if (timeoutId !== undefined) {
        clearTimeout(timeoutId);
      }
    }
  }

  throw lastError || new Error("Unable to access article content via proxies.");
}

/**
 * Uses Gemini to extract clean text from the raw HTML.
 *
 * @param html Raw page HTML; it is cleaned with `cleanAndMinifyHtml` first.
 * @param url Source URL, included in the prompt for context.
 * @returns The extracted title and body text.
 * @throws Error if the cleaned HTML is shorter than 100 characters, or if
 *   `parseResponse` rejects the model output.
 */
async function parseHtmlWithGemini(html: string, url: string): Promise<{ title: string; text: string }> {
  const ai = getAiClient();
  const cleanedHtml = cleanAndMinifyHtml(html);

  if (cleanedHtml.length < 100) {
    throw new Error("Content appears to be empty after cleaning. The site might require JavaScript to render.");
  }

  // Built from explicit lines so source indentation never leaks into the prompt.
  const prompt = [
    `SOURCE URL: ${url}`,
    '',
    'TASK:',
    'I have provided the HTML source of a webpage.',
    'Your job is to act as a dumb "Text Extractor" tool.',
    'Extract the TITLE and the FULL BODY TEXT of the main article.',
    '',
    'CRITICAL RULES:',
    '1. VERBATIM: Do NOT rewrite, summarize, or fix the text. Output it exactly as written in the HTML.',
    '2. FULL TEXT: Do NOT stop early. Process the entire HTML to find the end of the article.',
    '3. CLEANING: Exclude ads, navigation, "read more" links, and comments.',
    '4. FORMATTING: Keep the paragraphs intact.',
    '5. FAILURE: If the HTML contains a CAPTCHA, Login Screen, or Paywall message instead of an article, return the text "PAYWALL_DETECTED".',
    '',
    'Output Format:',
    '===TITLE_START===',
    '(Headline)',
    '===TITLE_END===',
    '===TEXT_START===',
    '(Paragraph 1)',
    '(Paragraph 2)',
    '...',
    '(Final Paragraph)',
    '===TEXT_END===',
    '',
    'HTML CONTENT:',
    cleanedHtml,
  ].join('\n');

  const response = await ai.models.generateContent({
    model: 'gemini-2.5-flash',
    contents: prompt,
    config: {
      temperature: 0.0, // Strict deterministic output
    },
  });

  return parseResponse(response.text || "");
}

/**
 * Parses the delimiter-based model output into a title and body text.
 *
 * @param rawText Raw text returned by the model.
 * @returns The trimmed title (empty string when its delimiters are missing) and
 *   body text. If no `===TEXT_START===` marker is present at all and the output
 *   exceeds 200 characters, the whole output is returned as the text under the
 *   title "Extracted Content".
 * @throws Error if the output contains "PAYWALL_DETECTED", or if the extracted
 *   text is missing or shorter than 50 characters.
 */
function parseResponse(rawText: string): { title: string; text: string } {
  if (rawText.includes("PAYWALL_DETECTED")) {
    throw new Error("This article is behind a paywall or anti-bot protection and cannot be accessed directly.");
  }

  const titleMatch = rawText.match(/===TITLE_START===([\s\S]*?)===TITLE_END===/);
  const textMatch = rawText.match(/===TEXT_START===([\s\S]*?)===TEXT_END===/);

  const title = titleMatch ? titleMatch[1].trim() : "";
  const text = textMatch ? textMatch[1].trim() : "";

  // Fallback for malformed AI responses: the model ignored the delimiters but
  // returned something long enough to plausibly be the article itself.
  const ignoredDelimiters = !rawText.includes("===TEXT_START===");
  if (!text && ignoredDelimiters && rawText.length > 200) {
    return { title: "Extracted Content", text: rawText };
  }

  if (!text || text.length < 50) {
    throw new Error("Could not extract article text. The page structure might be too complex or empty.");
  }

  return { title, text };
}

/**
 * Main extraction function: fetches the page through the proxy rotation and
 * asks Gemini to pull out the article title and text.
 *
 * @param url Article URL (protocol optional).
 * @returns The extracted title and body text.
 * @throws Error carrying the underlying failure's message, or
 *   "Failed to access article directly." when none is available.
 */
export const extractArticleContent = async (url: string): Promise<{ title: string; text: string }> => {
  console.log("Attempting to extract:", url);

  try {
    // 1. Fetch Raw HTML via Proxy
    const html = await fetchRawHtml(url);

    // 2. Parse with Gemini
    console.log("HTML fetched (" + html.length + " chars). Parsing...");
    return await parseHtmlWithGemini(html, url);
  } catch (error: unknown) {
    console.error("Extraction failed:", error);
    // We intentionally DO NOT fall back to Google Search here, as per user request.
    // We want to fail if we can't get the direct content.
    const message = error instanceof Error ? error.message : "";
    throw new Error(message || "Failed to access article directly.");
  }
};

/**
 * Generates speech audio from text.
 *
 * @param text Text to synthesise.
 * @param voice Prebuilt voice name passed to the model's speech config.
 * @returns The inline audio payload of the first part of the first candidate,
 *   as returned by the model (read into `base64Audio`).
 * @throws Error if the response carries no inline audio data there.
 */
export const generateSpeechFromText = async (text: string, voice: VoiceName): Promise<string> => {
  const ai = getAiClient();

  const response = await ai.models.generateContent({
    model: 'gemini-2.5-flash-preview-tts',
    contents: { parts: [{ text: text }] },
    config: {
      responseModalities: [Modality.AUDIO],
      speechConfig: {
        voiceConfig: {
          prebuiltVoiceConfig: { voiceName: voice },
        },
      },
    },
  });

  const base64Audio = response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data;
  if (!base64Audio) {
    throw new Error("No audio data received from model");
  }

  return base64Audio;
};