Files
News-reader-pro/services/geminiService.ts
Anthony dadebf8cd0 feat: Enhance article segment navigation
Implement segment selection in ReaderView for user-driven playback control. This change allows users to click on specific segments within an article to jump to and play that segment directly.

The Gemini service's HTML parsing has also been simplified by removing redundant selectors and focusing on essential tag removal for more efficient text extraction.
2025-11-19 20:28:14 +08:00

251 lines
8.0 KiB
TypeScript

import { GoogleGenAI, Modality } from '@google/genai';
import { VoiceName } from '../types';
/**
 * Builds a Gemini SDK client from the API key found in the environment.
 *
 * @returns A new `GoogleGenAI` instance configured with the key.
 * @throws Error with message "API Key is missing" when `process.env.API_KEY` is unset or empty.
 */
const getAiClient = () => {
  // NOTE(review): kept as a literal `process.env.API_KEY` access; a bundler may
  // substitute this expression textually at build time — confirm before refactoring.
  const key = process.env.API_KEY;
  if (key) {
    return new GoogleGenAI({ apiKey: key });
  }
  throw new Error("API Key is missing");
};
/**
 * Ensures a user-supplied URL carries an explicit http(s) scheme.
 * Proxies often fail if 'http/https' is missing.
 *
 * - Surrounding whitespace is trimmed.
 * - An existing `http://` or `https://` prefix is detected case-insensitively
 *   and the URL is returned as-is (apart from trimming).
 * - A protocol-relative URL (`//host/path`) is completed with `https:`.
 * - Anything else is prefixed with `https://`.
 *
 * @param url Raw URL text as typed or pasted by the user.
 * @returns The trimmed URL, guaranteed to start with an http(s) scheme.
 */
const normalizeUrl = (url: string) => {
  const cleanUrl = url.trim();
  // URI schemes are case-insensitive, so "HTTPS://..." must not be re-prefixed.
  if (/^https?:\/\//i.test(cleanUrl)) {
    return cleanUrl;
  }
  // Protocol-relative form: only the scheme is missing, the slashes are already there.
  if (cleanUrl.startsWith('//')) {
    return `https:${cleanUrl}`;
  }
  return `https://${cleanUrl}`;
};
/**
 * CORS proxy URL builders, listed in the order they should be attempted.
 * Each entry maps a target URL to the corresponding proxy request URL, so a
 * caller can fall through to the next service when one is down or blocked.
 */
const PROXY_PROVIDERS = [
  // AllOrigins: generally the most reliable for raw text.
  (target: string) => {
    return 'https://api.allorigins.win/raw?url=' + encodeURIComponent(target);
  },
  // CodeTabs: good fallback, handles redirects well.
  (target: string) => {
    return 'https://api.codetabs.com/v1/proxy?quest=' + encodeURIComponent(target);
  },
  // CORSProxy.io: fast but sometimes has strict CORS headers.
  (target: string) => {
    return 'https://corsproxy.io/?' + encodeURIComponent(target);
  },
  // ThingProxy: another fallback; the target is appended to the path unencoded.
  (target: string) => {
    return 'https://thingproxy.freeboard.io/fetch/' + target;
  },
];
/**
 * Strips non-content elements from raw HTML, acting as a lightweight
 * "Reader Mode" pre-processor before the markup is sent on for text extraction.
 *
 * @param rawHtml Full HTML source of the page.
 * @returns The `innerHTML` of the parsed `<body>` with technical tags removed;
 *          the untouched input when the document has no body or when parsing
 *          throws (e.g. `DOMParser` is unavailable).
 */
function cleanAndMinifyHtml(rawHtml: string): string {
  // Tags that consume tokens and provide no semantic value for text extraction.
  const strippedTags = ['script', 'style', 'noscript', 'iframe', 'svg', 'link', 'meta', 'button', 'input', 'form', 'img', 'picture', 'video'];

  try {
    const parsed = new DOMParser().parseFromString(rawHtml, 'text/html');

    // One combined selector matches every unwanted element in a single pass.
    parsed.querySelectorAll(strippedTags.join(',')).forEach(node => node.remove());

    // NOTE: Semantic tags such as <nav> or <footer> are intentionally kept, and no
    // class-based heuristics are applied. Earlier attempts to isolate <article> or
    // drop .ad-container often caused the "Content appears to be empty" error on
    // sites with unique structures. The whole <body> is returned and the model is
    // trusted to locate the article inside it.
    if (!parsed.body) {
      return rawHtml;
    }
    return parsed.body.innerHTML;
  } catch (err) {
    console.warn("HTML cleaning failed, using raw string", err);
    return rawHtml;
  }
}
/**
 * Fetches the raw HTML of a page by trying each CORS proxy in
 * `PROXY_PROVIDERS` in order until one returns a usable body.
 *
 * Each attempt is bounded by a 15 second abort timer that covers both the
 * request and the body download; the timer is always released when the
 * attempt ends, whether it succeeded or failed.
 *
 * @param inputUrl URL of the article; a missing scheme is added via `normalizeUrl`.
 * @returns The response body of the first proxy that answers with an OK status
 *          and more than 100 characters of text.
 * @throws The error from the last failed attempt, or a generic Error when no
 *         attempt recorded one.
 */
async function fetchRawHtml(inputUrl: string): Promise<string> {
  const url = normalizeUrl(inputUrl);
  let lastError: unknown;

  for (const provider of PROXY_PROVIDERS) {
    let proxyUrl = '';
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), 15000); // 15s timeout per proxy

    try {
      proxyUrl = provider(url);
      console.log(`Fetching via proxy: ${proxyUrl}`);

      // We purposely do NOT add complex headers here.
      // Adding headers like 'X-Requested-With' often triggers a CORS Preflight (OPTIONS) request,
      // which many simple free proxies do not handle correctly, causing "Load failed".
      const response = await fetch(proxyUrl, {
        signal: controller.signal,
      });

      if (!response.ok) {
        throw new Error(`Proxy returned status ${response.status}`);
      }

      // The abort signal is still armed here, so a stalled body read is also cut off.
      const text = await response.text();

      // Simple validation to ensure we got something resembling HTML/Text
      if (text && text.length > 100) {
        return text;
      }
      throw new Error("Response too short, likely blocked or empty.");
    } catch (e) {
      console.warn(`Proxy attempt failed for ${proxyUrl}:`, e);
      lastError = e;
    } finally {
      // Release the timer on every exit path so failed attempts do not leave it pending.
      clearTimeout(timeoutId);
    }
  }

  throw lastError || new Error("Unable to access article content via proxies.");
}
/**
 * Builds the extraction prompt sent to the model for a given page.
 * The template text is kept flush-left so no indentation leaks into the prompt.
 */
function buildExtractionPrompt(url: string, cleanedHtml: string): string {
  return `
SOURCE URL: ${url}
TASK:
I have provided the HTML source of a webpage.
Your job is to act as a dumb "Text Extractor" tool.
Extract the TITLE and the FULL BODY TEXT of the main article.
CRITICAL RULES:
1. VERBATIM: Do NOT rewrite, summarize, or fix the text. Output it exactly as written in the HTML.
2. FULL TEXT: Do NOT stop early. Process the entire HTML to find the end of the article.
3. CLEANING: Exclude ads, navigation, "read more" links, and comments.
4. FORMATTING: Keep the paragraphs intact.
5. FAILURE: If the HTML contains a CAPTCHA, Login Screen, or Paywall message instead of an article, return the text "PAYWALL_DETECTED".
Output Format:
===TITLE_START===
(Headline)
===TITLE_END===
===TEXT_START===
(Paragraph 1)
(Paragraph 2)
...
(Final Paragraph)
===TEXT_END===
HTML CONTENT:
${cleanedHtml}
`;
}

/**
 * Asks Gemini to pull the article title and body text out of raw HTML.
 *
 * The markup is first reduced with `cleanAndMinifyHtml`, then embedded in a
 * delimiter-based prompt; the model reply is decoded by `parseResponse`.
 *
 * @param html Raw HTML of the page.
 * @param url  Source URL, included in the prompt for context.
 * @returns The extracted `{ title, text }` pair.
 * @throws Error when the cleaned HTML is shorter than 100 characters, plus
 *         whatever `getAiClient`, the model call, or `parseResponse` throw.
 */
async function parseHtmlWithGemini(html: string, url: string): Promise<{ title: string; text: string }> {
  const client = getAiClient();

  const strippedHtml = cleanAndMinifyHtml(html);
  const looksEmpty = strippedHtml.length < 100;
  if (looksEmpty) {
    throw new Error("Content appears to be empty after cleaning. The site might require JavaScript to render.");
  }

  const reply = await client.models.generateContent({
    model: 'gemini-2.5-flash',
    contents: buildExtractionPrompt(url, strippedHtml),
    // Temperature 0 keeps the output strictly deterministic.
    config: { temperature: 0.0 },
  });

  const replyText = reply.text || "";
  return parseResponse(replyText);
}
/**
 * Decodes the delimiter-based reply produced by the extraction prompt.
 *
 * Handled shapes, in order:
 * 1. Reply contains "PAYWALL_DETECTED" -> throws a paywall/anti-bot error.
 * 2. Both `===TEXT_START===` and `===TEXT_END===` are present -> the text between them is used.
 * 3. Only `===TEXT_START===` is present (reply cut off before the end marker)
 *    -> everything after the start marker is salvaged as the article text.
 * 4. No text delimiters at all and the reply is longer than 200 characters
 *    -> the whole reply is returned under the title "Extracted Content".
 *
 * @param rawText Raw text returned by the model.
 * @returns The extracted title (empty string when no title block was found) and body text.
 * @throws Error when a paywall marker is present, or when the resulting text is
 *         missing or shorter than 50 characters.
 */
function parseResponse(rawText: string): { title: string; text: string } {
  if (rawText.includes("PAYWALL_DETECTED")) {
    throw new Error("This article is behind a paywall or anti-bot protection and cannot be accessed directly.");
  }

  const textStartMarker = "===TEXT_START===";

  const titleMatch = rawText.match(/===TITLE_START===([\s\S]*?)===TITLE_END===/);
  const textMatch = rawText.match(/===TEXT_START===([\s\S]*?)===TEXT_END===/);

  const title = titleMatch?.[1]?.trim() ?? "";
  let text = textMatch?.[1]?.trim() ?? "";

  // Fallback logic for malformed AI responses
  if (!textMatch) {
    const startIndex = rawText.indexOf(textStartMarker);
    if (startIndex !== -1) {
      // The reply was cut off before ===TEXT_END===; keep what was produced
      // rather than discarding an otherwise usable article.
      text = rawText.slice(startIndex + textStartMarker.length).trim();
    } else if (rawText.length > 200) {
      // The model ignored the delimiters but returned a sizeable body; use it as-is.
      return { title: "Extracted Content", text: rawText };
    }
  }

  if (!text || text.length < 50) {
    throw new Error("Could not extract article text. The page structure might be too complex or empty.");
  }

  return { title, text };
}
/**
 * Main extraction entry point: downloads a page through the proxy rotation
 * and has Gemini extract its title and body text.
 *
 * There is intentionally no fallback (e.g. to Google Search): if the direct
 * content cannot be obtained, the call fails.
 *
 * @param url Article URL as supplied by the user.
 * @returns The extracted `{ title, text }` pair.
 * @throws Error carrying the message of the underlying failure, or
 *         "Failed to access article directly." when the failure has no usable message.
 */
export const extractArticleContent = async (url: string): Promise<{ title: string; text: string }> => {
  console.log("Attempting to extract:", url);
  try {
    // 1. Fetch Raw HTML via Proxy
    const html = await fetchRawHtml(url);

    // 2. Parse with Gemini
    console.log("HTML fetched (" + html.length + " chars). Parsing...");
    return await parseHtmlWithGemini(html, url);
  } catch (error: unknown) {
    console.error("Extraction failed:", error);

    // Narrow before touching `.message`: anything can be thrown, including
    // null/undefined, and dereferencing those would raise a second error here.
    const detail =
      typeof error === 'object' && error !== null && 'message' in error
        ? (error as { message?: unknown }).message
        : undefined;
    const message = typeof detail === 'string' && detail ? detail : "Failed to access article directly.";

    // We intentionally DO NOT fall back to Google Search here, as per user request.
    // We want to fail if we can't get the direct content.
    throw new Error(message);
  }
};
/**
 * Synthesizes speech for a piece of text with the Gemini TTS preview model.
 *
 * @param text  Text to be spoken.
 * @param voice Prebuilt voice name passed to the model's speech configuration.
 * @returns The `inlineData.data` payload of the first part of the first
 *          candidate (the variable naming suggests base64-encoded audio).
 * @throws Error "No audio data received from model" when that payload is absent,
 *         plus whatever `getAiClient` or the model call throw.
 */
export const generateSpeechFromText = async (text: string, voice: VoiceName): Promise<string> => {
  const client = getAiClient();

  const speechConfig = {
    voiceConfig: {
      prebuiltVoiceConfig: { voiceName: voice },
    },
  };

  const result = await client.models.generateContent({
    model: 'gemini-2.5-flash-preview-tts',
    contents: { parts: [{ text }] },
    config: {
      responseModalities: [Modality.AUDIO],
      speechConfig,
    },
  });

  // Only the first part of the first candidate is inspected for audio.
  const audioPayload = result.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data;
  if (audioPayload) {
    return audioPayload;
  }
  throw new Error("No audio data received from model");
};