Files
news-reader-actions-test/services/geminiService.ts
2025-11-27 21:19:58 +08:00

244 lines
7.8 KiB
TypeScript

import { GoogleGenAI, Modality } from '@google/genai';
import { VoiceName } from '../types';
import { normalizeUrl } from '../utils/url';
/**
 * Builds a Gemini SDK client from the API key exposed by Vite.
 *
 * A new `GoogleGenAI` instance is constructed on every call.
 *
 * @returns A client configured with `VITE_API_KEY`.
 * @throws Error when `VITE_API_KEY` is unset or empty.
 */
const getAiClient = () => {
  const key = import.meta.env.VITE_API_KEY;
  if (key) {
    return new GoogleGenAI({ apiKey: key });
  }
  throw new Error(
    "Gemini API key is missing. Set VITE_API_KEY in your .env.local file (e.g., VITE_API_KEY=your_key_here)."
  );
};
/**
 * CORS proxy URL builders, in the order they are attempted.
 * Keeping several of them lets a fetch still succeed when one service
 * is down or blocked.
 */
const PROXY_PROVIDERS = [
  // AllOrigins: generally the most reliable for raw text
  (target: string) => 'https://api.allorigins.win/raw?url=' + encodeURIComponent(target),
  // CodeTabs: good fallback, handles redirects well
  (target: string) => 'https://api.codetabs.com/v1/proxy?quest=' + encodeURIComponent(target),
  // CORSProxy.io: fast but sometimes has strict CORS headers
  (target: string) => 'https://corsproxy.io/?' + encodeURIComponent(target),
  // ThingProxy: another fallback; the target URL is appended to the path as-is (not percent-encoded)
  (target: string) => 'https://thingproxy.freeboard.io/fetch/' + target,
];
/**
 * Strips non-textual elements from raw HTML before it is handed to the model,
 * acting as a lightweight "Reader Mode" pre-processor.
 *
 * @param rawHtml - The HTML document source as a string.
 * @returns The `innerHTML` of the parsed `<body>` with the listed tags removed,
 *   or `rawHtml` unchanged when there is no body or parsing throws.
 */
function cleanAndMinifyHtml(rawHtml: string): string {
  try {
    const parsed = new DOMParser().parseFromString(rawHtml, 'text/html');

    // Tags that cost tokens but carry no semantic value for text extraction.
    const strippedSelector = [
      'script', 'style', 'noscript', 'iframe', 'svg', 'link', 'meta',
      'button', 'input', 'form', 'img', 'picture', 'video',
    ].join(',');
    parsed.querySelectorAll(strippedSelector).forEach((node) => node.remove());

    // Semantic tags such as <nav> / <footer> are deliberately kept, and no
    // class-based heuristics are applied: earlier attempts to isolate <article>
    // or drop .ad-container often produced the "Content appears to be empty"
    // error on sites with unusual markup. The whole <body> is passed on and the
    // model is trusted to locate the article within it.
    if (!parsed.body) {
      return rawHtml;
    }
    return parsed.body.innerHTML;
  } catch (e) {
    console.warn("HTML cleaning failed, using raw string", e);
    return rawHtml;
  }
}
/**
 * Fetches the raw HTML of a page by trying each CORS proxy in
 * `PROXY_PROVIDERS` in order until one returns a usable body.
 *
 * Each attempt is limited to 15 seconds. The limit covers both the request
 * and the download of the response body, and the timer is always cleared
 * once the attempt finishes (success or failure).
 *
 * @param inputUrl - Article URL as entered by the user; it is passed through
 *   `normalizeUrl` before being handed to the proxies.
 * @returns The response body of the first proxy that answers with an OK
 *   status and more than 100 characters of text.
 * @throws The error from the last failed attempt, or a generic Error when no
 *   attempt recorded one.
 */
async function fetchRawHtml(inputUrl: string): Promise<string> {
  const url = normalizeUrl(inputUrl);
  let lastError: unknown;

  for (const provider of PROXY_PROVIDERS) {
    let proxyUrl = '';
    const controller = new AbortController();
    // 15s budget per proxy; aborting also cancels an in-flight body read.
    const timeoutId = setTimeout(() => controller.abort(), 15000);

    try {
      proxyUrl = provider(url);
      console.log(`Fetching via proxy: ${proxyUrl}`);

      // We purposely do NOT add custom headers here. Headers such as
      // 'X-Requested-With' often trigger a CORS preflight (OPTIONS) request,
      // which many simple free proxies do not handle, causing "Load failed".
      const response = await fetch(proxyUrl, {
        signal: controller.signal,
      });

      if (!response.ok) {
        throw new Error(`Proxy returned status ${response.status}`);
      }

      // Still under the timeout: a proxy that stalls mid-body gets aborted too.
      const text = await response.text();

      // Simple validation to ensure we got something resembling HTML/text.
      if (text && text.length > 100) {
        return text;
      }
      throw new Error("Response too short, likely blocked or empty.");
    } catch (e) {
      console.warn(`Proxy attempt failed for ${proxyUrl}:`, e);
      lastError = e;
    } finally {
      // Always release the timer so failed attempts do not leave it pending.
      clearTimeout(timeoutId);
    }
  }

  throw lastError || new Error("Unable to access article content via proxies.");
}
/**
 * Sends the cleaned page HTML to Gemini and asks it to return the article's
 * title and body text between fixed delimiters, then parses that reply.
 *
 * @param html - Raw HTML of the page.
 * @param url - Source URL, included in the prompt.
 * @returns The title and text produced by `parseResponse`.
 * @throws Error when the cleaned HTML is shorter than 100 characters; errors
 *   from `getAiClient`, the model call, or `parseResponse` propagate.
 */
async function parseHtmlWithGemini(html: string, url: string): Promise<{ title: string; text: string }> {
  const client = getAiClient();
  const minified = cleanAndMinifyHtml(html);

  const tooShort = minified.length < 100;
  if (tooShort) {
    throw new Error("Content appears to be empty after cleaning. The site might require JavaScript to render.");
  }

  // The prompt text below is whitespace-sensitive runtime data; keep it flush-left.
  const extractionPrompt = `
SOURCE URL: ${url}
TASK:
I have provided the HTML source of a webpage.
Your job is to act as a dumb "Text Extractor" tool.
Extract the TITLE and the FULL BODY TEXT of the main article.
CRITICAL RULES:
1. VERBATIM: Do NOT rewrite, summarize, or fix the text. Output it exactly as written in the HTML.
2. FULL TEXT: Do NOT stop early. Process the entire HTML to find the end of the article.
3. CLEANING: Exclude ads, navigation, "read more" links, and comments.
4. FORMATTING: Keep the paragraphs intact.
5. FAILURE: If the HTML contains a CAPTCHA, Login Screen, or Paywall message instead of an article, return the text "PAYWALL_DETECTED".
Output Format:
===TITLE_START===
(Headline)
===TITLE_END===
===TEXT_START===
(Paragraph 1)
(Paragraph 2)
...
(Final Paragraph)
===TEXT_END===
HTML CONTENT:
${minified}
`;

  const request = {
    model: 'gemini-2.5-flash',
    contents: extractionPrompt,
    config: {
      // Temperature zero for the most deterministic output available.
      temperature: 0.0,
    },
  };
  const result = await client.models.generateContent(request);

  return parseResponse(result.text || "");
}
/**
 * Parses the delimiter-based model reply into a title and article text.
 *
 * Expected shape:
 *   ===TITLE_START=== headline ===TITLE_END===
 *   ===TEXT_START===  body     ===TEXT_END===
 *
 * Tolerated deviations:
 * - `===TEXT_END===` missing (e.g. the reply was cut off): everything after
 *   `===TEXT_START===` is used as the text.
 * - No `===TEXT_START===` at all and the reply is longer than 200 characters:
 *   the reply itself is used as the text. If a title block is present it is
 *   removed from the text and used as the title; otherwise the title is
 *   "Extracted Content".
 *
 * @param rawText - Raw text returned by the model.
 * @returns The trimmed title (possibly empty) and the article text.
 * @throws Error when the reply contains "PAYWALL_DETECTED", or when fewer
 *   than 50 characters of article text could be recovered.
 */
function parseResponse(rawText: string): { title: string; text: string } {
  if (rawText.includes("PAYWALL_DETECTED")) {
    throw new Error("This article is behind a paywall or anti-bot protection and cannot be accessed directly.");
  }

  const titleMatch = rawText.match(/===TITLE_START===([\s\S]*?)===TITLE_END===/);
  let title = titleMatch ? titleMatch[1].trim() : "";

  // Prefer a properly closed text block; fall back to an unterminated one.
  const closedMatch = rawText.match(/===TEXT_START===([\s\S]*?)===TEXT_END===/);
  const openMatch = closedMatch ? null : rawText.match(/===TEXT_START===([\s\S]*)$/);
  if (openMatch) {
    console.warn("Model response is missing ===TEXT_END===; using the text received so far.");
  }
  const textMatch = closedMatch ?? openMatch;
  let text = textMatch ? textMatch[1].trim() : "";

  // Fallback for replies that ignored the text delimiters entirely but still
  // look long enough to be an article.
  if (!text && !textMatch && rawText.length > 200) {
    if (!titleMatch) {
      return { title: "Extracted Content", text: rawText };
    }
    // Keep the delimiter markers and headline out of the article text.
    text = rawText.replace(titleMatch[0], "").trim();
    title = title || "Extracted Content";
  }

  if (text.length < 50) {
    throw new Error("Could not extract article text. The page structure might be too complex or empty.");
  }
  return { title, text };
}
/**
 * Main extraction entry point: fetches a page through the proxy rotation and
 * has Gemini extract its title and article text.
 *
 * @param url - URL of the article to extract.
 * @returns The extracted title and text.
 * @throws Error carrying the underlying failure's message, or
 *   "Failed to access article directly." when the failure has no usable message.
 */
export const extractArticleContent = async (url: string): Promise<{ title: string; text: string }> => {
  console.log("Attempting to extract:", url);
  try {
    // 1. Fetch raw HTML via proxy
    const html = await fetchRawHtml(url);
    // 2. Parse with Gemini
    console.log("HTML fetched (" + html.length + " chars). Parsing...");
    return await parseHtmlWithGemini(html, url);
  } catch (error: unknown) {
    console.error("Extraction failed:", error);
    // We intentionally DO NOT fall back to Google Search here, as per user request.
    // We want to fail if we can't get the direct content.

    // A thrown value may be anything (including null/undefined), so read
    // `message` defensively instead of assuming an Error instance.
    const detail = (error as { message?: unknown } | null | undefined)?.message;
    const message = typeof detail === "string" && detail ? detail : "Failed to access article directly.";
    throw new Error(message);
  }
};
/**
 * Requests speech audio for the given text from the Gemini TTS preview model.
 *
 * @param text - Text to be spoken.
 * @param voice - Prebuilt voice name passed to the model's speech config.
 * @returns The `inlineData.data` string of the first part of the first
 *   candidate (presumably base64-encoded audio — confirm against the SDK).
 * @throws Error when the response carries no such inline data.
 */
export const generateSpeechFromText = async (text: string, voice: VoiceName): Promise<string> => {
  const client = getAiClient();

  const ttsResponse = await client.models.generateContent({
    model: 'gemini-2.5-flash-preview-tts',
    contents: { parts: [{ text }] },
    config: {
      responseModalities: [Modality.AUDIO],
      speechConfig: {
        voiceConfig: {
          prebuiltVoiceConfig: { voiceName: voice },
        },
      },
    },
  });

  // Only the first part of the first candidate is inspected.
  const leadingPart = ttsResponse.candidates?.[0]?.content?.parts?.[0];
  const audioPayload = leadingPart?.inlineData?.data;
  if (audioPayload) {
    return audioPayload;
  }
  throw new Error("No audio data received from model");
};