feat: Initialize project with basic structure and dependencies
Sets up the foundational elements for the NewsCaster AI application. This includes:
- Initializing the project with Vite and React.
- Defining core types for articles and player state.
- Configuring build tools and TypeScript.
- Adding essential dependencies like React, Vite, and Google's Gemini API client.
- Providing initial README instructions for running locally.
- Setting up basic styling and structure in index.html.
- Defining available voices and playback constants.
- Implementing utility functions for audio handling.
This commit is contained in:
283
services/geminiService.ts
Normal file
283
services/geminiService.ts
Normal file
@@ -0,0 +1,283 @@
|
||||
import { GoogleGenAI, Modality } from '@google/genai';
|
||||
import { VoiceName } from '../types';
|
||||
|
||||
/**
 * Creates a Gemini SDK client configured with the `API_KEY` environment value.
 *
 * @returns A new `GoogleGenAI` instance (a fresh one on every call).
 * @throws Error with message "API Key is missing" when the key is unset or empty.
 */
const getAiClient = () => {
  // NOTE(review): kept as the literal `process.env.API_KEY` expression —
  // presumably substituted at build time by the bundler; confirm before refactoring.
  const key = process.env.API_KEY;

  if (key) {
    return new GoogleGenAI({ apiKey: key });
  }

  throw new Error("API Key is missing");
};
|
||||
|
||||
/**
 * Ensures a user-supplied URL carries an explicit http(s) scheme.
 * Proxies often fail if 'http/https' is missing.
 *
 * - Surrounding whitespace is trimmed.
 * - An existing `http://` or `https://` prefix is kept as-is, in any letter case.
 * - Protocol-relative input (`//host/path`) only gets `https:` prepended.
 * - Anything else is prefixed with `https://`.
 *
 * @param url Raw URL text as entered by the user.
 * @returns The trimmed URL, guaranteed to start with an http(s) scheme.
 */
const normalizeUrl = (url: string) => {
  const cleanUrl = url.trim();

  // URI schemes are case-insensitive, so "HTTPS://..." must not be prefixed again.
  if (/^https?:\/\//i.test(cleanUrl)) {
    return cleanUrl;
  }

  // "//host/path" already carries the slashes; adding "https://" would yield "https:////host".
  if (cleanUrl.startsWith('//')) {
    return `https:${cleanUrl}`;
  }

  return `https://${cleanUrl}`;
};
|
||||
|
||||
/**
 * Ordered list of CORS proxy URL builders.
 * Each entry maps a target URL to the request URL for one proxy service;
 * having several lets callers fall through to the next one when a service
 * is down or blocked.
 */
const PROXY_PROVIDERS = [
  // AllOrigins: Generally the most reliable for raw text
  (target: string) => 'https://api.allorigins.win/raw?url=' + encodeURIComponent(target),

  // CodeTabs: Good fallback, handles redirects well
  (target: string) => 'https://api.codetabs.com/v1/proxy?quest=' + encodeURIComponent(target),

  // CORSProxy.io: Fast but sometimes has strict CORS headers
  (target: string) => 'https://corsproxy.io/?' + encodeURIComponent(target),

  // ThingProxy: Another fallback (target is appended to the path without encoding)
  (target: string) => 'https://thingproxy.freeboard.io/fetch/' + target,
];
|
||||
|
||||
/**
 * Cleans raw HTML by removing scripts, styles, and non-content elements.
 * This acts like a dedicated "Reader Mode" pre-processor.
 *
 * Steps:
 *  1. Drop technical/interactive/media tags.
 *  2. Drop layout tags (`nav`, `footer`, `aside`, `header`), except a `<header>`
 *     nested inside an `<article>`, which is kept so the headline survives.
 *  3. Drop ad/social/cookie/comment containers matched by class/id heuristics,
 *     never removing `<html>` or `<body>` themselves.
 *  4. Return the inner HTML of the first `<article>` whose text is longer than
 *     200 characters, else of `<main>` under the same rule, else of `<body>`.
 *
 * @param rawHtml Full HTML source of the page.
 * @returns The cleaned HTML fragment. If anything throws (for example when
 *          `DOMParser` is unavailable), a warning is logged and `rawHtml`
 *          is returned unchanged.
 */
function cleanAndMinifyHtml(rawHtml: string): string {
  try {
    const doc = new DOMParser().parseFromString(rawHtml, 'text/html');

    // 1. Remove heavy technical tags
    const technicalTags = [
      'script', 'style', 'noscript', 'iframe', 'svg', 'link', 'meta',
      'button', 'input', 'form', 'img', 'picture', 'video',
    ];
    doc.querySelectorAll(technicalTags.join(', ')).forEach(el => el.remove());

    // 2. Remove semantic layout tags that are usually clutter.
    //    A <header> inside an <article> usually carries the headline, so keep it.
    doc.querySelectorAll('nav, footer, aside, header').forEach(el => {
      if (!el.matches('article header')) {
        el.remove();
      }
    });

    // 3. Remove common ad/social/cookie containers by class/id heuristics.
    //    "ad-" is matched only at the start of a name or after a separator so that
    //    names such as "lead-", "head-" or "read-more" are not treated as ads.
    const junkSelectors = [
      '[class^="ad-"]', '[class*=" ad-"]', '[class*="-ad-"]', '[class*="_ad-"]',
      '[id^="ad-"]', '[id*="-ad-"]', '[id*="_ad-"]',
      '[class*="cookie"]', '[id*="cookie"]',
      '[class*="newsletter"]', '[id*="newsletter"]',
      '[class*="social"]', '[class*="share"]',
      '[class*="comment"]', '[id*="comment"]',
      '[class*="recommended"]', '[class*="related"]',
    ];
    doc.querySelectorAll(junkSelectors.join(', ')).forEach(el => {
      // Removing the root containers would leave `doc.body` null and discard the whole page.
      if (!el.matches('html, body')) {
        el.remove();
      }
    });

    // 4. Return the cleanest possible content.
    //    A dedicated <article> (then <main>) is usually the best bet when it has real text.
    for (const tag of ['article', 'main']) {
      const container = doc.querySelector(tag);
      if (container && (container.textContent?.length ?? 0) > 200) {
        return container.innerHTML;
      }
    }

    // Fallback: Return the cleaned body
    return doc.body.innerHTML;
  } catch (e) {
    console.warn("HTML cleaning failed, using raw string", e);
    return rawHtml;
  }
}
|
||||
|
||||
/**
 * Fetches raw HTML for a URL by trying each entry of `PROXY_PROVIDERS` in order.
 *
 * Each attempt is limited to 10 seconds covering both the request and the
 * body download. An attempt fails when the proxy responds with a non-OK
 * status, when the body is 100 characters or shorter, or when the request
 * throws/aborts; the next proxy is then tried.
 *
 * @param inputUrl Article URL; passed through `normalizeUrl` first.
 * @returns The response body text of the first successful proxy attempt.
 * @throws The error from the last failed attempt, or a generic Error if no
 *         attempt recorded one.
 */
async function fetchRawHtml(inputUrl: string): Promise<string> {
  const url = normalizeUrl(inputUrl);
  let lastError: unknown;

  for (const provider of PROXY_PROVIDERS) {
    let proxyUrl = '';
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), 10000); // 10s timeout per proxy

    try {
      proxyUrl = provider(url);
      console.log(`Fetching via proxy: ${proxyUrl}`);

      // We purposely do NOT add complex headers here.
      // Adding headers like 'X-Requested-With' often triggers a CORS Preflight (OPTIONS) request,
      // which many simple free proxies do not handle correctly, causing "Load failed".
      const response = await fetch(proxyUrl, {
        signal: controller.signal,
      });

      if (!response.ok) {
        throw new Error(`Proxy returned status ${response.status}`);
      }

      // The timer is still armed here so a stalled body download is aborted too.
      const text = await response.text();

      // Simple validation to ensure we got something resembling HTML/Text
      if (text && text.length > 100) {
        return text;
      }
      throw new Error("Response too short, likely blocked or empty.");
    } catch (e) {
      console.warn(`Proxy attempt failed for ${proxyUrl}:`, e);
      lastError = e;
    } finally {
      // Always release the timer, whether the attempt succeeded, failed or threw.
      clearTimeout(timeoutId);
    }
  }

  throw lastError ?? new Error("Unable to access article content via proxies.");
}
|
||||
|
||||
/**
 * Asks Gemini to pull the article title and body text out of a page's HTML.
 *
 * The HTML is first reduced with `cleanAndMinifyHtml`; the reduced markup is
 * embedded in a fixed extraction prompt and the model's reply is handed to
 * `parseResponse`.
 *
 * @param html Raw HTML of the page.
 * @param url Source URL, included in the prompt.
 * @returns The extracted `{ title, text }` as produced by `parseResponse`.
 * @throws Error when the cleaned HTML is shorter than 100 characters, plus
 *         anything thrown by `getAiClient`, the SDK call, or `parseResponse`.
 */
async function parseHtmlWithGemini(html: string, url: string): Promise<{ title: string; text: string }> {
  const client = getAiClient();

  const cleanedHtml = cleanAndMinifyHtml(html);
  if (cleanedHtml.length < 100) {
    throw new Error("Content appears to be empty after cleaning. The site might require JavaScript to render.");
  }

  // The prompt is assembled line by line; the leading and trailing empty
  // entries reproduce the newline right after the opening and before the
  // closing of the original multi-line template.
  const promptLines = [
    '',
    `SOURCE URL: ${url}`,
    '',
    'TASK:',
    'I have provided the HTML source of a webpage.',
    'Your job is to act as a dumb "Text Extractor" tool.',
    'Extract the TITLE and the FULL BODY TEXT of the main article.',
    '',
    'CRITICAL RULES:',
    '1. VERBATIM: Do NOT rewrite, summarize, or fix the text. Output it exactly as written in the HTML.',
    '2. FULL TEXT: Do NOT stop early. Process the entire HTML to find the end of the article.',
    '3. CLEANING: Exclude ads, navigation, "read more" links, and comments.',
    '4. FORMATTING: Keep the paragraphs intact.',
    '5. FAILURE: If the HTML contains a CAPTCHA, Login Screen, or Paywall message instead of an article, return the text "PAYWALL_DETECTED".',
    '',
    'Output Format:',
    '===TITLE_START===',
    '(Headline)',
    '===TITLE_END===',
    '===TEXT_START===',
    '(Paragraph 1)',
    '',
    '(Paragraph 2)',
    '',
    '...',
    '',
    '(Final Paragraph)',
    '===TEXT_END===',
    '',
    'HTML CONTENT:',
    `${cleanedHtml}`,
    '',
  ];
  const prompt = promptLines.join('\n');

  const result = await client.models.generateContent({
    model: 'gemini-2.5-flash',
    contents: prompt,
    // Strict deterministic output
    config: { temperature: 0.0 },
  });

  const reply = result.text || "";
  return parseResponse(reply);
}
|
||||
|
||||
/**
 * Parses the delimiter-based reply of the extraction prompt.
 *
 * Expected layout:
 *   ===TITLE_START=== … ===TITLE_END===
 *   ===TEXT_START===  … ===TEXT_END===
 *
 * Tolerated deviations:
 *  - Missing `===TEXT_END===` (e.g. the reply was cut off): everything after
 *    `===TEXT_START===` is used as the text.
 *  - No text delimiters at all: if the reply is longer than 200 characters it
 *    is returned whole under the title "Extracted Content".
 *
 * @param rawText Raw text returned by the model.
 * @returns The trimmed title (possibly empty) and article text.
 * @throws Error when the reply contains "PAYWALL_DETECTED", or when the
 *         extracted text is missing or shorter than 50 characters.
 */
function parseResponse(rawText: string): { title: string; text: string } {
  if (rawText.includes("PAYWALL_DETECTED")) {
    throw new Error("This article is behind a paywall or anti-bot protection and cannot be accessed directly.");
  }

  const title = rawText.match(/===TITLE_START===([\s\S]*?)===TITLE_END===/)?.[1]?.trim() ?? "";

  // The end marker is optional: a truncated reply still contains usable text
  // after the start marker, so capture up to the marker or the end of input.
  const text = rawText.match(/===TEXT_START===([\s\S]*?)(?:===TEXT_END===|$)/)?.[1]?.trim() ?? "";

  // Fallback for malformed AI responses: no delimiters, but enough content to look like an article.
  if (!text && !rawText.includes("===TEXT_START===") && rawText.length > 200) {
    return { title: "Extracted Content", text: rawText };
  }

  if (text.length < 50) {
    throw new Error("Could not extract article text. The page structure might be too complex or empty.");
  }

  return { title, text };
}
|
||||
|
||||
/**
 * Main Extraction Function.
 *
 * Fetches the page HTML through `fetchRawHtml`, then extracts the title and
 * body text with `parseHtmlWithGemini`.
 *
 * @param url Article URL as entered by the user.
 * @returns The extracted `{ title, text }`.
 * @throws Error carrying the underlying failure's message (or
 *         "Failed to access article directly." when it has none). The
 *         original thrown value is attached as `cause` for debugging.
 */
export const extractArticleContent = async (url: string): Promise<{ title: string; text: string }> => {
  console.log("Attempting to extract:", url);

  try {
    // 1. Fetch Raw HTML via Proxy
    const html = await fetchRawHtml(url);

    // 2. Parse with Gemini
    console.log("HTML fetched (" + html.length + " chars). Parsing...");
    return await parseHtmlWithGemini(html, url);
  } catch (error: unknown) {
    console.error("Extraction failed:", error);

    // We intentionally DO NOT fall back to Google Search here, as per user request.
    // We want to fail if we can't get the direct content.

    // Narrow the unknown value: any object exposing a non-empty string `message` supplies the text.
    const candidate =
      typeof error === 'object' && error !== null
        ? (error as { message?: unknown }).message
        : undefined;
    const message =
      typeof candidate === 'string' && candidate ? candidate : "Failed to access article directly.";

    // Keep the original failure reachable instead of discarding its stack and type.
    throw Object.assign(new Error(message), { cause: error });
  }
};
|
||||
|
||||
/**
 * Generates speech audio for a piece of text using the Gemini TTS model.
 *
 * @param text Text to be spoken.
 * @param voice Prebuilt voice name to use.
 * @returns The inline audio payload of the first part of the first candidate
 *          (presumably base64-encoded, going by the original variable name —
 *          confirm against the SDK).
 * @throws Error "No audio data received from model" when that payload is absent,
 *         plus anything thrown by `getAiClient` or the SDK call.
 */
export const generateSpeechFromText = async (text: string, voice: VoiceName): Promise<string> => {
  const client = getAiClient();

  const speechConfig = {
    voiceConfig: {
      prebuiltVoiceConfig: { voiceName: voice },
    },
  };

  const result = await client.models.generateContent({
    model: 'gemini-2.5-flash-preview-tts',
    contents: { parts: [{ text }] },
    config: {
      responseModalities: [Modality.AUDIO],
      speechConfig,
    },
  });

  // Only the first part of the first candidate is inspected.
  const firstPart = result.candidates?.[0]?.content?.parts?.[0];
  const audio = firstPart?.inlineData?.data;

  if (audio) {
    return audio;
  }

  throw new Error("No audio data received from model");
};
|
||||
Reference in New Issue
Block a user