feat: Initialize project with basic structure and dependencies

Sets up the foundational elements for the NewsCaster AI application. This includes:
- Initializing the project with Vite and React.
- Defining core types for articles and player state.
- Configuring build tools and TypeScript.
- Adding essential dependencies like React, Vite, and Google's Gemini API client.
- Providing initial README instructions for running locally.
- Setting up basic styling and structure in index.html.
- Defining available voices and playback constants.
- Implementing utility functions for audio handling.
2025-11-19 19:33:34 +08:00
parent 860124c0e0
commit 0775104b69
16 changed files with 1122 additions and 8 deletions
--- a/services/geminiService.ts
+++ b/services/geminiService.ts
@@ -0,0 +1,283 @@
+import { GoogleGenAI, Modality } from '@google/genai';
+import { VoiceName } from '../types';
+
+/**
+ * Creates a Gemini client using the key read from `process.env.API_KEY`.
+ *
+ * @returns A new `GoogleGenAI` instance configured with that key.
+ * @throws Error with message "API Key is missing" when the variable is unset or empty.
+ */
+const getAiClient = () => {
+  // Keep the full `process.env.API_KEY` expression intact: bundlers that inject
+  // the key by static replacement match on this exact member access.
+  const apiKey = process.env.API_KEY;
+  if (apiKey) {
+    return new GoogleGenAI({ apiKey });
+  }
+  throw new Error("API Key is missing");
+};
+
+/**
+ * Ensures a user-supplied URL carries an explicit protocol.
+ * Proxies often fail if 'http/https' is missing.
+ *
+ * - Surrounding whitespace is trimmed.
+ * - An existing `http://` or `https://` prefix (matched case-insensitively,
+ *   since URL schemes are not case-sensitive) is left untouched.
+ * - A protocol-relative URL (`//host/path`) gets `https:` prepended.
+ * - Anything else gets `https://` prepended.
+ *
+ * @param url Raw URL text as typed or pasted by the user.
+ * @returns The trimmed URL, guaranteed to start with an http(s) scheme.
+ */
+const normalizeUrl = (url: string) => {
+  const cleanUrl = url.trim();
+  if (/^https?:\/\//i.test(cleanUrl)) {
+    return cleanUrl;
+  }
+  if (cleanUrl.startsWith('//')) {
+    // Protocol-relative form: only the scheme is missing, not the slashes.
+    return `https:${cleanUrl}`;
+  }
+  return `https://${cleanUrl}`;
+};
+
+/**
+ * Ordered list of CORS proxy URL builders.
+ * Each entry maps a target page URL to the request URL for one public proxy;
+ * callers try them first-to-last so that one proxy being down or blocked
+ * does not make the fetch fail outright.
+ */
+const PROXY_PROVIDERS = [
+  // AllOrigins (raw endpoint): target passed percent-encoded in `url=`.
+  (target: string) => 'https://api.allorigins.win/raw?url=' + encodeURIComponent(target),
+
+  // CodeTabs: target passed percent-encoded in `quest=`.
+  (target: string) => 'https://api.codetabs.com/v1/proxy?quest=' + encodeURIComponent(target),
+
+  // CORSProxy.io: target passed percent-encoded as the bare query string.
+  (target: string) => 'https://corsproxy.io/?' + encodeURIComponent(target),
+
+  // ThingProxy: target appended to the path as-is (not percent-encoded).
+  (target: string) => 'https://thingproxy.freeboard.io/fetch/' + target
+];
+
+/**
+ * "Reader mode" pre-processor: parses raw HTML and strips everything that is
+ * unlikely to be article content before the markup is handed on.
+ *
+ * Removal passes, in order:
+ *   1. technical/media tags (scripts, styles, forms, images, ...),
+ *   2. layout tags (nav, footer, aside, header),
+ *   3. elements whose class/id matches common ad, cookie, newsletter,
+ *      social, comment or "related content" naming.
+ *
+ * The result is the inner HTML of the first `<article>` whose text is longer
+ * than 200 characters, else of the first `<main>` meeting the same bar, else
+ * of the whole cleaned `<body>`.
+ *
+ * @param rawHtml Unprocessed page markup.
+ * @returns Cleaned HTML, or `rawHtml` unchanged if parsing/cleaning throws.
+ */
+function cleanAndMinifyHtml(rawHtml: string): string {
+  try {
+    const doc = new DOMParser().parseFromString(rawHtml, 'text/html');
+
+    // Detaches every element in the document that matches `selector`.
+    const purge = (selector: string): void => {
+      doc.querySelectorAll(selector).forEach(node => node.remove());
+    };
+
+    // Pass 1: heavy technical tags.
+    for (const tag of ['script', 'style', 'noscript', 'iframe', 'svg', 'link', 'meta', 'button', 'input', 'form', 'img', 'picture', 'video']) {
+      purge(tag);
+    }
+
+    // Pass 2: semantic layout tags that are usually clutter.
+    for (const tag of ['nav', 'footer', 'aside', 'header']) {
+      purge(tag);
+    }
+
+    // Pass 3: ad/social/cookie containers, found by class/id substring heuristics.
+    const junkSelectors = [
+      '[class*="ad-"]', '[id*="ad-"]',
+      '[class*="cookie"]', '[id*="cookie"]',
+      '[class*="newsletter"]', '[id*="newsletter"]',
+      '[class*="social"]', '[class*="share"]',
+      '[class*="comment"]', '[id*="comment"]',
+      '[class*="recommended"]', '[class*="related"]'
+    ];
+    for (const selector of junkSelectors) {
+      try {
+        purge(selector);
+      } catch (e) {
+        // Ignore invalid selector errors
+      }
+    }
+
+    // Prefer a dedicated content container when it holds a meaningful amount of text.
+    for (const containerTag of ['article', 'main']) {
+      const container = doc.querySelector(containerTag);
+      if (container && container.textContent && container.textContent.length > 200) {
+        return container.innerHTML;
+      }
+    }
+
+    // Fallback: the cleaned body as a whole.
+    return doc.body.innerHTML;
+  } catch (e) {
+    console.warn("HTML cleaning failed, using raw string", e);
+    return rawHtml;
+  }
+}
+
+/**
+ * Fetches the raw HTML of a page by trying each entry of PROXY_PROVIDERS in order.
+ *
+ * Every attempt gets its own 10-second budget that covers BOTH the request and
+ * the download of the response body; the first proxy that returns an OK status
+ * with more than 100 characters of text wins.
+ *
+ * @param inputUrl Target page URL; passed through normalizeUrl first.
+ * @returns The response body text from the first successful proxy.
+ * @throws The error from the last failed attempt, or a generic Error if no
+ *         attempt was made.
+ */
+async function fetchRawHtml(inputUrl: string): Promise<string> {
+  const url = normalizeUrl(inputUrl);
+  let lastError: unknown;
+
+  for (const provider of PROXY_PROVIDERS) {
+    let proxyUrl = '';
+    const controller = new AbortController();
+    const timeoutId = setTimeout(() => controller.abort(), 10000); // 10s timeout per proxy
+
+    try {
+      proxyUrl = provider(url);
+      console.log(`Fetching via proxy: ${proxyUrl}`);
+
+      // We purposely do NOT add complex headers here.
+      // Adding headers like 'X-Requested-With' often triggers a CORS Preflight (OPTIONS) request,
+      // which many simple free proxies do not handle correctly, causing "Load failed".
+      const response = await fetch(proxyUrl, {
+        signal: controller.signal,
+      });
+
+      if (!response.ok) {
+        throw new Error(`Proxy returned status ${response.status}`);
+      }
+
+      // The abort signal is still armed here, so a proxy that sends headers
+      // and then stalls on the body is cut off by the same timeout.
+      const text = await response.text();
+
+      // Simple validation to ensure we got something resembling HTML/Text
+      if (text && text.length > 100) {
+        return text;
+      }
+      throw new Error("Response too short, likely blocked or empty.");
+    } catch (e) {
+      console.warn(`Proxy attempt failed for ${proxyUrl}:`, e);
+      lastError = e;
+    } finally {
+      // Always release the timer, on success and on every failure path.
+      clearTimeout(timeoutId);
+    }
+  }
+
+  throw lastError ?? new Error("Unable to access article content via proxies.");
+}
+
+/**
+ * Uses Gemini to extract clean text from the raw HTML.
+ *
+ * The HTML is first reduced with cleanAndMinifyHtml, then embedded in a prompt
+ * that asks the model to return the article title and body between fixed
+ * `===TITLE_START===` / `===TEXT_START===` style delimiters (or the sentinel
+ * `PAYWALL_DETECTED`). The model's text reply is handed to parseResponse.
+ *
+ * @param html Raw page markup as fetched.
+ * @param url  Source URL; only interpolated into the prompt, not fetched here.
+ * @returns The `{ title, text }` object produced by parseResponse.
+ * @throws Error when the cleaned HTML is shorter than 100 characters; errors
+ *         thrown by getAiClient, the model call, or parseResponse propagate.
+ */
+async function parseHtmlWithGemini(html: string, url: string): Promise<{ title: string; text: string }> {
+  // Client is created first, so a missing API key fails before any cleaning work.
+  const ai = getAiClient();
+  
+  const cleanedHtml = cleanAndMinifyHtml(html);
+  
+  // Almost nothing left after cleaning: bail out instead of sending an empty prompt.
+  if (cleanedHtml.length < 100) {
+    throw new Error("Content appears to be empty after cleaning. The site might require JavaScript to render.");
+  }
+
+  // NOTE: the template below is sent verbatim (including its indentation);
+  // the delimiter names must stay in sync with the patterns in parseResponse.
+  const prompt = `
+    SOURCE URL: ${url}
+    
+    TASK:
+    I have provided the HTML source of a webpage.
+    Your job is to act as a dumb "Text Extractor" tool.
+    Extract the TITLE and the FULL BODY TEXT of the main article.
+
+    CRITICAL RULES:
+    1. VERBATIM: Do NOT rewrite, summarize, or fix the text. Output it exactly as written in the HTML.
+    2. FULL TEXT: Do NOT stop early. Process the entire HTML to find the end of the article.
+    3. CLEANING: Exclude ads, navigation, "read more" links, and comments.
+    4. FORMATTING: Keep the paragraphs intact.
+    5. FAILURE: If the HTML contains a CAPTCHA, Login Screen, or Paywall message instead of an article, return the text "PAYWALL_DETECTED".
+    
+    Output Format:
+    ===TITLE_START===
+    (Headline)
+    ===TITLE_END===
+    ===TEXT_START===
+    (Paragraph 1)
+    
+    (Paragraph 2)
+    
+    ...
+    
+    (Final Paragraph)
+    ===TEXT_END===
+
+    HTML CONTENT:
+    ${cleanedHtml} 
+  `;
+
+  const response = await ai.models.generateContent({
+    model: 'gemini-2.5-flash',
+    contents: prompt,
+    config: {
+      temperature: 0.0, // Strict deterministic output
+    }
+  });
+
+  // A missing text reply is treated as an empty string and left to parseResponse to reject.
+  return parseResponse(response.text || "");
+}
+
+/**
+ * Turns the model's delimited reply into a `{ title, text }` pair.
+ *
+ * @param rawText Full text returned by the model.
+ * @returns Trimmed title (may be empty) and trimmed body text. If the reply
+ *          has no `===TEXT_START===` marker at all but is longer than 200
+ *          characters, the whole reply is returned as the text under the
+ *          title "Extracted Content".
+ * @throws Error when the reply contains `PAYWALL_DETECTED`, or when the
+ *         extracted text is shorter than 50 characters.
+ */
+function parseResponse(rawText: string): { title: string; text: string } {
+  if (rawText.indexOf("PAYWALL_DETECTED") !== -1) {
+    throw new Error("This article is behind a paywall or anti-bot protection and cannot be accessed directly.");
+  }
+
+  // Returns the trimmed first capture group of `pattern`, or "" when it does not match.
+  const capture = (pattern: RegExp): string => {
+    const found = pattern.exec(rawText);
+    return found ? found[1].trim() : "";
+  };
+
+  const title = capture(/===TITLE_START===([\s\S]*?)===TITLE_END===/);
+  const text = capture(/===TEXT_START===([\s\S]*?)===TEXT_END===/);
+
+  // Fallback for malformed AI responses: no delimiters were used, but the
+  // reply is long enough to plausibly be the article itself.
+  const delimitersMissing = !rawText.includes("===TEXT_START===");
+  if (text === "" && delimitersMissing && rawText.length > 200) {
+    return { title: "Extracted Content", text: rawText };
+  }
+
+  if (text.length < 50) {
+    throw new Error("Could not extract article text. The page structure might be too complex or empty.");
+  }
+
+  return { title, text };
+}
+
+/**
+ * Main Extraction Function
+ *
+ * Downloads the page through the proxy rotation (fetchRawHtml) and then has
+ * Gemini pull out the title and body (parseHtmlWithGemini).
+ *
+ * @param url Article URL as entered by the user.
+ * @returns The extracted `{ title, text }`.
+ * @throws Error carrying the underlying failure's message, or
+ *         "Failed to access article directly." when no message is available.
+ */
+export const extractArticleContent = async (url: string): Promise<{ title: string; text: string }> => {
+  console.log("Attempting to extract:", url);
+
+  try {
+    // 1. Fetch Raw HTML via Proxy
+    const html = await fetchRawHtml(url);
+
+    // 2. Parse with Gemini
+    console.log("HTML fetched (" + html.length + " chars). Parsing...");
+    return await parseHtmlWithGemini(html, url);
+  } catch (error: unknown) {
+    console.error("Extraction failed:", error);
+    // We intentionally DO NOT fall back to Google Search here, as per user request.
+    // We want to fail if we can't get the direct content.
+
+    // Anything can be thrown in JavaScript, so narrow before reading `.message`;
+    // an unguarded access would itself throw on `null`/`undefined`.
+    let message = '';
+    if (typeof error === 'string') {
+      message = error;
+    } else if (typeof error === 'object' && error !== null && 'message' in error) {
+      const candidate = (error as { message: unknown }).message;
+      if (typeof candidate === 'string') {
+        message = candidate;
+      }
+    }
+    throw new Error(message || "Failed to access article directly.");
+  }
+};
+
+/**
+ * Generates speech audio from text.
+ *
+ * Sends the text to the `gemini-2.5-flash-preview-tts` model, requesting an
+ * audio-only response spoken with the given prebuilt voice.
+ *
+ * @param text  The text to be spoken.
+ * @param voice Name of the prebuilt voice to use.
+ * @returns The `inlineData.data` string of the first part of the first
+ *          candidate (presumably base64-encoded audio, going by the original
+ *          variable name; confirm against the SDK).
+ * @throws Error("No audio data received from model") when that field is absent or empty.
+ */
+export const generateSpeechFromText = async (text: string, voice: VoiceName): Promise<string> => {
+  const client = getAiClient();
+
+  const result = await client.models.generateContent({
+    model: 'gemini-2.5-flash-preview-tts',
+    contents: { parts: [{ text }] },
+    config: {
+      responseModalities: [Modality.AUDIO],
+      speechConfig: {
+        voiceConfig: { prebuiltVoiceConfig: { voiceName: voice } }
+      }
+    }
+  });
+
+  // Only the first part of the first candidate is inspected.
+  const firstPart = result.candidates?.[0]?.content?.parts?.[0];
+  const audioPayload = firstPart?.inlineData?.data;
+  if (audioPayload) {
+    return audioPayload;
+  }
+  throw new Error("No audio data received from model");
+};