mirror of
https://github.com/Tony0410/News-reader-pro.git
synced 2026-05-24 21:31:44 +08:00
feat: Enhance article segment navigation
Implement segment selection in ReaderView for user-driven playback control. This change allows users to click a specific segment within an article to jump to that segment and play it directly. The Gemini service's HTML parsing has also been simplified by removing redundant selectors (layout-tag removal and class/id heuristics) and keeping only the removal of essential technical tags, for more efficient text extraction.
This commit is contained in:
@@ -49,52 +49,20 @@ function cleanAndMinifyHtml(rawHtml: string): string {
|
||||
const doc = parser.parseFromString(rawHtml, 'text/html');
|
||||
|
||||
// 1. Remove heavy technical tags
|
||||
// We remove these because they consume tokens and provide no semantic value for text extraction.
|
||||
const technicalTags = ['script', 'style', 'noscript', 'iframe', 'svg', 'link', 'meta', 'button', 'input', 'form', 'img', 'picture', 'video'];
|
||||
technicalTags.forEach(tag => {
|
||||
const elements = doc.querySelectorAll(tag);
|
||||
elements.forEach(el => el.remove());
|
||||
});
|
||||
|
||||
// 2. Remove semantic layout tags that are usually clutter
|
||||
const layoutTags = ['nav', 'footer', 'aside', 'header'];
|
||||
layoutTags.forEach(tag => {
|
||||
const elements = doc.querySelectorAll(tag);
|
||||
elements.forEach(el => el.remove());
|
||||
});
|
||||
// NOTE: We intentionally DO NOT remove semantic tags like <nav>, <footer>, or use class-based heuristics.
|
||||
// Previous versions tried to identify <article> or remove .ad-container, but this often caused
|
||||
// the "Content appears to be empty" error on sites with unique structures.
|
||||
// Gemini Flash has a large enough context window to ingest the entire <body> and intelligently extract the article.
|
||||
|
||||
// 3. Remove common ad/social/cookie containers by class/id heuristics
|
||||
const junkSelectors = [
|
||||
'[class*="ad-"]', '[id*="ad-"]',
|
||||
'[class*="cookie"]', '[id*="cookie"]',
|
||||
'[class*="newsletter"]', '[id*="newsletter"]',
|
||||
'[class*="social"]', '[class*="share"]',
|
||||
'[class*="comment"]', '[id*="comment"]',
|
||||
'[class*="recommended"]', '[class*="related"]'
|
||||
];
|
||||
|
||||
junkSelectors.forEach(selector => {
|
||||
try {
|
||||
const elements = doc.querySelectorAll(selector);
|
||||
elements.forEach(el => el.remove());
|
||||
} catch (e) {
|
||||
// Ignore invalid selector errors
|
||||
}
|
||||
});
|
||||
|
||||
// 4. Return the cleanest possible content
|
||||
// If there is a specific article tag, it's usually the best bet.
|
||||
const article = doc.querySelector('article');
|
||||
if (article && article.textContent && article.textContent.length > 200) {
|
||||
return article.innerHTML;
|
||||
}
|
||||
|
||||
const main = doc.querySelector('main');
|
||||
if (main && main.textContent && main.textContent.length > 200) {
|
||||
return main.innerHTML;
|
||||
}
|
||||
|
||||
// Fallback: Return the cleaned body
|
||||
return doc.body.innerHTML;
|
||||
// Return the body. Trust Gemini to find the needle in the haystack.
|
||||
return doc.body ? doc.body.innerHTML : rawHtml;
|
||||
} catch (e) {
|
||||
console.warn("HTML cleaning failed, using raw string", e);
|
||||
return rawHtml;
|
||||
@@ -115,7 +83,7 @@ async function fetchRawHtml(inputUrl: string): Promise<string> {
|
||||
console.log(`Fetching via proxy: ${proxyUrl}`);
|
||||
|
||||
const controller = new AbortController();
|
||||
const timeoutId = setTimeout(() => controller.abort(), 10000); // 10s timeout per proxy
|
||||
const timeoutId = setTimeout(() => controller.abort(), 15000); // 15s timeout per proxy
|
||||
|
||||
// We purposely do NOT add complex headers here.
|
||||
// Adding headers like 'X-Requested-With' often triggers a CORS Preflight (OPTIONS) request,
|
||||
|
||||
Reference in New Issue
Block a user