mirror of
https://github.com/Tony0410/readlater.git
synced 2026-05-24 22:01:41 +08:00
Initial commit: ReadLater v1.0
- Save articles via URL or bookmarklet
- Clean dark reader with customizable fonts/sizing
- Text-to-speech with browser + Kokoro support
- Speed control up to 3x
- Favorites and archive
- SQLite database with Drizzle ORM
- Docker deployment ready

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
62
src/lib/utils/extract.ts
Normal file
62
src/lib/utils/extract.ts
Normal file
@@ -0,0 +1,62 @@
|
||||
import { Readability } from "@mozilla/readability";
|
||||
import { JSDOM } from "jsdom";
|
||||
|
||||
/**
 * Reader-friendly representation of a web page, as returned by
 * `extractArticle`.
 */
export interface ExtractedArticle {
  /** Article title; `extractArticle` substitutes "Untitled" when none is found. */
  title: string;
  /** Byline reported by the extractor, or `null` when there is none. */
  author: string | null;
  /**
   * Site name reported by the extractor; `extractArticle` falls back to the
   * hostname of the requested URL when the page does not provide one.
   */
  siteName: string | null;
  /** Short summary reported by the extractor, or `null` when there is none. */
  excerpt: string | null;
  /** Extracted article body as an HTML string (empty string when unavailable). */
  content: string;
  /** Extracted article body as plain text (empty string when unavailable). */
  textContent: string;
  /** Image URL taken from the page's `og:image` meta tag, or `null` when none is found. */
  leadImage: string | null;
  /** Number of whitespace-separated words in `textContent`. */
  wordCount: number;
}
|
||||
|
||||
/**
 * Parses `rawUrl` and checks that it uses a web scheme.
 *
 * @throws {Error} When `rawUrl` is not an absolute URL or is not http(s).
 */
function parseHttpUrl(rawUrl: string): URL {
  let parsed: URL;
  try {
    parsed = new URL(rawUrl);
  } catch {
    throw new Error(`Invalid URL: ${rawUrl}`);
  }

  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
    throw new Error(`Unsupported URL protocol: ${parsed.protocol}`);
  }
  return parsed;
}

/**
 * Reads the `og:image` meta tag from `document` and resolves it against
 * `baseUrl`, so that relative image paths become absolute URLs.
 *
 * @returns The image URL, or `null` when the tag is missing or empty.
 */
function findLeadImage(document: Document, baseUrl: string): string | null {
  const raw = document
    .querySelector('meta[property="og:image"]')
    ?.getAttribute("content")
    ?.trim();

  if (!raw) {
    return null;
  }

  try {
    return new URL(raw, baseUrl).href;
  } catch {
    // Unparseable value: hand back what the page declared rather than drop it.
    return raw;
  }
}

/**
 * Downloads the page at `url` and extracts its readable article content.
 *
 * @param url Absolute http(s) URL of the page to extract.
 * @returns The extracted article together with its metadata and word count.
 * @throws {Error} When `url` is invalid or not http(s), when the server
 *   responds with a non-2xx status, or when no article content can be
 *   extracted from the page.
 */
export async function extractArticle(url: string): Promise<ExtractedArticle> {
  // Validate before any network access; the URL comes from the caller.
  const pageUrl = parseHttpUrl(url);

  const response = await fetch(pageUrl.href, {
    headers: {
      "User-Agent": "Mozilla/5.0 (compatible; ReadLater/1.0)",
    },
  });

  if (!response.ok) {
    throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
  }

  const html = await response.text();

  // After a redirect, relative links in the page belong to the final
  // location, so prefer it as the document base when it is available.
  const documentUrl = response.url || pageUrl.href;
  const dom = new JSDOM(html, { url: documentUrl });

  try {
    const document = dom.window.document;

    // Read metadata before parsing: Readability's parse() modifies the DOM
    // it is given.
    const leadImage = findLeadImage(document, documentUrl);

    const article = new Readability(document).parse();
    if (!article) {
      throw new Error("Could not extract article content");
    }

    // `||` (not `??`) is deliberate below: empty strings should also take
    // the fallback value.
    const textContent = article.textContent || "";
    const content = article.content || "";
    const wordCount = textContent.split(/\s+/).filter(Boolean).length;

    return {
      title: article.title || "Untitled",
      author: article.byline || null,
      siteName: article.siteName || pageUrl.hostname,
      excerpt: article.excerpt || null,
      content,
      textContent,
      leadImage,
      wordCount,
    };
  } finally {
    // Everything returned above is plain strings/numbers, so the jsdom
    // window can be shut down as soon as extraction finishes.
    dom.window.close();
  }
}
|
||||
Reference in New Issue
Block a user