Files
openclaw-backups/automations/openclaw-digest/sources/news_fetcher.py

181 lines
6.7 KiB
Python

#!/usr/bin/env python3
"""
News Aggregation Fetcher for OpenClaw Daily Digest
Fetches from GitHub releases, Hacker News, and tech news sources
"""
import json
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List

import feedparser
import requests
# Endpoints polled for the digest, keyed by source name. Each entry carries
# the URL to request and a "type" tag; the Hacker News entry also carries the
# search query sent to the Algolia API.
SOURCES = dict(
    github_releases=dict(
        url="https://github.com/openclaw/openclaw/releases.atom",
        type="rss",
    ),
    hn_search=dict(
        url="https://hn.algolia.com/api/v1/search",
        type="hackernews",
        query="openclaw",
    ),
)
def _utc_now() -> datetime:
    """Return the current UTC time as a naive datetime.

    The tzinfo is dropped so that emitted ISO strings keep the offset-free
    format this module has always written.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


def _truncate(text: str, limit: int) -> str:
    """Return *text* cut to *limit* characters, adding "..." only if it was cut."""
    text = text or ""
    return text[:limit] + "..." if len(text) > limit else text


def _entry_published(entry: Any) -> datetime:
    """Return a feed entry's publication time as a naive UTC datetime.

    Prefers the pre-parsed ``published_parsed`` / ``updated_parsed`` time
    tuples (feedparser documents these as UTC) and falls back to parsing the
    raw ISO-8601 ``published`` / ``updated`` string.

    Raises:
        ValueError: if the entry has no parseable date.
    """
    for attr in ("published_parsed", "updated_parsed"):
        parsed = getattr(entry, attr, None)
        if parsed:
            return datetime(*parsed[:6])

    date_str = entry.get("published", entry.get("updated", ""))
    # Older fromisoformat() implementations reject a trailing "Z".
    published = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    if published.tzinfo is not None:
        # Normalise any offset to UTC; an aware value cannot be compared with
        # the naive cutoff used by the callers.
        published = published.astimezone(timezone.utc).replace(tzinfo=None)
    return published


def fetch_github_releases(hours: int = 24) -> List[Dict[str, Any]]:
    """Fetch recent OpenClaw releases from the GitHub Atom feed.

    Args:
        hours: Size of the look-back window; releases older than this are
            dropped.

    Returns:
        One dict per release among the first five feed entries that falls
        inside the window. Returns an empty list if the fetch fails.
    """
    try:
        feed = feedparser.parse(SOURCES["github_releases"]["url"])
        cutoff = _utc_now() - timedelta(hours=hours)
        releases = []
        for entry in feed.entries[:5]:  # Last 5 releases
            try:
                published = _entry_published(entry)
                if published < cutoff:
                    continue
                releases.append({
                    "id": entry.id,
                    "title": entry.title,
                    "url": entry.link,
                    "published": published.isoformat(),
                    "summary": _truncate(entry.get("summary", ""), 300),
                    "source": "GitHub",
                    "source_icon": "🐙",
                    "category": "Release"
                })
            except Exception as e:
                # Best effort: one malformed entry must not drop the rest.
                print(f" Skipping entry due to date parse error: {e}")
                continue
        return releases
    except Exception as e:
        print(f"Error fetching GitHub releases: {e}")
        return []


def fetch_hackernews(hours: int = 24) -> List[Dict[str, Any]]:
    """Fetch OpenClaw-related stories from Hacker News via the Algolia API.

    Args:
        hours: Size of the look-back window passed to the API as a
            ``created_at_i`` filter.

    Returns:
        Up to ten story dicts. Returns an empty list if the request fails.
    """
    try:
        # The epoch must come from an aware UTC datetime: calling timestamp()
        # on a naive utcnow() value treats it as local time and shifts the
        # cutoff by the host's UTC offset.
        since = int((datetime.now(timezone.utc) - timedelta(hours=hours)).timestamp())
        params = {
            "query": SOURCES["hn_search"]["query"],
            "tags": "story",
            "numericFilters": "created_at_i>" + str(since)
        }
        response = requests.get(SOURCES["hn_search"]["url"], params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        stories = []
        for hit in data.get("hits", [])[:10]:  # Top 10 stories
            hn_url = f"https://news.ycombinator.com/item?id={hit.get('objectID')}"
            # Render in UTC so the timestamp sorts consistently with the
            # feed-based sources.
            created = datetime.fromtimestamp(
                hit.get("created_at_i") or 0, tz=timezone.utc
            ).replace(tzinfo=None)
            stories.append({
                "id": hit.get("objectID"),
                "title": hit.get("title"),
                # Text-only posts have no external URL; link to the thread.
                "url": hit.get("url") or hn_url,
                "hn_url": hn_url,
                "published": created.isoformat(),
                "author": hit.get("author"),
                "points": hit.get("points", 0),
                "num_comments": hit.get("num_comments", 0),
                "summary": _truncate(hit.get("story_text") or "", 200),
                "source": "Hacker News",
                "source_icon": "🟠",
                "category": "Discussion"
            })
        return stories
    except Exception as e:
        print(f"Error fetching Hacker News: {e}")
        return []


def fetch_google_news(hours: int = 24) -> List[Dict[str, Any]]:
    """Fetch OpenClaw news from the Google News RSS search feed.

    Args:
        hours: Size of the look-back window; articles older than this are
            dropped.

    Returns:
        One dict per article among the first ten feed entries that falls
        inside the window. Returns an empty list if the fetch fails.
    """
    try:
        # Google News RSS for OpenClaw
        url = "https://news.google.com/rss/search?q=OpenClaw+AI+agent&hl=en-US&gl=US&ceid=US:en"
        feed = feedparser.parse(url)
        cutoff = _utc_now() - timedelta(hours=hours)
        news = []
        for entry in feed.entries[:10]:
            try:
                published = _entry_published(entry)
                if published < cutoff:
                    continue
                news.append({
                    "id": entry.id,
                    "title": entry.title,
                    "url": entry.link,
                    "published": published.isoformat(),
                    "source": entry.get("source", {}).get("title", "Google News"),
                    "source_icon": "📰",
                    "category": "News"
                })
            except Exception as e:
                # Report the skip instead of hiding it; catching Exception
                # (not a bare except) lets Ctrl-C and SystemExit propagate.
                print(f" Skipping entry due to date parse error: {e}")
                continue
        return news
    except Exception as e:
        print(f"Error fetching Google News: {e}")
        return []


def fetch_news_content(hours: int = 24) -> Dict[str, Any]:
    """Fetch every news source and merge the results into one payload.

    Args:
        hours: Look-back window applied to every source.

    Returns:
        A dict holding the per-source lists, the fetch time, the window size,
        the total item count, and ``all_items``: the merged list sorted
        newest-first and capped at 15 entries.
    """
    print(f"🔍 Fetching news from last {hours} hours...")
    # Fetch from all sources
    print(" 📡 GitHub releases...")
    github = fetch_github_releases(hours)
    print(f" Found {len(github)} releases")
    print(" 📡 Hacker News...")
    hn = fetch_hackernews(hours)
    print(f" Found {len(hn)} stories")
    print(" 📡 Google News...")
    gnews = fetch_google_news(hours)
    print(f" Found {len(gnews)} articles")
    # Every source emits "published" as a naive-UTC ISO string, so a plain
    # string sort orders the merged list chronologically.
    all_items = github + hn + gnews
    all_items.sort(key=lambda x: x.get("published", ""), reverse=True)
    return {
        "source": "news",
        "fetched_at": _utc_now().isoformat(),
        "time_window_hours": hours,
        # Counts everything fetched, not just the capped list below.
        "total_items": len(all_items),
        "github_releases": github,
        "hackernews": hn,
        "google_news": gnews,
        "all_items": all_items[:15]  # Top 15 overall
    }
if __name__ == "__main__":
    # Usage: news_fetcher.py [hours] [output_file]
    import sys
    from pathlib import Path

    hours = int(sys.argv[1]) if len(sys.argv) > 1 else 24
    output_file = sys.argv[2] if len(sys.argv) > 2 else "/home/openclaw/.openclaw/workspace/automations/openclaw-digest/output/news.json"
    content = fetch_news_content(hours=hours)
    # Create the destination directory if it is missing, so a first run does
    # not fail inside open() with FileNotFoundError.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(content, f, indent=2)
    print(f"\n✅ News content saved to {output_file}")
    print(f" Total items: {content['total_items']}")
    print(f" GitHub releases: {len(content['github_releases'])}")
    print(f" Hacker News: {len(content['hackernews'])}")
    print(f" Google News: {len(content['google_news'])}")