AI Newsletter Digest improvements: fixed QP soft line break decoding, URL extraction, and content cleaning
This commit is contained in:
Binary file not shown.
Binary file not shown.
180
automations/openclaw-digest/sources/news_fetcher.py
Normal file
180
automations/openclaw-digest/sources/news_fetcher.py
Normal file
@@ -0,0 +1,180 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
News Aggregation Fetcher for OpenClaw Daily Digest
|
||||
Fetches from GitHub releases, Hacker News, and tech news sources
|
||||
"""
|
||||
|
||||
import requests
|
||||
import json
|
||||
import feedparser
|
||||
from datetime import datetime, timedelta
|
||||
from typing import List, Dict, Any
|
||||
|
||||
# News sources configuration
|
||||
SOURCES = {
    # Atom feed listing releases of the openclaw/openclaw repository.
    "github_releases": {
        "url": "https://github.com/openclaw/openclaw/releases.atom",
        "type": "rss",
    },
    # Hacker News search endpoint (Algolia) and the search term sent to it.
    "hn_search": {
        "url": "https://hn.algolia.com/api/v1/search",
        "type": "hackernews",
        "query": "openclaw",
    },
}
|
||||
|
||||
def fetch_github_releases(hours: int = 24) -> List[Dict[str, Any]]:
    """Fetch recent OpenClaw releases from the GitHub Atom feed.

    Only the first five feed entries are inspected. An entry is kept when its
    publication time (or, failing that, its update time) falls inside the
    look-back window.

    Args:
        hours: Size of the look-back window in hours. Defaults to 24, which
            matches the previously hard-coded window.

    Returns:
        A list of release dicts with the keys ``id``, ``title``, ``url``,
        ``published`` (naive UTC ISO-8601 string), ``summary`` (truncated to
        300 characters plus ``"..."`` when longer), ``source``,
        ``source_icon`` and ``category``. An empty list is returned when the
        feed cannot be fetched or processed; the error is printed.
    """
    # Local import: the module header only imports datetime and timedelta.
    from datetime import timezone

    try:
        feed = feedparser.parse(SOURCES["github_releases"]["url"])
        releases = []

        # All comparisons are done on naive UTC datetimes.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        cutoff = now_utc - timedelta(hours=hours)

        for entry in feed.entries[:5]:  # Last 5 releases
            try:
                # Prefer feedparser's pre-parsed struct_time values.
                # NOTE(review): feedparser documents these as UTC - confirm.
                if hasattr(entry, 'published_parsed') and entry.published_parsed:
                    published = datetime(*entry.published_parsed[:6])
                elif hasattr(entry, 'updated_parsed') and entry.updated_parsed:
                    published = datetime(*entry.updated_parsed[:6])
                else:
                    # Fall back to the raw ISO-8601 string. A trailing "Z" is
                    # rewritten because older fromisoformat() rejects it.
                    date_str = entry.get('published', entry.get('updated', ''))
                    published = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
                    if published.tzinfo is not None:
                        # Normalise any offset to naive UTC so the comparison
                        # with the naive cutoff cannot raise TypeError.
                        published = published.astimezone(timezone.utc).replace(tzinfo=None)

                if published >= cutoff:
                    summary = entry.get("summary", "")
                    if len(summary) > 300:
                        summary = summary[:300] + "..."

                    releases.append({
                        "id": entry.id,
                        "title": entry.title,
                        "url": entry.link,
                        "published": published.isoformat(),
                        "summary": summary,
                        "source": "GitHub",
                        "source_icon": "🐙",
                        "category": "Release"
                    })
            except Exception as e:
                # Best effort: one malformed entry must not abort the feed.
                print(f" Skipping entry due to date parse error: {e}")
                continue

        return releases
    except Exception as e:
        print(f"Error fetching GitHub releases: {e}")
        return []
|
||||
|
||||
def fetch_hackernews(hours: int = 24) -> List[Dict[str, Any]]:
    """Fetch OpenClaw-related stories from the Hacker News search API.

    Sends the configured query to the endpoint in ``SOURCES["hn_search"]``,
    restricted to stories created inside the look-back window, and converts
    the first ten hits into digest items.

    Args:
        hours: Size of the look-back window in hours. Defaults to 24, which
            matches the previously hard-coded window.

    Returns:
        A list of story dicts with the keys ``id``, ``title``, ``url`` (the
        linked page, or the HN item page when the hit has no URL),
        ``hn_url``, ``published`` (naive UTC ISO-8601 string), ``author``,
        ``points``, ``num_comments``, ``summary``, ``source``,
        ``source_icon`` and ``category``. An empty list is returned when the
        request or parsing fails; the error is printed.
    """
    # Local import: the module header only imports datetime and timedelta.
    from datetime import timezone

    try:
        # Use an aware "now": calling .timestamp() on a naive UTC datetime
        # interprets it as local time and shifts the cutoff by the host's
        # UTC offset.
        now_utc = datetime.now(timezone.utc)
        cutoff_epoch = int((now_utc - timedelta(hours=hours)).timestamp())

        params = {
            "query": SOURCES["hn_search"]["query"],
            "tags": "story",
            "numericFilters": "created_at_i>" + str(cutoff_epoch)
        }

        response = requests.get(SOURCES["hn_search"]["url"], params=params, timeout=30)
        response.raise_for_status()
        data = response.json()

        stories = []
        for hit in data.get("hits", [])[:10]:  # First 10 hits
            story_id = hit.get("objectID")
            hn_url = f"https://news.ycombinator.com/item?id={story_id}"

            # Only mark the summary as truncated when text was cut off.
            story_text = hit.get("story_text") or ""
            summary = story_text[:200] + "..." if len(story_text) > 200 else story_text

            # Emit naive UTC, like the other fetchers in this module, so the
            # string sort on "published" in fetch_news_content is meaningful.
            created = datetime.fromtimestamp(hit.get("created_at_i") or 0, tz=timezone.utc)

            stories.append({
                "id": story_id,
                "title": hit.get("title"),
                "url": hit.get("url") or hn_url,
                "hn_url": hn_url,
                "published": created.replace(tzinfo=None).isoformat(),
                "author": hit.get("author"),
                "points": hit.get("points", 0),
                "num_comments": hit.get("num_comments", 0),
                "summary": summary,
                "source": "Hacker News",
                "source_icon": "🟠",
                "category": "Discussion"
            })

        return stories
    except Exception as e:
        print(f"Error fetching Hacker News: {e}")
        return []
|
||||
|
||||
def fetch_google_news(hours: int = 24) -> List[Dict[str, Any]]:
    """Fetch OpenClaw news items from the Google News RSS search feed.

    Only the first ten feed entries are inspected. An entry is kept when its
    parsed publication time falls inside the look-back window.

    Args:
        hours: Size of the look-back window in hours. Defaults to 24, which
            matches the previously hard-coded window.

    Returns:
        A list of article dicts with the keys ``id``, ``title``, ``url``,
        ``published`` (naive ISO-8601 string), ``source`` (the entry's source
        title, or ``"Google News"`` when absent), ``source_icon`` and
        ``category``. An empty list is returned when the feed cannot be
        fetched or processed; the error is printed.
    """
    # Local import: the module header only imports datetime and timedelta.
    from datetime import timezone

    try:
        # Google News RSS for OpenClaw
        url = "https://news.google.com/rss/search?q=OpenClaw+AI+agent&hl=en-US&gl=US&ceid=US:en"
        feed = feedparser.parse(url)

        news = []
        # Naive UTC cutoff, compared with the parsed struct_time below.
        # NOTE(review): feedparser documents published_parsed as UTC - confirm.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        cutoff = now_utc - timedelta(hours=hours)

        for entry in feed.entries[:10]:
            try:
                published = datetime(*entry.published_parsed[:6])
                if published >= cutoff:
                    news.append({
                        "id": entry.id,
                        "title": entry.title,
                        "url": entry.link,
                        "published": published.isoformat(),
                        "source": entry.get("source", {}).get("title", "Google News"),
                        "source_icon": "📰",
                        "category": "News"
                    })
            except Exception as e:
                # Best effort: skip a malformed entry, but say why. Unlike a
                # bare except, this lets KeyboardInterrupt/SystemExit through.
                print(f" Skipping entry due to parse error: {e}")
                continue

        return news
    except Exception as e:
        print(f"Error fetching Google News: {e}")
        return []
|
||||
|
||||
def fetch_news_content(hours: int = 24) -> Dict[str, Any]:
    """Collect items from every news source and merge them into one payload.

    Runs the GitHub, Hacker News and Google News fetchers in turn, printing
    progress for each, then merges their results newest-first by the
    ``published`` string.

    Args:
        hours: Window size echoed in the progress line and in the returned
            ``time_window_hours`` field. It is not passed to the individual
            fetchers here; they are called without arguments.

    Returns:
        A dict with ``source``, ``fetched_at``, ``time_window_hours``,
        ``total_items`` (count before truncation), the three per-source
        lists, and ``all_items`` holding at most the first 15 merged items.
    """
    print(f"🔍 Fetching news from last {hours} hours...")

    # Query each source in turn, reporting how many items came back.
    print(" 📡 GitHub releases...")
    release_items = fetch_github_releases()
    print(f" Found {len(release_items)} releases")

    print(" 📡 Hacker News...")
    story_items = fetch_hackernews()
    print(f" Found {len(story_items)} stories")

    print(" 📡 Google News...")
    article_items = fetch_google_news()
    print(f" Found {len(article_items)} articles")

    # Merge, then order newest-first by the ISO "published" string.
    merged = sorted(
        release_items + story_items + article_items,
        key=lambda item: item.get("published", ""),
        reverse=True,
    )

    payload: Dict[str, Any] = {}
    payload["source"] = "news"
    payload["fetched_at"] = datetime.utcnow().isoformat()
    payload["time_window_hours"] = hours
    payload["total_items"] = len(merged)
    payload["github_releases"] = release_items
    payload["hackernews"] = story_items
    payload["google_news"] = article_items
    payload["all_items"] = merged[:15]  # Top 15 overall
    return payload
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Usage: news_fetcher.py [hours] [output_file]
    cli_args = sys.argv[1:]

    hours = 24
    if cli_args:
        hours = int(cli_args[0])

    output_file = "/home/openclaw/.openclaw/workspace/automations/openclaw-digest/output/news.json"
    if len(cli_args) > 1:
        output_file = cli_args[1]

    content = fetch_news_content(hours=hours)

    # Write the payload as indented JSON.
    with open(output_file, "w") as f:
        json.dump(content, f, indent=2)

    # Print a per-source summary of what was written.
    release_count = len(content['github_releases'])
    story_count = len(content['hackernews'])
    article_count = len(content['google_news'])

    print(f"\n✅ News content saved to {output_file}")
    print(f" Total items: {content['total_items']}")
    print(f" GitHub releases: {release_count}")
    print(f" Hacker News: {story_count}")
    print(f" Google News: {article_count}")
|
||||
147
automations/openclaw-digest/sources/reddit_fetcher.py
Normal file
147
automations/openclaw-digest/sources/reddit_fetcher.py
Normal file
@@ -0,0 +1,147 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Reddit Content Fetcher for OpenClaw Daily Digest
|
||||
Fetches posts from OpenClaw-related subreddits using Reddit's JSON API
|
||||
No authentication required for read-only public access
|
||||
"""
|
||||
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from typing import List, Dict, Any
|
||||
|
||||
# Reddit API endpoints (JSON API - no auth needed for read-only)
|
||||
# Subreddits polled for the digest, in polling order.
SUBREDDITS = [
    "openclaw",
    "LocalLLaMA",
    "vibecoding",
    "selfhosted",
    "homeautomation",
]

# Listing endpoint template; fill in {subreddit} with str.format().
REDDIT_JSON_URL = "https://www.reddit.com/r/{subreddit}.json"
|
||||
|
||||
def fetch_subreddit(subreddit: str, limit: int = 25) -> List[Dict[str, Any]]:
    """Download one subreddit listing and flatten it into plain post dicts.

    Args:
        subreddit: Subreddit name substituted into ``REDDIT_JSON_URL``.
        limit: Value sent as the ``limit`` query parameter.

    Returns:
        One dict per listing child with the keys ``id``, ``title``,
        ``author``, ``subreddit``, ``score``, ``num_comments``,
        ``created_utc``, ``url``, ``selftext`` (cut to 500 characters plus
        ``"..."`` when longer), ``is_self``, ``link_flair_text`` and
        ``upvote_ratio``. An empty list is returned when the request or
        parsing fails; the error is printed.
    """
    request_headers = {"User-Agent": "OpenClaw-Digest-Bot/1.0 (by /u/krillyclaw)"}
    endpoint = REDDIT_JSON_URL.format(subreddit=subreddit)

    try:
        reply = requests.get(endpoint, headers=request_headers, params={"limit": limit}, timeout=30)
        reply.raise_for_status()
        listing = reply.json()

        collected = []
        for wrapper in listing.get("data", {}).get("children", []):
            item = wrapper.get("data", {})

            # Keep long self-posts short in the digest payload.
            body = item.get("selftext", "")
            if len(body) > 500:
                body = body[:500] + "..."

            record = {
                "id": item.get("id"),
                "title": item.get("title"),
                "author": item.get("author"),
                "subreddit": item.get("subreddit"),
                "score": item.get("score", 0),
                "num_comments": item.get("num_comments", 0),
                "created_utc": item.get("created_utc", 0),
                "url": f"https://reddit.com{item.get('permalink', '')}",
                "selftext": body,
                "is_self": item.get("is_self", False),
                "link_flair_text": item.get("link_flair_text", ""),
                "upvote_ratio": item.get("upvote_ratio", 0),
            }
            collected.append(record)
        return collected
    except Exception as e:
        print(f"Error fetching r/{subreddit}: {e}")
        return []
|
||||
|
||||
def filter_by_time(posts: List[Dict], hours: int = 24) -> List[Dict]:
    """Keep only the posts created within the last ``hours`` hours.

    Each kept post is mutated in place: a ``created_datetime`` key is added
    holding the creation time formatted as ``"YYYY-MM-DD HH:MM UTC"``.

    Args:
        posts: Post dicts carrying a ``created_utc`` epoch timestamp. A
            missing or ``None`` timestamp is treated as 0, so the post is
            dropped.
        hours: Size of the look-back window in hours.

    Returns:
        A new list with the matching post dicts (same objects), in their
        original order.
    """
    # time.time() is already an epoch value. Calling .timestamp() on a naive
    # utcnow() would treat it as local time and shift the cutoff by the
    # host's UTC offset.
    cutoff_timestamp = time.time() - hours * 3600

    filtered = []
    for post in posts:
        created = post.get("created_utc") or 0
        if created >= cutoff_timestamp:
            # gmtime() converts the epoch value to UTC fields.
            post["created_datetime"] = time.strftime("%Y-%m-%d %H:%M UTC", time.gmtime(created))
            filtered.append(post)
    return filtered
|
||||
|
||||
def filter_openclaw_related(posts: List[Dict]) -> List[Dict]:
    """Return the posts whose title or selftext mentions OpenClaw.

    Matching is a case-insensitive substring search for any of the known
    project names in the title and selftext joined by a space.

    Args:
        posts: Post dicts; ``title`` and ``selftext`` default to ``""`` when
            the key is absent.

    Returns:
        A new list with the matching post dicts, in their original order.
    """
    needles = ["openclaw", "clawdbot", "open claw", "clawd"]

    def mentions_openclaw(post: Dict) -> bool:
        haystack = f"{post.get('title', '')} {post.get('selftext', '')}".lower()
        for needle in needles:
            if needle in haystack:
                return True
        return False

    return [post for post in posts if mentions_openclaw(post)]
|
||||
|
||||
def score_post(post: Dict) -> float:
    """Compute an engagement score used to rank a post.

    The score is ``score * 0.3 + num_comments * 2 + upvote_ratio * 50``, so
    one comment weighs more than one upvote and the upvote ratio adds up to
    50 points.

    Args:
        post: Post dict; ``score`` and ``num_comments`` default to 0 and
            ``upvote_ratio`` defaults to 0.5 when the key is absent.

    Returns:
        The weighted score.
    """
    upvotes = post.get("score", 0)
    comment_count = post.get("num_comments", 0)
    ratio = post.get("upvote_ratio", 0.5)

    # Each factor gets its own term; they are summed left to right.
    upvote_part = upvotes * 0.3
    comment_part = comment_count * 2
    ratio_part = ratio * 50

    return upvote_part + comment_part + ratio_part
|
||||
|
||||
def fetch_reddit_content(hours: int = 24, limit_per_sub: int = 25) -> Dict[str, Any]:
    """Gather recent OpenClaw-related posts from every configured subreddit.

    Each subreddit in ``SUBREDDITS`` is fetched and narrowed to the time
    window. Every subreddit except ``openclaw`` is also narrowed to posts
    that mention OpenClaw. The combined posts are ranked by ``score_post``,
    highest first.

    Args:
        hours: Size of the look-back window in hours.
        limit_per_sub: Listing size requested from each subreddit.

    Returns:
        A dict with ``source``, ``fetched_at``, ``time_window_hours``,
        ``total_posts`` (count before truncation), the top five posts from
        r/openclaw, the top five from other subreddits, and the top ten
        overall under ``all_posts``.
    """
    print(f"🔍 Fetching Reddit posts from last {hours} hours...")

    gathered = []
    for subreddit in SUBREDDITS:
        print(f" 📡 r/{subreddit}...")

        # Fetch, then narrow to the requested time window.
        recent = filter_by_time(fetch_subreddit(subreddit, limit=limit_per_sub), hours)

        # Only the project's own subreddit is taken unfiltered.
        is_home_sub = subreddit.lower() == "openclaw"
        if not is_home_sub:
            recent = filter_openclaw_related(recent)

        print(f" Found {len(recent)} recent OpenClaw-related posts")
        gathered += recent

        # Pause between requests to go easy on Reddit.
        time.sleep(0.5)

    # Highest engagement first.
    ranked = sorted(gathered, key=score_post, reverse=True)

    # Split the ranked posts by origin, keeping the ranking order.
    from_home = []
    from_elsewhere = []
    for post in ranked:
        if post["subreddit"].lower() == "openclaw":
            from_home.append(post)
        else:
            from_elsewhere.append(post)

    return {
        "source": "reddit",
        "fetched_at": datetime.utcnow().isoformat(),
        "time_window_hours": hours,
        "total_posts": len(ranked),
        "openclaw_subreddit": from_home[:5],  # Top 5 from r/OpenClaw
        "other_subreddits": from_elsewhere[:5],  # Top 5 from elsewhere
        "all_posts": ranked[:10],  # Top 10 overall
    }
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Usage: reddit_fetcher.py [hours] [output_file]
    cli_args = sys.argv[1:]

    hours = 24
    if cli_args:
        hours = int(cli_args[0])

    output_file = "/home/openclaw/.openclaw/workspace/automations/openclaw-digest/output/reddit.json"
    if len(cli_args) > 1:
        output_file = cli_args[1]

    content = fetch_reddit_content(hours=hours)

    # Write the payload as indented JSON.
    with open(output_file, "w") as f:
        json.dump(content, f, indent=2)

    # Print a summary of what was written.
    home_count = len(content['openclaw_subreddit'])
    other_count = len(content['other_subreddits'])

    print(f"\n✅ Reddit content saved to {output_file}")
    print(f" Total posts: {content['total_posts']}")
    print(f" From r/OpenClaw: {home_count}")
    print(f" From other subs: {other_count}")
|
||||
Reference in New Issue
Block a user