AI Newsletter Digest improvements: fixed QP soft line break decoding, URL extraction, and content cleaning

This commit is contained in:
Krilly
2026-03-04 13:29:22 +00:00
parent 29a98137a7
commit 57dd294675
13706 changed files with 2114953 additions and 237629 deletions

View File

@@ -0,0 +1,180 @@
#!/usr/bin/env python3
"""
News Aggregation Fetcher for OpenClaw Daily Digest
Fetches from GitHub releases, Hacker News, and tech news sources
"""
import requests
import json
import feedparser
from datetime import datetime, timedelta
from typing import List, Dict, Any
# Endpoints polled by the fetchers in this module. Every entry records the
# URL plus a "type" tag; the Hacker News entry also carries its search query.
SOURCES = dict(
    github_releases=dict(
        url="https://github.com/openclaw/openclaw/releases.atom",
        type="rss",
    ),
    hn_search=dict(
        url="https://hn.algolia.com/api/v1/search",
        type="hackernews",
        query="openclaw",
    ),
)
def _release_published_at(entry) -> datetime:
    """Return a feed entry's publication time as a naive UTC datetime.

    Raises ValueError when the entry carries no parseable date.
    """
    from datetime import timezone  # the module-level import omits `timezone`

    # Prefer the pre-parsed time tuples. They are assumed to be UTC, as the
    # comparison against a UTC cutoff requires -- confirm against feedparser.
    for field in ("published_parsed", "updated_parsed"):
        parsed = getattr(entry, field, None)
        if parsed:
            return datetime(*parsed[:6])

    # Fall back to the raw ISO-8601 string. Convert any offset to UTC instead
    # of stripping only "+00:00", which left other offsets timezone-aware and
    # made the later comparison with the naive cutoff raise TypeError.
    date_str = entry.get("published", entry.get("updated", ""))
    stamp = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    if stamp.tzinfo is not None:
        stamp = stamp.astimezone(timezone.utc).replace(tzinfo=None)
    return stamp


def fetch_github_releases(hours: int = 24) -> List[Dict[str, Any]]:
    """Fetch recent OpenClaw releases from the GitHub Atom feed.

    Args:
        hours: Size of the look-back window; entries published earlier than
            this many hours ago are dropped. Defaults to 24.

    Returns:
        A list of release dicts (id, title, url, published, summary, source,
        source_icon, category). Only the first five feed entries are
        considered. Returns an empty list on any fetch error.
    """
    from datetime import timezone  # the module-level import omits `timezone`

    try:
        feed = feedparser.parse(SOURCES["github_releases"]["url"])

        # feedparser flags fetch/parse problems via `bozo` rather than
        # raising, so surface them when nothing usable came back.
        if getattr(feed, "bozo", False) and not feed.entries:
            feed_error = getattr(feed, "bozo_exception", "unknown feed error")
            print(f"Error fetching GitHub releases: {feed_error}")
            return []

        # Naive UTC "now", matching the naive UTC datetimes of the entries.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        cutoff = now_utc - timedelta(hours=hours)

        releases = []
        for entry in feed.entries[:5]:  # Last 5 releases
            try:
                published = _release_published_at(entry)
                if published < cutoff:
                    continue

                summary = entry.get("summary", "")
                if len(summary) > 300:
                    summary = summary[:300] + "..."

                releases.append({
                    "id": entry.id,
                    "title": entry.title,
                    "url": entry.link,
                    "published": published.isoformat(),
                    "summary": summary,
                    "source": "GitHub",
                    "source_icon": "🐙",
                    "category": "Release",
                })
            except Exception as e:
                # Best effort: one malformed entry must not lose the rest.
                print(f" Skipping entry due to date parse error: {e}")
                continue
        return releases
    except Exception as e:
        print(f"Error fetching GitHub releases: {e}")
        return []
def fetch_hackernews(hours: int = 24) -> List[Dict[str, Any]]:
    """Fetch OpenClaw-related stories from Hacker News via the Algolia search API.

    Args:
        hours: Size of the look-back window in hours. Defaults to 24.

    Returns:
        A list of at most ten story dicts. "published" is a naive UTC
        ISO-8601 string, the same convention the feed-based fetchers in this
        module use. Returns an empty list on any request or decoding error.
    """
    from datetime import timezone  # the module-level import omits `timezone`

    try:
        # Build the cutoff from an aware datetime: calling .timestamp() on a
        # naive utcnow() value is interpreted as local time, which shifts the
        # window by the host's UTC offset.
        cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
        params = {
            "query": SOURCES["hn_search"]["query"],
            "tags": "story",
            "numericFilters": "created_at_i>" + str(int(cutoff.timestamp())),
        }
        response = requests.get(SOURCES["hn_search"]["url"], params=params, timeout=30)
        response.raise_for_status()
        data = response.json()

        stories = []
        for hit in data.get("hits", [])[:10]:  # Top 10 stories
            story_id = hit.get("objectID")
            hn_url = f"https://news.ycombinator.com/item?id={story_id}"

            # Only add the ellipsis when the text was actually cut.
            story_text = hit.get("story_text") or ""
            summary = story_text[:200] + "..." if len(story_text) > 200 else story_text

            # Render the epoch value as UTC, not local time, so it sorts
            # consistently with items from the other sources.
            created = datetime.fromtimestamp(hit.get("created_at_i") or 0, tz=timezone.utc)

            stories.append({
                "id": story_id,
                "title": hit.get("title"),
                # Text-only posts have no external URL; link to the thread.
                "url": hit.get("url") or hn_url,
                "hn_url": hn_url,
                "published": created.replace(tzinfo=None).isoformat(),
                "author": hit.get("author"),
                "points": hit.get("points") or 0,
                "num_comments": hit.get("num_comments") or 0,
                "summary": summary,
                "source": "Hacker News",
                "source_icon": "🟠",
                "category": "Discussion",
            })
        return stories
    except Exception as e:
        print(f"Error fetching Hacker News: {e}")
        return []
def fetch_google_news(hours: int = 24) -> List[Dict[str, Any]]:
    """Fetch OpenClaw news from the Google News RSS search feed.

    Args:
        hours: Size of the look-back window; entries published earlier than
            this many hours ago are dropped. Defaults to 24.

    Returns:
        A list of article dicts (id, title, url, published, source,
        source_icon, category). Only the first ten feed entries are
        considered. Returns an empty list on any fetch error.
    """
    from datetime import timezone  # the module-level import omits `timezone`

    try:
        # Google News RSS for OpenClaw
        url = "https://news.google.com/rss/search?q=OpenClaw+AI+agent&hl=en-US&gl=US&ceid=US:en"
        feed = feedparser.parse(url)

        # feedparser flags fetch/parse problems via `bozo` rather than
        # raising, so surface them when nothing usable came back.
        if getattr(feed, "bozo", False) and not feed.entries:
            feed_error = getattr(feed, "bozo_exception", "unknown feed error")
            print(f"Error fetching Google News: {feed_error}")
            return []

        # Naive UTC "now"; the parsed entry dates are compared as naive UTC.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        cutoff = now_utc - timedelta(hours=hours)

        news = []
        for entry in feed.entries[:10]:
            try:
                published = datetime(*entry.published_parsed[:6])
                if published < cutoff:
                    continue
                news.append({
                    "id": entry.id,
                    "title": entry.title,
                    "url": entry.link,
                    "published": published.isoformat(),
                    "source": entry.get("source", {}).get("title", "Google News"),
                    "source_icon": "📰",
                    "category": "News",
                })
            except Exception as e:
                # Best effort: skip the bad entry, but say so instead of
                # swallowing the error with a bare except.
                print(f" Skipping entry due to date parse error: {e}")
                continue
        return news
    except Exception as e:
        print(f"Error fetching Google News: {e}")
        return []
def fetch_news_content(hours: int = 24) -> Dict[str, Any]:
    """Collect items from every news source into one digest payload.

    Runs the GitHub, Hacker News and Google News fetchers in turn, printing
    progress as it goes, then merges the results newest-first.

    NOTE(review): `hours` is echoed in the log line and in the returned
    "time_window_hours" field, but it is not passed to the fetchers called
    here.

    Returns:
        A dict with the per-source lists, the total item count, and
        "all_items" holding the 15 most recent entries across all sources.
    """
    print(f"🔍 Fetching news from last {hours} hours...")

    # Poll each source, reporting how many items it produced.
    print(" 📡 GitHub releases...")
    github = fetch_github_releases()
    print(f" Found {len(github)} releases")

    print(" 📡 Hacker News...")
    hn = fetch_hackernews()
    print(f" Found {len(hn)} stories")

    print(" 📡 Google News...")
    gnews = fetch_google_news()
    print(f" Found {len(gnews)} articles")

    # Merge, then order newest-first by the ISO "published" string.
    merged = sorted(
        [*github, *hn, *gnews],
        key=lambda item: item.get("published", ""),
        reverse=True,
    )

    payload: Dict[str, Any] = {}
    payload["source"] = "news"
    payload["fetched_at"] = datetime.utcnow().isoformat()
    payload["time_window_hours"] = hours
    payload["total_items"] = len(merged)
    payload["github_releases"] = github
    payload["hackernews"] = hn
    payload["google_news"] = gnews
    payload["all_items"] = merged[:15]  # Top 15 overall
    return payload
if __name__ == "__main__":
    import sys

    # Usage: script [hours] [output_file]; both arguments are optional.
    cli_args = sys.argv[1:]
    hours = int(cli_args[0]) if len(cli_args) >= 1 else 24
    if len(cli_args) >= 2:
        output_file = cli_args[1]
    else:
        output_file = "/home/openclaw/.openclaw/workspace/automations/openclaw-digest/output/news.json"

    content = fetch_news_content(hours=hours)

    # Persist the digest payload as pretty-printed JSON.
    with open(output_file, "w") as f:
        json.dump(content, f, indent=2)

    # Short summary for whoever is watching the job log.
    print(f"\n✅ News content saved to {output_file}")
    print(f" Total items: {content['total_items']}")
    print(f" GitHub releases: {len(content['github_releases'])}")
    print(f" Hacker News: {len(content['hackernews'])}")
    print(f" Google News: {len(content['google_news'])}")

View File

@@ -0,0 +1,147 @@
#!/usr/bin/env python3
"""
Reddit Content Fetcher for OpenClaw Daily Digest
Fetches posts from OpenClaw-related subreddits using Reddit's JSON API
No authentication required for read-only public access
"""
import requests
import json
import time
from datetime import datetime, timedelta
from typing import List, Dict, Any
# Subreddits polled for the digest, in polling order.
SUBREDDITS = "openclaw LocalLLaMA vibecoding selfhosted homeautomation".split()

# Reddit's public JSON listing endpoint (read-only, requested without auth).
REDDIT_JSON_URL = "https://www.reddit.com/r/" + "{subreddit}" + ".json"
def fetch_subreddit(subreddit: str, limit: int = 25) -> List[Dict[str, Any]]:
    """Download one subreddit listing and flatten it into plain post dicts.

    Args:
        subreddit: Subreddit name, substituted into REDDIT_JSON_URL.
        limit: Value sent as the `limit` query parameter.

    Returns:
        One dict per listing child, with self-text capped at 500 characters
        (plus "..." when cut). Returns an empty list if the request or the
        decoding fails; the error is printed rather than raised.
    """
    endpoint = REDDIT_JSON_URL.format(subreddit=subreddit)
    request_headers = {"User-Agent": "OpenClaw-Digest-Bot/1.0 (by /u/krillyclaw)"}

    try:
        response = requests.get(
            endpoint,
            headers=request_headers,
            params={"limit": limit},
            timeout=30,
        )
        response.raise_for_status()
        listing = response.json()

        def flatten(post: Dict[str, Any]) -> Dict[str, Any]:
            # Cap long self-text so the digest stays compact.
            body = post.get("selftext", "")
            if len(body) > 500:
                body = body[:500] + "..."
            return {
                "id": post.get("id"),
                "title": post.get("title"),
                "author": post.get("author"),
                "subreddit": post.get("subreddit"),
                "score": post.get("score", 0),
                "num_comments": post.get("num_comments", 0),
                "created_utc": post.get("created_utc", 0),
                "url": f"https://reddit.com{post.get('permalink', '')}",
                "selftext": body,
                "is_self": post.get("is_self", False),
                "link_flair_text": post.get("link_flair_text", ""),
                "upvote_ratio": post.get("upvote_ratio", 0),
            }

        children = listing.get("data", {}).get("children", [])
        return [flatten(child.get("data", {})) for child in children]
    except Exception as e:
        print(f"Error fetching r/{subreddit}: {e}")
        return []
def filter_by_time(posts: List[Dict], hours: int = 24) -> List[Dict]:
    """Keep only the posts created within the last `hours` hours.

    Each kept post is annotated in place with a "created_datetime" string
    ("YYYY-MM-DD HH:MM UTC") derived from its "created_utc" epoch value.

    Args:
        posts: Post dicts carrying a numeric "created_utc" (epoch seconds).
        hours: Size of the look-back window in hours.

    Returns:
        The matching post dicts, in their original order. Posts without a
        "created_utc" value are skipped.
    """
    # Compare epoch seconds directly. Deriving the cutoff from a naive
    # datetime.utcnow() and calling .timestamp() treats that value as local
    # time, which shifts the window by the host's UTC offset.
    cutoff_timestamp = time.time() - hours * 3600

    filtered = []
    for post in posts:
        created = post.get("created_utc")
        if created is None or created < cutoff_timestamp:
            continue
        post["created_datetime"] = time.strftime(
            "%Y-%m-%d %H:%M UTC", time.gmtime(created)
        )
        filtered.append(post)
    return filtered
def filter_openclaw_related(posts: List[Dict]) -> List[Dict]:
    """Return the posts whose title or self-text mentions OpenClaw.

    Matching is a case-insensitive substring search for any of the known
    project names; the input order is preserved.
    """
    needles = ("openclaw", "clawdbot", "open claw", "clawd")

    def mentions_project(post: Dict) -> bool:
        haystack = f"{post.get('title', '')} {post.get('selftext', '')}".lower()
        return any(needle in haystack for needle in needles)

    return [post for post in posts if mentions_project(post)]
def score_post(post: Dict) -> float:
    """Rank a post by engagement; a higher value means more relevant.

    Comments are weighted more heavily than upvotes, and the upvote ratio
    adds a quality bonus so that controversial posts rank lower. Missing
    fields default to 0 score, 0 comments and a 0.5 upvote ratio.
    """
    upvote_part = post.get("score", 0) * 0.3
    comment_part = post.get("num_comments", 0) * 2
    quality_part = post.get("upvote_ratio", 0.5) * 50
    return upvote_part + comment_part + quality_part
def fetch_reddit_content(hours: int = 24, limit_per_sub: int = 25) -> Dict[str, Any]:
    """Gather recent OpenClaw-related posts from every configured subreddit.

    Args:
        hours: Look-back window passed to filter_by_time.
        limit_per_sub: Listing size requested from each subreddit.

    Returns:
        A dict with the total post count plus three engagement-ranked
        slices: the top 5 from r/openclaw, the top 5 from other subreddits,
        and the top 10 overall.
    """
    print(f"🔍 Fetching Reddit posts from last {hours} hours...")

    gathered = []
    for subreddit in SUBREDDITS:
        print(f" 📡 r/{subreddit}...")
        recent_posts = filter_by_time(
            fetch_subreddit(subreddit, limit=limit_per_sub), hours
        )
        # Posts in the project's own subreddit are kept as-is; posts from
        # anywhere else must mention the project by name.
        if subreddit.lower() != "openclaw":
            recent_posts = filter_openclaw_related(recent_posts)
        print(f" Found {len(recent_posts)} recent OpenClaw-related posts")
        gathered += recent_posts
        # Pause between requests so Reddit is not hammered.
        time.sleep(0.5)

    # Highest engagement first.
    ranked = sorted(gathered, key=score_post, reverse=True)

    # Split the ranking into "home" subreddit versus everywhere else.
    from_home, from_elsewhere = [], []
    for post in ranked:
        if post["subreddit"].lower() == "openclaw":
            from_home.append(post)
        else:
            from_elsewhere.append(post)

    return {
        "source": "reddit",
        "fetched_at": datetime.utcnow().isoformat(),
        "time_window_hours": hours,
        "total_posts": len(ranked),
        "openclaw_subreddit": from_home[:5],  # Top 5 from r/OpenClaw
        "other_subreddits": from_elsewhere[:5],  # Top 5 from elsewhere
        "all_posts": ranked[:10],  # Top 10 overall
    }
if __name__ == "__main__":
    import sys

    # Usage: script [hours] [output_file]; both arguments are optional.
    cli_args = sys.argv[1:]
    hours = int(cli_args[0]) if len(cli_args) >= 1 else 24
    if len(cli_args) >= 2:
        output_file = cli_args[1]
    else:
        output_file = "/home/openclaw/.openclaw/workspace/automations/openclaw-digest/output/reddit.json"

    content = fetch_reddit_content(hours=hours)

    # Persist the digest payload as pretty-printed JSON.
    with open(output_file, "w") as f:
        json.dump(content, f, indent=2)

    # Short summary for whoever is watching the job log.
    print(f"\n✅ Reddit content saved to {output_file}")
    print(f" Total posts: {content['total_posts']}")
    print(f" From r/OpenClaw: {len(content['openclaw_subreddit'])}")
    print(f" From other subs: {len(content['other_subreddits'])}")