AI Newsletter Digest improvements: fixed QP soft line break decoding, URL extraction, and content cleaning

2026-03-04 13:29:22 +00:00
parent 29a98137a7
commit 57dd294675
13706 changed files with 2114953 additions and 237629 deletions
--- a/automations/openclaw-digest/sources/pycache/news_fetcher.cpython-311.pyc
+++ b/automations/openclaw-digest/sources/pycache/news_fetcher.cpython-311.pyc
--- a/automations/openclaw-digest/sources/pycache/reddit_fetcher.cpython-311.pyc
+++ b/automations/openclaw-digest/sources/pycache/reddit_fetcher.cpython-311.pyc
--- a/automations/openclaw-digest/sources/news_fetcher.py
+++ b/automations/openclaw-digest/sources/news_fetcher.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+"""
+News Aggregation Fetcher for OpenClaw Daily Digest
+Fetches from GitHub releases, Hacker News, and tech news sources
+"""
+
+import requests
+import json
+import feedparser
+from datetime import datetime, timedelta
+from typing import List, Dict, Any
+
+# Remote endpoints polled by the fetchers in this module, keyed by source name.
+# The "type" values are informational only; nothing in this module reads them.
+SOURCES = {
+    "github_releases": {
+        "url": "https://github.com/openclaw/openclaw/releases.atom",  # parsed with feedparser
+        "type": "rss"
+    },
+    "hn_search": {
+        "url": "https://hn.algolia.com/api/v1/search",  # queried with requests.get
+        "type": "hackernews",
+        "query": "openclaw"  # sent as the "query" request parameter
+    }
+}
+
+def _clip(text: Any, limit: int) -> str:
+    """Return text cut to `limit` chars, appending "..." only if it was cut."""
+    text = text or ""
+    return text[:limit] + "..." if len(text) > limit else text
+
+def _entry_time(entry: Any) -> datetime:
+    """Return a feed entry's publish (else update) time as a naive UTC datetime.
+
+    Raises ValueError if the entry carries no parseable date.
+    """
+    parsed = entry.get("published_parsed") or entry.get("updated_parsed")
+    if parsed:
+        return datetime(*parsed[:6])  # feedparser reports *_parsed in UTC
+    raw = entry.get("published") or entry.get("updated") or ""
+    stamp = datetime.fromisoformat(raw.replace("Z", "+00:00"))
+    if stamp.tzinfo is not None:
+        # Shift to UTC and drop tzinfo so it compares with the naive cutoff.
+        stamp = (stamp - stamp.utcoffset()).replace(tzinfo=None)
+    return stamp
+
+def fetch_github_releases(hours: int = 24) -> List[Dict[str, Any]]:
+    """Fetch OpenClaw releases published within the last `hours` hours.
+
+    Only the first five feed entries are inspected. Entries that cannot be
+    dated or lack a required field are skipped with a printed note; any
+    feed-level failure is printed and yields an empty list.
+    """
+    try:
+        feed = feedparser.parse(SOURCES["github_releases"]["url"])
+        cutoff = datetime.utcnow() - timedelta(hours=hours)
+        releases = []
+        for entry in feed.entries[:5]:  # Last 5 releases
+            try:
+                published = _entry_time(entry)
+                if published < cutoff:
+                    continue
+                releases.append({
+                    "id": entry.id,
+                    "title": entry.title,
+                    "url": entry.link,
+                    "published": published.isoformat(),
+                    "summary": _clip(entry.get("summary", ""), 300),
+                    "source": "GitHub",
+                    "source_icon": "🐙",
+                    "category": "Release"
+                })
+            except Exception as e:
+                print(f"    Skipping entry due to date parse error: {e}")
+        return releases
+    except Exception as e:
+        print(f"Error fetching GitHub releases: {e}")
+        return []
+
+def fetch_hackernews(hours: int = 24) -> List[Dict[str, Any]]:
+    """Fetch up to ten matching Hacker News stories from the last `hours` hours.
+
+    Any request or decoding failure is printed and yields an empty list.
+    """
+    try:
+        # Take the epoch from the local clock: .timestamp() on a naive utcnow()
+        # value would be read as local time and shift the window by the host's
+        # UTC offset.
+        since = int(datetime.now().timestamp() - hours * 3600)
+        params = {
+            "query": SOURCES["hn_search"]["query"],
+            "tags": "story",
+            "numericFilters": f"created_at_i>{since}"
+        }
+        response = requests.get(SOURCES["hn_search"]["url"], params=params, timeout=30)
+        response.raise_for_status()
+        stories = []
+        for hit in response.json().get("hits", [])[:10]:  # Top 10 stories
+            item_url = f"https://news.ycombinator.com/item?id={hit.get('objectID')}"
+            # Naive UTC, like the feed-based sources, so the merged sort on
+            # "published" compares like with like.
+            created = datetime.utcfromtimestamp(hit.get("created_at_i") or 0)
+            stories.append({
+                "id": hit.get("objectID"),
+                "title": hit.get("title"),
+                "url": hit.get("url") or item_url,  # text-only posts have no external URL
+                "hn_url": item_url,
+                "published": created.isoformat(),
+                "author": hit.get("author"),
+                "points": hit.get("points", 0),
+                "num_comments": hit.get("num_comments", 0),
+                "summary": _clip(hit.get("story_text"), 200),
+                "source": "Hacker News",
+                "source_icon": "🟠",
+                "category": "Discussion"
+            })
+        return stories
+    except Exception as e:
+        print(f"Error fetching Hacker News: {e}")
+        return []
+
+def fetch_google_news(hours: int = 24) -> List[Dict[str, Any]]:
+    """Fetch Google News RSS results for OpenClaw from the last `hours` hours.
+
+    Only the first ten feed entries are inspected; undatable or incomplete
+    entries are skipped with a printed note. A feed-level failure is printed
+    and yields an empty list.
+    """
+    try:
+        # Google News RSS for OpenClaw
+        url = "https://news.google.com/rss/search?q=OpenClaw+AI+agent&hl=en-US&gl=US&ceid=US:en"
+        feed = feedparser.parse(url)
+        cutoff = datetime.utcnow() - timedelta(hours=hours)
+        news = []
+        for entry in feed.entries[:10]:
+            try:
+                published = _entry_time(entry)
+                if published < cutoff:
+                    continue
+                news.append({
+                    "id": entry.id,
+                    "title": entry.title,
+                    "url": entry.link,
+                    "published": published.isoformat(),
+                    "source": (entry.get("source") or {}).get("title", "Google News"),
+                    "source_icon": "📰",
+                    "category": "News"
+                })
+            except Exception as e:  # not a bare except: Ctrl-C must still propagate
+                print(f"    Skipping entry due to date parse error: {e}")
+        return news
+    except Exception as e:
+        print(f"Error fetching Google News: {e}")
+        return []
+
+def fetch_news_content(hours: int = 24) -> Dict[str, Any]:
+    """Fetch every news source and merge the results into one payload.
+
+    `hours` is the look-back window handed to all three fetchers. The result
+    carries the per-source lists plus "all_items", the 15 newest overall;
+    "total_items" counts every item before that cap.
+    """
+    print(f"🔍 Fetching news from last {hours} hours...")
+
+    # Fetch from all sources
+    print("  📡 GitHub releases...")
+    github = fetch_github_releases(hours)
+    print(f"     Found {len(github)} releases")
+
+    print("  📡 Hacker News...")
+    hn = fetch_hackernews(hours)
+    print(f"     Found {len(hn)} stories")
+
+    print("  📡 Google News...")
+    gnews = fetch_google_news(hours)
+    print(f"     Found {len(gnews)} articles")
+
+    # Combine and sort by published date. Every "published" value is a naive
+    # UTC ISO-8601 string, so plain string ordering is chronological.
+    all_items = github + hn + gnews
+    all_items.sort(key=lambda x: x.get("published", ""), reverse=True)
+
+    return {
+        "source": "news",
+        "fetched_at": datetime.utcnow().isoformat(),
+        "time_window_hours": hours,
+        "total_items": len(all_items),
+        "github_releases": github,
+        "hackernews": hn,
+        "google_news": gnews,
+        "all_items": all_items[:15]  # Top 15 overall
+    }
+
+if __name__ == "__main__":
+    # Usage: news_fetcher.py [hours] [output_file]
+    import sys
+
+    cli_args = sys.argv[1:]
+    hours = int(cli_args[0]) if cli_args else 24
+    if len(cli_args) > 1:
+        output_file = cli_args[1]
+    else:
+        output_file = "/home/openclaw/.openclaw/workspace/automations/openclaw-digest/output/news.json"
+
+    content = fetch_news_content(hours=hours)
+
+    with open(output_file, "w") as out_fh:
+        json.dump(content, out_fh, indent=2)
+
+    # Print a short per-source summary of what was written.
+    print(f"\n✅ News content saved to {output_file}")
+    print(f"   Total items: {content['total_items']}")
+    summary_rows = (
+        ("GitHub releases", "github_releases"),
+        ("Hacker News", "hackernews"),
+        ("Google News", "google_news"),
+    )
+    for label, key in summary_rows:
+        print(f"   {label}: {len(content[key])}")
--- a/automations/openclaw-digest/sources/reddit_fetcher.py
+++ b/automations/openclaw-digest/sources/reddit_fetcher.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+"""
+Reddit Content Fetcher for OpenClaw Daily Digest
+Fetches posts from OpenClaw-related subreddits using Reddit's JSON API
+No authentication required for read-only public access
+"""
+
+import requests
+import json
+import time
+from datetime import datetime, timedelta
+from typing import List, Dict, Any
+
+# Subreddits scanned for the digest, read through Reddit's JSON listing
+# endpoint (requests below are sent without authentication).
+SUBREDDITS = [
+    "openclaw",  # kept unfiltered by fetch_reddit_content
+    "LocalLLaMA",  # this and the remaining entries are keyword-filtered
+    "vibecoding",
+    "selfhosted",
+    "homeautomation"
+]
+
+REDDIT_JSON_URL = "https://www.reddit.com/r/{subreddit}.json"  # filled in via str.format
+
+def fetch_subreddit(subreddit: str, limit: int = 25) -> List[Dict[str, Any]]:
+    """Fetch one listing page of posts from a subreddit via the JSON endpoint.
+
+    Args:
+        subreddit: Subreddit name, substituted into REDDIT_JSON_URL.
+        limit: Sent as the "limit" query parameter.
+
+    Returns:
+        One flattened dict per listing child, with "selftext" capped at 500
+        characters (plus "..." when cut). Any failure is printed and yields
+        an empty list.
+    """
+    url = REDDIT_JSON_URL.format(subreddit=subreddit)
+    headers = {
+        "User-Agent": "OpenClaw-Digest-Bot/1.0 (by /u/krillyclaw)"
+    }
+
+    try:
+        response = requests.get(url, headers=headers, params={"limit": limit}, timeout=30)
+        response.raise_for_status()
+        data = response.json()
+
+        posts = []
+        for child in data.get("data", {}).get("children", []):
+            post = child.get("data", {})
+            # A JSON null would make len()/slicing raise and, through the
+            # except below, discard every post of this subreddit; normalise first.
+            body = post.get("selftext") or ""
+            posts.append({
+                "id": post.get("id"),
+                "title": post.get("title"),
+                "author": post.get("author"),
+                # Fall back to the requested name so callers can always .lower() it.
+                "subreddit": post.get("subreddit") or subreddit,
+                "score": post.get("score", 0),
+                "num_comments": post.get("num_comments", 0),
+                # Always numeric, so the time filter can compare it safely.
+                "created_utc": post.get("created_utc") or 0,
+                "url": f"https://reddit.com{post.get('permalink', '')}",
+                "selftext": body[:500] + "..." if len(body) > 500 else body,
+                "is_self": post.get("is_self", False),
+                "link_flair_text": post.get("link_flair_text", ""),
+                "upvote_ratio": post.get("upvote_ratio", 0)
+            })
+        return posts
+    except Exception as e:
+        print(f"Error fetching r/{subreddit}: {e}")
+        return []
+
+def filter_by_time(posts: List[Dict], hours: int = 24) -> List[Dict]:
+    """Keep only the posts created within the last `hours` hours.
+
+    Each kept post dict is annotated in place with "created_datetime", a
+    "YYYY-MM-DD HH:MM UTC" rendering of its "created_utc" epoch value.
+    Posts whose "created_utc" is missing or null are treated as epoch 0.
+    """
+    # Compare in epoch seconds taken straight from the clock. Calling
+    # .timestamp() on a naive utcnow() value would read it as local time and
+    # shift the window by the host's UTC offset.
+    cutoff_timestamp = time.time() - hours * 3600
+
+    filtered = []
+    for post in posts:
+        created = post.get("created_utc") or 0
+        if created >= cutoff_timestamp:
+            post["created_datetime"] = datetime.utcfromtimestamp(created).strftime("%Y-%m-%d %H:%M UTC")
+            filtered.append(post)
+    return filtered
+
+def filter_openclaw_related(posts: List[Dict]) -> List[Dict]:
+    """Return the posts whose title or body mentions OpenClaw.
+
+    A post matches when the lower-cased "title" and "selftext", joined by a
+    space, contain any of a fixed set of keywords. Input order is preserved.
+    """
+    needles = ("openclaw", "clawdbot", "open claw", "clawd")
+
+    def mentions_openclaw(post: Dict) -> bool:
+        haystack = "{} {}".format(post.get("title", ""), post.get("selftext", "")).lower()
+        return any(needle in haystack for needle in needles)
+
+    return [post for post in posts if mentions_openclaw(post)]
+
+def score_post(post: Dict) -> float:
+    """Rank a post by engagement; a higher value sorts earlier in the digest.
+
+    Adds "score" x 0.3, "num_comments" x 2 and "upvote_ratio" x 50. Missing
+    keys default to 0, 0 and 0.5 respectively.
+    """
+    upvotes = post.get("score", 0)
+    replies = post.get("num_comments", 0)
+    ratio = post.get("upvote_ratio", 0.5)
+
+    # Each comment outweighs an upvote; the ratio term lifts posts with a
+    # high share of upvotes above controversial ones.
+    upvote_part = upvotes * 0.3
+    reply_part = replies * 2
+    ratio_part = ratio * 50
+    return upvote_part + reply_part + ratio_part
+
+def fetch_reddit_content(hours: int = 24, limit_per_sub: int = 25) -> Dict[str, Any]:
+    """Collect, filter and rank recent posts from every configured subreddit.
+
+    Args:
+        hours: Look-back window passed to filter_by_time.
+        limit_per_sub: Listing size requested from each subreddit.
+
+    Returns:
+        A payload dict with run metadata, the top 5 posts from r/openclaw,
+        the top 5 from the other subreddits and the top 10 overall, all
+        ordered by score_post (highest first). "total_posts" counts every
+        post kept before those caps.
+    """
+    print(f"🔍 Fetching Reddit posts from last {hours} hours...")
+
+    collected = []
+    for name in SUBREDDITS:
+        print(f"  📡 r/{name}...")
+        recent = filter_by_time(fetch_subreddit(name, limit=limit_per_sub), hours)
+
+        # Posts from r/openclaw are kept as-is; every other subreddit keeps
+        # keyword matches only.
+        if name.lower() != "openclaw":
+            recent = filter_openclaw_related(recent)
+
+        print(f"     Found {len(recent)} recent OpenClaw-related posts")
+        collected += recent
+
+        # Pause between requests to go easy on Reddit.
+        time.sleep(0.5)
+
+    # Highest engagement first; the split below keeps that order.
+    ranked = sorted(collected, key=score_post, reverse=True)
+
+    from_openclaw, from_elsewhere = [], []
+    for post in ranked:
+        if post["subreddit"].lower() == "openclaw":
+            from_openclaw.append(post)
+        else:
+            from_elsewhere.append(post)
+
+    return {
+        "source": "reddit",
+        "fetched_at": datetime.utcnow().isoformat(),
+        "time_window_hours": hours,
+        "total_posts": len(ranked),
+        "openclaw_subreddit": from_openclaw[:5],  # Top 5 from r/OpenClaw
+        "other_subreddits": from_elsewhere[:5],   # Top 5 from elsewhere
+        "all_posts": ranked[:10]                  # Top 10 overall
+    }
+
+if __name__ == "__main__":
+    # Usage: reddit_fetcher.py [hours] [output_file]
+    import sys
+
+    cli_args = sys.argv[1:]
+    hours = int(cli_args[0]) if cli_args else 24
+    if len(cli_args) > 1:
+        output_file = cli_args[1]
+    else:
+        output_file = "/home/openclaw/.openclaw/workspace/automations/openclaw-digest/output/reddit.json"
+
+    content = fetch_reddit_content(hours=hours)
+
+    with open(output_file, "w") as out_fh:
+        json.dump(content, out_fh, indent=2)
+
+    # Print a short summary of what was written.
+    print(f"\n✅ Reddit content saved to {output_file}")
+    print(f"   Total posts: {content['total_posts']}")
+    summary_rows = (
+        ("From r/OpenClaw", "openclaw_subreddit"),
+        ("From other subs", "other_subreddits"),
+    )
+    for label, key in summary_rows:
+        print(f"   {label}: {len(content[key])}")