Files
openclaw-backups/automations/openclaw-digest/sources/reddit_fetcher.py

148 lines
5.5 KiB
Python

#!/usr/bin/env python3
"""
Reddit Content Fetcher for OpenClaw Daily Digest
Fetches posts from OpenClaw-related subreddits using Reddit's JSON API
No authentication required for read-only public access
"""
import requests
import json
import time
from datetime import datetime, timedelta
from typing import List, Dict, Any
# Subreddits polled by the fetcher, identified by name without the "r/" prefix.
SUBREDDITS = [
    "openclaw",
    "LocalLLaMA",
    "vibecoding",
    "selfhosted",
    "homeautomation",
]

# Reddit JSON listing endpoint (no auth needed for read-only access).
# Fill in with REDDIT_JSON_URL.format(subreddit=<name>).
REDDIT_JSON_URL = "https://www.reddit.com/r/{subreddit}.json"
def fetch_subreddit(subreddit: str, limit: int = 25) -> List[Dict[str, Any]]:
    """Fetch posts from a subreddit using Reddit's JSON API.

    Args:
        subreddit: Subreddit name without the ``r/`` prefix.
        limit: Value sent as the ``limit`` query parameter.

    Returns:
        A list of simplified post dicts with the keys ``id``, ``title``,
        ``author``, ``subreddit``, ``score``, ``num_comments``,
        ``created_utc``, ``url``, ``selftext`` (truncated to 500 characters
        plus ``"..."``), ``is_self``, ``link_flair_text`` and
        ``upvote_ratio``. Returns an empty list when the request fails, the
        body is not valid JSON, or the payload is not shaped like a listing;
        the problem is printed rather than raised (best-effort fetch).
    """
    url = REDDIT_JSON_URL.format(subreddit=subreddit)
    headers = {
        "User-Agent": "OpenClaw-Digest-Bot/1.0 (by /u/krillyclaw)"
    }

    # Only network I/O and JSON decoding live in the try body, so a bug in the
    # parsing code below is not silently reported as a fetch error.
    try:
        response = requests.get(url, headers=headers, params={"limit": limit}, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decode failures on older requests versions.
        print(f"Error fetching r/{subreddit}: {e}")
        return []

    # Walk the listing defensively: any level may be null or of the wrong type.
    listing = data.get("data", {}) if isinstance(data, dict) else None
    children = listing.get("children", []) if isinstance(listing, dict) else None
    if not isinstance(children, list):
        print(f"Error fetching r/{subreddit}: unexpected response structure")
        return []

    posts = []
    for child in children:
        post = child.get("data", {}) if isinstance(child, dict) else None
        if not isinstance(post, dict):
            # Skip a malformed entry instead of discarding the whole listing.
            continue

        # A JSON null would otherwise crash len()/slicing, so coerce to "".
        raw_selftext = post.get("selftext")
        selftext = raw_selftext if isinstance(raw_selftext, str) else ""
        if len(selftext) > 500:
            selftext = selftext[:500] + "..."

        # A null permalink must not be rendered as the literal text "None".
        raw_permalink = post.get("permalink")
        permalink = raw_permalink if isinstance(raw_permalink, str) else ""

        posts.append({
            "id": post.get("id"),
            "title": post.get("title"),
            "author": post.get("author"),
            "subreddit": post.get("subreddit"),
            "score": post.get("score", 0),
            "num_comments": post.get("num_comments", 0),
            "created_utc": post.get("created_utc", 0),
            "url": f"https://reddit.com{permalink}",
            "selftext": selftext,
            "is_self": post.get("is_self", False),
            "link_flair_text": post.get("link_flair_text", ""),
            "upvote_ratio": post.get("upvote_ratio", 0),
        })
    return posts
def filter_by_time(posts: List[Dict], hours: int = 24) -> List[Dict]:
    """Filter posts to only include those from the last ``hours`` hours.

    The cutoff is computed from ``time.time()`` (seconds since the Unix
    epoch), which is directly comparable with ``created_utc``. Calling
    ``.timestamp()`` on a naive ``datetime.utcnow()`` would interpret the UTC
    wall-clock value as local time and shift the window by the machine's UTC
    offset.

    Args:
        posts: Post dicts carrying a ``created_utc`` epoch timestamp.
        hours: Size of the look-back window in hours.

    Returns:
        The posts whose ``created_utc`` is at or after the cutoff, in input
        order. Each kept dict is updated in place with a ``created_datetime``
        key formatted as ``YYYY-MM-DD HH:MM UTC``. A missing or ``None``
        ``created_utc`` is treated as timestamp 0.
    """
    cutoff_timestamp = time.time() - hours * 3600
    filtered = []
    for post in posts:
        created = post.get("created_utc") or 0
        if created >= cutoff_timestamp:
            # gmtime() converts the epoch value to UTC regardless of local TZ.
            post["created_datetime"] = time.strftime(
                "%Y-%m-%d %H:%M UTC", time.gmtime(created)
            )
            filtered.append(post)
    return filtered
def filter_openclaw_related(posts: List[Dict]) -> List[Dict]:
    """Return the posts whose title or selftext mentions an OpenClaw keyword.

    Matching is a case-insensitive substring test over the title and selftext
    joined by a single space. Input order is preserved.
    """
    needles = ("openclaw", "clawdbot", "open claw", "clawd")

    def mentions_openclaw(entry: Dict) -> bool:
        # Lower-case once, then look for any needle in the combined text.
        haystack = "{} {}".format(entry.get("title", ""), entry.get("selftext", "")).lower()
        for needle in needles:
            if needle in haystack:
                return True
        return False

    return [entry for entry in posts if mentions_openclaw(entry)]
def score_post(post: Dict) -> float:
    """Compute an engagement-based relevance score for a post.

    Reads ``score`` (default 0), ``num_comments`` (default 0) and
    ``upvote_ratio`` (default 0.5) from ``post`` and returns their weighted
    sum.
    """
    upvotes = post.get("score", 0)
    comment_count = post.get("num_comments", 0)
    ratio = post.get("upvote_ratio", 0.5)

    # Comments are weighted more heavily than upvotes.
    upvote_term = upvotes * 0.3
    comment_term = comment_count * 2
    # The upvote ratio stands in for quality (penalises controversial posts).
    ratio_term = ratio * 50

    return upvote_term + comment_term + ratio_term
def fetch_reddit_content(hours: int = 24, limit_per_sub: int = 25) -> Dict[str, Any]:
    """Fetch, filter and rank recent posts from every subreddit in SUBREDDITS.

    For each subreddit the posts are fetched, restricted to the last
    ``hours`` hours and, for every subreddit other than r/openclaw, further
    restricted to posts that mention OpenClaw. The combined posts are ranked
    by ``score_post`` (highest first).

    Args:
        hours: Look-back window passed to ``filter_by_time``.
        limit_per_sub: ``limit`` passed to ``fetch_subreddit``.

    Returns:
        A dict with ``source``, ``fetched_at``, ``time_window_hours``,
        ``total_posts``, the top 5 posts from r/openclaw, the top 5 from the
        other subreddits, and the top 10 overall.
    """
    print(f"🔍 Fetching Reddit posts from last {hours} hours...")

    collected = []
    for name in SUBREDDITS:
        print(f" 📡 r/{name}...")
        candidates = filter_by_time(fetch_subreddit(name, limit=limit_per_sub), hours)

        # r/openclaw is on-topic by definition; elsewhere require a keyword hit.
        if name.lower() == "openclaw":
            relevant = candidates
        else:
            relevant = filter_openclaw_related(candidates)

        print(f" Found {len(relevant)} recent OpenClaw-related posts")
        collected.extend(relevant)

        # Rate limiting - be nice to Reddit
        time.sleep(0.5)

    # Rank by engagement, best first.
    ranked = sorted(collected, key=score_post, reverse=True)

    # Split the ranked posts by origin while keeping their rank order.
    from_openclaw = []
    from_elsewhere = []
    for item in ranked:
        if item["subreddit"].lower() == "openclaw":
            from_openclaw.append(item)
        else:
            from_elsewhere.append(item)

    return {
        "source": "reddit",
        "fetched_at": datetime.utcnow().isoformat(),
        "time_window_hours": hours,
        "total_posts": len(ranked),
        "openclaw_subreddit": from_openclaw[:5],  # Top 5 from r/OpenClaw
        "other_subreddits": from_elsewhere[:5],  # Top 5 from elsewhere
        "all_posts": ranked[:10],  # Top 10 overall
    }
if __name__ == "__main__":
    # Command-line usage: [hours] [output_file]
    #   hours       - look-back window in hours (default 24)
    #   output_file - where the JSON result is written
    import sys
    from pathlib import Path

    hours = int(sys.argv[1]) if len(sys.argv) > 1 else 24
    output_file = sys.argv[2] if len(sys.argv) > 2 else "/home/openclaw/.openclaw/workspace/automations/openclaw-digest/output/reddit.json"

    # Make sure the destination directory exists before doing any network
    # work, so a missing directory cannot lose an otherwise successful fetch.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    content = fetch_reddit_content(hours=hours)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(content, f, indent=2)

    print(f"\n✅ Reddit content saved to {output_file}")
    print(f" Total posts: {content['total_posts']}")
    print(f" From r/OpenClaw: {len(content['openclaw_subreddit'])}")
    print(f" From other subs: {len(content['other_subreddits'])}")