148 lines
5.5 KiB
Python
148 lines
5.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Reddit Content Fetcher for OpenClaw Daily Digest
|
|
Fetches posts from OpenClaw-related subreddits using Reddit's JSON API
|
|
No authentication required for read-only public access
|
|
"""
|
|
|
|
import requests
|
|
import json
|
|
import time
|
|
from datetime import datetime, timedelta
|
|
from typing import List, Dict, Any
|
|
|
|
# Subreddits polled for the digest, in the order they are fetched.
SUBREDDITS = [
    "openclaw",
    "LocalLLaMA",
    "vibecoding",
    "selfhosted",
    "homeautomation",
]

# Listing endpoint of Reddit's JSON API (read-only, used without auth);
# fill in the ``subreddit`` placeholder with str.format.
REDDIT_JSON_URL = "https://www.reddit.com/r/{subreddit}.json"
|
|
|
|
def fetch_subreddit(subreddit: str, limit: int = 25) -> List[Dict[str, Any]]:
    """Fetch the current listing of a subreddit via Reddit's JSON API.

    The fetch is best-effort: request, HTTP-status and JSON-decoding errors
    are printed and turned into an empty result instead of being raised.

    Args:
        subreddit: Subreddit name without the ``r/`` prefix.
        limit: Value sent as the ``limit`` query parameter.

    Returns:
        One dict per listing entry with the keys ``id``, ``title``,
        ``author``, ``subreddit``, ``score``, ``num_comments``,
        ``created_utc``, ``url``, ``selftext`` (cut to 500 characters plus
        ``"..."`` when longer), ``is_self``, ``link_flair_text`` and
        ``upvote_ratio``. Returns ``[]`` when the request fails or the
        response does not have the expected structure.
    """
    url = REDDIT_JSON_URL.format(subreddit=subreddit)
    headers = {
        "User-Agent": "OpenClaw-Digest-Bot/1.0 (by /u/krillyclaw)"
    }

    # Only the network call and JSON decoding live inside the try, so a
    # problem with a single post can no longer discard the whole listing.
    try:
        response = requests.get(url, headers=headers, params={"limit": limit}, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Error fetching r/{subreddit}: {e}")
        return []

    listing = data.get("data", {}) if isinstance(data, dict) else None
    children = listing.get("children", []) if isinstance(listing, dict) else None
    if not isinstance(children, list):
        print(f"Error fetching r/{subreddit}: unexpected response structure")
        return []

    posts = []
    for child in children:
        post = child.get("data", {}) if isinstance(child, dict) else None
        if not isinstance(post, dict):
            # Skip a malformed entry rather than failing the whole subreddit.
            continue

        # A key that is present with a JSON null bypasses dict.get's default,
        # so fall back explicitly with `or` before using the value.
        selftext = post.get("selftext") or ""
        if len(selftext) > 500:
            selftext = selftext[:500] + "..."

        posts.append({
            "id": post.get("id"),
            "title": post.get("title"),
            "author": post.get("author"),
            "subreddit": post.get("subreddit"),
            "score": post.get("score") or 0,
            "num_comments": post.get("num_comments") or 0,
            "created_utc": post.get("created_utc") or 0,
            "url": f"https://reddit.com{post.get('permalink') or ''}",
            "selftext": selftext,
            "is_self": post.get("is_self", False),
            "link_flair_text": post.get("link_flair_text") or "",
            "upvote_ratio": post.get("upvote_ratio") or 0,
        })
    return posts
|
|
|
|
def filter_by_time(posts: List[Dict], hours: int = 24) -> List[Dict]:
    """Keep only the posts created within the last ``hours`` hours.

    Each kept post dict is annotated in place with a ``created_datetime``
    key holding its creation time formatted as ``YYYY-MM-DD HH:MM UTC``.

    Args:
        posts: Post dicts whose ``created_utc`` is a Unix timestamp in
            seconds. A missing or null ``created_utc`` is treated as 0.
        hours: Size of the look-back window, in hours.

    Returns:
        The posts (same dict objects, original order) whose ``created_utc``
        is at or after the cutoff.
    """
    # Local import: the file-level import only brings in datetime/timedelta.
    from datetime import timezone

    # Use an aware UTC "now". Calling .timestamp() on the naive value from
    # datetime.utcnow() interprets it as local time, which shifts the cutoff
    # by the machine's UTC offset on any host not running in UTC.
    cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
    cutoff_timestamp = cutoff.timestamp()

    filtered = []
    for post in posts:
        created = post.get("created_utc") or 0
        if created >= cutoff_timestamp:
            created_at = datetime.fromtimestamp(created, tz=timezone.utc)
            post["created_datetime"] = created_at.strftime("%Y-%m-%d %H:%M UTC")
            filtered.append(post)
    return filtered
|
|
|
|
def filter_openclaw_related(posts: List[Dict]) -> List[Dict]:
    """Return the posts whose title or body text mentions OpenClaw.

    A post matches when any keyword occurs, case-insensitively, as a
    substring of its ``title`` and ``selftext`` joined by a single space.
    Matching posts are returned in their original order.
    """
    needles = ("openclaw", "clawdbot", "open claw", "clawd")

    def mentions_openclaw(entry: Dict) -> bool:
        haystack = "{} {}".format(entry.get("title", ""), entry.get("selftext", "")).lower()
        for needle in needles:
            if needle in haystack:
                return True
        return False

    return [entry for entry in posts if mentions_openclaw(entry)]
|
|
|
|
def score_post(post: Dict) -> float:
    """Compute the engagement score used to rank posts.

    The result is ``score * 0.3 + num_comments * 2 + upvote_ratio * 50``.
    A missing ``score`` or ``num_comments`` counts as 0 and a missing
    ``upvote_ratio`` as 0.5.
    """
    # Upvotes carry the smallest per-unit weight; comments count for more.
    upvote_part = post.get("score", 0) * 0.3
    comment_part = post.get("num_comments", 0) * 2
    # The ratio term rewards well-received posts over controversial ones.
    ratio_part = post.get("upvote_ratio", 0.5) * 50
    return upvote_part + comment_part + ratio_part
|
|
|
|
def fetch_reddit_content(hours: int = 24, limit_per_sub: int = 25) -> Dict[str, Any]:
    """Collect, filter, rank and bucket posts from every subreddit in SUBREDDITS.

    Each subreddit is fetched, reduced to posts from the last ``hours`` hours
    and, for every subreddit other than "openclaw", further reduced to posts
    that mention OpenClaw. The combined posts are ranked by ``score_post``
    (highest first).

    Args:
        hours: Look-back window passed to ``filter_by_time``.
        limit_per_sub: ``limit`` passed to ``fetch_subreddit`` for each
            subreddit.

    Returns:
        A dict with ``source``, ``fetched_at``, ``time_window_hours``,
        ``total_posts`` (count of all collected posts), and the ranked lists
        ``openclaw_subreddit`` (top 5), ``other_subreddits`` (top 5) and
        ``all_posts`` (top 10).
    """
    print(f"🔍 Fetching Reddit posts from last {hours} hours...")

    gathered = []
    for subreddit in SUBREDDITS:
        print(f" 📡 r/{subreddit}...")

        recent_posts = filter_by_time(
            fetch_subreddit(subreddit, limit=limit_per_sub),
            hours,
        )

        is_home_sub = subreddit.lower() == "openclaw"
        if not is_home_sub:
            # Outside r/openclaw, keep only posts that mention the project.
            recent_posts = filter_openclaw_related(recent_posts)

        print(f" Found {len(recent_posts)} recent OpenClaw-related posts")
        gathered += recent_posts

        # Pause between requests to stay polite to Reddit.
        time.sleep(0.5)

    # Highest engagement first.
    ranked = sorted(gathered, key=score_post, reverse=True)

    # Split the ranked list by origin while preserving the ranking order.
    from_openclaw = []
    from_elsewhere = []
    for post in ranked:
        bucket = from_openclaw if post["subreddit"].lower() == "openclaw" else from_elsewhere
        bucket.append(post)

    return {
        "source": "reddit",
        "fetched_at": datetime.utcnow().isoformat(),
        "time_window_hours": hours,
        "total_posts": len(ranked),
        "openclaw_subreddit": from_openclaw[:5],
        "other_subreddits": from_elsewhere[:5],
        "all_posts": ranked[:10],
    }
|
|
|
|
if __name__ == "__main__":
    # Usage: <script> [hours] [output_file]
    import sys
    from pathlib import Path

    hours = int(sys.argv[1]) if len(sys.argv) > 1 else 24
    output_file = sys.argv[2] if len(sys.argv) > 2 else "/home/openclaw/.openclaw/workspace/automations/openclaw-digest/output/reddit.json"

    # Create the destination directory before fetching, so a missing or
    # unwritable folder fails fast instead of after all the network work.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    content = fetch_reddit_content(hours=hours)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(content, f, indent=2)

    print(f"\n✅ Reddit content saved to {output_file}")
    print(f" Total posts: {content['total_posts']}")
    print(f" From r/OpenClaw: {len(content['openclaw_subreddit'])}")
    print(f" From other subs: {len(content['other_subreddits'])}")
|