openclaw-backups/automations/openclaw-digest/sources/reddit_fetcher.py

#!/usr/bin/env python3
"""
Reddit Content Fetcher for OpenClaw Daily Digest
Fetches posts from OpenClaw-related subreddits using Reddit's JSON API
No authentication required for read-only public access
"""

import requests
import json
import time
from datetime import datetime, timedelta
from typing import List, Dict, Any

# Subreddits polled for the digest. fetch_reddit_content keeps every recent
# post from "openclaw" and only keyword-matching posts from the others.
SUBREDDITS = [
    "openclaw",
    "LocalLLaMA",
    "vibecoding",
    "selfhosted",
    "homeautomation"
]

# Reddit JSON listing endpoint (JSON API - no auth needed for read-only);
# the "{subreddit}" placeholder is filled in by fetch_subreddit.
REDDIT_JSON_URL = "https://www.reddit.com/r/{subreddit}.json"

def fetch_subreddit(subreddit: str, limit: int = 25) -> List[Dict[str, Any]]:
    """Fetch posts from a subreddit using Reddit's JSON API.

    Args:
        subreddit: Subreddit name without the ``r/`` prefix.
        limit: Value sent as the ``limit`` query parameter.

    Returns:
        A list of simplified post dicts with the keys ``id``, ``title``,
        ``author``, ``subreddit``, ``score``, ``num_comments``,
        ``created_utc``, ``url``, ``selftext`` (truncated to 500 characters
        plus ``"..."``), ``is_self``, ``link_flair_text`` and
        ``upvote_ratio``. An empty list is returned when the request fails,
        the body is not valid JSON, or the payload is not a listing.
    """
    url = REDDIT_JSON_URL.format(subreddit=subreddit)
    headers = {
        "User-Agent": "OpenClaw-Digest-Bot/1.0 (by /u/krillyclaw)"
    }

    # Only the network call and JSON decoding are expected to fail; keep the
    # try body limited to them so parsing bugs are not silently swallowed.
    try:
        response = requests.get(url, headers=headers, params={"limit": limit}, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Error fetching r/{subreddit}: {e}")
        return []

    # Walk the listing defensively: any level may be missing or JSON null.
    listing = data.get("data") if isinstance(data, dict) else None
    children = listing.get("children") if isinstance(listing, dict) else None
    if not isinstance(children, list):
        return []

    posts = []
    for child in children:
        post = child.get("data") if isinstance(child, dict) else None
        if not isinstance(post, dict):
            continue

        # A key that is present but null bypasses dict.get defaults, so
        # normalise with `or` wherever later code does arithmetic, len()
        # or .lower() on the value.
        selftext = post.get("selftext") or ""
        if len(selftext) > 500:
            selftext = selftext[:500] + "..."

        posts.append({
            "id": post.get("id"),
            "title": post.get("title"),
            "author": post.get("author"),
            "subreddit": post.get("subreddit") or subreddit,
            "score": post.get("score") or 0,
            "num_comments": post.get("num_comments") or 0,
            "created_utc": post.get("created_utc") or 0,
            "url": f"https://reddit.com{post.get('permalink') or ''}",
            "selftext": selftext,
            "is_self": post.get("is_self", False),
            "link_flair_text": post.get("link_flair_text", ""),
            "upvote_ratio": post.get("upvote_ratio") or 0
        })
    return posts

def filter_by_time(posts: List[Dict], hours: int = 24) -> List[Dict]:
    """Filter posts to only include those from the last N hours.

    Args:
        posts: Post dicts carrying a ``created_utc`` Unix timestamp.
        hours: Size of the look-back window, in hours.

    Returns:
        The posts whose ``created_utc`` falls inside the window, in their
        original order. Each kept dict is annotated in place with a
        ``created_datetime`` string formatted as ``YYYY-MM-DD HH:MM UTC``.
        Posts without a ``created_utc`` value are skipped.
    """
    # Local import: the module header only imports datetime and timedelta.
    from datetime import timezone

    # Use an aware datetime: calling .timestamp() on the naive result of
    # datetime.utcnow() interprets it as *local* time, which shifts the
    # cutoff by the host's UTC offset on any machine not running in UTC.
    cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
    cutoff_timestamp = cutoff.timestamp()

    filtered = []
    for post in posts:
        created = post.get("created_utc")
        if created is None or created < cutoff_timestamp:
            continue
        post["created_datetime"] = datetime.fromtimestamp(
            created, tz=timezone.utc
        ).strftime("%Y-%m-%d %H:%M UTC")
        filtered.append(post)
    return filtered

def filter_openclaw_related(posts: List[Dict]) -> List[Dict]:
    """Keep only the posts that mention OpenClaw.

    A post matches when its title or selftext contains, case-insensitively,
    any of the known project keywords. Input order is preserved and the
    original dict objects are returned.
    """
    needles = ("openclaw", "clawdbot", "open claw", "clawd")

    def mentions_openclaw(entry: Dict) -> bool:
        # Title and body are searched together as one lower-cased haystack.
        haystack = "{} {}".format(
            entry.get("title", ""), entry.get("selftext", "")
        ).lower()
        for needle in needles:
            if needle in haystack:
                return True
        return False

    return [entry for entry in posts if mentions_openclaw(entry)]

def score_post(post: Dict) -> float:
    """Return an engagement-based relevance score for a post.

    Missing fields fall back to 0 upvotes, 0 comments and an upvote ratio
    of 0.5.
    """
    upvotes = post.get("score", 0)
    comment_count = post.get("num_comments", 0)
    ratio = post.get("upvote_ratio", 0.5)

    # Comments are weighted more heavily than raw upvotes; the upvote ratio
    # acts as a quality signal that pushes controversial posts down.
    upvote_part = upvotes * 0.3
    comment_part = comment_count * 2
    ratio_part = ratio * 50
    return upvote_part + comment_part + ratio_part

def fetch_reddit_content(hours: int = 24, limit_per_sub: int = 25) -> Dict[str, Any]:
    """Fetch, filter and rank posts from every subreddit in SUBREDDITS.

    Posts from r/openclaw are kept whenever they fall inside the time
    window; posts from the other subreddits must additionally mention
    OpenClaw. The combined list is sorted by ``score_post``, highest first.

    Args:
        hours: Look-back window in hours.
        limit_per_sub: ``limit`` passed to ``fetch_subreddit`` for each
            subreddit.

    Returns:
        A dict with the keys ``source``, ``fetched_at`` (naive UTC ISO-8601
        string), ``time_window_hours``, ``total_posts`` (count before
        truncation), ``openclaw_subreddit`` (top 5 from r/openclaw),
        ``other_subreddits`` (top 5 from elsewhere) and ``all_posts``
        (top 10 overall).
    """
    # Local import: the module header only imports datetime and timedelta.
    from datetime import timezone

    all_posts = []

    print(f"🔍 Fetching Reddit posts from last {hours} hours...")

    for index, subreddit in enumerate(SUBREDDITS):
        # Rate limiting - be nice to Reddit. Pause between requests only,
        # so there is no pointless wait after the final subreddit.
        if index:
            time.sleep(0.5)

        print(f"  📡 r/{subreddit}...")
        posts = fetch_subreddit(subreddit, limit=limit_per_sub)

        # Filter by time
        recent_posts = filter_by_time(posts, hours)

        # For non-OpenClaw subreddits, filter for OpenClaw mentions
        if subreddit.lower() != "openclaw":
            recent_posts = filter_openclaw_related(recent_posts)

        print(f"     Found {len(recent_posts)} recent OpenClaw-related posts")
        all_posts.extend(recent_posts)

    # Sort by engagement score
    all_posts.sort(key=score_post, reverse=True)

    # Separate into categories in a single pass. A missing or null
    # subreddit name is treated as "not r/openclaw" instead of crashing.
    openclaw_subreddit = []
    other_subreddits = []
    for p in all_posts:
        name = p.get("subreddit") or ""
        if name.lower() == "openclaw":
            openclaw_subreddit.append(p)
        else:
            other_subreddits.append(p)

    # datetime.utcnow() is deprecated; dropping tzinfo from an aware UTC
    # "now" yields the same naive ISO string existing consumers receive.
    fetched_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    return {
        "source": "reddit",
        "fetched_at": fetched_at,
        "time_window_hours": hours,
        "total_posts": len(all_posts),
        "openclaw_subreddit": openclaw_subreddit[:5],  # Top 5 from r/OpenClaw
        "other_subreddits": other_subreddits[:5],       # Top 5 from elsewhere
        "all_posts": all_posts[:10]                     # Top 10 overall
    }

if __name__ == "__main__":
    # Usage: reddit_fetcher.py [hours] [output_file]
    import os
    import sys

    # Reject a non-integer hours argument with a short message instead of a
    # raw traceback.
    try:
        hours = int(sys.argv[1]) if len(sys.argv) > 1 else 24
    except ValueError:
        sys.exit(f"Invalid hours value {sys.argv[1]!r}: expected an integer")
    output_file = sys.argv[2] if len(sys.argv) > 2 else "/home/openclaw/.openclaw/workspace/automations/openclaw-digest/output/reddit.json"

    # Create the output directory up front so a missing path fails (or is
    # fixed) before any time is spent fetching.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    content = fetch_reddit_content(hours=hours)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(content, f, indent=2)

    print(f"\n✅ Reddit content saved to {output_file}")
    print(f"   Total posts: {content['total_posts']}")
    print(f"   From r/OpenClaw: {len(content['openclaw_subreddit'])}")
    print(f"   From other subs: {len(content['other_subreddits'])}")