# openclaw-backups/automations/openclaw-digest/sources/news_fetcher.py

#!/usr/bin/env python3
"""
News Aggregation Fetcher for OpenClaw Daily Digest
Fetches from GitHub releases, Hacker News, and tech news sources
"""

import json
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List

import feedparser
import requests

# Registry of upstream endpoints, keyed by a short source name.
# Every entry has a "url" and a "type" tag; search-style entries also carry
# the "query" string that is sent to the endpoint.
SOURCES = {
    # Atom feed of releases for the openclaw/openclaw repository
    "github_releases": {"url": "https://github.com/openclaw/openclaw/releases.atom", "type": "rss"},
    # Algolia-backed Hacker News search endpoint
    "hn_search": {"url": "https://hn.algolia.com/api/v1/search", "type": "hackernews", "query": "openclaw"},
}

def _utc_now() -> datetime:
    """Return the current time as a naive datetime expressed in UTC.

    feedparser exposes its ``*_parsed`` dates as UTC time tuples, so every
    timestamp in this module is kept as naive UTC to stay comparable and to
    sort consistently once serialized with ``isoformat()``.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


def _truncate(text: Any, limit: int) -> str:
    """Return *text* cut to *limit* characters, adding "..." only if it was shortened."""
    text = text or ""
    return text[:limit] + "..." if len(text) > limit else text


def _entry_published(entry: Any) -> datetime:
    """Return a feed entry's publication time as a naive UTC datetime.

    Args:
        entry: A feedparser entry (attribute access plus ``.get``).

    Raises:
        ValueError: If the entry carries no date or the date string is not ISO 8601.
    """
    # Prefer the pre-parsed time tuples; fall back to the raw string otherwise.
    for field in ("published_parsed", "updated_parsed"):
        parsed = getattr(entry, field, None)
        if parsed:
            return datetime(*parsed[:6])

    raw = (entry.get("published") or entry.get("updated") or "").strip()
    if not raw:
        raise ValueError("entry has no publication date")
    # fromisoformat() only accepts a trailing "Z" on newer Python versions.
    if raw.endswith("Z"):
        raw = raw[:-1] + "+00:00"
    stamp = datetime.fromisoformat(raw)
    if stamp.tzinfo is not None:
        # Normalize any offset to UTC so the value compares against the naive cutoff.
        stamp = stamp.astimezone(timezone.utc).replace(tzinfo=None)
    return stamp


def _recent_entries(feed_url: str, limit: int, cutoff: datetime) -> List[tuple]:
    """Return ``(entry, published)`` pairs for the first *limit* entries at or after *cutoff*.

    Raises:
        RuntimeError: If the feed could not be read and produced no entries.
    """
    feed = feedparser.parse(feed_url)
    # feedparser reports fetch/parse failures through ``bozo`` instead of raising;
    # surface them so an unreachable feed is not mistaken for "no news".
    if getattr(feed, "bozo", False) and not feed.entries:
        raise RuntimeError(f"feed unavailable: {getattr(feed, 'bozo_exception', 'unknown error')}")

    recent = []
    for entry in feed.entries[:limit]:
        try:
            published = _entry_published(entry)
        except (TypeError, ValueError) as e:
            print(f"    Skipping entry due to date parse error: {e}")
            continue
        if published >= cutoff:
            recent.append((entry, published))
    return recent


def fetch_github_releases(hours: int = 24) -> List[Dict[str, Any]]:
    """Fetch recent OpenClaw releases from the GitHub Atom feed.

    Args:
        hours: Size of the look-back window, in hours.

    Returns:
        One dict per release published within the window, taken from the
        first 5 feed entries. Returns an empty list if the feed cannot be fetched.
    """
    try:
        cutoff = _utc_now() - timedelta(hours=hours)
        releases = []
        for entry, published in _recent_entries(SOURCES["github_releases"]["url"], 5, cutoff):
            try:
                releases.append({
                    "id": entry.id,
                    "title": entry.title,
                    "url": entry.link,
                    "published": published.isoformat(),
                    "summary": _truncate(entry.get("summary", ""), 300),
                    "source": "GitHub",
                    "source_icon": "🐙",
                    "category": "Release"
                })
            except AttributeError as e:
                # An entry without id/title/link cannot be rendered; skip it.
                print(f"    Skipping entry with missing field: {e}")
        return releases
    except Exception as e:
        # Best-effort boundary: one failing source must not abort the digest.
        print(f"Error fetching GitHub releases: {e}")
        return []


def fetch_hackernews(hours: int = 24) -> List[Dict[str, Any]]:
    """Fetch OpenClaw-related stories from Hacker News via the Algolia search API.

    Args:
        hours: Size of the look-back window, in hours.

    Returns:
        Up to 10 story dicts created within the window. Returns an empty
        list if the request fails.
    """
    try:
        # Use an aware datetime: calling .timestamp() on a naive UTC value would
        # interpret it as local time and shift the cutoff by the UTC offset.
        cutoff_ts = int((datetime.now(timezone.utc) - timedelta(hours=hours)).timestamp())
        params = {
            "query": SOURCES["hn_search"]["query"],
            "tags": "story",
            "numericFilters": f"created_at_i>{cutoff_ts}"
        }

        response = requests.get(SOURCES["hn_search"]["url"], params=params, timeout=30)
        response.raise_for_status()
        data = response.json()

        stories = []
        for hit in data.get("hits", [])[:10]:  # Top 10 stories
            story_id = hit.get("objectID")
            hn_url = f"https://news.ycombinator.com/item?id={story_id}"
            # Convert the epoch to naive UTC so it matches the feed-based sources.
            created = datetime.fromtimestamp(hit.get("created_at_i") or 0, tz=timezone.utc)
            stories.append({
                "id": story_id,
                "title": hit.get("title"),
                "url": hit.get("url") or hn_url,  # text posts have no external URL
                "hn_url": hn_url,
                "published": created.replace(tzinfo=None).isoformat(),
                "author": hit.get("author"),
                "points": hit.get("points") or 0,
                "num_comments": hit.get("num_comments") or 0,
                "summary": _truncate(hit.get("story_text"), 200),
                "source": "Hacker News",
                "source_icon": "🟠",
                "category": "Discussion"
            })

        return stories
    except Exception as e:
        # Best-effort boundary: one failing source must not abort the digest.
        print(f"Error fetching Hacker News: {e}")
        return []


def fetch_google_news(hours: int = 24) -> List[Dict[str, Any]]:
    """Fetch OpenClaw news from the Google News RSS search feed.

    Args:
        hours: Size of the look-back window, in hours.

    Returns:
        One dict per article published within the window, taken from the
        first 10 feed entries. Returns an empty list if the feed cannot be fetched.
    """
    try:
        # Google News RSS for OpenClaw
        url = "https://news.google.com/rss/search?q=OpenClaw+AI+agent&hl=en-US&gl=US&ceid=US:en"
        cutoff = _utc_now() - timedelta(hours=hours)

        news = []
        for entry, published in _recent_entries(url, 10, cutoff):
            try:
                news.append({
                    "id": entry.id,
                    "title": entry.title,
                    "url": entry.link,
                    "published": published.isoformat(),
                    "source": entry.get("source", {}).get("title", "Google News"),
                    "source_icon": "📰",
                    "category": "News"
                })
            except AttributeError as e:
                # An entry without id/title/link cannot be rendered; skip it.
                print(f"    Skipping entry with missing field: {e}")
        return news
    except Exception as e:
        # Best-effort boundary: one failing source must not abort the digest.
        print(f"Error fetching Google News: {e}")
        return []


def fetch_news_content(hours: int = 24) -> Dict[str, Any]:
    """Fetch every news source and merge the results into one digest payload.

    Args:
        hours: Size of the look-back window, in hours; forwarded to every source.

    Returns:
        A dict with the per-source lists, the fetch timestamp (naive UTC, ISO
        format), the total item count, and the 15 newest items overall.
    """
    print(f"🔍 Fetching news from last {hours} hours...")

    # Fetch from all sources
    print("  📡 GitHub releases...")
    github = fetch_github_releases(hours)
    print(f"     Found {len(github)} releases")

    print("  📡 Hacker News...")
    hn = fetch_hackernews(hours)
    print(f"     Found {len(hn)} stories")

    print("  📡 Google News...")
    gnews = fetch_google_news(hours)
    print(f"     Found {len(gnews)} articles")

    # Combine and sort newest first; all timestamps are naive-UTC ISO strings,
    # so lexicographic order equals chronological order.
    all_items = github + hn + gnews
    all_items.sort(key=lambda x: x.get("published", ""), reverse=True)

    return {
        "source": "news",
        "fetched_at": _utc_now().isoformat(),
        "time_window_hours": hours,
        "total_items": len(all_items),  # count before truncating to the top 15
        "github_releases": github,
        "hackernews": hn,
        "google_news": gnews,
        "all_items": all_items[:15]  # Top 15 overall
    }

if __name__ == "__main__":
    import os
    import sys

    # Optional positional arguments: [hours] [output_file]
    hours = int(sys.argv[1]) if len(sys.argv) > 1 else 24
    output_file = sys.argv[2] if len(sys.argv) > 2 else "/home/openclaw/.openclaw/workspace/automations/openclaw-digest/output/news.json"

    content = fetch_news_content(hours=hours)

    # Make sure the destination directory exists so a first run does not fail
    # with FileNotFoundError after all the fetching has already been done.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(content, f, indent=2)

    print(f"\n✅ News content saved to {output_file}")
    print(f"   Total items: {content['total_items']}")
    print(f"   GitHub releases: {len(content['github_releases'])}")
    print(f"   Hacker News: {len(content['hackernews'])}")
    print(f"   Google News: {len(content['google_news'])}")