openclaw-backups/automations/openclaw-digest/aggregate.py.backup

#!/usr/bin/env python3
"""
Content Aggregation Script for OpenClaw Daily Digest
Unifies Reddit, News, and Twitter content into structured JSON
"""

import html
import json
import os
import sys
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Any

sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'sources'))

from reddit_fetcher import fetch_reddit_content
from news_fetcher import fetch_news_content

def deduplicate_stories(items: List[Dict]) -> List[Dict]:
    """Remove duplicate stories by normalized URL and by title word overlap.

    An item is dropped when either:

    * its URL (lower-cased, query string stripped) equals the URL of an item
      that was already kept, or
    * its title shares more than 80% of its words with the title of an item
      that was already kept (overlap is measured against the larger of the
      two word sets).

    Items whose URL is missing or empty are never considered URL duplicates
    of one another; only the title check applies to them. ``None`` values for
    ``url`` or ``title`` are treated as empty strings.

    Args:
        items: Story dicts; only the ``url`` and ``title`` keys are read.

    Returns:
        A new list holding the first occurrence of each story, in input order.
    """
    seen_urls = set()
    unique = []
    # Word sets of the kept titles, parallel to ``unique``, so every title is
    # tokenized once instead of once per comparison.
    kept_title_words = []

    for item in items:
        url = (item.get('url') or '').lower().split('?')[0]  # Normalize URL

        # An empty URL carries no identity, so only real URLs can match.
        if url and url in seen_urls:
            continue

        title_words = set((item.get('title') or '').lower().split())
        # Empty titles (on either side) are never treated as similar.
        is_duplicate = bool(title_words) and any(
            existing_words
            and len(title_words & existing_words)
            / max(len(title_words), len(existing_words)) > 0.8
            for existing_words in kept_title_words
        )
        if is_duplicate:
            continue

        if url:
            seen_urls.add(url)
        unique.append(item)
        kept_title_words.append(title_words)

    return unique

def score_relevance(item: Dict) -> float:
    """Score story relevance for ranking (higher sorts first).

    Three scoring modes are chosen from the keys present on ``item``:

    * both ``score`` and ``num_comments`` present (Reddit-style):
      ``score * 0.5 + num_comments * 1.5 + upvote_ratio * 50``, where a
      missing ``upvote_ratio`` counts as 0.5;
    * otherwise ``points`` present (Hacker News-style):
      ``points * 1.0 + num_comments * 2.0``;
    * otherwise a flat base of 50.

    A further 100 is added when ``num_comments`` exceeds 50 or ``points``
    exceeds 100.

    Explicit ``None`` values are treated like missing values so that a single
    item with null engagement fields cannot abort the sort this function is
    used as a key for.

    Args:
        item: Story dict; reads ``score``, ``num_comments``, ``upvote_ratio``
            and ``points``.

    Returns:
        The relevance score as a float.
    """
    # Null engagement counters count as zero.
    comments = item.get('num_comments') or 0
    points = item.get('points') or 0

    score = 0.0
    if 'score' in item and 'num_comments' in item:
        # Reddit-style scoring
        upvote_ratio = item.get('upvote_ratio')
        if upvote_ratio is None:
            # Distinguish "unknown" from a genuine ratio of 0.
            upvote_ratio = 0.5
        score += (item.get('score') or 0) * 0.5
        score += comments * 1.5
        score += upvote_ratio * 50
    elif 'points' in item:
        # Hacker News scoring
        score += points * 1.0
        score += comments * 2.0
    else:
        # Default: news articles get medium base score
        score = 50.0

    # Boost for high-engagement content
    if comments > 50 or points > 100:
        score += 100

    return score

def format_reddit_story(story: Dict) -> str:
    """Format a Reddit story for v2 HTML email - Spark-safe inline styles.

    All story-supplied text (title, flair, author, excerpt, URL, counters) is
    HTML-escaped before being placed into the markup, so post content cannot
    inject tags or break out of the ``href`` attribute. ``None`` values are
    treated as missing.

    Args:
        story: Story dict; reads ``score``, ``num_comments``, ``selftext``,
            ``link_flair_text``, ``title``, ``url`` and ``author``.

    Returns:
        An HTML ``<table>`` fragment for one story card.
    """
    # NOTE(review): if the fetcher already returns entity-encoded text, the
    # escaping below will double-encode it - confirm against reddit_fetcher.
    engagement = []
    if story.get('score'):
        engagement.append(f"<span style='color:#ff6b6b;font-weight:600;'>↑ {html.escape(str(story['score']))}</span>")
    if story.get('num_comments'):
        engagement.append(f"<span style='color:#74b9ff;font-weight:600;'>💬 {html.escape(str(story['num_comments']))}</span>")

    # Shorter excerpt for cleaner look. Truncate first, escape second, so an
    # entity such as "&amp;" is never cut in half.
    selftext = story.get('selftext') or ''
    excerpt = selftext[:150]
    if len(selftext) > 150:
        excerpt += "..."
    excerpt = html.escape(excerpt)

    flair = story.get('link_flair_text') or ''
    title = story.get('title') or ''
    if flair:
        title = f"[{flair}] {title}"
    title = html.escape(title)

    author = html.escape(str(story.get('author') or 'unknown'))
    # quote=True also escapes quotes, which matters inside the href attribute.
    url = html.escape(str(story.get('url') or '#'), quote=True)

    engagement_html = f"<p style='font-size:13px;color:#888;margin:12px 0 0 0;'>{' · '.join(engagement)}</p>" if engagement else ""
    excerpt_html = f"<p style='font-size:14px;line-height:1.6;color:#aaa;margin:12px 0 0 0;'>{excerpt}</p>" if excerpt else ""

    return f'''<table role="presentation" cellspacing="0" cellpadding="0" border="0" width="100%" style="background-color:#1a1a2e;border-radius:12px;margin-bottom:16px;border:1px solid #2a2a3e;">
    <tr><td style="padding:20px;">
        <span style="display:inline-block;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:0.5px;padding:6px 12px;border-radius:6px;background-color:rgba(255,69,0,0.15);color:#ff6b6b;margin-bottom:14px;">Reddit</span>
        <h3 style="font-size:17px;font-weight:600;line-height:1.5;color:#fff;margin:0 0 12px 0;"><a href="{url}" style="color:#74b9ff;text-decoration:none;">{title}</a></h3>
        <p style="font-size:14px;color:#888;margin:0;"><span style="color:#a29bfe;font-weight:500;">u/{author}</span></p>
        {excerpt_html}
        {engagement_html}
    </td></tr>
</table>'''

def format_news_story(story: Dict) -> str:
    """Format a news story for v2 HTML email - Spark-safe inline styles.

    The source tag is colored by source name: names containing ``GitHub`` and
    names containing ``Hacker`` each get their own palette, everything else
    uses the default blue. All story-supplied text (source, title, excerpt,
    URL, counters) is HTML-escaped before being placed into the markup, and
    ``None`` values are treated as missing.

    Args:
        story: Story dict; reads ``source``, ``points``, ``num_comments``,
            ``summary``, ``url`` and ``title``.

    Returns:
        An HTML ``<table>`` fragment for one story card.
    """
    source = str(story.get('source') or 'News')
    tag_color = '#74b9ff'
    tag_bg = 'rgba(116,185,255,0.15)'
    if 'GitHub' in source:
        tag_color = '#a29bfe'
        tag_bg = 'rgba(139,148,158,0.15)'
    elif 'Hacker' in source:
        tag_color = '#ff9f43'
        tag_bg = 'rgba(255,102,0,0.15)'

    engagement = []
    if story.get('points'):
        engagement.append(f"<span style='color:#ff6b6b;font-weight:600;'>↑ {html.escape(str(story['points']))}</span>")
    if story.get('num_comments'):
        engagement.append(f"<span style='color:#74b9ff;font-weight:600;'>💬 {html.escape(str(story['num_comments']))}</span>")

    # Shorter excerpt for cleaner look. Truncate first, escape second, so an
    # entity such as "&amp;" is never cut in half.
    summary = story.get('summary') or ''
    excerpt = summary[:150]
    if len(summary) > 150:
        excerpt += "..."
    excerpt = html.escape(excerpt)

    source_label = html.escape(source)
    title = html.escape(str(story.get('title') or ''))
    # quote=True also escapes quotes, which matters inside the href attribute.
    url = html.escape(str(story.get('url') or '#'), quote=True)

    engagement_html = f"<p style='font-size:13px;color:#888;margin:12px 0 0 0;'>{' · '.join(engagement)}</p>" if engagement else ""
    excerpt_html = f"<p style='font-size:14px;line-height:1.6;color:#aaa;margin:12px 0 0 0;'>{excerpt}</p>" if excerpt else ""

    return f'''<table role="presentation" cellspacing="0" cellpadding="0" border="0" width="100%" style="background-color:#1a1a2e;border-radius:12px;margin-bottom:16px;border:1px solid #2a2a3e;">
    <tr><td style="padding:20px;">
        <span style="display:inline-block;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:0.5px;padding:6px 12px;border-radius:6px;background-color:{tag_bg};color:{tag_color};margin-bottom:14px;">{source_label}</span>
        <h3 style="font-size:17px;font-weight:600;line-height:1.5;color:#fff;margin:0 0 12px 0;"><a href="{url}" style="color:#74b9ff;text-decoration:none;">{title}</a></h3>
        {excerpt_html}
        {engagement_html}
    </td></tr>
</table>'''

def format_story_text(story: Dict) -> str:
    """Render one story as an indented block for the plain-text email.

    The title and link lines are always emitted. Author, score (``score`` or,
    failing that, ``points``), comment count and an excerpt (``selftext`` or,
    failing that, ``summary``, cut to 150 characters plus ``...``) are added
    only when they hold a truthy value. The block ends with an empty line so
    consecutive stories are visually separated.

    Args:
        story: Story dict from any source.

    Returns:
        The newline-joined text block.
    """
    out = [f"📌 {story.get('title', '')}", f"   Link: {story.get('url', '')}"]

    author = story.get('author')
    if author:
        out.append(f"   Author: {author}")

    # Reddit items carry 'score', news items carry 'points'; the first truthy
    # one is shown.
    votes = story.get('score') or story.get('points')
    if votes:
        out.append(f"   Score: {votes} upvotes")

    comment_count = story.get('num_comments')
    if comment_count:
        out.append(f"   Comments: {comment_count}")

    body = story.get('selftext', '') or story.get('summary', '')
    if body:
        if len(body) > 150:
            body = body[:150] + "..."
        out.append(f"   {body}")

    # Trailing empty entry yields the blank line between stories.
    out.append("")
    return "\n".join(out)

def aggregate_content(hours: int = 24) -> Dict[str, Any]:
    """Fetch, merge, deduplicate, rank and format digest content.

    Calls the Reddit and news fetchers for the given time window, tags each
    item with its ``_source``, removes duplicates across both sources, sorts
    by ``score_relevance`` (highest first) and keeps the top 8 items per
    source for the formatted sections. Twitter is a static placeholder.
    Progress is reported on stdout.

    Args:
        hours: Size of the look-back window passed to both fetchers.

    Returns:
        A dict with four sections: ``meta`` (generation timestamp, window,
        human-readable date), ``stats`` (per-source and unique counts),
        ``content`` (the raw fetcher payloads plus the Twitter placeholder)
        and ``formatted`` (HTML and plain-text renderings per source).
    """
    print(f"🦀 Aggregating OpenClaw content from last {hours} hours...")
    print("=" * 50)

    # Fetch from all sources
    print("\n📥 Fetching Reddit content...")
    reddit_data = fetch_reddit_content(hours=hours)

    print("\n📥 Fetching news content...")
    news_data = fetch_news_content(hours=hours)

    # Twitter placeholder (will be added when API is configured)
    twitter_data = {
        "source": "twitter",
        "total_items": 0,
        "tweets": [],
        "note": "X/Twitter integration requires API setup"
    }

    # Combine all items for deduplication; '_source' lets them be split back
    # into per-source sections after the joint ranking.
    all_items = []
    all_items.extend([{**item, '_source': 'reddit'} for item in reddit_data.get('all_posts', [])])
    all_items.extend([{**item, '_source': 'news'} for item in news_data.get('all_items', [])])

    # Deduplicate
    print("\n🧹 Deduplicating stories...")
    unique_items = deduplicate_stories(all_items)
    print(f"   Removed {len(all_items) - len(unique_items)} duplicates")

    # Sort by relevance score
    unique_items.sort(key=score_relevance, reverse=True)

    # Split back into sections
    reddit_top = [item for item in unique_items if item.get('_source') == 'reddit'][:8]
    news_top = [item for item in unique_items if item.get('_source') == 'news'][:8]

    # Generate HTML sections
    reddit_html = '\n'.join([format_reddit_story(s) for s in reddit_top])
    news_html = '\n'.join([format_news_story(s) for s in news_top])
    twitter_html = '<p style="text-align:center;color:#888;padding:30px 0;">🚧 X/Twitter integration coming soon</p>'

    # Generate text sections
    reddit_text = '\n'.join([format_story_text(s) for s in reddit_top]) if reddit_top else "No new Reddit posts today."
    news_text = '\n'.join([format_story_text(s) for s in news_top]) if news_top else "No new news articles today."
    twitter_text = "🚧 X/Twitter integration coming soon - requires API setup\n"

    # Take one timestamp so 'generated_at' and 'date' can never disagree
    # (two separate clock reads could straddle midnight). The tzinfo is
    # dropped to keep the naive-UTC ISO format the output has always used,
    # without relying on the deprecated datetime.utcnow().
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)

    # Build result
    result = {
        "meta": {
            "generated_at": now_utc.isoformat(),
            "time_window_hours": hours,
            "date": now_utc.strftime("%A, %B %d, %Y")
        },
        "stats": {
            "reddit_count": reddit_data.get('total_posts', 0),
            "news_count": news_data.get('total_items', 0),
            "twitter_count": 0,
            "total_unique": len(unique_items)
        },
        "content": {
            "reddit": reddit_data,
            "news": news_data,
            "twitter": twitter_data
        },
        "formatted": {
            "reddit_html": reddit_html,
            "news_html": news_html,
            "twitter_html": twitter_html,
            "reddit_text": reddit_text,
            "news_text": news_text,
            "twitter_text": twitter_text
        }
    }

    print("\n" + "=" * 50)
    print("✅ Aggregation complete!")
    print(f"   Reddit posts: {result['stats']['reddit_count']}")
    print(f"   News items: {result['stats']['news_count']}")
    print(f"   Total unique: {result['stats']['total_unique']}")

    return result

if __name__ == "__main__":
    # Usage: aggregate.py [hours] [output_file]
    # Both arguments are optional; defaults are a 24-hour window and the
    # digest.json path inside the OpenClaw workspace.
    hours = int(sys.argv[1]) if len(sys.argv) > 1 else 24
    output_file = sys.argv[2] if len(sys.argv) > 2 else "/home/openclaw/.openclaw/workspace/automations/openclaw-digest/output/digest.json"

    result = aggregate_content(hours=hours)

    # Create the output directory on demand so a missing folder does not fail
    # the run only after all the fetching work has been done. A bare filename
    # has an empty dirname, which os.makedirs would reject.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)

    print(f"\n📄 Output saved to: {output_file}")