openclaw-backups/automations/openclaw-digest/aggregate.py

#!/usr/bin/env python3
"""
Enhanced Content Aggregation for OpenClaw Daily Digest
Features: topic tags, read time, trending detection, color-coded sources, LLM summaries
"""

import hashlib
import html
import json
import os
import re
import sys
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Any, Tuple

sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'sources'))

from reddit_fetcher import fetch_reddit_content
from news_fetcher import fetch_news_content

# Topic detection keywords.
# All keywords are lowercase and are matched against lowercased text using the
# rules implemented in _compile_topic_pattern below. 'chatgpt' is listed
# explicitly because 'gpt' only matches as a standalone word.
TOPIC_KEYWORDS = {
    'AI/LLMs': ['llm', 'gpt', 'chatgpt', 'claude', 'openai', 'anthropic', 'model', 'training', 'inference', 'token', 'embedding', 'fine-tune', 'rag'],
    'Coding': ['code', 'github', 'programming', 'developer', 'api', 'python', 'javascript', 'typescript', 'rust', 'go'],
    'Home Automation': ['home assistant', 'smart home', 'automation', 'zigbee', 'z-wave', 'mqtt', 'iot', 'sensor'],
    'Self-Hosting': ['self-host', 'homelab', 'server', 'docker', 'kubernetes', 'proxmox', 'nas', 'selfhosted'],
    'Hardware': ['gpu', 'nvidia', 'amd', 'cpu', 'ram', 'ssd', 'raspberry pi', 'arduino', 'esp32'],
    'Privacy': ['privacy', 'security', 'encryption', 'vpn', 'tor', 'self-hosted', 'data protection'],
    'OpenClaw': ['openclaw', 'claw', 'mcp', 'agent', 'skill', 'clawhub'],
}

# User's top topics (will be updated based on clicks over time)
TOP_TOPICS = ['AI/LLMs', 'OpenClaw', 'Coding', 'Home Automation']

# Keywords of at most this many characters must match as a whole word.
# Short keywords are the ones that show up inside unrelated words
# ('go' in "google", 'rag' in "storage", 'ram' in "program").
_SHORT_KEYWORD_MAX_LEN = 3

def _compile_topic_pattern(keywords):
    """Compile one regex matching any keyword of a topic in lowercased text.

    Every keyword must start at a word boundary (not preceded by a letter or
    digit). Longer keywords then match as a prefix, so 'self-host' still
    covers "self-hosting" and 'model' covers "models". Keywords of at most
    _SHORT_KEYWORD_MAX_LEN characters must additionally end the word: only an
    optional plural "s" may follow, and the next character must not be a
    letter (digits are allowed, so "gpt4" still counts as 'gpt').
    """
    alternatives = []
    for keyword in keywords:
        escaped = re.escape(keyword)
        if len(keyword) <= _SHORT_KEYWORD_MAX_LEN:
            escaped += r's?(?![a-z])'
        alternatives.append(escaped)
    return re.compile(r'(?<![a-z0-9])(?:' + '|'.join(alternatives) + r')')

# Compiled once at import time from TOPIC_KEYWORDS; rebuild this mapping if
# TOPIC_KEYWORDS is ever changed at runtime.
_TOPIC_PATTERNS = {
    topic: _compile_topic_pattern(keywords)
    for topic, keywords in TOPIC_KEYWORDS.items()
}

def detect_topics(title: str, excerpt: str = '') -> List[str]:
    """Detect topics from title and excerpt.

    Args:
        title: Story title; None is treated as empty.
        excerpt: Optional body text searched together with the title.

    Returns:
        At most three topic names, in TOPIC_KEYWORDS declaration order.
        Note that common English words that are also keywords (e.g. "go")
        still match when they appear as standalone words.
    """
    text = f"{title or ''} {excerpt or ''}".lower()
    topics = [
        topic for topic, pattern in _TOPIC_PATTERNS.items()
        if pattern.search(text)
    ]
    return topics[:3]  # Max 3 topics

def get_topic_color(topic: str) -> str:
    """Return the hex colour used to render a topic tag.

    Topics without a dedicated colour get the default light blue.
    """
    palette = {
        'AI/LLMs': '#00d2ff',
        'Coding': '#a29bfe',
        'Home Automation': '#20b47a',
        'Self-Hosting': '#ff9f43',
        'Hardware': '#ff6b6b',
        'Privacy': '#fd79a8',
        'OpenClaw': '#ee5a24',
    }
    if topic in palette:
        return palette[topic]
    return '#74b9ff'

def estimate_read_time(url: str, excerpt: str = '') -> str:
    """Return a rough "<n> min read" label for a story.

    The estimate starts from the excerpt's word count (3 minutes when there
    is no excerpt) and is then adjusted by URL: GitHub links get one minute
    less (never below 2), YouTube links are always 10 minutes.
    """
    # Tier by excerpt length; anything between 50 and 200 words keeps the default.
    if not excerpt:
        minutes = 3
    else:
        words = len(excerpt.split())
        if words > 500:
            minutes = 8
        elif words > 200:
            minutes = 5
        elif words < 50:
            minutes = 2
        else:
            minutes = 3

    lowered_url = url.lower()
    # 'github.com' is also a substring of 'gist.github.com', so one check covers both.
    if 'github.com' in lowered_url:
        minutes = max(2, minutes - 1)  # GitHub tends to be code-heavy
    elif 'youtube.com' in lowered_url or 'youtu.be' in lowered_url:
        minutes = 10  # Videos take longer

    return f"{minutes} min read"

def _engagement(story: Dict) -> Tuple[float, float]:
    """Return (score, comments) for a story, treating missing or null values as 0.

    The score is read from 'score', falling back to 'points' when 'score'
    is absent, null or 0.
    """
    score = story.get('score') or story.get('points') or 0
    comments = story.get('num_comments') or 0
    return score, comments

def is_trending(story: Dict) -> bool:
    """Detect if a story is trending (high engagement ratio).

    A story is trending when either:
      - score > 200 (simply popular), or
      - score > 50 AND comments > 20 AND comments/score > 0.3 (hot discussion).

    Stories with a zero, negative, missing or null score are never trending.
    """
    score, comments = _engagement(story)

    # Guard clause also protects the division below.
    if score <= 0:
        return False

    if score > 200:
        return True

    # High comment-to-score ratio = hot discussion
    return score > 50 and comments > 20 and comments / score > 0.3

def get_trending_emoji(story: Dict) -> str:
    """Get the trending indicator for a story.

    Returns "🔥🔥" when score > 500 or comments > 100, "🔥" when the story is
    otherwise trending according to is_trending, and "" in all other cases.
    """
    score, comments = _engagement(story)

    if score > 500 or comments > 100:
        return "🔥🔥"  # Very hot
    if is_trending(story):
        return "🔥"  # Trending
    return ""

def generate_quick_reply_links(story: Dict, story_id: str) -> str:
    """Generate quick action links for Telegram/Discord.

    Args:
        story: The story the actions belong to. Not read here; kept so the
            signature stays compatible with existing callers.
        story_id: Identifier embedded in the bot deep links.

    Returns:
        An HTML table containing "Summarize" and "Save" buttons.
    """
    # Create deep links for quick actions
    # These would need corresponding bot handlers
    summarize_link = f"https://t.me/openclaw_bot?start=summarize_{story_id}"
    save_link = f"https://t.me/openclaw_bot?start=save_{story_id}"

    return f'''<table role="presentation" cellspacing="0" cellpadding="0" border="0" style="margin-top:12px;">
    <tr>
        <td style="padding-right:8px;">
            <a href="{summarize_link}" style="display:inline-block;padding:6px 12px;background-color:#2d3436;color:#74b9ff;text-decoration:none;border-radius:6px;font-size:12px;font-weight:600;border:1px solid #3d4446;">📝 Summarize</a>
        </td>
        <td>
            <a href="{save_link}" style="display:inline-block;padding:6px 12px;background-color:#2d3436;color:#20b47a;text-decoration:none;border-radius:6px;font-size:12px;font-weight:600;border:1px solid #3d4446;">💾 Save</a>
        </td>
    </tr>
</table>'''

def deduplicate_stories(items: List[Dict]) -> List[Dict]:
    """Remove duplicate stories based on URL and title similarity.

    An item is dropped when:
      - its URL (lowercased, query string removed) was already kept, or
      - its title shares more than 80% of its words with an already kept
        title (overlap is measured against the larger of the two word sets).

    Items without a URL are only compared by title; a missing URL is never
    treated as "the same URL" as another missing URL. Null 'url' and 'title'
    values are treated as empty.

    Args:
        items: Story dicts; only 'url' and 'title' are read.

    Returns:
        The kept items, in their original order (first occurrence wins).
    """
    seen_urls = set()
    unique = []
    # Word sets of the kept titles, parallel to `unique`, so each title is
    # tokenised once instead of on every comparison.
    kept_title_words = []

    for item in items:
        url = (item.get('url') or '').lower().split('?')[0]
        if url and url in seen_urls:
            continue

        title_words = set((item.get('title') or '').lower().split())
        is_duplicate = bool(title_words) and any(
            existing_words
            and len(title_words & existing_words)
            / max(len(title_words), len(existing_words)) > 0.8
            for existing_words in kept_title_words
        )
        if is_duplicate:
            continue

        if url:
            seen_urls.add(url)
        unique.append(item)
        kept_title_words.append(title_words)

    return unique

def score_relevance(item: Dict) -> float:
    """Score story relevance with topic boost.

    Scoring:
      - Items with both 'score' and 'num_comments' keys:
        score * 0.5 + comments * 1.5 + upvote_ratio * 50 (ratio defaults to 0.5).
      - Otherwise, items with a 'points' key: points * 1.0 + comments * 2.0.
      - Otherwise a flat base of 50.
      - +100 when comments > 50 or points > 100.
      - +50 for each detected topic that is in TOP_TOPICS.

    Null numeric fields are treated like missing ones. Topics are detected
    from the title plus the first 200 characters of 'selftext', falling back
    to 'summary' so items that only carry a summary get the same topic boost
    as the tags the HTML formatters show for them.

    Args:
        item: Story dict.

    Returns:
        Relevance score; higher sorts first.
    """
    score = 0.0

    comments = item.get('num_comments') or 0
    points = item.get('points') or 0

    # Base engagement (branching is on key presence, not on value)
    if 'score' in item and 'num_comments' in item:
        upvote_ratio = item.get('upvote_ratio')
        if upvote_ratio is None:
            upvote_ratio = 0.5
        score += (item.get('score') or 0) * 0.5
        score += comments * 1.5
        score += upvote_ratio * 50
    elif 'points' in item:
        score += points * 1.0
        score += comments * 2.0
    else:
        score = 50.0

    # Boost for high engagement
    if comments > 50 or points > 100:
        score += 100

    # Boost for user's top topics
    body = item.get('selftext') or item.get('summary') or ''
    topics = detect_topics(item.get('title') or '', body[:200])
    for topic in topics:
        if topic in TOP_TOPICS:
            score += 50  # Significant boost for preferred topics

    return score

def format_topic_tags(topics: List[str]) -> str:
    """Render topic names as a paragraph of inline HTML badges.

    Topics listed in TOP_TOPICS get a coloured border and a leading star.
    Returns an empty string when there are no topics.
    """
    if not topics:
        return ""

    def render_badge(topic: str) -> str:
        color = get_topic_color(topic)
        if topic in TOP_TOPICS:
            border = f"border:1px solid {color};"
            star = "★ "
        else:
            border = ""
            star = ""
        return f'<span style="display:inline-block;font-size:10px;font-weight:600;text-transform:uppercase;letter-spacing:0.5px;padding:3px 8px;border-radius:4px;background-color:rgba(255,255,255,0.05);color:{color};margin-right:6px;margin-bottom:6px;{border}">{star}{topic}</span>'

    badges = "".join([render_badge(topic) for topic in topics])
    return f'<p style="margin:0 0 10px 0;">{badges}</p>'

def format_reddit_story(story: Dict, include_quick_actions: bool = False) -> str:
    """Format a Reddit story as an HTML card with all enhancements.

    Fields read from `story`: 'title', 'selftext', 'url', 'author', 'score',
    'num_comments' and 'link_flair_text'. Missing or null fields are treated
    as empty. All story-supplied text is HTML-escaped before being embedded.

    Args:
        story: Reddit post dict.
        include_quick_actions: When True, append the Summarize/Save buttons.

    Returns:
        The HTML for one story card.
    """
    def esc(value: Any) -> str:
        # Story text comes from an external site, so it must not be able to
        # inject markup. Unescaping first keeps already entity-encoded input
        # (e.g. "&amp;") from being double-encoded.
        # NOTE(review): whether the fetcher returns raw or entity-encoded
        # text is not visible here — confirm.
        return html.escape(html.unescape(str(value)), quote=True)

    body = story.get('selftext') or ''
    url = story.get('url') or ''
    title = story.get('title') or ''

    # Detect topics
    excerpt = body[:200]
    topic_html = format_topic_tags(detect_topics(title, excerpt))

    # Read time: estimated from the full body, because the 200-character
    # excerpt is always too short to reach the longer read-time tiers.
    read_time = estimate_read_time(url, body)

    # Trending indicator
    trending = get_trending_emoji(story)

    # Engagement badges
    engagement = []
    if story.get('score'):
        engagement.append(f"<span style='color:#ff6b6b;font-weight:600;'>↑ {esc(story['score'])}</span>")
    if story.get('num_comments'):
        engagement.append(f"<span style='color:#74b9ff;font-weight:600;'>💬 {esc(story['num_comments'])}</span>")
    engagement.append(f"<span style='color:#888;'>⏱️ {read_time}</span>")
    engagement_html = ' · '.join(engagement)

    # Title with flair
    flair = story.get('link_flair_text') or ''
    if flair:
        title = f"[{flair}] {title}"
    safe_title = esc(title)
    safe_author = esc(story.get('author') or 'unknown')
    href = esc(url or '#')

    # Story hash for quick actions
    story_hash = hashlib.md5(url.encode()).hexdigest()[:8]
    quick_actions = generate_quick_reply_links(story, story_hash) if include_quick_actions else ""

    # Trim excerpt
    if len(body) > 200:
        excerpt += "..."
    excerpt_html = f"<p style='font-size:14px;line-height:1.6;color:#aaa;margin:12px 0 0 0;'>{esc(excerpt)}</p>" if excerpt else ""

    trending_html = f"<span style='margin-right:8px;'>{trending}</span>" if trending else ""

    return f'''<table role="presentation" cellspacing="0" cellpadding="0" border="0" width="100%" style="background-color:#1a1a2e;border-radius:12px;margin-bottom:16px;border:1px solid #2a2a3e;">
    <tr><td style="padding:20px;">
        <table role="presentation" cellspacing="0" cellpadding="0" border="0" width="100%">
            <tr>
                <td style="padding-bottom:10px;">
                    <span style="display:inline-block;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:0.5px;padding:6px 12px;border-radius:6px;background-color:rgba(255,69,0,0.15);color:#ff6b6b;">Reddit</span>
                </td>
            </tr>
        </table>
        {topic_html}
        <h3 style="font-size:17px;font-weight:600;line-height:1.5;color:#fff;margin:0 0 8px 0;">{trending_html}<a href="{href}" style="color:#74b9ff;text-decoration:none;">{safe_title}</a></h3>
        <p style="font-size:13px;color:#888;margin:0 0 8px 0;"><span style="color:#a29bfe;font-weight:500;">u/{safe_author}</span></p>
        {excerpt_html}
        <p style='font-size:13px;color:#888;margin:12px 0 0 0;'>{engagement_html}</p>
        {quick_actions}
    </td></tr>
</table>'''

def format_news_story(story: Dict, include_quick_actions: bool = False) -> str:
    """Format a news story as an HTML card with all enhancements.

    Fields read from `story`: 'title', 'summary', 'url', 'source', 'points'
    and 'num_comments'. Missing or null fields are treated as empty. All
    story-supplied text is HTML-escaped before being embedded.

    Args:
        story: News item dict.
        include_quick_actions: When True, append the Summarize/Save buttons.

    Returns:
        The HTML for one story card.
    """
    def esc(value: Any) -> str:
        # Story text comes from external sites, so it must not be able to
        # inject markup. Unescaping first keeps already entity-encoded input
        # (e.g. "&amp;") from being double-encoded.
        # NOTE(review): if summaries contain HTML markup, the tags will now
        # be shown as text instead of rendered — confirm what the fetcher
        # returns.
        return html.escape(html.unescape(str(value)), quote=True)

    body = story.get('summary') or ''
    url = story.get('url') or ''
    title = story.get('title') or ''

    # Detect topics
    excerpt = body[:200]
    topic_html = format_topic_tags(detect_topics(title, excerpt))

    # Read time: estimated from the full summary, because the 200-character
    # excerpt is always too short to reach the longer read-time tiers.
    read_time = estimate_read_time(url, body)

    # Trending indicator
    trending = get_trending_emoji(story)

    # Source styling (colours are looked up by the raw source name)
    source = story.get('source') or 'News'
    tag_colors = {
        'GitHub': ('#a29bfe', 'rgba(139,148,158,0.15)'),
        'Hacker News': ('#ff9f43', 'rgba(255,102,0,0.15)'),
    }
    tag_color, tag_bg = tag_colors.get(source, ('#74b9ff', 'rgba(116,185,255,0.15)'))
    safe_source = esc(source)

    # Engagement
    engagement = []
    if story.get('points'):
        engagement.append(f"<span style='color:#ff6b6b;font-weight:600;'>↑ {esc(story['points'])}</span>")
    if story.get('num_comments'):
        engagement.append(f"<span style='color:#74b9ff;font-weight:600;'>💬 {esc(story['num_comments'])}</span>")
    engagement.append(f"<span style='color:#888;'>⏱️ {read_time}</span>")
    engagement_html = ' · '.join(engagement)

    safe_title = esc(title)
    href = esc(url or '#')

    # Story hash for quick actions
    story_hash = hashlib.md5(url.encode()).hexdigest()[:8]
    quick_actions = generate_quick_reply_links(story, story_hash) if include_quick_actions else ""

    # Trim excerpt
    if len(body) > 200:
        excerpt += "..."
    excerpt_html = f"<p style='font-size:14px;line-height:1.6;color:#aaa;margin:12px 0 0 0;'>{esc(excerpt)}</p>" if excerpt else ""

    trending_html = f"<span style='margin-right:8px;'>{trending}</span>" if trending else ""

    return f'''<table role="presentation" cellspacing="0" cellpadding="0" border="0" width="100%" style="background-color:#1a1a2e;border-radius:12px;margin-bottom:16px;border:1px solid #2a2a3e;">
    <tr><td style="padding:20px;">
        <table role="presentation" cellspacing="0" cellpadding="0" border="0" width="100%">
            <tr>
                <td style="padding-bottom:10px;">
                    <span style="display:inline-block;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:0.5px;padding:6px 12px;border-radius:6px;background-color:{tag_bg};color:{tag_color};">{safe_source}</span>
                </td>
            </tr>
        </table>
        {topic_html}
        <h3 style="font-size:17px;font-weight:600;line-height:1.5;color:#fff;margin:0 0 8px 0;">{trending_html}<a href="{href}" style="color:#74b9ff;text-decoration:none;">{safe_title}</a></h3>
        {excerpt_html}
        <p style='font-size:13px;color:#888;margin:12px 0 0 0;'>{engagement_html}</p>
        {quick_actions}
    </td></tr>
</table>'''

def format_top_topics_section() -> str:
    """Build the "Your Top Topics" HTML panel for the email.

    Renders one pill-shaped badge for each of the first four TOP_TOPICS
    entries, coloured with that topic's tag colour.
    """
    def render_badge(topic: str) -> str:
        color = get_topic_color(topic)
        return f'<span style="display:inline-block;font-size:12px;font-weight:600;padding:6px 12px;border-radius:20px;background-color:rgba(255,255,255,0.08);color:{color};margin-right:8px;border:1px solid {color}40;">★ {topic}</span>'

    # Show top 4
    badges_html = ''.join([render_badge(topic) for topic in TOP_TOPICS[:4]])

    return f'''<table role="presentation" cellspacing="0" cellpadding="0" border="0" width="100%" style="background-color:rgba(255,255,255,0.03);border-radius:12px;margin-bottom:20px;border:1px solid #2a2a3e;">
    <tr><td style="padding:16px 20px;">
        <p style="font-size:12px;color:#888;text-transform:uppercase;letter-spacing:1px;margin:0 0 10px 0;font-weight:600;">Your Top Topics</p>
        <p style="margin:0;">{badges_html}</p>
    </td></tr>
</table>'''

def format_story_text(story: Dict) -> str:
    """Format a story for plain-text email.

    The story body is taken from 'selftext', falling back to 'summary', and
    that same body drives topic detection, the read-time estimate and the
    excerpt line. Topics use the first 200 characters of the body, matching
    the HTML formatters, so the text and HTML parts of an email agree.
    Missing or null fields are treated as empty.

    Args:
        story: Story dict ('title', 'url', 'author', 'score' or 'points',
            'num_comments', 'selftext' or 'summary').

    Returns:
        A multi-line block ending with a blank line.
    """
    title = story.get('title') or ''
    url = story.get('url') or ''
    body = story.get('selftext') or story.get('summary') or ''

    lines = [f"📌 {title}"]

    # Add topics
    topics = detect_topics(title, body[:200])
    if topics:
        lines.append(f"   Topics: {', '.join(topics)}")

    lines.append(f"   Link: {url}")

    if story.get('author'):
        lines.append(f"   Author: {story.get('author')}")

    # Engagement stats (the read time is always present, so the line is
    # always emitted)
    stats = []
    votes = story.get('score') or story.get('points')
    if votes:
        stats.append(f"{votes} upvotes")
    if story.get('num_comments'):
        stats.append(f"{story.get('num_comments')} comments")
    stats.append(estimate_read_time(url, body))
    lines.append(f"   {' | '.join(stats)}")

    # Trending
    if is_trending(story):
        lines.append("   🔥 TRENDING")

    if body:
        excerpt = body[:150] + "..." if len(body) > 150 else body
        lines.append(f"   {excerpt}")

    lines.append("")
    return "\n".join(lines)

def aggregate_content(hours: int = 24) -> Dict[str, Any]:
    """Main aggregation function with enhanced features.

    Fetches Reddit and news content, merges and de-duplicates it, ranks it
    with score_relevance, keeps the top 8 items per source and renders the
    HTML and plain-text sections of the digest. Progress is printed to stdout.

    Args:
        hours: Size of the look-back window passed to both fetchers.

    Returns:
        A dict with the keys 'meta' (generation timestamp, window, date),
        'stats' (counts), 'content' (the fetchers' return values plus a
        Twitter placeholder), 'formatted' (rendered HTML/text sections) and
        'user_preferences'.
    """
    print(f"🦀 Aggregating OpenClaw content from last {hours} hours...")
    print("=" * 50)

    # Fetch from all sources
    print("\n📥 Fetching Reddit content...")
    reddit_data = fetch_reddit_content(hours=hours)

    print("\n📥 Fetching news content...")
    news_data = fetch_news_content(hours=hours)

    # Twitter placeholder
    twitter_data = {
        "source": "twitter",
        "total_items": 0,
        "tweets": [],
        "note": "X/Twitter integration requires API setup"
    }

    # Combine all items, tagging each with its origin so the merged list can
    # be split back into sections after ranking
    all_items = []
    all_items.extend([{**item, '_source': 'reddit'} for item in reddit_data.get('all_posts', [])])
    all_items.extend([{**item, '_source': 'news'} for item in news_data.get('all_items', [])])

    # Deduplicate
    print("\n🧹 Deduplicating stories...")
    unique_items = deduplicate_stories(all_items)
    print(f"   Removed {len(all_items) - len(unique_items)} duplicates")

    # Sort by relevance (includes topic boosting)
    unique_items.sort(key=score_relevance, reverse=True)

    # Split back into sections
    reddit_top = [item for item in unique_items if item.get('_source') == 'reddit'][:8]
    news_top = [item for item in unique_items if item.get('_source') == 'news'][:8]

    # Count trending
    trending_count = sum(1 for item in unique_items if is_trending(item))

    # Generate HTML sections
    top_topics_html = format_top_topics_section()
    reddit_html = '\n'.join([format_reddit_story(s, include_quick_actions=True) for s in reddit_top])
    news_html = '\n'.join([format_news_story(s, include_quick_actions=True) for s in news_top])
    twitter_html = '<p style="text-align:center;color:#888;padding:30px 0;">🚧 X/Twitter integration coming soon</p>'

    # Generate text sections
    reddit_text = '\n'.join([format_story_text(s) for s in reddit_top]) if reddit_top else "No new Reddit posts today."
    news_text = '\n'.join([format_story_text(s) for s in news_top]) if news_top else "No new news articles today."
    twitter_text = "🚧 X/Twitter integration coming soon - requires API setup\n"

    # One UTC timestamp for both meta fields so they cannot disagree when the
    # run crosses midnight. tzinfo is dropped to keep the naive ISO format
    # that datetime.utcnow() (deprecated since Python 3.12) produced.
    generated_at = datetime.now(timezone.utc).replace(tzinfo=None)

    # Build result
    result = {
        "meta": {
            "generated_at": generated_at.isoformat(),
            "time_window_hours": hours,
            "date": generated_at.strftime("%A, %B %d, %Y")
        },
        "stats": {
            "reddit_count": reddit_data.get('total_posts', 0),
            "news_count": news_data.get('total_items', 0),
            "twitter_count": 0,
            "total_unique": len(unique_items),
            "trending_count": trending_count
        },
        "content": {
            "reddit": reddit_data,
            "news": news_data,
            "twitter": twitter_data
        },
        "formatted": {
            "top_topics_html": top_topics_html,
            "reddit_html": reddit_html,
            "news_html": news_html,
            "twitter_html": twitter_html,
            "reddit_text": reddit_text,
            "news_text": news_text,
            "twitter_text": twitter_text
        },
        "user_preferences": {
            "top_topics": TOP_TOPICS
        }
    }

    print("\n" + "=" * 50)
    print("✅ Aggregation complete!")
    print(f"   Reddit posts: {result['stats']['reddit_count']}")
    print(f"   News items: {result['stats']['news_count']}")
    print(f"   Trending: {result['stats']['trending_count']}")
    print(f"   Total unique: {result['stats']['total_unique']}")

    return result

if __name__ == "__main__":
    # Usage: aggregate.py [hours] [output_file]
    hours = int(sys.argv[1]) if len(sys.argv) > 1 else 24
    output_file = sys.argv[2] if len(sys.argv) > 2 else "/home/openclaw/.openclaw/workspace/automations/openclaw-digest/output/digest.json"

    result = aggregate_content(hours=hours)

    # Create the destination directory if needed, so a first run (or a custom
    # path) does not fail after all the fetching work has been done.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)

    print(f"\n📄 Output saved to: {output_file}")