Files
openclaw-backups/automations/openclaw-digest/aggregate.py.backup

258 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Content Aggregation Script for OpenClaw Daily Digest
Unifies Reddit, News, and Twitter content into structured JSON
"""
import html
import json
import os
import sys
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Any
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'sources'))
from reddit_fetcher import fetch_reddit_content
from news_fetcher import fetch_news_content
def deduplicate_stories(items: List[Dict]) -> List[Dict]:
    """Remove duplicate stories by normalized URL and by title similarity.

    An item is dropped when its normalized URL (lower-cased, query string
    removed) equals that of an already-kept item, or when its title shares
    more than 80% of its words with an already-kept title. The first
    occurrence wins and input order is preserved.

    Items whose URL is missing, ``None`` or empty are never matched against
    each other by URL (otherwise every URL-less item after the first would
    be discarded); they are still subject to the title check.

    Args:
        items: Story dicts. Only the optional 'url' and 'title' keys are read.

    Returns:
        A new list containing the retained items (the same dict objects).
    """
    seen_urls = set()
    unique = []
    # Word sets of the kept items' titles, parallel to `unique`, so every
    # title is tokenized once instead of once per comparison.
    kept_title_words = []

    for item in items:
        # `or ''` also covers keys that are present but set to None.
        url = (item.get('url') or '').lower().split('?')[0]  # Normalize URL
        if url and url in seen_urls:
            continue

        title_words = set((item.get('title') or '').lower().split())

        is_duplicate = False
        if title_words:
            for existing_words in kept_title_words:
                if not existing_words:
                    continue
                # Shared words relative to the longer of the two titles.
                overlap = len(title_words & existing_words) / max(len(title_words), len(existing_words))
                if overlap > 0.8:
                    is_duplicate = True
                    break

        if not is_duplicate:
            if url:
                seen_urls.add(url)
            unique.append(item)
            kept_title_words.append(title_words)

    return unique
def score_relevance(item: Dict) -> float:
    """Score a story's relevance for ranking (higher ranks first).

    The formula is chosen by which keys are present:

    * 'score' and 'num_comments' (Reddit-style):
      ``score * 0.5 + num_comments * 1.5 + upvote_ratio * 50``
      with ``upvote_ratio`` defaulting to 0.5.
    * 'points' (Hacker News-style): ``points * 1.0 + num_comments * 2.0``.
    * otherwise: a flat base of 50.0.

    A further 100 is added when the item has more than 50 comments or more
    than 100 points.

    Values that are present but ``None`` are treated the same as missing
    values, so such an item gets a score instead of raising ``TypeError``.

    Args:
        item: Story dict; reads 'score', 'num_comments', 'upvote_ratio'
            and 'points' when available.

    Returns:
        The relevance score as a float.
    """
    def _num(key: str, default: float = 0) -> float:
        # Missing key and explicit None both fall back to the default.
        value = item.get(key)
        return default if value is None else value

    num_comments = _num('num_comments')
    points = _num('points')

    # Base engagement score
    if 'score' in item and 'num_comments' in item:
        # Reddit-style scoring
        score = _num('score') * 0.5 + num_comments * 1.5 + _num('upvote_ratio', 0.5) * 50
    elif 'points' in item:
        # Hacker News scoring
        score = points * 1.0 + num_comments * 2.0
    else:
        # Default: news articles get medium base score
        score = 50.0

    # Boost for high-engagement content
    if num_comments > 50 or points > 100:
        score += 100
    return score
def format_reddit_story(story: Dict) -> str:
    """Format a Reddit story for v2 HTML email - Spark-safe inline styles.

    Every value taken from the story (title, flair, author, excerpt, URL and
    the engagement counters) is HTML-escaped before interpolation, because
    it originates from user-submitted Reddit content and must not be able
    to inject markup into the email.

    Args:
        story: Reddit post dict. Reads the optional keys 'title', 'url',
            'author', 'selftext', 'link_flair_text', 'score' and
            'num_comments'. Missing or ``None`` values fall back to an
            empty string ('#' for the URL, 'unknown' for the author).

    Returns:
        An HTML ``<table>`` card. The excerpt is limited to the first 150
        characters of 'selftext' followed by "..." when truncated; the
        excerpt and engagement rows are omitted when empty.
    """
    upvotes = story.get('score')
    comment_count = story.get('num_comments')
    engagement = []
    if upvotes:
        engagement.append(f"<span style='color:#ff6b6b;font-weight:600;'>↑ {html.escape(str(upvotes))}</span>")
    if comment_count:
        engagement.append(f"<span style='color:#74b9ff;font-weight:600;'>💬 {html.escape(str(comment_count))}</span>")

    # Shorter excerpt for cleaner look. Truncate the raw text first and
    # escape afterwards so an HTML entity is never cut in half.
    selftext = story.get('selftext') or ''
    excerpt = selftext[:150]
    if len(selftext) > 150:
        excerpt += "..."
    excerpt = html.escape(excerpt)

    flair = story.get('link_flair_text') or ''
    title = story.get('title') or ''
    if flair:
        title = f"[{flair}] {title}"
    title = html.escape(title)

    # html.escape also escapes quotes, so the URL cannot break out of href="...".
    # NOTE(review): the URL scheme is not validated here (e.g. javascript:).
    url = html.escape(str(story.get('url') or '#'))
    author = html.escape(str(story.get('author') or 'unknown'))

    engagement_html = f"<p style='font-size:13px;color:#888;margin:12px 0 0 0;'>{' · '.join(engagement)}</p>" if engagement else ""
    excerpt_html = f"<p style='font-size:14px;line-height:1.6;color:#aaa;margin:12px 0 0 0;'>{excerpt}</p>" if excerpt else ""
    return f'''<table role="presentation" cellspacing="0" cellpadding="0" border="0" width="100%" style="background-color:#1a1a2e;border-radius:12px;margin-bottom:16px;border:1px solid #2a2a3e;">
<tr><td style="padding:20px;">
<span style="display:inline-block;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:0.5px;padding:6px 12px;border-radius:6px;background-color:rgba(255,69,0,0.15);color:#ff6b6b;margin-bottom:14px;">Reddit</span>
<h3 style="font-size:17px;font-weight:600;line-height:1.5;color:#fff;margin:0 0 12px 0;"><a href="{url}" style="color:#74b9ff;text-decoration:none;">{title}</a></h3>
<p style="font-size:14px;color:#888;margin:0;"><span style="color:#a29bfe;font-weight:500;">u/{author}</span></p>
{excerpt_html}
{engagement_html}
</td></tr>
</table>'''
def format_news_story(story: Dict) -> str:
    """Format a news story for v2 HTML email - Spark-safe inline styles.

    The source tag is coloured by origin: sources containing "GitHub" and
    sources containing "Hacker" each get their own colours, everything else
    uses the default blue. Every value taken from the story (source, title,
    excerpt, URL and engagement counters) is HTML-escaped before
    interpolation so externally fetched text cannot inject markup.

    Args:
        story: News item dict. Reads the optional keys 'source', 'title',
            'url', 'summary', 'points' and 'num_comments'. Missing or
            ``None`` values fall back to an empty string ('News' for the
            source, '#' for the URL).

    Returns:
        An HTML ``<table>`` card. The excerpt is limited to the first 150
        characters of 'summary' followed by "..." when truncated; the
        excerpt and engagement rows are omitted when empty.
    """
    source = str(story.get('source') or 'News')
    tag_color = '#74b9ff'
    tag_bg = 'rgba(116,185,255,0.15)'
    if 'GitHub' in source:
        tag_color = '#a29bfe'
        tag_bg = 'rgba(139,148,158,0.15)'
    elif 'Hacker' in source:
        tag_color = '#ff9f43'
        tag_bg = 'rgba(255,102,0,0.15)'

    points = story.get('points')
    comment_count = story.get('num_comments')
    engagement = []
    if points:
        engagement.append(f"<span style='color:#ff6b6b;font-weight:600;'>↑ {html.escape(str(points))}</span>")
    if comment_count:
        engagement.append(f"<span style='color:#74b9ff;font-weight:600;'>💬 {html.escape(str(comment_count))}</span>")

    # Shorter excerpt for cleaner look. Truncate the raw text first and
    # escape afterwards so an HTML entity is never cut in half.
    summary = story.get('summary') or ''
    excerpt = summary[:150]
    if len(summary) > 150:
        excerpt += "..."
    excerpt = html.escape(excerpt)

    # Escape only after the colour selection above, which matches on raw text.
    source_label = html.escape(source)
    title = html.escape(str(story.get('title') or ''))
    # html.escape also escapes quotes, so the URL cannot break out of href="...".
    # NOTE(review): the URL scheme is not validated here (e.g. javascript:).
    url = html.escape(str(story.get('url') or '#'))

    engagement_html = f"<p style='font-size:13px;color:#888;margin:12px 0 0 0;'>{' · '.join(engagement)}</p>" if engagement else ""
    excerpt_html = f"<p style='font-size:14px;line-height:1.6;color:#aaa;margin:12px 0 0 0;'>{excerpt}</p>" if excerpt else ""
    return f'''<table role="presentation" cellspacing="0" cellpadding="0" border="0" width="100%" style="background-color:#1a1a2e;border-radius:12px;margin-bottom:16px;border:1px solid #2a2a3e;">
<tr><td style="padding:20px;">
<span style="display:inline-block;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:0.5px;padding:6px 12px;border-radius:6px;background-color:{tag_bg};color:{tag_color};margin-bottom:14px;">{source_label}</span>
<h3 style="font-size:17px;font-weight:600;line-height:1.5;color:#fff;margin:0 0 12px 0;"><a href="{url}" style="color:#74b9ff;text-decoration:none;">{title}</a></h3>
{excerpt_html}
{engagement_html}
</td></tr>
</table>'''
def format_story_text(story: Dict) -> str:
    """Render one story as a block of lines for the plain-text email.

    The block always starts with the title and link lines. Author, score,
    comment count and an excerpt follow only when the story carries a
    truthy value for them. The score comes from 'score', falling back to
    'points'; the excerpt comes from 'selftext', falling back to 'summary',
    and is cut to 150 characters plus "..." when longer.

    Args:
        story: Story dict from any source.

    Returns:
        The lines joined by newlines, ending in a newline so consecutive
        stories are separated by a blank line.
    """
    out = [f"📌 {story.get('title', '')}"]
    out.append(f" Link: {story.get('url', '')}")

    author = story.get('author')
    if author:
        out.append(f" Author: {author}")

    # Reddit items carry 'score', Hacker News items carry 'points'.
    votes = story.get('score') or story.get('points')
    if votes:
        out.append(f" Score: {votes} upvotes")

    comment_count = story.get('num_comments')
    if comment_count:
        out.append(f" Comments: {comment_count}")

    body = story.get('selftext', '') or story.get('summary', '')
    if body:
        if len(body) > 150:
            body = body[:150] + "..."
        out.append(f" {body}")

    # Trailing empty entry yields the blank line between stories.
    out.append("")
    return "\n".join(out)
def aggregate_content(hours: int = 24) -> Dict[str, Any]:
    """Fetch, deduplicate, rank and format digest content from all sources.

    Steps:
      1. Fetch Reddit and news data for the requested time window.
      2. Tag each item with its '_source' and deduplicate across sources.
      3. Sort the unique items by relevance and keep the top 8 per source.
      4. Render HTML and plain-text sections for each source.

    Twitter is a static placeholder. Progress is printed to stdout.

    Args:
        hours: Size of the look-back window, passed to both fetchers.

    Returns:
        A dict with four sections: 'meta' (generation timestamp, window,
        human-readable date), 'stats' (per-source counts and the number of
        unique items), 'content' (the raw fetcher results plus the Twitter
        placeholder) and 'formatted' (HTML and text per source).
    """
    print(f"🦀 Aggregating OpenClaw content from last {hours} hours...")
    print("=" * 50)

    # Fetch from all sources
    print("\n📥 Fetching Reddit content...")
    reddit_data = fetch_reddit_content(hours=hours)
    print("\n📥 Fetching news content...")
    news_data = fetch_news_content(hours=hours)

    # Twitter placeholder (will be added when API is configured)
    twitter_data = {
        "source": "twitter",
        "total_items": 0,
        "tweets": [],
        "note": "X/Twitter integration requires API setup"
    }

    # Combine all items for deduplication; '_source' lets us split them again.
    all_items = []
    all_items.extend([{**item, '_source': 'reddit'} for item in reddit_data.get('all_posts', [])])
    all_items.extend([{**item, '_source': 'news'} for item in news_data.get('all_items', [])])

    # Deduplicate
    print("\n🧹 Deduplicating stories...")
    unique_items = deduplicate_stories(all_items)
    print(f" Removed {len(all_items) - len(unique_items)} duplicates")

    # Sort by relevance score
    unique_items.sort(key=score_relevance, reverse=True)

    # Split back into sections
    reddit_top = [item for item in unique_items if item.get('_source') == 'reddit'][:8]
    news_top = [item for item in unique_items if item.get('_source') == 'news'][:8]

    # Generate HTML sections
    reddit_html = '\n'.join([format_reddit_story(s) for s in reddit_top])
    news_html = '\n'.join([format_news_story(s) for s in news_top])
    twitter_html = '<p style="text-align:center;color:#888;padding:30px 0;">🚧 X/Twitter integration coming soon</p>'

    # Generate text sections
    reddit_text = '\n'.join([format_story_text(s) for s in reddit_top]) if reddit_top else "No new Reddit posts today."
    news_text = '\n'.join([format_story_text(s) for s in news_top]) if news_top else "No new news articles today."
    twitter_text = "🚧 X/Twitter integration coming soon - requires API setup\n"

    # Read the clock once so 'generated_at' and 'date' can never disagree
    # (e.g. across midnight). datetime.utcnow() is deprecated; the tzinfo is
    # dropped again so the serialized timestamp keeps its previous naive-UTC
    # format (no "+00:00" suffix).
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)

    # Build result
    result = {
        "meta": {
            "generated_at": now_utc.isoformat(),
            "time_window_hours": hours,
            "date": now_utc.strftime("%A, %B %d, %Y")
        },
        "stats": {
            "reddit_count": reddit_data.get('total_posts', 0),
            "news_count": news_data.get('total_items', 0),
            "twitter_count": 0,
            "total_unique": len(unique_items)
        },
        "content": {
            "reddit": reddit_data,
            "news": news_data,
            "twitter": twitter_data
        },
        "formatted": {
            "reddit_html": reddit_html,
            "news_html": news_html,
            "twitter_html": twitter_html,
            "reddit_text": reddit_text,
            "news_text": news_text,
            "twitter_text": twitter_text
        }
    }

    print("\n" + "=" * 50)
    print("✅ Aggregation complete!")
    print(f" Reddit posts: {result['stats']['reddit_count']}")
    print(f" News items: {result['stats']['news_count']}")
    print(f" Total unique: {result['stats']['total_unique']}")
    return result
if __name__ == "__main__":
    # Usage: aggregate.py [HOURS] [OUTPUT_FILE]
    # HOURS defaults to 24; OUTPUT_FILE defaults to the digest workspace path.
    hours = int(sys.argv[1]) if len(sys.argv) > 1 else 24
    output_file = sys.argv[2] if len(sys.argv) > 2 else "/home/openclaw/.openclaw/workspace/automations/openclaw-digest/output/digest.json"

    # Make sure the destination directory exists before doing any fetching,
    # so a first run (or a bad path) does not fail only after aggregation.
    # A bare file name has no directory part, hence the guard.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    result = aggregate_content(hours=hours)

    # Explicit encoding keeps the output independent of the platform locale.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)
    print(f"\n📄 Output saved to: {output_file}")