AI Newsletter Digest improvements: fixed QP soft line break decoding, URL extraction, and content cleaning
This commit is contained in:
257
automations/openclaw-digest/aggregate.py.backup
Normal file
257
automations/openclaw-digest/aggregate.py.backup
Normal file
@@ -0,0 +1,257 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Content Aggregation Script for OpenClaw Daily Digest
|
||||
Unifies Reddit, News, and Twitter content into structured JSON
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
from datetime import datetime, timedelta
|
||||
from typing import List, Dict, Any
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'sources'))
|
||||
|
||||
from reddit_fetcher import fetch_reddit_content
|
||||
from news_fetcher import fetch_news_content
|
||||
|
||||
def deduplicate_stories(items: List[Dict]) -> List[Dict]:
    """Remove duplicate stories based on URL and title similarity.

    Two passes are applied to every item, in input order:

    1. URL match - the URL is lower-cased and its query string dropped; an
       item whose normalised URL was already kept is discarded.
    2. Title match - an item is discarded when its title shares more than
       80% of its words with the title of an already-kept item (overlap is
       measured against the larger of the two word sets).

    Items without a URL are never URL-matched against each other: a missing
    URL carries no identity, so treating the empty string as a shared key
    would wrongly collapse every URL-less story into one.

    Args:
        items: Story dicts; only the optional ``url`` and ``title`` keys are
            read. ``None`` values are treated like missing keys.

    Returns:
        A new list containing the first occurrence of each distinct story,
        in the original order.
    """
    seen_urls = set()
    unique = []
    # Word sets of the kept items, parallel to ``unique``, so each title is
    # tokenised once instead of once per comparison.
    kept_title_words = []

    for item in items:
        url = (item.get('url') or '').lower().split('?')[0]  # Normalize URL

        # Skip if we've seen this (non-empty) URL
        if url and url in seen_urls:
            continue

        # Also check for title similarity
        title_words = set((item.get('title') or '').lower().split())
        is_duplicate = False
        if title_words:
            for existing_words in kept_title_words:
                if not existing_words:
                    continue
                # Simple similarity: if titles share 80%+ words
                overlap = len(title_words & existing_words) / max(len(title_words), len(existing_words))
                if overlap > 0.8:
                    is_duplicate = True
                    break

        if is_duplicate:
            continue

        if url:
            seen_urls.add(url)
        unique.append(item)
        kept_title_words.append(title_words)

    return unique
|
||||
|
||||
def score_relevance(item: Dict) -> float:
    """Compute a ranking score for a story; larger values rank first.

    The formula depends on which engagement fields the item carries:

    * both ``score`` and ``num_comments`` present (Reddit-style):
      ``score * 0.5 + num_comments * 1.5 + upvote_ratio * 50``, where a
      missing ``upvote_ratio`` counts as 0.5;
    * otherwise ``points`` present (Hacker News-style):
      ``points * 1.0 + num_comments * 2.0``;
    * otherwise a flat 50.0.

    A further 100 is added when the item has more than 50 comments or more
    than 100 points.

    Args:
        item: Story dict with optional numeric engagement fields.

    Returns:
        The relevance score as a float.
    """
    comments = item.get('num_comments', 0)
    points = item.get('points', 0)

    if 'score' in item and 'num_comments' in item:
        # Reddit-style weighting
        total = 0.0 + item['score'] * 0.5 + comments * 1.5 + item.get('upvote_ratio', 0.5) * 50
    elif 'points' in item:
        # Hacker News weighting
        total = 0.0 + points * 1.0 + comments * 2.0
    else:
        # Plain news articles start from a medium baseline
        total = 50.0

    # Extra weight for stories that attracted a lot of engagement
    if comments > 50 or points > 100:
        total += 100

    return total
|
||||
|
||||
def format_reddit_story(story: Dict) -> str:
    """Format a Reddit story for v2 HTML email - Spark-safe inline styles.

    Every story-supplied value (title, flair, author, excerpt, URL and the
    engagement counters) is HTML-escaped before it is interpolated, so text
    containing ``<``, ``>``, ``&`` or quotes cannot inject markup or break
    out of the ``href`` attribute.

    NOTE(review): if the fetcher already delivers HTML-escaped text, entities
    would now be double-escaped - confirm against the Reddit fetcher.

    Args:
        story: Story dict. Keys read (all optional): ``score``,
            ``num_comments``, ``selftext``, ``link_flair_text``, ``title``,
            ``url``, ``author``. ``None`` values are treated as missing.

    Returns:
        An HTML ``<table>`` fragment for one story card. The excerpt is
        limited to the first 150 characters of ``selftext`` followed by
        ``...`` when truncated; excerpt and engagement rows are omitted when
        empty.
    """
    from html import escape  # local import keeps this block self-contained

    def _esc(value) -> str:
        # quote=True also escapes quotes, which matters inside href="...".
        return escape(str(value), quote=True)

    engagement = []
    if story.get('score'):
        engagement.append(f"<span style='color:#ff6b6b;font-weight:600;'>↑ {_esc(story['score'])}</span>")
    if story.get('num_comments'):
        engagement.append(f"<span style='color:#74b9ff;font-weight:600;'>💬 {_esc(story['num_comments'])}</span>")

    # Shorter excerpt for cleaner look. Truncate the raw text first and
    # escape afterwards so an entity is never cut in half.
    selftext = story.get('selftext') or ''
    excerpt = selftext[:150]
    if len(selftext) > 150:
        excerpt += "..."
    excerpt = _esc(excerpt)

    flair = story.get('link_flair_text') or ''
    title = story.get('title') or ''
    if flair:
        title = f"[{flair}] {title}"
    title = _esc(title)

    url = _esc(story.get('url') or '#')
    author = _esc(story.get('author') or 'unknown')

    engagement_line = ' · '.join(engagement)
    engagement_html = f"<p style='font-size:13px;color:#888;margin:12px 0 0 0;'>{engagement_line}</p>" if engagement else ""
    excerpt_html = f"<p style='font-size:14px;line-height:1.6;color:#aaa;margin:12px 0 0 0;'>{excerpt}</p>" if excerpt else ""

    return f'''<table role="presentation" cellspacing="0" cellpadding="0" border="0" width="100%" style="background-color:#1a1a2e;border-radius:12px;margin-bottom:16px;border:1px solid #2a2a3e;">
<tr><td style="padding:20px;">
<span style="display:inline-block;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:0.5px;padding:6px 12px;border-radius:6px;background-color:rgba(255,69,0,0.15);color:#ff6b6b;margin-bottom:14px;">Reddit</span>
<h3 style="font-size:17px;font-weight:600;line-height:1.5;color:#fff;margin:0 0 12px 0;"><a href="{url}" style="color:#74b9ff;text-decoration:none;">{title}</a></h3>
<p style="font-size:14px;color:#888;margin:0;"><span style="color:#a29bfe;font-weight:500;">u/{author}</span></p>
{excerpt_html}
{engagement_html}
</td></tr>
</table>'''
|
||||
|
||||
def format_news_story(story: Dict) -> str:
    """Format a news story for v2 HTML email - Spark-safe inline styles.

    The source label selects the tag colours: labels containing ``GitHub``
    and labels containing ``Hacker`` each get their own palette, anything
    else uses the default blue.

    Every story-supplied value (source, title, excerpt, URL and the
    engagement counters) is HTML-escaped before it is interpolated, so text
    containing ``<``, ``>``, ``&`` or quotes cannot inject markup or break
    out of the ``href`` attribute.

    NOTE(review): if the fetcher already delivers HTML-escaped text, entities
    would now be double-escaped - confirm against the news fetcher.

    Args:
        story: Story dict. Keys read (all optional): ``source``, ``points``,
            ``num_comments``, ``summary``, ``url``, ``title``. ``None``
            values are treated as missing.

    Returns:
        An HTML ``<table>`` fragment for one story card. The excerpt is
        limited to the first 150 characters of ``summary`` followed by
        ``...`` when truncated; excerpt and engagement rows are omitted when
        empty.
    """
    from html import escape  # local import keeps this block self-contained

    def _esc(value) -> str:
        # quote=True also escapes quotes, which matters inside href="...".
        return escape(str(value), quote=True)

    source = str(story.get('source') or 'News')
    tag_color = '#74b9ff'
    tag_bg = 'rgba(116,185,255,0.15)'
    if 'GitHub' in source:
        tag_color = '#a29bfe'
        tag_bg = 'rgba(139,148,158,0.15)'
    elif 'Hacker' in source:
        tag_color = '#ff9f43'
        tag_bg = 'rgba(255,102,0,0.15)'

    engagement = []
    if story.get('points'):
        engagement.append(f"<span style='color:#ff6b6b;font-weight:600;'>↑ {_esc(story['points'])}</span>")
    if story.get('num_comments'):
        engagement.append(f"<span style='color:#74b9ff;font-weight:600;'>💬 {_esc(story['num_comments'])}</span>")

    # Shorter excerpt for cleaner look. Truncate the raw text first and
    # escape afterwards so an entity is never cut in half.
    summary = story.get('summary') or ''
    excerpt = summary[:150]
    if len(summary) > 150:
        excerpt += "..."
    excerpt = _esc(excerpt)

    url = _esc(story.get('url') or '#')
    title = _esc(story.get('title') or '')
    source_label = _esc(source)

    engagement_line = ' · '.join(engagement)
    engagement_html = f"<p style='font-size:13px;color:#888;margin:12px 0 0 0;'>{engagement_line}</p>" if engagement else ""
    excerpt_html = f"<p style='font-size:14px;line-height:1.6;color:#aaa;margin:12px 0 0 0;'>{excerpt}</p>" if excerpt else ""

    return f'''<table role="presentation" cellspacing="0" cellpadding="0" border="0" width="100%" style="background-color:#1a1a2e;border-radius:12px;margin-bottom:16px;border:1px solid #2a2a3e;">
<tr><td style="padding:20px;">
<span style="display:inline-block;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:0.5px;padding:6px 12px;border-radius:6px;background-color:{tag_bg};color:{tag_color};margin-bottom:14px;">{source_label}</span>
<h3 style="font-size:17px;font-weight:600;line-height:1.5;color:#fff;margin:0 0 12px 0;"><a href="{url}" style="color:#74b9ff;text-decoration:none;">{title}</a></h3>
{excerpt_html}
{engagement_html}
</td></tr>
</table>'''
|
||||
|
||||
def format_story_text(story: Dict) -> str:
    """Render one story as a block of lines for the plain-text email.

    The block always starts with the title and link lines. Author, score
    (``score`` or, failing that, ``points``), comment count and an excerpt
    (``selftext`` or, failing that, ``summary``; cut to 150 characters plus
    ``...``) are appended only when present and truthy. The block ends with
    an empty line that separates it from the next story.

    Args:
        story: Story dict; every key is optional.

    Returns:
        The lines joined with newlines.
    """
    parts = [
        f"📌 {story.get('title', '')}",
        f" Link: {story.get('url', '')}",
    ]

    author = story.get('author')
    if author:
        parts.append(f" Author: {author}")

    # Reddit items carry 'score', Hacker News items carry 'points'.
    votes = story.get('score') or story.get('points')
    if votes:
        parts.append(f" Score: {votes} upvotes")

    comment_count = story.get('num_comments')
    if comment_count:
        parts.append(f" Comments: {comment_count}")

    body = story.get('selftext', '') or story.get('summary', '')
    if body:
        if len(body) > 150:
            body = body[:150] + "..."
        parts.append(f" {body}")

    # Trailing empty entry yields the blank line between stories
    parts.append("")
    return "\n".join(parts)
|
||||
|
||||
def aggregate_content(hours: int = 24) -> Dict[str, Any]:
    """Collect, merge, rank and render the digest content.

    Steps, with progress printed to stdout along the way:

    1. Fetch Reddit and news data for the requested window; Twitter is a
       static placeholder.
    2. Tag every item with its ``_source`` and deduplicate the merged list.
    3. Sort the survivors by ``score_relevance`` (highest first) and keep the
       top 8 per source.
    4. Render the HTML and plain-text sections.

    Args:
        hours: Look-back window passed to both fetchers.

    Returns:
        A dict with four sections: ``meta`` (generation timestamp, window,
        human-readable date), ``stats`` (per-source counts and the number of
        unique items), ``content`` (the raw fetcher payloads plus the Twitter
        placeholder) and ``formatted`` (HTML and text per source).
    """
    divider = "=" * 50
    print(f"🦀 Aggregating OpenClaw content from last {hours} hours...")
    print(divider)

    # --- Fetch from all sources ---
    print("\n📥 Fetching Reddit content...")
    reddit_data = fetch_reddit_content(hours=hours)

    print("\n📥 Fetching news content...")
    news_data = fetch_news_content(hours=hours)

    # Twitter placeholder (will be added when API is configured)
    twitter_data = {
        "source": "twitter",
        "total_items": 0,
        "tweets": [],
        "note": "X/Twitter integration requires API setup",
    }

    # --- Merge, remembering where each item came from ---
    all_items = []
    for post in reddit_data.get('all_posts', []):
        all_items.append({**post, '_source': 'reddit'})
    for entry in news_data.get('all_items', []):
        all_items.append({**entry, '_source': 'news'})

    # --- Deduplicate ---
    print("\n🧹 Deduplicating stories...")
    unique_items = deduplicate_stories(all_items)
    removed = len(all_items) - len(unique_items)
    print(f" Removed {removed} duplicates")

    # --- Rank, then split back into per-source sections ---
    unique_items.sort(key=score_relevance, reverse=True)

    def _top(origin):
        # First 8 ranked items that came from the given source.
        return [entry for entry in unique_items if entry.get('_source') == origin][:8]

    reddit_top = _top('reddit')
    news_top = _top('news')

    # --- HTML sections ---
    reddit_html = '\n'.join(format_reddit_story(story) for story in reddit_top)
    news_html = '\n'.join(format_news_story(story) for story in news_top)
    twitter_html = '<p style="text-align:center;color:#888;padding:30px 0;">🚧 X/Twitter integration coming soon</p>'

    # --- Plain-text sections (with a fallback line when a source is empty) ---
    if reddit_top:
        reddit_text = '\n'.join(format_story_text(story) for story in reddit_top)
    else:
        reddit_text = "No new Reddit posts today."
    if news_top:
        news_text = '\n'.join(format_story_text(story) for story in news_top)
    else:
        news_text = "No new news articles today."
    twitter_text = "🚧 X/Twitter integration coming soon - requires API setup\n"

    # --- Assemble the result ---
    meta = {
        "generated_at": datetime.utcnow().isoformat(),
        "time_window_hours": hours,
        "date": datetime.utcnow().strftime("%A, %B %d, %Y"),
    }
    stats = {
        "reddit_count": reddit_data.get('total_posts', 0),
        "news_count": news_data.get('total_items', 0),
        "twitter_count": 0,
        "total_unique": len(unique_items),
    }
    content = {
        "reddit": reddit_data,
        "news": news_data,
        "twitter": twitter_data,
    }
    formatted = {
        "reddit_html": reddit_html,
        "news_html": news_html,
        "twitter_html": twitter_html,
        "reddit_text": reddit_text,
        "news_text": news_text,
        "twitter_text": twitter_text,
    }
    result = {
        "meta": meta,
        "stats": stats,
        "content": content,
        "formatted": formatted,
    }

    print("\n" + divider)
    print("✅ Aggregation complete!")
    print(f" Reddit posts: {stats['reddit_count']}")
    print(f" News items: {stats['news_count']}")
    print(f" Total unique: {stats['total_unique']}")

    return result
|
||||
|
||||
if __name__ == "__main__":
    # Usage: aggregate.py [hours] [output_file]
    # Both arguments are optional; defaults are a 24-hour window and the
    # digest.json path below.
    cli_args = sys.argv[1:]

    hours = 24
    if cli_args:
        hours = int(cli_args[0])

    output_file = "/home/openclaw/.openclaw/workspace/automations/openclaw-digest/output/digest.json"
    if len(cli_args) > 1:
        output_file = cli_args[1]

    result = aggregate_content(hours=hours)

    # Serialise the whole digest as pretty-printed JSON
    with open(output_file, 'w') as f:
        json.dump(result, f, indent=2)

    print(f"\n📄 Output saved to: {output_file}")
|
||||
Reference in New Issue
Block a user