AI Newsletter Digest improvements: fixed QP soft line break decoding, URL extraction, and content cleaning

2026-03-04 13:29:22 +00:00
parent 29a98137a7
commit 57dd294675
13706 changed files with 2114953 additions and 237629 deletions
--- a/automations/ai-newsletter-digest/parse-emails.py
+++ b/automations/ai-newsletter-digest/parse-emails.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+"""Parse AI_EMAIL / AI_CONTENT pairs from imap check script output."""
+import sys
+import json
+import re
+
+# Senders to exclude. is_blocked() lower-cases the sender string and performs a
+# substring test against each entry, so entries must be written in lower case.
+BLOCKLIST = [
+    'googlenews-noreply@google.com',
+]
+
+# URL patterns to skip (ads, tracking, social, email).
+# extract_urls() discards every URL in which this pattern is found anywhere
+# (searched case-insensitively). The alternatives are plain substrings with
+# literal dots escaped.
+URL_SKIP = re.compile(
+    r'unsubscribe|mailto|twitter\.com|instagram\.com|facebook\.com|'
+    r'youtube\.com/unsubscribe|youtube\.com/channel|'
+    r'utm_source=dlvr|utm_medium=email|'
+    r'genstore|typeform|pigment\.com|maton\.ai|'
+    r'youtu\.be|medium\.com/@|linkedin\.com/posts|'
+    r'cdn-cgi|imageproxy',
+    re.IGNORECASE
+)
+
+def decode_subject(subject):
+    """Decode an RFC 2047 encoded e-mail subject into plain text.
+
+    Args:
+        subject: Raw subject header value; may contain encoded words such as
+            ``=?utf-8?B?...?=``. ``None`` or an empty string yields ``''``.
+
+    Returns:
+        The decoded subject with surrounding whitespace stripped. If the
+        header cannot be parsed at all, the raw subject (stripped) is
+        returned unchanged as a best effort.
+    """
+    from email.errors import HeaderParseError
+    from email.header import decode_header
+
+    if not subject:
+        return ''
+
+    try:
+        parts = decode_header(subject)
+    except (HeaderParseError, ValueError):
+        # Malformed encoded word (e.g. broken base64): pass it through.
+        return subject.strip()
+
+    decoded = []
+    for part, charset in parts:
+        if not isinstance(part, bytes):
+            # Headers without encoded words come back as plain str.
+            decoded.append(part)
+            continue
+        try:
+            decoded.append(part.decode(charset or 'utf-8', errors='replace'))
+        except LookupError:
+            # Unknown or misspelled charset label: fall back to UTF-8 for this
+            # part only instead of returning the whole subject still encoded.
+            decoded.append(part.decode('utf-8', errors='replace'))
+    return ''.join(decoded).strip()
+
+def decode_qp(content):
+    """Decode quoted-printable (RFC 2045) escapes embedded in text.
+
+    Intended to run before URL extraction so that URLs wrapped or escaped by
+    the quoted-printable transfer encoding are restored to their real form.
+
+    Args:
+        content: Text that may contain quoted-printable soft line breaks
+            (``=`` at end of line) and ``=XX`` hex escapes. ``None`` or an
+            empty string yields ``''``.
+
+    Returns:
+        The text with soft line breaks removed and ``=XX`` escapes decoded as
+        UTF-8 (undecodable bytes become U+FFFD). An ``=`` that is not part of
+        an escape is left untouched, so URL query strings such as ``?id=42``
+        survive intact.
+    """
+    if not content:
+        return ''
+
+    # Soft line breaks first: "=" at end of line (optionally followed by
+    # blanks added in transport) joins the line with the next one. This must
+    # happen before hex decoding so tokens split across lines are rejoined.
+    content = re.sub(r'=[ \t]*\r?\n', '', content)
+
+    def decode_run(match):
+        # Decode a whole run of escapes at once: a multi-byte UTF-8 character
+        # (e.g. "=C3=A9") is only valid when its bytes are decoded together.
+        raw = bytes.fromhex(match.group(0).replace('=', ''))
+        return raw.decode('utf-8', errors='replace')
+
+    # "=20" is handled here like any other escape and becomes a real space.
+    return re.sub(r'(?:=[0-9A-Fa-f]{2})+', decode_run, content)
+
+# Query parameters that only carry tracking / referral data.
+_TRACKING_PARAM = re.compile(r'utm_|_bhlid=|jwt_token=|ref=')
+# Punctuation that commonly trails a URL in running text.
+_TRAILING_PUNCTUATION = '.,;)\'"'
+# Substrings marking image/CDN URLs; applied to every candidate.
+_IMAGE_MARKERS = ('.jpg', '.png', '.gif', '.jpeg', '.svg', '/cdn-cgi/')
+# Extra path markers applied to bare URLs only (markdown links are exempt).
+_ASSET_PATH_MARKERS = ('/image/', '/media/', '/assets/')
+
+
+def _strip_tracking_params(url):
+    """Drop tracking query parameters while keeping the URL well-formed."""
+    base, question, rest = url.partition('?')
+    if not question:
+        return url
+    query, hash_mark, fragment = rest.partition('#')
+    kept = [p for p in query.split('&') if p and not _TRACKING_PARAM.match(p)]
+    # Rebuild the query so the first surviving parameter is introduced by "?"
+    # even when the parameter that used to be first was removed.
+    cleaned = base + ('?' + '&'.join(kept) if kept else '')
+    return cleaned + hash_mark + fragment
+
+
+def extract_urls(content, limit=5):
+    """Extract clean article URLs from e-mail content.
+
+    The content is quoted-printable decoded first. URLs from markdown-style
+    links (``[text](https://...)``) are considered before bare URLs.
+
+    Args:
+        content: Raw e-mail body text.
+        limit: Maximum number of URLs to return (default 5).
+
+    Returns:
+        A list of at most ``limit`` unique URLs, in order of discovery, with
+        tracking parameters and trailing punctuation removed. URLs matching
+        ``URL_SKIP``, image/asset URLs and URLs of 15 characters or fewer are
+        left out.
+    """
+    # Decode QP first so wrapped/escaped URLs are matched in their real form.
+    content = decode_qp(content)
+
+    markdown_urls = re.findall(
+        r'\[[^\]]+\]\((https?://[^\s"<>)\]\']+)\)', content)
+    plain_urls = re.findall(r'https?://[^\s"<>)\]\']+', content)
+
+    # Markdown links go first (they tend to be cleaner) and are filtered with
+    # the shorter marker list; bare URLs also get the asset-path markers.
+    candidates = [(url, _IMAGE_MARKERS) for url in markdown_urls]
+    candidates += [
+        (url, _IMAGE_MARKERS + _ASSET_PATH_MARKERS) for url in plain_urls
+    ]
+
+    seen = set()
+    clean = []
+    for url, markers in candidates:
+        url = url.rstrip(_TRAILING_PUNCTUATION)
+        if URL_SKIP.search(url):
+            continue
+        url = _strip_tracking_params(url).rstrip(_TRAILING_PUNCTUATION)
+        # De-duplicate on the cleaned form so links that differ only in
+        # tracking parameters are reported once.
+        if len(url) <= 15 or url in seen:
+            continue
+        lowered = url.lower()
+        if any(marker in lowered for marker in markers):
+            continue
+        seen.add(url)
+        clean.append(url)
+
+    return clean[:limit]
+
+def clean_content(content):
+    """Reduce a messy e-mail body to readable plain text (best effort).
+
+    Steps: quoted-printable decoding, removal of HTML tags, template/markup
+    artifacts and long base64-like runs, markdown links reduced to their
+    text, whitespace normalisation, per-line noise filtering, and removal of
+    header-like lines. Each kept line is capped at 300 characters and the
+    final result at 3000 characters.
+    """
+    text = decode_qp(content)
+
+    # Ordered (pattern, replacement) pairs; the order is significant.
+    substitutions = (
+        (r'<[^>]+>', ' '),                      # HTML tags
+        (r'\[\[.*?\]\]', ' '),                  # [[markup]]
+        (r'\{\{.*?\}\}', ' '),                  # {{markup}}
+        (r'\{\|\|.*?\|\|\}', ' '),              # {||markup||}
+        (r'\^[^\^]+\^', ' '),                   # ^markup^
+        (r'~~[^~]+~~', ' '),                    # ~~markup~~
+        (r'__[^_]+__', ' '),                    # __markup__
+        (r'[A-Za-z0-9+/]{60,}', ''),            # base64/encoded blocks
+        (r'\[([^\]]+)\]\([^\)]+\)', r'\1'),     # markdown link -> its text
+        (r'[ \t]+', ' '),                       # collapse blanks
+        (r'\n{3,}', '\n\n'),                    # collapse blank-line runs
+    )
+    for pattern, replacement in substitutions:
+        text = re.sub(pattern, replacement, text)
+
+    # Lines consisting only of these phrases or separators are noise.
+    # NOTE(review): lines are stripped before matching, so the pattern with a
+    # leading space ('^ unsubscribe$') cannot match; lines shorter than 20
+    # characters are dropped before these patterns are consulted anyway.
+    noise_patterns = (
+        r'^[-=_\*\^|>.@#]+$',  # Separator lines
+        r'^read online$',
+        r'^sign up$',
+        r'^advertise$',
+        r'^sponsor$',
+        r'^view all$',
+        r'^ unsubscribe$',
+        r'^\d+ more$',
+    )
+
+    def is_noise(candidate):
+        return any(
+            re.match(pattern, candidate, re.IGNORECASE)
+            for pattern in noise_patterns
+        )
+
+    kept = [
+        stripped[:300]
+        for stripped in (piece.strip() for piece in text.split('\n'))
+        if len(stripped) >= 20
+        and not re.match(r'^[\W_]+\s*[\W_]+$', stripped)
+        and not is_noise(stripped)
+    ]
+
+    # Blank out lines that look like leftover message headers.
+    body = re.sub(
+        r'^\s*(by|from|to|subject|date):.*$',
+        '',
+        '\n'.join(kept),
+        flags=re.IGNORECASE | re.MULTILINE,
+    )
+
+    return body[:3000].strip()
+
+def is_blocked(sender):
+    """Return True if the lower-cased sender contains any BLOCKLIST entry."""
+    for blocked in BLOCKLIST:
+        if blocked in sender.lower():
+            return True
+    return False
+
+EMAIL_PREFIX = 'AI_EMAIL:'
+CONTENT_PREFIX = 'AI_CONTENT:'
+# Bookkeeping lines of the input that must not be appended to an e-mail body.
+STATUS_PREFIXES = ('STATUS:', 'TOTAL:', 'LAST_UID:', 'RECENT:', 'AI_COUNT:')
+
+
+def _build_record(sender, subject, content_lines):
+    """Turn one collected AI_EMAIL / AI_CONTENT pair into an output record."""
+    raw = '\n'.join(content_lines)
+    return {
+        'from': sender,
+        'subject': decode_subject(subject),
+        'urls': extract_urls(raw),
+        'content': clean_content(raw),
+    }
+
+
+def parse_records(text):
+    """Split the input text into structured e-mail records.
+
+    An ``AI_EMAIL:`` line (``sender | subject``) starts a new record, an
+    ``AI_CONTENT:`` line starts its body, and following lines are appended to
+    the body unless they start with one of ``STATUS_PREFIXES``. Records with
+    no content line or with a blocked sender are dropped.
+
+    Args:
+        text: Complete text of the input file.
+
+    Returns:
+        A list of dicts with the keys ``from``, ``subject``, ``urls`` and
+        ``content``.
+    """
+    emails = []
+    sender = None
+    subject = ''
+    content_lines = []
+    in_content = False
+
+    def flush():
+        # Emit the record collected so far, if it is complete and allowed.
+        if sender and in_content and not is_blocked(sender):
+            emails.append(_build_record(sender, subject, content_lines))
+
+    for line in text.split('\n'):
+        if line.startswith(EMAIL_PREFIX):
+            flush()
+            meta = line[len(EMAIL_PREFIX):]
+            sender_part, _, subject_part = meta.partition(' | ')
+            sender = sender_part.strip()
+            subject = subject_part.strip()
+            content_lines = []
+            in_content = False
+        elif line.startswith(CONTENT_PREFIX) and sender:
+            first = line[len(CONTENT_PREFIX):]
+            # Drop the single separator space after the prefix when present,
+            # without eating a real character when it is absent.
+            if first.startswith(' '):
+                first = first[1:]
+            content_lines = [first]
+            in_content = True
+        elif in_content and not line.startswith(STATUS_PREFIXES):
+            content_lines.append(line)
+
+    flush()  # the last record has no following AI_EMAIL line to trigger it
+    return emails
+
+
+def main(argv=None):
+    """Parse the file named by the first argument and print JSON to stdout.
+
+    Prints ``[]`` when no file argument is given. Returns the process exit
+    status (always 0).
+    """
+    args = sys.argv[1:] if argv is None else argv
+    if not args:
+        print("[]")
+        return 0
+
+    # Explicit UTF-8 keeps decoding independent of the platform locale;
+    # undecodable bytes are replaced rather than aborting the run.
+    with open(args[0], 'r', encoding='utf-8', errors='replace') as f:
+        text = f.read()
+
+    print(json.dumps(parse_records(text), indent=2))
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())