AI Newsletter Digest improvements: fixed QP soft line break decoding, URL extraction, and content cleaning

2026-03-04 13:29:22 +00:00
parent 29a98137a7
commit 57dd294675
13706 changed files with 2114953 additions and 237629 deletions
--- a/archive/inactive-skills/tube-summary/scripts/process-subtitles.py
+++ b/archive/inactive-skills/tube-summary/scripts/process-subtitles.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python3
+"""
+Subtitle Processing Script for tube-summary skill
+
+Processes VTT subtitle files to extract key information and generate summaries.
+
+Usage: python3 process-subtitles.py "path/to/subtitle-file.vtt"
+"""
+
+import sys
+import re
+from pathlib import Path
+from collections import defaultdict
+
+def parse_vtt(vtt_file):
+    """Parse a WebVTT file into a list of ``{'time', 'text'}`` cue dicts.
+
+    Blocks without a timing line (the ``WEBVTT`` header, ``NOTE``/``STYLE``
+    blocks) and cues with an empty payload are skipped.
+
+    Args:
+        vtt_file: Path of the subtitle file; read as UTF-8, ignoring
+            undecodable bytes.
+
+    Returns:
+        One dict per cue in file order: ``time`` is the start timestamp as
+        written in the file, ``text`` the payload lines joined by spaces.
+    """
+    with open(vtt_file, 'r', encoding='utf-8', errors='ignore') as f:
+        content = f.read()
+
+    subtitles = []
+    # Cue blocks are separated by blank lines, which may contain whitespace.
+    for block in re.split(r'\n[ \t]*\n', content):
+        lines = block.strip().split('\n')
+        # The timing line is not always first: a cue may begin with an
+        # identifier line, and the header may directly precede the first cue.
+        timing = next((i for i, line in enumerate(lines) if '-->' in line), None)
+        if timing is None:
+            continue
+
+        start_time = lines[timing].split('-->')[0].strip()
+        text = ' '.join(lines[timing + 1:]).strip()
+        if text:
+            subtitles.append({'time': start_time, 'text': text})
+
+    return subtitles
+
+def extract_key_topics(subtitles):
+    """Return the most frequent non-stop-words found in the subtitle text.
+
+    Only runs of four or more letters a-z (after lowercasing) are counted. The
+    15 most frequent are considered; those seen fewer than twice are dropped.
+    """
+    ignored = {
+        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+        'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been',
+        'you', 'i', 'we', 'he', 'she', 'it', 'that', 'this', 'what', 'which',
+        'who', 'when', 'where', 'why', 'how', 'so', 'if', 'as', 'can', 'have',
+        'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should',
+        'um', 'uh', 'like', 'you know', 'basically', 'sort of', 'kind of'
+    }
+
+    corpus = ' '.join(entry['text'] for entry in subtitles).lower()
+    # Count every token that is not a stop word.
+    tally = defaultdict(int)
+    for token in re.findall(r'\b[a-z]{4,}\b', corpus):
+        if token in ignored:
+            continue
+        tally[token] += 1
+
+    # The sort is stable, so equally frequent words keep first-seen order.
+    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
+    return [token for token, hits in ranked[:15] if hits >= 2]
+
+def generate_summary(subtitles, max_length=1000):
+    """Return an extractive summary: the first five sentences over 20 characters.
+
+    Returns '' when no sentence qualifies, rather than a lone '.'. A truncated
+    result is at most ``max_length`` characters, '...' included (max_length >= 3).
+    """
+    full_text = ' '.join(s['text'] for s in subtitles)
+    # Sentence terminators are dropped by the split and re-added as '. ' below.
+    sentences = [s.strip() for s in re.split(r'[.!?]+', full_text)]
+    sentences = [s for s in sentences if len(s) > 20]
+    if not sentences:
+        return ''
+
+    summary = '. '.join(sentences[:5]) + '.'
+    if len(summary) > max_length:
+        summary = summary[:max(max_length - 3, 0)].rstrip() + '...'
+    return summary
+
+def get_key_quotes(subtitles, count=3):
+    """Return up to ``count`` of the longest cues, longest first.
+
+    Only cues whose text is over 30 characters are considered; text length
+    is the sole ranking criterion and ties keep their original order.
+    """
+    candidates = list(filter(lambda cue: len(cue['text']) > 30, subtitles))
+    candidates.sort(key=lambda cue: -len(cue['text']))
+    return candidates[:count]
+
+def get_notable_moments(subtitles):
+    """Return the first five cues that contain a signpost word.
+
+    Keywords match whole words only, case-insensitively, so 'so' does not
+    match 'some' or 'also' and 'now' does not match 'know'.
+    """
+    keywords = [
+        'important', 'remember', 'key', 'main', 'best', 'worst',
+        'conclusion', 'summary', 'therefore', 'so', 'now',
+        'first', 'second', 'third', 'finally', 'ultimately'
+    ]
+    # Plain substring tests would flag nearly every cue; anchor on word boundaries.
+    pattern = re.compile(r'\b(?:' + '|'.join(keywords) + r')\b', re.IGNORECASE)
+    notable = [sub for sub in subtitles if pattern.search(sub['text'])]
+    return notable[:5]
+
+def format_output(subtitles, vtt_file):
+    """Print a plain-text report for the parsed subtitles to stdout.
+
+    The report has a header (file name, start time of the last cue, cue
+    count) followed by key topics, summary, key quotes and notable moments.
+    If ``subtitles`` is empty only an error line is printed.
+    """
+    if not subtitles:
+        print("❌ No subtitles found in the file.")
+        return
+
+    heavy_rule = "=" * 70
+    light_rule = "-" * 70
+
+    def section(title, lead="\n"):
+        # Section banner: rule, title, rule.
+        print(lead + light_rule)
+        print(title)
+        print(light_rule)
+
+    def clip(raw, limit):
+        # Flatten to one line and cap at ``limit`` characters plus an ellipsis.
+        flat = raw.replace('\n', ' ').strip()
+        return flat[:limit] + "..." if len(flat) > limit else flat
+
+    topics = extract_key_topics(subtitles)
+    summary = generate_summary(subtitles)
+    quotes = get_key_quotes(subtitles, count=3)
+    notable = get_notable_moments(subtitles)
+
+    print("\n" + heavy_rule)
+    print("📊 VIDEO SUBTITLE ANALYSIS")
+    print(heavy_rule)
+    print(f"\n📁 File: {Path(vtt_file).name}")
+    print(f"⏱️  Total Duration: {subtitles[-1]['time']}")
+    print(f"📝 Total Subtitle Lines: {len(subtitles)}")
+
+    section("🔑 KEY TOPICS")
+    if not topics:
+        print("  No topics extracted")
+    for rank, topic in enumerate(topics[:10], 1):
+        print(f"  {rank}. {topic}")
+
+    section("📄 SUMMARY")
+    print(f"\n{summary}\n")
+
+    section("💬 KEY QUOTES", lead="")
+    if not quotes:
+        print("  No notable quotes found")
+    for quote in quotes:
+        text = clip(quote['text'], 150)
+        print(f"\n  [{quote['time']}]")
+        print(f"  \"{text}\"")
+
+    section("⭐ NOTABLE MOMENTS")
+    if not notable:
+        print("  No notable moments found")
+    for moment in notable:
+        print(f"  [{moment['time']}] {clip(moment['text'], 100)}")
+
+    print("\n" + heavy_rule + "\n")
+
+def main():
+    """Command-line entry point: analyse the VTT file named in ``sys.argv[1]``."""
+    if len(sys.argv) <= 1:
+        print("Usage: python3 process-subtitles.py \"path/to/subtitle-file.vtt\"")
+        sys.exit(1)
+
+    vtt_file = sys.argv[1]
+    if not Path(vtt_file).exists():
+        print(f"❌ File not found: {vtt_file}")
+        sys.exit(1)
+
+    # A wrong extension only triggers a warning; parsing is still attempted.
+    if not vtt_file.endswith('.vtt'):
+        print("⚠️  Warning: File does not end in .vtt, but attempting to parse...")
+
+    try:
+        format_output(parse_vtt(vtt_file), vtt_file)
+    except Exception as e:
+        print(f"❌ Error processing subtitles: {e}")
+        sys.exit(1)
+
+if __name__ == '__main__':
+    main()
--- a/archive/inactive-skills/tube-summary/scripts/youtube-search.py
+++ b/archive/inactive-skills/tube-summary/scripts/youtube-search.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+"""
+YouTube Search Script for tube-summary skill
+
+Searches YouTube for videos on a given topic and returns top 10 results.
+Falls back to scraping the YouTube results page if yt-dlp is unavailable or finds nothing.
+
+Usage: python3 youtube-search.py "search query"
+"""
+
+import sys
+import json
+import subprocess
+from urllib.parse import quote, urljoin
+import re
+
+def search_via_yt_dlp(query):
+    """Search YouTube for ``query`` through the ``yt-dlp`` command-line tool.
+
+    Returns:
+        A list of up to 10 dicts (``title``, ``url``, ``channel``, ``duration``,
+        ``views``), or ``None`` if yt-dlp cannot be run, times out, or fails.
+    """
+    cmd = ['yt-dlp', f'ytsearch10:{query}', '--dump-json', '--flat-playlist']
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
+    except (subprocess.SubprocessError, OSError, ValueError):
+        return None  # tool absent, timed out, or output not decodable
+    if result.returncode != 0:
+        return None
+
+    videos = []
+    for line in result.stdout.splitlines():
+        try:
+            entry = json.loads(line)
+        except json.JSONDecodeError:
+            continue  # expected: one JSON object per line; skip anything else
+        if not isinstance(entry, dict) or not entry.get('id'):
+            continue  # without an id no watch URL can be built
+        views = entry.get('view_count')
+        videos.append({
+            'title': entry.get('title') or 'Unknown',  # `or` also covers null values
+            'url': f"https://www.youtube.com/watch?v={entry['id']}",
+            'channel': entry.get('channel') or 'Unknown',
+            'duration': entry.get('duration') or 0,
+            'views': 'N/A' if views is None else views,
+        })
+    return videos[:10]
+
+def search_via_web_scrape(query):
+    """Fallback search: scrape the ``ytInitialData`` JSON from the results page.
+
+    Only the third-party ``requests`` package is needed. Returns a list of up
+    to 10 dicts (``title``, ``url``, ``channel``, ``duration``, ``views``), or
+    ``None`` if ``requests`` is missing or the page cannot be fetched or parsed.
+    """
+    try:
+        import requests
+    except ImportError:
+        return None
+
+    def text_of(node, default):
+        # Accept {'simpleText': str} as well as {'runs': [{'text': str}, ...]}.
+        # NOTE(review): assumes bylines may use the 'runs' form — confirm on a live page.
+        if isinstance(node, dict):
+            if node.get('simpleText'):
+                return node['simpleText']
+            joined = ''.join(run.get('text', '') for run in node.get('runs', []))
+            if joined:
+                return joined
+        return default
+
+    search_url = f"https://www.youtube.com/results?search_query={quote(query)}"
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
+
+    try:
+        response = requests.get(search_url, headers=headers, timeout=10)
+        response.raise_for_status()
+
+        marker = re.search(r'var ytInitialData\s*=\s*', response.text)
+        if marker is None:
+            return None
+        # raw_decode consumes exactly one JSON value, so a '};' inside a
+        # string cannot truncate the payload as a non-greedy regex would.
+        data, _ = json.JSONDecoder().raw_decode(response.text, marker.end())
+
+        sections = (
+            data.get('contents', {})
+            .get('twoColumnSearchResultsTabsRenderer', {})
+            .get('tabs', [{}])[0]
+            .get('tabRenderer', {})
+            .get('content', {})
+            .get('sectionListRenderer', {})
+            .get('contents', [])
+        )
+        videos = []
+        for section in sections:
+            for item in section.get('itemSectionRenderer', {}).get('contents', []):
+                video = item.get('videoRenderer')
+                if not video or not video.get('videoId'):
+                    continue  # not a video entry, or no id to build a URL from
+                videos.append({
+                    'title': text_of(video.get('title'), 'Unknown'),
+                    'url': f"https://www.youtube.com/watch?v={video['videoId']}",
+                    'channel': text_of(video.get('longBylineText'), 'Unknown'),
+                    'duration': text_of(video.get('lengthText'), 'N/A'),
+                    'views': text_of(video.get('viewCountText'), 'N/A'),
+                })
+        return videos[:10]
+    except (requests.RequestException, ValueError, LookupError, AttributeError, TypeError):
+        return None
+
+def format_results(videos):
+    """Return the numbered result list as display text, one field per line."""
+    output = ["\n📺 Top 10 YouTube Videos for this search:\n"]
+    for i, video in enumerate(videos, 1):
+        output.append(f"{i}. {video['title']}")
+        output.append(f"   Channel: {video['channel']}")
+        output.append(f"   Views: {video['views']} • Duration: {video.get('duration', 'N/A')}")
+        output.append(f"   URL: {video['url']}\n")
+    output.append("➡️  Respond with the video number (1-10) to summarize that video.\n")
+    # Each entry is a whole line, so join with newlines; joining with ''
+    # would run title, channel, views and URL together on a single line.
+    return "\n".join(output)
+
+def main():
+    """Search YouTube for the command-line terms and print the results.
+
+    Prints a human-readable list followed by the same data as JSON. Exits with
+    status 1 when no terms are given or neither backend finds a video.
+    """
+    terms = sys.argv[1:]
+    if not terms:
+        print("Usage: python3 youtube-search.py \"search query\"")
+        sys.exit(1)
+    query = " ".join(terms)
+
+    # Preferred backend first; scrape the results page only if it finds nothing.
+    videos = search_via_yt_dlp(query) or search_via_web_scrape(query)
+    if not videos:
+        print("❌ No videos found. Try a different search query or check your internet connection.")
+        sys.exit(1)
+
+    print(format_results(videos))
+    # The same data again as JSON, for programmatic consumers.
+    print("\n<!-- JSON Data (for tool processing) -->")
+    print(json.dumps(videos, indent=2))
+
+if __name__ == '__main__':
+    main()