AI Newsletter Digest improvements: fixed QP soft line break decoding, URL extraction, and content cleaning
This commit is contained in:
83
archive/inactive-skills/tube-summary/SKILL.md
Normal file
83
archive/inactive-skills/tube-summary/SKILL.md
Normal file
@@ -0,0 +1,83 @@
|
||||
---
|
||||
name: tube-summary
|
||||
description: Search YouTube for videos on any topic and get intelligent summaries from video subtitles. Use when you need to: (1) Find and preview YouTube videos on a subject, (2) Get a detailed description of what a video covers based on its actual content, (3) Quickly understand video topics without watching. Workflow: search YouTube → pick a video → extract and summarize subtitles.
|
||||
---
|
||||
|
||||
# tube-summary
|
||||
|
||||
Search YouTube for videos on any topic, then extract and summarize their content using subtitles.
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Step 1: Search for Videos
|
||||
|
||||
When asked about a topic, search YouTube and list the top 10 results:
|
||||
|
||||
```bash
|
||||
python3 scripts/youtube-search.py "your search query"
|
||||
```
|
||||
|
||||
This returns a numbered list of videos with titles, channels, and view counts.
|
||||
|
||||
### Step 2: User Picks a Video
|
||||
|
||||
The user selects one video by number (e.g., "3" for the third video).
|
||||
|
||||
### Step 3: Download Subtitles
|
||||
|
||||
Extract English subtitles from the selected video using yt-dlp:
|
||||
|
||||
```bash
|
||||
yt-dlp --write-subs --sub-langs en --skip-download "VIDEO_URL"
|
||||
```
|
||||
|
||||
This creates a `.en.vtt` subtitle file without downloading the video.
|
||||
|
||||
### Step 4: Process & Summarize
|
||||
|
||||
Use the subtitle processor to analyze and summarize:
|
||||
|
||||
```bash
|
||||
python3 scripts/process-subtitles.py "path/to/subtitle-file.vtt"
|
||||
```
|
||||
|
||||
This generates:
|
||||
- **Key Topics**: Main subjects covered in the video
|
||||
- **Summary**: Concise 2-3 paragraph description of content
|
||||
- **Timestamps**: Notable moments with context
|
||||
- **Key Quotes**: Important statements from speakers
|
||||
|
||||
## Workflow
|
||||
|
||||
1. **Search** → `youtube-search.py "<topic>"` → Display top 10 videos
|
||||
2. **User selects** → e.g., "Video 5"
|
||||
3. **Extract URL** → From the search results
|
||||
4. **Download subs** → `yt-dlp --write-subs --sub-langs en --skip-download "URL"`
|
||||
5. **Process** → `process-subtitles.py "subtitle.vtt"`
|
||||
6. **Present** → Formatted summary with key points
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- `yt-dlp` (install: `pip install yt-dlp`)
|
||||
- `requests` (for YouTube search fallback)
|
||||
- Python 3.7+
|
||||
|
||||
## Notes
|
||||
|
||||
- If the `yt-dlp` search fails (or `yt-dlp` is not installed), the search falls back to scraping the YouTube results page via `requests`
|
||||
- Subtitles may be auto-generated if not manually authored; add `--write-auto-subs` to the yt-dlp command to download auto-generated captions
|
||||
- Some videos may not have English subtitles available
|
||||
- The subtitle file is created in the directory from which yt-dlp is run
|
||||
|
||||
## Example Usage
|
||||
|
||||
```
|
||||
User: "Tell me about Rust programming language"
|
||||
|
||||
→ Search returns 10 videos about Rust
|
||||
|
||||
User: "Summarize video 3"
|
||||
|
||||
→ Downloads subtitles from video 3
|
||||
→ Processes and returns detailed summary
|
||||
```
|
||||
6
archive/inactive-skills/tube-summary/_meta.json
Normal file
6
archive/inactive-skills/tube-summary/_meta.json
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"ownerId": "kn72rt9s917kwv4x2asxkjh4cd805p6r",
|
||||
"slug": "tube-summary",
|
||||
"version": "1.0.0",
|
||||
"publishedAt": 1769653406198
|
||||
}
|
||||
@@ -0,0 +1,204 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Subtitle Processing Script for tube-summary skill
|
||||
|
||||
Processes VTT subtitle files to extract key information and generate summaries.
|
||||
|
||||
Usage: python3 process-subtitles.py "path/to/subtitle-file.vtt"
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
def parse_vtt(vtt_file):
    """Parse a WebVTT subtitle file into a list of timed text entries.

    Args:
        vtt_file: Path of the ``.vtt`` file to read (decoded as UTF-8,
            undecodable bytes are ignored).

    Returns:
        A list of ``{'time': start_timestamp, 'text': cue_text}`` dicts in
        file order. Blocks without a timing line (the ``WEBVTT`` header,
        metadata, notes) and cues whose text is empty are omitted.
    """
    with open(vtt_file, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()

    subtitles = []

    # Cue blocks are separated by blank lines; tolerate whitespace-only ones.
    for block in re.split(r'\n\s*\n', content):
        lines = block.strip().split('\n')

        # The timing line is not necessarily the first line: a cue may be
        # preceded by an identifier line. Locating it by its arrow also
        # makes the header block fall out naturally, so the word "WEBVTT"
        # no longer has to be deleted from the whole file (which used to
        # remove it from cue text as well).
        timing_index = next(
            (i for i, line in enumerate(lines) if '-->' in line),
            None,
        )
        if timing_index is None:
            continue

        # Anything after the arrow (end time, cue settings) is ignored.
        start_time = lines[timing_index].split('-->', 1)[0].strip()

        # Drop inline markup such as <c>...</c> or <00:00:01.000> and
        # collapse the whitespace left behind.
        raw_text = ' '.join(lines[timing_index + 1:])
        text = ' '.join(re.sub(r'<[^>]*>', '', raw_text).split())

        if text:
            subtitles.append({
                'time': start_time,
                'text': text,
            })

    return subtitles
|
||||
|
||||
def extract_key_topics(subtitles, top_n=15, min_count=2):
    """Return the most frequent non-trivial words in the subtitles.

    Args:
        subtitles: List of dicts with a ``'text'`` entry.
        top_n: How many of the most frequent words to consider.
        min_count: Minimum number of occurrences a word needs to be kept.

    Returns:
        A list of lowercase words, most frequent first. Only words made of
        four or more ASCII letters are counted.
    """
    from collections import Counter

    # Only words of 4+ letters are ever extracted below, so shorter words
    # and multi-word phrases do not need to be listed here.
    stop_words = frozenset({
        'with', 'from', 'were', 'been', 'that', 'this', 'what', 'which',
        'when', 'where', 'have', 'does', 'will', 'would', 'could',
        'should', 'like', 'basically',
    })

    all_text = ' '.join(s['text'] for s in subtitles)
    words = re.findall(r'\b[a-z]{4,}\b', all_text.lower())

    word_freq = Counter(word for word in words if word not in stop_words)

    # most_common keeps first-seen order among equally frequent words.
    return [
        word
        for word, count in word_freq.most_common(top_n)
        if count >= min_count
    ]
|
||||
|
||||
def generate_summary(subtitles, max_length=1000):
    """Build a short extractive summary from the subtitle text.

    The first five sentences longer than 20 characters are joined into one
    paragraph.

    Args:
        subtitles: List of dicts with a ``'text'`` entry.
        max_length: Upper bound for the length of the returned string,
            including the trailing ``...`` added when text is cut.

    Returns:
        The summary, or an empty string when no sentence qualifies.
    """
    full_text = ' '.join(s['text'] for s in subtitles)

    fragments = (part.strip() for part in re.split(r'[.!?]+', full_text))
    sentences = [fragment for fragment in fragments if len(fragment) > 20]

    # Without this guard the join below would yield a lone ".".
    if not sentences:
        return ''

    summary = '. '.join(sentences[:5]) + '.'
    if len(summary) <= max_length:
        return summary

    # Reserve room for the ellipsis so the result respects max_length.
    ellipsis = '...'
    budget = max(max_length - len(ellipsis), 0)
    clipped = summary[:budget]

    # If the cut landed inside a word, back up to the previous space.
    if not summary[budget].isspace() and ' ' in clipped:
        clipped = clipped.rsplit(' ', 1)[0]

    return clipped.rstrip(' .,;:') + ellipsis
|
||||
|
||||
def get_key_quotes(subtitles, count=3):
    """Pick the longest subtitle segments as representative quotes.

    Args:
        subtitles: List of dicts with ``'time'`` and ``'text'`` entries.
        count: Maximum number of quotes to return.

    Returns:
        Up to ``count`` subtitle dicts whose text is longer than 30
        characters, longest first. Each distinct text appears once (its
        earliest occurrence), so repeated cues cannot fill every slot.
    """
    # A negative count would otherwise slice from the wrong end.
    if count <= 0:
        return []

    seen_texts = set()
    candidates = []
    for sub in subtitles:
        text = sub['text']
        if len(text) > 30 and text not in seen_texts:
            seen_texts.add(text)
            candidates.append(sub)

    # Stable sort: equally long quotes keep their order of appearance.
    candidates.sort(key=lambda sub: len(sub['text']), reverse=True)
    return candidates[:count]
|
||||
|
||||
def get_notable_moments(subtitles, limit=5):
    """Find subtitle segments that contain a signal word.

    Args:
        subtitles: List of dicts with ``'time'`` and ``'text'`` entries.
        limit: Maximum number of segments to return.

    Returns:
        The first ``limit`` subtitle dicts, in file order, whose text
        contains one of the signal words as a whole word (case-insensitive).
    """
    keywords = (
        'important', 'remember', 'key', 'main', 'best', 'worst',
        'conclusion', 'summary', 'therefore', 'so', 'now',
        'first', 'second', 'third', 'finally', 'ultimately',
    )

    # Match whole words only: a plain substring test makes 'so' fire on
    # "also" and 'now' on "know", which flags almost every segment.
    pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(word) for word in keywords) + r')\b',
        re.IGNORECASE,
    )

    notable = [sub for sub in subtitles if pattern.search(sub['text'])]
    return notable[:max(limit, 0)]
|
||||
|
||||
def _clip(text, limit):
    """Flatten newlines in *text* and cut it to *limit* characters plus '...'."""
    flat = text.replace('\n', ' ').strip()
    if len(flat) > limit:
        return flat[:limit] + "..."
    return flat


def format_output(subtitles, vtt_file):
    """Print the full analysis report for a parsed subtitle file.

    Args:
        subtitles: List of dicts with ``'time'`` and ``'text'`` entries.
        vtt_file: Path of the source file; only its name is printed.

    Returns:
        None. The report is written to standard output. When ``subtitles``
        is empty a single notice is printed instead.
    """
    if not subtitles:
        print("❌ No subtitles found in the file.")
        return

    topics = extract_key_topics(subtitles)
    summary = generate_summary(subtitles)
    quotes = get_key_quotes(subtitles, count=3)
    notable = get_notable_moments(subtitles)

    heavy_rule = "=" * 70
    light_rule = "-" * 70

    print("\n" + heavy_rule)
    print("📊 VIDEO SUBTITLE ANALYSIS")
    print(heavy_rule)

    print(f"\n📁 File: {Path(vtt_file).name}")
    # The 'time' of the last entry stands in for the video length; the list
    # is known to be non-empty here, so no fallback value is needed.
    print(f"⏱️ Total Duration: {subtitles[-1]['time']}")
    print(f"📝 Total Subtitle Lines: {len(subtitles)}")

    print("\n" + light_rule)
    print("🔑 KEY TOPICS")
    print(light_rule)
    if topics:
        for i, topic in enumerate(topics[:10], 1):
            print(f" {i}. {topic}")
    else:
        print(" No topics extracted")

    print("\n" + light_rule)
    print("📄 SUMMARY")
    print(light_rule)
    print(f"\n{summary}\n")

    print(light_rule)
    print("💬 KEY QUOTES")
    print(light_rule)
    if quotes:
        for quote in quotes:
            print(f"\n [{quote['time']}]")
            print(f" \"{_clip(quote['text'], 150)}\"")
    else:
        print(" No notable quotes found")

    print("\n" + light_rule)
    print("⭐ NOTABLE MOMENTS")
    print(light_rule)
    if notable:
        for moment in notable:
            print(f" [{moment['time']}] {_clip(moment['text'], 100)}")
    else:
        print(" No notable moments found")

    print("\n" + heavy_rule + "\n")
|
||||
|
||||
def main():
    """Command-line entry point: analyse the VTT file named in ``sys.argv[1]``.

    Exits with status 1 when no path is given, when the path does not
    exist, or when processing raises an exception.
    """
    if len(sys.argv) < 2:
        print("Usage: python3 process-subtitles.py \"path/to/subtitle-file.vtt\"")
        sys.exit(1)

    vtt_file = sys.argv[1]
    vtt_path = Path(vtt_file)

    if not vtt_path.exists():
        print(f"❌ File not found: {vtt_file}")
        sys.exit(1)

    # Compare the extension case-insensitively so "VIDEO.VTT" does not
    # trigger a spurious warning.
    if vtt_path.suffix.lower() != '.vtt':
        print("⚠️ Warning: File does not end in .vtt, but attempting to parse...")

    # Top-level boundary: report any failure as one line, not a traceback.
    try:
        subtitles = parse_vtt(vtt_file)
        format_output(subtitles, vtt_file)
    except Exception as e:
        print(f"❌ Error processing subtitles: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||
150
archive/inactive-skills/tube-summary/scripts/youtube-search.py
Normal file
150
archive/inactive-skills/tube-summary/scripts/youtube-search.py
Normal file
@@ -0,0 +1,150 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
YouTube Search Script for tube-summary skill
|
||||
|
||||
Searches YouTube for videos on a given topic and returns top 10 results.
|
||||
Falls back to web scraping if API is unavailable.
|
||||
|
||||
Usage: python3 youtube-search.py "search query"
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import subprocess
|
||||
from urllib.parse import quote, urljoin
|
||||
import re
|
||||
|
||||
def search_via_yt_dlp(query):
    """Search YouTube through the ``yt-dlp`` command-line tool.

    Args:
        query: Free-text search query.

    Returns:
        A list of up to 10 dicts with ``title``, ``url``, ``channel``,
        ``duration`` and ``views`` keys, or ``None`` when ``yt-dlp`` cannot
        be run, times out, or exits with a non-zero status.
    """
    cmd = [
        'yt-dlp',
        f'ytsearch10:{query}',
        '--dump-json',
        '--flat-playlist',
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
    except (subprocess.SubprocessError, OSError, ValueError):
        # Timeout, missing or non-executable binary, or an argument that
        # cannot be passed to a process (e.g. an embedded NUL byte).
        return None

    if result.returncode != 0:
        return None

    videos = []
    # One JSON document is expected per output line.
    for line in result.stdout.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            entry = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(entry, dict):
            continue

        # Without an id no usable watch URL can be built.
        video_id = entry.get('id')
        if not video_id:
            continue

        # dict.get only applies its default to missing keys; a key holding
        # null would pass None through, hence the explicit fallbacks.
        views = entry.get('view_count')
        videos.append({
            'title': entry.get('title') or 'Unknown',
            'url': f"https://www.youtube.com/watch?v={video_id}",
            'channel': entry.get('channel') or 'Unknown',
            'duration': entry.get('duration') or 0,
            'views': 'N/A' if views is None else views,
        })

    return videos[:10]
|
||||
|
||||
def _dig(node, *path):
    """Follow *path* (dict keys / list indexes) through nested JSON; None if absent."""
    for step in path:
        if isinstance(step, int):
            if not isinstance(node, list) or not -len(node) <= step < len(node):
                return None
        elif not isinstance(node, dict) or step not in node:
            return None
        node = node[step]
    return node


def _renderer_text(node, default):
    """Return the text of a node given either as 'simpleText' or as 'runs'."""
    if isinstance(node, dict):
        simple = node.get('simpleText')
        if isinstance(simple, str) and simple:
            return simple
        runs = node.get('runs')
        if isinstance(runs, list):
            joined = ''.join(
                run.get('text', '') for run in runs if isinstance(run, dict)
            )
            if joined:
                return joined
    return default


def _extract_initial_data(html):
    """Decode the JSON object assigned to ``ytInitialData`` in *html*, or None."""
    marker = re.search(r'var ytInitialData\s*=\s*', html)
    if marker is None:
        return None
    try:
        # raw_decode stops at the end of the first complete JSON value, so
        # a "};" inside a string cannot truncate the document.
        data, _ = json.JSONDecoder().raw_decode(html, marker.end())
    except json.JSONDecodeError:
        return None
    return data if isinstance(data, dict) else None


def search_via_web_scrape(query):
    """Fallback search: scrape the YouTube results page with ``requests``.

    Args:
        query: Free-text search query.

    Returns:
        A list of up to 10 dicts with ``title``, ``url``, ``channel``,
        ``duration`` and ``views`` keys, or ``None`` when ``requests`` is
        not installed, the HTTP request fails, or the page carries no
        decodable ``ytInitialData`` object.
    """
    # Only requests is needed. The original also imported bs4 without using
    # it, which disabled this fallback whenever bs4 was not installed.
    try:
        import requests
    except ImportError:
        return None

    search_url = f"https://www.youtube.com/results?search_query={quote(query)}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        response = requests.get(search_url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return None

    data = _extract_initial_data(response.text)
    if data is None:
        return None

    sections = _dig(
        data,
        'contents', 'twoColumnSearchResultsTabsRenderer', 'tabs', 0,
        'tabRenderer', 'content', 'sectionListRenderer', 'contents',
    )
    if not isinstance(sections, list):
        sections = []

    videos = []
    for section in sections:
        items = _dig(section, 'itemSectionRenderer', 'contents')
        if not isinstance(items, list):
            continue

        for item in items:
            video = _dig(item, 'videoRenderer')
            if not isinstance(video, dict):
                continue

            # Without an id no usable watch URL can be built.
            video_id = video.get('videoId')
            if not video_id:
                continue

            videos.append({
                'title': _renderer_text(video.get('title'), 'Unknown'),
                'url': f"https://www.youtube.com/watch?v={video_id}",
                'channel': _renderer_text(video.get('longBylineText'), 'Unknown'),
                'duration': _renderer_text(video.get('lengthText'), 'N/A'),
                'views': _renderer_text(video.get('viewCountText'), 'N/A'),
            })
            if len(videos) >= 10:
                return videos

    return videos
|
||||
|
||||
def format_results(videos):
    """Render search results as a numbered, human-readable listing.

    Args:
        videos: List of dicts with ``title``, ``channel``, ``views`` and
            ``url`` keys and an optional ``duration`` key.

    Returns:
        A multi-line string: a header, one four-line entry per video
        (entries separated by a blank line) and a closing prompt.
    """
    lines = ["", "📺 Top 10 YouTube Videos for this search:", ""]

    for i, video in enumerate(videos, 1):
        lines.extend([
            f"{i}. {video['title']}",
            f" Channel: {video['channel']}",
            f" Views: {video['views']} • Duration: {video.get('duration', 'N/A')}",
            f" URL: {video['url']}",
            "",
        ])

    lines.append("➡️ Respond with the video number (1-10) to summarize that video.")
    lines.append("")

    # Join with newlines: an empty separator ran the title, channel, views
    # and URL of each entry together on one line.
    return "\n".join(lines)
|
||||
|
||||
def main():
    """Command-line entry point: search YouTube for the query in ``sys.argv``.

    All command-line arguments are joined into one query. The yt-dlp search
    is tried first and web scraping second. Results are printed as a
    readable listing followed by the same data as JSON. Exits with status 1
    when the query is missing or blank, or when nothing is found.
    """
    query = " ".join(sys.argv[1:]).strip()

    # A blank query ("" or only spaces) is as unusable as a missing one.
    if not query:
        print("Usage: python3 youtube-search.py \"search query\"")
        sys.exit(1)

    # Try yt-dlp first (most reliable)
    videos = search_via_yt_dlp(query)

    # Fallback to web scraping (also when yt-dlp returned an empty list)
    if not videos:
        videos = search_via_web_scrape(query)

    if not videos:
        print("❌ No videos found. Try a different search query or check your internet connection.")
        sys.exit(1)

    print(format_results(videos))
    # Also output JSON for programmatic access
    print("\n<!-- JSON Data (for tool processing) -->")
    print(json.dumps(videos, indent=2))


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user