AI Newsletter Digest improvements: fixed QP soft line break decoding, URL extraction, and content cleaning
This commit is contained in:
150
archive/inactive-skills/tube-summary/scripts/youtube-search.py
Normal file
150
archive/inactive-skills/tube-summary/scripts/youtube-search.py
Normal file
@@ -0,0 +1,150 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
YouTube Search Script for tube-summary skill
|
||||
|
||||
Searches YouTube for videos on a given topic and returns top 10 results.
|
||||
Falls back to web scraping if API is unavailable.
|
||||
|
||||
Usage: python3 youtube-search.py "search query"
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import subprocess
|
||||
from urllib.parse import quote, urljoin
|
||||
import re
|
||||
|
||||
def search_via_yt_dlp(query):
    """Search YouTube using the ``yt-dlp`` command-line tool.

    Runs ``yt-dlp ytsearch10:<query> --dump-json --flat-playlist`` and parses
    the newline-delimited JSON written to stdout (one object per video).

    Args:
        query: Free-text search terms.

    Returns:
        A list of at most 10 dicts with the keys ``title``, ``url``,
        ``channel``, ``duration`` and ``views`` when yt-dlp exits with status
        0 (the list may be empty), or ``None`` when yt-dlp cannot be started,
        times out, or exits with a non-zero status.
    """
    cmd = [
        'yt-dlp',
        f'ytsearch10:{query}',
        '--dump-json',
        '--flat-playlist',
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
    except (subprocess.SubprocessError, OSError, ValueError):
        # Best-effort lookup: a missing binary (OSError), a timeout
        # (SubprocessError) or undecodable output / invalid argument
        # (ValueError) all mean "no answer from yt-dlp", so the caller can
        # fall back. Programming errors are deliberately not swallowed.
        return None

    if result.returncode != 0:
        return None

    videos = []
    for line in result.stdout.splitlines():
        if not line.strip():
            continue
        try:
            entry = json.loads(line)
        except json.JSONDecodeError:
            # Skip stray non-JSON output lines instead of losing everything.
            continue
        if not isinstance(entry, dict):
            continue

        video_id = entry.get('id')
        if not video_id:
            # Without an id the watch URL would be a dead link.
            continue

        # ``.get(key, default)`` does not cover JSON nulls, so normalise
        # explicitly to keep "Unknown" / "N/A" placeholders in the output.
        views = entry.get('view_count')
        videos.append({
            'title': entry.get('title') or 'Unknown',
            'url': f"https://www.youtube.com/watch?v={video_id}",
            'channel': entry.get('channel') or entry.get('uploader') or 'Unknown',
            'duration': entry.get('duration') or 0,
            'views': 'N/A' if views is None else views,
        })

    return videos[:10]
|
||||
|
||||
def _yt_node_text(node, default):
    """Return the display text of a YouTube text node, or *default*.

    Accepts both the ``{"simpleText": ...}`` shape and the
    ``{"runs": [{"text": ...}, ...]}`` shape (runs are concatenated).
    """
    if not isinstance(node, dict):
        return default
    simple = node.get('simpleText')
    if simple:
        return simple
    runs = node.get('runs') or []
    text = "".join(run.get('text', '') for run in runs if isinstance(run, dict))
    return text or default


def _extract_yt_initial_data(html):
    """Return the ``ytInitialData`` object embedded in *html*, or ``None``."""
    marker = re.search(r'var ytInitialData\s*=\s*', html)
    if marker is None:
        return None
    try:
        # raw_decode stops at the end of the JSON value itself, so a "};"
        # inside a string (e.g. a video title) cannot truncate the payload.
        data, _ = json.JSONDecoder().raw_decode(html, marker.end())
    except json.JSONDecodeError:
        return None
    return data if isinstance(data, dict) else None


def _collect_yt_videos(data, limit):
    """Walk the search-results structure and return up to *limit* videos."""
    tabs = (
        data.get('contents', {})
        .get('twoColumnSearchResultsTabsRenderer', {})
        .get('tabs')
    ) or [{}]
    sections = (
        tabs[0]
        .get('tabRenderer', {})
        .get('content', {})
        .get('sectionListRenderer', {})
        .get('contents', [])
    )

    videos = []
    for section in sections:
        items = section.get('itemSectionRenderer', {}).get('contents', [])
        for item in items:
            video = item.get('videoRenderer')
            if not video or not video.get('videoId'):
                # Not a video entry, or no id to build a working URL from.
                continue
            videos.append({
                'title': _yt_node_text(video.get('title'), 'Unknown'),
                'url': f"https://www.youtube.com/watch?v={video['videoId']}",
                # NOTE(review): this node appears to arrive in the "runs"
                # shape rather than "simpleText" - confirm against a live page.
                'channel': _yt_node_text(video.get('longBylineText'), 'Unknown'),
                'duration': _yt_node_text(video.get('lengthText'), 'N/A'),
                'views': _yt_node_text(video.get('viewCountText'), 'N/A'),
            })
            if len(videos) >= limit:
                return videos
    return videos


def search_via_web_scrape(query):
    """Fallback search: scrape the YouTube results page via ``requests``.

    Fetches ``https://www.youtube.com/results`` for *query*, extracts the
    embedded ``ytInitialData`` JSON and reads video entries from it.

    Args:
        query: Free-text search terms.

    Returns:
        A list of at most 10 dicts with the keys ``title``, ``url``,
        ``channel``, ``duration`` and ``views`` (possibly empty), or ``None``
        when ``requests`` is not installed, the HTTP request fails, or the
        page does not contain parseable search data.
    """
    try:
        # Imported lazily so the script still runs (yt-dlp path) without it.
        import requests
    except ImportError:
        return None

    search_url = f"https://www.youtube.com/results?search_query={quote(query)}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        response = requests.get(search_url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        # Network failure, timeout or HTTP error status: nothing to parse.
        return None

    data = _extract_yt_initial_data(response.text)
    if data is None:
        return None

    try:
        return _collect_yt_videos(data, 10)
    except (AttributeError, KeyError, IndexError, TypeError):
        # The page layout is undocumented and may change; treat an
        # unexpected structure as "no result" rather than crashing.
        return None
|
||||
|
||||
def format_results(videos):
    """Format video results as a human-readable numbered list.

    Args:
        videos: Iterable of dicts with the keys ``title``, ``channel``,
            ``views`` and ``url``; ``duration`` is optional and shown as
            ``N/A`` when absent.

    Returns:
        A single string: a header, one numbered entry per video (title,
        channel, views/duration and URL each on their own line) and a
        closing prompt.

    Raises:
        KeyError: If a video lacks ``title``, ``channel``, ``views`` or
            ``url``.
    """
    lines = ["\n📺 Top 10 YouTube Videos for this search:\n"]

    for i, video in enumerate(videos, 1):
        lines.append(f"{i}. {video['title']}")
        lines.append(f" Channel: {video['channel']}")
        lines.append(f" Views: {video['views']} • Duration: {video.get('duration', 'N/A')}")
        lines.append(f" URL: {video['url']}\n")

    lines.append("\n➡️ Respond with the video number (1-10) to summarize that video.\n")
    # Join with newlines: the fragments above carry no line terminators of
    # their own, so an empty separator would run each entry together.
    return "\n".join(lines)
|
||||
|
||||
def main():
    """Command-line entry point: search YouTube and print the results.

    Joins all command-line arguments into one query, tries
    ``search_via_yt_dlp`` first and ``search_via_web_scrape`` when that
    yields nothing, then prints the formatted list followed by the same
    data as JSON.

    Exits with status 1 when no (or only a blank) query is given, or when
    neither search method returns any video.
    """
    # Strip so that an empty or whitespace-only argument is treated like a
    # missing one instead of triggering a search that cannot succeed.
    query = " ".join(sys.argv[1:]).strip()
    if not query:
        print("Usage: python3 youtube-search.py \"search query\"")
        sys.exit(1)

    # Try yt-dlp first (most reliable)
    videos = search_via_yt_dlp(query)

    # Fallback to web scraping
    if not videos:
        videos = search_via_web_scrape(query)

    if not videos:
        print("❌ No videos found. Try a different search query or check your internet connection.")
        sys.exit(1)

    print(format_results(videos))
    # Also output JSON for programmatic access
    print("\n<!-- JSON Data (for tool processing) -->")
    print(json.dumps(videos, indent=2))
|
||||
|
||||
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user