Files
openclaw-backups/skills/youtube-transcript-analyzer/scripts/extract_transcript.sh
2026-02-20 13:01:46 +00:00

69 lines
1.8 KiB
Bash

#!/bin/bash
# YouTube Transcript Extractor
# Extracts and cleans transcript from YouTube video URL
set -e
if [ $# -eq 0 ]; then
echo "Usage: $0 <youtube-url> [output-file]"
echo "Example: $0 'https://www.youtube.com/watch?v=dQw4w9WgXcQ' transcript.txt"
exit 1
fi
VIDEO_URL="$1"
OUTPUT_FILE="${2:-transcript.txt}"
# Check if yt-dlp exists
if ! command -v yt-dlp &> /dev/null; then
# Try to find it in home directory
if [ -f "$HOME/yt-dlp" ]; then
YT_DLP="$HOME/yt-dlp"
else
echo "Error: yt-dlp not found. Installing to ~/yt-dlp..."
curl -L https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -o "$HOME/yt-dlp"
chmod +x "$HOME/yt-dlp"
YT_DLP="$HOME/yt-dlp"
fi
else
YT_DLP="yt-dlp"
fi
echo "Extracting transcript from: $VIDEO_URL"
# Extract transcript as VTT subtitle file
$YT_DLP --write-auto-sub --write-sub --sub-lang en --skip-download --sub-format vtt "$VIDEO_URL" -o "temp_transcript" --quiet 2>/dev/null || {
echo "Warning: Some warnings occurred during extraction, but continuing..."
}
# Check if transcript was extracted
if [ ! -f "temp_transcript.en.vtt" ]; then
echo "Error: Failed to extract transcript. Video may not have captions available."
exit 1
fi
echo "Cleaning transcript..."
# Clean up the VTT format to extract just the text
grep -v "^[0-9]" temp_transcript.en.vtt | \
grep -v "^WEBVTT" | \
grep -v "^Kind:" | \
grep -v "^Language:" | \
grep -v "^\s*$" | \
grep -v "align:start" | \
sed 's/<[^>]*>//g' | \
tr '\n' ' ' | \
sed 's/ */ /g' | \
sed 's/^ *//g' | \
sed 's/ *$//g' > "$OUTPUT_FILE"
# Clean up temporary file
rm -f temp_transcript.en.vtt
echo "Transcript saved to: $OUTPUT_FILE"
echo "Character count: $(wc -c < "$OUTPUT_FILE")"
# Show first few lines as preview
echo -e "\nPreview:"
head -c 200 "$OUTPUT_FILE"
echo -e "...\n"