#!/bin/bash # YouTube Transcript Extractor # Extracts and cleans transcript from YouTube video URL set -e if [ $# -eq 0 ]; then echo "Usage: $0 [output-file]" echo "Example: $0 'https://www.youtube.com/watch?v=dQw4w9WgXcQ' transcript.txt" exit 1 fi VIDEO_URL="$1" OUTPUT_FILE="${2:-transcript.txt}" # Check if yt-dlp exists if ! command -v yt-dlp &> /dev/null; then # Try to find it in home directory if [ -f "$HOME/yt-dlp" ]; then YT_DLP="$HOME/yt-dlp" else echo "Error: yt-dlp not found. Installing to ~/yt-dlp..." curl -L https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -o "$HOME/yt-dlp" chmod +x "$HOME/yt-dlp" YT_DLP="$HOME/yt-dlp" fi else YT_DLP="yt-dlp" fi echo "Extracting transcript from: $VIDEO_URL" # Extract transcript as VTT subtitle file $YT_DLP --write-auto-sub --write-sub --sub-lang en --skip-download --sub-format vtt "$VIDEO_URL" -o "temp_transcript" --quiet 2>/dev/null || { echo "Warning: Some warnings occurred during extraction, but continuing..." } # Check if transcript was extracted if [ ! -f "temp_transcript.en.vtt" ]; then echo "Error: Failed to extract transcript. Video may not have captions available." exit 1 fi echo "Cleaning transcript..." # Clean up the VTT format to extract just the text grep -v "^[0-9]" temp_transcript.en.vtt | \ grep -v "^WEBVTT" | \ grep -v "^Kind:" | \ grep -v "^Language:" | \ grep -v "^\s*$" | \ grep -v "align:start" | \ sed 's/<[^>]*>//g' | \ tr '\n' ' ' | \ sed 's/ */ /g' | \ sed 's/^ *//g' | \ sed 's/ *$//g' > "$OUTPUT_FILE" # Clean up temporary file rm -f temp_transcript.en.vtt echo "Transcript saved to: $OUTPUT_FILE" echo "Character count: $(wc -c < "$OUTPUT_FILE")" # Show first few lines as preview echo -e "\nPreview:" head -c 200 "$OUTPUT_FILE" echo -e "...\n"