69 lines
1.8 KiB
Bash
Executable File
69 lines
1.8 KiB
Bash
Executable File
#!/bin/bash
#
# YouTube Transcript Extractor
#
# Fetches the English subtitle track (manual or auto-generated) of a YouTube
# video with yt-dlp and flattens it into a single line of plain text.
#
# Usage:   <script> <youtube-url> [output-file]
# Output:  cleaned transcript written to output-file (default: transcript.txt),
#          followed by a character count and a 200-byte preview on stdout.
# Exit:    1 when no URL is given, when yt-dlp cannot be installed, or when no
#          English subtitle file was produced.

set -euo pipefail

# Scratch directory for the downloaded subtitle file. Kept global (not local)
# so the EXIT trap can still see it after main() has returned.
WORK_DIR=""

# Command used to invoke yt-dlp; set by resolve_yt_dlp.
YT_DLP=""

# Remove the scratch directory on any exit path.
cleanup() {
  if [[ -n "$WORK_DIR" ]]; then
    rm -rf -- "$WORK_DIR"
  fi
}

# Print usage to stderr and exit with status 1.
usage() {
  echo "Usage: $0 <youtube-url> [output-file]" >&2
  echo "Example: $0 'https://www.youtube.com/watch?v=dQw4w9WgXcQ' transcript.txt" >&2
  exit 1
}

# Locate yt-dlp: prefer one on PATH, then ~/yt-dlp, otherwise download the
# latest release to ~/yt-dlp. Sets the global YT_DLP.
resolve_yt_dlp() {
  if command -v yt-dlp >/dev/null 2>&1; then
    YT_DLP="yt-dlp"
    return 0
  fi

  local fallback="$HOME/yt-dlp"
  if [[ ! -f "$fallback" ]]; then
    echo "Error: yt-dlp not found. Installing to ~/yt-dlp..." >&2
    # -f makes curl fail on HTTP errors instead of saving the error page.
    # Download under a temporary name so an interrupted transfer is never
    # mistaken for an installed binary on the next run.
    if ! curl -fL https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp \
        -o "$fallback.download"; then
      rm -f -- "$fallback.download"
      echo "Error: failed to download yt-dlp." >&2
      exit 1
    fi
    chmod +x "$fallback.download"
    mv -f -- "$fallback.download" "$fallback"
  fi
  YT_DLP="$fallback"
}

# Convert a WebVTT file ($1) to plain text on stdout: one line, words
# separated by single spaces, no trailing newline.
clean_vtt() {
  awk '
    # Strip inline tags, normalise whitespace, and print the text unless it is
    # empty or identical to the previously printed line (rolling auto-captions
    # repeat each line across consecutive cues).
    function emit(text) {
      gsub(/<[^>]*>/, "", text)
      gsub(/[ \t]+/, " ", text)
      sub(/^ /, "", text)
      sub(/ $/, "", text)
      if (text == "" || text == last) return
      printf "%s%s", sep, text
      sep = " "
      last = text
    }

    # Tolerate CRLF line endings.
    { sub(/\r$/, "") }

    # Cue timing line. A digits-only line held from just before it was a cue
    # identifier, so it is discarded rather than emitted.
    /-->/ { in_body = 1; held = ""; next }

    # Everything before the first timing line is header (WEBVTT, Kind:,
    # Language:, ...).
    !in_body { next }

    # The held digits-only line was not followed by a timing line, so it was
    # real caption text.
    { if (held != "") { emit(held); held = "" } }

    # Digits-only line: could be a cue identifier; decide on the next line.
    /^[0-9]+$/ { held = $0; next }

    { emit($0) }

    END { if (held != "") emit(held) }
  ' "$1"
}

main() {
  if (( $# == 0 )); then
    usage
  fi

  local video_url="$1"
  local output_file="${2:-transcript.txt}"

  resolve_yt_dlp

  echo "Extracting transcript from: $video_url"

  # Work in a private directory so nothing in the current directory is
  # overwritten and a stale subtitle file from an earlier run is never reused.
  WORK_DIR=$(mktemp -d)
  trap cleanup EXIT

  local vtt_file="$WORK_DIR/transcript.en.vtt"

  # yt-dlp output is intentionally silenced; success is judged by whether the
  # subtitle file exists afterwards. "--" keeps a URL or ID that starts with
  # "-" from being parsed as an option.
  "$YT_DLP" --write-auto-sub --write-sub --sub-lang en --skip-download \
    --sub-format vtt -o "$WORK_DIR/transcript" --quiet \
    -- "$video_url" 2>/dev/null || {
    echo "Warning: Some warnings occurred during extraction, but continuing..." >&2
  }

  if [[ ! -f "$vtt_file" ]]; then
    echo "Error: Failed to extract transcript. Video may not have captions available." >&2
    exit 1
  fi

  echo "Cleaning transcript..."
  clean_vtt "$vtt_file" > "$output_file"

  echo "Transcript saved to: $output_file"
  echo "Character count: $(wc -c < "$output_file")"

  # Show the first 200 bytes as a preview.
  printf '\nPreview:\n'
  head -c 200 "$output_file"
  printf '...\n\n'
}

main "$@"