Improve transcribe api and add alias to transcribe twitch stream downloads

This commit is contained in:
Michael Campagnaro 2023-06-04 11:57:32 -04:00
parent 8ff9c338b5
commit e5e8f309e5
8 changed files with 157 additions and 103 deletions

130
aliases
View File

@ -278,6 +278,8 @@ custom_grep() {
local include_list=("$@") local include_list=("$@")
local include_arg="" local include_arg=""
if [[ $include_list != "" ]]; then if [[ $include_list != "" ]]; then
# We're looping like this instead of for var in "$@", because that way of looping is affecting
# my shell environment. Very strange!
for i in "${include_list[@]}"; do for i in "${include_list[@]}"; do
include_arg+="--include=\*${i} " include_arg+="--include=\*${i} "
done done
@ -511,18 +513,8 @@ download_twitch_chat() {
fi fi
} }
# Download Twitch videos, both VODs and live streams. Pass a Twitch account URL to download a live stream. # Copy pasta of download_twitch_vid with a final pass to transcribe the audio using whisper.cpp
# The live stream filename will not contain the stream title, so you'll need to modify it afterwards. download_twitch_vid_and_transcribe() {
#
# If you want to download subscriber-only vids then first extract your Twitch
# cookies to a file (can use cookies.txt add-on from Lennon Hill) and then pass it as an option,
# using the full path to the cookies file, e.g.
# `tw-1080p60 <url> --cookies /c/<cookie_path>/twitch_cookies.txt`
#
# To extract a portion of a video, you have to first download the entire file and then use the
# `trim-video` or `compress-video-and-trim` scripts.
#
download_twitch_vid() {
local format="$1" local format="$1"
local shortname="$2" local shortname="$2"
local compress="$3" local compress="$3"
@ -533,7 +525,7 @@ download_twitch_vid() {
if [[ $url == "" ]]; then if [[ $url == "" ]]; then
error "Usage: $0 <make folder?> <url> <optional args>" error "Usage: $0 <make folder?> <url> <optional args>"
return exit 1
fi fi
# We use yt-dlp to get the filename and then use streamlink to download it (the latter is a lot faster). # We use yt-dlp to get the filename and then use streamlink to download it (the latter is a lot faster).
@ -569,7 +561,7 @@ download_twitch_vid() {
if [[ $make_folder == "1" ]]; then if [[ $make_folder == "1" ]]; then
make_vid_dir_and_cd_into $url "" $opts make_vid_dir_and_cd_into $url "" $opts
if [[ $? -ne 0 ]]; then if [[ $? -ne 0 ]]; then
return exit 1
fi fi
fi fi
@ -607,6 +599,113 @@ download_twitch_vid() {
fi fi
else else
error "Error: Failed to download '$url'" error "Error: Failed to download '$url'"
exit 1
fi
transcribe-video "$filename" jon base small
if [[ $make_folder == "1" ]]; then
cd ..
fi
}
# Download Twitch videos, both VODs and live streams. Pass a Twitch account URL to download a live stream.
# The live stream filename will not contain the stream title, so you'll need to modify it afterwards.
#
# If you want to download subscriber-only vids then first extract your Twitch
# cookies to a file (can use cookies.txt add-on from Lennon Hill) and then pass it as an option,
# using the full path to the cookies file, e.g.
# `tw-1080p60 <url> --cookies /c/<cookie_path>/twitch_cookies.txt`
#
# To extract a portion of a video, you have to first download the entire file and then use the
# `trim-video` or `compress-video-and-trim` scripts.
#
download_twitch_vid() {
local format="$1"
local shortname="$2"
local compress="$3"
local make_folder="$4"
local url="$5"
shift 5
local opts="$@"
if [[ $url == "" ]]; then
error "Usage: $0 <make folder?> <url> <optional args>"
exit 1
fi
# We use yt-dlp to get the filename and then use streamlink to download it (the latter is a lot faster).
# It's a two step process because streamlink cannot pass the formatted filename to ffmpeg.
# We fallback to yt-dlp when it's a subscriber VOD because we don't have an easy way to access it with streamlink.
local subscriber_vod=0
local split_opts=($opts)
if [[ ${split_opts[0]} == "--cookies" ]]; then
subscriber_vod=1
printf "${BOLD}Subscriber VOD. Will use yt-dlp to download.${NORMAL}\n"
fi
if [[ $compress -eq 0 ]]; then
printf "${BOLD}Downloading Twitch vid with no compression.${NORMAL}\n"
else
printf "${BOLD}Downloading Twitch vid with compression.${NORMAL}\n"
fi
local yt_dlp_format=""
local streamlink_format=""
if [[ $format == "" ]]; then
# Twitch only supplies pre-merged mp4s so we can ask for the best format and not worry about anything else.
printf "${BOLD}No format given; using best available.${NORMAL}\n"
yt_dlp_format="b"
streamlink_format="best"
else
yt_dlp_format="$format"
streamlink_format="$format"
fi
if [[ $make_folder == "1" ]]; then
make_vid_dir_and_cd_into $url "" $opts
if [[ $? -ne 0 ]]; then
exit 1
fi
fi
if [[ $shortname -eq 0 ]]; then
local name_format="%(upload_date>%Y-%m-%d)s-%(title)s-tw-%(id)s"
else
local name_format="%(upload_date>%Y-%m-%d)s-shortname-tw-%(id)s"
fi
# Download Twitch chat transcript
actually_download_twitch_chat $url "$(yt-dlp.exe --get-filename -o "$name_format" $opts $url)"
# Get the video filename.
local filename=$(yt-dlp.exe --get-filename -o "$name_format.%(ext)s" $opts $url)
# Download
if [[ $subscriber_vod -eq 0 ]]; then
local cmd="streamlink.exe --twitch-low-latency --twitch-disable-ads --twitch-disable-hosting --force --force-progress $opts $url $streamlink_format -O | ffmpeg -i pipe:0 -c copy \"$filename\""
else
local cmd="yt-dlp.exe -f $yt_dlp_format -o \"$filename\" $opts $url"
fi
eval $cmd # Need to eval in order to preserve the quotes wrapping the filename format string.
error=$?
if [[ $error -eq 0 ]]; then
if [[ $compress -eq 1 ]]; then
local temp_name="temp_${RANDOM}"
# 0=cpu, 1=gpu
compress-video "$filename" "$temp_name" 0
extension="${filename##*.}"
mv "$filename" "orig_$filename"
mv $temp_name.$extension "$filename"
printf "${BOLD}Make sure to delete the original video file${NORMAL}\n"
fi
else
error "Error: Failed to download '$url'"
exit 1
fi fi
if [[ $make_folder == "1" ]]; then if [[ $make_folder == "1" ]]; then
@ -815,6 +914,7 @@ alias yt-and-hflip='download_youtube_vid_and_hflip "137+140"' # 1080p
# Twitch Vid DL # Twitch Vid DL
alias tw='download_twitch_vid "" $SHORTNAME_OFF $COMPRESSION_OFF' alias tw='download_twitch_vid "" $SHORTNAME_OFF $COMPRESSION_OFF'
alias twt='download_twitch_vid_and_transcribe "" $SHORTNAME_OFF $COMPRESSION_OFF'
alias tw-compressed='download_twitch_vid "" $SHORTNAME_OFF $COMPRESSION_ON' alias tw-compressed='download_twitch_vid "" $SHORTNAME_OFF $COMPRESSION_ON'
alias tw-shortname='download_twitch_vid "" $SHORTNAME_ON $COMPRESSION_OFF' alias tw-shortname='download_twitch_vid "" $SHORTNAME_ON $COMPRESSION_OFF'
alias tw-shortname-compressed='download_twitch_vid "" $SHORTNAME_ON $COMPRESSION_ON' alias tw-shortname-compressed='download_twitch_vid "" $SHORTNAME_ON $COMPRESSION_ON'
@ -856,7 +956,7 @@ alias vimeo-compressed='download_vimeo_vid "Original" $SHORTNAME_OFF $COMPRESSIO
alias ig-download-and-hflip='download_instagram_vid_and_hflip ' alias ig-download-and-hflip='download_instagram_vid_and_hflip '
# Twitter Vid DL # Twitter Vid DL
alias twt='download_twitter_vid "" ' alias twitter='download_twitter_vid "" '
# Misc # Misc
alias download-mp4='download_mp4' alias download-mp4='download_mp4'

View File

@ -23,6 +23,23 @@ else
NORMAL="" NORMAL=""
fi fi
# Prints the canonical form of a path as reported by `readlink -m` (symlinks
# resolved; path components are not required to exist). If the path's root is
# the home directory symbol "~" then it is first replaced by the full home
# path, which covers a tilde that reached us unexpanded (e.g. it was quoted).
#
# Arguments: $1 - the path to expand
# Outputs:   the expanded path on stdout, followed by a newline
expand_path() {
  local ret="$1"

  # Only a bare leading "~" component is substituted; "~user/..." is left alone.
  if [[ "$ret" == "~" || "$ret" == "~/"* ]]; then
    ret="${HOME}${ret:1}"
  fi

  ret=$(readlink -m -- "$ret")

  # Quoted on purpose: an unquoted expansion would collapse runs of whitespace
  # and glob-expand characters such as "*" that appear in the path.
  printf '%s\n' "$ret"
}
input_wav="$1" input_wav="$1"
output_name_without_ext="$2" output_name_without_ext="$2"
model="$3" model="$3"
@ -49,6 +66,14 @@ output_name="$output_name_without_ext.${model}"
printf "\n${YELLOW}${BOLD}Transcribing $input_wav | model: $model | threads: $threads | output: $output_name ${NORMAL}\n" printf "\n${YELLOW}${BOLD}Transcribing $input_wav | model: $model | threads: $threads | output: $output_name ${NORMAL}\n"
whisper.exe --threads ${threads} -m $JELLYPIXEL_OPENSOURCE_DEV/whisper.cpp/models/ggml-${model}.en.bin -otxt -osrt -f "$input_wav" -of "$output_name" --print-colors whisper_fullname="$(expand_path $(which whisper.exe))"
whisper_path="$(dirname $whisper_fullname)"
models_path="$whisper_path/models"
printf "${GREEN}${BOLD}Done transcribing $input_wav | model: $model | threads: $threads | output: $output_name${NORMAL}\n" whisper.exe --threads ${threads} -m "$models_path/ggml-${model}.en.bin" -otxt -osrt -f "$input_wav" -of "$output_name" --print-colors
error=$?
if [[ error -eq 0 ]]; then
printf "${GREEN}${BOLD}Done transcribing $input_wav | model: $model | threads: $threads | output: $output_name${NORMAL}\n"
else
printf "${GREEN}${BOLD}Error while transcribing $input_wav | model: $model | threads: $threads | output: $output_name${NORMAL}\n"
fi

View File

@ -25,11 +25,11 @@ fi
input_video="$1" input_video="$1"
output_name_without_ext="$2" output_name_without_ext="$2"
model="$3" shift 2
threads=$4 models="$@"
if [[ $input_video == "" || $output_name_without_ext == "" || $model == "" ]]; then if [[ $input_video == "" || $output_name_without_ext == "" || $models == "" ]]; then
printf "${BOLD}${RED}Usage: $0 <input.wav> <output name without extension> <model name> <optional: thread count>${NORMAL}\n" printf "${BOLD}${RED}Usage: $0 <input.wav> <output name without extension> <list of model names to use>${NORMAL}\n"
exit 1 exit 1
fi fi
@ -45,11 +45,13 @@ fi
extract-16bit-wav-from-video "$input_video" "$wav_name" extract-16bit-wav-from-video "$input_video" "$wav_name"
if [[ $? == 1 ]]; then exit 1; fi if [[ $? == 1 ]]; then exit 1; fi
transcribe-audio "$wav_name" "$output_name_without_ext" "$model" $threads for model in "$@"; do
if [[ $? == 1 ]]; then transcribe-audio "$wav_name" "$output_name_without_ext" "${model}"
printf "${RED}${BOLD}Saving the audio file \"$wav_name\" in case you want to reuse it for debugging.\n${NORMAL}" if [[ $? == 1 ]]; then
exit 1 printf "${RED}${BOLD}Saving the audio file \"$wav_name\" in case you want to reuse it for debugging.\n${NORMAL}"
fi exit 1
fi
done
rm "$wav_name" rm "$wav_name"

View File

@ -1,4 +1,2 @@
#!/usr/bin/env bash #!/usr/bin/env bash
transcribe-video "$1" "$2" "base"
transcribe-video-with-model "$1" "$2" "base" "$3"

View File

@ -1,65 +0,0 @@
#!/usr/bin/env bash
#
# Extracts the audio of a video to a temporary 16-bit wav once, then transcribes
# that same wav with the tiny, base and small models in turn.
#
# Usage: $0 <input video> <output name without extension> [thread count]
#
# I was originally just using three calls to transcribe-video-with-model but I want to reuse the same audio input, so this
# is mostly a copy pasta of that file.

if command -v tput >/dev/null 2>&1; then
  ncolors=$(tput colors)
fi
# Only emit colour codes when stdout is a terminal that reports at least 8 colours.
if [ -t 1 ] && [ -n "$ncolors" ] && [ "$ncolors" -ge 8 ]; then
  RED="$(tput setaf 1)"
  GREEN="$(tput setaf 2)"
  YELLOW="$(tput setaf 3)"
  BLUE="$(tput setaf 4)"
  MAGENTA="$(tput setaf 5)"
  CYAN="$(tput setaf 6)"
  BOLD="$(tput bold)"
  NORMAL="$(tput sgr0)"
else
  RED=""
  GREEN=""
  YELLOW=""
  BLUE=""
  MAGENTA=""
  CYAN=""
  BOLD=""
  NORMAL=""
fi

input_video="$1"
output_name_without_ext="$2"
threads=$3

if [[ $input_video == "" || $output_name_without_ext == "" ]]; then
  # Message is passed as an argument, not as the format, so a "%" in $0 is printed literally.
  printf '%s\n' "${BOLD}${RED}Usage: $0 <input.wav> <output name without extension> <optional: thread count>${NORMAL}"
  exit 1
fi

wav_name="${output_name_without_ext}_audio_${RANDOM}.wav"

# Any non-zero status is a failure, not just 1 (e.g. 127 when the tool is missing).
if ! extract-16bit-wav-from-video "$input_video" "$wav_name"; then
  exit 1
fi

#
# Tiny model first to have something quickly banged out. base and small have similar output quality. Neither are perfect.
#
for model in tiny base small; do
  # The thread count is optional: when it is empty no fourth argument is passed at all.
  if ! transcribe-audio "$wav_name" "$output_name_without_ext" "$model" ${threads:+"$threads"}; then
    # The wav is deliberately left on disk on failure.
    printf '%s\n%s' "${RED}${BOLD}Saving the audio file \"$wav_name\" in case you want to reuse it for debugging." "${NORMAL}"
    exit 1
  fi
done

rm -- "$wav_name"

View File

@ -1,4 +1,2 @@
#!/usr/bin/env bash #!/usr/bin/env bash
transcribe-video "$1" "$2" "medium"
transcribe-video-with-model "$1" "$2" "medium" "$3"

View File

@ -1,4 +1,2 @@
#!/usr/bin/env bash #!/usr/bin/env bash
transcribe-video "$1" "$2" "small"
transcribe-video-with-model "$1" "$2" "small" "$3"

View File

@ -1,4 +1,2 @@
#!/usr/bin/env bash #!/usr/bin/env bash
transcribe-video "$1" "$2" "tiny"
transcribe-video-with-model "$1" "$2" "tiny" "$3"