Add scripts for audio transcription using whisper.cpp
This commit is contained in:
parent
06bbc0c995
commit
173bda9eb0
50
bin/extract-16bit-wav-from-video
Normal file
50
bin/extract-16bit-wav-from-video
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
#!/usr/bin/env bash
#
# Extract the audio of a video into a 16 kHz, mono, signed 16-bit PCM WAV file
# using ffmpeg.
#
# Usage: extract-16bit-wav-from-video <input video> <wav output name>
#
# ".wav" is appended to the output name when it does not already end with it.
# Exits 1 on missing arguments, a missing input file, or an ffmpeg failure.

# Colour codes stay empty unless stdout is a terminal that reports >= 8 colours.
RED="" GREEN="" YELLOW="" BLUE="" MAGENTA="" CYAN="" BOLD="" NORMAL=""
if [[ -t 1 ]] && command -v tput >/dev/null 2>&1; then
    ncolors=$(tput colors 2>/dev/null)
    if [[ $ncolors =~ ^[0-9]+$ ]] && (( ncolors >= 8 )); then
        RED="$(tput setaf 1)"
        GREEN="$(tput setaf 2)"
        YELLOW="$(tput setaf 3)"
        BLUE="$(tput setaf 4)"
        MAGENTA="$(tput setaf 5)"
        CYAN="$(tput setaf 6)"
        BOLD="$(tput bold)"
        NORMAL="$(tput sgr0)"
    fi
fi

input="$1"
output_name="$2"

# File names are always passed as printf arguments, never as part of the format
# string, so a '%' or backslash in a name cannot corrupt the message.
if [[ -z $input || -z $output_name ]]; then
    printf '%sUsage: %s <input video> <wav output name>%s\n' "${BOLD}${RED}" "$0" "$NORMAL"
    exit 1
fi

if [[ ! -f "$input" ]]; then
    printf "%sError: failed to extract audio. Video file \"%s\" doesn't exist.\n%s" \
        "${RED}${BOLD}" "$input" "$NORMAL"
    exit 1
fi

# Add extension if not provided. A suffix match is used so that a dot-less name
# such as "wav" is not mistaken for a name that already has the extension.
if [[ $output_name != *.wav ]]; then
    output_name="${output_name}.wav"
fi

printf '\n%sExtracting 16-bit WAV from %s | output: %s%s\n' \
    "${YELLOW}${BOLD}" "$input" "$output_name" "$NORMAL"

# Report ffmpeg failures with exit code 1; the transcribe-video-* callers test
# for exactly that code.
if ! ffmpeg -i "$input" -ar 16000 -ac 1 -c:a pcm_s16le "$output_name"; then
    printf '%sError: ffmpeg failed to extract audio from "%s".\n%s' \
        "${RED}${BOLD}" "$input" "$NORMAL"
    exit 1
fi

printf '%sDone extracting 16-bit WAV from %s | output: %s%s\n' \
    "${GREEN}${BOLD}" "$input" "$output_name" "$NORMAL"
|
|
@ -41,7 +41,7 @@ if [[ $bitrate == "" ]]; then
|
||||||
bitrate="64"
|
bitrate="64"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
printf "\n${YELLOW}${BOLD}Extracting audio from '$filename.$extension' | bitrate: ${bitrate}k | output: $output_name${NORMAL}\n"
|
printf "\n${YELLOW}${BOLD}Extracting audio from $filename.$extension | bitrate: ${bitrate}k | output: $output_name${NORMAL}\n"
|
||||||
|
|
||||||
if [[ $transcode == "1" ]]; then
|
if [[ $transcode == "1" ]]; then
|
||||||
# Transcode audio
|
# Transcode audio
|
||||||
|
@ -51,5 +51,5 @@ else
|
||||||
ffmpeg -i "$filename.$extension" -vn -acodec copy "$output_name"
|
ffmpeg -i "$filename.$extension" -vn -acodec copy "$output_name"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
printf "\n${GREEN}${BOLD}Done extracting audio from '$filename.$extension' | output name '$output_name'${NORMAL}\n\n"
|
printf "\n${GREEN}${BOLD}Done extracting audio from $filename.$extension | output name '$output_name'${NORMAL}\n\n"
|
||||||
|
|
||||||
|
|
54
bin/transcribe-audio
Normal file
54
bin/transcribe-audio
Normal file
|
@ -0,0 +1,54 @@
|
||||||
|
#!/usr/bin/env bash
#
# Transcribe a WAV file with whisper.cpp (whisper.exe), writing .txt and .srt
# output next to "<output name>.<model>".
#
# Usage: transcribe-audio <input.wav> <output name without extension> <model name> [thread count]
#
# Reads $JELLYPIXEL_OPENSOURCE_DEV to locate
# whisper.cpp/models/ggml-<model>.en.bin.
# Exits 1 on missing arguments, a missing input file, or a whisper.exe failure.

# Colour codes stay empty unless stdout is a terminal that reports >= 8 colours.
RED="" GREEN="" YELLOW="" BLUE="" MAGENTA="" CYAN="" BOLD="" NORMAL=""
if [[ -t 1 ]] && command -v tput >/dev/null 2>&1; then
    ncolors=$(tput colors 2>/dev/null)
    if [[ $ncolors =~ ^[0-9]+$ ]] && (( ncolors >= 8 )); then
        RED="$(tput setaf 1)"
        GREEN="$(tput setaf 2)"
        YELLOW="$(tput setaf 3)"
        BLUE="$(tput setaf 4)"
        MAGENTA="$(tput setaf 5)"
        CYAN="$(tput setaf 6)"
        BOLD="$(tput bold)"
        NORMAL="$(tput sgr0)"
    fi
fi

# 4 seems to be the sweet spot.
default_thread_count=4

input_wav="$1"
output_name_without_ext="$2"
model="$3"
# An omitted or empty thread count falls back to the default.
threads="${4:-$default_thread_count}"

# User-supplied values are passed as printf arguments, never embedded in the
# format string, so '%' or backslashes in names cannot corrupt the output.
if [[ -z $input_wav || -z $output_name_without_ext || -z $model ]]; then
    printf '%sUsage: %s <input.wav> <output name without extension> <model name> <optional: thread count>%s\n' \
        "${BOLD}${RED}" "$0" "$NORMAL"
    exit 1
fi

if [[ ! -f "$input_wav" ]]; then
    printf "%sInput file \"%s\" doesn't exist!\n%s" "${RED}${BOLD}" "$input_wav" "$NORMAL"
    exit 1
fi

output_name="$output_name_without_ext.${model}"
model_path="$JELLYPIXEL_OPENSOURCE_DEV/whisper.cpp/models/ggml-${model}.en.bin"

printf '\n%sTranscribing %s | model: %s | threads: %s | output: %s %s\n' \
    "${YELLOW}${BOLD}" "$input_wav" "$model" "$threads" "$output_name" "$NORMAL"

# Every argument is quoted so paths containing spaces survive word splitting.
# A failure is reported as exit code 1, which the transcribe-video-* callers
# test for in order to keep the intermediate WAV around.
if ! whisper.exe --threads "$threads" -m "$model_path" -otxt -osrt \
        -f "$input_wav" -of "$output_name" --print-colors; then
    printf '%sError: whisper.exe failed to transcribe "%s" with model "%s".\n%s' \
        "${RED}${BOLD}" "$input_wav" "$model" "$NORMAL"
    exit 1
fi

printf '%sDone transcribing %s | model: %s | threads: %s | output: %s%s\n' \
    "${GREEN}${BOLD}" "$input_wav" "$model" "$threads" "$output_name" "$NORMAL"
|
4
bin/transcribe-video-base
Normal file
4
bin/transcribe-video-base
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
#!/usr/bin/env bash
#
# Convenience wrapper: run transcribe-video-with-model with the "base" model.
#
# Usage: transcribe-video-base <input video> <output name without extension> [thread count]

input_video="$1"
output_name_without_ext="$2"
threads="$3"

transcribe-video-with-model "$input_video" "$output_name_without_ext" "base" "$threads"
|
||||||
|
|
65
bin/transcribe-video-batch
Normal file
65
bin/transcribe-video-batch
Normal file
|
@ -0,0 +1,65 @@
|
||||||
|
#!/usr/bin/env bash
#
# Transcribe one video with the tiny, base and small models in turn, extracting
# the audio only once and reusing it for every model.
#
# Usage: transcribe-video-batch <input video> <output name without extension> [thread count]
#
# I was originally just using three calls to transcribe-video-with-model but I
# want to reuse the same audio input, so this is mostly a copy of that file.

# Colour codes stay empty unless stdout is a terminal that reports >= 8 colours.
RED="" GREEN="" YELLOW="" BLUE="" MAGENTA="" CYAN="" BOLD="" NORMAL=""
if [[ -t 1 ]] && command -v tput >/dev/null 2>&1; then
    ncolors=$(tput colors 2>/dev/null)
    if [[ $ncolors =~ ^[0-9]+$ ]] && (( ncolors >= 8 )); then
        RED="$(tput setaf 1)"
        GREEN="$(tput setaf 2)"
        YELLOW="$(tput setaf 3)"
        BLUE="$(tput setaf 4)"
        MAGENTA="$(tput setaf 5)"
        CYAN="$(tput setaf 6)"
        BOLD="$(tput bold)"
        NORMAL="$(tput sgr0)"
    fi
fi

input_video="$1"
output_name_without_ext="$2"
threads="$3"

if [[ -z $input_video || -z $output_name_without_ext ]]; then
    # The first argument is a video, not a WAV file.
    printf '%sUsage: %s <input video> <output name without extension> <optional: thread count>%s\n' \
        "${BOLD}${RED}" "$0" "$NORMAL"
    exit 1
fi

# Intermediate audio file; the random suffix avoids clobbering an earlier run.
wav_name="${output_name_without_ext}_audio_${RANDOM}.wav"

# Any non-zero status (not just 1) aborts the batch.
extract-16bit-wav-from-video "$input_video" "$wav_name" || exit 1

#
# Tiny model first to have something quickly banged out. base and small have
# similar output quality. Neither are perfect.
#
for model in tiny base small; do
    # An empty thread count is passed through as an empty argument;
    # transcribe-audio treats that as "use the default".
    if ! transcribe-audio "$wav_name" "$output_name_without_ext" "$model" "$threads"; then
        printf '%sSaving the audio file "%s" in case you want to reuse it for debugging.\n%s' \
            "${RED}${BOLD}" "$wav_name" "$NORMAL"
        exit 1
    fi
done

# Every model succeeded, so the intermediate audio is no longer needed.
rm -- "$wav_name"
|
4
bin/transcribe-video-medium
Normal file
4
bin/transcribe-video-medium
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
#!/usr/bin/env bash
#
# Convenience wrapper: run transcribe-video-with-model with the "medium" model.
#
# Usage: transcribe-video-medium <input video> <output name without extension> [thread count]

input_video="$1"
output_name_without_ext="$2"
threads="$3"

transcribe-video-with-model "$input_video" "$output_name_without_ext" "medium" "$threads"
|
||||||
|
|
4
bin/transcribe-video-small
Normal file
4
bin/transcribe-video-small
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
#!/usr/bin/env bash
#
# Convenience wrapper: run transcribe-video-with-model with the "small" model.
#
# Usage: transcribe-video-small <input video> <output name without extension> [thread count]

input_video="$1"
output_name_without_ext="$2"
threads="$3"

transcribe-video-with-model "$input_video" "$output_name_without_ext" "small" "$threads"
|
||||||
|
|
4
bin/transcribe-video-tiny
Normal file
4
bin/transcribe-video-tiny
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
#!/usr/bin/env bash
#
# Convenience wrapper: run transcribe-video-with-model with the "tiny" model.
#
# Usage: transcribe-video-tiny <input video> <output name without extension> [thread count]

input_video="$1"
output_name_without_ext="$2"
threads="$3"

transcribe-video-with-model "$input_video" "$output_name_without_ext" "tiny" "$threads"
|
||||||
|
|
55
bin/transcribe-video-with-model
Normal file
55
bin/transcribe-video-with-model
Normal file
|
@ -0,0 +1,55 @@
|
||||||
|
#!/usr/bin/env bash
#
# Extract the audio of a video to a temporary WAV file and transcribe it with
# the given model, then delete the WAV file.
#
# Usage: transcribe-video-with-model <input video> <output name without extension> <model name> [thread count]
#
# If the transcription fails, the WAV file is kept for debugging and the
# script exits 1.

# Colour codes stay empty unless stdout is a terminal that reports >= 8 colours.
RED="" GREEN="" YELLOW="" BLUE="" MAGENTA="" CYAN="" BOLD="" NORMAL=""
if [[ -t 1 ]] && command -v tput >/dev/null 2>&1; then
    ncolors=$(tput colors 2>/dev/null)
    if [[ $ncolors =~ ^[0-9]+$ ]] && (( ncolors >= 8 )); then
        RED="$(tput setaf 1)"
        GREEN="$(tput setaf 2)"
        YELLOW="$(tput setaf 3)"
        BLUE="$(tput setaf 4)"
        MAGENTA="$(tput setaf 5)"
        CYAN="$(tput setaf 6)"
        BOLD="$(tput bold)"
        NORMAL="$(tput sgr0)"
    fi
fi

input_video="$1"
output_name_without_ext="$2"
model="$3"
threads="$4"

if [[ -z $input_video || -z $output_name_without_ext || -z $model ]]; then
    # The first argument is a video, not a WAV file.
    printf '%sUsage: %s <input video> <output name without extension> <model name> <optional: thread count>%s\n' \
        "${BOLD}${RED}" "$0" "$NORMAL"
    exit 1
fi

# Intermediate audio file. The name is generated here, so the ".wav" extension
# is appended directly. The previous extension check compared the literal
# string "input_extension" (missing "$") and was therefore always true.
wav_name="${output_name_without_ext}_audio_${RANDOM}.wav"

# Any non-zero status (not just 1) aborts the run.
extract-16bit-wav-from-video "$input_video" "$wav_name" || exit 1

# An empty thread count is passed through as an empty argument;
# transcribe-audio treats that as "use the default".
if ! transcribe-audio "$wav_name" "$output_name_without_ext" "$model" "$threads"; then
    printf '%sSaving the audio file "%s" in case you want to reuse it for debugging.\n%s' \
        "${RED}${BOLD}" "$wav_name" "$NORMAL"
    exit 1
fi

# Transcription succeeded, so the intermediate audio is no longer needed.
rm -- "$wav_name"
|
||||||
|
|
Loading…
Reference in New Issue
Block a user