From 173bda9eb0e1af2d16ff3ee79a94294d091745c7 Mon Sep 17 00:00:00 2001
From: Michael Campagnaro
Date: Sat, 3 Jun 2023 17:21:16 -0400
Subject: [PATCH] Add scripts for audio transcription using whisper.cpp

---
 bin/extract-16bit-wav-from-video | 60 +++++++++++++++++++++++++++++
 bin/extract-audio-from-video     |  4 +-
 bin/transcribe-audio             | 66 ++++++++++++++++++++++++++++++++
 bin/transcribe-video-base        |  6 +++
 bin/transcribe-video-batch       | 59 ++++++++++++++++++++++++++++
 bin/transcribe-video-medium      |  6 +++
 bin/transcribe-video-small       |  6 +++
 bin/transcribe-video-tiny        |  6 +++
 bin/transcribe-video-with-model  | 56 +++++++++++++++++++++++++++
 9 files changed, 267 insertions(+), 2 deletions(-)
 create mode 100644 bin/extract-16bit-wav-from-video
 create mode 100644 bin/transcribe-audio
 create mode 100644 bin/transcribe-video-base
 create mode 100644 bin/transcribe-video-batch
 create mode 100644 bin/transcribe-video-medium
 create mode 100644 bin/transcribe-video-small
 create mode 100644 bin/transcribe-video-tiny
 create mode 100644 bin/transcribe-video-with-model

diff --git a/bin/extract-16bit-wav-from-video b/bin/extract-16bit-wav-from-video
new file mode 100644
index 0000000..32802c9
--- /dev/null
+++ b/bin/extract-16bit-wav-from-video
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+
+# Extracts the audio track of a video into a 16 kHz, mono, 16-bit PCM WAV file.
+#
+# Usage: extract-16bit-wav-from-video INPUT_VIDEO OUTPUT_NAME
+#
+# ".wav" is appended to OUTPUT_NAME when it does not already end with it.
+# Exits with status 1 on bad arguments, a missing input file or an ffmpeg failure.
+
+if command -v tput >/dev/null 2>&1; then
+  ncolors=$(tput colors)
+fi
+if [ -t 1 ] && [ -n "$ncolors" ] && [ "$ncolors" -ge 8 ]; then
+  RED="$(tput setaf 1)"
+  GREEN="$(tput setaf 2)"
+  YELLOW="$(tput setaf 3)"
+  BLUE="$(tput setaf 4)"
+  MAGENTA="$(tput setaf 5)"
+  CYAN="$(tput setaf 6)"
+  BOLD="$(tput bold)"
+  NORMAL="$(tput sgr0)"
+else
+  RED=""
+  GREEN=""
+  YELLOW=""
+  BLUE=""
+  MAGENTA=""
+  CYAN=""
+  BOLD=""
+  NORMAL=""
+fi
+
+input="$1"
+output_name="$2"
+
+if [[ $input == "" || $output_name == "" ]]; then
+  printf "${BOLD}${RED}Usage: %s INPUT_VIDEO OUTPUT_NAME${NORMAL}\n" "$0"
+  exit 1
+fi
+
+if [[ ! -f "$input" ]]; then
+  printf "${RED}${BOLD}Error: failed to extract audio. Video file \"%s\" doesn't exist.\n${NORMAL}" "$input"
+  exit 1
+fi
+
+# Add extension if not provided.
+if [[ $output_name != *.wav ]]; then
+  output_name="${output_name}.wav"
+fi
+
+printf "\n${YELLOW}${BOLD}Extracting 16-bit WAV from %s | output: %s${NORMAL}\n" "$input" "$output_name"
+
+# Stop here when ffmpeg fails so callers see a non-zero status instead of a
+# misleading "Done" message.
+if ! ffmpeg -i "$input" -ar 16000 -ac 1 -c:a pcm_s16le "$output_name"; then
+  printf "${RED}${BOLD}Error: ffmpeg failed to extract audio from \"%s\".\n${NORMAL}" "$input"
+  exit 1
+fi
+
+printf "${GREEN}${BOLD}Done extracting 16-bit WAV from %s | output: %s${NORMAL}\n" "$input" "$output_name"
diff --git a/bin/extract-audio-from-video b/bin/extract-audio-from-video
index 30e5396..ae4b27f 100644
--- a/bin/extract-audio-from-video
+++ b/bin/extract-audio-from-video
@@ -41,7 +41,7 @@ if [[ $bitrate == "" ]]; then
   bitrate="64"
 fi
 
-printf "\n${YELLOW}${BOLD}Extracting audio from '$filename.$extension' | bitrate: ${bitrate}k | output: $output_name${NORMAL}\n"
+printf "\n${YELLOW}${BOLD}Extracting audio from $filename.$extension | bitrate: ${bitrate}k | output: $output_name${NORMAL}\n"
 
 if [[ $transcode == "1" ]]; then
   # Transcode audio
@@ -51,5 +51,5 @@ else
   ffmpeg -i "$filename.$extension" -vn -acodec copy "$output_name"
 fi
 
-printf "\n${GREEN}${BOLD}Done extracting audio from '$filename.$extension' | output name '$output_name'${NORMAL}\n\n"
+printf "\n${GREEN}${BOLD}Done extracting audio from $filename.$extension | output name '$output_name'${NORMAL}\n\n"
 
diff --git a/bin/transcribe-audio b/bin/transcribe-audio
new file mode 100644
index 0000000..ad9f92d
--- /dev/null
+++ b/bin/transcribe-audio
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+
+# Transcribes a WAV file with whisper.cpp, requesting text and SRT output.
+#
+# Usage: transcribe-audio INPUT_WAV OUTPUT_NAME_WITHOUT_EXT MODEL [THREADS]
+#
+# MODEL selects $JELLYPIXEL_OPENSOURCE_DEV/whisper.cpp/models/ggml-MODEL.en.bin
+# and is appended to the output name, i.e. "OUTPUT_NAME_WITHOUT_EXT.MODEL".
+# Exits with status 1 on bad arguments, a missing input file or a whisper failure.
+
+if command -v tput >/dev/null 2>&1; then
+  ncolors=$(tput colors)
+fi
+if [ -t 1 ] && [ -n "$ncolors" ] && [ "$ncolors" -ge 8 ]; then
+  RED="$(tput setaf 1)"
+  GREEN="$(tput setaf 2)"
+  YELLOW="$(tput setaf 3)"
+  BLUE="$(tput setaf 4)"
+  MAGENTA="$(tput setaf 5)"
+  CYAN="$(tput setaf 6)"
+  BOLD="$(tput bold)"
+  NORMAL="$(tput sgr0)"
+else
+  RED=""
+  GREEN=""
+  YELLOW=""
+  BLUE=""
+  MAGENTA=""
+  CYAN=""
+  BOLD=""
+  NORMAL=""
+fi
+
+input_wav="$1"
+output_name_without_ext="$2"
+model="$3"
+threads=$4
+
+# 4 seems to be the sweet spot.
+default_thread_count=4
+
+if [[ $input_wav == "" || $output_name_without_ext == "" || $model == "" ]]; then
+  printf "${BOLD}${RED}Usage: %s INPUT_WAV OUTPUT_NAME_WITHOUT_EXT MODEL [THREADS]${NORMAL}\n" "$0"
+  exit 1
+fi
+
+if [[ ! -f "$input_wav" ]]; then
+  printf "${RED}${BOLD}Input file \"%s\" doesn't exist!\n${NORMAL}" "$input_wav"
+  exit 1
+fi
+
+if [[ $threads == "" ]]; then
+  threads=$default_thread_count
+fi
+
+output_name="$output_name_without_ext.${model}"
+
+printf "\n${YELLOW}${BOLD}Transcribing %s | model: %s | threads: %s | output: %s ${NORMAL}\n" "$input_wav" "$model" "$threads" "$output_name"
+
+# Report a whisper failure with status 1 so callers can keep the WAV for debugging.
+if ! whisper.exe --threads "${threads}" -m "$JELLYPIXEL_OPENSOURCE_DEV/whisper.cpp/models/ggml-${model}.en.bin" -otxt -osrt -f "$input_wav" -of "$output_name" --print-colors; then
+  printf "${RED}${BOLD}Error: whisper failed to transcribe \"%s\".\n${NORMAL}" "$input_wav"
+  exit 1
+fi
+
+printf "${GREEN}${BOLD}Done transcribing %s | model: %s | threads: %s | output: %s${NORMAL}\n" "$input_wav" "$model" "$threads" "$output_name"
diff --git a/bin/transcribe-video-base b/bin/transcribe-video-base
new file mode 100644
index 0000000..3f225c3
--- /dev/null
+++ b/bin/transcribe-video-base
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+# Transcribes a video with the "base" model.
+# Usage: transcribe-video-base INPUT_VIDEO OUTPUT_NAME_WITHOUT_EXT [THREADS]
+transcribe-video-with-model "$1" "$2" "base" "$3"
+
diff --git a/bin/transcribe-video-batch b/bin/transcribe-video-batch
new file mode 100644
index 0000000..f2927a6
--- /dev/null
+++ b/bin/transcribe-video-batch
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+
+# Transcribes a video with the tiny, base and small models, extracting the audio only once.
+#
+# Usage: transcribe-video-batch INPUT_VIDEO OUTPUT_NAME_WITHOUT_EXT [THREADS]
+#
+# I was originally just using three calls to transcribe-video-with-model but I want to reuse the same audio input, so this
+# is mostly a copy pasta of that file.
+
+if command -v tput >/dev/null 2>&1; then
+  ncolors=$(tput colors)
+fi
+if [ -t 1 ] && [ -n "$ncolors" ] && [ "$ncolors" -ge 8 ]; then
+  RED="$(tput setaf 1)"
+  GREEN="$(tput setaf 2)"
+  YELLOW="$(tput setaf 3)"
+  BLUE="$(tput setaf 4)"
+  MAGENTA="$(tput setaf 5)"
+  CYAN="$(tput setaf 6)"
+  BOLD="$(tput bold)"
+  NORMAL="$(tput sgr0)"
+else
+  RED=""
+  GREEN=""
+  YELLOW=""
+  BLUE=""
+  MAGENTA=""
+  CYAN=""
+  BOLD=""
+  NORMAL=""
+fi
+
+input_video="$1"
+output_name_without_ext="$2"
+threads=$3
+
+if [[ $input_video == "" || $output_name_without_ext == "" ]]; then
+  printf "${BOLD}${RED}Usage: %s INPUT_VIDEO OUTPUT_NAME_WITHOUT_EXT [THREADS]${NORMAL}\n" "$0"
+  exit 1
+fi
+
+wav_name="${output_name_without_ext}_audio_${RANDOM}.wav"
+
+if ! extract-16bit-wav-from-video "$input_video" "$wav_name"; then
+  exit 1
+fi
+
+#
+# Tiny model first to have something quickly banged out. base and small have similar output quality. Neither are perfect.
+#
+
+for model in tiny base small; do
+  if ! transcribe-audio "$wav_name" "$output_name_without_ext" "$model" "$threads"; then
+    printf "${RED}${BOLD}Saving the audio file \"%s\" in case you want to reuse it for debugging.\n${NORMAL}" "$wav_name"
+    exit 1
+  fi
+done
+
+rm -- "$wav_name"
diff --git a/bin/transcribe-video-medium b/bin/transcribe-video-medium
new file mode 100644
index 0000000..7554053
--- /dev/null
+++ b/bin/transcribe-video-medium
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+# Transcribes a video with the "medium" model.
+# Usage: transcribe-video-medium INPUT_VIDEO OUTPUT_NAME_WITHOUT_EXT [THREADS]
+transcribe-video-with-model "$1" "$2" "medium" "$3"
+
diff --git a/bin/transcribe-video-small b/bin/transcribe-video-small
new file mode 100644
index 0000000..5d040da
--- /dev/null
+++ b/bin/transcribe-video-small
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+# Transcribes a video with the "small" model.
+# Usage: transcribe-video-small INPUT_VIDEO OUTPUT_NAME_WITHOUT_EXT [THREADS]
+transcribe-video-with-model "$1" "$2" "small" "$3"
+
diff --git a/bin/transcribe-video-tiny b/bin/transcribe-video-tiny
new file mode 100644
index 0000000..550ad72
--- /dev/null
+++ b/bin/transcribe-video-tiny
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+# Transcribes a video with the "tiny" model.
+# Usage: transcribe-video-tiny INPUT_VIDEO OUTPUT_NAME_WITHOUT_EXT [THREADS]
+transcribe-video-with-model "$1" "$2" "tiny" "$3"
+
diff --git a/bin/transcribe-video-with-model b/bin/transcribe-video-with-model
new file mode 100644
index 0000000..f61f583
--- /dev/null
+++ b/bin/transcribe-video-with-model
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+
+# Transcribes a video with a single whisper.cpp model.
+#
+# Usage: transcribe-video-with-model INPUT_VIDEO OUTPUT_NAME_WITHOUT_EXT MODEL [THREADS]
+#
+# The audio is first extracted to a WAV file named after the output; the WAV is
+# removed after a successful run and kept when transcription fails.
+
+if command -v tput >/dev/null 2>&1; then
+  ncolors=$(tput colors)
+fi
+if [ -t 1 ] && [ -n "$ncolors" ] && [ "$ncolors" -ge 8 ]; then
+  RED="$(tput setaf 1)"
+  GREEN="$(tput setaf 2)"
+  YELLOW="$(tput setaf 3)"
+  BLUE="$(tput setaf 4)"
+  MAGENTA="$(tput setaf 5)"
+  CYAN="$(tput setaf 6)"
+  BOLD="$(tput bold)"
+  NORMAL="$(tput sgr0)"
+else
+  RED=""
+  GREEN=""
+  YELLOW=""
+  BLUE=""
+  MAGENTA=""
+  CYAN=""
+  BOLD=""
+  NORMAL=""
+fi
+
+input_video="$1"
+output_name_without_ext="$2"
+model="$3"
+threads=$4
+
+if [[ $input_video == "" || $output_name_without_ext == "" || $model == "" ]]; then
+  printf "${BOLD}${RED}Usage: %s INPUT_VIDEO OUTPUT_NAME_WITHOUT_EXT MODEL [THREADS]${NORMAL}\n" "$0"
+  exit 1
+fi
+
+# The generated name never carries an extension of its own, so ".wav" is always appended.
+wav_name="${output_name_without_ext}_audio_${RANDOM}.wav"
+
+if ! extract-16bit-wav-from-video "$input_video" "$wav_name"; then
+  exit 1
+fi
+
+if ! transcribe-audio "$wav_name" "$output_name_without_ext" "$model" "$threads"; then
+  printf "${RED}${BOLD}Saving the audio file \"%s\" in case you want to reuse it for debugging.\n${NORMAL}" "$wav_name"
+  exit 1
+fi
+
+rm -- "$wav_name"
+