From 173bda9eb0e1af2d16ff3ee79a94294d091745c7 Mon Sep 17 00:00:00 2001
From: Michael Campagnaro <mikecampo@protonmail.com>
Date: Sat, 3 Jun 2023 17:21:16 -0400
Subject: [PATCH] Add scripts for audio transcription using whisper.cpp

---
 bin/extract-16bit-wav-from-video | 50 ++++++++++++++++++++++++
 bin/extract-audio-from-video     |  4 +-
 bin/transcribe-audio             | 54 ++++++++++++++++++++++++++
 bin/transcribe-video-base        |  4 ++
 bin/transcribe-video-batch       | 65 ++++++++++++++++++++++++++++++++
 bin/transcribe-video-medium      |  4 ++
 bin/transcribe-video-small       |  4 ++
 bin/transcribe-video-tiny        |  4 ++
 bin/transcribe-video-with-model  | 55 +++++++++++++++++++++++++++
 9 files changed, 242 insertions(+), 2 deletions(-)
 create mode 100644 bin/extract-16bit-wav-from-video
 create mode 100644 bin/transcribe-audio
 create mode 100644 bin/transcribe-video-base
 create mode 100644 bin/transcribe-video-batch
 create mode 100644 bin/transcribe-video-medium
 create mode 100644 bin/transcribe-video-small
 create mode 100644 bin/transcribe-video-tiny
 create mode 100644 bin/transcribe-video-with-model
diff --git a/bin/extract-16bit-wav-from-video b/bin/extract-16bit-wav-from-video
new file mode 100644
index 0000000..32802c9
--- /dev/null
+++ b/bin/extract-16bit-wav-from-video
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+#
+# Extract a mono, 16 kHz, 16-bit PCM WAV track from a video using ffmpeg.
+# Usage: extract-16bit-wav-from-video <input video> <wav output name>
+#   ".wav" is appended to the output name when it lacks that extension.
+# Exits 1 on bad arguments, a missing input file, or an ffmpeg failure.
+
+# Colors stay empty unless stdout is a terminal that reports >= 8 colors.
+RED="" GREEN="" YELLOW="" BLUE="" MAGENTA="" CYAN="" BOLD="" NORMAL=""
+if command -v tput >/dev/null 2>&1; then
+    ncolors=$(tput colors)
+fi
+if [ -t 1 ] && [ -n "$ncolors" ] && [ "$ncolors" -ge 8 ]; then
+    RED="$(tput setaf 1)"
+    GREEN="$(tput setaf 2)"
+    YELLOW="$(tput setaf 3)"
+    BLUE="$(tput setaf 4)"
+    MAGENTA="$(tput setaf 5)"
+    CYAN="$(tput setaf 6)"
+    BOLD="$(tput bold)"
+    NORMAL="$(tput sgr0)"
+fi
+
+input="$1"
+output_name="$2"
+
+if [[ $input == "" || $output_name == "" ]]; then
+    printf "${BOLD}${RED}%s${NORMAL}\n" "Usage: $0 <input video> <wav output name>"
+    exit 1
+fi
+
+if [[ ! -f "$input" ]]; then
+    printf "${RED}${BOLD}%s\n${NORMAL}" "Error: failed to extract audio. Video file \"$input\" doesn't exist."
+    exit 1
+fi
+
+# Add extension if not provided.
+output_basename=$(basename -- "$output_name")
+output_extension="${output_basename##*.}"
+if [[ $output_extension != "wav" ]]; then
+    output_name="${output_name}.wav"
+fi
+
+# File names go through %s so a "%" or "\" in them cannot corrupt the format.
+printf "\n${YELLOW}${BOLD}%s${NORMAL}\n" "Extracting 16-bit WAV from $input | output: $output_name"
+
+# Exit 1 when ffmpeg fails so callers do not go on to use a missing WAV.
+ffmpeg -i "$input" -ar 16000 -ac 1 -c:a pcm_s16le "$output_name" || exit 1
+
+printf "${GREEN}${BOLD}%s${NORMAL}\n" "Done extracting 16-bit WAV from $input | output: $output_name"
diff --git a/bin/extract-audio-from-video b/bin/extract-audio-from-video
index 30e5396..ae4b27f 100644
--- a/bin/extract-audio-from-video
+++ b/bin/extract-audio-from-video
@@ -41,7 +41,7 @@ if [[ $bitrate == "" ]]; then
     bitrate="64"
 fi
 
-printf "\n${YELLOW}${BOLD}Extracting audio from '$filename.$extension' | bitrate: ${bitrate}k | output: $output_name${NORMAL}\n"
+printf "\n${YELLOW}${BOLD}Extracting audio from $filename.$extension | bitrate: ${bitrate}k | output: $output_name${NORMAL}\n"
 
 if [[ $transcode == "1" ]]; then
     # Transcode audio
@@ -51,5 +51,5 @@ else
     ffmpeg -i "$filename.$extension" -vn -acodec copy "$output_name"
 fi
 
-printf "\n${GREEN}${BOLD}Done extracting audio from '$filename.$extension' | output name '$output_name'${NORMAL}\n\n"
+printf "\n${GREEN}${BOLD}Done extracting audio from $filename.$extension | output name '$output_name'${NORMAL}\n\n"
 
diff --git a/bin/transcribe-audio b/bin/transcribe-audio
new file mode 100644
index 0000000..ad9f92d
--- /dev/null
+++ b/bin/transcribe-audio
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+#
+# Transcribe a WAV file with whisper.cpp (-otxt and -osrt outputs requested).
+#
+# Usage: transcribe-audio <input.wav> <output name without extension> <model name> [thread count]
+#   The model file is $JELLYPIXEL_OPENSOURCE_DEV/whisper.cpp/models/ggml-<model name>.en.bin
+#   and whisper is given "<output name>.<model name>" as its output base name.
+# Exits 1 on bad arguments, a missing input file, or a whisper failure.
+
+# Colors stay empty unless stdout is a terminal that reports >= 8 colors.
+RED="" GREEN="" YELLOW="" BLUE="" MAGENTA="" CYAN="" BOLD="" NORMAL=""
+if command -v tput >/dev/null 2>&1; then
+    ncolors=$(tput colors)
+fi
+if [ -t 1 ] && [ -n "$ncolors" ] && [ "$ncolors" -ge 8 ]; then
+    RED="$(tput setaf 1)"
+    GREEN="$(tput setaf 2)"
+    YELLOW="$(tput setaf 3)"
+    BLUE="$(tput setaf 4)"
+    MAGENTA="$(tput setaf 5)"
+    CYAN="$(tput setaf 6)"
+    BOLD="$(tput bold)"
+    NORMAL="$(tput sgr0)"
+fi
+
+input_wav="$1"
+output_name_without_ext="$2"
+model="$3"
+
+# 4 seems to be the sweet spot.
+default_thread_count=4
+threads="${4:-$default_thread_count}"
+
+if [[ $input_wav == "" || $output_name_without_ext == "" || $model == "" ]]; then
+    printf "${BOLD}${RED}%s${NORMAL}\n" "Usage: $0 <input.wav> <output name without extension> <model name> <optional: thread count>"
+    exit 1
+fi
+
+if [[ ! -f "$input_wav" ]]; then
+    printf "${RED}${BOLD}%s\n${NORMAL}" "Input file \"$input_wav\" doesn't exist!"
+    exit 1
+fi
+
+output_name="$output_name_without_ext.${model}"
+model_path="$JELLYPIXEL_OPENSOURCE_DEV/whisper.cpp/models/ggml-${model}.en.bin"
+
+# File names go through %s so a "%" or "\" in them cannot corrupt the format.
+printf "\n${YELLOW}${BOLD}%s${NORMAL}\n" "Transcribing $input_wav | model: $model | threads: $threads | output: $output_name "
+
+# Arguments are quoted so paths containing spaces stay intact. Exit 1 when
+# whisper fails so callers can detect it and keep the WAV for debugging.
+whisper.exe --threads "$threads" -m "$model_path" -otxt -osrt -f "$input_wav" -of "$output_name" --print-colors || exit 1
+
+printf "${GREEN}${BOLD}%s${NORMAL}\n" "Done transcribing $input_wav | model: $model | threads: $threads | output: $output_name"
diff --git a/bin/transcribe-video-base b/bin/transcribe-video-base
new file mode 100644
index 0000000..3f225c3
--- /dev/null
+++ b/bin/transcribe-video-base
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+# Usage: transcribe-video-base <input video> <output name without extension> [thread count] -- delegates to transcribe-video-with-model with the "base" model.
+transcribe-video-with-model "$1" "$2" "base" "$3"
+
diff --git a/bin/transcribe-video-batch b/bin/transcribe-video-batch
new file mode 100644
index 0000000..f2927a6
--- /dev/null
+++ b/bin/transcribe-video-batch
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+#
+# Transcribe a video with the tiny, base and small models from one shared WAV.
+#
+# I was originally just using three calls to transcribe-video-with-model but I want to reuse the same audio input, so this
+# is mostly a copy pasta of that file.
+#
+# Usage: transcribe-video-batch <input video> <output name without extension> [thread count]
+#   <input video>   handed to extract-16bit-wav-from-video
+#   <output name>   passed to transcribe-audio and used as the scratch WAV prefix
+#   [thread count]  forwarded to transcribe-audio; empty selects its default
+#   Transcripts are written by transcribe-audio, which runs once per model.
+#
+# Exits 1 on bad arguments or when extraction or any transcription fails. The
+# scratch WAV is deleted only after all three models succeed.
+
+# Colors stay empty unless stdout is a terminal that reports >= 8 colors.
+RED="" GREEN="" YELLOW="" BLUE=""
+MAGENTA="" CYAN="" BOLD="" NORMAL=""
+if command -v tput >/dev/null 2>&1; then
+    ncolors=$(tput colors)
+fi
+if [ -t 1 ] && [ -n "$ncolors" ] && [ "$ncolors" -ge 8 ]; then
+    RED="$(tput setaf 1)"
+    GREEN="$(tput setaf 2)"
+    YELLOW="$(tput setaf 3)"
+    BLUE="$(tput setaf 4)"
+    MAGENTA="$(tput setaf 5)"
+    CYAN="$(tput setaf 6)"
+    BOLD="$(tput bold)"
+    NORMAL="$(tput sgr0)"
+fi
+
+# Positional arguments; the thread count is optional.
+input_video="$1"
+output_name_without_ext="$2"
+threads="$3"
+
+if [[ $input_video == "" || $output_name_without_ext == "" ]]; then
+    printf "${BOLD}${RED}%s${NORMAL}\n" "Usage: $0 <input video> <output name without extension> <optional: thread count>"
+    exit 1
+fi
+
+# Scratch WAV shared by all models.
+wav_name="${output_name_without_ext}_audio_${RANDOM}.wav"
+
+# Any non-zero status counts as failure, not just an exit code of exactly 1.
+extract-16bit-wav-from-video "$input_video" "$wav_name" || exit 1
+
+#
+# Tiny model first to have something quickly banged out. base and small have similar output quality. Neither are perfect.
+#
+models=(tiny base small)
+
+for model in "${models[@]}"; do
+    # "$threads" is quoted; an empty value makes transcribe-audio use its default.
+    if ! transcribe-audio "$wav_name" "$output_name_without_ext" "$model" "$threads"; then
+        # The file name goes through %s so a "%" or "\" in it cannot corrupt the format.
+        printf "${RED}${BOLD}%s\n${NORMAL}" "Saving the audio file \"$wav_name\" in case you want to reuse it for debugging."
+        exit 1
+    fi
+done
+
+# Every model succeeded, so the scratch WAV is no longer needed.
+rm -- "$wav_name"
diff --git a/bin/transcribe-video-medium b/bin/transcribe-video-medium
new file mode 100644
index 0000000..7554053
--- /dev/null
+++ b/bin/transcribe-video-medium
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+# Usage: transcribe-video-medium <input video> <output name without extension> [thread count] -- delegates to transcribe-video-with-model with the "medium" model.
+transcribe-video-with-model "$1" "$2" "medium" "$3"
+
diff --git a/bin/transcribe-video-small b/bin/transcribe-video-small
new file mode 100644
index 0000000..5d040da
--- /dev/null
+++ b/bin/transcribe-video-small
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+# Usage: transcribe-video-small <input video> <output name without extension> [thread count] -- delegates to transcribe-video-with-model with the "small" model.
+transcribe-video-with-model "$1" "$2" "small" "$3"
+
diff --git a/bin/transcribe-video-tiny b/bin/transcribe-video-tiny
new file mode 100644
index 0000000..550ad72
--- /dev/null
+++ b/bin/transcribe-video-tiny
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+# Usage: transcribe-video-tiny <input video> <output name without extension> [thread count] -- delegates to transcribe-video-with-model with the "tiny" model.
+transcribe-video-with-model "$1" "$2" "tiny" "$3"
+
diff --git a/bin/transcribe-video-with-model b/bin/transcribe-video-with-model
new file mode 100644
index 0000000..f61f583
--- /dev/null
+++ b/bin/transcribe-video-with-model
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+#
+# Transcribe a video with a single whisper model: extract a 16-bit WAV with
+# extract-16bit-wav-from-video, run transcribe-audio on it, then delete the WAV.
+#
+# Usage: transcribe-video-with-model <input video> <output name without extension> <model name> [thread count]
+#   <input video>   handed to extract-16bit-wav-from-video
+#   <output name>   passed to transcribe-audio and used as the scratch WAV prefix
+#   <model name>    forwarded to transcribe-audio
+#   [thread count]  forwarded to transcribe-audio; empty selects its default
+# Exits 1 on bad arguments or when extraction or transcription fails; after a
+# failed transcription the WAV is kept for debugging.
+
+# Colors stay empty unless stdout is a terminal that reports >= 8 colors.
+RED="" GREEN="" YELLOW="" BLUE="" MAGENTA="" CYAN="" BOLD="" NORMAL=""
+if command -v tput >/dev/null 2>&1; then
+    ncolors=$(tput colors)
+fi
+if [ -t 1 ] && [ -n "$ncolors" ] && [ "$ncolors" -ge 8 ]; then
+    RED="$(tput setaf 1)"
+    GREEN="$(tput setaf 2)"
+    YELLOW="$(tput setaf 3)"
+    BLUE="$(tput setaf 4)"
+    MAGENTA="$(tput setaf 5)"
+    CYAN="$(tput setaf 6)"
+    BOLD="$(tput bold)"
+    NORMAL="$(tput sgr0)"
+fi
+
+input_video="$1"
+output_name_without_ext="$2"
+model="$3"
+threads="$4"
+
+if [[ $input_video == "" || $output_name_without_ext == "" || $model == "" ]]; then
+    printf "${BOLD}${RED}%s${NORMAL}\n" "Usage: $0 <input video> <output name without extension> <model name> <optional: thread count>"
+    exit 1
+fi
+
+# Scratch WAV name. The "_audio_<number>" suffix never carries an extension,
+# so ".wav" is always appended and no extension check is needed.
+wav_name="${output_name_without_ext}_audio_${RANDOM}.wav"
+
+# Any non-zero status counts as failure, not just an exit code of exactly 1.
+extract-16bit-wav-from-video "$input_video" "$wav_name" || exit 1
+
+# "$threads" is quoted; an empty value makes transcribe-audio use its default.
+if ! transcribe-audio "$wav_name" "$output_name_without_ext" "$model" "$threads"; then
+    printf "${RED}${BOLD}%s\n${NORMAL}" "Saving the audio file \"$wav_name\" in case you want to reuse it for debugging."
+    exit 1
+fi
+
+# Transcription succeeded, so the scratch WAV is no longer needed.
+rm -- "$wav_name"
+