From 1552868f0c99863f3afbd1bead0c9c4be41b2ec2 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Sun, 27 Oct 2024 09:21:11 -0400 Subject: [PATCH 01/11] Refactor version numbers in pyproject.toml and setup.py --- pyproject.toml | 2 +- setup.py | 2 +- simpler_whisper/whisper.py | 2 +- src/whisper_wrapper.cpp | 130 +++++++++++++++++++++++-------------- test_simpler_whisper.py | 5 +- 5 files changed, 88 insertions(+), 53 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3db127c..d0d78a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "simpler-whisper" -version = "0.2.1" +version = "0.2.2" authors = [ {name = "Roy Shilkrot", email = "roy.shil@gmail.com"}, ] diff --git a/setup.py b/setup.py index a7e86ef..a34fff7 100644 --- a/setup.py +++ b/setup.py @@ -99,7 +99,7 @@ def build_extension(self, ext): setup( name="simpler-whisper", - version="0.2.1", + version="0.2.2", author="Roy Shilkrot", author_email="roy.shil@gmail.com", description="A simple Python wrapper for whisper.cpp", diff --git a/simpler_whisper/whisper.py b/simpler_whisper/whisper.py index b0ced56..62319a8 100644 --- a/simpler_whisper/whisper.py +++ b/simpler_whisper/whisper.py @@ -48,7 +48,7 @@ class ThreadedWhisperModel: def __init__( self, model_path: str, - callback: Callable[[int, str, bool], None], + callback: Callable[[int, List[WhisperSegment], bool], None], use_gpu=False, max_duration_sec=10.0, sample_rate=16000, diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index 68afd6e..11f5f7f 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -119,6 +119,8 @@ class WhisperModel { const char *text = whisper_full_get_segment_text(ctx, i); WhisperSegment segment; + segment.start = whisper_full_get_segment_t0(ctx, i); + segment.end = whisper_full_get_segment_t1(ctx, i); segment.text = std::string(text); const int n_tokens = whisper_full_n_tokens(ctx, i); for (int j = 0; j < n_tokens; ++j) @@ -146,6 +148,7 @@ class WhisperModel whisper_full_params params; }; + struct AudioChunk { std::vector data; @@ -155,25 +158,22 @@ struct AudioChunk struct TranscriptionResult { size_t chunk_id; - std::vector segments; bool is_partial; + std::vector segments; }; -class ThreadedWhisperModel + +class AsyncWhisperModel : public WhisperModel { public: - ThreadedWhisperModel(const std::string &model_path, bool use_gpu = false, - float max_duration_sec = 10.0f, int sample_rate = 16000) - : running(false), next_chunk_id(0), - max_samples(static_cast(max_duration_sec * sample_rate)), - current_chunk_id(0), model_path(model_path), - use_gpu(use_gpu) + AsyncWhisperModel(const std::string &model_path, bool use_gpu = false) : + WhisperModel(model_path, use_gpu), running(false), next_chunk_id(0), current_chunk_id(0) { + } - ~ThreadedWhisperModel() + ~AsyncWhisperModel() { - stop(); } void start(py::function callback, int result_check_interval_ms = 100) @@ -184,12 +184,17 @@ class ThreadedWhisperModel running = true; result_callback = callback; - process_thread = std::thread(&ThreadedWhisperModel::processThread, this); - result_thread = std::thread(&ThreadedWhisperModel::resultThread, this, + process_thread = std::thread(&AsyncWhisperModel::processThread, this); + result_thread = std::thread(&AsyncWhisperModel::resultThread, this, result_check_interval_ms); } + + void transcribe(py::array_t audio) + { + this->queueAudio(audio); + } - void stop() + virtual void stop() { if (!running) return; @@ -209,12 +214,6 @@ class ThreadedWhisperModel 
process_thread.join(); if (result_thread.joinable()) result_thread.join(); - - // Clear accumulated buffer - { - std::lock_guard lock(buffer_mutex); - accumulated_buffer.clear(); - } } size_t queueAudio(py::array_t audio) @@ -236,13 +235,69 @@ class ThreadedWhisperModel return chunk.id; } +protected: + + virtual void processThread() = 0; + virtual void resultThread(int check_interval_ms) = 0; + + std::atomic running; + std::atomic next_chunk_id; + size_t current_chunk_id; + + std::thread process_thread; + std::thread result_thread; + + std::queue input_queue; + std::mutex input_mutex; + std::condition_variable input_cv; + + std::queue result_queue; + std::mutex result_mutex; + std::condition_variable result_cv; + + py::function result_callback; + +}; + + +class ThreadedWhisperModel : public AsyncWhisperModel +{ +public: + ThreadedWhisperModel(const std::string &model_path, bool use_gpu = false, + float max_duration_sec = 10.0f, int sample_rate = 16000) + : AsyncWhisperModel(model_path, use_gpu), + max_samples(static_cast(max_duration_sec * sample_rate)) + { + } + + ~ThreadedWhisperModel() + { + stop(); + } + void setMaxDuration(float max_duration_sec, int sample_rate = 16000) { max_samples = static_cast(max_duration_sec * sample_rate); } + void start(py::function callback, int result_check_interval_ms = 100) + { + AsyncWhisperModel::start(callback, result_check_interval_ms); + } + + void stop() + { + AsyncWhisperModel::stop(); + + // Clear accumulated buffer + { + std::lock_guard lock(buffer_mutex); + accumulated_buffer.clear(); + } + } + private: - void processAccumulatedAudio(WhisperModel &model, bool force_final = false) + void processAccumulatedAudio(bool force_final = false) { std::vector process_buffer; size_t current_id; @@ -266,7 +321,7 @@ class ThreadedWhisperModel std::vector segments; try { - segments = model.transcribe_raw_audio(process_buffer.data(), process_buffer.size()); + segments = this->transcribe_raw_audio(process_buffer.data(), process_buffer.size()); } catch (const std::exception &e) { @@ -286,7 +341,7 @@ class ThreadedWhisperModel result.chunk_id = current_id; for (const auto &segment : segments) { - result.segments.push_back(segment.text); + result.segments.push_back(segment); } // Set partial flag based on whether this is a final result result.is_partial = !(force_final || process_buffer.size() >= max_samples); @@ -301,8 +356,6 @@ class ThreadedWhisperModel void processThread() { - WhisperModel model(this->model_path, this->use_gpu); - while (running) { AudioChunk all_chunks; @@ -317,7 +370,7 @@ class ThreadedWhisperModel if (!running) { // Process any remaining audio as final before shutting down - processAccumulatedAudio(model, true); + processAccumulatedAudio(true); break; } @@ -346,7 +399,7 @@ class ThreadedWhisperModel } // Process the accumulated audio - processAccumulatedAudio(model, false); + processAccumulatedAudio(false); } } } @@ -386,7 +439,7 @@ class ThreadedWhisperModel std::string full_text; for (const auto &segment : result.segments) { - full_text += segment; + full_text += segment.text; } full_text = trim(full_text); if (full_text.empty()) @@ -396,7 +449,7 @@ class ThreadedWhisperModel { try { - result_callback((int)result.chunk_id, py::str(full_text), result.is_partial); + result_callback((int)result.chunk_id, result.segments, result.is_partial); } catch (const std::exception &e) { @@ -412,31 +465,10 @@ class ThreadedWhisperModel } } - whisper_context *ctx; - std::atomic running; - std::atomic next_chunk_id; - size_t current_chunk_id; - // Audio 
accumulation std::vector accumulated_buffer; size_t max_samples; std::mutex buffer_mutex; - - std::thread process_thread; - std::thread result_thread; - - std::queue input_queue; - std::mutex input_mutex; - std::condition_variable input_cv; - - std::queue result_queue; - std::mutex result_mutex; - std::condition_variable result_cv; - - py::function result_callback; - - std::string model_path; - bool use_gpu; }; PYBIND11_MODULE(_whisper_cpp, m) diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py index 745aef2..e44444e 100644 --- a/test_simpler_whisper.py +++ b/test_simpler_whisper.py @@ -1,3 +1,4 @@ +from typing import List import av import argparse import sys @@ -10,6 +11,7 @@ import resampy from simpler_whisper.whisper import ( + WhisperSegment, load_model, set_log_callback, LogLevel, @@ -125,7 +127,8 @@ def test_simpler_whisper(): def test_threaded_whisper(): set_log_callback(my_log_callback) - def handle_result(chunk_id: int, text: str, is_partial: bool): + def handle_result(chunk_id: int, segments: List[WhisperSegment], is_partial: bool): + text = " ".join([seg.text for seg in segments]) print( f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}): {text}" ) From 88d38992ff19aa797cce3ffb9a65dd454147b20d Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Sun, 27 Oct 2024 23:26:46 -0400 Subject: [PATCH 02/11] Refactor test_simpler_whisper.py and whisper.py: Add async transcription support --- simpler_whisper/whisper.py | 99 ++++++++++++++----- src/whisper_wrapper.cpp | 197 +++++++++++++++++++++++-------------- test_simpler_whisper.py | 66 +++++++++++-- 3 files changed, 258 insertions(+), 104 deletions(-) diff --git a/simpler_whisper/whisper.py b/simpler_whisper/whisper.py index 62319a8..596f971 100644 --- a/simpler_whisper/whisper.py +++ b/simpler_whisper/whisper.py @@ -44,6 +44,77 @@ def __del__(self): del self.model +class AsyncWhisperModel: + """ + AsyncWhisperModel is a class that provides asynchronous transcription of audio data using a Whisper model. + """ + + def __init__( + self, + model_path: str, + callback: Callable[[int, List[WhisperSegment], bool], None], + use_gpu=False, + ): + self.model = _whisper_cpp.AsyncWhisperModel(model_path, use_gpu) + self._is_running = False + self.callback = callback + + def transcribe(self, audio: Union[np.ndarray, List[float]]) -> int: + """ + Transcribes the given audio input using the model. + Args: + audio (Union[np.ndarray, List[float]]): The audio data to be transcribed. + It can be either a numpy array or a list of floats. + Returns: + int: The queued chunk ID. + """ + # Ensure audio is a numpy array of float32 + audio = np.array(audio, dtype=np.float32) + + # Run async inference (no return value) + return self.model.transcribe(audio) + + def handle_result(self, chunk_id: int, text: str, is_partial: bool): + if self.callback is not None: + self.callback(chunk_id, text, is_partial) + + def start(self, result_check_interval_ms=100): + """ + Start the processing threads with a callback for results. + + Args: + callback: Function that takes three arguments: + - chunk_id (int): Unique identifier for the audio chunk + - segments (WhisperSegment): Transcribed text for the audio chunk + - is_partial (bool): Whether this is a partial result + result_check_interval_ms (int): How often to check for results + """ + if self._is_running: + return + + self.model.start(self.handle_result, result_check_interval_ms) + self._is_running = True + + def stop(self): + """ + Stop processing and clean up resources. 
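+        This joins the processing and result threads before returning.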
+ Any remaining audio will be processed as a final segment. + """ + if not self._is_running: + return + + self.model.stop() + self._is_running = False + + def __del__(self): + # Explicitly delete the C++ object + if hasattr(self, "model"): + if self._is_running: + self.stop() + self._is_running = False + del self.model + + class ThreadedWhisperModel: def __init__( self, @@ -61,6 +132,10 @@ def __init__( use_gpu (bool): Whether to use GPU acceleration max_duration_sec (float): Maximum duration in seconds before finalizing a segment sample_rate (int): Audio sample rate (default: 16000) + callback: Function that takes three arguments: + - chunk_id (int): Unique identifier for the audio chunk + - segments (List[WhisperSegment]): Transcribed text for the audio chunk + - is_partial (bool): Whether this is a partial result """ self.model = _whisper_cpp.ThreadedWhisperModel( model_path, use_gpu, max_duration_sec, sample_rate @@ -79,7 +154,7 @@ def start(self, result_check_interval_ms=100): Args: callback: Function that takes three arguments: - chunk_id (int): Unique identifier for the audio chunk - - segments (str): Transcribed text for the audio chunk + - segments (WhisperSegment): Transcribed text for the audio chunk - is_partial (bool): Whether this is a partial result result_check_interval_ms (int): How often to check for results """ @@ -133,28 +208,6 @@ def __del__(self): del self.model -def load_model(model_path: str, use_gpu=False) -> WhisperModel: - return WhisperModel(model_path, use_gpu) - - -def load_threaded_model( - model_path: str, use_gpu=False, max_duration_sec=10.0, sample_rate=16000 -) -> ThreadedWhisperModel: - """ - Load a threaded Whisper model for continuous audio processing. - - Args: - model_path (str): Path to the Whisper model file - use_gpu (bool): Whether to use GPU acceleration - max_duration_sec (float): Maximum duration in seconds before finalizing a segment - sample_rate (int): Audio sample rate (default: 16000) - - Returns: - ThreadedWhisperModel: A model instance ready for processing - """ - return ThreadedWhisperModel(model_path, use_gpu, max_duration_sec, sample_rate) - - def set_log_callback(callback): """ Set a custom logging callback function. diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index 11f5f7f..381ad36 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -148,7 +148,6 @@ class WhisperModel whisper_full_params params; }; - struct AudioChunk { std::vector data; @@ -162,14 +161,11 @@ struct TranscriptionResult std::vector segments; }; - class AsyncWhisperModel : public WhisperModel { public: - AsyncWhisperModel(const std::string &model_path, bool use_gpu = false) : - WhisperModel(model_path, use_gpu), running(false), next_chunk_id(0), current_chunk_id(0) + AsyncWhisperModel(const std::string &model_path, bool use_gpu = false) : WhisperModel(model_path, use_gpu), running(false), next_chunk_id(0), current_chunk_id(0) { - } ~AsyncWhisperModel() @@ -188,10 +184,19 @@ class AsyncWhisperModel : public WhisperModel result_thread = std::thread(&AsyncWhisperModel::resultThread, this, result_check_interval_ms); } - - void transcribe(py::array_t audio) + + /** + * @brief Transcribes the given audio data. + * + * This function takes an audio input in the form of a py::array_t and + * processes it by queuing the audio for transcription. + * + * @param audio A py::array_t containing the audio data to be transcribed. + * @return size_t The queued chunk ID. 
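+	 *
+	 * A minimal usage sketch from Python via the pybind11 binding below (the
+	 * model path and the `audio_f32` array are placeholders, not part of this
+	 * API):
+	 * @code{.py}
+	 * model = _whisper_cpp.AsyncWhisperModel("path/to/ggml-model.bin", False)
+	 * model.start(lambda chunk_id, segments, is_partial: print(chunk_id))
+	 * chunk_id = model.transcribe(audio_f32)  # queues and returns immediately;
+	 *                                         # results arrive via the callback
+	 * model.stop()
+	 * @endcode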
+ */ + size_t transcribe(py::array_t audio) { - this->queueAudio(audio); + return this->queueAudio(audio); } virtual void stop() @@ -236,9 +241,106 @@ class AsyncWhisperModel : public WhisperModel } protected: + virtual void processThread() + { + while (running) + { + // Get next chunk from input queue + { + std::unique_lock lock(input_mutex); + input_cv.wait(lock, [this] + { return !input_queue.empty() || !running; }); + + AudioChunk chunk = std::move(input_queue.front()); + input_queue.pop(); + + // Process audio + TranscriptionResult result; + result.chunk_id = chunk.id; + result.is_partial = false; + try + { + result.segments = this->transcribe_raw_audio(chunk.data.data(), chunk.data.size()); + } + catch (const std::exception &e) + { + std::cerr << "Exception during transcription: " << e.what() << std::endl; + } + catch (...) + { + std::cerr << "Unknown exception during transcription" << std::endl; + } + + // Add result to output queue + { + std::lock_guard lock(result_mutex); + result_queue.push(result); + result_cv.notify_one(); + } + } + } + } + + void resultThread(int check_interval_ms) + { + while (running) + { + std::vector results; + + { + std::unique_lock lock(result_mutex); + result_cv.wait_for(lock, + std::chrono::milliseconds(check_interval_ms), + [this] + { return !result_queue.empty() || !running; }); - virtual void processThread() = 0; - virtual void resultThread(int check_interval_ms) = 0; + if (!running && result_queue.empty()) + break; + + while (!result_queue.empty()) + { + results.push_back(std::move(result_queue.front())); + result_queue.pop(); + } + } + + if (!results.empty()) + { + py::gil_scoped_acquire gil; + for (const auto &result : results) + { + if (result.segments.empty()) + continue; + + // concatenate segments into a single string + std::string full_text; + for (const auto &segment : result.segments) + { + full_text += segment.text; + } + full_text = trim(full_text); + if (full_text.empty()) + continue; + + if (result_callback) + { + try + { + result_callback((int)result.chunk_id, result.segments, result.is_partial); + } + catch (const std::exception &e) + { + std::cerr << "Exception in result callback: " << e.what() << std::endl; + } + catch (...) 
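+			// catch-all so a single bad chunk cannot kill the worker thread;
+			// the error is only logged and the (empty) result is still queued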
+ { + std::cerr << "Unknown exception in result callback" << std::endl; + } + } + } + } + } + } std::atomic running; std::atomic next_chunk_id; @@ -256,10 +358,8 @@ class AsyncWhisperModel : public WhisperModel std::condition_variable result_cv; py::function result_callback; - }; - class ThreadedWhisperModel : public AsyncWhisperModel { public: @@ -354,7 +454,7 @@ class ThreadedWhisperModel : public AsyncWhisperModel } } - void processThread() + void processThread() override { while (running) { @@ -404,67 +504,6 @@ class ThreadedWhisperModel : public AsyncWhisperModel } } - void resultThread(int check_interval_ms) - { - while (running) - { - std::vector results; - - { - std::unique_lock lock(result_mutex); - result_cv.wait_for(lock, - std::chrono::milliseconds(check_interval_ms), - [this] - { return !result_queue.empty() || !running; }); - - if (!running && result_queue.empty()) - break; - - while (!result_queue.empty()) - { - results.push_back(std::move(result_queue.front())); - result_queue.pop(); - } - } - - if (!results.empty()) - { - py::gil_scoped_acquire gil; - for (const auto &result : results) - { - if (result.segments.empty()) - continue; - - // concatenate segments into a single string - std::string full_text; - for (const auto &segment : result.segments) - { - full_text += segment.text; - } - full_text = trim(full_text); - if (full_text.empty()) - continue; - - if (result_callback) - { - try - { - result_callback((int)result.chunk_id, result.segments, result.is_partial); - } - catch (const std::exception &e) - { - std::cerr << "Exception in result callback: " << e.what() << std::endl; - } - catch (...) - { - std::cerr << "Unknown exception in result callback" << std::endl; - } - } - } - } - } - } - // Audio accumulation std::vector accumulated_buffer; size_t max_samples; @@ -507,6 +546,16 @@ PYBIND11_MODULE(_whisper_cpp, m) .def(py::init()) .def("transcribe", &WhisperModel::transcribe); + // Expose asynchronous model + py::class_(m, "AsyncWhisperModel") + .def(py::init()) + .def("start", &AsyncWhisperModel::start, + py::arg("callback"), + py::arg("result_check_interval_ms") = 100) + .def("stop", &AsyncWhisperModel::stop) + .def("transcribe", &AsyncWhisperModel::transcribe) + .def("queue_audio", &AsyncWhisperModel::queueAudio); + py::class_(m, "ThreadedWhisperModel") .def(py::init(), py::arg("model_path"), diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py index e44444e..bf2288b 100644 --- a/test_simpler_whisper.py +++ b/test_simpler_whisper.py @@ -2,20 +2,21 @@ import av import argparse import sys - -# Remove the current directory from sys.path to avoid conflicts with the installed package -sys.path.pop(0) - import numpy as np import time import resampy +import librosa + +# Remove the current directory from sys.path to avoid conflicts with the installed package +sys.path.pop(0) from simpler_whisper.whisper import ( WhisperSegment, - load_model, set_log_callback, LogLevel, + WhisperModel, ThreadedWhisperModel, + AsyncWhisperModel, ) @@ -36,7 +37,7 @@ def my_log_callback(level, message): parser.add_argument( "method", type=str, - choices=["regular", "threaded"], + choices=["regular", "threaded", "async"], help="The method to use for testing the model", ) args = parser.parse_args() @@ -77,7 +78,7 @@ def test_simpler_whisper(): # Load the model print("Loading the Whisper model...") - model = load_model(model_path, use_gpu=True) + model = WhisperModel(model_path, use_gpu=True) print("Model loaded successfully!") # Load audio from file with av @@ -124,6 +125,55 @@ def 
test_simpler_whisper(): print("Transcription completed.") +def test_async_whisper(): + set_log_callback(my_log_callback) + chunk_ids = [] + + def handle_result(chunk_id: int, segments: List[WhisperSegment], is_partial: bool): + text = " ".join([seg.text for seg in segments]) + print( + f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}): {text}" + ) + # remove the chunk_id from the list of chunk_ids + chunk_ids.remove(chunk_id) + + # Create model + model = AsyncWhisperModel( + model_path=model_path, callback=handle_result, use_gpu=True + ) + + print("Loading audio from file...") + # Load audio from file with librosa + audio_data, sample_rate = librosa.load(audio_file, sr=16000) + + # Start processing with callback + print("Starting Whisper model") + model.start() + + # create 30-seconds chunks of audio_data + for i in range(0, len(audio_data), 16000 * 30): + try: + samples_for_transcription = audio_data[i : i + 16000 * 30] + + # Queue the chunk for processing + chunk_id = model.transcribe(samples_for_transcription) + chunk_ids.append(chunk_id) + print(f"Queued chunk {chunk_id}") + + # reset + samples_for_transcription = np.array([]) + except: + break + + # wait for all chunks to finish processing + while len(chunk_ids) > 0: + time.sleep(0.1) + + # When done + print("Stopping Whisper model") + model.stop() + + def test_threaded_whisper(): set_log_callback(my_log_callback) @@ -173,5 +223,7 @@ def handle_result(chunk_id: int, segments: List[WhisperSegment], is_partial: boo if __name__ == "__main__": if args.method == "regular": test_simpler_whisper() + elif args.method == "async": + test_async_whisper() else: test_threaded_whisper() From 22d53f1b09bcfa320ac5a0e759ec181319f42ad4 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Mon, 28 Oct 2024 09:08:45 -0400 Subject: [PATCH 03/11] Refactor simpler_whisper/__init__.py: Import _whisper_cpp module --- pyproject.toml | 6 ++ simpler_whisper/__init__.py | 1 + simpler_whisper/whisper.py | 8 +- src/whisper_wrapper.cpp | 79 +++++++++------ tests/__init__.py | 0 tests/test_wrapper.py | 197 ++++++++++++++++++++++++++++++++++++ 6 files changed, 256 insertions(+), 35 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/test_wrapper.py diff --git a/pyproject.toml b/pyproject.toml index d0d78a8..ad115e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,3 +34,9 @@ packages = ["simpler_whisper"] [tool.setuptools.package-data] simpler_whisper = ["*.dll", "*.pyd", "*.so", "*.metal"] + +[tool.pytest] +testpaths = ["tests"] +python_files = "test_*.py" +python_classes = "Test*" +python_functions = "test_*" \ No newline at end of file diff --git a/simpler_whisper/__init__.py b/simpler_whisper/__init__.py index e69de29..86d4364 100644 --- a/simpler_whisper/__init__.py +++ b/simpler_whisper/__init__.py @@ -0,0 +1 @@ +from . 
import _whisper_cpp diff --git a/simpler_whisper/whisper.py b/simpler_whisper/whisper.py index 596f971..834cbbe 100644 --- a/simpler_whisper/whisper.py +++ b/simpler_whisper/whisper.py @@ -74,9 +74,9 @@ def transcribe(self, audio: Union[np.ndarray, List[float]]) -> int: # Run async inference (no return value) return self.model.transcribe(audio) - def handle_result(self, chunk_id: int, text: str, is_partial: bool): + def handle_result(self, chunk_id: int, segments: List[WhisperSegment], is_partial: bool): if self.callback is not None: - self.callback(chunk_id, text, is_partial) + self.callback(chunk_id, segments, is_partial) def start(self, result_check_interval_ms=100): """ @@ -143,9 +143,9 @@ def __init__( self._is_running = False self.callback = callback - def handle_result(self, chunk_id: int, text: str, is_partial: bool): + def handle_result(self, chunk_id: int, segments: List[WhisperSegment], is_partial: bool): if self.callback is not None: - self.callback(chunk_id, text, is_partial) + self.callback(chunk_id, segments, is_partial) def start(self, result_check_interval_ms=100): """ diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index 381ad36..de9daea 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -161,10 +161,11 @@ struct TranscriptionResult std::vector segments; }; -class AsyncWhisperModel : public WhisperModel +class AsyncWhisperModel { public: - AsyncWhisperModel(const std::string &model_path, bool use_gpu = false) : WhisperModel(model_path, use_gpu), running(false), next_chunk_id(0), current_chunk_id(0) + AsyncWhisperModel(const std::string &model_path, bool use_gpu = false) : model_path(model_path), use_gpu(use_gpu), + running(false), next_chunk_id(0), current_chunk_id(0) { } @@ -243,40 +244,51 @@ class AsyncWhisperModel : public WhisperModel protected: virtual void processThread() { + WhisperModel model(model_path, use_gpu); + while (running) { + AudioChunk chunk; // Get next chunk from input queue { std::unique_lock lock(input_mutex); - input_cv.wait(lock, [this] - { return !input_queue.empty() || !running; }); + input_cv.wait_for(lock, + std::chrono::milliseconds(100), + [this] + { return !input_queue.empty() || !running; }); + + if (!running) + break; + + if (input_queue.empty()) + continue; - AudioChunk chunk = std::move(input_queue.front()); + chunk = std::move(input_queue.front()); input_queue.pop(); + } - // Process audio - TranscriptionResult result; - result.chunk_id = chunk.id; - result.is_partial = false; - try - { - result.segments = this->transcribe_raw_audio(chunk.data.data(), chunk.data.size()); - } - catch (const std::exception &e) - { - std::cerr << "Exception during transcription: " << e.what() << std::endl; - } - catch (...) - { - std::cerr << "Unknown exception during transcription" << std::endl; - } + // Process audio + TranscriptionResult result; + result.chunk_id = chunk.id; + result.is_partial = false; + try + { + result.segments = model.transcribe_raw_audio(chunk.data.data(), chunk.data.size()); + } + catch (const std::exception &e) + { + std::cerr << "Exception during transcription: " << e.what() << std::endl; + } + catch (...) 
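+      // swallow unknown errors so the thread keeps serving later chunks;
+      // a failed chunk yields an empty result, which resultThread discards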
+ { + std::cerr << "Unknown exception during transcription" << std::endl; + } - // Add result to output queue - { - std::lock_guard lock(result_mutex); - result_queue.push(result); - result_cv.notify_one(); - } + // Add result to output queue + { + std::lock_guard lock(result_mutex); + result_queue.push(result); + result_cv.notify_one(); } } } @@ -342,6 +354,9 @@ class AsyncWhisperModel : public WhisperModel } } + std::string model_path; + bool use_gpu; + std::atomic running; std::atomic next_chunk_id; size_t current_chunk_id; @@ -397,7 +412,7 @@ class ThreadedWhisperModel : public AsyncWhisperModel } private: - void processAccumulatedAudio(bool force_final = false) + void processAccumulatedAudio(WhisperModel &model, bool force_final = false) { std::vector process_buffer; size_t current_id; @@ -421,7 +436,7 @@ class ThreadedWhisperModel : public AsyncWhisperModel std::vector segments; try { - segments = this->transcribe_raw_audio(process_buffer.data(), process_buffer.size()); + segments = model.transcribe_raw_audio(process_buffer.data(), process_buffer.size()); } catch (const std::exception &e) { @@ -456,6 +471,8 @@ class ThreadedWhisperModel : public AsyncWhisperModel void processThread() override { + WhisperModel model(model_path, use_gpu); + while (running) { AudioChunk all_chunks; @@ -470,7 +487,7 @@ class ThreadedWhisperModel : public AsyncWhisperModel if (!running) { // Process any remaining audio as final before shutting down - processAccumulatedAudio(true); + processAccumulatedAudio(model, true); break; } @@ -499,7 +516,7 @@ class ThreadedWhisperModel : public AsyncWhisperModel } // Process the accumulated audio - processAccumulatedAudio(false); + processAccumulatedAudio(model, false); } } } diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py new file mode 100644 index 0000000..6cf7d1a --- /dev/null +++ b/tests/test_wrapper.py @@ -0,0 +1,197 @@ +import unittest +import numpy as np +import threading +import time +import queue +import os +from concurrent.futures import ThreadPoolExecutor +from simpler_whisper import _whisper_cpp as whisper + + +class TestWhisperWrapper(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Get the model path relative to the project root + cls.model_path = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "ggml-tiny.en-q5_1.bin" + ) + + # Verify model exists + if not os.path.exists(cls.model_path): + raise FileNotFoundError(f"Model file not found at {cls.model_path}") + + # Create sample audio data (silence) + cls.sample_rate = 16000 + duration_sec = 3 + cls.test_audio = np.zeros(cls.sample_rate * duration_sec, dtype=np.float32) + + # Create some mock audio with varying amplitudes for better testing + cls.mock_speech = np.sin( + 2 * np.pi * 440 * np.linspace(0, 1, cls.sample_rate) + ).astype(np.float32) + + def setUp(self): + """Ensure each test starts with fresh instances""" + self.results = queue.Queue() + + def tearDown(self): + """Cleanup after each test""" + while not self.results.empty(): + try: + self.results.get_nowait() + except queue.Empty: + break + + def test_sync_model_basic(self): + """Test basic synchronous model initialization and transcription""" + try: + model = whisper.WhisperModel(self.model_path, use_gpu=False) + result = model.transcribe(self.test_audio) + self.assertIsInstance(result, list) + except Exception as e: + self.fail(f"Basic synchronous model test failed: {str(e)}") + + def 
test_sync_model_empty_audio(self): + """Test synchronous model with empty audio""" + model = whisper.WhisperModel(self.model_path, use_gpu=False) + empty_audio = np.array([], dtype=np.float32) + with self.assertRaises(Exception): + model.transcribe(empty_audio) + + def test_sync_model_invalid_audio(self): + """Test synchronous model with invalid audio data""" + model = whisper.WhisperModel(self.model_path, use_gpu=False) + invalid_audio = np.array([1.5, -1.5], dtype=np.float64) # Wrong dtype + with self.assertRaises(Exception): + model.transcribe(invalid_audio) + + def test_async_model_basic(self): + """Test basic async model functionality""" + results = queue.Queue() + + def callback(chunk_id, segments, is_partial): + results.put((chunk_id, segments, is_partial)) + + model = whisper.AsyncWhisperModel(self.model_path, use_gpu=False) + try: + model.start(callback) + chunk_id = model.transcribe(self.test_audio) + + # Wait for result with timeout + try: + result = results.get(timeout=10) + self.assertEqual(result[0], chunk_id) # Check if chunk_id matches + except queue.Empty: + self.fail("Async transcription timeout") + + finally: + model.stop() + + def test_threaded_model_basic(self): + """Test basic threaded model functionality""" + results = queue.Queue() + + def callback(chunk_id, segments, is_partial): + results.put((chunk_id, segments, is_partial)) + + model = whisper.ThreadedWhisperModel( + self.model_path, + use_gpu=False, + max_duration_sec=5.0, + sample_rate=self.sample_rate, + ) + + try: + model.start(callback) + chunk_id = model.queue_audio(self.mock_speech) + + # Wait for result with timeout + try: + result = results.get(timeout=10) + self.assertEqual(result[0], chunk_id) + except queue.Empty: + self.fail("Threaded transcription timeout") + finally: + model.stop() + + def test_threaded_model_continuous(self): + """Test threaded model with continuous audio chunks""" + results = [] + result_lock = threading.Lock() + + def callback(chunk_id, segments, is_partial): + with result_lock: + results.append((chunk_id, segments, is_partial)) + + model = whisper.ThreadedWhisperModel( + self.model_path, + use_gpu=False, + max_duration_sec=1.0, + sample_rate=self.sample_rate, + ) + + try: + model.start(callback) + + # Queue multiple chunks of audio + chunk_size = self.sample_rate # 1 second chunks + num_chunks = 3 + chunk_ids = [] + + for i in range(num_chunks): + chunk = self.mock_speech[i * chunk_size : (i + 1) * chunk_size] + chunk_id = model.queue_audio(chunk) + chunk_ids.append(chunk_id) + time.sleep(0.1) # Small delay between chunks + + # Wait for all results + max_wait = 15 # seconds + start_time = time.time() + while len(results) < num_chunks and (time.time() - start_time) < max_wait: + time.sleep(0.1) + + self.assertGreaterEqual(len(results), num_chunks) + + finally: + model.stop() + + def test_log_callback(self): + """Test log callback functionality""" + log_messages = queue.Queue() + + def log_callback(level, message): + log_messages.put((level, message)) + + # Set the log callback + whisper.set_log_callback(log_callback) + + # Create a model to generate some logs + model = whisper.WhisperModel(self.model_path, use_gpu=False) + model.transcribe(self.test_audio) + + # Check if we received any log messages + try: + log_message = log_messages.get_nowait() + self.assertIsInstance(log_message, tuple) + self.assertIsInstance(log_message[0], int) # level + self.assertIsInstance(log_message[1], str) # message + except queue.Empty: + pass # It's okay if we don't get any log messages + + def 
test_concurrent_models(self): + """Test running multiple models concurrently""" + + def run_model(): + model = whisper.WhisperModel(self.model_path, use_gpu=False) + result = model.transcribe(self.test_audio) + return len(result) + + with ThreadPoolExecutor(max_workers=3) as executor: + futures = [executor.submit(run_model) for _ in range(3)] + results = [f.result() for f in futures] + + self.assertEqual(len(results), 3) + + +if __name__ == "__main__": + unittest.main() From d8124b921cb667ca53e052d5f8e303997add76bc Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Tue, 29 Oct 2024 21:40:05 -0400 Subject: [PATCH 04/11] Refactor cmake/BuildWhispercpp.cmake: Update logic for building universal binaries --- cmake/BuildWhispercpp.cmake | 85 ++++++++++++++++++++++++++----------- setup.py | 6 ++- 2 files changed, 64 insertions(+), 27 deletions(-) diff --git a/cmake/BuildWhispercpp.cmake b/cmake/BuildWhispercpp.cmake index 0043a28..0d8667f 100644 --- a/cmake/BuildWhispercpp.cmake +++ b/cmake/BuildWhispercpp.cmake @@ -6,46 +6,81 @@ set(PREBUILT_WHISPERCPP_URL_BASE "https://github.com/locaal-ai/occ-ai-dep-whispercpp/releases/download/${PREBUILT_WHISPERCPP_VERSION}") if(APPLE) - # check the "MACOS_ARCH" env var to figure out if this is x86 or arm64 - if($ENV{MACOS_ARCH} STREQUAL "x86_64") - set(WHISPER_CPP_HASH "dc7fd5ff9c7fbb8623f8e14d9ff2872186cab4cd7a52066fcb2fab790d6092fc") - elseif($ENV{MACOS_ARCH} STREQUAL "arm64") - set(WHISPER_CPP_HASH "ebed595ee431b182261bce41583993b149eed539e15ebf770d98a6bc85d53a92") - else() - message( - FATAL_ERROR - "The MACOS_ARCH environment variable is not set to a valid value. Please set it to either `x86_64` or `arm64`") - endif() - set(WHISPER_CPP_URL - "${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-macos-$ENV{MACOS_ARCH}-${PREBUILT_WHISPERCPP_VERSION}.tar.gz") + # Store source directories for each architecture + foreach(MACOS_ARCH IN ITEMS "x86_64" "arm64") + if(${MACOS_ARCH} STREQUAL "x86_64") + set(WHISPER_CPP_HASH "dc7fd5ff9c7fbb8623f8e14d9ff2872186cab4cd7a52066fcb2fab790d6092fc") + elseif(${MACOS_ARCH} STREQUAL "arm64") + set(WHISPER_CPP_HASH "ebed595ee431b182261bce41583993b149eed539e15ebf770d98a6bc85d53a92") + endif() + + set(WHISPER_CPP_URL + "${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-macos-${MACOS_ARCH}-${PREBUILT_WHISPERCPP_VERSION}.tar.gz") - FetchContent_Declare( - whispercpp_fetch - URL ${WHISPER_CPP_URL} - URL_HASH SHA256=${WHISPER_CPP_HASH}) - FetchContent_MakeAvailable(whispercpp_fetch) + # Use unique names for each architecture's fetch + FetchContent_Declare( + whispercpp_fetch_${MACOS_ARCH} + URL ${WHISPER_CPP_URL} + URL_HASH SHA256=${WHISPER_CPP_HASH}) + FetchContent_MakeAvailable(whispercpp_fetch_${MACOS_ARCH}) + + # Store the source dir for each arch + if(${MACOS_ARCH} STREQUAL "x86_64") + set(WHISPER_X86_64_DIR ${whispercpp_fetch_x86_64_SOURCE_DIR}) + else() + set(WHISPER_ARM64_DIR ${whispercpp_fetch_arm64_SOURCE_DIR}) + endif() + endforeach() + # Create a directory for the universal binaries + set(UNIVERSAL_LIB_DIR ${CMAKE_BINARY_DIR}/universal/lib) + file(MAKE_DIRECTORY ${UNIVERSAL_LIB_DIR}) + + # Create universal binaries using lipo + execute_process( + COMMAND lipo -create + "${WHISPER_X86_64_DIR}/lib/libwhisper.a" + "${WHISPER_ARM64_DIR}/lib/libwhisper.a" + -output "${UNIVERSAL_LIB_DIR}/libwhisper.a" + ) + + execute_process( + COMMAND lipo -create + "${WHISPER_X86_64_DIR}/lib/libggml.a" + "${WHISPER_ARM64_DIR}/lib/libggml.a" + -output "${UNIVERSAL_LIB_DIR}/libggml.a" + ) + + execute_process( + COMMAND lipo -create + 
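+    # third merge: the CoreML helper library (assumes both prebuilt
+    # archives ship lib/libwhisper.coreml.a)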
"${WHISPER_X86_64_DIR}/lib/libwhisper.coreml.a" + "${WHISPER_ARM64_DIR}/lib/libwhisper.coreml.a" + -output "${UNIVERSAL_LIB_DIR}/libwhisper.coreml.a" + ) + + # Set up the imported libraries to use the universal binaries add_library(Whispercpp::Whisper STATIC IMPORTED) set_target_properties( Whispercpp::Whisper PROPERTIES IMPORTED_LOCATION - ${whispercpp_fetch_SOURCE_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX}) - set_target_properties(Whispercpp::Whisper PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - ${whispercpp_fetch_SOURCE_DIR}/include) + "${UNIVERSAL_LIB_DIR}/libwhisper.a") + set_target_properties(Whispercpp::Whisper PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES ${WHISPER_ARM64_DIR}/include) # Either arch's include dir is fine + add_library(Whispercpp::GGML STATIC IMPORTED) set_target_properties( Whispercpp::GGML PROPERTIES IMPORTED_LOCATION - ${whispercpp_fetch_SOURCE_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}ggml${CMAKE_STATIC_LIBRARY_SUFFIX}) + "${UNIVERSAL_LIB_DIR}/libggml.a") add_library(Whispercpp::CoreML STATIC IMPORTED) set_target_properties( Whispercpp::CoreML - PROPERTIES - IMPORTED_LOCATION - ${whispercpp_fetch_SOURCE_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}whisper.coreml${CMAKE_STATIC_LIBRARY_SUFFIX}) + PROPERTIES IMPORTED_LOCATION + "${UNIVERSAL_LIB_DIR}/libwhisper.coreml.a") - set(WHISPER_ADDITIONAL_FILES ${whispercpp_fetch_SOURCE_DIR}/bin/ggml-metal.metal) + # Copy the metal file from either architecture (they should be identical) + set(WHISPER_ADDITIONAL_FILES ${WHISPER_ARM64_DIR}/bin/ggml-metal.metal) set(WHISPER_ADDITIONAL_FILES ${whispercpp_fetch_SOURCE_DIR}/bin/ggml-metal.metal) elseif(WIN32) if(NOT DEFINED ACCELERATION) message(FATAL_ERROR "ACCELERATION is not set. Please set it to either `cpu`, `cuda`, `vulkan` or `hipblas`") diff --git a/setup.py b/setup.py index a34fff7..03f1fad 100644 --- a/setup.py +++ b/setup.py @@ -56,12 +56,14 @@ def build_extension(self, ext): # Add platform-specific arguments if platform.system() == "Darwin": # macOS cmake_args += [ - f"-DCMAKE_OSX_ARCHITECTURES={target_platform}", + f"-DCMAKE_OSX_ARCHITECTURES=arm64;x86_64", "-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON", "-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON", f"-DCMAKE_INSTALL_NAME_DIR=@rpath", ] - env["MACOS_ARCH"] = target_platform + # Remove the MACOS_ARCH environment variable as we're building universal + if "MACOS_ARCH" in env: + del env["MACOS_ARCH"] cfg = "Debug" if self.debug else "Release" build_args = ["--config", cfg] From dfe6c8ed5f627a2eed777fefa6658717522bfae9 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Tue, 29 Oct 2024 22:26:15 -0400 Subject: [PATCH 05/11] Refactor test_simpler_whisper.py and whisper.py: Add exception handling for time.sleep() --- .github/workflows/build.yaml | 21 +--- CMakeLists.txt | 89 ++++++++--------- cmake/BuildWhispercpp.cmake | 179 +++++++++++++++++++++-------------- src/whisper_wrapper.cpp | 10 +- test_simpler_whisper.py | 5 +- 5 files changed, 169 insertions(+), 135 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 1febf40..0b09a31 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -14,7 +14,7 @@ jobs: matrix: os: ['windows-latest', 'macos-latest', 'ubuntu-latest'] python-version: ['3.11', '3.12'] - platform: ['x86_64', 'arm64', 'win64'] + platform: ['x86_64', 'win64'] acceleration: ['cpu', 'cuda', 'hipblas', 'vulkan'] exclude: - os: windows-latest @@ -31,8 +31,6 @@ jobs: platform: win64 - os: ubuntu-latest platform: win64 - - os: ubuntu-latest - 
platform: arm64 - os: ubuntu-latest acceleration: cuda - os: ubuntu-latest @@ -45,25 +43,10 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Set up Python Non-Mac - if: ${{ matrix.os != 'macos-latest' }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Set up Python Mac arm64 - if: ${{ matrix.os == 'macos-latest' && matrix.platform == 'arm64' }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - architecture: 'arm64' - - - name: Set up Python Mac x86_64 - if: ${{ matrix.os == 'macos-latest' && matrix.platform == 'x86_64' }} + - name: Set up Python uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - architecture: 'x64' - name: Install dependencies run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 48a9fe7..b760016 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,43 +1,46 @@ -cmake_minimum_required(VERSION 3.15) -project(whisper_cpp_wrapper) - -set(CMAKE_CXX_STANDARD 11) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -# Find Python -find_package(Python ${PYTHON_VERSION} EXACT COMPONENTS Interpreter Development NumPy REQUIRED) - -# Fetch pybind11 -include(FetchContent) -FetchContent_Declare( - pybind11 - GIT_REPOSITORY https://github.com/pybind/pybind11.git - GIT_TAG v2.13.6 # Specify a version/tag here -) -FetchContent_MakeAvailable(pybind11) - -include(cmake/BuildWhispercpp.cmake) - -# Include directories -include_directories(${Python_INCLUDE_DIRS}) -include_directories(${Python_NumPy_INCLUDE_DIRS}) - -# Create the extension module -pybind11_add_module(_whisper_cpp src/whisper_wrapper.cpp) -target_link_libraries(_whisper_cpp PRIVATE Whispercpp) - -# Set the output directory for the built module -set_target_properties(_whisper_cpp PROPERTIES - LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/simpler_whisper -) - -# Copy the DLL to the output directory on Windows -if(WIN32 OR APPLE) - foreach(WHISPER_ADDITIONAL_FILE ${WHISPER_ADDITIONAL_FILES}) - add_custom_command(TARGET _whisper_cpp POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different - "${WHISPER_ADDITIONAL_FILE}" - $ - ) - endforeach() -endif() +cmake_minimum_required(VERSION 3.15) +project(whisper_cpp_wrapper) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Find Python +find_package( + Python ${PYTHON_VERSION} EXACT + COMPONENTS Interpreter Development + REQUIRED) + +# Fetch pybind11 +include(FetchContent) +FetchContent_Declare( + pybind11 + GIT_REPOSITORY https://github.com/pybind/pybind11.git + GIT_TAG v2.13.6 # Specify a version/tag here +) +FetchContent_MakeAvailable(pybind11) + +include(cmake/BuildWhispercpp.cmake) + +# Include directories +include_directories(${Python_INCLUDE_DIRS}) +include_directories(${Python_NumPy_INCLUDE_DIRS}) + +# Create the extension module +pybind11_add_module(_whisper_cpp src/whisper_wrapper.cpp) +target_link_libraries(_whisper_cpp PRIVATE Whispercpp) + +# Set the output directory for the built module +set_target_properties( + _whisper_cpp PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${CMAKE_CURRENT_SOURCE_DIR}/simpler_whisper) + +# Copy the DLL to the output directory on Windows +if(WIN32 OR APPLE) + foreach(WHISPER_ADDITIONAL_FILE ${WHISPER_ADDITIONAL_FILES}) + add_custom_command( + TARGET _whisper_cpp + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${WHISPER_ADDITIONAL_FILE}" $) + endforeach() +endif() diff --git a/cmake/BuildWhispercpp.cmake b/cmake/BuildWhispercpp.cmake index 0d8667f..c80f801 100644 --- 
a/cmake/BuildWhispercpp.cmake +++ b/cmake/BuildWhispercpp.cmake @@ -3,19 +3,23 @@ include(FetchContent) set(PREBUILT_WHISPERCPP_VERSION "0.0.7") set(PREBUILT_WHISPERCPP_URL_BASE - "https://github.com/locaal-ai/occ-ai-dep-whispercpp/releases/download/${PREBUILT_WHISPERCPP_VERSION}") + "https://github.com/locaal-ai/occ-ai-dep-whispercpp/releases/download/${PREBUILT_WHISPERCPP_VERSION}" +) if(APPLE) # Store source directories for each architecture foreach(MACOS_ARCH IN ITEMS "x86_64" "arm64") if(${MACOS_ARCH} STREQUAL "x86_64") - set(WHISPER_CPP_HASH "dc7fd5ff9c7fbb8623f8e14d9ff2872186cab4cd7a52066fcb2fab790d6092fc") + set(WHISPER_CPP_HASH + "dc7fd5ff9c7fbb8623f8e14d9ff2872186cab4cd7a52066fcb2fab790d6092fc") elseif(${MACOS_ARCH} STREQUAL "arm64") - set(WHISPER_CPP_HASH "ebed595ee431b182261bce41583993b149eed539e15ebf770d98a6bc85d53a92") + set(WHISPER_CPP_HASH + "ebed595ee431b182261bce41583993b149eed539e15ebf770d98a6bc85d53a92") endif() - + set(WHISPER_CPP_URL - "${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-macos-${MACOS_ARCH}-${PREBUILT_WHISPERCPP_VERSION}.tar.gz") + "${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-macos-${MACOS_ARCH}-${PREBUILT_WHISPERCPP_VERSION}.tar.gz" + ) # Use unique names for each architecture's fetch FetchContent_Declare( @@ -23,7 +27,7 @@ if(APPLE) URL ${WHISPER_CPP_URL} URL_HASH SHA256=${WHISPER_CPP_HASH}) FetchContent_MakeAvailable(whispercpp_fetch_${MACOS_ARCH}) - + # Store the source dir for each arch if(${MACOS_ARCH} STREQUAL "x86_64") set(WHISPER_X86_64_DIR ${whispercpp_fetch_x86_64_SOURCE_DIR}) @@ -38,68 +42,73 @@ if(APPLE) # Create universal binaries using lipo execute_process( - COMMAND lipo -create - "${WHISPER_X86_64_DIR}/lib/libwhisper.a" - "${WHISPER_ARM64_DIR}/lib/libwhisper.a" - -output "${UNIVERSAL_LIB_DIR}/libwhisper.a" - ) - + COMMAND + lipo -create "${WHISPER_X86_64_DIR}/lib/libwhisper.a" + "${WHISPER_ARM64_DIR}/lib/libwhisper.a" -output + "${UNIVERSAL_LIB_DIR}/libwhisper.a") + execute_process( - COMMAND lipo -create - "${WHISPER_X86_64_DIR}/lib/libggml.a" - "${WHISPER_ARM64_DIR}/lib/libggml.a" - -output "${UNIVERSAL_LIB_DIR}/libggml.a" - ) + COMMAND + lipo -create "${WHISPER_X86_64_DIR}/lib/libggml.a" + "${WHISPER_ARM64_DIR}/lib/libggml.a" -output + "${UNIVERSAL_LIB_DIR}/libggml.a") execute_process( - COMMAND lipo -create - "${WHISPER_X86_64_DIR}/lib/libwhisper.coreml.a" - "${WHISPER_ARM64_DIR}/lib/libwhisper.coreml.a" - -output "${UNIVERSAL_LIB_DIR}/libwhisper.coreml.a" - ) + COMMAND + lipo -create "${WHISPER_X86_64_DIR}/lib/libwhisper.coreml.a" + "${WHISPER_ARM64_DIR}/lib/libwhisper.coreml.a" -output + "${UNIVERSAL_LIB_DIR}/libwhisper.coreml.a") # Set up the imported libraries to use the universal binaries add_library(Whispercpp::Whisper STATIC IMPORTED) set_target_properties( - Whispercpp::Whisper - PROPERTIES IMPORTED_LOCATION - "${UNIVERSAL_LIB_DIR}/libwhisper.a") - set_target_properties(Whispercpp::Whisper PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES ${WHISPER_ARM64_DIR}/include) # Either arch's include dir is fine + Whispercpp::Whisper PROPERTIES IMPORTED_LOCATION + "${UNIVERSAL_LIB_DIR}/libwhisper.a") + set_target_properties( + Whispercpp::Whisper PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + ${WHISPER_ARM64_DIR}/include) # Either arch's + # include dir + # is fine add_library(Whispercpp::GGML STATIC IMPORTED) set_target_properties( - Whispercpp::GGML - PROPERTIES IMPORTED_LOCATION - "${UNIVERSAL_LIB_DIR}/libggml.a") + Whispercpp::GGML PROPERTIES IMPORTED_LOCATION + "${UNIVERSAL_LIB_DIR}/libggml.a") add_library(Whispercpp::CoreML STATIC IMPORTED) 
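  # a quick manual sanity check for the merged archives is
  # `lipo -info ${UNIVERSAL_LIB_DIR}/libwhisper.a`, which should report
  # both x86_64 and arm64 slices after the merges above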
set_target_properties( - Whispercpp::CoreML - PROPERTIES IMPORTED_LOCATION - "${UNIVERSAL_LIB_DIR}/libwhisper.coreml.a") + Whispercpp::CoreML PROPERTIES IMPORTED_LOCATION + "${UNIVERSAL_LIB_DIR}/libwhisper.coreml.a") # Copy the metal file from either architecture (they should be identical) - set(WHISPER_ADDITIONAL_FILES ${WHISPER_ARM64_DIR}/bin/ggml-metal.metal) set(WHISPER_ADDITIONAL_FILES ${whispercpp_fetch_SOURCE_DIR}/bin/ggml-metal.metal) + set(WHISPER_ADDITIONAL_FILES ${WHISPER_ARM64_DIR}/bin/ggml-metal.metal) elseif(WIN32) if(NOT DEFINED ACCELERATION) - message(FATAL_ERROR "ACCELERATION is not set. Please set it to either `cpu`, `cuda`, `vulkan` or `hipblas`") + message( + FATAL_ERROR + "ACCELERATION is not set. Please set it to either `cpu`, `cuda`, `vulkan` or `hipblas`" + ) endif() set(ARCH_PREFIX ${ACCELERATION}) set(WHISPER_CPP_URL - "${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-windows-${ARCH_PREFIX}-${PREBUILT_WHISPERCPP_VERSION}.zip") + "${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-windows-${ARCH_PREFIX}-${PREBUILT_WHISPERCPP_VERSION}.zip" + ) if(${ACCELERATION} STREQUAL "cpu") - set(WHISPER_CPP_HASH "c23862b4aac7d8448cf7de4d339a86498f88ecba6fa7d243bbd7fabdb13d4dd4") + set(WHISPER_CPP_HASH + "c23862b4aac7d8448cf7de4d339a86498f88ecba6fa7d243bbd7fabdb13d4dd4") add_compile_definitions("LOCALVOCAL_WITH_CPU") elseif(${ACCELERATION} STREQUAL "cuda") - set(WHISPER_CPP_HASH "a0adeaccae76fab0678d016a62b79a19661ed34eb810d8bae3b610345ee9a405") + set(WHISPER_CPP_HASH + "a0adeaccae76fab0678d016a62b79a19661ed34eb810d8bae3b610345ee9a405") add_compile_definitions("LOCALVOCAL_WITH_CUDA") elseif(${ACCELERATION} STREQUAL "hipblas") - set(WHISPER_CPP_HASH "bbad0b4eec01c5a801d384c03745ef5e97061958f8cf8f7724281d433d7d92a1") + set(WHISPER_CPP_HASH + "bbad0b4eec01c5a801d384c03745ef5e97061958f8cf8f7724281d433d7d92a1") add_compile_definitions("LOCALVOCAL_WITH_HIPBLAS") elseif(${ACCELERATION} STREQUAL "vulkan") - set(WHISPER_CPP_HASH "12bb34821f9efcd31f04a487569abff2b669221f2706fe0d09c17883635ef58a") + set(WHISPER_CPP_HASH + "12bb34821f9efcd31f04a487569abff2b669221f2706fe0d09c17883635ef58a") add_compile_definitions("LOCALVOCAL_WITH_VULKAN") else() message( @@ -118,42 +127,55 @@ elseif(WIN32) add_library(Whispercpp::Whisper SHARED IMPORTED) set_target_properties( Whispercpp::Whisper - PROPERTIES IMPORTED_LOCATION - ${whispercpp_fetch_SOURCE_DIR}/bin/${CMAKE_SHARED_LIBRARY_PREFIX}whisper${CMAKE_SHARED_LIBRARY_SUFFIX}) + PROPERTIES + IMPORTED_LOCATION + ${whispercpp_fetch_SOURCE_DIR}/bin/${CMAKE_SHARED_LIBRARY_PREFIX}whisper${CMAKE_SHARED_LIBRARY_SUFFIX} + ) set_target_properties( Whispercpp::Whisper - PROPERTIES IMPORTED_IMPLIB - ${whispercpp_fetch_SOURCE_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX}) - set_target_properties(Whispercpp::Whisper PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - ${whispercpp_fetch_SOURCE_DIR}/include) + PROPERTIES + IMPORTED_IMPLIB + ${whispercpp_fetch_SOURCE_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX} + ) + set_target_properties( + Whispercpp::Whisper PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + ${whispercpp_fetch_SOURCE_DIR}/include) add_library(Whispercpp::GGML SHARED IMPORTED) set_target_properties( Whispercpp::GGML - PROPERTIES IMPORTED_LOCATION - ${whispercpp_fetch_SOURCE_DIR}/bin/${CMAKE_SHARED_LIBRARY_PREFIX}ggml${CMAKE_SHARED_LIBRARY_SUFFIX}) + PROPERTIES + IMPORTED_LOCATION + ${whispercpp_fetch_SOURCE_DIR}/bin/${CMAKE_SHARED_LIBRARY_PREFIX}ggml${CMAKE_SHARED_LIBRARY_SUFFIX} + ) set_target_properties( 
Whispercpp::GGML - PROPERTIES IMPORTED_IMPLIB - ${whispercpp_fetch_SOURCE_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}ggml${CMAKE_STATIC_LIBRARY_SUFFIX}) + PROPERTIES + IMPORTED_IMPLIB + ${whispercpp_fetch_SOURCE_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}ggml${CMAKE_STATIC_LIBRARY_SUFFIX} + ) if(${ACCELERATION} STREQUAL "cpu") # add openblas to the link line add_library(Whispercpp::OpenBLAS STATIC IMPORTED) - set_target_properties(Whispercpp::OpenBLAS PROPERTIES IMPORTED_LOCATION - ${whispercpp_fetch_SOURCE_DIR}/lib/libopenblas.dll.a) + set_target_properties( + Whispercpp::OpenBLAS + PROPERTIES IMPORTED_LOCATION + ${whispercpp_fetch_SOURCE_DIR}/lib/libopenblas.dll.a) endif() # glob all dlls in the bin directory and install them file(GLOB WHISPER_ADDITIONAL_FILES ${whispercpp_fetch_SOURCE_DIR}/bin/*.dll) else() - if(${CMAKE_BUILD_TYPE} STREQUAL Release OR ${CMAKE_BUILD_TYPE} STREQUAL RelWithDebInfo) + if(${CMAKE_BUILD_TYPE} STREQUAL Release OR ${CMAKE_BUILD_TYPE} STREQUAL + RelWithDebInfo) set(Whispercpp_BUILD_TYPE Release) else() set(Whispercpp_BUILD_TYPE Debug) endif() set(Whispercpp_Build_GIT_TAG "v1.7.1") set(WHISPER_EXTRA_CXX_FLAGS "-fPIC") - set(WHISPER_ADDITIONAL_CMAKE_ARGS -DWHISPER_BLAS=OFF -DWHISPER_CUBLAS=OFF -DWHISPER_OPENBLAS=OFF) + set(WHISPER_ADDITIONAL_CMAKE_ARGS -DWHISPER_BLAS=OFF -DWHISPER_CUBLAS=OFF + -DWHISPER_OPENBLAS=OFF) # On Linux build a static Whisper library ExternalProject_Add( @@ -161,18 +183,27 @@ else() DOWNLOAD_EXTRACT_TIMESTAMP true GIT_REPOSITORY https://github.com/ggerganov/whisper.cpp.git GIT_TAG ${Whispercpp_Build_GIT_TAG} - BUILD_COMMAND ${CMAKE_COMMAND} --build --config ${Whispercpp_BUILD_TYPE} - BUILD_BYPRODUCTS /lib/static/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_COMMAND ${CMAKE_COMMAND} --build --config + ${Whispercpp_BUILD_TYPE} + BUILD_BYPRODUCTS + /lib/static/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX} CMAKE_GENERATOR ${CMAKE_GENERATOR} - INSTALL_COMMAND ${CMAKE_COMMAND} --install --config ${Whispercpp_BUILD_TYPE} && ${CMAKE_COMMAND} -E - copy /ggml/include/ggml.h /include + INSTALL_COMMAND + ${CMAKE_COMMAND} --install --config ${Whispercpp_BUILD_TYPE} + && ${CMAKE_COMMAND} -E copy /ggml/include/ggml.h + /include CONFIGURE_COMMAND - ${CMAKE_COMMAND} -E env ${WHISPER_ADDITIONAL_ENV} ${CMAKE_COMMAND} -B -G - ${CMAKE_GENERATOR} -DCMAKE_INSTALL_PREFIX= -DCMAKE_BUILD_TYPE=${Whispercpp_BUILD_TYPE} - -DCMAKE_GENERATOR_PLATFORM=${CMAKE_GENERATOR_PLATFORM} -DCMAKE_OSX_DEPLOYMENT_TARGET=10.13 - -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES_} -DCMAKE_CXX_FLAGS=${WHISPER_EXTRA_CXX_FLAGS} - -DCMAKE_C_FLAGS=${WHISPER_EXTRA_CXX_FLAGS} -DBUILD_SHARED_LIBS=OFF -DWHISPER_BUILD_TESTS=OFF - -DWHISPER_BUILD_EXAMPLES=OFF ${WHISPER_ADDITIONAL_CMAKE_ARGS}) + ${CMAKE_COMMAND} -E env ${WHISPER_ADDITIONAL_ENV} ${CMAKE_COMMAND} + -B -G ${CMAKE_GENERATOR} + -DCMAKE_INSTALL_PREFIX= + -DCMAKE_BUILD_TYPE=${Whispercpp_BUILD_TYPE} + -DCMAKE_GENERATOR_PLATFORM=${CMAKE_GENERATOR_PLATFORM} + -DCMAKE_OSX_DEPLOYMENT_TARGET=10.13 + -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES_} + -DCMAKE_CXX_FLAGS=${WHISPER_EXTRA_CXX_FLAGS} + -DCMAKE_C_FLAGS=${WHISPER_EXTRA_CXX_FLAGS} -DBUILD_SHARED_LIBS=OFF + -DWHISPER_BUILD_TESTS=OFF -DWHISPER_BUILD_EXAMPLES=OFF + ${WHISPER_ADDITIONAL_CMAKE_ARGS}) ExternalProject_Get_Property(Whispercpp_Build INSTALL_DIR) @@ -180,14 +211,20 @@ else() add_library(Whispercpp::Whisper STATIC IMPORTED) set_target_properties( Whispercpp::Whisper - PROPERTIES IMPORTED_LOCATION - 
${INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX}) + PROPERTIES + IMPORTED_LOCATION + ${INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX} + ) add_library(Whispercpp::GGML STATIC IMPORTED) set_target_properties( Whispercpp::GGML - PROPERTIES IMPORTED_LOCATION - ${INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}ggml${CMAKE_STATIC_LIBRARY_SUFFIX}) - set_target_properties(Whispercpp::Whisper PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${INSTALL_DIR}/include) + PROPERTIES + IMPORTED_LOCATION + ${INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}ggml${CMAKE_STATIC_LIBRARY_SUFFIX} + ) + set_target_properties( + Whispercpp::Whisper PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + ${INSTALL_DIR}/include) endif() add_library(Whispercpp INTERFACE) @@ -197,6 +234,8 @@ if(WIN32 AND "${ACCELERATION}" STREQUAL "cpu") target_link_libraries(Whispercpp INTERFACE Whispercpp::OpenBLAS) endif() if(APPLE) - target_link_libraries(Whispercpp INTERFACE "-framework Accelerate -framework CoreML -framework Metal") + target_link_libraries( + Whispercpp + INTERFACE "-framework Accelerate -framework CoreML -framework Metal") target_link_libraries(Whispercpp INTERFACE Whispercpp::CoreML) endif(APPLE) diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index de9daea..b493754 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -91,13 +91,19 @@ class WhisperModel py::list transcribe(py::array_t audio) { + py::list result; + // Check if input is empty + if (audio.is_none() || audio.size() == 0) + { + return result; + } + auto audio_buffer = audio.request(); float *audio_data = static_cast(audio_buffer.ptr); int n_samples = audio_buffer.size; std::vector segments = transcribe_raw_audio(audio_data, n_samples); - py::list result; for (const auto &segment : segments) { result.append(py::cast(segment)); @@ -400,7 +406,7 @@ class ThreadedWhisperModel : public AsyncWhisperModel AsyncWhisperModel::start(callback, result_check_interval_ms); } - void stop() + void stop() override { AsyncWhisperModel::stop(); diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py index bf2288b..0d873ad 100644 --- a/test_simpler_whisper.py +++ b/test_simpler_whisper.py @@ -167,7 +167,10 @@ def handle_result(chunk_id: int, segments: List[WhisperSegment], is_partial: boo # wait for all chunks to finish processing while len(chunk_ids) > 0: - time.sleep(0.1) + try: + time.sleep(0.1) + except: + break # When done print("Stopping Whisper model") From 9828b2698af8d9205b3e6877a0738178fcebd4d7 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Tue, 29 Oct 2024 22:43:56 -0400 Subject: [PATCH 06/11] Refactor cmake/BuildWhispercpp.cmake: Add OpenMP support for Unix systems --- cmake/BuildWhispercpp.cmake | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/cmake/BuildWhispercpp.cmake b/cmake/BuildWhispercpp.cmake index c80f801..b71ef00 100644 --- a/cmake/BuildWhispercpp.cmake +++ b/cmake/BuildWhispercpp.cmake @@ -1,6 +1,13 @@ include(ExternalProject) include(FetchContent) +if(UNIX AND NOT APPLE) + find_package(OpenMP REQUIRED) + # Set compiler flags for OpenMP + set(WHISPER_EXTRA_CXX_FLAGS "${WHISPER_EXTRA_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set(WHISPER_EXTRA_C_FLAGS "${WHISPER_EXTRA_CXX_FLAGS} ${OpenMP_C_FLAGS}") +endif() + set(PREBUILT_WHISPERCPP_VERSION "0.0.7") set(PREBUILT_WHISPERCPP_URL_BASE "https://github.com/locaal-ai/occ-ai-dep-whispercpp/releases/download/${PREBUILT_WHISPERCPP_VERSION}" @@ -177,6 +184,11 @@ else() 
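+      # keep the BLAS/cuBLAS/OpenBLAS code paths disabled for the static build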
From a6a42affaa87c6320fc124315c84978c42adcdca Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Tue, 29 Oct 2024 23:49:35 -0400
Subject: [PATCH 07/11] Refactor .github/workflows/build.yaml: Modify import
 path for simpler_whisper to avoid conflicts

---
 .github/workflows/build.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 0b09a31..b0a6fbb 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -81,7 +81,7 @@ jobs:
 
       - name: Test import
         run: |
-          python -c "import simpler_whisper; print(simpler_whisper.__file__)"
+          python -c "import sys; sys.path.pop(0); import simpler_whisper; print(simpler_whisper.__file__)"
 
       - name: Rename wheel file
         shell: python

From ec48641d4ca5a77bf4c29c5f54c0d8f66f034a9b Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Wed, 30 Oct 2024 08:26:51 -0400
Subject: [PATCH 08/11] Refactor .github/workflows/build.yaml: Simplify
 acceleration options and disable import test

---
 .github/workflows/build.yaml | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index b0a6fbb..4bd103f 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -15,27 +15,19 @@ jobs:
         os: ['windows-latest', 'macos-latest', 'ubuntu-latest']
         python-version: ['3.11', '3.12']
         platform: ['x86_64', 'win64']
-        acceleration: ['cpu', 'cuda', 'hipblas', 'vulkan']
+        acceleration: ['cpu', 'cuda']
         exclude:
           - os: windows-latest
             platform: arm64
           - os: macos-latest
             platform: x86_64
           - os: macos-latest
             acceleration: cuda
-          - os: macos-latest
-            acceleration: hipblas
-          - os: macos-latest
-            acceleration: vulkan
           - os: macos-latest
             platform: win64
           - os: ubuntu-latest
             platform: win64
           - os: ubuntu-latest
             acceleration: cuda
-          - os: ubuntu-latest
-            acceleration: hipblas
-          - os: ubuntu-latest
-            acceleration: vulkan
 
     runs-on: ${{ matrix.os }}
@@ -80,6 +72,7 @@ jobs:
           pip install $wheelFile.FullName
 
       - name: Test import
+        if: false
         run: |
           python -c "import sys; sys.path.pop(0); import simpler_whisper; print(simpler_whisper.__file__)"
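The sys.path.pop(0) in the import test works around Python's default of putting the current directory first on the module search path: run from the repository root, a bare import simpler_whisper would resolve to the source tree, which does not contain the compiled _whisper_cpp extension. Roughly:

    import sys

    # Under `python -c`, sys.path[0] is '' (the current directory). From a
    # source checkout that entry shadows the installed wheel, so drop it.
    sys.path.pop(0)

    import simpler_whisper
    print(simpler_whisper.__file__)  # should now point into site-packages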
From bf83a5f2313e7be50097544659a4674e96c28766 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Wed, 30 Oct 2024 09:21:33 -0400
Subject: [PATCH 09/11] Refactor build.yaml, setup.py, whisper.py, and
 test_simpler_whisper.py: Remove wheel renaming logic, update long
 description, and enhance audio normalization

---
 .github/workflows/build.yaml | 19 -------------------
 setup.py                     |  6 +++++-
 simpler_whisper/whisper.py   |  8 ++++++--
 test_simpler_whisper.py      |  2 ++
 4 files changed, 13 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 4bd103f..5159778 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -76,25 +76,6 @@ jobs:
         run: |
           python -c "import sys; sys.path.pop(0); import simpler_whisper; print(simpler_whisper.__file__)"
 
-      - name: Rename wheel file
-        shell: python
-        run: |
-          import os
-          import glob
-
-          wheel_file = glob.glob('dist/*.whl')[0]
-          base_name = os.path.basename(wheel_file)
-          name_parts = base_name.split('-')
-
-          # Insert acceleration and platform after the first part of the name
-          underscore_parts = [name_parts[0], '${{ matrix.acceleration }}', '${{ matrix.platform }}']
-          new_name_parts = ['_'.join(underscore_parts)] + name_parts[1:]
-          new_name = '-'.join(new_name_parts)
-
-          new_path = os.path.join('dist', new_name)
-          os.rename(wheel_file, new_path)
-          print(f"Renamed {base_name} to {new_name}")
-
       - name: Set wheel name
         shell: pwsh
         run: |
diff --git a/setup.py b/setup.py
index 03f1fad..223c6f0 100644
--- a/setup.py
+++ b/setup.py
@@ -99,17 +99,21 @@ def build_extension(self, ext):
         )
 
 
+acceleration = os.getenv("SIMPLER_WHISPER_ACCELERATION", "")
+build_tag = acceleration if acceleration else ""
+
 setup(
     name="simpler-whisper",
     version="0.2.2",
     author="Roy Shilkrot",
     author_email="roy.shil@gmail.com",
     description="A simple Python wrapper for whisper.cpp",
-    long_description="",
+    long_description="A simple Python wrapper for whisper.cpp",
     ext_modules=[CMakeExtension("simpler_whisper._whisper_cpp")],
     cmdclass=dict(build_ext=CMakeBuild),
     zip_safe=False,
     packages=[
         "simpler_whisper"
     ],  # Add this line to ensure the package directory is created
+    options={"bdist_wheel": {"build_tag": build_tag}},
 )
diff --git a/simpler_whisper/whisper.py b/simpler_whisper/whisper.py
index 834cbbe..090ee0c 100644
--- a/simpler_whisper/whisper.py
+++ b/simpler_whisper/whisper.py
@@ -74,7 +74,9 @@ def transcribe(self, audio: Union[np.ndarray, List[float]]) -> int:
         # Run async inference (no return value)
         return self.model.transcribe(audio)
 
-    def handle_result(self, chunk_id: int, segments: List[WhisperSegment], is_partial: bool):
+    def handle_result(
+        self, chunk_id: int, segments: List[WhisperSegment], is_partial: bool
+    ):
         if self.callback is not None:
             self.callback(chunk_id, segments, is_partial)
 
@@ -143,7 +145,9 @@ def __init__(
         self._is_running = False
         self.callback = callback
 
-    def handle_result(self, chunk_id: int, segments: List[WhisperSegment], is_partial: bool):
+    def handle_result(
+        self, chunk_id: int, segments: List[WhisperSegment], is_partial: bool
+    ):
         if self.callback is not None:
             self.callback(chunk_id, segments, is_partial)
 
diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py
index 0d873ad..fda26bf 100644
--- a/test_simpler_whisper.py
+++ b/test_simpler_whisper.py
@@ -64,6 +64,8 @@ def get_samples_from_frame(frame: av.AudioFrame) -> np.ndarray:
     # check if the type is int16 or float32
     if incoming_audio.dtype == np.int16:
         incoming_audio = incoming_audio / 32768.0  # normalize to [-1, 1]
+    if incoming_audio.dtype == np.int32:
+        incoming_audio = incoming_audio / 2147483648.0  # normalize to [-1, 1]
     # resample to 16kHz if needed
     if frame.rate != 16000:
         samples = resampy.resample(incoming_audio, frame.rate, 16000)
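The int16 and int32 branches above divide by 2**15 and 2**31 respectively, mapping the most negative PCM sample exactly to -1.0. Factored into a standalone helper, the same logic looks like this; the function name is illustrative, not from the repository:

    import numpy as np

    def normalize_pcm(samples: np.ndarray) -> np.ndarray:
        # Divide by the magnitude of the most negative representable value,
        # so int16 [-32768, 32767] and int32 land inside [-1.0, 1.0).
        if samples.dtype == np.int16:
            return samples.astype(np.float32) / 32768.0
        if samples.dtype == np.int32:
            return samples.astype(np.float32) / 2147483648.0
        return samples.astype(np.float32)

    assert normalize_pcm(np.array([-32768, 32767], dtype=np.int16)).min() == -1.0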
From f84c3cbeee643cd100db8b1be34d1369e5b21921 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Wed, 30 Oct 2024 09:47:13 -0400
Subject: [PATCH 10/11] Refactor pyproject.toml and setup.py: Update numpy
 version constraint and enhance wheel distribution tagging

---
 pyproject.toml |  3 +--
 setup.py       | 18 ++++++++++++++----
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ad115e5..8a9ca23 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=45", "wheel", "cmake>=3.12", "numpy"]
+requires = ["setuptools>=45", "wheel", "cmake>=3.12", "numpy<=1.26.4"]
 build-backend = "setuptools.build_meta"
 
 [project]
@@ -17,7 +17,6 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
 ]
diff --git a/setup.py b/setup.py
index 223c6f0..0337013 100644
--- a/setup.py
+++ b/setup.py
@@ -5,6 +5,7 @@
 import subprocess
 import platform
 import sysconfig
+from wheel.bdist_wheel import bdist_wheel
 
 
 class CMakeExtension(Extension):
@@ -99,8 +100,15 @@ def build_extension(self, ext):
         )
 
 
-acceleration = os.getenv("SIMPLER_WHISPER_ACCELERATION", "")
-build_tag = acceleration if acceleration else ""
+class CustomBdistWheel(bdist_wheel):
+    def get_tag(self):
+        python, abi, platform = super().get_tag()
+        acceleration = os.environ.get("SIMPLER_WHISPER_ACCELERATION", "")
+        if acceleration:
+            # This creates the +cuda or +cpu tag
+            self.distribution.version += f"+{acceleration}"
+        return python, abi, platform
+
 
 setup(
     name="simpler-whisper",
@@ -110,10 +118,12 @@ def build_extension(self, ext):
     description="A simple Python wrapper for whisper.cpp",
     long_description="A simple Python wrapper for whisper.cpp",
     ext_modules=[CMakeExtension("simpler_whisper._whisper_cpp")],
-    cmdclass=dict(build_ext=CMakeBuild),
+    cmdclass={
+        "build_ext": CMakeBuild,
+        "bdist_wheel": CustomBdistWheel,
+    },
     zip_safe=False,
     packages=[
         "simpler_whisper"
     ],  # Add this line to ensure the package directory is created
-    options={"bdist_wheel": {"build_tag": build_tag}},
 )

From 4136426fe8c656894b558655125b823baa676095 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Wed, 30 Oct 2024 09:51:14 -0400
Subject: [PATCH 11/11] Refactor setup.py: Modify version tagging logic to
 temporarily adjust version for acceleration

---
 setup.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 0337013..dc6e14a 100644
--- a/setup.py
+++ b/setup.py
@@ -105,8 +105,10 @@ def get_tag(self):
         python, abi, platform = super().get_tag()
         acceleration = os.environ.get("SIMPLER_WHISPER_ACCELERATION", "")
         if acceleration:
-            # This creates the +cuda or +cpu tag
-            self.distribution.version += f"+{acceleration}"
+            # Store original version
+            orig_version = self.distribution.get_version()
+            # Temporarily modify version
+            self.distribution.metadata.version = f"{orig_version}+{acceleration}"
         return python, abi, platform
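The net effect of patches 10 and 11 is a PEP 440 local version identifier on the built wheel: get_tag() leaves the python/abi/platform tags untouched and rewrites only the distribution's version metadata, so a build with SIMPLER_WHISPER_ACCELERATION=cuda yields a wheel named along the lines of simpler_whisper-0.2.2+cuda-cp311-cp311-win_amd64.whl (filename illustrative). The "+cuda" suffix parses as a local version segment:

    from packaging.version import Version

    v = Version("0.2.2+cuda")
    print(v.public)  # -> 0.2.2  (the public portion of the version)
    print(v.local)   # -> cuda   (the acceleration tag carried by the wheel)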