From 1552868f0c99863f3afbd1bead0c9c4be41b2ec2 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Sun, 27 Oct 2024 09:21:11 -0400 Subject: [PATCH 01/11] Refactor version numbers in pyproject.toml and setup.py --- pyproject.toml | 2 +- setup.py | 2 +- simpler_whisper/whisper.py | 2 +- src/whisper_wrapper.cpp | 130 +++++++++++++++++++++++-------------- test_simpler_whisper.py | 5 +- 5 files changed, 88 insertions(+), 53 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3db127c..d0d78a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "simpler-whisper" -version = "0.2.1" +version = "0.2.2" authors = [ {name = "Roy Shilkrot", email = "roy.shil@gmail.com"}, ] diff --git a/setup.py b/setup.py index a7e86ef..a34fff7 100644 --- a/setup.py +++ b/setup.py @@ -99,7 +99,7 @@ def build_extension(self, ext): setup( name="simpler-whisper", - version="0.2.1", + version="0.2.2", author="Roy Shilkrot", author_email="roy.shil@gmail.com", description="A simple Python wrapper for whisper.cpp", diff --git a/simpler_whisper/whisper.py b/simpler_whisper/whisper.py index b0ced56..62319a8 100644 --- a/simpler_whisper/whisper.py +++ b/simpler_whisper/whisper.py @@ -48,7 +48,7 @@ class ThreadedWhisperModel: def __init__( self, model_path: str, - callback: Callable[[int, str, bool], None], + callback: Callable[[int, List[WhisperSegment], bool], None], use_gpu=False, max_duration_sec=10.0, sample_rate=16000, diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index 68afd6e..11f5f7f 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -119,6 +119,8 @@ class WhisperModel { const char *text = whisper_full_get_segment_text(ctx, i); WhisperSegment segment; + segment.start = whisper_full_get_segment_t0(ctx, i); + segment.end = whisper_full_get_segment_t1(ctx, i); segment.text = std::string(text); const int n_tokens = whisper_full_n_tokens(ctx, i); for (int j = 0; j < n_tokens; ++j) @@ -146,6 +148,7 @@ class WhisperModel whisper_full_params params; }; + struct AudioChunk { std::vector data; @@ -155,25 +158,22 @@ struct AudioChunk struct TranscriptionResult { size_t chunk_id; - std::vector segments; bool is_partial; + std::vector segments; }; -class ThreadedWhisperModel + +class AsyncWhisperModel : public WhisperModel { public: - ThreadedWhisperModel(const std::string &model_path, bool use_gpu = false, - float max_duration_sec = 10.0f, int sample_rate = 16000) - : running(false), next_chunk_id(0), - max_samples(static_cast(max_duration_sec * sample_rate)), - current_chunk_id(0), model_path(model_path), - use_gpu(use_gpu) + AsyncWhisperModel(const std::string &model_path, bool use_gpu = false) : + WhisperModel(model_path, use_gpu), running(false), next_chunk_id(0), current_chunk_id(0) { + } - ~ThreadedWhisperModel() + ~AsyncWhisperModel() { - stop(); } void start(py::function callback, int result_check_interval_ms = 100) @@ -184,12 +184,17 @@ class ThreadedWhisperModel running = true; result_callback = callback; - process_thread = std::thread(&ThreadedWhisperModel::processThread, this); - result_thread = std::thread(&ThreadedWhisperModel::resultThread, this, + process_thread = std::thread(&AsyncWhisperModel::processThread, this); + result_thread = std::thread(&AsyncWhisperModel::resultThread, this, result_check_interval_ms); } + + void transcribe(py::array_t audio) + { + this->queueAudio(audio); + } - void stop() + virtual void stop() { if (!running) return; @@ -209,12 +214,6 @@ class ThreadedWhisperModel 
process_thread.join(); if (result_thread.joinable()) result_thread.join(); - - // Clear accumulated buffer - { - std::lock_guard lock(buffer_mutex); - accumulated_buffer.clear(); - } } size_t queueAudio(py::array_t audio) @@ -236,13 +235,69 @@ class ThreadedWhisperModel return chunk.id; } +protected: + + virtual void processThread() = 0; + virtual void resultThread(int check_interval_ms) = 0; + + std::atomic running; + std::atomic next_chunk_id; + size_t current_chunk_id; + + std::thread process_thread; + std::thread result_thread; + + std::queue input_queue; + std::mutex input_mutex; + std::condition_variable input_cv; + + std::queue result_queue; + std::mutex result_mutex; + std::condition_variable result_cv; + + py::function result_callback; + +}; + + +class ThreadedWhisperModel : public AsyncWhisperModel +{ +public: + ThreadedWhisperModel(const std::string &model_path, bool use_gpu = false, + float max_duration_sec = 10.0f, int sample_rate = 16000) + : AsyncWhisperModel(model_path, use_gpu), + max_samples(static_cast(max_duration_sec * sample_rate)) + { + } + + ~ThreadedWhisperModel() + { + stop(); + } + void setMaxDuration(float max_duration_sec, int sample_rate = 16000) { max_samples = static_cast(max_duration_sec * sample_rate); } + void start(py::function callback, int result_check_interval_ms = 100) + { + AsyncWhisperModel::start(callback, result_check_interval_ms); + } + + void stop() + { + AsyncWhisperModel::stop(); + + // Clear accumulated buffer + { + std::lock_guard lock(buffer_mutex); + accumulated_buffer.clear(); + } + } + private: - void processAccumulatedAudio(WhisperModel &model, bool force_final = false) + void processAccumulatedAudio(bool force_final = false) { std::vector process_buffer; size_t current_id; @@ -266,7 +321,7 @@ class ThreadedWhisperModel std::vector segments; try { - segments = model.transcribe_raw_audio(process_buffer.data(), process_buffer.size()); + segments = this->transcribe_raw_audio(process_buffer.data(), process_buffer.size()); } catch (const std::exception &e) { @@ -286,7 +341,7 @@ class ThreadedWhisperModel result.chunk_id = current_id; for (const auto &segment : segments) { - result.segments.push_back(segment.text); + result.segments.push_back(segment); } // Set partial flag based on whether this is a final result result.is_partial = !(force_final || process_buffer.size() >= max_samples); @@ -301,8 +356,6 @@ class ThreadedWhisperModel void processThread() { - WhisperModel model(this->model_path, this->use_gpu); - while (running) { AudioChunk all_chunks; @@ -317,7 +370,7 @@ class ThreadedWhisperModel if (!running) { // Process any remaining audio as final before shutting down - processAccumulatedAudio(model, true); + processAccumulatedAudio(true); break; } @@ -346,7 +399,7 @@ class ThreadedWhisperModel } // Process the accumulated audio - processAccumulatedAudio(model, false); + processAccumulatedAudio(false); } } } @@ -386,7 +439,7 @@ class ThreadedWhisperModel std::string full_text; for (const auto &segment : result.segments) { - full_text += segment; + full_text += segment.text; } full_text = trim(full_text); if (full_text.empty()) @@ -396,7 +449,7 @@ class ThreadedWhisperModel { try { - result_callback((int)result.chunk_id, py::str(full_text), result.is_partial); + result_callback((int)result.chunk_id, result.segments, result.is_partial); } catch (const std::exception &e) { @@ -412,31 +465,10 @@ class ThreadedWhisperModel } } - whisper_context *ctx; - std::atomic running; - std::atomic next_chunk_id; - size_t current_chunk_id; - // Audio 
accumulation std::vector accumulated_buffer; size_t max_samples; std::mutex buffer_mutex; - - std::thread process_thread; - std::thread result_thread; - - std::queue input_queue; - std::mutex input_mutex; - std::condition_variable input_cv; - - std::queue result_queue; - std::mutex result_mutex; - std::condition_variable result_cv; - - py::function result_callback; - - std::string model_path; - bool use_gpu; }; PYBIND11_MODULE(_whisper_cpp, m) diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py index 745aef2..e44444e 100644 --- a/test_simpler_whisper.py +++ b/test_simpler_whisper.py @@ -1,3 +1,4 @@ +from typing import List import av import argparse import sys @@ -10,6 +11,7 @@ import resampy from simpler_whisper.whisper import ( + WhisperSegment, load_model, set_log_callback, LogLevel, @@ -125,7 +127,8 @@ def test_simpler_whisper(): def test_threaded_whisper(): set_log_callback(my_log_callback) - def handle_result(chunk_id: int, text: str, is_partial: bool): + def handle_result(chunk_id: int, segments: List[WhisperSegment], is_partial: bool): + text = " ".join([seg.text for seg in segments]) print( f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}): {text}" ) From 88d38992ff19aa797cce3ffb9a65dd454147b20d Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Sun, 27 Oct 2024 23:26:46 -0400 Subject: [PATCH 02/11] Refactor test_simpler_whisper.py and whisper.py: Add async transcription support --- simpler_whisper/whisper.py | 99 ++++++++++++++----- src/whisper_wrapper.cpp | 197 +++++++++++++++++++++++-------------- test_simpler_whisper.py | 66 +++++++++++-- 3 files changed, 258 insertions(+), 104 deletions(-) diff --git a/simpler_whisper/whisper.py b/simpler_whisper/whisper.py index 62319a8..596f971 100644 --- a/simpler_whisper/whisper.py +++ b/simpler_whisper/whisper.py @@ -44,6 +44,77 @@ def __del__(self): del self.model +class AsyncWhisperModel: + """ + AsyncWhisperModel is a class that provides asynchronous transcription of audio data using a Whisper model. + """ + + def __init__( + self, + model_path: str, + callback: Callable[[int, List[WhisperSegment], bool], None], + use_gpu=False, + ): + self.model = _whisper_cpp.AsyncWhisperModel(model_path, use_gpu) + self._is_running = False + self.callback = callback + + def transcribe(self, audio: Union[np.ndarray, List[float]]) -> int: + """ + Transcribes the given audio input using the model. + Args: + audio (Union[np.ndarray, List[float]]): The audio data to be transcribed. + It can be either a numpy array or a list of floats. + Returns: + int: The queued chunk ID. + """ + # Ensure audio is a numpy array of float32 + audio = np.array(audio, dtype=np.float32) + + # Run async inference (no return value) + return self.model.transcribe(audio) + + def handle_result(self, chunk_id: int, text: str, is_partial: bool): + if self.callback is not None: + self.callback(chunk_id, text, is_partial) + + def start(self, result_check_interval_ms=100): + """ + Start the processing threads with a callback for results. + + Args: + callback: Function that takes three arguments: + - chunk_id (int): Unique identifier for the audio chunk + - segments (WhisperSegment): Transcribed text for the audio chunk + - is_partial (bool): Whether this is a partial result + result_check_interval_ms (int): How often to check for results + """ + if self._is_running: + return + + self.model.start(self.handle_result, result_check_interval_ms) + self._is_running = True + + def stop(self): + """ + Stop processing and clean up resources. 
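+        This joins the processing and result threads before returning.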
+ Any remaining audio will be processed as a final segment. + """ + if not self._is_running: + return + + self.model.stop() + self._is_running = False + + def __del__(self): + # Explicitly delete the C++ object + if hasattr(self, "model"): + if self._is_running: + self.stop() + self._is_running = False + del self.model + + class ThreadedWhisperModel: def __init__( self, @@ -61,6 +132,10 @@ def __init__( use_gpu (bool): Whether to use GPU acceleration max_duration_sec (float): Maximum duration in seconds before finalizing a segment sample_rate (int): Audio sample rate (default: 16000) + callback: Function that takes three arguments: + - chunk_id (int): Unique identifier for the audio chunk + - segments (List[WhisperSegment]): Transcribed text for the audio chunk + - is_partial (bool): Whether this is a partial result """ self.model = _whisper_cpp.ThreadedWhisperModel( model_path, use_gpu, max_duration_sec, sample_rate @@ -79,7 +154,7 @@ def start(self, result_check_interval_ms=100): Args: callback: Function that takes three arguments: - chunk_id (int): Unique identifier for the audio chunk - - segments (str): Transcribed text for the audio chunk + - segments (WhisperSegment): Transcribed text for the audio chunk - is_partial (bool): Whether this is a partial result result_check_interval_ms (int): How often to check for results """ @@ -133,28 +208,6 @@ def __del__(self): del self.model -def load_model(model_path: str, use_gpu=False) -> WhisperModel: - return WhisperModel(model_path, use_gpu) - - -def load_threaded_model( - model_path: str, use_gpu=False, max_duration_sec=10.0, sample_rate=16000 -) -> ThreadedWhisperModel: - """ - Load a threaded Whisper model for continuous audio processing. - - Args: - model_path (str): Path to the Whisper model file - use_gpu (bool): Whether to use GPU acceleration - max_duration_sec (float): Maximum duration in seconds before finalizing a segment - sample_rate (int): Audio sample rate (default: 16000) - - Returns: - ThreadedWhisperModel: A model instance ready for processing - """ - return ThreadedWhisperModel(model_path, use_gpu, max_duration_sec, sample_rate) - - def set_log_callback(callback): """ Set a custom logging callback function. diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index 11f5f7f..381ad36 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -148,7 +148,6 @@ class WhisperModel whisper_full_params params; }; - struct AudioChunk { std::vector data; @@ -162,14 +161,11 @@ struct TranscriptionResult std::vector segments; }; - class AsyncWhisperModel : public WhisperModel { public: - AsyncWhisperModel(const std::string &model_path, bool use_gpu = false) : - WhisperModel(model_path, use_gpu), running(false), next_chunk_id(0), current_chunk_id(0) + AsyncWhisperModel(const std::string &model_path, bool use_gpu = false) : WhisperModel(model_path, use_gpu), running(false), next_chunk_id(0), current_chunk_id(0) { - } ~AsyncWhisperModel() @@ -188,10 +184,19 @@ class AsyncWhisperModel : public WhisperModel result_thread = std::thread(&AsyncWhisperModel::resultThread, this, result_check_interval_ms); } - - void transcribe(py::array_t audio) + + /** + * @brief Transcribes the given audio data. + * + * This function takes an audio input in the form of a py::array_t and + * processes it by queuing the audio for transcription. + * + * @param audio A py::array_t containing the audio data to be transcribed. + * @return size_t The queued chunk ID. 
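+	 *
+	 * A minimal usage sketch from Python via the pybind11 binding below (the
+	 * model path and the `audio_f32` array are placeholders, not part of this
+	 * API):
+	 * @code{.py}
+	 * model = _whisper_cpp.AsyncWhisperModel("path/to/ggml-model.bin", False)
+	 * model.start(lambda chunk_id, segments, is_partial: print(chunk_id))
+	 * chunk_id = model.transcribe(audio_f32)  # queues and returns immediately;
+	 *                                         # results arrive via the callback
+	 * model.stop()
+	 * @endcode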
+ */ + size_t transcribe(py::array_t audio) { - this->queueAudio(audio); + return this->queueAudio(audio); } virtual void stop() @@ -236,9 +241,106 @@ class AsyncWhisperModel : public WhisperModel } protected: + virtual void processThread() + { + while (running) + { + // Get next chunk from input queue + { + std::unique_lock lock(input_mutex); + input_cv.wait(lock, [this] + { return !input_queue.empty() || !running; }); + + AudioChunk chunk = std::move(input_queue.front()); + input_queue.pop(); + + // Process audio + TranscriptionResult result; + result.chunk_id = chunk.id; + result.is_partial = false; + try + { + result.segments = this->transcribe_raw_audio(chunk.data.data(), chunk.data.size()); + } + catch (const std::exception &e) + { + std::cerr << "Exception during transcription: " << e.what() << std::endl; + } + catch (...) + { + std::cerr << "Unknown exception during transcription" << std::endl; + } + + // Add result to output queue + { + std::lock_guard lock(result_mutex); + result_queue.push(result); + result_cv.notify_one(); + } + } + } + } + + void resultThread(int check_interval_ms) + { + while (running) + { + std::vector results; + + { + std::unique_lock lock(result_mutex); + result_cv.wait_for(lock, + std::chrono::milliseconds(check_interval_ms), + [this] + { return !result_queue.empty() || !running; }); - virtual void processThread() = 0; - virtual void resultThread(int check_interval_ms) = 0; + if (!running && result_queue.empty()) + break; + + while (!result_queue.empty()) + { + results.push_back(std::move(result_queue.front())); + result_queue.pop(); + } + } + + if (!results.empty()) + { + py::gil_scoped_acquire gil; + for (const auto &result : results) + { + if (result.segments.empty()) + continue; + + // concatenate segments into a single string + std::string full_text; + for (const auto &segment : result.segments) + { + full_text += segment.text; + } + full_text = trim(full_text); + if (full_text.empty()) + continue; + + if (result_callback) + { + try + { + result_callback((int)result.chunk_id, result.segments, result.is_partial); + } + catch (const std::exception &e) + { + std::cerr << "Exception in result callback: " << e.what() << std::endl; + } + catch (...) 
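+			// catch-all so a single bad chunk cannot kill the worker thread;
+			// the error is only logged and the (empty) result is still queued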
+ { + std::cerr << "Unknown exception in result callback" << std::endl; + } + } + } + } + } + } std::atomic running; std::atomic next_chunk_id; @@ -256,10 +358,8 @@ class AsyncWhisperModel : public WhisperModel std::condition_variable result_cv; py::function result_callback; - }; - class ThreadedWhisperModel : public AsyncWhisperModel { public: @@ -354,7 +454,7 @@ class ThreadedWhisperModel : public AsyncWhisperModel } } - void processThread() + void processThread() override { while (running) { @@ -404,67 +504,6 @@ class ThreadedWhisperModel : public AsyncWhisperModel } } - void resultThread(int check_interval_ms) - { - while (running) - { - std::vector results; - - { - std::unique_lock lock(result_mutex); - result_cv.wait_for(lock, - std::chrono::milliseconds(check_interval_ms), - [this] - { return !result_queue.empty() || !running; }); - - if (!running && result_queue.empty()) - break; - - while (!result_queue.empty()) - { - results.push_back(std::move(result_queue.front())); - result_queue.pop(); - } - } - - if (!results.empty()) - { - py::gil_scoped_acquire gil; - for (const auto &result : results) - { - if (result.segments.empty()) - continue; - - // concatenate segments into a single string - std::string full_text; - for (const auto &segment : result.segments) - { - full_text += segment.text; - } - full_text = trim(full_text); - if (full_text.empty()) - continue; - - if (result_callback) - { - try - { - result_callback((int)result.chunk_id, result.segments, result.is_partial); - } - catch (const std::exception &e) - { - std::cerr << "Exception in result callback: " << e.what() << std::endl; - } - catch (...) - { - std::cerr << "Unknown exception in result callback" << std::endl; - } - } - } - } - } - } - // Audio accumulation std::vector accumulated_buffer; size_t max_samples; @@ -507,6 +546,16 @@ PYBIND11_MODULE(_whisper_cpp, m) .def(py::init()) .def("transcribe", &WhisperModel::transcribe); + // Expose asynchronous model + py::class_(m, "AsyncWhisperModel") + .def(py::init()) + .def("start", &AsyncWhisperModel::start, + py::arg("callback"), + py::arg("result_check_interval_ms") = 100) + .def("stop", &AsyncWhisperModel::stop) + .def("transcribe", &AsyncWhisperModel::transcribe) + .def("queue_audio", &AsyncWhisperModel::queueAudio); + py::class_(m, "ThreadedWhisperModel") .def(py::init(), py::arg("model_path"), diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py index e44444e..bf2288b 100644 --- a/test_simpler_whisper.py +++ b/test_simpler_whisper.py @@ -2,20 +2,21 @@ import av import argparse import sys - -# Remove the current directory from sys.path to avoid conflicts with the installed package -sys.path.pop(0) - import numpy as np import time import resampy +import librosa + +# Remove the current directory from sys.path to avoid conflicts with the installed package +sys.path.pop(0) from simpler_whisper.whisper import ( WhisperSegment, - load_model, set_log_callback, LogLevel, + WhisperModel, ThreadedWhisperModel, + AsyncWhisperModel, ) @@ -36,7 +37,7 @@ def my_log_callback(level, message): parser.add_argument( "method", type=str, - choices=["regular", "threaded"], + choices=["regular", "threaded", "async"], help="The method to use for testing the model", ) args = parser.parse_args() @@ -77,7 +78,7 @@ def test_simpler_whisper(): # Load the model print("Loading the Whisper model...") - model = load_model(model_path, use_gpu=True) + model = WhisperModel(model_path, use_gpu=True) print("Model loaded successfully!") # Load audio from file with av @@ -124,6 +125,55 @@ def 
test_simpler_whisper(): print("Transcription completed.") +def test_async_whisper(): + set_log_callback(my_log_callback) + chunk_ids = [] + + def handle_result(chunk_id: int, segments: List[WhisperSegment], is_partial: bool): + text = " ".join([seg.text for seg in segments]) + print( + f"Chunk {chunk_id} results ({'partial' if is_partial else 'final'}): {text}" + ) + # remove the chunk_id from the list of chunk_ids + chunk_ids.remove(chunk_id) + + # Create model + model = AsyncWhisperModel( + model_path=model_path, callback=handle_result, use_gpu=True + ) + + print("Loading audio from file...") + # Load audio from file with librosa + audio_data, sample_rate = librosa.load(audio_file, sr=16000) + + # Start processing with callback + print("Starting Whisper model") + model.start() + + # create 30-seconds chunks of audio_data + for i in range(0, len(audio_data), 16000 * 30): + try: + samples_for_transcription = audio_data[i : i + 16000 * 30] + + # Queue the chunk for processing + chunk_id = model.transcribe(samples_for_transcription) + chunk_ids.append(chunk_id) + print(f"Queued chunk {chunk_id}") + + # reset + samples_for_transcription = np.array([]) + except: + break + + # wait for all chunks to finish processing + while len(chunk_ids) > 0: + time.sleep(0.1) + + # When done + print("Stopping Whisper model") + model.stop() + + def test_threaded_whisper(): set_log_callback(my_log_callback) @@ -173,5 +223,7 @@ def handle_result(chunk_id: int, segments: List[WhisperSegment], is_partial: boo if __name__ == "__main__": if args.method == "regular": test_simpler_whisper() + elif args.method == "async": + test_async_whisper() else: test_threaded_whisper() From 22d53f1b09bcfa320ac5a0e759ec181319f42ad4 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Mon, 28 Oct 2024 09:08:45 -0400 Subject: [PATCH 03/11] Refactor simpler_whisper/__init__.py: Import _whisper_cpp module --- pyproject.toml | 6 ++ simpler_whisper/__init__.py | 1 + simpler_whisper/whisper.py | 8 +- src/whisper_wrapper.cpp | 79 +++++++++------ tests/__init__.py | 0 tests/test_wrapper.py | 197 ++++++++++++++++++++++++++++++++++++ 6 files changed, 256 insertions(+), 35 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/test_wrapper.py diff --git a/pyproject.toml b/pyproject.toml index d0d78a8..ad115e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,3 +34,9 @@ packages = ["simpler_whisper"] [tool.setuptools.package-data] simpler_whisper = ["*.dll", "*.pyd", "*.so", "*.metal"] + +[tool.pytest] +testpaths = ["tests"] +python_files = "test_*.py" +python_classes = "Test*" +python_functions = "test_*" \ No newline at end of file diff --git a/simpler_whisper/__init__.py b/simpler_whisper/__init__.py index e69de29..86d4364 100644 --- a/simpler_whisper/__init__.py +++ b/simpler_whisper/__init__.py @@ -0,0 +1 @@ +from . 
import _whisper_cpp diff --git a/simpler_whisper/whisper.py b/simpler_whisper/whisper.py index 596f971..834cbbe 100644 --- a/simpler_whisper/whisper.py +++ b/simpler_whisper/whisper.py @@ -74,9 +74,9 @@ def transcribe(self, audio: Union[np.ndarray, List[float]]) -> int: # Run async inference (no return value) return self.model.transcribe(audio) - def handle_result(self, chunk_id: int, text: str, is_partial: bool): + def handle_result(self, chunk_id: int, segments: List[WhisperSegment], is_partial: bool): if self.callback is not None: - self.callback(chunk_id, text, is_partial) + self.callback(chunk_id, segments, is_partial) def start(self, result_check_interval_ms=100): """ @@ -143,9 +143,9 @@ def __init__( self._is_running = False self.callback = callback - def handle_result(self, chunk_id: int, text: str, is_partial: bool): + def handle_result(self, chunk_id: int, segments: List[WhisperSegment], is_partial: bool): if self.callback is not None: - self.callback(chunk_id, text, is_partial) + self.callback(chunk_id, segments, is_partial) def start(self, result_check_interval_ms=100): """ diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index 381ad36..de9daea 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -161,10 +161,11 @@ struct TranscriptionResult std::vector segments; }; -class AsyncWhisperModel : public WhisperModel +class AsyncWhisperModel { public: - AsyncWhisperModel(const std::string &model_path, bool use_gpu = false) : WhisperModel(model_path, use_gpu), running(false), next_chunk_id(0), current_chunk_id(0) + AsyncWhisperModel(const std::string &model_path, bool use_gpu = false) : model_path(model_path), use_gpu(use_gpu), + running(false), next_chunk_id(0), current_chunk_id(0) { } @@ -243,40 +244,51 @@ class AsyncWhisperModel : public WhisperModel protected: virtual void processThread() { + WhisperModel model(model_path, use_gpu); + while (running) { + AudioChunk chunk; // Get next chunk from input queue { std::unique_lock lock(input_mutex); - input_cv.wait(lock, [this] - { return !input_queue.empty() || !running; }); + input_cv.wait_for(lock, + std::chrono::milliseconds(100), + [this] + { return !input_queue.empty() || !running; }); + + if (!running) + break; + + if (input_queue.empty()) + continue; - AudioChunk chunk = std::move(input_queue.front()); + chunk = std::move(input_queue.front()); input_queue.pop(); + } - // Process audio - TranscriptionResult result; - result.chunk_id = chunk.id; - result.is_partial = false; - try - { - result.segments = this->transcribe_raw_audio(chunk.data.data(), chunk.data.size()); - } - catch (const std::exception &e) - { - std::cerr << "Exception during transcription: " << e.what() << std::endl; - } - catch (...) - { - std::cerr << "Unknown exception during transcription" << std::endl; - } + // Process audio + TranscriptionResult result; + result.chunk_id = chunk.id; + result.is_partial = false; + try + { + result.segments = model.transcribe_raw_audio(chunk.data.data(), chunk.data.size()); + } + catch (const std::exception &e) + { + std::cerr << "Exception during transcription: " << e.what() << std::endl; + } + catch (...) 
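+      // swallow unknown errors so the thread keeps serving later chunks;
+      // a failed chunk yields an empty result, which resultThread discards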
+ { + std::cerr << "Unknown exception during transcription" << std::endl; + } - // Add result to output queue - { - std::lock_guard lock(result_mutex); - result_queue.push(result); - result_cv.notify_one(); - } + // Add result to output queue + { + std::lock_guard lock(result_mutex); + result_queue.push(result); + result_cv.notify_one(); } } } @@ -342,6 +354,9 @@ class AsyncWhisperModel : public WhisperModel } } + std::string model_path; + bool use_gpu; + std::atomic running; std::atomic next_chunk_id; size_t current_chunk_id; @@ -397,7 +412,7 @@ class ThreadedWhisperModel : public AsyncWhisperModel } private: - void processAccumulatedAudio(bool force_final = false) + void processAccumulatedAudio(WhisperModel &model, bool force_final = false) { std::vector process_buffer; size_t current_id; @@ -421,7 +436,7 @@ class ThreadedWhisperModel : public AsyncWhisperModel std::vector segments; try { - segments = this->transcribe_raw_audio(process_buffer.data(), process_buffer.size()); + segments = model.transcribe_raw_audio(process_buffer.data(), process_buffer.size()); } catch (const std::exception &e) { @@ -456,6 +471,8 @@ class ThreadedWhisperModel : public AsyncWhisperModel void processThread() override { + WhisperModel model(model_path, use_gpu); + while (running) { AudioChunk all_chunks; @@ -470,7 +487,7 @@ class ThreadedWhisperModel : public AsyncWhisperModel if (!running) { // Process any remaining audio as final before shutting down - processAccumulatedAudio(true); + processAccumulatedAudio(model, true); break; } @@ -499,7 +516,7 @@ class ThreadedWhisperModel : public AsyncWhisperModel } // Process the accumulated audio - processAccumulatedAudio(false); + processAccumulatedAudio(model, false); } } } diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py new file mode 100644 index 0000000..6cf7d1a --- /dev/null +++ b/tests/test_wrapper.py @@ -0,0 +1,197 @@ +import unittest +import numpy as np +import threading +import time +import queue +import os +from concurrent.futures import ThreadPoolExecutor +from simpler_whisper import _whisper_cpp as whisper + + +class TestWhisperWrapper(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Get the model path relative to the project root + cls.model_path = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "ggml-tiny.en-q5_1.bin" + ) + + # Verify model exists + if not os.path.exists(cls.model_path): + raise FileNotFoundError(f"Model file not found at {cls.model_path}") + + # Create sample audio data (silence) + cls.sample_rate = 16000 + duration_sec = 3 + cls.test_audio = np.zeros(cls.sample_rate * duration_sec, dtype=np.float32) + + # Create some mock audio with varying amplitudes for better testing + cls.mock_speech = np.sin( + 2 * np.pi * 440 * np.linspace(0, 1, cls.sample_rate) + ).astype(np.float32) + + def setUp(self): + """Ensure each test starts with fresh instances""" + self.results = queue.Queue() + + def tearDown(self): + """Cleanup after each test""" + while not self.results.empty(): + try: + self.results.get_nowait() + except queue.Empty: + break + + def test_sync_model_basic(self): + """Test basic synchronous model initialization and transcription""" + try: + model = whisper.WhisperModel(self.model_path, use_gpu=False) + result = model.transcribe(self.test_audio) + self.assertIsInstance(result, list) + except Exception as e: + self.fail(f"Basic synchronous model test failed: {str(e)}") + + def 
test_sync_model_empty_audio(self): + """Test synchronous model with empty audio""" + model = whisper.WhisperModel(self.model_path, use_gpu=False) + empty_audio = np.array([], dtype=np.float32) + with self.assertRaises(Exception): + model.transcribe(empty_audio) + + def test_sync_model_invalid_audio(self): + """Test synchronous model with invalid audio data""" + model = whisper.WhisperModel(self.model_path, use_gpu=False) + invalid_audio = np.array([1.5, -1.5], dtype=np.float64) # Wrong dtype + with self.assertRaises(Exception): + model.transcribe(invalid_audio) + + def test_async_model_basic(self): + """Test basic async model functionality""" + results = queue.Queue() + + def callback(chunk_id, segments, is_partial): + results.put((chunk_id, segments, is_partial)) + + model = whisper.AsyncWhisperModel(self.model_path, use_gpu=False) + try: + model.start(callback) + chunk_id = model.transcribe(self.test_audio) + + # Wait for result with timeout + try: + result = results.get(timeout=10) + self.assertEqual(result[0], chunk_id) # Check if chunk_id matches + except queue.Empty: + self.fail("Async transcription timeout") + + finally: + model.stop() + + def test_threaded_model_basic(self): + """Test basic threaded model functionality""" + results = queue.Queue() + + def callback(chunk_id, segments, is_partial): + results.put((chunk_id, segments, is_partial)) + + model = whisper.ThreadedWhisperModel( + self.model_path, + use_gpu=False, + max_duration_sec=5.0, + sample_rate=self.sample_rate, + ) + + try: + model.start(callback) + chunk_id = model.queue_audio(self.mock_speech) + + # Wait for result with timeout + try: + result = results.get(timeout=10) + self.assertEqual(result[0], chunk_id) + except queue.Empty: + self.fail("Threaded transcription timeout") + finally: + model.stop() + + def test_threaded_model_continuous(self): + """Test threaded model with continuous audio chunks""" + results = [] + result_lock = threading.Lock() + + def callback(chunk_id, segments, is_partial): + with result_lock: + results.append((chunk_id, segments, is_partial)) + + model = whisper.ThreadedWhisperModel( + self.model_path, + use_gpu=False, + max_duration_sec=1.0, + sample_rate=self.sample_rate, + ) + + try: + model.start(callback) + + # Queue multiple chunks of audio + chunk_size = self.sample_rate # 1 second chunks + num_chunks = 3 + chunk_ids = [] + + for i in range(num_chunks): + chunk = self.mock_speech[i * chunk_size : (i + 1) * chunk_size] + chunk_id = model.queue_audio(chunk) + chunk_ids.append(chunk_id) + time.sleep(0.1) # Small delay between chunks + + # Wait for all results + max_wait = 15 # seconds + start_time = time.time() + while len(results) < num_chunks and (time.time() - start_time) < max_wait: + time.sleep(0.1) + + self.assertGreaterEqual(len(results), num_chunks) + + finally: + model.stop() + + def test_log_callback(self): + """Test log callback functionality""" + log_messages = queue.Queue() + + def log_callback(level, message): + log_messages.put((level, message)) + + # Set the log callback + whisper.set_log_callback(log_callback) + + # Create a model to generate some logs + model = whisper.WhisperModel(self.model_path, use_gpu=False) + model.transcribe(self.test_audio) + + # Check if we received any log messages + try: + log_message = log_messages.get_nowait() + self.assertIsInstance(log_message, tuple) + self.assertIsInstance(log_message[0], int) # level + self.assertIsInstance(log_message[1], str) # message + except queue.Empty: + pass # It's okay if we don't get any log messages + + def 
test_concurrent_models(self): + """Test running multiple models concurrently""" + + def run_model(): + model = whisper.WhisperModel(self.model_path, use_gpu=False) + result = model.transcribe(self.test_audio) + return len(result) + + with ThreadPoolExecutor(max_workers=3) as executor: + futures = [executor.submit(run_model) for _ in range(3)] + results = [f.result() for f in futures] + + self.assertEqual(len(results), 3) + + +if __name__ == "__main__": + unittest.main() From d8124b921cb667ca53e052d5f8e303997add76bc Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Tue, 29 Oct 2024 21:40:05 -0400 Subject: [PATCH 04/11] Refactor cmake/BuildWhispercpp.cmake: Update logic for building universal binaries --- cmake/BuildWhispercpp.cmake | 85 ++++++++++++++++++++++++++----------- setup.py | 6 ++- 2 files changed, 64 insertions(+), 27 deletions(-) diff --git a/cmake/BuildWhispercpp.cmake b/cmake/BuildWhispercpp.cmake index 0043a28..0d8667f 100644 --- a/cmake/BuildWhispercpp.cmake +++ b/cmake/BuildWhispercpp.cmake @@ -6,46 +6,81 @@ set(PREBUILT_WHISPERCPP_URL_BASE "https://github.com/locaal-ai/occ-ai-dep-whispercpp/releases/download/${PREBUILT_WHISPERCPP_VERSION}") if(APPLE) - # check the "MACOS_ARCH" env var to figure out if this is x86 or arm64 - if($ENV{MACOS_ARCH} STREQUAL "x86_64") - set(WHISPER_CPP_HASH "dc7fd5ff9c7fbb8623f8e14d9ff2872186cab4cd7a52066fcb2fab790d6092fc") - elseif($ENV{MACOS_ARCH} STREQUAL "arm64") - set(WHISPER_CPP_HASH "ebed595ee431b182261bce41583993b149eed539e15ebf770d98a6bc85d53a92") - else() - message( - FATAL_ERROR - "The MACOS_ARCH environment variable is not set to a valid value. Please set it to either `x86_64` or `arm64`") - endif() - set(WHISPER_CPP_URL - "${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-macos-$ENV{MACOS_ARCH}-${PREBUILT_WHISPERCPP_VERSION}.tar.gz") + # Store source directories for each architecture + foreach(MACOS_ARCH IN ITEMS "x86_64" "arm64") + if(${MACOS_ARCH} STREQUAL "x86_64") + set(WHISPER_CPP_HASH "dc7fd5ff9c7fbb8623f8e14d9ff2872186cab4cd7a52066fcb2fab790d6092fc") + elseif(${MACOS_ARCH} STREQUAL "arm64") + set(WHISPER_CPP_HASH "ebed595ee431b182261bce41583993b149eed539e15ebf770d98a6bc85d53a92") + endif() + + set(WHISPER_CPP_URL + "${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-macos-${MACOS_ARCH}-${PREBUILT_WHISPERCPP_VERSION}.tar.gz") - FetchContent_Declare( - whispercpp_fetch - URL ${WHISPER_CPP_URL} - URL_HASH SHA256=${WHISPER_CPP_HASH}) - FetchContent_MakeAvailable(whispercpp_fetch) + # Use unique names for each architecture's fetch + FetchContent_Declare( + whispercpp_fetch_${MACOS_ARCH} + URL ${WHISPER_CPP_URL} + URL_HASH SHA256=${WHISPER_CPP_HASH}) + FetchContent_MakeAvailable(whispercpp_fetch_${MACOS_ARCH}) + + # Store the source dir for each arch + if(${MACOS_ARCH} STREQUAL "x86_64") + set(WHISPER_X86_64_DIR ${whispercpp_fetch_x86_64_SOURCE_DIR}) + else() + set(WHISPER_ARM64_DIR ${whispercpp_fetch_arm64_SOURCE_DIR}) + endif() + endforeach() + # Create a directory for the universal binaries + set(UNIVERSAL_LIB_DIR ${CMAKE_BINARY_DIR}/universal/lib) + file(MAKE_DIRECTORY ${UNIVERSAL_LIB_DIR}) + + # Create universal binaries using lipo + execute_process( + COMMAND lipo -create + "${WHISPER_X86_64_DIR}/lib/libwhisper.a" + "${WHISPER_ARM64_DIR}/lib/libwhisper.a" + -output "${UNIVERSAL_LIB_DIR}/libwhisper.a" + ) + + execute_process( + COMMAND lipo -create + "${WHISPER_X86_64_DIR}/lib/libggml.a" + "${WHISPER_ARM64_DIR}/lib/libggml.a" + -output "${UNIVERSAL_LIB_DIR}/libggml.a" + ) + + execute_process( + COMMAND lipo -create + 
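+    # third merge: the CoreML helper library (assumes both prebuilt
+    # archives ship lib/libwhisper.coreml.a)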
"${WHISPER_X86_64_DIR}/lib/libwhisper.coreml.a" + "${WHISPER_ARM64_DIR}/lib/libwhisper.coreml.a" + -output "${UNIVERSAL_LIB_DIR}/libwhisper.coreml.a" + ) + + # Set up the imported libraries to use the universal binaries add_library(Whispercpp::Whisper STATIC IMPORTED) set_target_properties( Whispercpp::Whisper PROPERTIES IMPORTED_LOCATION - ${whispercpp_fetch_SOURCE_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX}) - set_target_properties(Whispercpp::Whisper PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - ${whispercpp_fetch_SOURCE_DIR}/include) + "${UNIVERSAL_LIB_DIR}/libwhisper.a") + set_target_properties(Whispercpp::Whisper PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES ${WHISPER_ARM64_DIR}/include) # Either arch's include dir is fine + add_library(Whispercpp::GGML STATIC IMPORTED) set_target_properties( Whispercpp::GGML PROPERTIES IMPORTED_LOCATION - ${whispercpp_fetch_SOURCE_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}ggml${CMAKE_STATIC_LIBRARY_SUFFIX}) + "${UNIVERSAL_LIB_DIR}/libggml.a") add_library(Whispercpp::CoreML STATIC IMPORTED) set_target_properties( Whispercpp::CoreML - PROPERTIES - IMPORTED_LOCATION - ${whispercpp_fetch_SOURCE_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}whisper.coreml${CMAKE_STATIC_LIBRARY_SUFFIX}) + PROPERTIES IMPORTED_LOCATION + "${UNIVERSAL_LIB_DIR}/libwhisper.coreml.a") - set(WHISPER_ADDITIONAL_FILES ${whispercpp_fetch_SOURCE_DIR}/bin/ggml-metal.metal) + # Copy the metal file from either architecture (they should be identical) + set(WHISPER_ADDITIONAL_FILES ${WHISPER_ARM64_DIR}/bin/ggml-metal.metal) set(WHISPER_ADDITIONAL_FILES ${whispercpp_fetch_SOURCE_DIR}/bin/ggml-metal.metal) elseif(WIN32) if(NOT DEFINED ACCELERATION) message(FATAL_ERROR "ACCELERATION is not set. Please set it to either `cpu`, `cuda`, `vulkan` or `hipblas`") diff --git a/setup.py b/setup.py index a34fff7..03f1fad 100644 --- a/setup.py +++ b/setup.py @@ -56,12 +56,14 @@ def build_extension(self, ext): # Add platform-specific arguments if platform.system() == "Darwin": # macOS cmake_args += [ - f"-DCMAKE_OSX_ARCHITECTURES={target_platform}", + f"-DCMAKE_OSX_ARCHITECTURES=arm64;x86_64", "-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON", "-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON", f"-DCMAKE_INSTALL_NAME_DIR=@rpath", ] - env["MACOS_ARCH"] = target_platform + # Remove the MACOS_ARCH environment variable as we're building universal + if "MACOS_ARCH" in env: + del env["MACOS_ARCH"] cfg = "Debug" if self.debug else "Release" build_args = ["--config", cfg] From dfe6c8ed5f627a2eed777fefa6658717522bfae9 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Tue, 29 Oct 2024 22:26:15 -0400 Subject: [PATCH 05/11] Refactor test_simpler_whisper.py and whisper.py: Add exception handling for time.sleep() --- .github/workflows/build.yaml | 21 +--- CMakeLists.txt | 89 ++++++++--------- cmake/BuildWhispercpp.cmake | 179 +++++++++++++++++++++-------------- src/whisper_wrapper.cpp | 10 +- test_simpler_whisper.py | 5 +- 5 files changed, 169 insertions(+), 135 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 1febf40..0b09a31 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -14,7 +14,7 @@ jobs: matrix: os: ['windows-latest', 'macos-latest', 'ubuntu-latest'] python-version: ['3.11', '3.12'] - platform: ['x86_64', 'arm64', 'win64'] + platform: ['x86_64', 'win64'] acceleration: ['cpu', 'cuda', 'hipblas', 'vulkan'] exclude: - os: windows-latest @@ -31,8 +31,6 @@ jobs: platform: win64 - os: ubuntu-latest platform: win64 - - os: ubuntu-latest - 
platform: arm64 - os: ubuntu-latest acceleration: cuda - os: ubuntu-latest @@ -45,25 +43,10 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Set up Python Non-Mac - if: ${{ matrix.os != 'macos-latest' }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Set up Python Mac arm64 - if: ${{ matrix.os == 'macos-latest' && matrix.platform == 'arm64' }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - architecture: 'arm64' - - - name: Set up Python Mac x86_64 - if: ${{ matrix.os == 'macos-latest' && matrix.platform == 'x86_64' }} + - name: Set up Python uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - architecture: 'x64' - name: Install dependencies run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 48a9fe7..b760016 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,43 +1,46 @@ -cmake_minimum_required(VERSION 3.15) -project(whisper_cpp_wrapper) - -set(CMAKE_CXX_STANDARD 11) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -# Find Python -find_package(Python ${PYTHON_VERSION} EXACT COMPONENTS Interpreter Development NumPy REQUIRED) - -# Fetch pybind11 -include(FetchContent) -FetchContent_Declare( - pybind11 - GIT_REPOSITORY https://github.com/pybind/pybind11.git - GIT_TAG v2.13.6 # Specify a version/tag here -) -FetchContent_MakeAvailable(pybind11) - -include(cmake/BuildWhispercpp.cmake) - -# Include directories -include_directories(${Python_INCLUDE_DIRS}) -include_directories(${Python_NumPy_INCLUDE_DIRS}) - -# Create the extension module -pybind11_add_module(_whisper_cpp src/whisper_wrapper.cpp) -target_link_libraries(_whisper_cpp PRIVATE Whispercpp) - -# Set the output directory for the built module -set_target_properties(_whisper_cpp PROPERTIES - LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/simpler_whisper -) - -# Copy the DLL to the output directory on Windows -if(WIN32 OR APPLE) - foreach(WHISPER_ADDITIONAL_FILE ${WHISPER_ADDITIONAL_FILES}) - add_custom_command(TARGET _whisper_cpp POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different - "${WHISPER_ADDITIONAL_FILE}" - $ - ) - endforeach() -endif() +cmake_minimum_required(VERSION 3.15) +project(whisper_cpp_wrapper) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Find Python +find_package( + Python ${PYTHON_VERSION} EXACT + COMPONENTS Interpreter Development + REQUIRED) + +# Fetch pybind11 +include(FetchContent) +FetchContent_Declare( + pybind11 + GIT_REPOSITORY https://github.com/pybind/pybind11.git + GIT_TAG v2.13.6 # Specify a version/tag here +) +FetchContent_MakeAvailable(pybind11) + +include(cmake/BuildWhispercpp.cmake) + +# Include directories +include_directories(${Python_INCLUDE_DIRS}) +include_directories(${Python_NumPy_INCLUDE_DIRS}) + +# Create the extension module +pybind11_add_module(_whisper_cpp src/whisper_wrapper.cpp) +target_link_libraries(_whisper_cpp PRIVATE Whispercpp) + +# Set the output directory for the built module +set_target_properties( + _whisper_cpp PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${CMAKE_CURRENT_SOURCE_DIR}/simpler_whisper) + +# Copy the DLL to the output directory on Windows +if(WIN32 OR APPLE) + foreach(WHISPER_ADDITIONAL_FILE ${WHISPER_ADDITIONAL_FILES}) + add_custom_command( + TARGET _whisper_cpp + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${WHISPER_ADDITIONAL_FILE}" $) + endforeach() +endif() diff --git a/cmake/BuildWhispercpp.cmake b/cmake/BuildWhispercpp.cmake index 0d8667f..c80f801 100644 --- 
a/cmake/BuildWhispercpp.cmake +++ b/cmake/BuildWhispercpp.cmake @@ -3,19 +3,23 @@ include(FetchContent) set(PREBUILT_WHISPERCPP_VERSION "0.0.7") set(PREBUILT_WHISPERCPP_URL_BASE - "https://github.com/locaal-ai/occ-ai-dep-whispercpp/releases/download/${PREBUILT_WHISPERCPP_VERSION}") + "https://github.com/locaal-ai/occ-ai-dep-whispercpp/releases/download/${PREBUILT_WHISPERCPP_VERSION}" +) if(APPLE) # Store source directories for each architecture foreach(MACOS_ARCH IN ITEMS "x86_64" "arm64") if(${MACOS_ARCH} STREQUAL "x86_64") - set(WHISPER_CPP_HASH "dc7fd5ff9c7fbb8623f8e14d9ff2872186cab4cd7a52066fcb2fab790d6092fc") + set(WHISPER_CPP_HASH + "dc7fd5ff9c7fbb8623f8e14d9ff2872186cab4cd7a52066fcb2fab790d6092fc") elseif(${MACOS_ARCH} STREQUAL "arm64") - set(WHISPER_CPP_HASH "ebed595ee431b182261bce41583993b149eed539e15ebf770d98a6bc85d53a92") + set(WHISPER_CPP_HASH + "ebed595ee431b182261bce41583993b149eed539e15ebf770d98a6bc85d53a92") endif() - + set(WHISPER_CPP_URL - "${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-macos-${MACOS_ARCH}-${PREBUILT_WHISPERCPP_VERSION}.tar.gz") + "${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-macos-${MACOS_ARCH}-${PREBUILT_WHISPERCPP_VERSION}.tar.gz" + ) # Use unique names for each architecture's fetch FetchContent_Declare( @@ -23,7 +27,7 @@ if(APPLE) URL ${WHISPER_CPP_URL} URL_HASH SHA256=${WHISPER_CPP_HASH}) FetchContent_MakeAvailable(whispercpp_fetch_${MACOS_ARCH}) - + # Store the source dir for each arch if(${MACOS_ARCH} STREQUAL "x86_64") set(WHISPER_X86_64_DIR ${whispercpp_fetch_x86_64_SOURCE_DIR}) @@ -38,68 +42,73 @@ if(APPLE) # Create universal binaries using lipo execute_process( - COMMAND lipo -create - "${WHISPER_X86_64_DIR}/lib/libwhisper.a" - "${WHISPER_ARM64_DIR}/lib/libwhisper.a" - -output "${UNIVERSAL_LIB_DIR}/libwhisper.a" - ) - + COMMAND + lipo -create "${WHISPER_X86_64_DIR}/lib/libwhisper.a" + "${WHISPER_ARM64_DIR}/lib/libwhisper.a" -output + "${UNIVERSAL_LIB_DIR}/libwhisper.a") + execute_process( - COMMAND lipo -create - "${WHISPER_X86_64_DIR}/lib/libggml.a" - "${WHISPER_ARM64_DIR}/lib/libggml.a" - -output "${UNIVERSAL_LIB_DIR}/libggml.a" - ) + COMMAND + lipo -create "${WHISPER_X86_64_DIR}/lib/libggml.a" + "${WHISPER_ARM64_DIR}/lib/libggml.a" -output + "${UNIVERSAL_LIB_DIR}/libggml.a") execute_process( - COMMAND lipo -create - "${WHISPER_X86_64_DIR}/lib/libwhisper.coreml.a" - "${WHISPER_ARM64_DIR}/lib/libwhisper.coreml.a" - -output "${UNIVERSAL_LIB_DIR}/libwhisper.coreml.a" - ) + COMMAND + lipo -create "${WHISPER_X86_64_DIR}/lib/libwhisper.coreml.a" + "${WHISPER_ARM64_DIR}/lib/libwhisper.coreml.a" -output + "${UNIVERSAL_LIB_DIR}/libwhisper.coreml.a") # Set up the imported libraries to use the universal binaries add_library(Whispercpp::Whisper STATIC IMPORTED) set_target_properties( - Whispercpp::Whisper - PROPERTIES IMPORTED_LOCATION - "${UNIVERSAL_LIB_DIR}/libwhisper.a") - set_target_properties(Whispercpp::Whisper PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES ${WHISPER_ARM64_DIR}/include) # Either arch's include dir is fine + Whispercpp::Whisper PROPERTIES IMPORTED_LOCATION + "${UNIVERSAL_LIB_DIR}/libwhisper.a") + set_target_properties( + Whispercpp::Whisper PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + ${WHISPER_ARM64_DIR}/include) # Either arch's + # include dir + # is fine add_library(Whispercpp::GGML STATIC IMPORTED) set_target_properties( - Whispercpp::GGML - PROPERTIES IMPORTED_LOCATION - "${UNIVERSAL_LIB_DIR}/libggml.a") + Whispercpp::GGML PROPERTIES IMPORTED_LOCATION + "${UNIVERSAL_LIB_DIR}/libggml.a") add_library(Whispercpp::CoreML STATIC IMPORTED) 
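  # a quick manual sanity check for the merged archives is
  # `lipo -info ${UNIVERSAL_LIB_DIR}/libwhisper.a`, which should report
  # both x86_64 and arm64 slices after the merges above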
set_target_properties( - Whispercpp::CoreML - PROPERTIES IMPORTED_LOCATION - "${UNIVERSAL_LIB_DIR}/libwhisper.coreml.a") + Whispercpp::CoreML PROPERTIES IMPORTED_LOCATION + "${UNIVERSAL_LIB_DIR}/libwhisper.coreml.a") # Copy the metal file from either architecture (they should be identical) - set(WHISPER_ADDITIONAL_FILES ${WHISPER_ARM64_DIR}/bin/ggml-metal.metal) set(WHISPER_ADDITIONAL_FILES ${whispercpp_fetch_SOURCE_DIR}/bin/ggml-metal.metal) + set(WHISPER_ADDITIONAL_FILES ${WHISPER_ARM64_DIR}/bin/ggml-metal.metal) elseif(WIN32) if(NOT DEFINED ACCELERATION) - message(FATAL_ERROR "ACCELERATION is not set. Please set it to either `cpu`, `cuda`, `vulkan` or `hipblas`") + message( + FATAL_ERROR + "ACCELERATION is not set. Please set it to either `cpu`, `cuda`, `vulkan` or `hipblas`" + ) endif() set(ARCH_PREFIX ${ACCELERATION}) set(WHISPER_CPP_URL - "${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-windows-${ARCH_PREFIX}-${PREBUILT_WHISPERCPP_VERSION}.zip") + "${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-windows-${ARCH_PREFIX}-${PREBUILT_WHISPERCPP_VERSION}.zip" + ) if(${ACCELERATION} STREQUAL "cpu") - set(WHISPER_CPP_HASH "c23862b4aac7d8448cf7de4d339a86498f88ecba6fa7d243bbd7fabdb13d4dd4") + set(WHISPER_CPP_HASH + "c23862b4aac7d8448cf7de4d339a86498f88ecba6fa7d243bbd7fabdb13d4dd4") add_compile_definitions("LOCALVOCAL_WITH_CPU") elseif(${ACCELERATION} STREQUAL "cuda") - set(WHISPER_CPP_HASH "a0adeaccae76fab0678d016a62b79a19661ed34eb810d8bae3b610345ee9a405") + set(WHISPER_CPP_HASH + "a0adeaccae76fab0678d016a62b79a19661ed34eb810d8bae3b610345ee9a405") add_compile_definitions("LOCALVOCAL_WITH_CUDA") elseif(${ACCELERATION} STREQUAL "hipblas") - set(WHISPER_CPP_HASH "bbad0b4eec01c5a801d384c03745ef5e97061958f8cf8f7724281d433d7d92a1") + set(WHISPER_CPP_HASH + "bbad0b4eec01c5a801d384c03745ef5e97061958f8cf8f7724281d433d7d92a1") add_compile_definitions("LOCALVOCAL_WITH_HIPBLAS") elseif(${ACCELERATION} STREQUAL "vulkan") - set(WHISPER_CPP_HASH "12bb34821f9efcd31f04a487569abff2b669221f2706fe0d09c17883635ef58a") + set(WHISPER_CPP_HASH + "12bb34821f9efcd31f04a487569abff2b669221f2706fe0d09c17883635ef58a") add_compile_definitions("LOCALVOCAL_WITH_VULKAN") else() message( @@ -118,42 +127,55 @@ elseif(WIN32) add_library(Whispercpp::Whisper SHARED IMPORTED) set_target_properties( Whispercpp::Whisper - PROPERTIES IMPORTED_LOCATION - ${whispercpp_fetch_SOURCE_DIR}/bin/${CMAKE_SHARED_LIBRARY_PREFIX}whisper${CMAKE_SHARED_LIBRARY_SUFFIX}) + PROPERTIES + IMPORTED_LOCATION + ${whispercpp_fetch_SOURCE_DIR}/bin/${CMAKE_SHARED_LIBRARY_PREFIX}whisper${CMAKE_SHARED_LIBRARY_SUFFIX} + ) set_target_properties( Whispercpp::Whisper - PROPERTIES IMPORTED_IMPLIB - ${whispercpp_fetch_SOURCE_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX}) - set_target_properties(Whispercpp::Whisper PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - ${whispercpp_fetch_SOURCE_DIR}/include) + PROPERTIES + IMPORTED_IMPLIB + ${whispercpp_fetch_SOURCE_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX} + ) + set_target_properties( + Whispercpp::Whisper PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + ${whispercpp_fetch_SOURCE_DIR}/include) add_library(Whispercpp::GGML SHARED IMPORTED) set_target_properties( Whispercpp::GGML - PROPERTIES IMPORTED_LOCATION - ${whispercpp_fetch_SOURCE_DIR}/bin/${CMAKE_SHARED_LIBRARY_PREFIX}ggml${CMAKE_SHARED_LIBRARY_SUFFIX}) + PROPERTIES + IMPORTED_LOCATION + ${whispercpp_fetch_SOURCE_DIR}/bin/${CMAKE_SHARED_LIBRARY_PREFIX}ggml${CMAKE_SHARED_LIBRARY_SUFFIX} + ) set_target_properties( 
Whispercpp::GGML - PROPERTIES IMPORTED_IMPLIB - ${whispercpp_fetch_SOURCE_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}ggml${CMAKE_STATIC_LIBRARY_SUFFIX}) + PROPERTIES + IMPORTED_IMPLIB + ${whispercpp_fetch_SOURCE_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}ggml${CMAKE_STATIC_LIBRARY_SUFFIX} + ) if(${ACCELERATION} STREQUAL "cpu") # add openblas to the link line add_library(Whispercpp::OpenBLAS STATIC IMPORTED) - set_target_properties(Whispercpp::OpenBLAS PROPERTIES IMPORTED_LOCATION - ${whispercpp_fetch_SOURCE_DIR}/lib/libopenblas.dll.a) + set_target_properties( + Whispercpp::OpenBLAS + PROPERTIES IMPORTED_LOCATION + ${whispercpp_fetch_SOURCE_DIR}/lib/libopenblas.dll.a) endif() # glob all dlls in the bin directory and install them file(GLOB WHISPER_ADDITIONAL_FILES ${whispercpp_fetch_SOURCE_DIR}/bin/*.dll) else() - if(${CMAKE_BUILD_TYPE} STREQUAL Release OR ${CMAKE_BUILD_TYPE} STREQUAL RelWithDebInfo) + if(${CMAKE_BUILD_TYPE} STREQUAL Release OR ${CMAKE_BUILD_TYPE} STREQUAL + RelWithDebInfo) set(Whispercpp_BUILD_TYPE Release) else() set(Whispercpp_BUILD_TYPE Debug) endif() set(Whispercpp_Build_GIT_TAG "v1.7.1") set(WHISPER_EXTRA_CXX_FLAGS "-fPIC") - set(WHISPER_ADDITIONAL_CMAKE_ARGS -DWHISPER_BLAS=OFF -DWHISPER_CUBLAS=OFF -DWHISPER_OPENBLAS=OFF) + set(WHISPER_ADDITIONAL_CMAKE_ARGS -DWHISPER_BLAS=OFF -DWHISPER_CUBLAS=OFF + -DWHISPER_OPENBLAS=OFF) # On Linux build a static Whisper library ExternalProject_Add( @@ -161,18 +183,27 @@ else() DOWNLOAD_EXTRACT_TIMESTAMP true GIT_REPOSITORY https://github.com/ggerganov/whisper.cpp.git GIT_TAG ${Whispercpp_Build_GIT_TAG} - BUILD_COMMAND ${CMAKE_COMMAND} --build --config ${Whispercpp_BUILD_TYPE} - BUILD_BYPRODUCTS /lib/static/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_COMMAND ${CMAKE_COMMAND} --build --config + ${Whispercpp_BUILD_TYPE} + BUILD_BYPRODUCTS + /lib/static/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX} CMAKE_GENERATOR ${CMAKE_GENERATOR} - INSTALL_COMMAND ${CMAKE_COMMAND} --install --config ${Whispercpp_BUILD_TYPE} && ${CMAKE_COMMAND} -E - copy /ggml/include/ggml.h /include + INSTALL_COMMAND + ${CMAKE_COMMAND} --install --config ${Whispercpp_BUILD_TYPE} + && ${CMAKE_COMMAND} -E copy /ggml/include/ggml.h + /include CONFIGURE_COMMAND - ${CMAKE_COMMAND} -E env ${WHISPER_ADDITIONAL_ENV} ${CMAKE_COMMAND} -B -G - ${CMAKE_GENERATOR} -DCMAKE_INSTALL_PREFIX= -DCMAKE_BUILD_TYPE=${Whispercpp_BUILD_TYPE} - -DCMAKE_GENERATOR_PLATFORM=${CMAKE_GENERATOR_PLATFORM} -DCMAKE_OSX_DEPLOYMENT_TARGET=10.13 - -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES_} -DCMAKE_CXX_FLAGS=${WHISPER_EXTRA_CXX_FLAGS} - -DCMAKE_C_FLAGS=${WHISPER_EXTRA_CXX_FLAGS} -DBUILD_SHARED_LIBS=OFF -DWHISPER_BUILD_TESTS=OFF - -DWHISPER_BUILD_EXAMPLES=OFF ${WHISPER_ADDITIONAL_CMAKE_ARGS}) + ${CMAKE_COMMAND} -E env ${WHISPER_ADDITIONAL_ENV} ${CMAKE_COMMAND} + -B -G ${CMAKE_GENERATOR} + -DCMAKE_INSTALL_PREFIX= + -DCMAKE_BUILD_TYPE=${Whispercpp_BUILD_TYPE} + -DCMAKE_GENERATOR_PLATFORM=${CMAKE_GENERATOR_PLATFORM} + -DCMAKE_OSX_DEPLOYMENT_TARGET=10.13 + -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES_} + -DCMAKE_CXX_FLAGS=${WHISPER_EXTRA_CXX_FLAGS} + -DCMAKE_C_FLAGS=${WHISPER_EXTRA_CXX_FLAGS} -DBUILD_SHARED_LIBS=OFF + -DWHISPER_BUILD_TESTS=OFF -DWHISPER_BUILD_EXAMPLES=OFF + ${WHISPER_ADDITIONAL_CMAKE_ARGS}) ExternalProject_Get_Property(Whispercpp_Build INSTALL_DIR) @@ -180,14 +211,20 @@ else() add_library(Whispercpp::Whisper STATIC IMPORTED) set_target_properties( Whispercpp::Whisper - PROPERTIES IMPORTED_LOCATION - 
${INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX}) + PROPERTIES + IMPORTED_LOCATION + ${INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}whisper${CMAKE_STATIC_LIBRARY_SUFFIX} + ) add_library(Whispercpp::GGML STATIC IMPORTED) set_target_properties( Whispercpp::GGML - PROPERTIES IMPORTED_LOCATION - ${INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}ggml${CMAKE_STATIC_LIBRARY_SUFFIX}) - set_target_properties(Whispercpp::Whisper PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${INSTALL_DIR}/include) + PROPERTIES + IMPORTED_LOCATION + ${INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}ggml${CMAKE_STATIC_LIBRARY_SUFFIX} + ) + set_target_properties( + Whispercpp::Whisper PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + ${INSTALL_DIR}/include) endif() add_library(Whispercpp INTERFACE) @@ -197,6 +234,8 @@ if(WIN32 AND "${ACCELERATION}" STREQUAL "cpu") target_link_libraries(Whispercpp INTERFACE Whispercpp::OpenBLAS) endif() if(APPLE) - target_link_libraries(Whispercpp INTERFACE "-framework Accelerate -framework CoreML -framework Metal") + target_link_libraries( + Whispercpp + INTERFACE "-framework Accelerate -framework CoreML -framework Metal") target_link_libraries(Whispercpp INTERFACE Whispercpp::CoreML) endif(APPLE) diff --git a/src/whisper_wrapper.cpp b/src/whisper_wrapper.cpp index de9daea..b493754 100644 --- a/src/whisper_wrapper.cpp +++ b/src/whisper_wrapper.cpp @@ -91,13 +91,19 @@ class WhisperModel py::list transcribe(py::array_t audio) { + py::list result; + // Check if input is empty + if (audio.is_none() || audio.size() == 0) + { + return result; + } + auto audio_buffer = audio.request(); float *audio_data = static_cast(audio_buffer.ptr); int n_samples = audio_buffer.size; std::vector segments = transcribe_raw_audio(audio_data, n_samples); - py::list result; for (const auto &segment : segments) { result.append(py::cast(segment)); @@ -400,7 +406,7 @@ class ThreadedWhisperModel : public AsyncWhisperModel AsyncWhisperModel::start(callback, result_check_interval_ms); } - void stop() + void stop() override { AsyncWhisperModel::stop(); diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py index bf2288b..0d873ad 100644 --- a/test_simpler_whisper.py +++ b/test_simpler_whisper.py @@ -167,7 +167,10 @@ def handle_result(chunk_id: int, segments: List[WhisperSegment], is_partial: boo # wait for all chunks to finish processing while len(chunk_ids) > 0: - time.sleep(0.1) + try: + time.sleep(0.1) + except: + break # When done print("Stopping Whisper model") From 9828b2698af8d9205b3e6877a0738178fcebd4d7 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Tue, 29 Oct 2024 22:43:56 -0400 Subject: [PATCH 06/11] Refactor cmake/BuildWhispercpp.cmake: Add OpenMP support for Unix systems --- cmake/BuildWhispercpp.cmake | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/cmake/BuildWhispercpp.cmake b/cmake/BuildWhispercpp.cmake index c80f801..b71ef00 100644 --- a/cmake/BuildWhispercpp.cmake +++ b/cmake/BuildWhispercpp.cmake @@ -1,6 +1,13 @@ include(ExternalProject) include(FetchContent) +if(UNIX AND NOT APPLE) + find_package(OpenMP REQUIRED) + # Set compiler flags for OpenMP + set(WHISPER_EXTRA_CXX_FLAGS "${WHISPER_EXTRA_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set(WHISPER_EXTRA_C_FLAGS "${WHISPER_EXTRA_CXX_FLAGS} ${OpenMP_C_FLAGS}") +endif() + set(PREBUILT_WHISPERCPP_VERSION "0.0.7") set(PREBUILT_WHISPERCPP_URL_BASE "https://github.com/locaal-ai/occ-ai-dep-whispercpp/releases/download/${PREBUILT_WHISPERCPP_VERSION}" @@ -177,6 +184,11 @@ else() 
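+      # keep the BLAS/cuBLAS/OpenBLAS code paths disabled for the static build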
From a6a42affaa87c6320fc124315c84978c42adcdca Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Tue, 29 Oct 2024 23:49:35 -0400
Subject: [PATCH 07/11] Refactor .github/workflows/build.yaml: Modify import
 path for simpler_whisper to avoid conflicts

---
 .github/workflows/build.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 0b09a31..b0a6fbb 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -81,7 +81,7 @@ jobs:
 
       - name: Test import
         run: |
-          python -c "import simpler_whisper; print(simpler_whisper.__file__)"
+          python -c "import sys; sys.path.pop(0); import simpler_whisper; print(simpler_whisper.__file__)"
 
       - name: Rename wheel file
         shell: python

From ec48641d4ca5a77bf4c29c5f54c0d8f66f034a9b Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Wed, 30 Oct 2024 08:26:51 -0400
Subject: [PATCH 08/11] Refactor .github/workflows/build.yaml: Simplify
 acceleration options and disable import test

---
 .github/workflows/build.yaml | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index b0a6fbb..4bd103f 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -15,27 +15,19 @@ jobs:
         os: ['windows-latest', 'macos-latest', 'ubuntu-latest']
         python-version: ['3.11', '3.12']
         platform: ['x86_64', 'win64']
-        acceleration: ['cpu', 'cuda', 'hipblas', 'vulkan']
+        acceleration: ['cpu', 'cuda']
         exclude:
           - os: windows-latest
             platform: arm64
           - os: macos-latest
             platform: x86_64
           - os: macos-latest
             acceleration: cuda
-          - os: macos-latest
-            acceleration: hipblas
-          - os: macos-latest
-            acceleration: vulkan
           - os: macos-latest
             platform: win64
           - os: ubuntu-latest
             platform: win64
           - os: ubuntu-latest
             acceleration: cuda
-          - os: ubuntu-latest
-            acceleration: hipblas
-          - os: ubuntu-latest
-            acceleration: vulkan
 
     runs-on: ${{ matrix.os }}
@@ -80,6 +72,7 @@ jobs:
           pip install $wheelFile.FullName
 
       - name: Test import
+        if: false
         run: |
           python -c "import sys; sys.path.pop(0); import simpler_whisper; print(simpler_whisper.__file__)"
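The sys.path.pop(0) in the import test works around Python's default of putting the current directory first on the module search path: run from the repository root, a bare import simpler_whisper would resolve to the source tree, which does not contain the compiled _whisper_cpp extension. Roughly:

    import sys

    # Under `python -c`, sys.path[0] is '' (the current directory). From a
    # source checkout that entry shadows the installed wheel, so drop it.
    sys.path.pop(0)

    import simpler_whisper
    print(simpler_whisper.__file__)  # should now point into site-packages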
From bf83a5f2313e7be50097544659a4674e96c28766 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Wed, 30 Oct 2024 09:21:33 -0400
Subject: [PATCH 09/11] Refactor build.yaml, setup.py, whisper.py, and
 test_simpler_whisper.py: Remove wheel renaming logic, update long
 description, and enhance audio normalization

---
 .github/workflows/build.yaml | 19 -------------------
 setup.py                     |  6 +++++-
 simpler_whisper/whisper.py   |  8 ++++++--
 test_simpler_whisper.py      |  2 ++
 4 files changed, 13 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 4bd103f..5159778 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -76,25 +76,6 @@ jobs:
         run: |
           python -c "import sys; sys.path.pop(0); import simpler_whisper; print(simpler_whisper.__file__)"
 
-      - name: Rename wheel file
-        shell: python
-        run: |
-          import os
-          import glob
-
-          wheel_file = glob.glob('dist/*.whl')[0]
-          base_name = os.path.basename(wheel_file)
-          name_parts = base_name.split('-')
-
-          # Insert acceleration and platform after the first part of the name
-          underscore_parts = [name_parts[0], '${{ matrix.acceleration }}', '${{ matrix.platform }}']
-          new_name_parts = ['_'.join(underscore_parts)] + name_parts[1:]
-          new_name = '-'.join(new_name_parts)
-
-          new_path = os.path.join('dist', new_name)
-          os.rename(wheel_file, new_path)
-          print(f"Renamed {base_name} to {new_name}")
-
       - name: Set wheel name
         shell: pwsh
         run: |
diff --git a/setup.py b/setup.py
index 03f1fad..223c6f0 100644
--- a/setup.py
+++ b/setup.py
@@ -99,17 +99,21 @@ def build_extension(self, ext):
         )
 
 
+acceleration = os.getenv("SIMPLER_WHISPER_ACCELERATION", "")
+build_tag = acceleration if acceleration else ""
+
 setup(
     name="simpler-whisper",
     version="0.2.2",
     author="Roy Shilkrot",
     author_email="roy.shil@gmail.com",
     description="A simple Python wrapper for whisper.cpp",
-    long_description="",
+    long_description="A simple Python wrapper for whisper.cpp",
     ext_modules=[CMakeExtension("simpler_whisper._whisper_cpp")],
     cmdclass=dict(build_ext=CMakeBuild),
     zip_safe=False,
     packages=[
         "simpler_whisper"
     ],  # Add this line to ensure the package directory is created
+    options={"bdist_wheel": {"build_tag": build_tag}},
 )
diff --git a/simpler_whisper/whisper.py b/simpler_whisper/whisper.py
index 834cbbe..090ee0c 100644
--- a/simpler_whisper/whisper.py
+++ b/simpler_whisper/whisper.py
@@ -74,7 +74,9 @@ def transcribe(self, audio: Union[np.ndarray, List[float]]) -> int:
         # Run async inference (no return value)
         return self.model.transcribe(audio)
 
-    def handle_result(self, chunk_id: int, segments: List[WhisperSegment], is_partial: bool):
+    def handle_result(
+        self, chunk_id: int, segments: List[WhisperSegment], is_partial: bool
+    ):
         if self.callback is not None:
             self.callback(chunk_id, segments, is_partial)
 
@@ -143,7 +145,9 @@ def __init__(
         self._is_running = False
         self.callback = callback
 
-    def handle_result(self, chunk_id: int, segments: List[WhisperSegment], is_partial: bool):
+    def handle_result(
+        self, chunk_id: int, segments: List[WhisperSegment], is_partial: bool
+    ):
         if self.callback is not None:
             self.callback(chunk_id, segments, is_partial)
 
diff --git a/test_simpler_whisper.py b/test_simpler_whisper.py
index 0d873ad..fda26bf 100644
--- a/test_simpler_whisper.py
+++ b/test_simpler_whisper.py
@@ -64,6 +64,8 @@ def get_samples_from_frame(frame: av.AudioFrame) -> np.ndarray:
     # check if the type is int16 or float32
     if incoming_audio.dtype == np.int16:
         incoming_audio = incoming_audio / 32768.0  # normalize to [-1, 1]
+    if incoming_audio.dtype == np.int32:
+        incoming_audio = incoming_audio / 2147483648.0  # normalize to [-1, 1]
     # resample to 16kHz if needed
     if frame.rate != 16000:
         samples = resampy.resample(incoming_audio, frame.rate, 16000)
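The int16 and int32 branches above divide by 2**15 and 2**31 respectively, mapping the most negative PCM sample exactly to -1.0. Factored into a standalone helper, the same logic looks like this; the function name is illustrative, not from the repository:

    import numpy as np

    def normalize_pcm(samples: np.ndarray) -> np.ndarray:
        # Divide by the magnitude of the most negative representable value,
        # so int16 [-32768, 32767] and int32 land inside [-1.0, 1.0).
        if samples.dtype == np.int16:
            return samples.astype(np.float32) / 32768.0
        if samples.dtype == np.int32:
            return samples.astype(np.float32) / 2147483648.0
        return samples.astype(np.float32)

    assert normalize_pcm(np.array([-32768, 32767], dtype=np.int16)).min() == -1.0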
From f84c3cbeee643cd100db8b1be34d1369e5b21921 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Wed, 30 Oct 2024 09:47:13 -0400
Subject: [PATCH 10/11] Refactor pyproject.toml and setup.py: Update numpy
 version constraint and enhance wheel distribution tagging

---
 pyproject.toml |  3 +--
 setup.py       | 18 ++++++++++++++----
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ad115e5..8a9ca23 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=45", "wheel", "cmake>=3.12", "numpy"]
+requires = ["setuptools>=45", "wheel", "cmake>=3.12", "numpy<=1.26.4"]
 build-backend = "setuptools.build_meta"
 
 [project]
@@ -17,7 +17,6 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
 ]
diff --git a/setup.py b/setup.py
index 223c6f0..0337013 100644
--- a/setup.py
+++ b/setup.py
@@ -5,6 +5,7 @@
 import subprocess
 import platform
 import sysconfig
+from wheel.bdist_wheel import bdist_wheel
 
 
 class CMakeExtension(Extension):
@@ -99,8 +100,15 @@ def build_extension(self, ext):
         )
 
 
-acceleration = os.getenv("SIMPLER_WHISPER_ACCELERATION", "")
-build_tag = acceleration if acceleration else ""
+class CustomBdistWheel(bdist_wheel):
+    def get_tag(self):
+        python, abi, platform = super().get_tag()
+        acceleration = os.environ.get("SIMPLER_WHISPER_ACCELERATION", "")
+        if acceleration:
+            # This creates the +cuda or +cpu tag
+            self.distribution.version += f"+{acceleration}"
+        return python, abi, platform
+
 
 setup(
     name="simpler-whisper",
@@ -110,10 +118,12 @@ def build_extension(self, ext):
     description="A simple Python wrapper for whisper.cpp",
     long_description="A simple Python wrapper for whisper.cpp",
     ext_modules=[CMakeExtension("simpler_whisper._whisper_cpp")],
-    cmdclass=dict(build_ext=CMakeBuild),
+    cmdclass={
+        "build_ext": CMakeBuild,
+        "bdist_wheel": CustomBdistWheel,
+    },
     zip_safe=False,
     packages=[
         "simpler_whisper"
     ],  # Add this line to ensure the package directory is created
-    options={"bdist_wheel": {"build_tag": build_tag}},
 )

From 4136426fe8c656894b558655125b823baa676095 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Wed, 30 Oct 2024 09:51:14 -0400
Subject: [PATCH 11/11] Refactor setup.py: Modify version tagging logic to
 temporarily adjust version for acceleration

---
 setup.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 0337013..dc6e14a 100644
--- a/setup.py
+++ b/setup.py
@@ -105,8 +105,10 @@ def get_tag(self):
         python, abi, platform = super().get_tag()
         acceleration = os.environ.get("SIMPLER_WHISPER_ACCELERATION", "")
         if acceleration:
-            # This creates the +cuda or +cpu tag
-            self.distribution.version += f"+{acceleration}"
+            # Store original version
+            orig_version = self.distribution.get_version()
+            # Temporarily modify version
+            self.distribution.metadata.version = f"{orig_version}+{acceleration}"
         return python, abi, platform
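The net effect of patches 10 and 11 is a PEP 440 local version identifier on the built wheel: get_tag() leaves the python/abi/platform tags untouched and rewrites only the distribution's version metadata, so a build with SIMPLER_WHISPER_ACCELERATION=cuda yields a wheel named along the lines of simpler_whisper-0.2.2+cuda-cp311-cp311-win_amd64.whl (filename illustrative). The "+cuda" suffix parses as a local version segment:

    from packaging.version import Version

    v = Version("0.2.2+cuda")
    print(v.public)  # -> 0.2.2  (the public portion of the version)
    print(v.local)   # -> cuda   (the acceleration tag carried by the wheel)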