From 16e91c0167582361c283f89913c56724d5825d4c Mon Sep 17 00:00:00 2001 From: contra Date: Sat, 7 Mar 2026 21:07:50 -0800 Subject: [PATCH 1/3] feat: add escape hatch for custom phonemizer (BYO G2P) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `generateFromPhonemes` / `streamFromPhonemes` methods that accept pre-computed IPA phoneme strings, bypassing the built-in phonemizer pipeline. This enables users to plug in any external G2P system (e.g. the Python `phonemizer` library, espeak-ng, or custom phonemizers) while still using the Kokoro synthesis engine. Changes across all layers: - C++ Kokoro: new public methods + shared impl helpers + UTF-8→UTF-32 - JSI ModelHostObject: expose new methods via promiseHostFunction - TextToSpeechModule: `forwardFromPhonemes()` and `streamFromPhonemes()` - useTextToSpeech hook: corresponding hook methods - Types: `TextToSpeechPhonemeInput`, `TextToSpeechStreamingPhonemeInput` --- .../host_objects/ModelHostObject.h | 8 ++ .../models/text_to_speech/kokoro/Kokoro.cpp | 123 ++++++++++++------ .../models/text_to_speech/kokoro/Kokoro.h | 20 +++ .../useTextToSpeech.ts | 57 ++++++++ .../TextToSpeechModule.ts | 77 +++++++++++ .../react-native-executorch/src/types/tts.ts | 51 ++++++++ 6 files changed, 294 insertions(+), 42 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index d6489c9be..7ece18a93 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -169,6 +169,14 @@ template class ModelHostObject : public JsiHostObject { addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject, promiseHostFunction<&Model::stream>, "stream")); + addFunctions(JSI_EXPORT_FUNCTION( + ModelHostObject, + 
promiseHostFunction<&Model::generateFromPhonemes>, + "generateFromPhonemes")); + addFunctions(JSI_EXPORT_FUNCTION( + ModelHostObject, + promiseHostFunction<&Model::streamFromPhonemes>, + "streamFromPhonemes")); } if constexpr (meta::HasGenerateFromString) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp index d73fb6205..1fd522676 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp @@ -3,7 +3,9 @@ #include "Utils.h" #include +#include #include +#include #include #include @@ -73,34 +75,59 @@ void Kokoro::loadVoice(const std::string &voiceSource) { } } -std::vector Kokoro::generate(std::string text, float speed) { - if (text.size() > params::kMaxTextSize) { - throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, - "Kokoro: maximum input text size exceeded"); +std::u32string Kokoro::utf8ToUtf32(const std::string &utf8) { + std::u32string result; + result.reserve(utf8.size()); + size_t i = 0; + while (i < utf8.size()) { + char32_t cp = 0; + unsigned char c = static_cast(utf8[i]); + size_t len = 0; + if (c < 0x80) { + cp = c; + len = 1; + } else if ((c >> 5) == 0x06) { + cp = c & 0x1F; + len = 2; + } else if ((c >> 4) == 0x0E) { + cp = c & 0x0F; + len = 3; + } else if ((c >> 3) == 0x1E) { + cp = c & 0x07; + len = 4; + } else { + throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, + "Kokoro: invalid UTF-8 in phoneme string"); + } + if (i + len > utf8.size()) { + throw RnExecutorchError( + RnExecutorchErrorCode::InvalidUserInput, + "Kokoro: truncated UTF-8 sequence in phoneme string"); + } + for (size_t j = 1; j < len; j++) { + cp = (cp << 6) | (static_cast(utf8[i + j]) & 0x3F); + } + result.push_back(cp); + i += len; } + return result; +} - // G2P 
(Grapheme to Phoneme) conversion - auto phonemes = phonemizer_.process(text); - - // Divide the phonemes string intro substrings. - // Affects the further calculations only in case of string size - // exceeding the biggest model's input. +std::vector +Kokoro::generateFromPhonemesImpl(const std::u32string &phonemes, float speed) { auto subsentences = partitioner_.divide(phonemes); std::vector audio = {}; for (const auto &subsentence : subsentences) { - // Generate an audio vector with the Kokoro model auto audioPart = synthesize(subsentence, speed); - // Calculate a pause between the sentences char32_t lastPhoneme = subsentence.back(); size_t pauseMs = params::kPauseValues.contains(lastPhoneme) ? params::kPauseValues.at(lastPhoneme) : params::kDefaultPause; std::vector pause(pauseMs * constants::kSamplesPerMilisecond, 0.F); - // Add audio part and pause to the main audio vector audio.insert(audio.end(), std::make_move_iterator(audioPart.begin()), std::make_move_iterator(audioPart.end())); audio.insert(audio.end(), std::make_move_iterator(pause.begin()), @@ -110,14 +137,9 @@ std::vector Kokoro::generate(std::string text, float speed) { return audio; } -void Kokoro::stream(std::string text, float speed, - std::shared_ptr callback) { - if (text.size() > params::kMaxTextSize) { - throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, - "Kokoro: maximum input text size exceeded"); - } - - // Build a full callback function +void Kokoro::streamFromPhonemesImpl( + const std::u32string &phonemes, float speed, + std::shared_ptr callback) { auto nativeCallback = [this, callback](const std::vector &audioVec) { if (this->isStreaming_) { this->callInvoker_->invokeAsync([callback, audioVec](jsi::Runtime &rt) { @@ -127,21 +149,11 @@ void Kokoro::stream(std::string text, float speed, } }; - // Mark the beginning of the streaming process isStreaming_ = true; - // G2P (Grapheme to Phoneme) conversion - auto phonemes = phonemizer_.process(text); - - // Divide the phonemes 
string intro substrings. - // Use specialized implementation to minimize the latency between the - // sentences. auto subsentences = partitioner_.divide(phonemes); - // We follow the implementation of generate() method, but - // instead of accumulating results in a vector, we push them - // back to the JS side with the callback. for (size_t i = 0; i < subsentences.size(); i++) { if (!isStreaming_) { break; @@ -149,37 +161,64 @@ void Kokoro::stream(std::string text, float speed, const auto &subsentence = subsentences[i]; - // Determine the silent padding duration to be stripped from the edges of - // the generated audio. If a chunk ends with a space or follows one that - // did, it indicates a word boundary split – we use a shorter padding (20ms) - // to ensure natural speech flow. Otherwise, we use 50ms for standard - // pauses. bool endsWithSpace = (subsentence.back() == U' '); bool prevEndsWithSpace = (i > 0 && subsentences[i - 1].back() == U' '); - size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 15 : 50; // [ms] + size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 15 : 50; - // Generate an audio vector with the Kokoro model auto audioPart = synthesize(subsentence, speed, paddingMs); - // Calculate a pause between the sentences char32_t lastPhoneme = subsentence.back(); size_t pauseMs = params::kPauseValues.contains(lastPhoneme) ? 
params::kPauseValues.at(lastPhoneme) : params::kDefaultPause; std::vector pause(pauseMs * constants::kSamplesPerMilisecond, 0.F); - // Add pause to the audio vector audioPart.insert(audioPart.end(), std::make_move_iterator(pause.begin()), std::make_move_iterator(pause.end())); - // Push the audio right away to the JS side nativeCallback(audioPart); } - // Mark the end of the streaming process isStreaming_ = false; } +std::vector Kokoro::generate(std::string text, float speed) { + if (text.size() > params::kMaxTextSize) { + throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, + "Kokoro: maximum input text size exceeded"); + } + + // G2P (Grapheme to Phoneme) conversion + auto phonemes = phonemizer_.process(text); + + return generateFromPhonemesImpl(phonemes, speed); +} + +std::vector Kokoro::generateFromPhonemes(std::string phonemes, + float speed) { + auto phonemes32 = utf8ToUtf32(phonemes); + return generateFromPhonemesImpl(phonemes32, speed); +} + +void Kokoro::stream(std::string text, float speed, + std::shared_ptr callback) { + if (text.size() > params::kMaxTextSize) { + throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, + "Kokoro: maximum input text size exceeded"); + } + + // G2P (Grapheme to Phoneme) conversion + auto phonemes = phonemizer_.process(text); + + streamFromPhonemesImpl(phonemes, speed, callback); +} + +void Kokoro::streamFromPhonemes(std::string phonemes, float speed, + std::shared_ptr callback) { + auto phonemes32 = utf8ToUtf32(phonemes); + streamFromPhonemesImpl(phonemes32, speed, callback); +} + void Kokoro::streamStop() noexcept { isStreaming_ = false; } std::vector Kokoro::synthesize(const std::u32string &phonemes, diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h index f27ba8018..a99435d1c 100644 --- 
a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h @@ -27,11 +27,22 @@ class Kokoro { // Processes the entire text at once, before sending back to the JS side. std::vector generate(std::string text, float speed = 1.F); + // Accepts pre-computed phonemes (as a UTF-8 IPA string) and synthesizes + // audio, bypassing the built-in phonemizer. This allows callers to use + // an external G2P system (e.g. the Python `phonemizer` library, espeak-ng, + // or any custom phonemizer). + std::vector generateFromPhonemes(std::string phonemes, + float speed = 1.F); + // Processes text in chunks, sending each chunk individualy to the JS side // with asynchronous callbacks. void stream(std::string text, float speed, std::shared_ptr callback); + // Streaming variant that accepts pre-computed phonemes instead of text. + void streamFromPhonemes(std::string phonemes, float speed, + std::shared_ptr callback); + // Stops the streaming process void streamStop() noexcept; @@ -42,6 +53,15 @@ class Kokoro { // Helper function - loading voice array void loadVoice(const std::string &voiceSource); + // Helper function - convert UTF-8 string to UTF-32 for phoneme processing + static std::u32string utf8ToUtf32(const std::string &utf8); + + // Helper function - shared synthesis pipeline (partition + synthesize) + std::vector generateFromPhonemesImpl(const std::u32string &phonemes, + float speed); + void streamFromPhonemesImpl(const std::u32string &phonemes, float speed, + std::shared_ptr callback); + // Helper function - generate specialization for given input size std::vector synthesize(const std::u32string &phonemes, float speed, size_t paddingMs = 50); diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts index 
b29b4bc8d..c1e1a2760 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts @@ -3,8 +3,10 @@ import { TextToSpeechModule } from '../../modules/natural_language_processing/Te import { TextToSpeechProps, TextToSpeechInput, + TextToSpeechPhonemeInput, TextToSpeechType, TextToSpeechStreamingInput, + TextToSpeechStreamingPhonemeInput, } from '../../types/tts'; import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; @@ -81,6 +83,28 @@ export const useTextToSpeech = ({ } }; + const forwardFromPhonemes = async (input: TextToSpeechPhonemeInput) => { + if (!isReady) + throw new RnExecutorchError( + RnExecutorchErrorCode.ModuleNotLoaded, + 'The model is currently not loaded. Please load the model before calling forwardFromPhonemes().' + ); + if (isGenerating) + throw new RnExecutorchError( + RnExecutorchErrorCode.ModelGenerating, + 'The model is currently generating. Please wait until previous model run is complete.' + ); + try { + setIsGenerating(true); + return await moduleInstance.forwardFromPhonemes( + input.phonemes, + input.speed ?? 1.0 + ); + } finally { + setIsGenerating(false); + } + }; + const stream = useCallback( async (input: TextToSpeechStreamingInput) => { if (!isReady) @@ -112,12 +136,45 @@ export const useTextToSpeech = ({ [isReady, isGenerating, moduleInstance] ); + const streamFromPhonemes = useCallback( + async (input: TextToSpeechStreamingPhonemeInput) => { + if (!isReady) + throw new RnExecutorchError( + RnExecutorchErrorCode.ModuleNotLoaded, + 'The model is currently not loaded. Please load the model before calling streamFromPhonemes().' + ); + if (isGenerating) + throw new RnExecutorchError( + RnExecutorchErrorCode.ModelGenerating, + 'The model is currently generating. Please wait until previous model run is complete.' 
+ ); + setIsGenerating(true); + try { + await input.onBegin?.(); + for await (const audio of moduleInstance.streamFromPhonemes({ + phonemes: input.phonemes, + speed: input.speed ?? 1.0, + })) { + if (input.onNext) { + await input.onNext(audio); + } + } + } finally { + await input.onEnd?.(); + setIsGenerating(false); + } + }, + [isReady, isGenerating, moduleInstance] + ); + return { error, isReady, isGenerating, forward, + forwardFromPhonemes, stream, + streamFromPhonemes, streamStop: moduleInstance.streamStop, downloadProgress, }; diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts index 849c25676..d99a6312b 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts @@ -5,6 +5,7 @@ import { KokoroConfig, TextToSpeechConfig, TextToSpeechStreamingInput, + TextToSpeechStreamingPhonemeInput, VoiceConfig, } from '../../types/tts'; import { Logger } from '../../common/Logger'; @@ -118,6 +119,27 @@ export class TextToSpeechModule { return await this.nativeModule.generate(text, speed); } + /** + * Synthesizes pre-computed phonemes into speech, bypassing the built-in phonemizer. + * This allows using an external G2P system (e.g. the Python `phonemizer` library, + * espeak-ng, or any custom phonemizer). + * + * @param phonemes The pre-computed IPA phoneme string. + * @param speed Optional speed multiplier for the speech synthesis (default is 1.0). + * @returns A promise resolving to the synthesized audio waveform. + */ + public async forwardFromPhonemes( + phonemes: string, + speed: number = 1.0 + ): Promise { + if (this.nativeModule == null) + throw new RnExecutorchError( + RnExecutorchErrorCode.ModuleNotLoaded, + 'The model is currently not loaded. 
Please load the model before calling forwardFromPhonemes().' + ); + return await this.nativeModule.generateFromPhonemes(phonemes, speed); + } + /** * Starts a streaming synthesis session. Yields audio chunks as they are generated. * @@ -169,6 +191,61 @@ export class TextToSpeechModule { } } + /** + * Starts a streaming synthesis session from pre-computed phonemes. + * Bypasses the built-in phonemizer, allowing use of external G2P systems. + * + * @param input - Input object containing phonemes and optional speed. + * @returns An async generator yielding Float32Array audio chunks. + */ + public async *streamFromPhonemes({ + phonemes, + speed, + }: TextToSpeechStreamingPhonemeInput): AsyncGenerator { + const queue: Float32Array[] = []; + + let waiter: (() => void) | null = null; + let finished = false; + let error: unknown; + + const wake = () => { + waiter?.(); + waiter = null; + }; + + (async () => { + try { + await this.nativeModule.streamFromPhonemes( + phonemes, + speed, + (audio: number[]) => { + queue.push(new Float32Array(audio)); + wake(); + } + ); + finished = true; + wake(); + } catch (e) { + error = e; + finished = true; + wake(); + } + })(); + + while (true) { + if (queue.length > 0) { + yield queue.shift()!; + if (finished && queue.length === 0) { + return; + } + continue; + } + if (error) throw error; + if (finished) return; + await new Promise((r) => (waiter = r)); + } + } + /** * Stops the streaming process if there is any ongoing. */ diff --git a/packages/react-native-executorch/src/types/tts.ts b/packages/react-native-executorch/src/types/tts.ts index 55937be49..b9c878c3b 100644 --- a/packages/react-native-executorch/src/types/tts.ts +++ b/packages/react-native-executorch/src/types/tts.ts @@ -90,6 +90,21 @@ export interface TextToSpeechInput { speed?: number; } +/** + * Text to Speech module input for pre-computed phonemes. + * Use this when you have your own phonemizer (e.g. 
the Python `phonemizer` + * library, espeak-ng, or any custom G2P system) and want to bypass the + * built-in phonemis pipeline. + * + * @category Types + * @property {string} phonemes - pre-computed IPA phoneme string + * @property {number} [speed] - optional speed argument - the higher it is, the faster the speech becomes + */ +export interface TextToSpeechPhonemeInput { + phonemes: string; + speed?: number; +} + /** * Return type for the `useTextToSpeech` hook. * Manages the state and operations for Text-to-Speech generation. @@ -125,6 +140,18 @@ export interface TextToSpeechType { */ forward: (input: TextToSpeechInput) => Promise; + /** + * Synthesizes pre-computed phonemes into speech audio in a single pass. + * Bypasses the built-in phonemizer, allowing use of external G2P systems. + * + * @param input - The `TextToSpeechPhonemeInput` object containing pre-computed `phonemes` and optional `speed`. + * @returns A Promise that resolves with the generated audio data. + * @throws {RnExecutorchError} If the model is not loaded or is currently generating. + */ + forwardFromPhonemes: ( + input: TextToSpeechPhonemeInput + ) => Promise; + /** * Streams the generated audio data incrementally. * This is optimal for real-time playback, allowing audio to start playing before the full text is synthesized. @@ -134,6 +161,17 @@ export interface TextToSpeechType { */ stream: (input: TextToSpeechStreamingInput) => Promise; + /** + * Streams pre-computed phonemes incrementally, bypassing the built-in phonemizer. + * + * @param input - The streaming input with pre-computed `phonemes` instead of `text`. + * @returns A Promise that resolves when the streaming process is complete. + * @throws {RnExecutorchError} If the model is not loaded or is currently generating. + */ + streamFromPhonemes: ( + input: TextToSpeechStreamingPhonemeInput + ) => Promise; + /** * Interrupts and stops the currently active audio generation stream. 
*/ @@ -158,3 +196,16 @@ export interface TextToSpeechStreamingInput extends TextToSpeechInput { onNext?: (audio: Float32Array) => void | Promise; onEnd?: () => void | Promise; } + +/** + * Streaming input definition for pre-computed phonemes. + * Same as `TextToSpeechStreamingInput` but accepts `phonemes` instead of `text`. + * + * @category Types + */ +export interface TextToSpeechStreamingPhonemeInput + extends TextToSpeechPhonemeInput { + onBegin?: () => void | Promise; + onNext?: (audio: Float32Array) => void | Promise; + onEnd?: () => void | Promise; +} From 5a125c740256d1beeb2cde7434ad0765c794b880 Mon Sep 17 00:00:00 2001 From: contra Date: Sat, 7 Mar 2026 21:16:24 -0800 Subject: [PATCH 2/3] =?UTF-8?q?refactor:=20simplify=20after=20review=20?= =?UTF-8?q?=E2=80=94=20reuse=20phonemis=20utf8=5Fto=5Fu32string,=20extract?= =?UTF-8?q?=20shared=20helpers,=20add=20input=20validation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../models/text_to_speech/kokoro/Kokoro.cpp | 86 ++++++-------- .../models/text_to_speech/kokoro/Kokoro.h | 3 - .../useTextToSpeech.ts | 109 ++++++++---------- .../TextToSpeechModule.ts | 94 +++++---------- .../react-native-executorch/src/types/tts.ts | 32 ++--- 5 files changed, 132 insertions(+), 192 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp index 1fd522676..52da0fc46 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp @@ -3,9 +3,8 @@ #include "Utils.h" #include -#include #include -#include +#include #include #include @@ -75,63 +74,30 @@ void Kokoro::loadVoice(const std::string &voiceSource) { } } -std::u32string Kokoro::utf8ToUtf32(const std::string &utf8) { - 
std::u32string result; - result.reserve(utf8.size()); - size_t i = 0; - while (i < utf8.size()) { - char32_t cp = 0; - unsigned char c = static_cast(utf8[i]); - size_t len = 0; - if (c < 0x80) { - cp = c; - len = 1; - } else if ((c >> 5) == 0x06) { - cp = c & 0x1F; - len = 2; - } else if ((c >> 4) == 0x0E) { - cp = c & 0x0F; - len = 3; - } else if ((c >> 3) == 0x1E) { - cp = c & 0x07; - len = 4; - } else { - throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, - "Kokoro: invalid UTF-8 in phoneme string"); - } - if (i + len > utf8.size()) { - throw RnExecutorchError( - RnExecutorchErrorCode::InvalidUserInput, - "Kokoro: truncated UTF-8 sequence in phoneme string"); - } - for (size_t j = 1; j < len; j++) { - cp = (cp << 6) | (static_cast(utf8[i + j]) & 0x3F); - } - result.push_back(cp); - i += len; - } - return result; -} - std::vector Kokoro::generateFromPhonemesImpl(const std::u32string &phonemes, float speed) { + // Divide the phonemes string into substrings. + // Affects the further calculations only in case of string size + // exceeding the biggest model's input. auto subsentences = partitioner_.divide(phonemes); std::vector audio = {}; for (const auto &subsentence : subsentences) { + // Generate an audio vector with the Kokoro model auto audioPart = synthesize(subsentence, speed); + // Calculate a pause between the sentences char32_t lastPhoneme = subsentence.back(); size_t pauseMs = params::kPauseValues.contains(lastPhoneme) ? 
params::kPauseValues.at(lastPhoneme) : params::kDefaultPause; - std::vector pause(pauseMs * constants::kSamplesPerMilisecond, 0.F); + // Add audio part and silence pause to the main audio vector audio.insert(audio.end(), std::make_move_iterator(audioPart.begin()), std::make_move_iterator(audioPart.end())); - audio.insert(audio.end(), std::make_move_iterator(pause.begin()), - std::make_move_iterator(pause.end())); + audio.resize(audio.size() + pauseMs * constants::kSamplesPerMilisecond, + 0.F); } return audio; @@ -151,6 +117,7 @@ void Kokoro::streamFromPhonemesImpl( isStreaming_ = true; + // Use LATENCY strategy to minimize the time-to-first-audio for streaming auto subsentences = partitioner_.divide(phonemes); @@ -161,21 +128,27 @@ void Kokoro::streamFromPhonemesImpl( const auto &subsentence = subsentences[i]; + // Determine the silent padding duration to be stripped from the edges of + // the generated audio. If a chunk ends with a space or follows one that + // did, it indicates a word boundary split – we use a shorter padding + // to ensure natural speech flow. Otherwise, we use 50ms for standard + // pauses. bool endsWithSpace = (subsentence.back() == U' '); bool prevEndsWithSpace = (i > 0 && subsentences[i - 1].back() == U' '); - size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 15 : 50; + size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 15 : 50; // [ms] + // Generate an audio vector with the Kokoro model auto audioPart = synthesize(subsentence, speed, paddingMs); + // Calculate and append a pause between the sentences char32_t lastPhoneme = subsentence.back(); size_t pauseMs = params::kPauseValues.contains(lastPhoneme) ? 
params::kPauseValues.at(lastPhoneme) : params::kDefaultPause; - std::vector pause(pauseMs * constants::kSamplesPerMilisecond, 0.F); - - audioPart.insert(audioPart.end(), std::make_move_iterator(pause.begin()), - std::make_move_iterator(pause.end())); + audioPart.resize( + audioPart.size() + pauseMs * constants::kSamplesPerMilisecond, 0.F); + // Push the audio right away to the JS side nativeCallback(audioPart); } @@ -196,8 +169,12 @@ std::vector Kokoro::generate(std::string text, float speed) { std::vector Kokoro::generateFromPhonemes(std::string phonemes, float speed) { - auto phonemes32 = utf8ToUtf32(phonemes); - return generateFromPhonemesImpl(phonemes32, speed); + if (phonemes.empty()) { + throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, + "Kokoro: phoneme string must not be empty"); + } + return generateFromPhonemesImpl( + phonemis::utilities::string_utils::utf8_to_u32string(phonemes), speed); } void Kokoro::stream(std::string text, float speed, @@ -215,8 +192,13 @@ void Kokoro::stream(std::string text, float speed, void Kokoro::streamFromPhonemes(std::string phonemes, float speed, std::shared_ptr callback) { - auto phonemes32 = utf8ToUtf32(phonemes); - streamFromPhonemesImpl(phonemes32, speed, callback); + if (phonemes.empty()) { + throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, + "Kokoro: phoneme string must not be empty"); + } + streamFromPhonemesImpl( + phonemis::utilities::string_utils::utf8_to_u32string(phonemes), speed, + callback); } void Kokoro::streamStop() noexcept { isStreaming_ = false; } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h index a99435d1c..d7a4c2ae6 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h @@ -53,9 
+53,6 @@ class Kokoro { // Helper function - loading voice array void loadVoice(const std::string &voiceSource); - // Helper function - convert UTF-8 string to UTF-32 for phoneme processing - static std::u32string utf8ToUtf32(const std::string &utf8); - // Helper function - shared synthesis pipeline (partition + synthesize) std::vector generateFromPhonemesImpl(const std::u32string &phonemes, float speed); diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts index c1e1a2760..1a751f42d 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts @@ -5,6 +5,7 @@ import { TextToSpeechInput, TextToSpeechPhonemeInput, TextToSpeechType, + TextToSpeechStreamingCallbacks, TextToSpeechStreamingInput, TextToSpeechStreamingPhonemeInput, } from '../../types/tts'; @@ -64,17 +65,47 @@ export const useTextToSpeech = ({ preventLoad, ]); - const forward = async (input: TextToSpeechInput) => { + // Shared guard for all generation methods + const guardReady = (methodName: string) => { if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling forward().' + `The model is currently not loaded. Please load the model before calling ${methodName}().` ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, 'The model is currently generating. Please wait until previous model run is complete.' 
); + }; + + // Shared streaming orchestration (guards + onBegin/onNext/onEnd lifecycle) + const runStream = useCallback( + async ( + methodName: string, + generator: AsyncGenerator, + callbacks: TextToSpeechStreamingCallbacks + ) => { + guardReady(methodName); + setIsGenerating(true); + try { + await callbacks.onBegin?.(); + for await (const audio of generator) { + if (callbacks.onNext) { + await callbacks.onNext(audio); + } + } + } finally { + await callbacks.onEnd?.(); + setIsGenerating(false); + } + }, + // eslint-disable-next-line react-hooks/exhaustive-deps + [isReady, isGenerating, moduleInstance] + ); + + const forward = async (input: TextToSpeechInput) => { + guardReady('forward'); try { setIsGenerating(true); return await moduleInstance.forward(input.text, input.speed ?? 1.0); @@ -84,16 +115,7 @@ export const useTextToSpeech = ({ }; const forwardFromPhonemes = async (input: TextToSpeechPhonemeInput) => { - if (!isReady) - throw new RnExecutorchError( - RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling forwardFromPhonemes().' - ); - if (isGenerating) - throw new RnExecutorchError( - RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating. Please wait until previous model run is complete.' - ); + guardReady('forwardFromPhonemes'); try { setIsGenerating(true); return await moduleInstance.forwardFromPhonemes( @@ -107,64 +129,27 @@ export const useTextToSpeech = ({ const stream = useCallback( async (input: TextToSpeechStreamingInput) => { - if (!isReady) - throw new RnExecutorchError( - RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling stream().' - ); - if (isGenerating) - throw new RnExecutorchError( - RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating. Please wait until previous model run is complete.' 
- ); - setIsGenerating(true); - try { - await input.onBegin?.(); - for await (const audio of moduleInstance.stream({ - text: input.text, - speed: input.speed ?? 1.0, - })) { - if (input.onNext) { - await input.onNext(audio); - } - } - } finally { - await input.onEnd?.(); - setIsGenerating(false); - } + await runStream( + 'stream', + moduleInstance.stream({ text: input.text, speed: input.speed ?? 1.0 }), + input + ); }, - [isReady, isGenerating, moduleInstance] + [runStream, moduleInstance] ); const streamFromPhonemes = useCallback( async (input: TextToSpeechStreamingPhonemeInput) => { - if (!isReady) - throw new RnExecutorchError( - RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling streamFromPhonemes().' - ); - if (isGenerating) - throw new RnExecutorchError( - RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating. Please wait until previous model run is complete.' - ); - setIsGenerating(true); - try { - await input.onBegin?.(); - for await (const audio of moduleInstance.streamFromPhonemes({ + await runStream( + 'streamFromPhonemes', + moduleInstance.streamFromPhonemes({ phonemes: input.phonemes, speed: input.speed ?? 
1.0, - })) { - if (input.onNext) { - await input.onNext(audio); - } - } - } finally { - await input.onEnd?.(); - setIsGenerating(false); - } + }), + input + ); }, - [isReady, isGenerating, moduleInstance] + [runStream, moduleInstance] ); return { diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts index d99a6312b..932f166e7 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts @@ -99,6 +99,14 @@ export class TextToSpeechModule { } } + private ensureLoaded(methodName: string): void { + if (this.nativeModule == null) + throw new RnExecutorchError( + RnExecutorchErrorCode.ModuleNotLoaded, + `The model is currently not loaded. Please load the model before calling ${methodName}().` + ); + } + /** * Synthesizes the provided text into speech. * Returns a promise that resolves to the full audio waveform as a `Float32Array`. @@ -111,11 +119,7 @@ export class TextToSpeechModule { text: string, speed: number = 1.0 ): Promise { - if (this.nativeModule == null) - throw new RnExecutorchError( - RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling forward().' - ); + this.ensureLoaded('forward'); return await this.nativeModule.generate(text, speed); } @@ -132,25 +136,17 @@ export class TextToSpeechModule { phonemes: string, speed: number = 1.0 ): Promise { - if (this.nativeModule == null) - throw new RnExecutorchError( - RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling forwardFromPhonemes().' 
- ); + this.ensureLoaded('forwardFromPhonemes'); return await this.nativeModule.generateFromPhonemes(phonemes, speed); } /** - * Starts a streaming synthesis session. Yields audio chunks as they are generated. - * - * @param input - Input object containing text and optional speed. - * @returns An async generator yielding Float32Array audio chunks. + * Shared streaming implementation. Wraps a native streaming call in an + * async generator that yields Float32Array audio chunks as they arrive. */ - public async *stream({ - text, - speed, - }: TextToSpeechStreamingInput): AsyncGenerator { - // Stores computed audio segments + private async *streamImpl( + nativeCall: (cb: (audio: number[]) => void) => Promise + ): AsyncGenerator { const queue: Float32Array[] = []; let waiter: (() => void) | null = null; @@ -164,7 +160,7 @@ export class TextToSpeechModule { (async () => { try { - await this.nativeModule.stream(text, speed, (audio: number[]) => { + await nativeCall((audio: number[]) => { queue.push(new Float32Array(audio)); wake(); }); @@ -191,6 +187,19 @@ export class TextToSpeechModule { } } + /** + * Starts a streaming synthesis session. Yields audio chunks as they are generated. + * + * @param input - Input object containing text and optional speed. + * @returns An async generator yielding Float32Array audio chunks. + */ + public async *stream({ + text, + speed, + }: TextToSpeechStreamingInput): AsyncGenerator { + yield* this.streamImpl((cb) => this.nativeModule.stream(text, speed, cb)); + } + /** * Starts a streaming synthesis session from pre-computed phonemes. * Bypasses the built-in phonemizer, allowing use of external G2P systems. 
@@ -202,48 +211,9 @@ export class TextToSpeechModule { phonemes, speed, }: TextToSpeechStreamingPhonemeInput): AsyncGenerator { - const queue: Float32Array[] = []; - - let waiter: (() => void) | null = null; - let finished = false; - let error: unknown; - - const wake = () => { - waiter?.(); - waiter = null; - }; - - (async () => { - try { - await this.nativeModule.streamFromPhonemes( - phonemes, - speed, - (audio: number[]) => { - queue.push(new Float32Array(audio)); - wake(); - } - ); - finished = true; - wake(); - } catch (e) { - error = e; - finished = true; - wake(); - } - })(); - - while (true) { - if (queue.length > 0) { - yield queue.shift()!; - if (finished && queue.length === 0) { - return; - } - continue; - } - if (error) throw error; - if (finished) return; - await new Promise((r) => (waiter = r)); - } + yield* this.streamImpl((cb) => + this.nativeModule.streamFromPhonemes(phonemes, speed, cb) + ); } /** diff --git a/packages/react-native-executorch/src/types/tts.ts b/packages/react-native-executorch/src/types/tts.ts index b9c878c3b..4df331494 100644 --- a/packages/react-native-executorch/src/types/tts.ts +++ b/packages/react-native-executorch/src/types/tts.ts @@ -94,7 +94,7 @@ export interface TextToSpeechInput { * Text to Speech module input for pre-computed phonemes. * Use this when you have your own phonemizer (e.g. the Python `phonemizer` * library, espeak-ng, or any custom G2P system) and want to bypass the - * built-in phonemis pipeline. + * built-in phonemizer pipeline. * * @category Types * @property {string} phonemes - pre-computed IPA phoneme string @@ -179,24 +179,33 @@ export interface TextToSpeechType { } /** - * Text to Speech streaming input definition - * - * Streaming mode in T2S is synchronized by passing specific callbacks - * executed at given moments of the streaming. - * Actions such as playing the audio should happen within the onNext callback. - * Callbacks can be both synchronous or asynchronous. 
+ * Shared streaming lifecycle callbacks for TTS streaming modes. * * @category Types * @property {() => void | Promise} [onBegin] - Called when streaming begins * @property {(audio: Float32Array) => void | Promise} [onNext] - Called after each audio chunk gets calculated. * @property {() => void | Promise} [onEnd] - Called when streaming ends */ -export interface TextToSpeechStreamingInput extends TextToSpeechInput { +export interface TextToSpeechStreamingCallbacks { onBegin?: () => void | Promise; onNext?: (audio: Float32Array) => void | Promise; onEnd?: () => void | Promise; } +/** + * Text to Speech streaming input definition + * + * Streaming mode in T2S is synchronized by passing specific callbacks + * executed at given moments of the streaming. + * Actions such as playing the audio should happen within the onNext callback. + * Callbacks can be both synchronous or asynchronous. + * + * @category Types + */ +export interface TextToSpeechStreamingInput + extends TextToSpeechInput, + TextToSpeechStreamingCallbacks {} + /** * Streaming input definition for pre-computed phonemes. * Same as `TextToSpeechStreamingInput` but accepts `phonemes` instead of `text`. 
@@ -204,8 +213,5 @@ export interface TextToSpeechStreamingInput extends TextToSpeechInput { * @category Types */ export interface TextToSpeechStreamingPhonemeInput - extends TextToSpeechPhonemeInput { - onBegin?: () => void | Promise; - onNext?: (audio: Float32Array) => void | Promise; - onEnd?: () => void | Promise; -} + extends TextToSpeechPhonemeInput, + TextToSpeechStreamingCallbacks {} From 377f6596b7a770ae613bd1c434e9feb7442f26a1 Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Mon, 9 Mar 2026 10:11:12 +0100 Subject: [PATCH 3/3] Update docs & formatting --- .cspell-wordlist.txt | 1 + .../useTextToSpeech.md | 61 +++++++++++++++++-- .../TextToSpeechModule.md | 47 ++++++++++++-- .../react-native-executorch/src/types/tts.ts | 6 +- 4 files changed, 101 insertions(+), 14 deletions(-) diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index 2238f7142..6b23cdc46 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -127,3 +127,4 @@ detr metaprogramming ktlint lefthook +espeak \ No newline at end of file diff --git a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md index b52726c9e..10e9986de 100644 --- a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md +++ b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md @@ -82,17 +82,24 @@ You need more details? Check the following resources: ## Running the model -The module provides two ways to generate speech: +The module provides two ways to generate speech using either raw text or pre-generated phonemes: -1. [**`forward(text, speed)`**](../../06-api-reference/interfaces/TextToSpeechType.md#forward): Generates the complete audio waveform at once. Returns a promise resolving to a `Float32Array`. +### Using Text + +1. [**`forward({ text, speed })`**](../../06-api-reference/interfaces/TextToSpeechType.md#forward): Generates the complete audio waveform at once. 
Returns a promise resolving to a `Float32Array`. +2. [**`stream({ text, speed, onNext, ... })`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences. + +### Using Phonemes + +If you have pre-computed phonemes (e.g., from an external dictionary or a custom G2P model), you can skip the internal phoneme generation step: + +1. [**`forwardFromPhonemes({ phonemes, speed })`**](../../06-api-reference/interfaces/TextToSpeechType.md#forwardfromphonemes): Generates the complete audio waveform from a phoneme string. +2. [**`streamFromPhonemes({ phonemes, speed, onNext, ... })`**](../../06-api-reference/interfaces/TextToSpeechType.md#streamfromphonemes): Streams audio chunks generated from a phoneme string. :::note -Since it processes the entire text at once, it might take a significant amount of time to produce an audio for long text inputs. +Since `forward` and `forwardFromPhonemes` process the entire input at once, they might take a significant amount of time to produce audio for long inputs. ::: -2. [**`stream({ text, speed })`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator that yields chunks of audio as they are computed. - This is ideal for reducing the "time to first audio" for long sentences. - ## Example ### Speech Synthesis @@ -185,6 +192,48 @@ export default function App() { } ``` +### Synthesis from Phonemes + +If you already have a phoneme string obtained from an external source (e.g. the Python `phonemizer` library, +`espeak-ng`, or any custom phonemizer), you can use `forwardFromPhonemes` or `streamFromPhonemes` to synthesize audio directly, skipping the phoneme generation stage. 
+ +```tsx +import React from 'react'; +import { Button, View } from 'react-native'; +import { + useTextToSpeech, + KOKORO_MEDIUM, + KOKORO_VOICE_AF_HEART, +} from 'react-native-executorch'; + +export default function App() { + const tts = useTextToSpeech({ + model: KOKORO_MEDIUM, + voice: KOKORO_VOICE_AF_HEART, + }); + + const synthesizePhonemes = async () => { + // Example phonemes for "Hello" + const audioData = await tts.forwardFromPhonemes({ + phonemes: + 'ɐ mˈæn hˌu dˈʌzᵊnt tɹˈʌst hɪmsˈɛlf, kæn nˈɛvəɹ ɹˈiᵊli tɹˈʌst ˈɛniwˌʌn ˈɛls.', + }); + + // ... process or play audioData ... + }; + + return ( + +