diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt
index 2238f7142..6b23cdc46 100644
--- a/.cspell-wordlist.txt
+++ b/.cspell-wordlist.txt
@@ -127,3 +127,4 @@ detr
metaprogramming
ktlint
lefthook
+espeak
\ No newline at end of file
diff --git a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md
index b52726c9e..10e9986de 100644
--- a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md
+++ b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md
@@ -82,17 +82,24 @@ You need more details? Check the following resources:
## Running the model
-The module provides two ways to generate speech:
+The module provides two ways to generate speech using either raw text or pre-generated phonemes:
-1. [**`forward(text, speed)`**](../../06-api-reference/interfaces/TextToSpeechType.md#forward): Generates the complete audio waveform at once. Returns a promise resolving to a `Float32Array`.
+### Using Text
+
+1. [**`forward({ text, speed })`**](../../06-api-reference/interfaces/TextToSpeechType.md#forward): Generates the complete audio waveform at once. Returns a promise resolving to a `Float32Array`.
+2. [**`stream({ text, speed, onNext, ... })`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences.
+
+### Using Phonemes
+
+If you have pre-computed phonemes (e.g., from an external dictionary or a custom G2P model), you can skip the internal phoneme generation step:
+
+1. [**`forwardFromPhonemes({ phonemes, speed })`**](../../06-api-reference/interfaces/TextToSpeechType.md#forwardfromphonemes): Generates the complete audio waveform from a phoneme string.
+2. [**`streamFromPhonemes({ phonemes, speed, onNext, ... })`**](../../06-api-reference/interfaces/TextToSpeechType.md#streamfromphonemes): Streams audio chunks generated from a phoneme string.
:::note
-Since it processes the entire text at once, it might take a significant amount of time to produce an audio for long text inputs.
+Since `forward` and `forwardFromPhonemes` process the entire input at once, they might take a significant amount of time to produce audio for long inputs.
:::
-2. [**`stream({ text, speed })`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator that yields chunks of audio as they are computed.
- This is ideal for reducing the "time to first audio" for long sentences.
-
## Example
### Speech Synthesis
@@ -185,6 +192,48 @@ export default function App() {
}
```
+### Synthesis from Phonemes
+
+If you already have a phoneme string obtained from an external source (e.g. the Python `phonemizer` library,
+`espeak-ng`, or any custom phonemizer), you can use `forwardFromPhonemes` or `streamFromPhonemes` to synthesize audio directly, skipping the phoneme generation stage.
+
+```tsx
+import React from 'react';
+import { Button, View } from 'react-native';
+import {
+ useTextToSpeech,
+ KOKORO_MEDIUM,
+ KOKORO_VOICE_AF_HEART,
+} from 'react-native-executorch';
+
+export default function App() {
+ const tts = useTextToSpeech({
+ model: KOKORO_MEDIUM,
+ voice: KOKORO_VOICE_AF_HEART,
+ });
+
+ const synthesizePhonemes = async () => {
+    // Example phonemes for "A man who doesn't trust himself, can never really trust anyone else."
+ const audioData = await tts.forwardFromPhonemes({
+ phonemes:
+ 'ɐ mˈæn hˌu dˈʌzᵊnt tɹˈʌst hɪmsˈɛlf, kæn nˈɛvəɹ ɹˈiᵊli tɹˈʌst ˈɛniwˌʌn ˈɛls.',
+ });
+
+ // ... process or play audioData ...
+ };
+
+  return (
+    <View>
+      <Button title="Synthesize" onPress={synthesizePhonemes} />
+    </View>
+  );
+}
+```
+
## Supported models
| Model | Language |
diff --git a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md
index bc297ecf4..53bde1685 100644
--- a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md
+++ b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md
@@ -53,16 +53,24 @@ For more information on resource sources, see [loading models](../../01-fundamen
## Running the model
-The module provides two ways to generate speech:
+The module provides two ways to generate speech using either raw text or pre-generated phonemes:
+
+### Using Text
1. [**`forward(text, speed)`**](../../06-api-reference/classes/TextToSpeechModule.md#forward): Generates the complete audio waveform at once. Returns a promise resolving to a `Float32Array`.
+2. [**`stream({ text, speed })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences.
+
+### Using Phonemes
+
+If you have pre-computed phonemes (e.g., from an external dictionary or a custom G2P model), you can skip the internal phoneme generation step:
+
+1. [**`forwardFromPhonemes(phonemes, speed)`**](../../06-api-reference/classes/TextToSpeechModule.md#forwardfromphonemes): Generates the complete audio waveform from a phoneme string.
+2. [**`streamFromPhonemes({ phonemes, speed })`**](../../06-api-reference/classes/TextToSpeechModule.md#streamfromphonemes): Streams audio chunks generated from a phoneme string.
:::note
-Since it processes the entire text at once, it might take a significant amount of time to produce an audio for long text inputs.
+Since `forward` and `forwardFromPhonemes` process the entire input at once, they might take a significant amount of time to produce audio for long inputs.
:::
-2. [**`stream({ text, speed })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences.
-
## Example
### Speech Synthesis
@@ -135,3 +143,34 @@ try {
console.error('Streaming failed:', error);
}
```
+
+### Synthesis from Phonemes
+
+If you already have a phoneme string (e.g., from an external library), you can use `forwardFromPhonemes` or `streamFromPhonemes` to synthesize audio directly, skipping the internal phonemizer stage.
+
+```typescript
+import {
+ TextToSpeechModule,
+ KOKORO_MEDIUM,
+ KOKORO_VOICE_AF_HEART,
+} from 'react-native-executorch';
+
+const tts = new TextToSpeechModule();
+
+await tts.load({
+ model: KOKORO_MEDIUM,
+ voice: KOKORO_VOICE_AF_HEART,
+});
+
+// Example phonemes for "Hello world!"
+const waveform = await tts.forwardFromPhonemes('həlˈO wˈɜɹld!', 1.0);
+
+// Or stream from phonemes
+for await (const chunk of tts.streamFromPhonemes({
+ phonemes:
+ 'ɐ mˈæn hˌu dˈʌzᵊnt tɹˈʌst hɪmsˈɛlf, kæn nˈɛvəɹ ɹˈiᵊli tɹˈʌst ˈɛniwˌʌn ˈɛls.',
+ speed: 1.0,
+})) {
+ // ... process chunk ...
+}
+```
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index d6489c9be..7ece18a93 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -169,6 +169,14 @@ template class ModelHostObject : public JsiHostObject {
addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject,
promiseHostFunction<&Model::stream>,
"stream"));
+ addFunctions(JSI_EXPORT_FUNCTION(
+ ModelHostObject,
+ promiseHostFunction<&Model::generateFromPhonemes>,
+ "generateFromPhonemes"));
+ addFunctions(JSI_EXPORT_FUNCTION(
+ ModelHostObject,
+ promiseHostFunction<&Model::streamFromPhonemes>,
+ "streamFromPhonemes"));
}
if constexpr (meta::HasGenerateFromString<Model>) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
index d73fb6205..52da0fc46 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
@@ -4,6 +4,7 @@
#include
#include
+#include
#include
#include
@@ -73,16 +74,9 @@ void Kokoro::loadVoice(const std::string &voiceSource) {
}
}
-std::vector<float> Kokoro::generate(std::string text, float speed) {
- if (text.size() > params::kMaxTextSize) {
- throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
- "Kokoro: maximum input text size exceeded");
- }
-
- // G2P (Grapheme to Phoneme) conversion
- auto phonemes = phonemizer_.process(text);
-
- // Divide the phonemes string intro substrings.
+std::vector<float>
+Kokoro::generateFromPhonemesImpl(const std::u32string &phonemes, float speed) {
+ // Divide the phonemes string into substrings.
// Affects the further calculations only in case of string size
// exceeding the biggest model's input.
auto subsentences =
@@ -98,26 +92,20 @@ std::vector Kokoro::generate(std::string text, float speed) {
size_t pauseMs = params::kPauseValues.contains(lastPhoneme)
? params::kPauseValues.at(lastPhoneme)
: params::kDefaultPause;
-  std::vector<float> pause(pauseMs * constants::kSamplesPerMilisecond, 0.F);
- // Add audio part and pause to the main audio vector
+ // Add audio part and silence pause to the main audio vector
audio.insert(audio.end(), std::make_move_iterator(audioPart.begin()),
std::make_move_iterator(audioPart.end()));
- audio.insert(audio.end(), std::make_move_iterator(pause.begin()),
- std::make_move_iterator(pause.end()));
+ audio.resize(audio.size() + pauseMs * constants::kSamplesPerMilisecond,
+ 0.F);
}
return audio;
}
-void Kokoro::stream(std::string text, float speed,
- std::shared_ptr callback) {
- if (text.size() > params::kMaxTextSize) {
- throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
- "Kokoro: maximum input text size exceeded");
- }
-
- // Build a full callback function
+void Kokoro::streamFromPhonemesImpl(
+ const std::u32string &phonemes, float speed,
+ std::shared_ptr callback) {
  auto nativeCallback = [this, callback](const std::vector<float> &audioVec) {
if (this->isStreaming_) {
this->callInvoker_->invokeAsync([callback, audioVec](jsi::Runtime &rt) {
@@ -127,21 +115,12 @@ void Kokoro::stream(std::string text, float speed,
}
};
- // Mark the beginning of the streaming process
isStreaming_ = true;
- // G2P (Grapheme to Phoneme) conversion
- auto phonemes = phonemizer_.process(text);
-
- // Divide the phonemes string intro substrings.
- // Use specialized implementation to minimize the latency between the
- // sentences.
+ // Use LATENCY strategy to minimize the time-to-first-audio for streaming
auto subsentences =
partitioner_.divide(phonemes);
- // We follow the implementation of generate() method, but
- // instead of accumulating results in a vector, we push them
- // back to the JS side with the callback.
for (size_t i = 0; i < subsentences.size(); i++) {
if (!isStreaming_) {
break;
@@ -151,7 +130,7 @@ void Kokoro::stream(std::string text, float speed,
// Determine the silent padding duration to be stripped from the edges of
// the generated audio. If a chunk ends with a space or follows one that
- // did, it indicates a word boundary split – we use a shorter padding (20ms)
+ // did, it indicates a word boundary split – we use a shorter padding
// to ensure natural speech flow. Otherwise, we use 50ms for standard
// pauses.
bool endsWithSpace = (subsentence.back() == U' ');
@@ -161,25 +140,67 @@ void Kokoro::stream(std::string text, float speed,
// Generate an audio vector with the Kokoro model
auto audioPart = synthesize(subsentence, speed, paddingMs);
- // Calculate a pause between the sentences
+ // Calculate and append a pause between the sentences
char32_t lastPhoneme = subsentence.back();
size_t pauseMs = params::kPauseValues.contains(lastPhoneme)
? params::kPauseValues.at(lastPhoneme)
: params::kDefaultPause;
-    std::vector<float> pause(pauseMs * constants::kSamplesPerMilisecond, 0.F);
-
- // Add pause to the audio vector
- audioPart.insert(audioPart.end(), std::make_move_iterator(pause.begin()),
- std::make_move_iterator(pause.end()));
+ audioPart.resize(
+ audioPart.size() + pauseMs * constants::kSamplesPerMilisecond, 0.F);
// Push the audio right away to the JS side
nativeCallback(audioPart);
}
- // Mark the end of the streaming process
isStreaming_ = false;
}
+std::vector<float> Kokoro::generate(std::string text, float speed) {
+ if (text.size() > params::kMaxTextSize) {
+ throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
+ "Kokoro: maximum input text size exceeded");
+ }
+
+ // G2P (Grapheme to Phoneme) conversion
+ auto phonemes = phonemizer_.process(text);
+
+ return generateFromPhonemesImpl(phonemes, speed);
+}
+
+std::vector<float> Kokoro::generateFromPhonemes(std::string phonemes,
+                                                float speed) {
+ if (phonemes.empty()) {
+ throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
+ "Kokoro: phoneme string must not be empty");
+ }
+ return generateFromPhonemesImpl(
+ phonemis::utilities::string_utils::utf8_to_u32string(phonemes), speed);
+}
+
+void Kokoro::stream(std::string text, float speed,
+ std::shared_ptr callback) {
+ if (text.size() > params::kMaxTextSize) {
+ throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
+ "Kokoro: maximum input text size exceeded");
+ }
+
+ // G2P (Grapheme to Phoneme) conversion
+ auto phonemes = phonemizer_.process(text);
+
+ streamFromPhonemesImpl(phonemes, speed, callback);
+}
+
+void Kokoro::streamFromPhonemes(std::string phonemes, float speed,
+ std::shared_ptr callback) {
+ if (phonemes.empty()) {
+ throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
+ "Kokoro: phoneme string must not be empty");
+ }
+ streamFromPhonemesImpl(
+ phonemis::utilities::string_utils::utf8_to_u32string(phonemes), speed,
+ callback);
+}
+
void Kokoro::streamStop() noexcept { isStreaming_ = false; }
std::vector<float> Kokoro::synthesize(const std::u32string &phonemes,
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h
index f27ba8018..d7a4c2ae6 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h
@@ -27,11 +27,22 @@ class Kokoro {
// Processes the entire text at once, before sending back to the JS side.
std::vector<float> generate(std::string text, float speed = 1.F);
+ // Accepts pre-computed phonemes (as a UTF-8 IPA string) and synthesizes
+ // audio, bypassing the built-in phonemizer. This allows callers to use
+ // an external G2P system (e.g. the Python `phonemizer` library, espeak-ng,
+ // or any custom phonemizer).
+  std::vector<float> generateFromPhonemes(std::string phonemes,
+                                          float speed = 1.F);
+
// Processes text in chunks, sending each chunk individualy to the JS side
// with asynchronous callbacks.
void stream(std::string text, float speed,
std::shared_ptr callback);
+ // Streaming variant that accepts pre-computed phonemes instead of text.
+ void streamFromPhonemes(std::string phonemes, float speed,
+ std::shared_ptr callback);
+
// Stops the streaming process
void streamStop() noexcept;
@@ -42,6 +53,12 @@ class Kokoro {
// Helper function - loading voice array
void loadVoice(const std::string &voiceSource);
+ // Helper function - shared synthesis pipeline (partition + synthesize)
+  std::vector<float> generateFromPhonemesImpl(const std::u32string &phonemes,
+                                              float speed);
+ void streamFromPhonemesImpl(const std::u32string &phonemes, float speed,
+ std::shared_ptr callback);
+
// Helper function - generate specialization for given input size
std::vector<float> synthesize(const std::u32string &phonemes, float speed,
size_t paddingMs = 50);
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts
index b29b4bc8d..1a751f42d 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts
@@ -3,8 +3,11 @@ import { TextToSpeechModule } from '../../modules/natural_language_processing/Te
import {
TextToSpeechProps,
TextToSpeechInput,
+ TextToSpeechPhonemeInput,
TextToSpeechType,
+ TextToSpeechStreamingCallbacks,
TextToSpeechStreamingInput,
+ TextToSpeechStreamingPhonemeInput,
} from '../../types/tts';
import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
@@ -62,17 +65,47 @@ export const useTextToSpeech = ({
preventLoad,
]);
- const forward = async (input: TextToSpeechInput) => {
+ // Shared guard for all generation methods
+ const guardReady = (methodName: string) => {
if (!isReady)
throw new RnExecutorchError(
RnExecutorchErrorCode.ModuleNotLoaded,
- 'The model is currently not loaded. Please load the model before calling forward().'
+ `The model is currently not loaded. Please load the model before calling ${methodName}().`
);
if (isGenerating)
throw new RnExecutorchError(
RnExecutorchErrorCode.ModelGenerating,
'The model is currently generating. Please wait until previous model run is complete.'
);
+ };
+
+ // Shared streaming orchestration (guards + onBegin/onNext/onEnd lifecycle)
+ const runStream = useCallback(
+ async (
+ methodName: string,
+      generator: AsyncGenerator<Float32Array>,
+ callbacks: TextToSpeechStreamingCallbacks
+ ) => {
+ guardReady(methodName);
+ setIsGenerating(true);
+ try {
+ await callbacks.onBegin?.();
+ for await (const audio of generator) {
+ if (callbacks.onNext) {
+ await callbacks.onNext(audio);
+ }
+ }
+ } finally {
+ await callbacks.onEnd?.();
+ setIsGenerating(false);
+ }
+ },
+ // eslint-disable-next-line react-hooks/exhaustive-deps
+ [isReady, isGenerating, moduleInstance]
+ );
+
+ const forward = async (input: TextToSpeechInput) => {
+ guardReady('forward');
try {
setIsGenerating(true);
return await moduleInstance.forward(input.text, input.speed ?? 1.0);
@@ -81,35 +114,42 @@ export const useTextToSpeech = ({
}
};
+ const forwardFromPhonemes = async (input: TextToSpeechPhonemeInput) => {
+ guardReady('forwardFromPhonemes');
+ try {
+ setIsGenerating(true);
+ return await moduleInstance.forwardFromPhonemes(
+ input.phonemes,
+ input.speed ?? 1.0
+ );
+ } finally {
+ setIsGenerating(false);
+ }
+ };
+
const stream = useCallback(
async (input: TextToSpeechStreamingInput) => {
- if (!isReady)
- throw new RnExecutorchError(
- RnExecutorchErrorCode.ModuleNotLoaded,
- 'The model is currently not loaded. Please load the model before calling stream().'
- );
- if (isGenerating)
- throw new RnExecutorchError(
- RnExecutorchErrorCode.ModelGenerating,
- 'The model is currently generating. Please wait until previous model run is complete.'
- );
- setIsGenerating(true);
- try {
- await input.onBegin?.();
- for await (const audio of moduleInstance.stream({
- text: input.text,
+ await runStream(
+ 'stream',
+ moduleInstance.stream({ text: input.text, speed: input.speed ?? 1.0 }),
+ input
+ );
+ },
+ [runStream, moduleInstance]
+ );
+
+ const streamFromPhonemes = useCallback(
+ async (input: TextToSpeechStreamingPhonemeInput) => {
+ await runStream(
+ 'streamFromPhonemes',
+ moduleInstance.streamFromPhonemes({
+ phonemes: input.phonemes,
speed: input.speed ?? 1.0,
- })) {
- if (input.onNext) {
- await input.onNext(audio);
- }
- }
- } finally {
- await input.onEnd?.();
- setIsGenerating(false);
- }
+ }),
+ input
+ );
},
- [isReady, isGenerating, moduleInstance]
+ [runStream, moduleInstance]
);
return {
@@ -117,7 +157,9 @@ export const useTextToSpeech = ({
isReady,
isGenerating,
forward,
+ forwardFromPhonemes,
stream,
+ streamFromPhonemes,
streamStop: moduleInstance.streamStop,
downloadProgress,
};
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts
index 849c25676..932f166e7 100644
--- a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts
+++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts
@@ -5,6 +5,7 @@ import {
KokoroConfig,
TextToSpeechConfig,
TextToSpeechStreamingInput,
+ TextToSpeechStreamingPhonemeInput,
VoiceConfig,
} from '../../types/tts';
import { Logger } from '../../common/Logger';
@@ -98,6 +99,14 @@ export class TextToSpeechModule {
}
}
+ private ensureLoaded(methodName: string): void {
+ if (this.nativeModule == null)
+ throw new RnExecutorchError(
+ RnExecutorchErrorCode.ModuleNotLoaded,
+ `The model is currently not loaded. Please load the model before calling ${methodName}().`
+ );
+ }
+
/**
* Synthesizes the provided text into speech.
* Returns a promise that resolves to the full audio waveform as a `Float32Array`.
@@ -110,25 +119,34 @@ export class TextToSpeechModule {
text: string,
speed: number = 1.0
  ): Promise<Float32Array> {
- if (this.nativeModule == null)
- throw new RnExecutorchError(
- RnExecutorchErrorCode.ModuleNotLoaded,
- 'The model is currently not loaded. Please load the model before calling forward().'
- );
+ this.ensureLoaded('forward');
return await this.nativeModule.generate(text, speed);
}
/**
- * Starts a streaming synthesis session. Yields audio chunks as they are generated.
+ * Synthesizes pre-computed phonemes into speech, bypassing the built-in phonemizer.
+ * This allows using an external G2P system (e.g. the Python `phonemizer` library,
+ * espeak-ng, or any custom phonemizer).
*
- * @param input - Input object containing text and optional speed.
- * @returns An async generator yielding Float32Array audio chunks.
+ * @param phonemes The pre-computed IPA phoneme string.
+ * @param speed Optional speed multiplier for the speech synthesis (default is 1.0).
+ * @returns A promise resolving to the synthesized audio waveform.
*/
- public async *stream({
- text,
- speed,
-  }: TextToSpeechStreamingInput): AsyncGenerator<Float32Array> {
- // Stores computed audio segments
+ public async forwardFromPhonemes(
+ phonemes: string,
+ speed: number = 1.0
+  ): Promise<Float32Array> {
+ this.ensureLoaded('forwardFromPhonemes');
+ return await this.nativeModule.generateFromPhonemes(phonemes, speed);
+ }
+
+ /**
+ * Shared streaming implementation. Wraps a native streaming call in an
+ * async generator that yields Float32Array audio chunks as they arrive.
+ */
+ private async *streamImpl(
+    nativeCall: (cb: (audio: number[]) => void) => Promise<void>
+  ): AsyncGenerator<Float32Array> {
const queue: Float32Array[] = [];
let waiter: (() => void) | null = null;
@@ -142,7 +160,7 @@ export class TextToSpeechModule {
(async () => {
try {
- await this.nativeModule.stream(text, speed, (audio: number[]) => {
+ await nativeCall((audio: number[]) => {
queue.push(new Float32Array(audio));
wake();
});
@@ -169,6 +187,35 @@ export class TextToSpeechModule {
}
}
+ /**
+ * Starts a streaming synthesis session. Yields audio chunks as they are generated.
+ *
+ * @param input - Input object containing text and optional speed.
+ * @returns An async generator yielding Float32Array audio chunks.
+ */
+ public async *stream({
+ text,
+ speed,
+  }: TextToSpeechStreamingInput): AsyncGenerator<Float32Array> {
+ yield* this.streamImpl((cb) => this.nativeModule.stream(text, speed, cb));
+ }
+
+ /**
+ * Starts a streaming synthesis session from pre-computed phonemes.
+ * Bypasses the built-in phonemizer, allowing use of external G2P systems.
+ *
+ * @param input - Input object containing phonemes and optional speed.
+ * @returns An async generator yielding Float32Array audio chunks.
+ */
+ public async *streamFromPhonemes({
+ phonemes,
+ speed,
+  }: TextToSpeechStreamingPhonemeInput): AsyncGenerator<Float32Array> {
+ yield* this.streamImpl((cb) =>
+ this.nativeModule.streamFromPhonemes(phonemes, speed, cb)
+ );
+ }
+
/**
* Stops the streaming process if there is any ongoing.
*/
diff --git a/packages/react-native-executorch/src/types/tts.ts b/packages/react-native-executorch/src/types/tts.ts
index 55937be49..ebc4b065a 100644
--- a/packages/react-native-executorch/src/types/tts.ts
+++ b/packages/react-native-executorch/src/types/tts.ts
@@ -90,6 +90,21 @@ export interface TextToSpeechInput {
speed?: number;
}
+/**
+ * Text to Speech module input for pre-computed phonemes.
+ * Use this when you have your own phonemizer (e.g. the Python `phonemizer`
+ * library, espeak-ng, or any custom G2P system) and want to bypass the
+ * built-in phonemizer pipeline.
+ *
+ * @category Types
+ * @property {string} phonemes - pre-computed IPA phoneme string
+ * @property {number} [speed] - optional speed argument - the higher it is, the faster the speech becomes
+ */
+export interface TextToSpeechPhonemeInput {
+ phonemes: string;
+ speed?: number;
+}
+
/**
* Return type for the `useTextToSpeech` hook.
* Manages the state and operations for Text-to-Speech generation.
@@ -125,6 +140,18 @@ export interface TextToSpeechType {
*/
  forward: (input: TextToSpeechInput) => Promise<Float32Array>;
+ /**
+ * Synthesizes pre-computed phonemes into speech audio in a single pass.
+ * Bypasses the built-in phonemizer, allowing use of external G2P systems.
+ *
+ * @param input - The `TextToSpeechPhonemeInput` object containing pre-computed `phonemes` and optional `speed`.
+ * @returns A Promise that resolves with the generated audio data.
+ * @throws {RnExecutorchError} If the model is not loaded or is currently generating.
+ */
+ forwardFromPhonemes: (
+ input: TextToSpeechPhonemeInput
+  ) => Promise<Float32Array>;
+
/**
* Streams the generated audio data incrementally.
* This is optimal for real-time playback, allowing audio to start playing before the full text is synthesized.
@@ -134,6 +161,17 @@ export interface TextToSpeechType {
*/
  stream: (input: TextToSpeechStreamingInput) => Promise<void>;
+ /**
+ * Streams pre-computed phonemes incrementally, bypassing the built-in phonemizer.
+ *
+ * @param input - The streaming input with pre-computed `phonemes` instead of `text`.
+ * @returns A Promise that resolves when the streaming process is complete.
+ * @throws {RnExecutorchError} If the model is not loaded or is currently generating.
+ */
+ streamFromPhonemes: (
+ input: TextToSpeechStreamingPhonemeInput
+  ) => Promise<void>;
+
/**
* Interrupts and stops the currently active audio generation stream.
*/
@@ -141,20 +179,37 @@ export interface TextToSpeechType {
}
/**
- * Text to Speech streaming input definition
- *
- * Streaming mode in T2S is synchronized by passing specific callbacks
- * executed at given moments of the streaming.
- * Actions such as playing the audio should happen within the onNext callback.
- * Callbacks can be both synchronous or asynchronous.
+ * Shared streaming lifecycle callbacks for TTS streaming modes.
*
* @category Types
 * @property {() => void | Promise<void>} [onBegin] - Called when streaming begins
 * @property {(audio: Float32Array) => void | Promise<void>} [onNext] - Called after each audio chunk gets calculated.
 * @property {() => void | Promise<void>} [onEnd] - Called when streaming ends
*/
-export interface TextToSpeechStreamingInput extends TextToSpeechInput {
+export interface TextToSpeechStreamingCallbacks {
  onBegin?: () => void | Promise<void>;
  onNext?: (audio: Float32Array) => void | Promise<void>;
  onEnd?: () => void | Promise<void>;
}
+
+/**
+ * Text to Speech streaming input definition
+ *
+ * Streaming mode in T2S is synchronized by passing specific callbacks
+ * executed at given moments of the streaming.
+ * Actions such as playing the audio should happen within the onNext callback.
+ * Callbacks can be both synchronous or asynchronous.
+ *
+ * @category Types
+ */
+export interface TextToSpeechStreamingInput
+ extends TextToSpeechInput, TextToSpeechStreamingCallbacks {}
+
+/**
+ * Streaming input definition for pre-computed phonemes.
+ * Same as `TextToSpeechStreamingInput` but accepts `phonemes` instead of `text`.
+ *
+ * @category Types
+ */
+export interface TextToSpeechStreamingPhonemeInput
+ extends TextToSpeechPhonemeInput, TextToSpeechStreamingCallbacks {}