diff --git a/apps/speech/App.tsx b/apps/speech/App.tsx
index ab036678e..ddbfe3c98 100644
--- a/apps/speech/App.tsx
+++ b/apps/speech/App.tsx
@@ -5,6 +5,7 @@ import { SpeechToTextScreen } from './screens/SpeechToTextScreen';
import ColorPalette from './colors';
import ExecutorchLogo from './assets/executorch.svg';
import { Quiz } from './screens/Quiz';
+import { TextToSpeechLLMScreen } from './screens/TextToSpeechLLMScreen';
import { initExecutorch } from 'react-native-executorch';
import { ExpoResourceFetcher } from '@react-native-executorch/expo-resource-fetcher';
@@ -14,7 +15,7 @@ initExecutorch({
export default function App() {
const [currentScreen, setCurrentScreen] = useState<
- 'menu' | 'speech-to-text' | 'text-to-speech' | 'quiz'
+ 'menu' | 'speech-to-text' | 'text-to-speech' | 'quiz' | 'text-to-speech-llm'
>('menu');
const goToMenu = () => setCurrentScreen('menu');
@@ -31,6 +32,10 @@ export default function App() {
return ;
}
+ if (currentScreen === 'text-to-speech-llm') {
+ return ;
+ }
+
return (
@@ -54,6 +59,12 @@ export default function App() {
>
Text to Speech - Quiz
+ setCurrentScreen('text-to-speech-llm')}
+ >
+ Text to Speech - LLM Streaming
+
);
diff --git a/apps/speech/screens/TextToSpeechLLMScreen.tsx b/apps/speech/screens/TextToSpeechLLMScreen.tsx
new file mode 100644
index 000000000..73df0f299
--- /dev/null
+++ b/apps/speech/screens/TextToSpeechLLMScreen.tsx
@@ -0,0 +1,324 @@
+import React, { useEffect, useState, useRef } from 'react';
+import {
+ View,
+ Text,
+ StyleSheet,
+ TouchableOpacity,
+ ScrollView,
+} from 'react-native';
+import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
+import FontAwesome from '@expo/vector-icons/FontAwesome';
+import SWMIcon from '../assets/swm_icon.svg';
+import {
+ useLLM,
+ useTextToSpeech,
+ KOKORO_MEDIUM,
+ KOKORO_VOICE_AF_HEART,
+ LLAMA3_2_1B_QLORA,
+} from 'react-native-executorch';
+import {
+ AudioManager,
+ AudioContext,
+ AudioBuffer,
+ AudioBufferSourceNode,
+} from 'react-native-audio-api';
+
+interface TextToSpeechLLMProps {
+ onBack: () => void;
+}
+
+/**
+ * Converts an audio vector (Float32Array) to an AudioBuffer for playback
+ * @param audioVector - The generated audio samples from the model
+ * @param sampleRate - The sample rate (default: 24000 Hz for Kokoro)
+ * @returns AudioBuffer ready for playback
+ */
+const createAudioBufferFromVector = (
+ audioVector: Float32Array,
+ audioContext: AudioContext,
+ sampleRate: number = 24000
+): AudioBuffer => {
+ const audioBuffer = audioContext.createBuffer(
+ 1,
+ audioVector.length,
+ sampleRate
+ );
+ const channelData = audioBuffer.getChannelData(0);
+ channelData.set(audioVector);
+
+ return audioBuffer;
+};
+
+export const TextToSpeechLLMScreen = ({ onBack }: TextToSpeechLLMProps) => {
+ const [displayText, setDisplayText] = useState('');
+ const [isTtsStreaming, setIsTtsStreaming] = useState(false);
+ const llm = useLLM({ model: LLAMA3_2_1B_QLORA });
+ const tts = useTextToSpeech({
+ model: KOKORO_MEDIUM,
+ voice: KOKORO_VOICE_AF_HEART,
+ });
+
+ const processedLengthRef = useRef(0);
+ const audioContextRef = useRef(null);
+ const sourceRef = useRef(null);
+
+ useEffect(() => {
+ AudioManager.setAudioSessionOptions({
+ iosCategory: 'playAndRecord',
+ iosMode: 'spokenAudio',
+ iosOptions: ['defaultToSpeaker'],
+ });
+
+ audioContextRef.current = new AudioContext({ sampleRate: 24000 });
+ audioContextRef.current.suspend();
+
+ return () => {
+ audioContextRef.current?.close();
+ audioContextRef.current = null;
+ };
+ }, []);
+
+ // Update displayText gradually as response gets generated and insert new text chunks into TTS stream
+ useEffect(() => {
+ if (llm.response && tts.isReady) {
+ setDisplayText(llm.response);
+
+ const previousLength = processedLengthRef.current;
+ if (llm.response.length > previousLength) {
+ const newChunk = llm.response.slice(previousLength);
+ tts.streamInsert(newChunk);
+ processedLengthRef.current = llm.response.length;
+ }
+ } else {
+ processedLengthRef.current = 0;
+ }
+ }, [llm.response, tts]);
+
+ const handleGenerate = async () => {
+ setDisplayText('');
+ processedLengthRef.current = 0;
+ setIsTtsStreaming(true);
+
+ const startTTS = async () => {
+ try {
+ const audioContext = audioContextRef.current;
+ if (!audioContext) return;
+
+ if (audioContext.state === 'suspended') {
+ await audioContext.resume();
+ }
+
+ const onNext = async (audioVec: Float32Array) => {
+ return new Promise((resolve) => {
+ const audioBuffer = createAudioBufferFromVector(
+ audioVec,
+ audioContext,
+ 24000
+ );
+
+ const source = (sourceRef.current =
+ audioContext.createBufferSource());
+ source.buffer = audioBuffer;
+ source.connect(audioContext.destination);
+
+ source.onEnded = () => resolve();
+
+ source.start();
+ });
+ };
+
+ await tts.stream({
+ text: '',
+ speed: 0.9,
+ stopAutomatically: false,
+ onNext,
+ });
+ } catch (e) {
+ console.error('TTS streaming error:', e);
+ } finally {
+ setIsTtsStreaming(false);
+ }
+ };
+
+ const ttsPromise = startTTS();
+
+ try {
+ await llm.sendMessage(
+ 'Generate a short story about a robot learning to paint. The story should be around 200 words long.'
+ );
+ } catch (e) {
+ console.error('Generation failed:', e);
+ } finally {
+ tts.streamStop(false);
+ await ttsPromise;
+
+ if (
+ audioContextRef.current &&
+ audioContextRef.current.state === 'running'
+ ) {
+ await audioContextRef.current.suspend();
+ }
+ }
+ };
+
+ const handleStop = () => {
+ llm.interrupt();
+ tts.streamStop(true);
+ if (sourceRef.current) {
+ try {
+ sourceRef.current.stop();
+ } catch (e) {
+ // Source might have already stopped or disconnected
+ }
+ }
+ };
+
+ const isProcessing = llm.isGenerating || isTtsStreaming;
+ const isModelsReady = llm.isReady && tts.isReady;
+
+ const getModelStatus = () => {
+ if (llm.error) return `LLM Error: ${llm.error.message}`;
+ if (tts.error) return `TTS Error: ${tts.error.message}`;
+ if (!llm.isReady)
+ return `Loading LLM: ${(100 * llm.downloadProgress).toFixed(2)}%`;
+ if (!tts.isReady)
+ return `Loading TTS: ${(100 * tts.downloadProgress).toFixed(2)}%`;
+ if (isProcessing) return 'Generating/Streaming...';
+ return 'Ready';
+ };
+
+ return (
+
+
+
+
+
+
+
+ React Native ExecuTorch
+ LLM to Speech Demo
+
+
+
+ Status: {getModelStatus()}
+
+
+
+ Generated Story
+
+
+
+ {displayText ||
+ (isModelsReady
+ ? 'Press the button to generate a story and hear it spoken aloud.'
+ : 'Please wait for models to load...')}
+
+
+
+
+
+
+ {isProcessing ? (
+
+
+ Stop Generation
+
+ ) : (
+
+
+ Generate & Stream Speech
+
+ )}
+
+
+
+ );
+};
+
+const styles = StyleSheet.create({
+ container: {
+ flex: 1,
+ alignItems: 'center',
+ backgroundColor: 'white',
+ paddingHorizontal: 16,
+ },
+ header: {
+ alignItems: 'center',
+ position: 'relative',
+ width: '100%',
+ },
+ backButton: {
+ position: 'absolute',
+ left: 0,
+ top: 10,
+ padding: 10,
+ zIndex: 1,
+ },
+ headerText: {
+ fontSize: 22,
+ fontWeight: 'bold',
+ color: '#0f186e',
+ },
+ statusContainer: {
+ marginTop: 12,
+ alignItems: 'center',
+ },
+ contentContainer: {
+ width: '100%',
+ marginTop: 24,
+ flex: 1,
+ marginBottom: 24,
+ },
+ label: {
+ marginLeft: 12,
+ marginBottom: 4,
+ color: '#0f186e',
+ fontWeight: '600',
+ },
+ responseContainer: {
+ borderRadius: 12,
+ borderWidth: 1,
+ borderColor: '#0f186e',
+ flex: 1,
+ },
+ responseContent: {
+ padding: 12,
+ },
+ responseText: {
+ fontSize: 16,
+ color: '#333',
+ lineHeight: 24,
+ },
+ buttonContainer: {
+ marginBottom: 24,
+ width: '100%',
+ },
+ actionButton: {
+ backgroundColor: '#0f186e',
+ flexDirection: 'row',
+ justifyContent: 'center',
+ alignItems: 'center',
+ padding: 12,
+ borderRadius: 12,
+ gap: 8,
+ },
+ stopButton: {
+ backgroundColor: '#ff4444',
+ },
+ buttonText: {
+ color: 'white',
+ fontWeight: '600',
+ letterSpacing: -0.5,
+ fontSize: 16,
+ },
+ disabled: {
+ opacity: 0.5,
+ },
+});
diff --git a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md
index b52726c9e..8bd1d07e2 100644
--- a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md
+++ b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md
@@ -90,8 +90,8 @@ The module provides two ways to generate speech:
Since it processes the entire text at once, it might take a significant amount of time to produce an audio for long text inputs.
:::
-2. [**`stream({ text, speed })`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator that yields chunks of audio as they are computed.
- This is ideal for reducing the "time to first audio" for long sentences.
+2. [**`stream(input)`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator-like API (managed via callbacks such as `onNext`) that yields chunks of audio as they are computed.
+ This is ideal for reducing the "time to first audio" for long sentences. You can also dynamically insert text during the generation process using `streamInsert(text)` and stop it with `streamStop(instant)`.
## Example
@@ -160,8 +160,11 @@ export default function App() {
const generateStream = async () => {
const ctx = contextRef.current;
+ // Instead of using streamInsert() directly, we can pass initial text to the stream() method
await tts.stream({
- text: "This is a longer text, which is being streamed chunk by chunk. Let's see how it works!",
+ text: "This is an initial text, which is being streamed chunk by chunk. Let's see how it works!",
+ onBegin: async () => console.log('Started streaming'),
+ onEnd: async () => console.log('Finished streaming'),
onNext: async (chunk) => {
return new Promise((resolve) => {
const buffer = ctx.createBuffer(1, chunk.length, 24000);
diff --git a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md
index bc297ecf4..5f2a66617 100644
--- a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md
+++ b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md
@@ -61,7 +61,7 @@ The module provides two ways to generate speech:
Since it processes the entire text at once, it might take a significant amount of time to produce an audio for long text inputs.
:::
-2. [**`stream({ text, speed })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences.
+2. [**`stream({ speed, stopAutomatically })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences. In contrast to `forward`, it enables inserting text chunks dynamically into the processing buffer with [**`streamInsert(text)`**](../../06-api-reference/classes/TextToSpeechModule.md#streaminsert) and allows stopping generation early with [**`streamStop(instant)`**](../../06-api-reference/classes/TextToSpeechModule.md#streamstop).
## Example
@@ -115,9 +115,12 @@ const audioContext = new AudioContext({ sampleRate: 24000 });
await tts.load({ model: KOKORO_MEDIUM, voice: KOKORO_VOICE_AF_HEART });
try {
+ // Pre-load the first chunk of text to the buffer
+ tts.streamInsert('This is a streaming test, with a sample input.');
+
for await (const chunk of tts.stream({
- text: 'This is a streaming test, with a sample input.',
speed: 1.0,
+ stopAutomatically: true, // Will stop the stream automatically after clearing the input buffer
})) {
// Play each chunk sequentially
await new Promise((resolve) => {
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index d6489c9be..9688eef15 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -169,6 +169,12 @@ template class ModelHostObject : public JsiHostObject {
addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject,
promiseHostFunction<&Model::stream>,
"stream"));
+ addFunctions(JSI_EXPORT_FUNCTION(
+ ModelHostObject, synchronousHostFunction<&Model::streamInsert>,
+ "streamInsert"));
+ addFunctions(JSI_EXPORT_FUNCTION(
+ ModelHostObject, synchronousHostFunction<&Model::streamStop>,
+ "streamStop"));
}
if constexpr (meta::HasGenerateFromString) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h
index 3bc7f7f83..050fb902c 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h
@@ -34,6 +34,10 @@ inline constexpr int32_t kSamplingRate =
24000; // Corresponds to Kokoro's model audio frequency
inline constexpr int32_t kSamplesPerMilisecond = kSamplingRate / 1000;
+// Special text characters
+inline const std::unordered_set kEndOfSentenceCharacters = {'.', '?', '!',
+ ';'};
+
// Special phonemes
inline const std::unordered_set kEndOfSentencePhonemes = {
U'.', U'?', U'!', U';', U'…'};
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
index d73fb6205..7ead055f9 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
@@ -6,6 +6,9 @@
#include
#include
#include
+#include
+
+#include
namespace rnexecutorch::models::text_to_speech::kokoro {
@@ -110,13 +113,8 @@ std::vector Kokoro::generate(std::string text, float speed) {
return audio;
}
-void Kokoro::stream(std::string text, float speed,
+void Kokoro::stream(float speed, bool stopOnEmptyBuffer,
std::shared_ptr callback) {
- if (text.size() > params::kMaxTextSize) {
- throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
- "Kokoro: maximum input text size exceeded");
- }
-
// Build a full callback function
auto nativeCallback = [this, callback](const std::vector &audioVec) {
if (this->isStreaming_) {
@@ -127,60 +125,111 @@ void Kokoro::stream(std::string text, float speed,
}
};
- // Mark the beginning of the streaming process
isStreaming_ = true;
+ stopOnEmptyBuffer_ = stopOnEmptyBuffer;
- // G2P (Grapheme to Phoneme) conversion
- auto phonemes = phonemizer_.process(text);
-
- // Divide the phonemes string intro substrings.
- // Use specialized implementation to minimize the latency between the
- // sentences.
- auto subsentences =
- partitioner_.divide(phonemes);
-
- // We follow the implementation of generate() method, but
- // instead of accumulating results in a vector, we push them
- // back to the JS side with the callback.
- for (size_t i = 0; i < subsentences.size(); i++) {
- if (!isStreaming_) {
+ // The outer streaming loop is responsible for handling the input buffer.
+ // The extracted text is then passed to the inner loop, which performs a
+ // standard streaming on a fixed amount of input text.
+ while (isStreaming_) {
+ if (inputTextBuffer_.empty() && stopOnEmptyBuffer_) {
break;
}
- const auto &subsentence = subsentences[i];
-
- // Determine the silent padding duration to be stripped from the edges of
- // the generated audio. If a chunk ends with a space or follows one that
- // did, it indicates a word boundary split – we use a shorter padding (20ms)
- // to ensure natural speech flow. Otherwise, we use 50ms for standard
- // pauses.
- bool endsWithSpace = (subsentence.back() == U' ');
- bool prevEndsWithSpace = (i > 0 && subsentences[i - 1].back() == U' ');
- size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 15 : 50; // [ms]
-
- // Generate an audio vector with the Kokoro model
- auto audioPart = synthesize(subsentence, speed, paddingMs);
-
- // Calculate a pause between the sentences
- char32_t lastPhoneme = subsentence.back();
- size_t pauseMs = params::kPauseValues.contains(lastPhoneme)
- ? params::kPauseValues.at(lastPhoneme)
- : params::kDefaultPause;
- std::vector pause(pauseMs * constants::kSamplesPerMilisecond, 0.F);
-
- // Add pause to the audio vector
- audioPart.insert(audioPart.end(), std::make_move_iterator(pause.begin()),
- std::make_move_iterator(pause.end()));
+ // Try to find the most recent available end of sentence character.
+ size_t searchLimit =
+ std::min(inputTextBuffer_.size(), params::kMaxTextSize);
+ auto eosIt = std::find_first_of(
+ inputTextBuffer_.rbegin() + (inputTextBuffer_.size() - searchLimit),
+ inputTextBuffer_.rend(), constants::kEndOfSentenceCharacters.begin(),
+ constants::kEndOfSentenceCharacters.end());
+ size_t chunkSize = (eosIt != inputTextBuffer_.rend())
+ ? std::distance(eosIt, inputTextBuffer_.rend())
+ : 0;
+
+ // To maximize the quality of the speech, we try to avoid processing
+ // chunks which end in the middle of a sentence.
+ if (chunkSize > 0 ||
+ streamSkippedIterations >= params::kStreamMaxSkippedIterations) {
+ std::string text = inputTextBuffer_.substr(0, chunkSize);
+ inputTextBuffer_.erase(0, chunkSize);
+
+ // Now we proceed with a standard streaming logic for fixed-size input.
+ auto phonemes = phonemizer_.process(text);
+
+      // Divide the phonemes string into substrings.
+ // Use specialized implementation to minimize the latency between the
+ // sentences.
+ auto subsentences =
+ partitioner_.divide(phonemes);
+
+ // We follow the implementation of generate() method, but
+ // instead of accumulating results in a vector, we push them
+ // back to the JS side with the callback.
+ for (size_t i = 0; i < subsentences.size(); i++) {
+ if (!isStreaming_) {
+ break;
+ }
+
+ const auto &subsentence = subsentences[i];
+
+ // Determine the silent padding duration to be stripped from the edges
+ // of the generated audio. If a chunk ends with a space or follows one
+ // that did, it indicates a word boundary split – we use a shorter
+      // padding (15ms) to ensure natural speech flow. Otherwise, we use 50ms
+ // for standard pauses.
+ bool endsWithSpace = (subsentence.back() == U' ');
+ bool prevEndsWithSpace = (i > 0 && subsentences[i - 1].back() == U' ');
+ size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 15 : 50; // [ms]
+
+ // Generate an audio vector with the Kokoro model
+ auto audioPart = synthesize(subsentence, speed, paddingMs);
+
+ // Calculate a pause between the sentences
+ char32_t lastPhoneme = subsentence.back();
+ size_t pauseMs = params::kPauseValues.contains(lastPhoneme)
+ ? params::kPauseValues.at(lastPhoneme)
+ : params::kDefaultPause;
+ std::vector pause(pauseMs * constants::kSamplesPerMilisecond,
+ 0.F);
+
+ // Add pause to the audio vector
+ audioPart.insert(audioPart.end(),
+ std::make_move_iterator(pause.begin()),
+ std::make_move_iterator(pause.end()));
+
+ // Push the audio right away to the JS side
+ nativeCallback(audioPart);
+ }
+
+ streamSkippedIterations = 0;
+ } else {
+ streamSkippedIterations++;
+ }
- // Push the audio right away to the JS side
- nativeCallback(audioPart);
+    // Sleep briefly between iterations to avoid busy-waiting on this thread.
+ if (isStreaming_) {
+ std::this_thread::sleep_for(
+ std::chrono::milliseconds(params::kStreamPause));
+ }
}
- // Mark the end of the streaming process
+ inputTextBuffer_.clear();
isStreaming_ = false;
+ streamSkippedIterations = 0;
}
-void Kokoro::streamStop() noexcept { isStreaming_ = false; }
+void Kokoro::streamInsert(std::string textChunk) noexcept {
+ inputTextBuffer_.append(textChunk);
+}
+
+void Kokoro::streamStop(bool instant) noexcept {
+ if (instant) {
+ isStreaming_ = false;
+ } else {
+ stopOnEmptyBuffer_ = true;
+ }
+}
std::vector Kokoro::synthesize(const std::u32string &phonemes,
float speed, size_t paddingMs) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h
index f27ba8018..b7091310c 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h
@@ -24,25 +24,50 @@ class Kokoro {
const std::string &synthesizerSource, const std::string &voiceSource,
std::shared_ptr callInvoker);
- // Processes the entire text at once, before sending back to the JS side.
+ /**
+ * Processes the entire text at once, before sending back to the JS side.
+ *
+ * @param text An input text to be processed.
+ * @param speed Determines the speed of generated speech. Passed directly to
+ * the Kokoro model.
+ */
std::vector generate(std::string text, float speed = 1.F);
- // Processes text in chunks, sending each chunk individualy to the JS side
- // with asynchronous callbacks.
- void stream(std::string text, float speed,
+ /**
+ * Processes text from inputTextBuffer_ in chunks, sending each chunk
+   * individually to the JS side with asynchronous callbacks.
+ *
+ * @param speed Determines the speed of generated speech. Passed directly to
+ * the Kokoro model.
+ * @param stopOnEmptyBuffer If true, the streaming ends automatically when the
+ * input buffer is empty.
+ * @param callback A callback to the JS side.
+ */
+ void stream(float speed, bool stopOnEmptyBuffer,
std::shared_ptr callback);
- // Stops the streaming process
- void streamStop() noexcept;
+ /**
+ * Updates the input streaming buffer by adding more text to be processed.
+ *
+   * @param textChunk A new chunk of text, appended to the end of the input buffer.
+ */
+ void streamInsert(std::string textChunk) noexcept;
+
+ /**
+ * Stops the streaming process.
+ *
+ * @param instant If true, stops the streaming as soon as possible by
+ * switching the isStreaming_ flag. Otherwise allows to process the rest of
+ * the buffer first, by switching the stopOnEmptyBuffer_ flag.
+ */
+ void streamStop(bool instant) noexcept;
std::size_t getMemoryLowerBound() const noexcept;
void unload() noexcept;
private:
- // Helper function - loading voice array
void loadVoice(const std::string &voiceSource);
- // Helper function - generate specialization for given input size
std::vector synthesize(const std::u32string &phonemes, float speed,
size_t paddingMs = 50);
@@ -65,8 +90,11 @@ class Kokoro {
constants::kMaxInputTokens>
voice_;
- // Extra control variables
+ // Streaming state control variables
+ std::string inputTextBuffer_ = "";
bool isStreaming_ = false;
+ bool stopOnEmptyBuffer_ = true;
+ int32_t streamSkippedIterations = 0;
};
} // namespace models::text_to_speech::kokoro
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h
index f6b910b03..f517db031 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h
@@ -20,6 +20,17 @@ namespace rnexecutorch::models::text_to_speech::kokoro::params {
*/
inline constexpr size_t kMaxTextSize = 2048;
+/**
+ * The number of skipped streaming iterations after which we process the
+ * remaining input regardless of its content.
+ */
+inline constexpr int32_t kStreamMaxSkippedIterations = 3;
+
+/**
+ * The pause duration (in milliseconds) applied after each streaming iteration.
+ */
+inline constexpr int32_t kStreamPause = 200;
+
/**
* A set of punctation - pause values. Determines how much pause (silence) is
* being added at the end of each calculated audio vector. This is primarly used
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
index c45ab9107..62842b517 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
@@ -63,7 +63,6 @@ elseif(ANDROID_ABI STREQUAL "x86_64")
set(OPENCV_THIRD_PARTY_LIBS "")
endif()
-
add_library(opencv_deps INTERFACE)
target_link_libraries(opencv_deps INTERFACE
${OPENCV_LIBS_DIR}/libopencv_core.a
@@ -84,6 +83,12 @@ target_link_options(opencv_deps INTERFACE -fopenmp -static-openmp)
add_library(tokenizers_deps INTERFACE)
target_include_directories(tokenizers_deps INTERFACE "${TOKENIZERS_DIR}")
+# Phonemis
+set(LIBS_DIR "${PACKAGE_ROOT}/third-party/android/libs")
+set(PHONEMIS_LIBS
+ "${LIBS_DIR}/phonemis/${ANDROID_ABI}/libphonemis.a"
+)
+
# Source Definitions
set(CORE_SOURCES
${RNEXECUTORCH_DIR}/models/BaseModel.cpp
@@ -261,3 +266,13 @@ add_rn_test(VerticalOCRTests integration/VerticalOCRTest.cpp
${IMAGE_UTILS_SOURCES}
LIBS opencv_deps
)
+
+add_rn_test(TextToSpeechTests integration/TextToSpeechTest.cpp
+ SOURCES
+ ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Kokoro.cpp
+ ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/DurationPredictor.cpp
+ ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Synthesizer.cpp
+ ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Partitioner.cpp
+ ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Utils.cpp
+ LIBS ${PHONEMIS_LIBS}
+)
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToSpeechTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToSpeechTest.cpp
new file mode 100644
index 000000000..997de11bb
--- /dev/null
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToSpeechTest.cpp
@@ -0,0 +1,123 @@
+#include "BaseModelTests.h"
+#include "utils/TestUtils.h"
+#include
+#include
+#include
+
+using namespace rnexecutorch;
+using namespace rnexecutorch::models::text_to_speech::kokoro;
+
+constexpr auto kValidLang = "en-us";
+constexpr auto kValidTaggerPath = "kokoro_en_tagger.json";
+constexpr auto kValidPhonemizerPath = "kokoro_us_lexicon.json";
+constexpr auto kValidDurationPath = "kokoro_duration_predictor.pte";
+constexpr auto kValidSynthesizerPath = "kokoro_synthesizer.pte";
+constexpr auto kValidVoicePath = "kokoro_af_heart.bin";
+
+namespace {
+bool isAudioValid(const std::vector &audio) {
+ if (audio.empty())
+ return false;
+ // Check for non-silence (amplitude greater than an arbitrarily small noise
+ // threshold)
+ for (float sample : audio) {
+ if (std::abs(sample) > 1e-4f) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool isAudioSimilar(const std::vector &audio1,
+ const std::vector &audio2, float tolerance = 0.1f) {
+ if (audio1.empty() || audio2.empty())
+ return false;
+
+ double sumSqDiff = 0;
+ size_t steps = std::max(audio1.size(), audio2.size());
+
+ for (size_t i = 0; i < steps; ++i) {
+ float idx1 = (static_cast(i) / steps) * audio1.size();
+ float idx2 = (static_cast(i) / steps) * audio2.size();
+
+ float diff =
+ audio1[static_cast(idx1)] - audio2[static_cast(idx2)];
+ sumSqDiff += diff * diff;
+ }
+
+ double rmse = std::sqrt(sumSqDiff / steps);
+ if (rmse >= tolerance) {
+ std::cerr << "Audio structural RMSE difference: " << rmse
+ << " (tolerance: " << tolerance << ")" << std::endl;
+ return false;
+ }
+ return true;
+}
+
+class KokoroTest : public ::testing::Test {
+protected:
+ void SetUp() override {
+ try {
+ model_ = std::make_unique(
+ kValidLang, kValidTaggerPath, kValidPhonemizerPath,
+ kValidDurationPath, kValidSynthesizerPath, kValidVoicePath, nullptr);
+ } catch (...) {
+ model_ = nullptr;
+ }
+ }
+
+ std::unique_ptr model_;
+};
+} // namespace
+
+TEST(TTSCtorTests, InvalidVoicePathThrows) {
+ EXPECT_THROW(Kokoro(kValidLang, kValidTaggerPath, kValidPhonemizerPath,
+ kValidDurationPath, kValidSynthesizerPath,
+ "nonexistent_voice.bin", nullptr),
+ RnExecutorchError);
+}
+
+TEST_F(KokoroTest, MaxTextSizeExceededThrows) {
+ if (!model_) {
+ GTEST_SKIP() << "Model assets not available, skipping test.";
+ }
+ std::string hugeText(10000, 'A'); // beyond params::kMaxTextSize
+ EXPECT_THROW(model_->generate(hugeText, 1.0f), RnExecutorchError);
+}
+
+TEST_F(KokoroTest, EmptyStringReturnsEmptyVector) {
+ if (!model_) {
+ GTEST_SKIP() << "Model assets not available, skipping test.";
+ }
+ auto result = model_->generate("", 1.0f);
+ EXPECT_TRUE(result.empty());
+}
+
+TEST_F(KokoroTest, GenerateReturnsValidAudio) {
+ if (!model_) {
+ GTEST_SKIP() << "Model assets not available, skipping test.";
+ }
+ auto result = model_->generate("Hello world! How are you doing?", 1.0f);
+ auto reference = test_utils::loadAudioFromFile("test_speech.raw");
+
+ ASSERT_FALSE(reference.empty())
+ << "Reference audio 'test_speech.raw' not found.";
+
+ // Compare against an audio waveform obtained from the original
+ // Kokoro model (PyTorch)
+ EXPECT_TRUE(isAudioSimilar(result, reference));
+}
+
+TEST_F(KokoroTest, GenerateSpeedAdjustsAudioLength) {
+ if (!model_) {
+ GTEST_SKIP() << "Model assets not available, skipping test.";
+ }
+ std::string text = "This is a sentence to test the speed modifications.";
+ auto resultNormal = model_->generate(text, 1.0f);
+ auto resultFast = model_->generate(text, 1.5f);
+
+ EXPECT_TRUE(isAudioValid(resultNormal));
+ EXPECT_TRUE(isAudioValid(resultFast));
+ // Fast speech should result in a noticeably shorter output waveform
+ EXPECT_LT(resultFast.size(), resultNormal.size());
+}
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/assets/test_speech.raw b/packages/react-native-executorch/common/rnexecutorch/tests/integration/assets/test_speech.raw
new file mode 100644
index 000000000..2cf55af04
Binary files /dev/null and b/packages/react-native-executorch/common/rnexecutorch/tests/integration/assets/test_speech.raw differ
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
index 63d738eb3..a3b5ff2eb 100755
--- a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
@@ -32,6 +32,7 @@ TEST_EXECUTABLES=(
"TextToImageTests"
"OCRTests"
"VerticalOCRTests"
+ "TextToSpeechTests"
)
# ============================================================================
@@ -39,6 +40,7 @@ TEST_EXECUTABLES=(
# ============================================================================
TEST_ASSETS=(
"integration/assets/test_audio_float.raw"
+ "integration/assets/test_speech.raw"
"integration/assets/we_are_software_mansion.jpg"
)
@@ -66,6 +68,11 @@ MODELS=(
"t2i_encoder.pte|https://huggingface.co/software-mansion/react-native-executorch-bk-sdm-tiny/resolve/v0.6.0/text_encoder/model.pte"
"t2i_unet.pte|https://huggingface.co/software-mansion/react-native-executorch-bk-sdm-tiny/resolve/v0.6.0/unet/model.256.pte"
"t2i_decoder.pte|https://huggingface.co/software-mansion/react-native-executorch-bk-sdm-tiny/resolve/v0.6.0/vae/model.256.pte"
+ "kokoro_duration_predictor.pte|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/xnnpack/medium/duration_predictor.pte"
+ "kokoro_synthesizer.pte|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/xnnpack/medium/synthesizer.pte"
+ "kokoro_af_heart.bin|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/voices/af_heart.bin"
+ "kokoro_us_lexicon.json|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/phonemizer/us_merged.json"
+ "kokoro_en_tagger.json|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/phonemizer/tags.json"
)
# ============================================================================
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts
index b29b4bc8d..19d1645f2 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts
@@ -95,10 +95,14 @@ export const useTextToSpeech = ({
);
setIsGenerating(true);
try {
+ if (input.text) {
+ moduleInstance.streamInsert(input.text);
+ }
+
await input.onBegin?.();
for await (const audio of moduleInstance.stream({
- text: input.text,
speed: input.speed ?? 1.0,
+ stopAutomatically: input.stopAutomatically ?? true,
})) {
if (input.onNext) {
await input.onNext(audio);
@@ -118,7 +122,8 @@ export const useTextToSpeech = ({
isGenerating,
forward,
stream,
- streamStop: moduleInstance.streamStop,
+ streamInsert: (text: string) => moduleInstance.streamInsert(text),
+ streamStop: (instant: boolean = true) => moduleInstance.streamStop(instant),
downloadProgress,
};
};
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts
index 849c25676..bbc36bdad 100644
--- a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts
+++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts
@@ -15,11 +15,10 @@ import { Logger } from '../../common/Logger';
* @category Typescript API
*/
export class TextToSpeechModule {
- /**
- * Native module instance
- */
nativeModule: any = null;
+ streamFinished: boolean = false;
+
/**
* Loads the model and voice assets specified by the config object.
* `onDownloadProgressCallback` allows you to monitor the current progress.
@@ -125,16 +124,17 @@ export class TextToSpeechModule {
* @returns An async generator yielding Float32Array audio chunks.
*/
public async *stream({
- text,
speed,
+ stopAutomatically,
}: TextToSpeechStreamingInput): AsyncGenerator {
// Stores computed audio segments
const queue: Float32Array[] = [];
let waiter: (() => void) | null = null;
- let finished = false;
let error: unknown;
+ this.streamFinished = false;
+
const wake = () => {
waiter?.();
waiter = null;
@@ -142,38 +142,53 @@ export class TextToSpeechModule {
(async () => {
try {
- await this.nativeModule.stream(text, speed, (audio: number[]) => {
- queue.push(new Float32Array(audio));
- wake();
- });
- finished = true;
+ await this.nativeModule.stream(
+ speed,
+ stopAutomatically,
+ (audio: number[]) => {
+ queue.push(new Float32Array(audio));
+ wake();
+ }
+ );
+ this.streamFinished = true;
wake();
} catch (e) {
error = e;
- finished = true;
+ this.streamFinished = true;
wake();
}
})();
- while (true) {
+ while (!this.streamFinished) {
if (queue.length > 0) {
yield queue.shift()!;
- if (finished && queue.length === 0) {
+ if (this.streamFinished && queue.length === 0) {
return;
}
continue;
}
if (error) throw error;
- if (finished) return;
+ if (this.streamFinished) return;
await new Promise((r) => (waiter = r));
}
}
+ /**
+ * Inserts a new text chunk into the buffer to be processed in streaming mode.
+ */
+ public streamInsert(textChunk: string): void {
+ this.nativeModule.streamInsert(textChunk);
+ }
+
/**
* Stops the streaming process if there is any ongoing.
+ *
+ * @param instant If true, stops the streaming as soon as possible. Otherwise
+ * allows the module to finish processing the remainder of the buffer.
*/
- public streamStop(): void {
- this.nativeModule.streamStop();
+ public streamStop(instant: boolean = true): void {
+ this.nativeModule.streamStop(instant);
+ this.streamFinished = true;
}
/**
diff --git a/packages/react-native-executorch/src/types/tts.ts b/packages/react-native-executorch/src/types/tts.ts
index 55937be49..efe4e111f 100644
--- a/packages/react-native-executorch/src/types/tts.ts
+++ b/packages/react-native-executorch/src/types/tts.ts
@@ -134,10 +134,18 @@ export interface TextToSpeechType {
*/
stream: (input: TextToSpeechStreamingInput) => Promise;
+ /**
+ * Inserts a new text chunk into the buffer to be processed in streaming mode.
+ */
+ streamInsert: (textChunk: string) => void;
+
/**
* Interrupts and stops the currently active audio generation stream.
+ *
+ * @param instant If true, stops the streaming as soon as possible. Otherwise
+ * allows the module to finish processing the remainder of the buffer.
*/
- streamStop: () => void;
+ streamStop: (instant?: boolean) => void;
}
/**
@@ -149,11 +157,17 @@ export interface TextToSpeechType {
* Callbacks can be both synchronous or asynchronous.
*
* @category Types
- * @property {() => void | Promise} [onBegin] - Called when streaming begins
+ * @property {string} [text] - Initial text to be spoken. The streaming input buffer is initially filled with this value.
+ * @property {number} [speed] - Optional speed argument; higher values increase the speech rate.
+ * @property {boolean} [stopAutomatically] - If true, streaming will stop automatically when the buffer is empty.
+ * @property {() => void | Promise} [onBegin] - Called when streaming begins.
* @property {(audio: Float32Array) => void | Promise} [onNext] - Called after each audio chunk gets calculated.
- * @property {() => void | Promise} [onEnd] - Called when streaming ends
+ * @property {() => void | Promise} [onEnd] - Called when streaming ends.
*/
-export interface TextToSpeechStreamingInput extends TextToSpeechInput {
+export interface TextToSpeechStreamingInput {
+ text?: string;
+ speed?: number;
+ stopAutomatically?: boolean;
onBegin?: () => void | Promise;
onNext?: (audio: Float32Array) => void | Promise;
onEnd?: () => void | Promise;