diff --git a/apps/speech/App.tsx b/apps/speech/App.tsx index ab036678e..ddbfe3c98 100644 --- a/apps/speech/App.tsx +++ b/apps/speech/App.tsx @@ -5,6 +5,7 @@ import { SpeechToTextScreen } from './screens/SpeechToTextScreen'; import ColorPalette from './colors'; import ExecutorchLogo from './assets/executorch.svg'; import { Quiz } from './screens/Quiz'; +import { TextToSpeechLLMScreen } from './screens/TextToSpeechLLMScreen'; import { initExecutorch } from 'react-native-executorch'; import { ExpoResourceFetcher } from '@react-native-executorch/expo-resource-fetcher'; @@ -14,7 +15,7 @@ initExecutorch({ export default function App() { const [currentScreen, setCurrentScreen] = useState< - 'menu' | 'speech-to-text' | 'text-to-speech' | 'quiz' + 'menu' | 'speech-to-text' | 'text-to-speech' | 'quiz' | 'text-to-speech-llm' >('menu'); const goToMenu = () => setCurrentScreen('menu'); @@ -31,6 +32,10 @@ export default function App() { return ; } + if (currentScreen === 'text-to-speech-llm') { + return ; + } + return ( @@ -54,6 +59,12 @@ export default function App() { > Text to Speech - Quiz + setCurrentScreen('text-to-speech-llm')} + > + Text to Speech - LLM Streaming + ); diff --git a/apps/speech/screens/TextToSpeechLLMScreen.tsx b/apps/speech/screens/TextToSpeechLLMScreen.tsx new file mode 100644 index 000000000..73df0f299 --- /dev/null +++ b/apps/speech/screens/TextToSpeechLLMScreen.tsx @@ -0,0 +1,324 @@ +import React, { useEffect, useState, useRef } from 'react'; +import { + View, + Text, + StyleSheet, + TouchableOpacity, + ScrollView, +} from 'react-native'; +import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; +import FontAwesome from '@expo/vector-icons/FontAwesome'; +import SWMIcon from '../assets/swm_icon.svg'; +import { + useLLM, + useTextToSpeech, + KOKORO_MEDIUM, + KOKORO_VOICE_AF_HEART, + LLAMA3_2_1B_QLORA, +} from 'react-native-executorch'; +import { + AudioManager, + AudioContext, + AudioBuffer, + AudioBufferSourceNode, +} from 
'react-native-audio-api'; + +interface TextToSpeechLLMProps { + onBack: () => void; +} + +/** + * Converts an audio vector (Float32Array) to an AudioBuffer for playback + * @param audioVector - The generated audio samples from the model + * @param sampleRate - The sample rate (default: 24000 Hz for Kokoro) + * @returns AudioBuffer ready for playback + */ +const createAudioBufferFromVector = ( + audioVector: Float32Array, + audioContext: AudioContext, + sampleRate: number = 24000 +): AudioBuffer => { + const audioBuffer = audioContext.createBuffer( + 1, + audioVector.length, + sampleRate + ); + const channelData = audioBuffer.getChannelData(0); + channelData.set(audioVector); + + return audioBuffer; +}; + +export const TextToSpeechLLMScreen = ({ onBack }: TextToSpeechLLMProps) => { + const [displayText, setDisplayText] = useState(''); + const [isTtsStreaming, setIsTtsStreaming] = useState(false); + const llm = useLLM({ model: LLAMA3_2_1B_QLORA }); + const tts = useTextToSpeech({ + model: KOKORO_MEDIUM, + voice: KOKORO_VOICE_AF_HEART, + }); + + const processedLengthRef = useRef(0); + const audioContextRef = useRef(null); + const sourceRef = useRef(null); + + useEffect(() => { + AudioManager.setAudioSessionOptions({ + iosCategory: 'playAndRecord', + iosMode: 'spokenAudio', + iosOptions: ['defaultToSpeaker'], + }); + + audioContextRef.current = new AudioContext({ sampleRate: 24000 }); + audioContextRef.current.suspend(); + + return () => { + audioContextRef.current?.close(); + audioContextRef.current = null; + }; + }, []); + + // Update displayText gradually as response gets generated and insert new text chunks into TTS stream + useEffect(() => { + if (llm.response && tts.isReady) { + setDisplayText(llm.response); + + const previousLength = processedLengthRef.current; + if (llm.response.length > previousLength) { + const newChunk = llm.response.slice(previousLength); + tts.streamInsert(newChunk); + processedLengthRef.current = llm.response.length; + } + } else { + 
processedLengthRef.current = 0; + } + }, [llm.response, tts]); + + const handleGenerate = async () => { + setDisplayText(''); + processedLengthRef.current = 0; + setIsTtsStreaming(true); + + const startTTS = async () => { + try { + const audioContext = audioContextRef.current; + if (!audioContext) return; + + if (audioContext.state === 'suspended') { + await audioContext.resume(); + } + + const onNext = async (audioVec: Float32Array) => { + return new Promise((resolve) => { + const audioBuffer = createAudioBufferFromVector( + audioVec, + audioContext, + 24000 + ); + + const source = (sourceRef.current = + audioContext.createBufferSource()); + source.buffer = audioBuffer; + source.connect(audioContext.destination); + + source.onEnded = () => resolve(); + + source.start(); + }); + }; + + await tts.stream({ + text: '', + speed: 0.9, + stopAutomatically: false, + onNext, + }); + } catch (e) { + console.error('TTS streaming error:', e); + } finally { + setIsTtsStreaming(false); + } + }; + + const ttsPromise = startTTS(); + + try { + await llm.sendMessage( + 'Generate a short story about a robot learning to paint. The story should be around 200 words long.' 
+ ); + } catch (e) { + console.error('Generation failed:', e); + } finally { + tts.streamStop(false); + await ttsPromise; + + if ( + audioContextRef.current && + audioContextRef.current.state === 'running' + ) { + await audioContextRef.current.suspend(); + } + } + }; + + const handleStop = () => { + llm.interrupt(); + tts.streamStop(true); + if (sourceRef.current) { + try { + sourceRef.current.stop(); + } catch (e) { + // Source might have already stopped or disconnected + } + } + }; + + const isProcessing = llm.isGenerating || isTtsStreaming; + const isModelsReady = llm.isReady && tts.isReady; + + const getModelStatus = () => { + if (llm.error) return `LLM Error: ${llm.error.message}`; + if (tts.error) return `TTS Error: ${tts.error.message}`; + if (!llm.isReady) + return `Loading LLM: ${(100 * llm.downloadProgress).toFixed(2)}%`; + if (!tts.isReady) + return `Loading TTS: ${(100 * tts.downloadProgress).toFixed(2)}%`; + if (isProcessing) return 'Generating/Streaming...'; + return 'Ready'; + }; + + return ( + + + + + + + + React Native ExecuTorch + LLM to Speech Demo + + + + Status: {getModelStatus()} + + + + Generated Story + + + + {displayText || + (isModelsReady + ? 'Press the button to generate a story and hear it spoken aloud.' + : 'Please wait for models to load...')} + + + + + + + {isProcessing ? 
( + + + Stop Generation + + ) : ( + + + Generate & Stream Speech + + )} + + + + ); +}; + +const styles = StyleSheet.create({ + container: { + flex: 1, + alignItems: 'center', + backgroundColor: 'white', + paddingHorizontal: 16, + }, + header: { + alignItems: 'center', + position: 'relative', + width: '100%', + }, + backButton: { + position: 'absolute', + left: 0, + top: 10, + padding: 10, + zIndex: 1, + }, + headerText: { + fontSize: 22, + fontWeight: 'bold', + color: '#0f186e', + }, + statusContainer: { + marginTop: 12, + alignItems: 'center', + }, + contentContainer: { + width: '100%', + marginTop: 24, + flex: 1, + marginBottom: 24, + }, + label: { + marginLeft: 12, + marginBottom: 4, + color: '#0f186e', + fontWeight: '600', + }, + responseContainer: { + borderRadius: 12, + borderWidth: 1, + borderColor: '#0f186e', + flex: 1, + }, + responseContent: { + padding: 12, + }, + responseText: { + fontSize: 16, + color: '#333', + lineHeight: 24, + }, + buttonContainer: { + marginBottom: 24, + width: '100%', + }, + actionButton: { + backgroundColor: '#0f186e', + flexDirection: 'row', + justifyContent: 'center', + alignItems: 'center', + padding: 12, + borderRadius: 12, + gap: 8, + }, + stopButton: { + backgroundColor: '#ff4444', + }, + buttonText: { + color: 'white', + fontWeight: '600', + letterSpacing: -0.5, + fontSize: 16, + }, + disabled: { + opacity: 0.5, + }, +}); diff --git a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md index b52726c9e..8bd1d07e2 100644 --- a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md +++ b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md @@ -90,8 +90,8 @@ The module provides two ways to generate speech: Since it processes the entire text at once, it might take a significant amount of time to produce an audio for long text inputs. ::: -2. 
[**`stream({ text, speed })`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator that yields chunks of audio as they are computed. - This is ideal for reducing the "time to first audio" for long sentences. +2. [**`stream(input)`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator-like functionality (managed via callbacks like `onNext`) that yields chunks of audio as they are computed. + This is ideal for reducing the "time to first audio" for long sentences. You can also dynamically insert text during the generation process using `streamInsert(text)` and stop it with `streamStop(instant)`. ## Example @@ -160,8 +160,11 @@ export default function App() { const generateStream = async () => { const ctx = contextRef.current; + // Instead of using streamInsert() directly, we can pass initial text to the stream() method await tts.stream({ - text: "This is a longer text, which is being streamed chunk by chunk. Let's see how it works!", + text: "This is an initial text, which is being streamed chunk by chunk. Let's see how it works!", + onBegin: async () => console.log('Started streaming'), + onEnd: async () => console.log('Finished streaming'), onNext: async (chunk) => { return new Promise((resolve) => { const buffer = ctx.createBuffer(1, chunk.length, 24000); diff --git a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md index bc297ecf4..5f2a66617 100644 --- a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md +++ b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md @@ -61,7 +61,7 @@ The module provides two ways to generate speech: Since it processes the entire text at once, it might take a significant amount of time to produce an audio for long text inputs. ::: -2. 
[**`stream({ text, speed })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences. +2. [**`stream({ speed, stopAutomatically })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences. In contrast to `forward`, it enables inserting text chunks dynamically into processing buffer with [**`streamInsert(text)`**](../../06-api-reference/classes/TextToSpeechModule.md#streaminsert) and allows stopping generation early with [**`streamStop(instant)`**](../../06-api-reference/classes/TextToSpeechModule.md#streamstop). ## Example @@ -115,9 +115,12 @@ const audioContext = new AudioContext({ sampleRate: 24000 }); await tts.load({ model: KOKORO_MEDIUM, voice: KOKORO_VOICE_AF_HEART }); try { + // Pre-load the first chunk of text to the buffer + tts.streamInsert('This is a streaming test, with a sample input.'); + for await (const chunk of tts.stream({ - text: 'This is a streaming test, with a sample input.', speed: 1.0, + stopAutomatically: true, // Will stop the stream automatically after clearing the input buffer })) { // Play each chunk sequentially await new Promise((resolve) => { diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index d6489c9be..9688eef15 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -169,6 +169,12 @@ template class ModelHostObject : public JsiHostObject { addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject, promiseHostFunction<&Model::stream>, "stream")); + addFunctions(JSI_EXPORT_FUNCTION( 
+ ModelHostObject, synchronousHostFunction<&Model::streamInsert>, + "streamInsert")); + addFunctions(JSI_EXPORT_FUNCTION( + ModelHostObject, synchronousHostFunction<&Model::streamStop>, + "streamStop")); } if constexpr (meta::HasGenerateFromString) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h index 3bc7f7f83..050fb902c 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h @@ -34,6 +34,10 @@ inline constexpr int32_t kSamplingRate = 24000; // Corresponds to Kokoro's model audio frequency inline constexpr int32_t kSamplesPerMilisecond = kSamplingRate / 1000; +// Special text characters +inline const std::unordered_set kEndOfSentenceCharacters = {'.', '?', '!', + ';'}; + // Special phonemes inline const std::unordered_set kEndOfSentencePhonemes = { U'.', U'?', U'!', U';', U'…'}; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp index d73fb6205..7ead055f9 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp @@ -6,6 +6,9 @@ #include #include #include +#include + +#include namespace rnexecutorch::models::text_to_speech::kokoro { @@ -110,13 +113,8 @@ std::vector Kokoro::generate(std::string text, float speed) { return audio; } -void Kokoro::stream(std::string text, float speed, +void Kokoro::stream(float speed, bool stopOnEmptyBuffer, std::shared_ptr callback) { - if (text.size() > params::kMaxTextSize) { - throw 
RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, - "Kokoro: maximum input text size exceeded"); - } - // Build a full callback function auto nativeCallback = [this, callback](const std::vector &audioVec) { if (this->isStreaming_) { @@ -127,60 +125,111 @@ void Kokoro::stream(std::string text, float speed, } }; - // Mark the beginning of the streaming process isStreaming_ = true; + stopOnEmptyBuffer_ = stopOnEmptyBuffer; - // G2P (Grapheme to Phoneme) conversion - auto phonemes = phonemizer_.process(text); - - // Divide the phonemes string intro substrings. - // Use specialized implementation to minimize the latency between the - // sentences. - auto subsentences = - partitioner_.divide(phonemes); - - // We follow the implementation of generate() method, but - // instead of accumulating results in a vector, we push them - // back to the JS side with the callback. - for (size_t i = 0; i < subsentences.size(); i++) { - if (!isStreaming_) { + // The outer streaming loop is responsible for handling the input buffer. + // The extracted text is then passed to the inner loop, which performs a + // standard streaming on a fixed amount of input text. + while (isStreaming_) { + if (inputTextBuffer_.empty() && stopOnEmptyBuffer_) { break; } - const auto &subsentence = subsentences[i]; - - // Determine the silent padding duration to be stripped from the edges of - // the generated audio. If a chunk ends with a space or follows one that - // did, it indicates a word boundary split – we use a shorter padding (20ms) - // to ensure natural speech flow. Otherwise, we use 50ms for standard - // pauses. - bool endsWithSpace = (subsentence.back() == U' '); - bool prevEndsWithSpace = (i > 0 && subsentences[i - 1].back() == U' '); - size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 
15 : 50; // [ms] - - // Generate an audio vector with the Kokoro model - auto audioPart = synthesize(subsentence, speed, paddingMs); - - // Calculate a pause between the sentences - char32_t lastPhoneme = subsentence.back(); - size_t pauseMs = params::kPauseValues.contains(lastPhoneme) - ? params::kPauseValues.at(lastPhoneme) - : params::kDefaultPause; - std::vector pause(pauseMs * constants::kSamplesPerMilisecond, 0.F); - - // Add pause to the audio vector - audioPart.insert(audioPart.end(), std::make_move_iterator(pause.begin()), - std::make_move_iterator(pause.end())); + // Try to find the most recent available end of sentence character. + size_t searchLimit = + std::min(inputTextBuffer_.size(), params::kMaxTextSize); + auto eosIt = std::find_first_of( + inputTextBuffer_.rbegin() + (inputTextBuffer_.size() - searchLimit), + inputTextBuffer_.rend(), constants::kEndOfSentenceCharacters.begin(), + constants::kEndOfSentenceCharacters.end()); + size_t chunkSize = (eosIt != inputTextBuffer_.rend()) + ? std::distance(eosIt, inputTextBuffer_.rend()) + : 0; + + // To maximize the quality of the speech, we try to avoid processing + // chunks which end in the middle of a sentence. + if (chunkSize > 0 || + streamSkippedIterations >= params::kStreamMaxSkippedIterations) { + std::string text = inputTextBuffer_.substr(0, chunkSize); + inputTextBuffer_.erase(0, chunkSize); + + // Now we proceed with a standard streaming logic for fixed-size input. + auto phonemes = phonemizer_.process(text); + + // Divide the phonemes string intro substrings. + // Use specialized implementation to minimize the latency between the + // sentences. + auto subsentences = + partitioner_.divide(phonemes); + + // We follow the implementation of generate() method, but + // instead of accumulating results in a vector, we push them + // back to the JS side with the callback. 
+ for (size_t i = 0; i < subsentences.size(); i++) { + if (!isStreaming_) { + break; + } + + const auto &subsentence = subsentences[i]; + + // Determine the silent padding duration to be stripped from the edges + // of the generated audio. If a chunk ends with a space or follows one + // that did, it indicates a word boundary split – we use a shorter + // padding (20ms) to ensure natural speech flow. Otherwise, we use 50ms + // for standard pauses. + bool endsWithSpace = (subsentence.back() == U' '); + bool prevEndsWithSpace = (i > 0 && subsentences[i - 1].back() == U' '); + size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 15 : 50; // [ms] + + // Generate an audio vector with the Kokoro model + auto audioPart = synthesize(subsentence, speed, paddingMs); + + // Calculate a pause between the sentences + char32_t lastPhoneme = subsentence.back(); + size_t pauseMs = params::kPauseValues.contains(lastPhoneme) + ? params::kPauseValues.at(lastPhoneme) + : params::kDefaultPause; + std::vector pause(pauseMs * constants::kSamplesPerMilisecond, + 0.F); + + // Add pause to the audio vector + audioPart.insert(audioPart.end(), + std::make_move_iterator(pause.begin()), + std::make_move_iterator(pause.end())); + + // Push the audio right away to the JS side + nativeCallback(audioPart); + } + + streamSkippedIterations = 0; + } else { + streamSkippedIterations++; + } - // Push the audio right away to the JS side - nativeCallback(audioPart); + // A little bit of pause to not overload the thread. 
+ if (isStreaming_) { + std::this_thread::sleep_for( + std::chrono::milliseconds(params::kStreamPause)); + } } - // Mark the end of the streaming process + inputTextBuffer_.clear(); isStreaming_ = false; + streamSkippedIterations = 0; } -void Kokoro::streamStop() noexcept { isStreaming_ = false; } +void Kokoro::streamInsert(std::string textChunk) noexcept { + inputTextBuffer_.append(textChunk); +} + +void Kokoro::streamStop(bool instant) noexcept { + if (instant) { + isStreaming_ = false; + } else { + stopOnEmptyBuffer_ = true; + } +} std::vector Kokoro::synthesize(const std::u32string &phonemes, float speed, size_t paddingMs) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h index f27ba8018..b7091310c 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h @@ -24,25 +24,50 @@ class Kokoro { const std::string &synthesizerSource, const std::string &voiceSource, std::shared_ptr callInvoker); - // Processes the entire text at once, before sending back to the JS side. + /** + * Processes the entire text at once, before sending back to the JS side. + * + * @param text An input text to be processed. + * @param speed Determines the speed of generated speech. Passed directly to + * the Kokoro model. + */ std::vector generate(std::string text, float speed = 1.F); - // Processes text in chunks, sending each chunk individualy to the JS side - // with asynchronous callbacks. - void stream(std::string text, float speed, + /** + * Processes text from inputTextBuffer_ in chunks, sending each chunk + * individualy to the JS side with asynchronous callbacks. + * + * @param speed Determines the speed of generated speech. Passed directly to + * the Kokoro model. 
+ * @param stopOnEmptyBuffer If true, the streaming ends automatically when the + * input buffer is empty. + * @param callback A callback to the JS side. + */ + void stream(float speed, bool stopOnEmptyBuffer, std::shared_ptr callback); - // Stops the streaming process - void streamStop() noexcept; + /** + * Updates the input streaming buffer by adding more text to be processed. + * + * @param text A new chunk of text, appended to the end of the input buffer. + */ + void streamInsert(std::string textChunk) noexcept; + + /** + * Stops the streaming process. + * + * @param instant If true, stops the streaming as soon as possible by + * switching the isStreaming_ flag. Otherwise allows to process the rest of + * the buffer first, by switching the stopOnEmptyBuffer_ flag. + */ + void streamStop(bool instant) noexcept; std::size_t getMemoryLowerBound() const noexcept; void unload() noexcept; private: - // Helper function - loading voice array void loadVoice(const std::string &voiceSource); - // Helper function - generate specialization for given input size std::vector synthesize(const std::u32string &phonemes, float speed, size_t paddingMs = 50); @@ -65,8 +90,11 @@ class Kokoro { constants::kMaxInputTokens> voice_; - // Extra control variables + // Streaming state control variables + std::string inputTextBuffer_ = ""; bool isStreaming_ = false; + bool stopOnEmptyBuffer_ = true; + int32_t streamSkippedIterations = 0; }; } // namespace models::text_to_speech::kokoro diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h index f6b910b03..f517db031 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h @@ -20,6 +20,17 @@ namespace rnexecutorch::models::text_to_speech::kokoro::params { */ 
inline constexpr size_t kMaxTextSize = 2048; +/** + * A number of skipped streaming iterations after which we process the remaining + * input no matter how it looks like. + */ +inline constexpr int32_t kStreamMaxSkippedIterations = 3; + +/** + * A size of pause (in miliseconds) applied after each streaming iteration. + */ +inline constexpr int32_t kStreamPause = 200; + /** * A set of punctation - pause values. Determines how much pause (silence) is * being added at the end of each calculated audio vector. This is primarly used diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt index c45ab9107..62842b517 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt +++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt @@ -63,7 +63,6 @@ elseif(ANDROID_ABI STREQUAL "x86_64") set(OPENCV_THIRD_PARTY_LIBS "") endif() - add_library(opencv_deps INTERFACE) target_link_libraries(opencv_deps INTERFACE ${OPENCV_LIBS_DIR}/libopencv_core.a @@ -84,6 +83,12 @@ target_link_options(opencv_deps INTERFACE -fopenmp -static-openmp) add_library(tokenizers_deps INTERFACE) target_include_directories(tokenizers_deps INTERFACE "${TOKENIZERS_DIR}") +# Phonemis +set(LIBS_DIR "${PACKAGE_ROOT}/third-party/android/libs") +set(PHONEMIS_LIBS + "${LIBS_DIR}/phonemis/${ANDROID_ABI}/libphonemis.a" +) + # Source Definitions set(CORE_SOURCES ${RNEXECUTORCH_DIR}/models/BaseModel.cpp @@ -261,3 +266,13 @@ add_rn_test(VerticalOCRTests integration/VerticalOCRTest.cpp ${IMAGE_UTILS_SOURCES} LIBS opencv_deps ) + +add_rn_test(TextToSpeechTests integration/TextToSpeechTest.cpp + SOURCES + ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Kokoro.cpp + ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/DurationPredictor.cpp + ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Synthesizer.cpp + ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Partitioner.cpp + 
${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Utils.cpp + LIBS ${PHONEMIS_LIBS} +) diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToSpeechTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToSpeechTest.cpp new file mode 100644 index 000000000..997de11bb --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToSpeechTest.cpp @@ -0,0 +1,123 @@ +#include "BaseModelTests.h" +#include "utils/TestUtils.h" +#include +#include +#include + +using namespace rnexecutorch; +using namespace rnexecutorch::models::text_to_speech::kokoro; + +constexpr auto kValidLang = "en-us"; +constexpr auto kValidTaggerPath = "kokoro_en_tagger.json"; +constexpr auto kValidPhonemizerPath = "kokoro_us_lexicon.json"; +constexpr auto kValidDurationPath = "kokoro_duration_predictor.pte"; +constexpr auto kValidSynthesizerPath = "kokoro_synthesizer.pte"; +constexpr auto kValidVoicePath = "kokoro_af_heart.bin"; + +namespace { +bool isAudioValid(const std::vector &audio) { + if (audio.empty()) + return false; + // Check for non-silence (amplitude greater than an arbitrary small noise + // threshold) + for (float sample : audio) { + if (std::abs(sample) > 1e-4f) { + return true; + } + } + return false; +} + +bool isAudioSimilar(const std::vector &audio1, + const std::vector &audio2, float tolerance = 0.1f) { + if (audio1.empty() || audio2.empty()) + return false; + + double sumSqDiff = 0; + size_t steps = std::max(audio1.size(), audio2.size()); + + for (size_t i = 0; i < steps; ++i) { + float idx1 = (static_cast(i) / steps) * audio1.size(); + float idx2 = (static_cast(i) / steps) * audio2.size(); + + float diff = + audio1[static_cast(idx1)] - audio2[static_cast(idx2)]; + sumSqDiff += diff * diff; + } + + double rmse = std::sqrt(sumSqDiff / steps); + if (rmse >= tolerance) { + std::cerr << "Audio structural RMSE difference: " << rmse + << " (tolerance: " << tolerance << ")" << std::endl; 
+ return false; + } + return true; +} + +class KokoroTest : public ::testing::Test { +protected: + void SetUp() override { + try { + model_ = std::make_unique( + kValidLang, kValidTaggerPath, kValidPhonemizerPath, + kValidDurationPath, kValidSynthesizerPath, kValidVoicePath, nullptr); + } catch (...) { + model_ = nullptr; + } + } + + std::unique_ptr model_; +}; +} // namespace + +TEST(TTSCtorTests, InvalidVoicePathThrows) { + EXPECT_THROW(Kokoro(kValidLang, kValidTaggerPath, kValidPhonemizerPath, + kValidDurationPath, kValidSynthesizerPath, + "nonexistent_voice.bin", nullptr), + RnExecutorchError); +} + +TEST_F(KokoroTest, MaxTextSizeExceededThrows) { + if (!model_) { + GTEST_SKIP() << "Model assets not available, skipping test."; + } + std::string hugeText(10000, 'A'); // beyond params::kMaxTextSize + EXPECT_THROW(model_->generate(hugeText, 1.0f), RnExecutorchError); +} + +TEST_F(KokoroTest, EmptyStringReturnsEmptyVector) { + if (!model_) { + GTEST_SKIP() << "Model assets not available, skipping test."; + } + auto result = model_->generate("", 1.0f); + EXPECT_TRUE(result.empty()); +} + +TEST_F(KokoroTest, GenerateReturnsValidAudio) { + if (!model_) { + GTEST_SKIP() << "Model assets not available, skipping test."; + } + auto result = model_->generate("Hello world! 
How are you doing?", 1.0f); + auto reference = test_utils::loadAudioFromFile("test_speech.raw"); + + ASSERT_FALSE(reference.empty()) + << "Reference audio 'test_speech.raw' not found."; + + // Compare against an audio waveform obtained from the original + // Kokoro model (PyTorch) + EXPECT_TRUE(isAudioSimilar(result, reference)); +} + +TEST_F(KokoroTest, GenerateSpeedAdjustsAudioLength) { + if (!model_) { + GTEST_SKIP() << "Model assets not available, skipping test."; + } + std::string text = "This is a sentence to test the speed modifications."; + auto resultNormal = model_->generate(text, 1.0f); + auto resultFast = model_->generate(text, 1.5f); + + EXPECT_TRUE(isAudioValid(resultNormal)); + EXPECT_TRUE(isAudioValid(resultFast)); + // Fast speech should result in a noticeably shorter output waveform + EXPECT_LT(resultFast.size(), resultNormal.size()); +} diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/assets/test_speech.raw b/packages/react-native-executorch/common/rnexecutorch/tests/integration/assets/test_speech.raw new file mode 100644 index 000000000..2cf55af04 Binary files /dev/null and b/packages/react-native-executorch/common/rnexecutorch/tests/integration/assets/test_speech.raw differ diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh index 63d738eb3..a3b5ff2eb 100755 --- a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh +++ b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh @@ -32,6 +32,7 @@ TEST_EXECUTABLES=( "TextToImageTests" "OCRTests" "VerticalOCRTests" + "TextToSpeechTests" ) # ============================================================================ @@ -39,6 +40,7 @@ TEST_EXECUTABLES=( # ============================================================================ TEST_ASSETS=( "integration/assets/test_audio_float.raw" + "integration/assets/test_speech.raw" 
"integration/assets/we_are_software_mansion.jpg" ) @@ -66,6 +68,11 @@ MODELS=( "t2i_encoder.pte|https://huggingface.co/software-mansion/react-native-executorch-bk-sdm-tiny/resolve/v0.6.0/text_encoder/model.pte" "t2i_unet.pte|https://huggingface.co/software-mansion/react-native-executorch-bk-sdm-tiny/resolve/v0.6.0/unet/model.256.pte" "t2i_decoder.pte|https://huggingface.co/software-mansion/react-native-executorch-bk-sdm-tiny/resolve/v0.6.0/vae/model.256.pte" + "kokoro_duration_predictor.pte|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/xnnpack/medium/duration_predictor.pte" + "kokoro_synthesizer.pte|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/xnnpack/medium/synthesizer.pte" + "kokoro_af_heart.bin|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/voices/af_heart.bin" + "kokoro_us_lexicon.json|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/phonemizer/us_merged.json" + "kokoro_en_tagger.json|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/phonemizer/tags.json" ) # ============================================================================ diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts index b29b4bc8d..19d1645f2 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts @@ -95,10 +95,14 @@ export const useTextToSpeech = ({ ); setIsGenerating(true); try { + if (input.text) { + moduleInstance.streamInsert(input.text); + } + await input.onBegin?.(); for await (const audio of moduleInstance.stream({ - text: input.text, speed: input.speed ?? 1.0, + stopAutomatically: input.stopAutomatically ?? 
true, })) { if (input.onNext) { await input.onNext(audio); @@ -118,7 +122,8 @@ export const useTextToSpeech = ({ isGenerating, forward, stream, - streamStop: moduleInstance.streamStop, + streamInsert: (text: string) => moduleInstance.streamInsert(text), + streamStop: (instant: boolean = true) => moduleInstance.streamStop(instant), downloadProgress, }; }; diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts index 849c25676..bbc36bdad 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts @@ -15,11 +15,10 @@ import { Logger } from '../../common/Logger'; * @category Typescript API */ export class TextToSpeechModule { - /** - * Native module instance - */ nativeModule: any = null; + streamFinished: boolean = false; + /** * Loads the model and voice assets specified by the config object. * `onDownloadProgressCallback` allows you to monitor the current progress. @@ -125,16 +124,17 @@ export class TextToSpeechModule { * @returns An async generator yielding Float32Array audio chunks. 
*/ + public async *stream({ - text, speed, + stopAutomatically, }: TextToSpeechStreamingInput): AsyncGenerator { // Stores computed audio segments const queue: Float32Array[] = []; let waiter: (() => void) | null = null; - let finished = false; let error: unknown; + this.streamFinished = false; + const wake = () => { waiter?.(); waiter = null; @@ -142,38 +142,53 @@ (async () => { try { - await this.nativeModule.stream(text, speed, (audio: number[]) => { - queue.push(new Float32Array(audio)); - wake(); - }); - finished = true; + await this.nativeModule.stream( + speed, + stopAutomatically, + (audio: number[]) => { + queue.push(new Float32Array(audio)); + wake(); + } + ); + this.streamFinished = true; wake(); } catch (e) { error = e; - finished = true; + this.streamFinished = true; wake(); } })(); - while (true) { + while (!this.streamFinished) { if (queue.length > 0) { yield queue.shift()!; - if (finished && queue.length === 0) { + if (this.streamFinished && queue.length === 0) { return; } continue; } if (error) throw error; - if (finished) return; + if (this.streamFinished) return; await new Promise((r) => (waiter = r)); } } + /** + * Inserts a new text chunk into the buffer to be processed in streaming mode. + */ + public streamInsert(textChunk: string): void { + this.nativeModule.streamInsert(textChunk); + } + /** * Stops the streaming process if there is any ongoing. + * + * @param instant If true, stops the streaming as soon as possible. Otherwise + * allows the module to complete processing for the remainder of the buffer. 
*/ - public streamStop(): void { - this.nativeModule.streamStop(); + public streamStop(instant: boolean = true): void { + this.nativeModule.streamStop(instant); + this.streamFinished = true; } /** diff --git a/packages/react-native-executorch/src/types/tts.ts b/packages/react-native-executorch/src/types/tts.ts index 55937be49..efe4e111f 100644 --- a/packages/react-native-executorch/src/types/tts.ts +++ b/packages/react-native-executorch/src/types/tts.ts @@ -134,10 +134,18 @@ export interface TextToSpeechType { */ stream: (input: TextToSpeechStreamingInput) => Promise; + /** + * Inserts a new text chunk into the buffer to be processed in streaming mode. + */ + streamInsert: (textChunk: string) => void; + /** * Interrupts and stops the currently active audio generation stream. + * + * @param instant If true, stops the streaming as soon as possible. Otherwise + * allows the module to complete processing for the remainder of the buffer. */ - streamStop: () => void; + streamStop: (instant?: boolean) => void; } /** @@ -149,11 +157,17 @@ export interface TextToSpeechType { * Callbacks can be both synchronous or asynchronous. * * @category Types - * @property {() => void | Promise} [onBegin] - Called when streaming begins + * @property {string} [text] - Initial text to be spoken. The streaming input buffer is initially filled with this value. + * @property {number} [speed] - Optional speed argument; higher values increase the speech rate. + * @property {boolean} [stopAutomatically] - If true, streaming will stop automatically when the buffer is empty. + * @property {() => void | Promise} [onBegin] - Called when streaming begins. * @property {(audio: Float32Array) => void | Promise} [onNext] - Called after each audio chunk gets calculated. - * @property {() => void | Promise} [onEnd] - Called when streaming ends + * @property {() => void | Promise} [onEnd] - Called when streaming ends. 
*/ -export interface TextToSpeechStreamingInput extends TextToSpeechInput { +export interface TextToSpeechStreamingInput { + text?: string; + speed?: number; + stopAutomatically?: boolean; onBegin?: () => void | Promise; onNext?: (audio: Float32Array) => void | Promise; onEnd?: () => void | Promise;