From 16e91c0167582361c283f89913c56724d5825d4c Mon Sep 17 00:00:00 2001 From: contra Date: Sat, 7 Mar 2026 21:07:50 -0800 Subject: [PATCH 1/3] feat: add escape hatch for custom phonemizer (BYO G2P) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `generateFromPhonemes` / `streamFromPhonemes` methods that accept pre-computed IPA phoneme strings, bypassing the built-in phonemizer pipeline. This enables users to plug in any external G2P system (e.g. the Python `phonemizer` library, espeak-ng, or custom phonemizers) while still using the Kokoro synthesis engine. Changes across all layers: - C++ Kokoro: new public methods + shared impl helpers + UTF-8→UTF-32 - JSI ModelHostObject: expose new methods via promiseHostFunction - TextToSpeechModule: `forwardFromPhonemes()` and `streamFromPhonemes()` - useTextToSpeech hook: corresponding hook methods - Types: `TextToSpeechPhonemeInput`, `TextToSpeechStreamingPhonemeInput` --- .../host_objects/ModelHostObject.h | 8 ++ .../models/text_to_speech/kokoro/Kokoro.cpp | 123 ++++++++++++------ .../models/text_to_speech/kokoro/Kokoro.h | 20 +++ .../useTextToSpeech.ts | 57 ++++++++ .../TextToSpeechModule.ts | 77 +++++++++++ .../react-native-executorch/src/types/tts.ts | 51 ++++++++ 6 files changed, 294 insertions(+), 42 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index d6489c9be..7ece18a93 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -169,6 +169,14 @@ template class ModelHostObject : public JsiHostObject { addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject, promiseHostFunction<&Model::stream>, "stream")); + addFunctions(JSI_EXPORT_FUNCTION( + ModelHostObject, + 
promiseHostFunction<&Model::generateFromPhonemes>, + "generateFromPhonemes")); + addFunctions(JSI_EXPORT_FUNCTION( + ModelHostObject, + promiseHostFunction<&Model::streamFromPhonemes>, + "streamFromPhonemes")); } if constexpr (meta::HasGenerateFromString) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp index d73fb6205..1fd522676 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp @@ -3,7 +3,9 @@ #include "Utils.h" #include +#include #include +#include #include #include @@ -73,34 +75,59 @@ void Kokoro::loadVoice(const std::string &voiceSource) { } } -std::vector Kokoro::generate(std::string text, float speed) { - if (text.size() > params::kMaxTextSize) { - throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, - "Kokoro: maximum input text size exceeded"); +std::u32string Kokoro::utf8ToUtf32(const std::string &utf8) { + std::u32string result; + result.reserve(utf8.size()); + size_t i = 0; + while (i < utf8.size()) { + char32_t cp = 0; + unsigned char c = static_cast(utf8[i]); + size_t len = 0; + if (c < 0x80) { + cp = c; + len = 1; + } else if ((c >> 5) == 0x06) { + cp = c & 0x1F; + len = 2; + } else if ((c >> 4) == 0x0E) { + cp = c & 0x0F; + len = 3; + } else if ((c >> 3) == 0x1E) { + cp = c & 0x07; + len = 4; + } else { + throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, + "Kokoro: invalid UTF-8 in phoneme string"); + } + if (i + len > utf8.size()) { + throw RnExecutorchError( + RnExecutorchErrorCode::InvalidUserInput, + "Kokoro: truncated UTF-8 sequence in phoneme string"); + } + for (size_t j = 1; j < len; j++) { + cp = (cp << 6) | (static_cast(utf8[i + j]) & 0x3F); + } + result.push_back(cp); + i += len; } + return result; +} - // G2P 
(Grapheme to Phoneme) conversion - auto phonemes = phonemizer_.process(text); - - // Divide the phonemes string intro substrings. - // Affects the further calculations only in case of string size - // exceeding the biggest model's input. +std::vector +Kokoro::generateFromPhonemesImpl(const std::u32string &phonemes, float speed) { auto subsentences = partitioner_.divide(phonemes); std::vector audio = {}; for (const auto &subsentence : subsentences) { - // Generate an audio vector with the Kokoro model auto audioPart = synthesize(subsentence, speed); - // Calculate a pause between the sentences char32_t lastPhoneme = subsentence.back(); size_t pauseMs = params::kPauseValues.contains(lastPhoneme) ? params::kPauseValues.at(lastPhoneme) : params::kDefaultPause; std::vector pause(pauseMs * constants::kSamplesPerMilisecond, 0.F); - // Add audio part and pause to the main audio vector audio.insert(audio.end(), std::make_move_iterator(audioPart.begin()), std::make_move_iterator(audioPart.end())); audio.insert(audio.end(), std::make_move_iterator(pause.begin()), @@ -110,14 +137,9 @@ std::vector Kokoro::generate(std::string text, float speed) { return audio; } -void Kokoro::stream(std::string text, float speed, - std::shared_ptr callback) { - if (text.size() > params::kMaxTextSize) { - throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, - "Kokoro: maximum input text size exceeded"); - } - - // Build a full callback function +void Kokoro::streamFromPhonemesImpl( + const std::u32string &phonemes, float speed, + std::shared_ptr callback) { auto nativeCallback = [this, callback](const std::vector &audioVec) { if (this->isStreaming_) { this->callInvoker_->invokeAsync([callback, audioVec](jsi::Runtime &rt) { @@ -127,21 +149,11 @@ void Kokoro::stream(std::string text, float speed, } }; - // Mark the beginning of the streaming process isStreaming_ = true; - // G2P (Grapheme to Phoneme) conversion - auto phonemes = phonemizer_.process(text); - - // Divide the phonemes 
string intro substrings. - // Use specialized implementation to minimize the latency between the - // sentences. auto subsentences = partitioner_.divide(phonemes); - // We follow the implementation of generate() method, but - // instead of accumulating results in a vector, we push them - // back to the JS side with the callback. for (size_t i = 0; i < subsentences.size(); i++) { if (!isStreaming_) { break; @@ -149,37 +161,64 @@ void Kokoro::stream(std::string text, float speed, const auto &subsentence = subsentences[i]; - // Determine the silent padding duration to be stripped from the edges of - // the generated audio. If a chunk ends with a space or follows one that - // did, it indicates a word boundary split – we use a shorter padding (20ms) - // to ensure natural speech flow. Otherwise, we use 50ms for standard - // pauses. bool endsWithSpace = (subsentence.back() == U' '); bool prevEndsWithSpace = (i > 0 && subsentences[i - 1].back() == U' '); - size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 15 : 50; // [ms] + size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 15 : 50; - // Generate an audio vector with the Kokoro model auto audioPart = synthesize(subsentence, speed, paddingMs); - // Calculate a pause between the sentences char32_t lastPhoneme = subsentence.back(); size_t pauseMs = params::kPauseValues.contains(lastPhoneme) ? 
params::kPauseValues.at(lastPhoneme) : params::kDefaultPause; std::vector pause(pauseMs * constants::kSamplesPerMilisecond, 0.F); - // Add pause to the audio vector audioPart.insert(audioPart.end(), std::make_move_iterator(pause.begin()), std::make_move_iterator(pause.end())); - // Push the audio right away to the JS side nativeCallback(audioPart); } - // Mark the end of the streaming process isStreaming_ = false; } +std::vector Kokoro::generate(std::string text, float speed) { + if (text.size() > params::kMaxTextSize) { + throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, + "Kokoro: maximum input text size exceeded"); + } + + // G2P (Grapheme to Phoneme) conversion + auto phonemes = phonemizer_.process(text); + + return generateFromPhonemesImpl(phonemes, speed); +} + +std::vector Kokoro::generateFromPhonemes(std::string phonemes, + float speed) { + auto phonemes32 = utf8ToUtf32(phonemes); + return generateFromPhonemesImpl(phonemes32, speed); +} + +void Kokoro::stream(std::string text, float speed, + std::shared_ptr callback) { + if (text.size() > params::kMaxTextSize) { + throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, + "Kokoro: maximum input text size exceeded"); + } + + // G2P (Grapheme to Phoneme) conversion + auto phonemes = phonemizer_.process(text); + + streamFromPhonemesImpl(phonemes, speed, callback); +} + +void Kokoro::streamFromPhonemes(std::string phonemes, float speed, + std::shared_ptr callback) { + auto phonemes32 = utf8ToUtf32(phonemes); + streamFromPhonemesImpl(phonemes32, speed, callback); +} + void Kokoro::streamStop() noexcept { isStreaming_ = false; } std::vector Kokoro::synthesize(const std::u32string &phonemes, diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h index f27ba8018..a99435d1c 100644 --- 
a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h @@ -27,11 +27,22 @@ class Kokoro { // Processes the entire text at once, before sending back to the JS side. std::vector generate(std::string text, float speed = 1.F); + // Accepts pre-computed phonemes (as a UTF-8 IPA string) and synthesizes + // audio, bypassing the built-in phonemizer. This allows callers to use + // an external G2P system (e.g. the Python `phonemizer` library, espeak-ng, + // or any custom phonemizer). + std::vector generateFromPhonemes(std::string phonemes, + float speed = 1.F); + // Processes text in chunks, sending each chunk individualy to the JS side // with asynchronous callbacks. void stream(std::string text, float speed, std::shared_ptr callback); + // Streaming variant that accepts pre-computed phonemes instead of text. + void streamFromPhonemes(std::string phonemes, float speed, + std::shared_ptr callback); + // Stops the streaming process void streamStop() noexcept; @@ -42,6 +53,15 @@ class Kokoro { // Helper function - loading voice array void loadVoice(const std::string &voiceSource); + // Helper function - convert UTF-8 string to UTF-32 for phoneme processing + static std::u32string utf8ToUtf32(const std::string &utf8); + + // Helper function - shared synthesis pipeline (partition + synthesize) + std::vector generateFromPhonemesImpl(const std::u32string &phonemes, + float speed); + void streamFromPhonemesImpl(const std::u32string &phonemes, float speed, + std::shared_ptr callback); + // Helper function - generate specialization for given input size std::vector synthesize(const std::u32string &phonemes, float speed, size_t paddingMs = 50); diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts index 
b29b4bc8d..c1e1a2760 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts @@ -3,8 +3,10 @@ import { TextToSpeechModule } from '../../modules/natural_language_processing/Te import { TextToSpeechProps, TextToSpeechInput, + TextToSpeechPhonemeInput, TextToSpeechType, TextToSpeechStreamingInput, + TextToSpeechStreamingPhonemeInput, } from '../../types/tts'; import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; @@ -81,6 +83,28 @@ export const useTextToSpeech = ({ } }; + const forwardFromPhonemes = async (input: TextToSpeechPhonemeInput) => { + if (!isReady) + throw new RnExecutorchError( + RnExecutorchErrorCode.ModuleNotLoaded, + 'The model is currently not loaded. Please load the model before calling forwardFromPhonemes().' + ); + if (isGenerating) + throw new RnExecutorchError( + RnExecutorchErrorCode.ModelGenerating, + 'The model is currently generating. Please wait until previous model run is complete.' + ); + try { + setIsGenerating(true); + return await moduleInstance.forwardFromPhonemes( + input.phonemes, + input.speed ?? 1.0 + ); + } finally { + setIsGenerating(false); + } + }; + const stream = useCallback( async (input: TextToSpeechStreamingInput) => { if (!isReady) @@ -112,12 +136,45 @@ export const useTextToSpeech = ({ [isReady, isGenerating, moduleInstance] ); + const streamFromPhonemes = useCallback( + async (input: TextToSpeechStreamingPhonemeInput) => { + if (!isReady) + throw new RnExecutorchError( + RnExecutorchErrorCode.ModuleNotLoaded, + 'The model is currently not loaded. Please load the model before calling streamFromPhonemes().' + ); + if (isGenerating) + throw new RnExecutorchError( + RnExecutorchErrorCode.ModelGenerating, + 'The model is currently generating. Please wait until previous model run is complete.' 
+ ); + setIsGenerating(true); + try { + await input.onBegin?.(); + for await (const audio of moduleInstance.streamFromPhonemes({ + phonemes: input.phonemes, + speed: input.speed ?? 1.0, + })) { + if (input.onNext) { + await input.onNext(audio); + } + } + } finally { + await input.onEnd?.(); + setIsGenerating(false); + } + }, + [isReady, isGenerating, moduleInstance] + ); + return { error, isReady, isGenerating, forward, + forwardFromPhonemes, stream, + streamFromPhonemes, streamStop: moduleInstance.streamStop, downloadProgress, }; diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts index 849c25676..d99a6312b 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts @@ -5,6 +5,7 @@ import { KokoroConfig, TextToSpeechConfig, TextToSpeechStreamingInput, + TextToSpeechStreamingPhonemeInput, VoiceConfig, } from '../../types/tts'; import { Logger } from '../../common/Logger'; @@ -118,6 +119,27 @@ export class TextToSpeechModule { return await this.nativeModule.generate(text, speed); } + /** + * Synthesizes pre-computed phonemes into speech, bypassing the built-in phonemizer. + * This allows using an external G2P system (e.g. the Python `phonemizer` library, + * espeak-ng, or any custom phonemizer). + * + * @param phonemes The pre-computed IPA phoneme string. + * @param speed Optional speed multiplier for the speech synthesis (default is 1.0). + * @returns A promise resolving to the synthesized audio waveform. + */ + public async forwardFromPhonemes( + phonemes: string, + speed: number = 1.0 + ): Promise { + if (this.nativeModule == null) + throw new RnExecutorchError( + RnExecutorchErrorCode.ModuleNotLoaded, + 'The model is currently not loaded. 
Please load the model before calling forwardFromPhonemes().' + ); + return await this.nativeModule.generateFromPhonemes(phonemes, speed); + } + /** * Starts a streaming synthesis session. Yields audio chunks as they are generated. * @@ -169,6 +191,61 @@ export class TextToSpeechModule { } } + /** + * Starts a streaming synthesis session from pre-computed phonemes. + * Bypasses the built-in phonemizer, allowing use of external G2P systems. + * + * @param input - Input object containing phonemes and optional speed. + * @returns An async generator yielding Float32Array audio chunks. + */ + public async *streamFromPhonemes({ + phonemes, + speed, + }: TextToSpeechStreamingPhonemeInput): AsyncGenerator { + const queue: Float32Array[] = []; + + let waiter: (() => void) | null = null; + let finished = false; + let error: unknown; + + const wake = () => { + waiter?.(); + waiter = null; + }; + + (async () => { + try { + await this.nativeModule.streamFromPhonemes( + phonemes, + speed, + (audio: number[]) => { + queue.push(new Float32Array(audio)); + wake(); + } + ); + finished = true; + wake(); + } catch (e) { + error = e; + finished = true; + wake(); + } + })(); + + while (true) { + if (queue.length > 0) { + yield queue.shift()!; + if (finished && queue.length === 0) { + return; + } + continue; + } + if (error) throw error; + if (finished) return; + await new Promise((r) => (waiter = r)); + } + } + /** * Stops the streaming process if there is any ongoing. */ diff --git a/packages/react-native-executorch/src/types/tts.ts b/packages/react-native-executorch/src/types/tts.ts index 55937be49..b9c878c3b 100644 --- a/packages/react-native-executorch/src/types/tts.ts +++ b/packages/react-native-executorch/src/types/tts.ts @@ -90,6 +90,21 @@ export interface TextToSpeechInput { speed?: number; } +/** + * Text to Speech module input for pre-computed phonemes. + * Use this when you have your own phonemizer (e.g. 
the Python `phonemizer` + * library, espeak-ng, or any custom G2P system) and want to bypass the + * built-in phonemis pipeline. + * + * @category Types + * @property {string} phonemes - pre-computed IPA phoneme string + * @property {number} [speed] - optional speed argument - the higher it is, the faster the speech becomes + */ +export interface TextToSpeechPhonemeInput { + phonemes: string; + speed?: number; +} + /** * Return type for the `useTextToSpeech` hook. * Manages the state and operations for Text-to-Speech generation. @@ -125,6 +140,18 @@ export interface TextToSpeechType { */ forward: (input: TextToSpeechInput) => Promise; + /** + * Synthesizes pre-computed phonemes into speech audio in a single pass. + * Bypasses the built-in phonemizer, allowing use of external G2P systems. + * + * @param input - The `TextToSpeechPhonemeInput` object containing pre-computed `phonemes` and optional `speed`. + * @returns A Promise that resolves with the generated audio data. + * @throws {RnExecutorchError} If the model is not loaded or is currently generating. + */ + forwardFromPhonemes: ( + input: TextToSpeechPhonemeInput + ) => Promise; + /** * Streams the generated audio data incrementally. * This is optimal for real-time playback, allowing audio to start playing before the full text is synthesized. @@ -134,6 +161,17 @@ export interface TextToSpeechType { */ stream: (input: TextToSpeechStreamingInput) => Promise; + /** + * Streams pre-computed phonemes incrementally, bypassing the built-in phonemizer. + * + * @param input - The streaming input with pre-computed `phonemes` instead of `text`. + * @returns A Promise that resolves when the streaming process is complete. + * @throws {RnExecutorchError} If the model is not loaded or is currently generating. + */ + streamFromPhonemes: ( + input: TextToSpeechStreamingPhonemeInput + ) => Promise; + /** * Interrupts and stops the currently active audio generation stream. 
*/ @@ -158,3 +196,16 @@ export interface TextToSpeechStreamingInput extends TextToSpeechInput { onNext?: (audio: Float32Array) => void | Promise; onEnd?: () => void | Promise; } + +/** + * Streaming input definition for pre-computed phonemes. + * Same as `TextToSpeechStreamingInput` but accepts `phonemes` instead of `text`. + * + * @category Types + */ +export interface TextToSpeechStreamingPhonemeInput + extends TextToSpeechPhonemeInput { + onBegin?: () => void | Promise; + onNext?: (audio: Float32Array) => void | Promise; + onEnd?: () => void | Promise; +} From 5a125c740256d1beeb2cde7434ad0765c794b880 Mon Sep 17 00:00:00 2001 From: contra Date: Sat, 7 Mar 2026 21:16:24 -0800 Subject: [PATCH 2/3] =?UTF-8?q?refactor:=20simplify=20after=20review=20?= =?UTF-8?q?=E2=80=94=20reuse=20phonemis=20utf8=5Fto=5Fu32string,=20extract?= =?UTF-8?q?=20shared=20helpers,=20add=20input=20validation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../models/text_to_speech/kokoro/Kokoro.cpp | 86 ++++++-------- .../models/text_to_speech/kokoro/Kokoro.h | 3 - .../useTextToSpeech.ts | 109 ++++++++---------- .../TextToSpeechModule.ts | 94 +++++---------- .../react-native-executorch/src/types/tts.ts | 32 ++--- 5 files changed, 132 insertions(+), 192 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp index 1fd522676..52da0fc46 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp @@ -3,9 +3,8 @@ #include "Utils.h" #include -#include #include -#include +#include #include #include @@ -75,63 +74,30 @@ void Kokoro::loadVoice(const std::string &voiceSource) { } } -std::u32string Kokoro::utf8ToUtf32(const std::string &utf8) { - 
std::u32string result; - result.reserve(utf8.size()); - size_t i = 0; - while (i < utf8.size()) { - char32_t cp = 0; - unsigned char c = static_cast(utf8[i]); - size_t len = 0; - if (c < 0x80) { - cp = c; - len = 1; - } else if ((c >> 5) == 0x06) { - cp = c & 0x1F; - len = 2; - } else if ((c >> 4) == 0x0E) { - cp = c & 0x0F; - len = 3; - } else if ((c >> 3) == 0x1E) { - cp = c & 0x07; - len = 4; - } else { - throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, - "Kokoro: invalid UTF-8 in phoneme string"); - } - if (i + len > utf8.size()) { - throw RnExecutorchError( - RnExecutorchErrorCode::InvalidUserInput, - "Kokoro: truncated UTF-8 sequence in phoneme string"); - } - for (size_t j = 1; j < len; j++) { - cp = (cp << 6) | (static_cast(utf8[i + j]) & 0x3F); - } - result.push_back(cp); - i += len; - } - return result; -} - std::vector Kokoro::generateFromPhonemesImpl(const std::u32string &phonemes, float speed) { + // Divide the phonemes string into substrings. + // Affects the further calculations only in case of string size + // exceeding the biggest model's input. auto subsentences = partitioner_.divide(phonemes); std::vector audio = {}; for (const auto &subsentence : subsentences) { + // Generate an audio vector with the Kokoro model auto audioPart = synthesize(subsentence, speed); + // Calculate a pause between the sentences char32_t lastPhoneme = subsentence.back(); size_t pauseMs = params::kPauseValues.contains(lastPhoneme) ? 
params::kPauseValues.at(lastPhoneme) : params::kDefaultPause; - std::vector pause(pauseMs * constants::kSamplesPerMilisecond, 0.F); + // Add audio part and silence pause to the main audio vector audio.insert(audio.end(), std::make_move_iterator(audioPart.begin()), std::make_move_iterator(audioPart.end())); - audio.insert(audio.end(), std::make_move_iterator(pause.begin()), - std::make_move_iterator(pause.end())); + audio.resize(audio.size() + pauseMs * constants::kSamplesPerMilisecond, + 0.F); } return audio; @@ -151,6 +117,7 @@ void Kokoro::streamFromPhonemesImpl( isStreaming_ = true; + // Use LATENCY strategy to minimize the time-to-first-audio for streaming auto subsentences = partitioner_.divide(phonemes); @@ -161,21 +128,27 @@ void Kokoro::streamFromPhonemesImpl( const auto &subsentence = subsentences[i]; + // Determine the silent padding duration to be stripped from the edges of + // the generated audio. If a chunk ends with a space or follows one that + // did, it indicates a word boundary split – we use a shorter padding + // to ensure natural speech flow. Otherwise, we use 50ms for standard + // pauses. bool endsWithSpace = (subsentence.back() == U' '); bool prevEndsWithSpace = (i > 0 && subsentences[i - 1].back() == U' '); - size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 15 : 50; + size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 15 : 50; // [ms] + // Generate an audio vector with the Kokoro model auto audioPart = synthesize(subsentence, speed, paddingMs); + // Calculate and append a pause between the sentences char32_t lastPhoneme = subsentence.back(); size_t pauseMs = params::kPauseValues.contains(lastPhoneme) ? 
params::kPauseValues.at(lastPhoneme) : params::kDefaultPause; - std::vector pause(pauseMs * constants::kSamplesPerMilisecond, 0.F); - - audioPart.insert(audioPart.end(), std::make_move_iterator(pause.begin()), - std::make_move_iterator(pause.end())); + audioPart.resize( + audioPart.size() + pauseMs * constants::kSamplesPerMilisecond, 0.F); + // Push the audio right away to the JS side nativeCallback(audioPart); } @@ -196,8 +169,12 @@ std::vector Kokoro::generate(std::string text, float speed) { std::vector Kokoro::generateFromPhonemes(std::string phonemes, float speed) { - auto phonemes32 = utf8ToUtf32(phonemes); - return generateFromPhonemesImpl(phonemes32, speed); + if (phonemes.empty()) { + throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, + "Kokoro: phoneme string must not be empty"); + } + return generateFromPhonemesImpl( + phonemis::utilities::string_utils::utf8_to_u32string(phonemes), speed); } void Kokoro::stream(std::string text, float speed, @@ -215,8 +192,13 @@ void Kokoro::stream(std::string text, float speed, void Kokoro::streamFromPhonemes(std::string phonemes, float speed, std::shared_ptr callback) { - auto phonemes32 = utf8ToUtf32(phonemes); - streamFromPhonemesImpl(phonemes32, speed, callback); + if (phonemes.empty()) { + throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, + "Kokoro: phoneme string must not be empty"); + } + streamFromPhonemesImpl( + phonemis::utilities::string_utils::utf8_to_u32string(phonemes), speed, + callback); } void Kokoro::streamStop() noexcept { isStreaming_ = false; } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h index a99435d1c..d7a4c2ae6 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h @@ -53,9 
+53,6 @@ class Kokoro { // Helper function - loading voice array void loadVoice(const std::string &voiceSource); - // Helper function - convert UTF-8 string to UTF-32 for phoneme processing - static std::u32string utf8ToUtf32(const std::string &utf8); - // Helper function - shared synthesis pipeline (partition + synthesize) std::vector generateFromPhonemesImpl(const std::u32string &phonemes, float speed); diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts index c1e1a2760..1a751f42d 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts @@ -5,6 +5,7 @@ import { TextToSpeechInput, TextToSpeechPhonemeInput, TextToSpeechType, + TextToSpeechStreamingCallbacks, TextToSpeechStreamingInput, TextToSpeechStreamingPhonemeInput, } from '../../types/tts'; @@ -64,17 +65,47 @@ export const useTextToSpeech = ({ preventLoad, ]); - const forward = async (input: TextToSpeechInput) => { + // Shared guard for all generation methods + const guardReady = (methodName: string) => { if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling forward().' + `The model is currently not loaded. Please load the model before calling ${methodName}().` ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, 'The model is currently generating. Please wait until previous model run is complete.' 
); + }; + + // Shared streaming orchestration (guards + onBegin/onNext/onEnd lifecycle) + const runStream = useCallback( + async ( + methodName: string, + generator: AsyncGenerator, + callbacks: TextToSpeechStreamingCallbacks + ) => { + guardReady(methodName); + setIsGenerating(true); + try { + await callbacks.onBegin?.(); + for await (const audio of generator) { + if (callbacks.onNext) { + await callbacks.onNext(audio); + } + } + } finally { + await callbacks.onEnd?.(); + setIsGenerating(false); + } + }, + // eslint-disable-next-line react-hooks/exhaustive-deps + [isReady, isGenerating, moduleInstance] + ); + + const forward = async (input: TextToSpeechInput) => { + guardReady('forward'); try { setIsGenerating(true); return await moduleInstance.forward(input.text, input.speed ?? 1.0); @@ -84,16 +115,7 @@ export const useTextToSpeech = ({ }; const forwardFromPhonemes = async (input: TextToSpeechPhonemeInput) => { - if (!isReady) - throw new RnExecutorchError( - RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling forwardFromPhonemes().' - ); - if (isGenerating) - throw new RnExecutorchError( - RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating. Please wait until previous model run is complete.' - ); + guardReady('forwardFromPhonemes'); try { setIsGenerating(true); return await moduleInstance.forwardFromPhonemes( @@ -107,64 +129,27 @@ export const useTextToSpeech = ({ const stream = useCallback( async (input: TextToSpeechStreamingInput) => { - if (!isReady) - throw new RnExecutorchError( - RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling stream().' - ); - if (isGenerating) - throw new RnExecutorchError( - RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating. Please wait until previous model run is complete.' 
- ); - setIsGenerating(true); - try { - await input.onBegin?.(); - for await (const audio of moduleInstance.stream({ - text: input.text, - speed: input.speed ?? 1.0, - })) { - if (input.onNext) { - await input.onNext(audio); - } - } - } finally { - await input.onEnd?.(); - setIsGenerating(false); - } + await runStream( + 'stream', + moduleInstance.stream({ text: input.text, speed: input.speed ?? 1.0 }), + input + ); }, - [isReady, isGenerating, moduleInstance] + [runStream, moduleInstance] ); const streamFromPhonemes = useCallback( async (input: TextToSpeechStreamingPhonemeInput) => { - if (!isReady) - throw new RnExecutorchError( - RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling streamFromPhonemes().' - ); - if (isGenerating) - throw new RnExecutorchError( - RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating. Please wait until previous model run is complete.' - ); - setIsGenerating(true); - try { - await input.onBegin?.(); - for await (const audio of moduleInstance.streamFromPhonemes({ + await runStream( + 'streamFromPhonemes', + moduleInstance.streamFromPhonemes({ phonemes: input.phonemes, speed: input.speed ?? 
1.0, - })) { - if (input.onNext) { - await input.onNext(audio); - } - } - } finally { - await input.onEnd?.(); - setIsGenerating(false); - } + }), + input + ); }, - [isReady, isGenerating, moduleInstance] + [runStream, moduleInstance] ); return { diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts index d99a6312b..932f166e7 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts @@ -99,6 +99,14 @@ export class TextToSpeechModule { } } + private ensureLoaded(methodName: string): void { + if (this.nativeModule == null) + throw new RnExecutorchError( + RnExecutorchErrorCode.ModuleNotLoaded, + `The model is currently not loaded. Please load the model before calling ${methodName}().` + ); + } + /** * Synthesizes the provided text into speech. * Returns a promise that resolves to the full audio waveform as a `Float32Array`. @@ -111,11 +119,7 @@ export class TextToSpeechModule { text: string, speed: number = 1.0 ): Promise { - if (this.nativeModule == null) - throw new RnExecutorchError( - RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling forward().' - ); + this.ensureLoaded('forward'); return await this.nativeModule.generate(text, speed); } @@ -132,25 +136,17 @@ export class TextToSpeechModule { phonemes: string, speed: number = 1.0 ): Promise { - if (this.nativeModule == null) - throw new RnExecutorchError( - RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling forwardFromPhonemes().' 
- ); + this.ensureLoaded('forwardFromPhonemes'); return await this.nativeModule.generateFromPhonemes(phonemes, speed); } /** - * Starts a streaming synthesis session. Yields audio chunks as they are generated. - * - * @param input - Input object containing text and optional speed. - * @returns An async generator yielding Float32Array audio chunks. + * Shared streaming implementation. Wraps a native streaming call in an + * async generator that yields Float32Array audio chunks as they arrive. */ - public async *stream({ - text, - speed, - }: TextToSpeechStreamingInput): AsyncGenerator { - // Stores computed audio segments + private async *streamImpl( + nativeCall: (cb: (audio: number[]) => void) => Promise + ): AsyncGenerator { const queue: Float32Array[] = []; let waiter: (() => void) | null = null; @@ -164,7 +160,7 @@ export class TextToSpeechModule { (async () => { try { - await this.nativeModule.stream(text, speed, (audio: number[]) => { + await nativeCall((audio: number[]) => { queue.push(new Float32Array(audio)); wake(); }); @@ -191,6 +187,19 @@ export class TextToSpeechModule { } } + /** + * Starts a streaming synthesis session. Yields audio chunks as they are generated. + * + * @param input - Input object containing text and optional speed. + * @returns An async generator yielding Float32Array audio chunks. + */ + public async *stream({ + text, + speed, + }: TextToSpeechStreamingInput): AsyncGenerator { + yield* this.streamImpl((cb) => this.nativeModule.stream(text, speed, cb)); + } + /** * Starts a streaming synthesis session from pre-computed phonemes. * Bypasses the built-in phonemizer, allowing use of external G2P systems. 
@@ -202,48 +211,9 @@ export class TextToSpeechModule { phonemes, speed, }: TextToSpeechStreamingPhonemeInput): AsyncGenerator { - const queue: Float32Array[] = []; - - let waiter: (() => void) | null = null; - let finished = false; - let error: unknown; - - const wake = () => { - waiter?.(); - waiter = null; - }; - - (async () => { - try { - await this.nativeModule.streamFromPhonemes( - phonemes, - speed, - (audio: number[]) => { - queue.push(new Float32Array(audio)); - wake(); - } - ); - finished = true; - wake(); - } catch (e) { - error = e; - finished = true; - wake(); - } - })(); - - while (true) { - if (queue.length > 0) { - yield queue.shift()!; - if (finished && queue.length === 0) { - return; - } - continue; - } - if (error) throw error; - if (finished) return; - await new Promise((r) => (waiter = r)); - } + yield* this.streamImpl((cb) => + this.nativeModule.streamFromPhonemes(phonemes, speed, cb) + ); } /** diff --git a/packages/react-native-executorch/src/types/tts.ts b/packages/react-native-executorch/src/types/tts.ts index b9c878c3b..4df331494 100644 --- a/packages/react-native-executorch/src/types/tts.ts +++ b/packages/react-native-executorch/src/types/tts.ts @@ -94,7 +94,7 @@ export interface TextToSpeechInput { * Text to Speech module input for pre-computed phonemes. * Use this when you have your own phonemizer (e.g. the Python `phonemizer` * library, espeak-ng, or any custom G2P system) and want to bypass the - * built-in phonemis pipeline. + * built-in phonemizer pipeline. * * @category Types * @property {string} phonemes - pre-computed IPA phoneme string @@ -179,24 +179,33 @@ export interface TextToSpeechType { } /** - * Text to Speech streaming input definition - * - * Streaming mode in T2S is synchronized by passing specific callbacks - * executed at given moments of the streaming. - * Actions such as playing the audio should happen within the onNext callback. - * Callbacks can be both synchronous or asynchronous. 
+ * Shared streaming lifecycle callbacks for TTS streaming modes. * * @category Types * @property {() => void | Promise} [onBegin] - Called when streaming begins * @property {(audio: Float32Array) => void | Promise} [onNext] - Called after each audio chunk gets calculated. * @property {() => void | Promise} [onEnd] - Called when streaming ends */ -export interface TextToSpeechStreamingInput extends TextToSpeechInput { +export interface TextToSpeechStreamingCallbacks { onBegin?: () => void | Promise; onNext?: (audio: Float32Array) => void | Promise; onEnd?: () => void | Promise; } +/** + * Text to Speech streaming input definition + * + * Streaming mode in T2S is synchronized by passing specific callbacks + * executed at given moments of the streaming. + * Actions such as playing the audio should happen within the onNext callback. + * Callbacks can be both synchronous or asynchronous. + * + * @category Types + */ +export interface TextToSpeechStreamingInput + extends TextToSpeechInput, + TextToSpeechStreamingCallbacks {} + /** * Streaming input definition for pre-computed phonemes. * Same as `TextToSpeechStreamingInput` but accepts `phonemes` instead of `text`. 
@@ -204,8 +213,5 @@ export interface TextToSpeechStreamingInput extends TextToSpeechInput { * @category Types */ export interface TextToSpeechStreamingPhonemeInput - extends TextToSpeechPhonemeInput { - onBegin?: () => void | Promise; - onNext?: (audio: Float32Array) => void | Promise; - onEnd?: () => void | Promise; -} + extends TextToSpeechPhonemeInput, + TextToSpeechStreamingCallbacks {} From 377f6596b7a770ae613bd1c434e9feb7442f26a1 Mon Sep 17 00:00:00 2001 From: IgorSwat Date: Mon, 9 Mar 2026 10:11:12 +0100 Subject: [PATCH 3/3] Update docs & formatting --- .cspell-wordlist.txt | 1 + .../useTextToSpeech.md | 61 +++++++++++++++++-- .../TextToSpeechModule.md | 47 ++++++++++++-- .../react-native-executorch/src/types/tts.ts | 6 +- 4 files changed, 101 insertions(+), 14 deletions(-) diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index 2238f7142..6b23cdc46 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -127,3 +127,4 @@ detr metaprogramming ktlint lefthook +espeak \ No newline at end of file diff --git a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md index b52726c9e..10e9986de 100644 --- a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md +++ b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md @@ -82,17 +82,24 @@ You need more details? Check the following resources: ## Running the model -The module provides two ways to generate speech: +The module provides two ways to generate speech using either raw text or pre-generated phonemes: -1. [**`forward(text, speed)`**](../../06-api-reference/interfaces/TextToSpeechType.md#forward): Generates the complete audio waveform at once. Returns a promise resolving to a `Float32Array`. +### Using Text + +1. [**`forward({ text, speed })`**](../../06-api-reference/interfaces/TextToSpeechType.md#forward): Generates the complete audio waveform at once. 
Returns a promise resolving to a `Float32Array`. +2. [**`stream({ text, speed, onNext, ... })`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences. + +### Using Phonemes + +If you have pre-computed phonemes (e.g., from an external dictionary or a custom G2P model), you can skip the internal phoneme generation step: + +1. [**`forwardFromPhonemes({ phonemes, speed })`**](../../06-api-reference/interfaces/TextToSpeechType.md#forwardfromphonemes): Generates the complete audio waveform from a phoneme string. +2. [**`streamFromPhonemes({ phonemes, speed, onNext, ... })`**](../../06-api-reference/interfaces/TextToSpeechType.md#streamfromphonemes): Streams audio chunks generated from a phoneme string. :::note -Since it processes the entire text at once, it might take a significant amount of time to produce an audio for long text inputs. +Since `forward` and `forwardFromPhonemes` process the entire input at once, they might take a significant amount of time to produce audio for long inputs. ::: -2. [**`stream({ text, speed })`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator that yields chunks of audio as they are computed. - This is ideal for reducing the "time to first audio" for long sentences. - ## Example ### Speech Synthesis @@ -185,6 +192,48 @@ export default function App() { } ``` +### Synthesis from Phonemes + +If you already have a phoneme string obtained from an external source (e.g. the Python `phonemizer` library, +`espeak-ng`, or any custom phonemizer), you can use `forwardFromPhonemes` or `streamFromPhonemes` to synthesize audio directly, skipping the phoneme generation stage. 
+ +```tsx +import React from 'react'; +import { Button, View } from 'react-native'; +import { + useTextToSpeech, + KOKORO_MEDIUM, + KOKORO_VOICE_AF_HEART, +} from 'react-native-executorch'; + +export default function App() { + const tts = useTextToSpeech({ + model: KOKORO_MEDIUM, + voice: KOKORO_VOICE_AF_HEART, + }); + + const synthesizePhonemes = async () => { + // Example phonemes for "Hello" + const audioData = await tts.forwardFromPhonemes({ + phonemes: + 'ɐ mˈæn hˌu dˈʌzᵊnt tɹˈʌst hɪmsˈɛlf, kæn nˈɛvəɹ ɹˈiᵊli tɹˈʌst ˈɛniwˌʌn ˈɛls.', + }); + + // ... process or play audioData ... + }; + + return ( + +