diff --git a/apps/speech/App.tsx b/apps/speech/App.tsx index ab036678e..ddbfe3c98 100644 --- a/apps/speech/App.tsx +++ b/apps/speech/App.tsx @@ -5,6 +5,7 @@ import { SpeechToTextScreen } from './screens/SpeechToTextScreen'; import ColorPalette from './colors'; import ExecutorchLogo from './assets/executorch.svg'; import { Quiz } from './screens/Quiz'; +import { TextToSpeechLLMScreen } from './screens/TextToSpeechLLMScreen'; import { initExecutorch } from 'react-native-executorch'; import { ExpoResourceFetcher } from '@react-native-executorch/expo-resource-fetcher'; @@ -14,7 +15,7 @@ initExecutorch({ export default function App() { const [currentScreen, setCurrentScreen] = useState< - 'menu' | 'speech-to-text' | 'text-to-speech' | 'quiz' + 'menu' | 'speech-to-text' | 'text-to-speech' | 'quiz' | 'text-to-speech-llm' >('menu'); const goToMenu = () => setCurrentScreen('menu'); @@ -31,6 +32,10 @@ export default function App() { return ; } + if (currentScreen === 'text-to-speech-llm') { + return ; + } + return ( @@ -54,6 +59,12 @@ export default function App() { > Text to Speech - Quiz + setCurrentScreen('text-to-speech-llm')} + > + Text to Speech - LLM Streaming + ); diff --git a/apps/speech/screens/TextToSpeechLLMScreen.tsx b/apps/speech/screens/TextToSpeechLLMScreen.tsx new file mode 100644 index 000000000..73df0f299 --- /dev/null +++ b/apps/speech/screens/TextToSpeechLLMScreen.tsx @@ -0,0 +1,324 @@ +import React, { useEffect, useState, useRef } from 'react'; +import { + View, + Text, + StyleSheet, + TouchableOpacity, + ScrollView, +} from 'react-native'; +import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; +import FontAwesome from '@expo/vector-icons/FontAwesome'; +import SWMIcon from '../assets/swm_icon.svg'; +import { + useLLM, + useTextToSpeech, + KOKORO_MEDIUM, + KOKORO_VOICE_AF_HEART, + LLAMA3_2_1B_QLORA, +} from 'react-native-executorch'; +import { + AudioManager, + AudioContext, + AudioBuffer, + AudioBufferSourceNode, +} from 
'react-native-audio-api'; + +interface TextToSpeechLLMProps { + onBack: () => void; +} + +/** + * Converts an audio vector (Float32Array) to an AudioBuffer for playback + * @param audioVector - The generated audio samples from the model + * @param sampleRate - The sample rate (default: 24000 Hz for Kokoro) + * @returns AudioBuffer ready for playback + */ +const createAudioBufferFromVector = ( + audioVector: Float32Array, + audioContext: AudioContext, + sampleRate: number = 24000 +): AudioBuffer => { + const audioBuffer = audioContext.createBuffer( + 1, + audioVector.length, + sampleRate + ); + const channelData = audioBuffer.getChannelData(0); + channelData.set(audioVector); + + return audioBuffer; +}; + +export const TextToSpeechLLMScreen = ({ onBack }: TextToSpeechLLMProps) => { + const [displayText, setDisplayText] = useState(''); + const [isTtsStreaming, setIsTtsStreaming] = useState(false); + const llm = useLLM({ model: LLAMA3_2_1B_QLORA }); + const tts = useTextToSpeech({ + model: KOKORO_MEDIUM, + voice: KOKORO_VOICE_AF_HEART, + }); + + const processedLengthRef = useRef(0); + const audioContextRef = useRef(null); + const sourceRef = useRef(null); + + useEffect(() => { + AudioManager.setAudioSessionOptions({ + iosCategory: 'playAndRecord', + iosMode: 'spokenAudio', + iosOptions: ['defaultToSpeaker'], + }); + + audioContextRef.current = new AudioContext({ sampleRate: 24000 }); + audioContextRef.current.suspend(); + + return () => { + audioContextRef.current?.close(); + audioContextRef.current = null; + }; + }, []); + + // Update displayText gradually as response gets generated and insert new text chunks into TTS stream + useEffect(() => { + if (llm.response && tts.isReady) { + setDisplayText(llm.response); + + const previousLength = processedLengthRef.current; + if (llm.response.length > previousLength) { + const newChunk = llm.response.slice(previousLength); + tts.streamInsert(newChunk); + processedLengthRef.current = llm.response.length; + } + } else { + 
processedLengthRef.current = 0; + } + }, [llm.response, tts]); + + const handleGenerate = async () => { + setDisplayText(''); + processedLengthRef.current = 0; + setIsTtsStreaming(true); + + const startTTS = async () => { + try { + const audioContext = audioContextRef.current; + if (!audioContext) return; + + if (audioContext.state === 'suspended') { + await audioContext.resume(); + } + + const onNext = async (audioVec: Float32Array) => { + return new Promise((resolve) => { + const audioBuffer = createAudioBufferFromVector( + audioVec, + audioContext, + 24000 + ); + + const source = (sourceRef.current = + audioContext.createBufferSource()); + source.buffer = audioBuffer; + source.connect(audioContext.destination); + + source.onEnded = () => resolve(); + + source.start(); + }); + }; + + await tts.stream({ + text: '', + speed: 0.9, + stopAutomatically: false, + onNext, + }); + } catch (e) { + console.error('TTS streaming error:', e); + } finally { + setIsTtsStreaming(false); + } + }; + + const ttsPromise = startTTS(); + + try { + await llm.sendMessage( + 'Generate a short story about a robot learning to paint. The story should be around 200 words long.' 
+ ); + } catch (e) { + console.error('Generation failed:', e); + } finally { + tts.streamStop(false); + await ttsPromise; + + if ( + audioContextRef.current && + audioContextRef.current.state === 'running' + ) { + await audioContextRef.current.suspend(); + } + } + }; + + const handleStop = () => { + llm.interrupt(); + tts.streamStop(true); + if (sourceRef.current) { + try { + sourceRef.current.stop(); + } catch (e) { + // Source might have already stopped or disconnected + } + } + }; + + const isProcessing = llm.isGenerating || isTtsStreaming; + const isModelsReady = llm.isReady && tts.isReady; + + const getModelStatus = () => { + if (llm.error) return `LLM Error: ${llm.error.message}`; + if (tts.error) return `TTS Error: ${tts.error.message}`; + if (!llm.isReady) + return `Loading LLM: ${(100 * llm.downloadProgress).toFixed(2)}%`; + if (!tts.isReady) + return `Loading TTS: ${(100 * tts.downloadProgress).toFixed(2)}%`; + if (isProcessing) return 'Generating/Streaming...'; + return 'Ready'; + }; + + return ( + + + + + + + + React Native ExecuTorch + LLM to Speech Demo + + + + Status: {getModelStatus()} + + + + Generated Story + + + + {displayText || + (isModelsReady + ? 'Press the button to generate a story and hear it spoken aloud.' + : 'Please wait for models to load...')} + + + + + + + {isProcessing ? 
( + + + Stop Generation + + ) : ( + + + Generate & Stream Speech + + )} + + + + ); +}; + +const styles = StyleSheet.create({ + container: { + flex: 1, + alignItems: 'center', + backgroundColor: 'white', + paddingHorizontal: 16, + }, + header: { + alignItems: 'center', + position: 'relative', + width: '100%', + }, + backButton: { + position: 'absolute', + left: 0, + top: 10, + padding: 10, + zIndex: 1, + }, + headerText: { + fontSize: 22, + fontWeight: 'bold', + color: '#0f186e', + }, + statusContainer: { + marginTop: 12, + alignItems: 'center', + }, + contentContainer: { + width: '100%', + marginTop: 24, + flex: 1, + marginBottom: 24, + }, + label: { + marginLeft: 12, + marginBottom: 4, + color: '#0f186e', + fontWeight: '600', + }, + responseContainer: { + borderRadius: 12, + borderWidth: 1, + borderColor: '#0f186e', + flex: 1, + }, + responseContent: { + padding: 12, + }, + responseText: { + fontSize: 16, + color: '#333', + lineHeight: 24, + }, + buttonContainer: { + marginBottom: 24, + width: '100%', + }, + actionButton: { + backgroundColor: '#0f186e', + flexDirection: 'row', + justifyContent: 'center', + alignItems: 'center', + padding: 12, + borderRadius: 12, + gap: 8, + }, + stopButton: { + backgroundColor: '#ff4444', + }, + buttonText: { + color: 'white', + fontWeight: '600', + letterSpacing: -0.5, + fontSize: 16, + }, + disabled: { + opacity: 0.5, + }, +}); diff --git a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md index b52726c9e..8bd1d07e2 100644 --- a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md +++ b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md @@ -90,8 +90,8 @@ The module provides two ways to generate speech: Since it processes the entire text at once, it might take a significant amount of time to produce an audio for long text inputs. ::: -2. 
[**`stream({ text, speed })`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator that yields chunks of audio as they are computed. - This is ideal for reducing the "time to first audio" for long sentences. +2. [**`stream(input)`**](../../06-api-reference/interfaces/TextToSpeechType.md#stream): An async generator-like functionality (managed via callbacks like `onNext`) that yields chunks of audio as they are computed. + This is ideal for reducing the "time to first audio" for long sentences. You can also dynamically insert text during the generation process using `streamInsert(text)` and stop it with `streamStop(instant)`. ## Example @@ -160,8 +160,11 @@ export default function App() { const generateStream = async () => { const ctx = contextRef.current; + // Instead of using streamInsert() directly, we can pass initial text to the stream() method await tts.stream({ - text: "This is a longer text, which is being streamed chunk by chunk. Let's see how it works!", + text: "This is an initial text, which is being streamed chunk by chunk. Let's see how it works!", + onBegin: async () => console.log('Started streaming'), + onEnd: async () => console.log('Finished streaming'), onNext: async (chunk) => { return new Promise((resolve) => { const buffer = ctx.createBuffer(1, chunk.length, 24000); diff --git a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md index bc297ecf4..5f2a66617 100644 --- a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md +++ b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md @@ -61,7 +61,7 @@ The module provides two ways to generate speech: Since it processes the entire text at once, it might take a significant amount of time to produce an audio for long text inputs. ::: -2. 
[**`stream({ text, speed })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences. +2. [**`stream({ speed, stopAutomatically })`**](../../06-api-reference/classes/TextToSpeechModule.md#stream): An async generator that yields chunks of audio as they are computed. This is ideal for reducing the "time to first audio" for long sentences. In contrast to `forward`, it enables inserting text chunks dynamically into processing buffer with [**`streamInsert(text)`**](../../06-api-reference/classes/TextToSpeechModule.md#streaminsert) and allows stopping generation early with [**`streamStop(instant)`**](../../06-api-reference/classes/TextToSpeechModule.md#streamstop). ## Example @@ -115,9 +115,12 @@ const audioContext = new AudioContext({ sampleRate: 24000 }); await tts.load({ model: KOKORO_MEDIUM, voice: KOKORO_VOICE_AF_HEART }); try { + // Pre-load the first chunk of text to the buffer + tts.streamInsert('This is a streaming test, with a sample input.'); + for await (const chunk of tts.stream({ - text: 'This is a streaming test, with a sample input.', speed: 1.0, + stopAutomatically: true, // Will stop the stream automatically after clearing the input buffer })) { // Play each chunk sequentially await new Promise((resolve) => { diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index d6489c9be..9688eef15 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -169,6 +169,12 @@ template class ModelHostObject : public JsiHostObject { addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject, promiseHostFunction<&Model::stream>, "stream")); + addFunctions(JSI_EXPORT_FUNCTION( 
+ ModelHostObject, synchronousHostFunction<&Model::streamInsert>, + "streamInsert")); + addFunctions(JSI_EXPORT_FUNCTION( + ModelHostObject, synchronousHostFunction<&Model::streamStop>, + "streamStop")); } if constexpr (meta::HasGenerateFromString) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h index 3bc7f7f83..050fb902c 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h @@ -34,6 +34,10 @@ inline constexpr int32_t kSamplingRate = 24000; // Corresponds to Kokoro's model audio frequency inline constexpr int32_t kSamplesPerMilisecond = kSamplingRate / 1000; +// Special text characters +inline const std::unordered_set kEndOfSentenceCharacters = {'.', '?', '!', + ';'}; + // Special phonemes inline const std::unordered_set kEndOfSentencePhonemes = { U'.', U'?', U'!', U';', U'…'}; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp index d73fb6205..7ead055f9 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp @@ -6,6 +6,9 @@ #include #include #include +#include + +#include namespace rnexecutorch::models::text_to_speech::kokoro { @@ -110,13 +113,8 @@ std::vector Kokoro::generate(std::string text, float speed) { return audio; } -void Kokoro::stream(std::string text, float speed, +void Kokoro::stream(float speed, bool stopOnEmptyBuffer, std::shared_ptr callback) { - if (text.size() > params::kMaxTextSize) { - throw 
RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, - "Kokoro: maximum input text size exceeded"); - } - // Build a full callback function auto nativeCallback = [this, callback](const std::vector &audioVec) { if (this->isStreaming_) { @@ -127,60 +125,111 @@ void Kokoro::stream(std::string text, float speed, } }; - // Mark the beginning of the streaming process isStreaming_ = true; + stopOnEmptyBuffer_ = stopOnEmptyBuffer; - // G2P (Grapheme to Phoneme) conversion - auto phonemes = phonemizer_.process(text); - - // Divide the phonemes string intro substrings. - // Use specialized implementation to minimize the latency between the - // sentences. - auto subsentences = - partitioner_.divide(phonemes); - - // We follow the implementation of generate() method, but - // instead of accumulating results in a vector, we push them - // back to the JS side with the callback. - for (size_t i = 0; i < subsentences.size(); i++) { - if (!isStreaming_) { + // The outer streaming loop is responsible for handling the input buffer. + // The extracted text is then passed to the inner loop, which performs a + // standard streaming on a fixed amount of input text. + while (isStreaming_) { + if (inputTextBuffer_.empty() && stopOnEmptyBuffer_) { break; } - const auto &subsentence = subsentences[i]; - - // Determine the silent padding duration to be stripped from the edges of - // the generated audio. If a chunk ends with a space or follows one that - // did, it indicates a word boundary split – we use a shorter padding (20ms) - // to ensure natural speech flow. Otherwise, we use 50ms for standard - // pauses. - bool endsWithSpace = (subsentence.back() == U' '); - bool prevEndsWithSpace = (i > 0 && subsentences[i - 1].back() == U' '); - size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 
15 : 50; // [ms] - - // Generate an audio vector with the Kokoro model - auto audioPart = synthesize(subsentence, speed, paddingMs); - - // Calculate a pause between the sentences - char32_t lastPhoneme = subsentence.back(); - size_t pauseMs = params::kPauseValues.contains(lastPhoneme) - ? params::kPauseValues.at(lastPhoneme) - : params::kDefaultPause; - std::vector pause(pauseMs * constants::kSamplesPerMilisecond, 0.F); - - // Add pause to the audio vector - audioPart.insert(audioPart.end(), std::make_move_iterator(pause.begin()), - std::make_move_iterator(pause.end())); + // Try to find the most recent available end of sentence character. + size_t searchLimit = + std::min(inputTextBuffer_.size(), params::kMaxTextSize); + auto eosIt = std::find_first_of( + inputTextBuffer_.rbegin() + (inputTextBuffer_.size() - searchLimit), + inputTextBuffer_.rend(), constants::kEndOfSentenceCharacters.begin(), + constants::kEndOfSentenceCharacters.end()); + size_t chunkSize = (eosIt != inputTextBuffer_.rend()) + ? std::distance(eosIt, inputTextBuffer_.rend()) + : 0; + + // To maximize the quality of the speech, we try to avoid processing + // chunks which end in the middle of a sentence. + if (chunkSize > 0 || + streamSkippedIterations >= params::kStreamMaxSkippedIterations) { + std::string text = inputTextBuffer_.substr(0, chunkSize); + inputTextBuffer_.erase(0, chunkSize); + + // Now we proceed with a standard streaming logic for fixed-size input. + auto phonemes = phonemizer_.process(text); + + // Divide the phonemes string intro substrings. + // Use specialized implementation to minimize the latency between the + // sentences. + auto subsentences = + partitioner_.divide(phonemes); + + // We follow the implementation of generate() method, but + // instead of accumulating results in a vector, we push them + // back to the JS side with the callback. 
+ for (size_t i = 0; i < subsentences.size(); i++) { + if (!isStreaming_) { + break; + } + + const auto &subsentence = subsentences[i]; + + // Determine the silent padding duration to be stripped from the edges + // of the generated audio. If a chunk ends with a space or follows one + // that did, it indicates a word boundary split – we use a shorter + // padding (20ms) to ensure natural speech flow. Otherwise, we use 50ms + // for standard pauses. + bool endsWithSpace = (subsentence.back() == U' '); + bool prevEndsWithSpace = (i > 0 && subsentences[i - 1].back() == U' '); + size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 15 : 50; // [ms] + + // Generate an audio vector with the Kokoro model + auto audioPart = synthesize(subsentence, speed, paddingMs); + + // Calculate a pause between the sentences + char32_t lastPhoneme = subsentence.back(); + size_t pauseMs = params::kPauseValues.contains(lastPhoneme) + ? params::kPauseValues.at(lastPhoneme) + : params::kDefaultPause; + std::vector pause(pauseMs * constants::kSamplesPerMilisecond, + 0.F); + + // Add pause to the audio vector + audioPart.insert(audioPart.end(), + std::make_move_iterator(pause.begin()), + std::make_move_iterator(pause.end())); + + // Push the audio right away to the JS side + nativeCallback(audioPart); + } + + streamSkippedIterations = 0; + } else { + streamSkippedIterations++; + } - // Push the audio right away to the JS side - nativeCallback(audioPart); + // A little bit of pause to not overload the thread. 
+ if (isStreaming_) { + std::this_thread::sleep_for( + std::chrono::milliseconds(params::kStreamPause)); + } } - // Mark the end of the streaming process + inputTextBuffer_.clear(); isStreaming_ = false; + streamSkippedIterations = 0; } -void Kokoro::streamStop() noexcept { isStreaming_ = false; } +void Kokoro::streamInsert(std::string textChunk) noexcept { + inputTextBuffer_.append(textChunk); +} + +void Kokoro::streamStop(bool instant) noexcept { + if (instant) { + isStreaming_ = false; + } else { + stopOnEmptyBuffer_ = true; + } +} std::vector Kokoro::synthesize(const std::u32string &phonemes, float speed, size_t paddingMs) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h index f27ba8018..b7091310c 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h @@ -24,25 +24,50 @@ class Kokoro { const std::string &synthesizerSource, const std::string &voiceSource, std::shared_ptr callInvoker); - // Processes the entire text at once, before sending back to the JS side. + /** + * Processes the entire text at once, before sending back to the JS side. + * + * @param text An input text to be processed. + * @param speed Determines the speed of generated speech. Passed directly to + * the Kokoro model. + */ std::vector generate(std::string text, float speed = 1.F); - // Processes text in chunks, sending each chunk individualy to the JS side - // with asynchronous callbacks. - void stream(std::string text, float speed, + /** + * Processes text from inputTextBuffer_ in chunks, sending each chunk + * individualy to the JS side with asynchronous callbacks. + * + * @param speed Determines the speed of generated speech. Passed directly to + * the Kokoro model. 
+ * @param stopOnEmptyBuffer If true, the streaming ends automatically when the + * input buffer is empty. + * @param callback A callback to the JS side. + */ + void stream(float speed, bool stopOnEmptyBuffer, std::shared_ptr callback); - // Stops the streaming process - void streamStop() noexcept; + /** + * Updates the input streaming buffer by adding more text to be processed. + * + * @param text A new chunk of text, appended to the end of the input buffer. + */ + void streamInsert(std::string textChunk) noexcept; + + /** + * Stops the streaming process. + * + * @param instant If true, stops the streaming as soon as possible by + * switching the isStreaming_ flag. Otherwise allows to process the rest of + * the buffer first, by switching the stopOnEmptyBuffer_ flag. + */ + void streamStop(bool instant) noexcept; std::size_t getMemoryLowerBound() const noexcept; void unload() noexcept; private: - // Helper function - loading voice array void loadVoice(const std::string &voiceSource); - // Helper function - generate specialization for given input size std::vector synthesize(const std::u32string &phonemes, float speed, size_t paddingMs = 50); @@ -65,8 +90,11 @@ class Kokoro { constants::kMaxInputTokens> voice_; - // Extra control variables + // Streaming state control variables + std::string inputTextBuffer_ = ""; bool isStreaming_ = false; + bool stopOnEmptyBuffer_ = true; + int32_t streamSkippedIterations = 0; }; } // namespace models::text_to_speech::kokoro diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h index f6b910b03..f517db031 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h @@ -20,6 +20,17 @@ namespace rnexecutorch::models::text_to_speech::kokoro::params { */ 
inline constexpr size_t kMaxTextSize = 2048; +/** + * A number of skipped streaming iterations after which we process the remaining + * input no matter how it looks like. + */ +inline constexpr int32_t kStreamMaxSkippedIterations = 3; + +/** + * A size of pause (in miliseconds) applied after each streaming iteration. + */ +inline constexpr int32_t kStreamPause = 200; + /** * A set of punctation - pause values. Determines how much pause (silence) is * being added at the end of each calculated audio vector. This is primarly used diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt index c45ab9107..62842b517 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt +++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt @@ -63,7 +63,6 @@ elseif(ANDROID_ABI STREQUAL "x86_64") set(OPENCV_THIRD_PARTY_LIBS "") endif() - add_library(opencv_deps INTERFACE) target_link_libraries(opencv_deps INTERFACE ${OPENCV_LIBS_DIR}/libopencv_core.a @@ -84,6 +83,12 @@ target_link_options(opencv_deps INTERFACE -fopenmp -static-openmp) add_library(tokenizers_deps INTERFACE) target_include_directories(tokenizers_deps INTERFACE "${TOKENIZERS_DIR}") +# Phonemis +set(LIBS_DIR "${PACKAGE_ROOT}/third-party/android/libs") +set(PHONEMIS_LIBS + "${LIBS_DIR}/phonemis/${ANDROID_ABI}/libphonemis.a" +) + # Source Definitions set(CORE_SOURCES ${RNEXECUTORCH_DIR}/models/BaseModel.cpp @@ -261,3 +266,13 @@ add_rn_test(VerticalOCRTests integration/VerticalOCRTest.cpp ${IMAGE_UTILS_SOURCES} LIBS opencv_deps ) + +add_rn_test(TextToSpeechTests integration/TextToSpeechTest.cpp + SOURCES + ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Kokoro.cpp + ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/DurationPredictor.cpp + ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Synthesizer.cpp + ${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Partitioner.cpp + 
${RNEXECUTORCH_DIR}/models/text_to_speech/kokoro/Utils.cpp + LIBS ${PHONEMIS_LIBS} +) diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToSpeechTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToSpeechTest.cpp new file mode 100644 index 000000000..997de11bb --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextToSpeechTest.cpp @@ -0,0 +1,123 @@ +#include "BaseModelTests.h" +#include "utils/TestUtils.h" +#include +#include +#include + +using namespace rnexecutorch; +using namespace rnexecutorch::models::text_to_speech::kokoro; + +constexpr auto kValidLang = "en-us"; +constexpr auto kValidTaggerPath = "kokoro_en_tagger.json"; +constexpr auto kValidPhonemizerPath = "kokoro_us_lexicon.json"; +constexpr auto kValidDurationPath = "kokoro_duration_predictor.pte"; +constexpr auto kValidSynthesizerPath = "kokoro_synthesizer.pte"; +constexpr auto kValidVoicePath = "kokoro_af_heart.bin"; + +namespace { +bool isAudioValid(const std::vector &audio) { + if (audio.empty()) + return false; + // Check for non-silence (amplitude greater than an arbitrary small noise + // threshold) + for (float sample : audio) { + if (std::abs(sample) > 1e-4f) { + return true; + } + } + return false; +} + +bool isAudioSimilar(const std::vector &audio1, + const std::vector &audio2, float tolerance = 0.1f) { + if (audio1.empty() || audio2.empty()) + return false; + + double sumSqDiff = 0; + size_t steps = std::max(audio1.size(), audio2.size()); + + for (size_t i = 0; i < steps; ++i) { + float idx1 = (static_cast(i) / steps) * audio1.size(); + float idx2 = (static_cast(i) / steps) * audio2.size(); + + float diff = + audio1[static_cast(idx1)] - audio2[static_cast(idx2)]; + sumSqDiff += diff * diff; + } + + double rmse = std::sqrt(sumSqDiff / steps); + if (rmse >= tolerance) { + std::cerr << "Audio structural RMSE difference: " << rmse + << " (tolerance: " << tolerance << ")" << std::endl; 
+ return false; + } + return true; +} + +class KokoroTest : public ::testing::Test { +protected: + void SetUp() override { + try { + model_ = std::make_unique( + kValidLang, kValidTaggerPath, kValidPhonemizerPath, + kValidDurationPath, kValidSynthesizerPath, kValidVoicePath, nullptr); + } catch (...) { + model_ = nullptr; + } + } + + std::unique_ptr model_; +}; +} // namespace + +TEST(TTSCtorTests, InvalidVoicePathThrows) { + EXPECT_THROW(Kokoro(kValidLang, kValidTaggerPath, kValidPhonemizerPath, + kValidDurationPath, kValidSynthesizerPath, + "nonexistent_voice.bin", nullptr), + RnExecutorchError); +} + +TEST_F(KokoroTest, MaxTextSizeExceededThrows) { + if (!model_) { + GTEST_SKIP() << "Model assets not available, skipping test."; + } + std::string hugeText(10000, 'A'); // beyond params::kMaxTextSize + EXPECT_THROW(model_->generate(hugeText, 1.0f), RnExecutorchError); +} + +TEST_F(KokoroTest, EmptyStringReturnsEmptyVector) { + if (!model_) { + GTEST_SKIP() << "Model assets not available, skipping test."; + } + auto result = model_->generate("", 1.0f); + EXPECT_TRUE(result.empty()); +} + +TEST_F(KokoroTest, GenerateReturnsValidAudio) { + if (!model_) { + GTEST_SKIP() << "Model assets not available, skipping test."; + } + auto result = model_->generate("Hello world! 
How are you doing?", 1.0f); + auto reference = test_utils::loadAudioFromFile("test_speech.raw"); + + ASSERT_FALSE(reference.empty()) + << "Reference audio 'test_speech.raw' not found."; + + // Compare against an audio waveform obtained from the original + // Kokoro model (PyTorch) + EXPECT_TRUE(isAudioSimilar(result, reference)); +} + +TEST_F(KokoroTest, GenerateSpeedAdjustsAudioLength) { + if (!model_) { + GTEST_SKIP() << "Model assets not available, skipping test."; + } + std::string text = "This is a sentence to test the speed modifications."; + auto resultNormal = model_->generate(text, 1.0f); + auto resultFast = model_->generate(text, 1.5f); + + EXPECT_TRUE(isAudioValid(resultNormal)); + EXPECT_TRUE(isAudioValid(resultFast)); + // Fast speech should result in a noticeably shorter output waveform + EXPECT_LT(resultFast.size(), resultNormal.size()); +} diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/assets/test_speech.raw b/packages/react-native-executorch/common/rnexecutorch/tests/integration/assets/test_speech.raw new file mode 100644 index 000000000..2cf55af04 Binary files /dev/null and b/packages/react-native-executorch/common/rnexecutorch/tests/integration/assets/test_speech.raw differ diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh index 63d738eb3..a3b5ff2eb 100755 --- a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh +++ b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh @@ -32,6 +32,7 @@ TEST_EXECUTABLES=( "TextToImageTests" "OCRTests" "VerticalOCRTests" + "TextToSpeechTests" ) # ============================================================================ @@ -39,6 +40,7 @@ TEST_EXECUTABLES=( # ============================================================================ TEST_ASSETS=( "integration/assets/test_audio_float.raw" + "integration/assets/test_speech.raw" 
"integration/assets/we_are_software_mansion.jpg" ) @@ -66,6 +68,11 @@ MODELS=( "t2i_encoder.pte|https://huggingface.co/software-mansion/react-native-executorch-bk-sdm-tiny/resolve/v0.6.0/text_encoder/model.pte" "t2i_unet.pte|https://huggingface.co/software-mansion/react-native-executorch-bk-sdm-tiny/resolve/v0.6.0/unet/model.256.pte" "t2i_decoder.pte|https://huggingface.co/software-mansion/react-native-executorch-bk-sdm-tiny/resolve/v0.6.0/vae/model.256.pte" + "kokoro_duration_predictor.pte|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/xnnpack/medium/duration_predictor.pte" + "kokoro_synthesizer.pte|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/xnnpack/medium/synthesizer.pte" + "kokoro_af_heart.bin|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/voices/af_heart.bin" + "kokoro_us_lexicon.json|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/phonemizer/us_merged.json" + "kokoro_en_tagger.json|https://huggingface.co/software-mansion/react-native-executorch-kokoro/resolve/main/phonemizer/tags.json" ) # ============================================================================ diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts index b29b4bc8d..19d1645f2 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts @@ -95,10 +95,14 @@ export const useTextToSpeech = ({ ); setIsGenerating(true); try { + if (input.text) { + moduleInstance.streamInsert(input.text); + } + await input.onBegin?.(); for await (const audio of moduleInstance.stream({ - text: input.text, speed: input.speed ?? 1.0, + stopAutomatically: input.stopAutomatically ?? 
true, })) { if (input.onNext) { await input.onNext(audio); @@ -118,7 +122,8 @@ export const useTextToSpeech = ({ isGenerating, forward, stream, - streamStop: moduleInstance.streamStop, + streamInsert: (text: string) => moduleInstance.streamInsert(text), + streamStop: (instant: boolean = true) => moduleInstance.streamStop(instant), downloadProgress, }; }; diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts index 849c25676..bbc36bdad 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts @@ -15,11 +15,10 @@ import { Logger } from '../../common/Logger'; * @category Typescript API */ export class TextToSpeechModule { - /** - * Native module instance - */ nativeModule: any = null; + streamFinished: boolean = false; + /** * Loads the model and voice assets specified by the config object. * `onDownloadProgressCallback` allows you to monitor the current progress. @@ -125,16 +124,17 @@ export class TextToSpeechModule { * @returns An async generator yielding Float32Array audio chunks. 
*/ + public async *stream({ - text, speed, + stopAutomatically, }: TextToSpeechStreamingInput): AsyncGenerator { // Stores computed audio segments const queue: Float32Array[] = []; let waiter: (() => void) | null = null; - let finished = false; let error: unknown; + this.streamFinished = false; + const wake = () => { waiter?.(); waiter = null; @@ -142,38 +142,53 @@ (async () => { try { - await this.nativeModule.stream(text, speed, (audio: number[]) => { - queue.push(new Float32Array(audio)); - wake(); - }); - finished = true; + await this.nativeModule.stream( + speed, + stopAutomatically, + (audio: number[]) => { + queue.push(new Float32Array(audio)); + wake(); + } + ); + this.streamFinished = true; wake(); } catch (e) { error = e; - finished = true; + this.streamFinished = true; wake(); } })(); - while (true) { + while (!this.streamFinished) { if (queue.length > 0) { yield queue.shift()!; - if (finished && queue.length === 0) { + if (this.streamFinished && queue.length === 0) { return; } continue; } if (error) throw error; - if (finished) return; + if (this.streamFinished) return; await new Promise((r) => (waiter = r)); } } + /** + * Inserts a new text chunk into the buffer to be processed in streaming mode. + */ + public streamInsert(textChunk: string): void { + this.nativeModule.streamInsert(textChunk); + } + /** * Stops the streaming process if there is any ongoing. + * + * @param instant If true, stops the streaming as soon as possible. Otherwise + * allows the module to complete processing for the remainder of the buffer. 
*/ - public streamStop(): void { - this.nativeModule.streamStop(); + public streamStop(instant: boolean = true): void { + this.nativeModule.streamStop(instant); + this.streamFinished = true; } /** diff --git a/packages/react-native-executorch/src/types/tts.ts b/packages/react-native-executorch/src/types/tts.ts index 55937be49..efe4e111f 100644 --- a/packages/react-native-executorch/src/types/tts.ts +++ b/packages/react-native-executorch/src/types/tts.ts @@ -134,10 +134,18 @@ export interface TextToSpeechType { */ stream: (input: TextToSpeechStreamingInput) => Promise; + /** + * Inserts a new text chunk into the buffer to be processed in streaming mode. + */ + streamInsert: (textChunk: string) => void; + /** * Interrupts and stops the currently active audio generation stream. + * + * @param instant If true, stops the streaming as soon as possible. Otherwise + * allows the module to complete processing for the remainder of the buffer. */ - streamStop: () => void; + streamStop: (instant?: boolean) => void; } /** @@ -149,11 +157,17 @@ export interface TextToSpeechType { * Callbacks can be both synchronous or asynchronous. * * @category Types - * @property {() => void | Promise} [onBegin] - Called when streaming begins + * @property {string} [text] - Initial text to be spoken. The streaming input buffer is initially filled with this value. + * @property {number} [speed] - Optional speed argument; higher values increase the speech rate. + * @property {boolean} [stopAutomatically] - If true, streaming will stop automatically when the buffer is empty. + * @property {() => void | Promise} [onBegin] - Called when streaming begins. * @property {(audio: Float32Array) => void | Promise} [onNext] - Called after each audio chunk gets calculated. - * @property {() => void | Promise} [onEnd] - Called when streaming ends + * @property {() => void | Promise} [onEnd] - Called when streaming ends. 
*/ -export interface TextToSpeechStreamingInput extends TextToSpeechInput { +export interface TextToSpeechStreamingInput { + text?: string; + speed?: number; + stopAutomatically?: boolean; onBegin?: () => void | Promise; onNext?: (audio: Float32Array) => void | Promise; onEnd?: () => void | Promise;