dyoburon · DannyNs · Mar 18, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,12 @@
 ## Unreleased
 
 ### Added
+- **JARVIS-style TTS status reports** — Neural voice (edge-tts, en-GB-RyanNeural) speaks concise status after each action ("Got it. 12 words captured", "Located 4 files, sir")
+- **TTS HTTP API server** — `POST http://127.0.0.1:7865/api/speak` endpoint lets any external tool (Claude Code, scripts, etc.) trigger spoken feedback
+- **Feedback hotkey mode** (`Cmd+Shift+F`) — Pastes transcription with TTS endpoint instructions so the receiving LLM can speak back
+- **Chunked TTS playback** — Long text splits into rolling chunks of 2-3 sentences; first chunk plays immediately while the rest generate in the background
+- **Cross-platform TTS** — edge-tts with ffplay/afplay for headless playback on Windows, macOS, and Linux; SAPI/AVSpeechSynthesizer/espeak-ng fallback when offline
+- **TTS config keys** — `tts_enabled`, `tts_voice`, `tts_edge_rate`, `tts_edge_pitch`, `tts_rate`, `tts_volume` in `~/.vibetotext/config.json`
 - **Gemini LLM integration** — New `llm.py` module that uses Google Gemini to clean up rambling voice transcriptions into clear prompts and generate structured implementation plans
 - **Window state persistence** — History app now remembers its position and size between sessions
 - **Startup/stop scripts** — `start-all.sh` and `stop-all.sh` to launch and kill both services in one command

diff --git a/README.md b/README.md
@@ -20,12 +20,20 @@ All implementations share the same SQLite database at `~/.vibetotext/history.db`
 - `Cmd+Shift` — **Greppy** mode with semantic code search
 - `Alt+Shift` — **Cleanup** mode (AI refines rambling into clear prompts)
 - `Cmd+Alt` — **Plan** mode (generates structured implementation plans)
+- `Cmd+Shift+F` — **Feedback** mode (pastes transcription with TTS endpoint instructions so any LLM can speak back)
 
 **Fast Local Transcription**
 - Whisper.cpp for 2-4x faster transcription than Python Whisper
 - Technical vocabulary bias for programming terms
 - Auto-paste to cursor
 
+**JARVIS-style TTS Status Reports**
+- Neural voice (edge-tts, en-GB-RyanNeural) speaks status after each action
+- Chunked playback for long text — the first chunk plays immediately while the rest generates in the background
+- HTTP API server at `http://127.0.0.1:7865` enables any external tool to speak via `POST /api/speak`
+- Configurable voice, rate, pitch, and volume in `~/.vibetotext/config.json`
+- Falls back to platform TTS (SAPI/say/espeak-ng) when offline
+
 ## Analytics & Settings
 
 ![Analytics Dashboard](docs/analytics.png)

diff --git a/macos-native/Sources/Core/ApiServer.swift b/macos-native/Sources/Core/ApiServer.swift
@@ -0,0 +1,237 @@
+import Foundation
+import Network
+
+/// Minimal HTTP API server using NWListener — exposes TTS and status
+/// endpoints for external tools (e.g. DevGlide MCP voice server).
+///
+/// Routes:
+/// - `POST /api/speak`: JSON body with a non-empty `"text"` string, passed to `TtsService.shared.speak`.
+/// - `POST /api/stop`: calls `TtsService.shared.stop()`.
+/// - `GET /api/status`: reports `ConfigStore.shared.ttsEnabled`.
+/// - `OPTIONS` (any path): CORS preflight, answered with 204.
+///
+/// Each connection serves exactly one request and is closed after the response is sent.
+final class ApiServer {
+    private var listener: NWListener?
+    let port: UInt16
+
+    /// Upper bound on the bytes buffered for one request (headers plus body).
+    private static let maxRequestBytes = 65536
+    /// Byte sequence separating the HTTP header section from the body.
+    private static let headerTerminator = Data("\r\n\r\n".utf8)
+    /// CORS headers attached to every response.
+    private static let corsHeaders = [
+        "Access-Control-Allow-Origin: *",
+        "Access-Control-Allow-Methods: GET, POST, OPTIONS",
+        "Access-Control-Allow-Headers: Content-Type",
+    ]
+
+    /// - Parameter port: TCP port to listen on; defaults to 7865.
+    init(port: UInt16 = 7865) {
+        self.port = port
+    }
+
+    // MARK: - Lifecycle
+
+    /// Starts listening on `127.0.0.1:port`. Failures are logged rather than thrown.
+    func start() {
+        guard let endpointPort = NWEndpoint.Port(rawValue: port) else {
+            print("[API] Failed to start: invalid port \(port)")
+            return
+        }
+        // Copied to a local so the state handler below does not capture `self`;
+        // a strong capture would form a listener -> handler -> self -> listener cycle.
+        let boundPort = port
+        do {
+            let params = NWParameters.tcp
+            // Bind to loopback only. A port-only listener accepts connections on every
+            // interface, which would let other hosts on the network trigger speech.
+            params.requiredLocalEndpoint = NWEndpoint.hostPort(host: .ipv4(.loopback), port: endpointPort)
+            let newListener = try NWListener(using: params)
+            newListener.newConnectionHandler = { [weak self] connection in
+                self?.handleConnection(connection)
+            }
+            newListener.stateUpdateHandler = { state in
+                switch state {
+                case .ready:
+                    print("[API] Server running on http://127.0.0.1:\(boundPort)")
+                case .failed(let error):
+                    print("[API] Listener failed: \(error)")
+                default:
+                    break
+                }
+            }
+            listener?.cancel() // a repeated start() must not leave the old listener running
+            listener = newListener
+            newListener.start(queue: .global())
+        } catch {
+            print("[API] Failed to start: \(error)")
+        }
+    }
+
+    /// Stops accepting connections and releases the listener.
+    func stop() {
+        listener?.cancel()
+        listener = nil
+    }
+
+    // MARK: - Connection handling
+
+    private func handleConnection(_ connection: NWConnection) {
+        connection.start(queue: .global())
+        receiveRequest(on: connection, buffered: Data())
+    }
+
+    /// Accumulates bytes until the header section and the `Content-Length` body have
+    /// arrived, then routes the request. A single `receive` is not enough: many clients
+    /// send headers and body in separate TCP segments, which would drop the body.
+    private func receiveRequest(on connection: NWConnection, buffered: Data) {
+        connection.receive(minimumIncompleteLength: 1, maximumLength: Self.maxRequestBytes) { [weak self] data, _, isComplete, error in
+            if let error {
+                print("[API] Receive error: \(error)")
+                connection.cancel()
+                return
+            }
+            guard let self else {
+                connection.cancel()
+                return
+            }
+
+            var buffer = buffered
+            if let data {
+                buffer.append(data)
+            }
+
+            if Self.isCompleteRequest(buffer) || isComplete {
+                // On EOF, route whatever arrived; the handlers reject malformed input with 400.
+                guard !buffer.isEmpty, let request = String(data: buffer, encoding: .utf8) else {
+                    connection.cancel()
+                    return
+                }
+                self.handleHTTP(request: request, connection: connection)
+            } else if buffer.count >= Self.maxRequestBytes {
+                self.sendResponse(connection, status: 413, body: #"{"error":"request too large"}"#)
+            } else {
+                self.receiveRequest(on: connection, buffered: buffer)
+            }
+        }
+    }
+
+    /// Returns `true` once `buffer` contains the full header section and at least
+    /// `Content-Length` body bytes (zero when the header is absent or unparsable).
+    private static func isCompleteRequest(_ buffer: Data) -> Bool {
+        guard let terminator = buffer.range(of: headerTerminator) else {
+            return false
+        }
+        let head = String(decoding: buffer[..<terminator.lowerBound], as: UTF8.self)
+        var expectedBodyBytes = 0
+        for line in head.components(separatedBy: "\r\n").dropFirst() {
+            let field = line.split(separator: ":", maxSplits: 1)
+            guard field.count == 2,
+                  field[0].trimmingCharacters(in: .whitespaces).lowercased() == "content-length" else {
+                continue
+            }
+            expectedBodyBytes = max(0, Int(field[1].trimmingCharacters(in: .whitespaces)) ?? 0)
+            break
+        }
+        return buffer.endIndex - terminator.upperBound >= expectedBodyBytes
+    }
+
+    // MARK: - HTTP routing
+
+    /// Parses the request line and dispatches on `(method, path)`; unknown routes get 404.
+    private func handleHTTP(request: String, connection: NWConnection) {
+        let lines = request.components(separatedBy: "\r\n")
+        guard let requestLine = lines.first else {
+            sendResponse(connection, status: 400, body: #"{"error":"malformed request"}"#)
+            return
+        }
+
+        let parts = requestLine.split(separator: " ", maxSplits: 2)
+        guard parts.count >= 2 else {
+            sendResponse(connection, status: 400, body: #"{"error":"malformed request line"}"#)
+            return
+        }
+
+        let method = String(parts[0])
+        let path = String(parts[1])
+
+        // Handle CORS preflight
+        if method == "OPTIONS" {
+            sendCORSPreflight(connection)
+            return
+        }
+
+        switch (method, path) {
+        case ("POST", "/api/speak"):
+            handleSpeak(request: request, connection: connection)
+
+        case ("POST", "/api/stop"):
+            TtsService.shared.stop()
+            sendResponse(connection, status: 200, body: #"{"status":"stopped"}"#)
+
+        case ("GET", "/api/status"):
+            sendResponse(connection, status: 200, body: #"{"status":"ok","tts_enabled":\#(ConfigStore.shared.ttsEnabled)}"#)
+
+        default:
+            sendResponse(connection, status: 404, body: #"{"error":"not found"}"#)
+        }
+    }
+
+    // MARK: - Endpoint handlers
+
+    /// Handles `POST /api/speak`: extracts the JSON body, validates `"text"`, and
+    /// hands it to `TtsService.shared.speak`. Responds 400 on a missing or invalid body.
+    private func handleSpeak(request: String, connection: NWConnection) {
+        // Extract JSON body after the blank line separating headers from body
+        guard let bodyRange = request.range(of: "\r\n\r\n") else {
+            sendResponse(connection, status: 400, body: #"{"error":"no body"}"#)
+            return
+        }
+        let bodyData = Data(request[bodyRange.upperBound...].utf8)
+
+        guard let json = try? JSONSerialization.jsonObject(with: bodyData) as? [String: Any],
+              let text = json["text"] as? String, !text.isEmpty else {
+            sendResponse(connection, status: 400, body: #"{"error":"missing or empty \"text\" field"}"#)
+            return
+        }
+
+        TtsService.shared.speak(text)
+        sendResponse(connection, status: 200, body: #"{"status":"speaking"}"#)
+    }
+
+    // MARK: - Response helpers
+
+    /// Sends a JSON response with CORS headers, then closes the connection.
+    private func sendResponse(_ connection: NWConnection, status: Int, body: String) {
+        let statusText: String
+        switch status {
+        case 200: statusText = "OK"
+        case 400: statusText = "Bad Request"
+        case 404: statusText = "Not Found"
+        case 405: statusText = "Method Not Allowed"
+        case 413: statusText = "Payload Too Large"
+        default:  statusText = "Error"
+        }
+
+        let lines: [String] = ["HTTP/1.1 \(status) \(statusText)", "Content-Type: application/json"]
+            + Self.corsHeaders
+            + ["Content-Length: \(body.utf8.count)", "Connection: close", "", body]
+        transmit(lines.joined(separator: "\r\n"), on: connection)
+    }
+
+    /// Answers a CORS preflight with an empty 204, then closes the connection.
+    private func sendCORSPreflight(_ connection: NWConnection) {
+        let lines: [String] = ["HTTP/1.1 204 No Content"]
+            + Self.corsHeaders
+            + ["Content-Length: 0", "Connection: close", "", ""]
+        transmit(lines.joined(separator: "\r\n"), on: connection)
+    }
+
+    /// Writes `message`, then cancels the connection once the send has been processed.
+    private func transmit(_ message: String, on connection: NWConnection) {
+        connection.send(content: Data(message.utf8), completion: .contentProcessed { _ in
+            connection.cancel()
+        })
+    }
+}
diff --git a/macos-native/Sources/Core/GeminiService.swift b/macos-native/Sources/Core/GeminiService.swift
@@ -29,6 +29,18 @@ final class GeminiService {
         return try await generateContent(prompt: prompt, apiKey: apiKey, temperature: 0.4, maxTokens: 4096)
     }
 
+    // MARK: - Feedback mode
+
+    /// Fills `feedbackPrompt` with `text` and asks Gemini for a reply; returns nil when no API key is configured.
+    func feedback(text: String) async throws -> String? {
+        guard let key = ConfigStore.shared.geminiAPIKey else {
+            print("[Gemini] No API key configured")
+            return nil
+        }
+        let filledPrompt = Self.feedbackPrompt.replacingOccurrences(of: "{text}", with: text)
+        return try await generateContent(prompt: filledPrompt, apiKey: key, temperature: 0.5, maxTokens: 256)
+    }
+
     // MARK: - REST API call
 
     private func generateContent(prompt: String, apiKey: String, temperature: Double, maxTokens: Int) async throws -> String? {
@@ -88,6 +100,23 @@ final class GeminiService {
     Refined output:
     """
 
+    static let feedbackPrompt = """
+    You are JARVIS, a calm and concise AI assistant. The user has spoken to you and needs a brief verbal response.
+
+    Rules:
+    - Respond in 1-3 short sentences MAX. This will be spoken aloud via TTS.
+    - Be direct and helpful. No filler, no "I think", no hedging.
+    - Use natural spoken English — contractions, simple words. No markdown, no bullet points.
+    - If they asked a question, answer it. If they described something, give concise feedback.
+    - Address the user as "sir" occasionally but not every sentence.
+    - Sound like a knowledgeable, confident AI assistant.
+
+    User said:
+    {text}
+
+    Your spoken response:
+    """ // Prompt template for feedback(text:); the "{text}" placeholder is replaced with the transcription before sending.
+
     static let planPrompt = """
     You are a senior software architect. Transform a rambling voice description into a concise implementation plan.
 

diff --git a/macos-native/Sources/Core/HotkeyManager.swift b/macos-native/Sources/Core/HotkeyManager.swift
@@ -33,6 +33,8 @@ final class HotkeyManager {
     private let hotkeys: [HotkeyDef] = [
         // cmd+alt+p (key code 35 = 'p')
         HotkeyDef(modifiers: [.maskCommand, .maskAlternate], keyCode: 35, mode: "plan"),
+        // cmd+shift+f (key code 3 = 'f')
+        HotkeyDef(modifiers: [.maskCommand, .maskShift], keyCode: 3, mode: "feedback"),
         // cmd+alt+shift (modifiers only) — must be before alt+shift (more specific)
         HotkeyDef(modifiers: [.maskCommand, .maskAlternate, .maskShift], keyCode: nil, mode: "greppy"),
         // alt+shift (modifiers only)
@@ -76,6 +78,7 @@ final class HotkeyManager {
         print("  [alt+shift]     = cleanup")
         print("  [cmd+alt+shift] = greppy")
         print("  [cmd+alt+p]     = plan")
+        print("  [cmd+shift+f]   = feedback")
     }
 
     func stop() {

diff --git a/macos-native/Sources/Core/TranscriptionPipeline.swift b/macos-native/Sources/Core/TranscriptionPipeline.swift
@@ -10,6 +10,7 @@ final class TranscriptionPipeline {
     private var geminiService: GeminiService?
     private var greppyService: GreppyService?
     private var waveformController: WaveformOverlayController?
+    private var apiServer: ApiServer?
 
     private var isRecording = false
     private var currentMode: String?
@@ -32,6 +33,10 @@ final class TranscriptionPipeline {
         }
 
         hotkeyManager?.start()
+
+        apiServer = ApiServer()
+        apiServer?.start()
+
         print("[Pipeline] Started — hold hotkey to record")
     }
 
@@ -41,6 +46,8 @@ final class TranscriptionPipeline {
             _ = recorder?.stop()
         }
         waveformController?.hide()
+        TtsService.shared.stop()
+        apiServer?.stop()
     }
 
     private func startRecording(mode: String) {
@@ -90,6 +97,7 @@ final class TranscriptionPipeline {
 
                 // 2. Process based on mode
                 var output = text
+                var fileCount = 0
                 switch mode {
                 case "cleanup":
                     if let refined = try await geminiService?.cleanup(text: text) {
@@ -103,7 +111,15 @@ final class TranscriptionPipeline {
                     let context = await greppyService?.search(query: text) ?? ""
                     if !context.isEmpty {
                         output = text + "\n\n" + context
+                        fileCount = context.components(separatedBy: "### ").count - 1
                     }
+                case "feedback":
+                    if let feedback = try await geminiService?.feedback(text: text) {
+                        TtsService.shared.speak(feedback)
+                    } else {
+                        TtsService.shared.speak("I couldn't generate feedback, sir")
+                    }
+                    // output stays as original text for paste
                 default:
                     break // transcribe mode: use raw text
                 }
@@ -119,8 +135,14 @@ final class TranscriptionPipeline {
                 PasteService.pasteAtCursor(output)
                 print("[Pipeline] Pasted at cursor.")
 
+                // 5. Speak status report
+                let status = TtsService.generateStatusMessage(
+                    mode: mode, text: text, output: output, fileCount: fileCount)
+                TtsService.shared.speak(status)
+
             } catch {
                 print("[Pipeline] Error: \(error)")
+                TtsService.shared.speak("Processing failed")
             }
         }
     }