Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
## Unreleased

### Added
- **JARVIS-style TTS status reports** — Neural voice (edge-tts, en-GB-RyanNeural) speaks concise status after each action ("Got it. 12 words captured", "Located 4 files, sir")
- **TTS HTTP API server** — `POST http://127.0.0.1:7865/api/speak` endpoint lets any external tool (Claude Code, scripts, etc.) trigger spoken feedback
- **Feedback hotkey mode** (`Cmd+Shift+F`) — Pastes transcription with TTS endpoint instructions so the receiving LLM can speak back
- **Chunked TTS playback** — Long text splits into rolling chunks of 2-3 sentences; first chunk plays immediately while the rest generate in the background
- **Cross-platform TTS** — edge-tts with ffplay/afplay for headless playback on Windows, macOS, and Linux; SAPI/AVSpeechSynthesizer/espeak-ng fallback when offline
- **TTS config keys** — `tts_enabled`, `tts_voice`, `tts_edge_rate`, `tts_edge_pitch`, `tts_rate`, `tts_volume` in `~/.vibetotext/config.json`
- **Gemini LLM integration** — New `llm.py` module that uses Google Gemini to clean up rambling voice transcriptions into clear prompts and generate structured implementation plans
- **Window state persistence** — History app now remembers its position and size between sessions
- **Startup/stop scripts** — `start-all.sh` and `stop-all.sh` to launch and kill both services in one command
Expand Down
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,20 @@ All implementations share the same SQLite database at `~/.vibetotext/history.db`
- `Cmd+Shift` — **Greppy** mode with semantic code search
- `Alt+Shift` — **Cleanup** mode (AI refines rambling into clear prompts)
- `Cmd+Alt` — **Plan** mode (generates structured implementation plans)
- `Cmd+Shift+F` — **Feedback** mode (pastes transcription with TTS endpoint instructions so any LLM can speak back)

**Fast Local Transcription**
- Whisper.cpp for 2-4x faster transcription than Python Whisper
- Technical vocabulary bias for programming terms
- Auto-paste to cursor

**JARVIS-style TTS Status Reports**
- Neural voice (edge-tts, en-GB-RyanNeural) speaks status after each action
- Chunked playback for long text — the first chunk plays immediately while the rest generates in the background
- HTTP API server at `http://127.0.0.1:7865` enables any external tool to speak via `POST /api/speak`
- Configurable voice, rate, pitch, and volume in `~/.vibetotext/config.json`
- Falls back to platform TTS (SAPI/say/espeak-ng) when offline

## Analytics & Settings

![Analytics Dashboard](docs/analytics.png)
Expand Down
166 changes: 166 additions & 0 deletions macos-native/Sources/Core/ApiServer.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import Foundation
import Network

/// Minimal HTTP/1.1 API server built on `NWListener` — exposes TTS and status
/// endpoints for external tools (e.g. DevGlide MCP voice server).
///
/// Routes:
/// - `POST /api/speak`  — JSON body `{"text": "..."}`; forwards the text to `TtsService.shared.speak`.
/// - `POST /api/stop`   — calls `TtsService.shared.stop()`.
/// - `GET /api/status`  — reports `ConfigStore.shared.ttsEnabled`.
/// - `OPTIONS <any>`    — answers a CORS preflight.
///
/// Each connection serves exactly one request and is then closed.
///
/// NOTE(review): every response carries `Access-Control-Allow-Origin: *`, so any
/// web page open in a local browser can call these endpoints. Confirm that is
/// intended before exposing anything more sensitive than TTS here.
final class ApiServer {
    /// Outcome of inspecting the bytes buffered so far for one request.
    private enum RequestState {
        case incomplete
        case complete
        case tooLarge
    }

    /// Upper bound on a whole request (headers + body), in bytes.
    private static let maxRequestBytes = 65536
    /// Byte sequence separating the HTTP header section from the body.
    private static let headerTerminator = Data("\r\n\r\n".utf8)
    /// CORS headers attached to every response.
    private static let corsHeaders = [
        "Access-Control-Allow-Origin: *",
        "Access-Control-Allow-Methods: GET, POST, OPTIONS",
        "Access-Control-Allow-Headers: Content-Type",
    ]

    private var listener: NWListener?
    let port: UInt16

    /// - Parameter port: TCP port to listen on.
    init(port: UInt16 = 7865) {
        self.port = port
    }

    // MARK: - Lifecycle

    /// Starts listening on the loopback interface.
    ///
    /// Failures are logged, not thrown. Calling `start()` while a listener
    /// already exists is a no-op; call `stop()` first to restart.
    func start() {
        guard listener == nil else { return }
        guard let endpointPort = NWEndpoint.Port(rawValue: port) else {
            print("[API] Failed to start: invalid port \(port)")
            return
        }

        do {
            let params = NWParameters.tcp
            // The API is documented as http://127.0.0.1 only; without this the
            // listener would accept connections on every network interface.
            params.requiredInterfaceType = .loopback

            let newListener = try NWListener(using: params, on: endpointPort)
            newListener.newConnectionHandler = { [weak self] connection in
                guard let self else {
                    connection.cancel()
                    return
                }
                self.handleConnection(connection)
            }

            // Capture the port value rather than `self`: the listener owns this
            // closure and `self` owns the listener, so capturing `self` strongly
            // would create a retain cycle.
            let port = self.port
            newListener.stateUpdateHandler = { state in
                switch state {
                case .ready:
                    print("[API] Server running on http://127.0.0.1:\(port)")
                case .failed(let error):
                    print("[API] Listener failed: \(error)")
                default:
                    break
                }
            }

            newListener.start(queue: .global())
            listener = newListener
        } catch {
            print("[API] Failed to start: \(error)")
        }
    }

    /// Stops accepting connections and releases the listener.
    func stop() {
        listener?.cancel()
        listener = nil
    }

    // MARK: - Connection handling

    /// Starts `connection` and begins buffering its single request.
    private func handleConnection(_ connection: NWConnection) {
        connection.start(queue: .global())
        receiveRequest(on: connection, buffered: Data())
    }

    /// Reads from `connection` until a whole request is buffered, then dispatches it.
    ///
    /// A request may arrive in several TCP segments (many HTTP clients send the
    /// headers and the body separately), so one `receive` call is not enough.
    private func receiveRequest(on connection: NWConnection, buffered: Data) {
        connection.receive(minimumIncompleteLength: 1, maximumLength: Self.maxRequestBytes) { [weak self] data, _, isComplete, error in
            guard let self else {
                connection.cancel()
                return
            }
            if let error {
                print("[API] Receive error: \(error)")
                connection.cancel()
                return
            }

            var buffer = buffered
            if let data {
                buffer.append(data)
            }

            switch Self.requestState(of: buffer) {
            case .tooLarge:
                self.sendResponse(connection, status: 413, body: #"{"error":"request too large"}"#)
            case .complete:
                self.dispatch(buffer, on: connection)
            case .incomplete where isComplete:
                // The peer finished sending before the request was whole:
                // answer with whatever arrived, or just close if nothing did.
                if buffer.isEmpty {
                    connection.cancel()
                } else {
                    self.dispatch(buffer, on: connection)
                }
            case .incomplete:
                self.receiveRequest(on: connection, buffered: buffer)
            }
        }
    }

    /// Classifies `buffer` as a whole request, a partial one, or one over the size limit.
    private static func requestState(of buffer: Data) -> RequestState {
        guard buffer.count <= maxRequestBytes else { return .tooLarge }
        guard let terminator = buffer.range(of: headerTerminator) else { return .incomplete }

        let headerText = String(decoding: buffer[..<terminator.lowerBound], as: UTF8.self)
        let headerLength = buffer.distance(from: buffer.startIndex, to: terminator.upperBound)
        let bodyLength = contentLength(inHeaders: headerText)

        // Compare by subtraction so a huge Content-Length cannot overflow the sum.
        guard bodyLength <= maxRequestBytes - headerLength else { return .tooLarge }
        return buffer.count >= headerLength + bodyLength ? .complete : .incomplete
    }

    /// Returns the declared `Content-Length`, or 0 when the header is absent or invalid.
    private static func contentLength(inHeaders headerText: String) -> Int {
        // The first line is the request line, not a header field.
        for line in headerText.components(separatedBy: "\r\n").dropFirst() {
            let field = line.split(separator: ":", maxSplits: 1)
            guard field.count == 2,
                  field[0].trimmingCharacters(in: .whitespaces).lowercased() == "content-length",
                  let length = Int(field[1].trimmingCharacters(in: .whitespaces)),
                  length >= 0 else {
                continue
            }
            return length
        }
        return 0
    }

    /// Decodes the buffered bytes as UTF-8 and routes the request.
    private func dispatch(_ buffer: Data, on connection: NWConnection) {
        guard let request = String(data: buffer, encoding: .utf8) else {
            sendResponse(connection, status: 400, body: #"{"error":"request is not valid UTF-8"}"#)
            return
        }
        handleHTTP(request: request, connection: connection)
    }

    // MARK: - HTTP routing

    /// Parses the request line and routes to the matching endpoint.
    private func handleHTTP(request: String, connection: NWConnection) {
        let requestLine = request.components(separatedBy: "\r\n").first ?? ""

        let parts = requestLine.split(separator: " ", maxSplits: 2)
        guard parts.count >= 2 else {
            sendResponse(connection, status: 400, body: #"{"error":"malformed request line"}"#)
            return
        }

        let method = String(parts[0])
        // Route on the path alone so a query string does not cause a 404.
        let path = String(parts[1].prefix(while: { $0 != "?" }))

        // Handle CORS preflight
        if method == "OPTIONS" {
            sendCORSPreflight(connection)
            return
        }

        switch (method, path) {
        case ("POST", "/api/speak"):
            handleSpeak(request: request, connection: connection)

        case ("POST", "/api/stop"):
            TtsService.shared.stop()
            sendResponse(connection, status: 200, body: #"{"status":"stopped"}"#)

        case ("GET", "/api/status"):
            sendResponse(connection, status: 200, body: #"{"status":"ok","tts_enabled":\#(ConfigStore.shared.ttsEnabled)}"#)

        // Known path, wrong method: say so instead of pretending the route is missing.
        case (_, "/api/speak"), (_, "/api/stop"):
            sendResponse(connection, status: 405, body: #"{"error":"method not allowed"}"#,
                         extraHeaders: ["Allow: POST, OPTIONS"])

        case (_, "/api/status"):
            sendResponse(connection, status: 405, body: #"{"error":"method not allowed"}"#,
                         extraHeaders: ["Allow: GET, OPTIONS"])

        default:
            sendResponse(connection, status: 404, body: #"{"error":"not found"}"#)
        }
    }

    // MARK: - Endpoint handlers

    /// Handles `POST /api/speak`: requires a JSON object with a non-empty string `text`.
    private func handleSpeak(request: String, connection: NWConnection) {
        // Extract JSON body after the blank line separating headers from body
        guard let bodyRange = request.range(of: "\r\n\r\n") else {
            sendResponse(connection, status: 400, body: #"{"error":"no body"}"#)
            return
        }
        let bodyData = Data(request[bodyRange.upperBound...].utf8)

        guard let json = try? JSONSerialization.jsonObject(with: bodyData) as? [String: Any],
              let text = json["text"] as? String, !text.isEmpty else {
            sendResponse(connection, status: 400, body: #"{"error":"missing or empty \"text\" field"}"#)
            return
        }

        TtsService.shared.speak(text)
        sendResponse(connection, status: 200, body: #"{"status":"speaking"}"#)
    }

    // MARK: - Response helpers

    /// Sends a JSON response and closes the connection.
    ///
    /// - Parameters:
    ///   - connection: Connection to answer.
    ///   - status: HTTP status code.
    ///   - body: JSON text sent as the response body.
    ///   - extraHeaders: Additional, fully formatted header lines (e.g. `Allow: ...`).
    private func sendResponse(_ connection: NWConnection, status: Int, body: String, extraHeaders: [String] = []) {
        let statusText: String
        switch status {
        case 200: statusText = "OK"
        case 400: statusText = "Bad Request"
        case 404: statusText = "Not Found"
        case 405: statusText = "Method Not Allowed"
        case 413: statusText = "Payload Too Large"
        default: statusText = "Error"
        }

        var lines = [
            "HTTP/1.1 \(status) \(statusText)",
            "Content-Type: application/json",
        ]
        lines.append(contentsOf: Self.corsHeaders)
        lines.append(contentsOf: extraHeaders)
        lines.append("Content-Length: \(body.utf8.count)")
        // The socket is closed after every response, so tell HTTP/1.1 clients
        // not to expect a persistent connection.
        lines.append("Connection: close")
        lines.append("")
        lines.append(body)

        send(lines.joined(separator: "\r\n"), on: connection)
    }

    /// Answers a CORS preflight with `204 No Content` and closes the connection.
    private func sendCORSPreflight(_ connection: NWConnection) {
        // A 204 response has no body and must not carry a Content-Length header.
        var lines = ["HTTP/1.1 204 No Content"]
        lines.append(contentsOf: Self.corsHeaders)
        lines.append("Connection: close")
        lines.append("")
        lines.append("")

        send(lines.joined(separator: "\r\n"), on: connection)
    }

    /// Writes `response` to `connection` and cancels it once the data has been processed.
    private func send(_ response: String, on connection: NWConnection) {
        connection.send(content: Data(response.utf8), completion: .contentProcessed { _ in
            connection.cancel()
        })
    }
}
29 changes: 29 additions & 0 deletions macos-native/Sources/Core/GeminiService.swift
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,18 @@ final class GeminiService {
return try await generateContent(prompt: prompt, apiKey: apiKey, temperature: 0.4, maxTokens: 4096)
}

// MARK: - Feedback mode

/// Asks Gemini for a short reply to `text`, for feedback mode.
///
/// - Parameter text: Transcription substituted for the `{text}` placeholder in `feedbackPrompt`.
/// - Returns: Whatever `generateContent` returns, or `nil` (without calling it)
///   when no Gemini API key is configured.
/// - Throws: Any error thrown by `generateContent`.
func feedback(text: String) async throws -> String? {
    if let key = ConfigStore.shared.geminiAPIKey {
        let filledPrompt = Self.feedbackPrompt.replacingOccurrences(of: "{text}", with: text)
        return try await generateContent(
            prompt: filledPrompt,
            apiKey: key,
            temperature: 0.5,
            maxTokens: 256
        )
    }

    print("[Gemini] No API key configured")
    return nil
}

// MARK: - REST API call

private func generateContent(prompt: String, apiKey: String, temperature: Double, maxTokens: Int) async throws -> String? {
Expand Down Expand Up @@ -88,6 +100,23 @@ final class GeminiService {
Refined output:
"""

/// Prompt template for feedback mode. The `{text}` placeholder is replaced
/// with the user's transcription before the prompt is sent.
///
/// The third rule previously contained the mis-encoded byte sequence `β€”`
/// where an em dash was intended; it is restored to `—` so the model does not
/// receive garbled text.
static let feedbackPrompt = """
You are JARVIS, a calm and concise AI assistant. The user has spoken to you and needs a brief verbal response.

Rules:
- Respond in 1-3 short sentences MAX. This will be spoken aloud via TTS.
- Be direct and helpful. No filler, no "I think", no hedging.
- Use natural spoken English — contractions, simple words. No markdown, no bullet points.
- If they asked a question, answer it. If they described something, give concise feedback.
- Address the user as "sir" occasionally but not every sentence.
- Sound like a knowledgeable, confident AI assistant.

User said:
{text}

Your spoken response:
"""

static let planPrompt = """
You are a senior software architect. Transform a rambling voice description into a concise implementation plan.

Expand Down
3 changes: 3 additions & 0 deletions macos-native/Sources/Core/HotkeyManager.swift
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ final class HotkeyManager {
private let hotkeys: [HotkeyDef] = [
// cmd+alt+p (key code 35 = 'p')
HotkeyDef(modifiers: [.maskCommand, .maskAlternate], keyCode: 35, mode: "plan"),
// cmd+shift+f (key code 3 = 'f')
HotkeyDef(modifiers: [.maskCommand, .maskShift], keyCode: 3, mode: "feedback"),
// cmd+alt+shift (modifiers only) — must be before alt+shift (more specific)
HotkeyDef(modifiers: [.maskCommand, .maskAlternate, .maskShift], keyCode: nil, mode: "greppy"),
// alt+shift (modifiers only)
Expand Down Expand Up @@ -76,6 +78,7 @@ final class HotkeyManager {
print(" [alt+shift] = cleanup")
print(" [cmd+alt+shift] = greppy")
print(" [cmd+alt+p] = plan")
print(" [cmd+shift+f] = feedback")
}

func stop() {
Expand Down
22 changes: 22 additions & 0 deletions macos-native/Sources/Core/TranscriptionPipeline.swift
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ final class TranscriptionPipeline {
private var geminiService: GeminiService?
private var greppyService: GreppyService?
private var waveformController: WaveformOverlayController?
private var apiServer: ApiServer?

private var isRecording = false
private var currentMode: String?
Expand All @@ -32,6 +33,10 @@ final class TranscriptionPipeline {
}

hotkeyManager?.start()

apiServer = ApiServer()
apiServer?.start()

print("[Pipeline] Started β€” hold hotkey to record")
}

Expand All @@ -41,6 +46,8 @@ final class TranscriptionPipeline {
_ = recorder?.stop()
}
waveformController?.hide()
TtsService.shared.stop()
apiServer?.stop()
}

private func startRecording(mode: String) {
Expand Down Expand Up @@ -90,6 +97,7 @@ final class TranscriptionPipeline {

// 2. Process based on mode
var output = text
var fileCount = 0
switch mode {
case "cleanup":
if let refined = try await geminiService?.cleanup(text: text) {
Expand All @@ -103,7 +111,15 @@ final class TranscriptionPipeline {
let context = await greppyService?.search(query: text) ?? ""
if !context.isEmpty {
output = text + "\n\n" + context
fileCount = context.components(separatedBy: "### ").count - 1
}
case "feedback":
if let feedback = try await geminiService?.feedback(text: text) {
TtsService.shared.speak(feedback)
} else {
TtsService.shared.speak("I couldn't generate feedback, sir")
}
// output stays as original text for paste
default:
break // transcribe mode: use raw text
}
Expand All @@ -119,8 +135,14 @@ final class TranscriptionPipeline {
PasteService.pasteAtCursor(output)
print("[Pipeline] Pasted at cursor.")

// 5. Speak status report
let status = TtsService.generateStatusMessage(
mode: mode, text: text, output: output, fileCount: fileCount)
TtsService.shared.speak(status)

} catch {
print("[Pipeline] Error: \(error)")
TtsService.shared.speak("Processing failed")
}
}
}
Expand Down
Loading