From 2494b025e40f37250397b40e83a403f09e153f97 Mon Sep 17 00:00:00 2001 From: Nathanael Date: Sun, 22 Mar 2026 14:26:26 +0000 Subject: [PATCH 01/10] docs: add hand gesture rewind design spec Co-Authored-By: Claude Opus 4.6 (1M context) --- .../2026-03-22-hand-gesture-rewind-design.md | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 docs/superpowers/specs/2026-03-22-hand-gesture-rewind-design.md diff --git a/docs/superpowers/specs/2026-03-22-hand-gesture-rewind-design.md b/docs/superpowers/specs/2026-03-22-hand-gesture-rewind-design.md new file mode 100644 index 0000000..1a67653 --- /dev/null +++ b/docs/superpowers/specs/2026-03-22-hand-gesture-rewind-design.md @@ -0,0 +1,84 @@ +# Hand Gesture Rewind — Design Spec + +## Problem + +When presenting with Textream, the speaker sometimes needs to go back a few sentences in the script — to re-read a section, correct a mistake, or recover after going off-script. Currently there is no hands-free way to rewind. The speaker would have to walk to the laptop and tap the screen. + +## Solution + +Use the Mac's front-facing camera and Apple's Vision framework to detect a raised hand. While the hand is raised, the script rewinds continuously. Hand height controls rewind speed — higher hand = faster rewind. Lowering the hand resumes normal operation. + +## Hand Detection Pipeline + +A new `HandGestureController` class owns camera capture and Vision processing: + +1. **AVCaptureSession** captures frames from the default front-facing camera at low resolution (~640x480) and low frame rate (~15fps) +2. Each frame is processed by **VNDetectHumanHandPoseRequest** which returns hand landmark coordinates +3. The **wrist Y-position** (0.0 = bottom of frame, 1.0 = top in Vision coordinates) is extracted and smoothed with a rolling average of the last 3-4 frames to reduce jitter +4. A **raise threshold** (wrist Y > 0.6) determines whether the hand is raised. 
Below this, the hand is in the speaker's lap or at their side and is ignored +5. The controller publishes two observable values: + - `isHandRaised: Bool` + - `handHeight: Float` (0.0 = just above threshold, 1.0 = top of frame) + +### Lifecycle + +- Camera starts when a reading session begins (`SpeechRecognizer.start()`) +- Camera stops when the reading session ends (`SpeechRecognizer.stop()`) +- Camera is NOT running when the app is idle + +### Camera Selection + +Uses the default front-facing camera. No settings UI for camera selection in this iteration. + +## Rewind Behavior + +### When hand is raised (`isHandRaised` becomes true): + +1. **Pause speech recognition** — stop the audio engine and recognition task without setting `isListening = false` (we intend to resume) +2. **Start a rewind timer** — a `Timer` firing every 0.25 seconds +3. Each tick moves `recognizedCharCount` backward by N words (finding previous space characters in `sourceText`). The `handHeight` value controls speed: + - Low hand (0.0–0.3): 1 word per tick (~4 words/sec) + - Mid hand (0.3–0.7): 2 words per tick (~8 words/sec) + - High hand (0.7–1.0): 4 words per tick (~16 words/sec) +4. `recognizedCharCount` is clamped to never go below 0 + +### When hand is lowered (`isHandRaised` becomes false): + +Behavior depends on the current `ListeningMode`: + +- **wordTracking:** Immediately set `matchStartOffset = recognizedCharCount` and call `beginRecognition()` to resume speech tracking from the new position +- **classic / silencePaused:** Wait 1.5 seconds before resuming auto-scroll from the new position + +### Visual Feedback + +MarqueeTextView already observes `recognizedCharCount` and animates scroll position — the rewind will visually scroll backward with no additional UI work needed. 
+ +## File Organization + +### New File + +| File | Responsibility | +|------|---------------| +| `HandGestureController.swift` | AVCaptureSession setup, VNDetectHumanHandPoseRequest processing, wrist position smoothing, publishes `isHandRaised` and `handHeight` | + +### Modified Files + +| File | Change | +|------|--------| +| `SpeechRecognizer.swift` | Add `HandGestureController` property, start/stop camera with reading session, rewind logic on hand raise, resume logic on hand lower | +| `Info.plist` | Add `NSCameraUsageDescription` for camera permission prompt | + +### Unchanged + +MarqueeTextView, NotchSettings, ContentView, SettingsView, BrowserServer, ExternalDisplayController — everything downstream of `recognizedCharCount` is untouched. + +## Privacy + +The app needs `NSCameraUsageDescription` in `Info.plist`. macOS will prompt for camera permission on first use. The camera feed is processed locally — no frames leave the device. + +## Supported Listening Modes + +The gesture rewind works in all three listening modes: +- **Word Tracking** (speech recognition) +- **Classic** (constant auto-scroll) +- **Voice-Activated** (silence-paused auto-scroll) From c874416371ee2fe5901e08873e952ebe5beac03b Mon Sep 17 00:00:00 2001 From: Nathanael Date: Sun, 22 Mar 2026 14:28:36 +0000 Subject: [PATCH 02/10] docs: revise hand gesture rewind spec for dual scroll-state architecture Address mode-specific rewind mechanisms: recognizedCharCount for wordTracking, timerWordProgress for classic/silencePaused. Add edge cases for camera permission, multiple hands, no camera hardware. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../2026-03-22-hand-gesture-rewind-design.md | 63 ++++++++++++------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/docs/superpowers/specs/2026-03-22-hand-gesture-rewind-design.md b/docs/superpowers/specs/2026-03-22-hand-gesture-rewind-design.md index 1a67653..bd02229 100644 --- a/docs/superpowers/specs/2026-03-22-hand-gesture-rewind-design.md +++ b/docs/superpowers/specs/2026-03-22-hand-gesture-rewind-design.md @@ -16,42 +16,60 @@ A new `HandGestureController` class owns camera capture and Vision processing: 2. Each frame is processed by **VNDetectHumanHandPoseRequest** which returns hand landmark coordinates 3. The **wrist Y-position** (0.0 = bottom of frame, 1.0 = top in Vision coordinates) is extracted and smoothed with a rolling average of the last 3-4 frames to reduce jitter 4. A **raise threshold** (wrist Y > 0.6) determines whether the hand is raised. Below this, the hand is in the speaker's lap or at their side and is ignored -5. The controller publishes two observable values: +5. If multiple hands are detected, use the one with the highest wrist Y-position +6. The controller publishes two observable values: - `isHandRaised: Bool` - `handHeight: Float` (0.0 = just above threshold, 1.0 = top of frame) ### Lifecycle -- Camera starts when a reading session begins (`SpeechRecognizer.start()`) -- Camera stops when the reading session ends (`SpeechRecognizer.stop()`) +- Camera starts when a reading session begins — triggered by the overlay controller's `startReading()` flow, not tied to `SpeechRecognizer.start()` (since classic mode never calls it) +- Camera stops when the reading session ends - Camera is NOT running when the app is idle +- If no camera is available (Mac Mini, Mac Pro, external displays without camera), the gesture feature is silently disabled ### Camera Selection Uses the default front-facing camera. No settings UI for camera selection in this iteration. 
+### Camera Permission + +If camera access is denied, the gesture feature is silently unavailable — no error is shown, no functionality is blocked. The rest of the app works normally. The `NSCameraUsageDescription` key in `Info.plist` provides the permission prompt text. + ## Rewind Behavior +The scroll state lives in different places depending on the listening mode: +- **wordTracking:** `recognizedCharCount` on `SpeechRecognizer` +- **classic / silencePaused:** `timerWordProgress` on the overlay controller + +The `HandGestureController` publishes `isHandRaised` and `handHeight`. The overlay controller observes these and dispatches rewind to the appropriate state. + ### When hand is raised (`isHandRaised` becomes true): -1. **Pause speech recognition** — stop the audio engine and recognition task without setting `isListening = false` (we intend to resume) -2. **Start a rewind timer** — a `Timer` firing every 0.25 seconds -3. Each tick moves `recognizedCharCount` backward by N words (finding previous space characters in `sourceText`). The `handHeight` value controls speed: - - Low hand (0.0–0.3): 1 word per tick (~4 words/sec) - - Mid hand (0.3–0.7): 2 words per tick (~8 words/sec) - - High hand (0.7–1.0): 4 words per tick (~16 words/sec) -4. `recognizedCharCount` is clamped to never go below 0 +**In wordTracking mode:** +1. Pause speech recognition — call a new `pauseForRewind()` method on `SpeechRecognizer` that stops the audio engine and recognition task without setting `isListening = false` +2. Start a rewind timer (every 0.25 seconds) that calls a new `rewindByWords(_ count: Int)` method on `SpeechRecognizer`, which moves `recognizedCharCount` backward by N words (finding previous space characters in `sourceText`) and updates `matchStartOffset` to match + +**In classic / silencePaused mode:** +1. Pause the scroll timer +2. 
Start a rewind timer (every 0.25 seconds) that decrements `timerWordProgress` by N words + +**Speed (all modes):** The `handHeight` value controls how many words per tick: +- Low hand (0.0–0.3): 1 word per tick (~4 words/sec) +- Mid hand (0.3–0.7): 2 words per tick (~8 words/sec) +- High hand (0.7–1.0): 4 words per tick (~16 words/sec) + +Position is clamped to never go below 0. ### When hand is lowered (`isHandRaised` becomes false): -Behavior depends on the current `ListeningMode`: +**In wordTracking mode:** Call a new `resumeAfterRewind()` method on `SpeechRecognizer` that sets `matchStartOffset = recognizedCharCount` and calls `beginRecognition()` to resume speech tracking from the new position. -- **wordTracking:** Immediately set `matchStartOffset = recognizedCharCount` and call `beginRecognition()` to resume speech tracking from the new position -- **classic / silencePaused:** Wait 1.5 seconds before resuming auto-scroll from the new position +**In classic / silencePaused mode:** Wait 1.5 seconds, then resume the scroll timer from the current `timerWordProgress` position. ### Visual Feedback -MarqueeTextView already observes `recognizedCharCount` and animates scroll position — the rewind will visually scroll backward with no additional UI work needed. +In wordTracking mode, MarqueeTextView observes `recognizedCharCount` — rewind scrolls backward automatically. In classic/silencePaused modes, the view observes `timerWordProgress` — same effect. No new UI components needed. 
## File Organization @@ -65,20 +83,17 @@ MarqueeTextView already observes `recognizedCharCount` and animates scroll posit | File | Change | |------|--------| -| `SpeechRecognizer.swift` | Add `HandGestureController` property, start/stop camera with reading session, rewind logic on hand raise, resume logic on hand lower | -| `Info.plist` | Add `NSCameraUsageDescription` for camera permission prompt | +| `SpeechRecognizer.swift` | Add `pauseForRewind()`, `rewindByWords(_:)`, and `resumeAfterRewind()` methods | +| `NotchOverlayController.swift` | Create and own `HandGestureController`, observe hand state, dispatch rewind to `SpeechRecognizer` or `timerWordProgress` depending on mode, manage rewind timer | +| `Info.plist` | Add `NSCameraUsageDescription` | ### Unchanged -MarqueeTextView, NotchSettings, ContentView, SettingsView, BrowserServer, ExternalDisplayController — everything downstream of `recognizedCharCount` is untouched. - -## Privacy - -The app needs `NSCameraUsageDescription` in `Info.plist`. macOS will prompt for camera permission on first use. The camera feed is processed locally — no frames leave the device. +MarqueeTextView, NotchSettings, ContentView, SettingsView, BrowserServer, ExternalDisplayController. 
## Supported Listening Modes The gesture rewind works in all three listening modes: -- **Word Tracking** (speech recognition) -- **Classic** (constant auto-scroll) -- **Voice-Activated** (silence-paused auto-scroll) +- **Word Tracking** (speech recognition) — rewinds `recognizedCharCount` +- **Classic** (constant auto-scroll) — rewinds `timerWordProgress` +- **Voice-Activated** (silence-paused auto-scroll) — rewinds `timerWordProgress` From b4653a0419373e4f0219b26b20dd3ef7c70091b1 Mon Sep 17 00:00:00 2001 From: Nathanael Date: Sun, 22 Mar 2026 14:37:33 +0000 Subject: [PATCH 03/10] docs: fix plan review issues in hand gesture rewind plan Fix silencePaused mode grouping, correct method names to show()/dismiss()/forceClose(), add .onDisappear cleanup, fix O(n) string indexing in rewindByWords. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../plans/2026-03-22-hand-gesture-rewind.md | 501 ++++++++++++++++++ 1 file changed, 501 insertions(+) create mode 100644 docs/superpowers/plans/2026-03-22-hand-gesture-rewind.md diff --git a/docs/superpowers/plans/2026-03-22-hand-gesture-rewind.md b/docs/superpowers/plans/2026-03-22-hand-gesture-rewind.md new file mode 100644 index 0000000..db29b4e --- /dev/null +++ b/docs/superpowers/plans/2026-03-22-hand-gesture-rewind.md @@ -0,0 +1,501 @@ +# Hand Gesture Rewind Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add hands-free script rewind via raised hand detection using the Mac's front-facing camera and Apple Vision framework. + +**Architecture:** A new `HandGestureController` owns camera capture and Vision hand pose detection, publishing `isHandRaised` and `handHeight`. 
The overlay views observe these values and dispatch rewind to the appropriate scroll state (`recognizedCharCount` for wordTracking, `timerWordProgress` for classic/silencePaused). New methods on `SpeechRecognizer` handle pause/rewind/resume for wordTracking mode. + +**Tech Stack:** Swift, AVFoundation (camera capture), Vision framework (VNDetectHumanHandPoseRequest) + +**Spec:** `docs/superpowers/specs/2026-03-22-hand-gesture-rewind-design.md` + +**Note:** This project has no test target. All Swift files live in `Textream/Textream/`. The project builds via `xcodebuild` from `Textream/Textream.xcodeproj`. Use `CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY=""` when building. + +**Important — Adding files to Xcode project:** This is a pure Xcode project (no SPM). New `.swift` files are auto-discovered by the build system. Bundle resources and entitlements changes need manual verification. + +--- + +## File Map + +### New Files + +| File | Responsibility | +|------|---------------| +| `Textream/Textream/HandGestureController.swift` | AVCaptureSession setup, VNDetectHumanHandPoseRequest processing, wrist Y smoothing, publishes `isHandRaised` and `handHeight` | + +### Modified Files + +| File | Change | +|------|--------| +| `Textream/Textream/SpeechRecognizer.swift` | Add `pauseForRewind()`, `rewindByWords(_:)`, `resumeAfterRewind()` methods | +| `Textream/Textream/NotchOverlayController.swift` | Create `HandGestureController`, observe hand state in both overlay views, manage rewind timer, dispatch to correct scroll state per mode | +| `Textream/Info.plist` | Add `NSCameraUsageDescription` | +| `Textream/Textream/Textream.entitlements` | Add `com.apple.security.device.camera` | + +--- + +### Task 1: Camera Permission and Entitlements + +**Files:** +- Modify: `Textream/Info.plist` +- Modify: `Textream/Textream/Textream.entitlements` + +- [ ] **Step 1: Add camera usage description to Info.plist** + +Add the following key/value pair inside the `<dict>` in `Textream/Info.plist`, 
after the existing `NSServices` block: + +```xml +<key>NSCameraUsageDescription</key> +<string>Textream uses the camera to detect hand gestures for hands-free script control.</string> +``` + +- [ ] **Step 2: Add camera entitlement** + +Add the following key/value pair inside the `<dict>` in `Textream/Textream/Textream.entitlements`, after the existing `com.apple.security.device.audio-input` entry: + +```xml +<key>com.apple.security.device.camera</key> +<true/> +``` + +- [ ] **Step 3: Verify build** + +```bash +cd Textream && xcodebuild -scheme Textream -configuration Debug CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY="" build 2>&1 | tail -5 +``` + +Expected: BUILD SUCCEEDED + +- [ ] **Step 4: Commit** + +```bash +git add Textream/Info.plist Textream/Textream/Textream.entitlements +git commit -m "feat: add camera permission and entitlement for hand gesture rewind" +``` + +--- + +### Task 2: HandGestureController + +**Files:** +- Create: `Textream/Textream/HandGestureController.swift` + +This is the core camera + Vision processing class. It owns the capture session, runs hand pose detection on each frame, smooths the wrist position, and publishes state. + +- [ ] **Step 1: Implement HandGestureController** + +```swift +import AVFoundation +import Vision +import AppKit + +@Observable +class HandGestureController: NSObject { + var isHandRaised: Bool = false + var handHeight: Float = 0.0 // 0.0 = just above threshold, 1.0 = top of frame + + private var captureSession: AVCaptureSession? 
+ private let videoOutput = AVCaptureVideoDataOutput() + private let processingQueue = DispatchQueue(label: "com.textream.handgesture", qos: .userInteractive) + private let handPoseRequest = VNDetectHumanHandPoseRequest() + + private let raiseThreshold: Float = 0.6 // wrist Y must exceed this to count as raised + private var recentWristY: [Float] = [] // rolling buffer for smoothing + private let smoothingWindow = 4 + + private var isRunning = false + + override init() { + super.init() + handPoseRequest.maximumHandCount = 2 + } + + func start() { + guard !isRunning else { return } + + // Check camera permission + switch AVCaptureDevice.authorizationStatus(for: .video) { + case .authorized: + setupAndStart() + case .notDetermined: + AVCaptureDevice.requestAccess(for: .video) { [weak self] granted in + if granted { + DispatchQueue.main.async { self?.setupAndStart() } + } + } + default: + // Permission denied or restricted — silently disable + return + } + } + + func stop() { + guard isRunning else { return } + captureSession?.stopRunning() + isRunning = false + isHandRaised = false + handHeight = 0.0 + recentWristY = [] + } + + private func setupAndStart() { + let session = AVCaptureSession() + session.sessionPreset = .low // ~640x480, minimal resource usage + + // Find front-facing camera + guard let camera = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .front) + ?? AVCaptureDevice.default(for: .video) else { + // No camera available — silently disable + return + } + + guard let input = try? 
AVCaptureDeviceInput(device: camera) else { return } + guard session.canAddInput(input) else { return } + session.addInput(input) + + videoOutput.setSampleBufferDelegate(self, queue: processingQueue) + videoOutput.alwaysDiscardsLateVideoFrames = true + guard session.canAddOutput(videoOutput) else { return } + session.addOutput(videoOutput) + + // Limit frame rate to ~15fps to save CPU + if let connection = videoOutput.connection(with: .video) { + connection.isEnabled = true + } + try? camera.lockForConfiguration() + camera.activeVideoMinFrameDuration = CMTime(value: 1, timescale: 15) + camera.activeVideoMaxFrameDuration = CMTime(value: 1, timescale: 15) + camera.unlockForConfiguration() + + captureSession = session + + processingQueue.async { + session.startRunning() + } + isRunning = true + } +} + +extension HandGestureController: AVCaptureVideoDataOutputSampleBufferDelegate { + func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return } + + let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .up, options: [:]) + try? handler.perform([handPoseRequest]) + + guard let results = handPoseRequest.results, !results.isEmpty else { + DispatchQueue.main.async { + self.updateWristPosition(nil) + } + return + } + + // Find the hand with the highest wrist Y + var highestWristY: Float = 0 + for hand in results { + if let wrist = try? hand.recognizedPoint(.wrist), + wrist.confidence > 0.3 { + let y = Float(wrist.location.y) // Vision coords: 0=bottom, 1=top + if y > highestWristY { + highestWristY = y + } + } + } + + DispatchQueue.main.async { + self.updateWristPosition(highestWristY > 0 ? highestWristY : nil) + } + } + + private func updateWristPosition(_ wristY: Float?) 
{ + guard let y = wristY else { + // No hand detected — decay smoothly + recentWristY = [] + isHandRaised = false + handHeight = 0.0 + return + } + + // Smooth with rolling average + recentWristY.append(y) + if recentWristY.count > smoothingWindow { + recentWristY.removeFirst() + } + let smoothed = recentWristY.reduce(0, +) / Float(recentWristY.count) + + if smoothed > raiseThreshold { + isHandRaised = true + // Map threshold..1.0 → 0.0..1.0 + handHeight = min(1.0, (smoothed - raiseThreshold) / (1.0 - raiseThreshold)) + } else { + isHandRaised = false + handHeight = 0.0 + } + } +} +``` + +- [ ] **Step 2: Verify build** + +```bash +cd Textream && xcodebuild -scheme Textream -configuration Debug CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY="" build 2>&1 | tail -5 +``` + +Expected: BUILD SUCCEEDED + +- [ ] **Step 3: Commit** + +```bash +git add Textream/Textream/HandGestureController.swift +git commit -m "feat: add HandGestureController with Vision hand pose detection" +``` + +--- + +### Task 3: SpeechRecognizer Rewind Methods + +**Files:** +- Modify: `Textream/Textream/SpeechRecognizer.swift` + +Add three new public methods that the overlay views will call during hand gesture rewind. These methods encapsulate access to the private `matchStartOffset`, `sourceText`, `cleanupRecognition()`, and `beginRecognition()`. + +- [ ] **Step 1: Add pauseForRewind()** + +Add the following method after `resume()` (after line 210 in SpeechRecognizer.swift): + +```swift + /// Pause speech recognition for gesture rewind without changing isListening state. + func pauseForRewind() { + cleanupRecognition() + } +``` + +- [ ] **Step 2: Add rewindByWords(\_:)** + +Add directly after `pauseForRewind()`: + +```swift + /// Move recognizedCharCount backward by N words. Used during gesture rewind. 
+ func rewindByWords(_ count: Int) { + // Work with the string as an array for O(1) indexing + let chars = Array(sourceText) + var remaining = count + var offset = recognizedCharCount + + while remaining > 0 && offset > 0 { + // Skip any spaces at current position + while offset > 0 && chars[offset - 1] == " " { + offset -= 1 + } + // Skip to start of current word + while offset > 0 && chars[offset - 1] != " " { + offset -= 1 + } + remaining -= 1 + } + + recognizedCharCount = max(0, offset) + matchStartOffset = recognizedCharCount + } +``` + +- [ ] **Step 3: Add resumeAfterRewind()** + +Add directly after `rewindByWords(_:)`: + +```swift + /// Resume speech recognition after gesture rewind from current position. + func resumeAfterRewind() { + matchStartOffset = recognizedCharCount + retryCount = 0 + beginRecognition() + } +``` + +- [ ] **Step 4: Verify build** + +```bash +cd Textream && xcodebuild -scheme Textream -configuration Debug CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY="" build 2>&1 | tail -5 +``` + +Expected: BUILD SUCCEEDED + +- [ ] **Step 5: Commit** + +```bash +git add Textream/Textream/SpeechRecognizer.swift +git commit -m "feat: add pauseForRewind, rewindByWords, resumeAfterRewind to SpeechRecognizer" +``` + +--- + +### Task 4: Integrate HandGestureController into Overlay Views + +**Files:** +- Modify: `Textream/Textream/NotchOverlayController.swift` + +This is the integration task. The `HandGestureController` needs to be: +1. Created and owned by `NotchOverlayController` +2. Started/stopped with reading sessions +3. Observed by both `NotchOverlayView` and `FloatingOverlayView` to drive rewind + +**Key context:** `timerWordProgress` is `@State private` on both `NotchOverlayView` (line 625) and `FloatingOverlayView` (line 1153). The rewind logic for classic/silencePaused must live inside these views since they own the state. The `HandGestureController` is passed to both views and observed via `onChange(of:)`. 
+ +- [ ] **Step 1: Add HandGestureController to NotchOverlayController** + +In the `NotchOverlayController` class (around line 47), add a property: + +```swift +let handGestureController = HandGestureController() +``` + +- [ ] **Step 2: Start/stop camera with reading sessions** + +Find `show(text:hasNextPage:onComplete:)` (line 62) — after the existing `speechRecognizer.start(with:)` call (line 118), add: + +```swift +handGestureController.start() +``` + +Find `updateContent(text:hasNextPage:)` (line 122) — similarly add `handGestureController.start()` after the recognizer start. + +Find `dismiss()` (line 376) and `forceClose()` (line 411) — add to both: + +```swift +handGestureController.stop() +``` + +- [ ] **Step 3: Pass HandGestureController to NotchOverlayView** + +The `NotchOverlayView` needs access to `handGestureController`. Add it as a parameter to the view's init. Find where `NotchOverlayView` is created in `NotchOverlayController` and pass `handGestureController`. + +In `NotchOverlayView`, add a property: + +```swift +var handGesture: HandGestureController +``` + +- [ ] **Step 4: Add rewind logic to NotchOverlayView** + +Add a rewind timer state and handler to `NotchOverlayView`. Add these properties near the other `@State` declarations (around line 625): + +```swift +@State private var rewindTimer: Timer? +@State private var resumeDelay: DispatchWorkItem? 
+``` + +Add a helper to compute words-per-tick from hand height: + +```swift +private func rewindWordsPerTick(handHeight: Float) -> Int { + if handHeight < 0.3 { return 1 } + if handHeight < 0.7 { return 2 } + return 4 +} +``` + +Add `onChange` handlers in the view body (inside the main container, near the existing `onChange` handlers): + +```swift +.onChange(of: handGesture.isHandRaised) { _, raised in + if raised { + // Cancel any pending resume delay + resumeDelay?.cancel() + resumeDelay = nil + + // Pause current mode + switch listeningMode { + case .wordTracking: + speechRecognizer.pauseForRewind() + case .classic: + isPaused = true + case .silencePaused: + speechRecognizer.pauseForRewind() + isPaused = true // also pause the scroll timer + } + + // Start rewind timer + rewindTimer = Timer.scheduledTimer(withTimeInterval: 0.25, repeats: true) { _ in + let words = rewindWordsPerTick(handHeight: handGesture.handHeight) + switch listeningMode { + case .wordTracking: + speechRecognizer.rewindByWords(words) + case .classic, .silencePaused: + timerWordProgress = max(0, timerWordProgress - Double(words)) + } + } + } else { + // Stop rewind timer + rewindTimer?.invalidate() + rewindTimer = nil + + // Resume based on mode + switch listeningMode { + case .wordTracking: + speechRecognizer.resumeAfterRewind() + case .classic: + let work = DispatchWorkItem { isPaused = false } + resumeDelay = work + DispatchQueue.main.asyncAfter(deadline: .now() + 1.5, execute: work) + case .silencePaused: + speechRecognizer.resumeAfterRewind() + let work = DispatchWorkItem { isPaused = false } + resumeDelay = work + DispatchQueue.main.asyncAfter(deadline: .now() + 1.5, execute: work) + } + } +} +.onDisappear { + rewindTimer?.invalidate() + rewindTimer = nil + resumeDelay?.cancel() + resumeDelay = nil +} +``` + +- [ ] **Step 5: Repeat for FloatingOverlayView** + +Apply the same changes to `FloatingOverlayView` (starts around line 1136): +- Add `handGesture: HandGestureController` property 
+- Add `rewindTimer`, `resumeDelay` state +- Add `rewindWordsPerTick` helper +- Add the same `onChange(of: handGesture.isHandRaised)` handler with `.onDisappear` cleanup +- Pass `handGestureController` from where `FloatingOverlayView` is created + +**Finding all view instantiation sites:** Search `NotchOverlayController.swift` for `NotchOverlayView(` and `FloatingOverlayView(` to find every place these views are created. Each call site must pass the `handGestureController`. There are typically 1-2 sites per view (in `showPinned`, `showFollowCursor`, `showFloating`, etc.). + +**Note:** Fullscreen mode uses `ExternalDisplayView` which is not addressed in this plan — gesture rewind in fullscreen is a future enhancement. + +- [ ] **Step 6: Verify build** + +```bash +cd Textream && xcodebuild -scheme Textream -configuration Debug CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY="" build 2>&1 | tail -5 +``` + +Expected: BUILD SUCCEEDED + +- [ ] **Step 7: Build and launch the app** + +```bash +cd Textream && xcodebuild -scheme Textream -configuration Debug CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY="" build 2>&1 | tail -5 +``` + +Then launch the app and test: +1. Load a script and start reading in Word Tracking mode +2. Raise your hand above shoulder height — script should start rewinding +3. Raise hand higher — rewind should speed up +4. Lower hand — speech recognition should resume from new position +5. Switch to Classic mode, start auto-scroll, raise hand — should rewind `timerWordProgress` +6. Lower hand — 1.5s pause, then auto-scroll resumes + +- [ ] **Step 8: Commit** + +```bash +git add Textream/Textream/NotchOverlayController.swift +git commit -m "feat: integrate hand gesture rewind into overlay views + +Start/stop camera with reading sessions. Both NotchOverlayView +and FloatingOverlayView observe HandGestureController to drive +rewind in all three listening modes." 
+``` From d7c77586b49c263cebe6f29accfc2120d38bc423 Mon Sep 17 00:00:00 2001 From: Nathanael Date: Sun, 22 Mar 2026 14:41:43 +0000 Subject: [PATCH 04/10] feat: add camera permission and entitlement for hand gesture rewind Co-Authored-By: Claude Sonnet 4.6 --- Textream/Info.plist | 2 ++ Textream/Textream/Textream.entitlements | 2 ++ 2 files changed, 4 insertions(+) diff --git a/Textream/Info.plist b/Textream/Info.plist index 41c3c5c..ece44f0 100644 --- a/Textream/Info.plist +++ b/Textream/Info.plist @@ -69,5 +69,7 @@ + NSCameraUsageDescription + Textream uses the camera to detect hand gestures for hands-free script control. diff --git a/Textream/Textream/Textream.entitlements b/Textream/Textream/Textream.entitlements index e2726b6..7c2b752 100644 --- a/Textream/Textream/Textream.entitlements +++ b/Textream/Textream/Textream.entitlements @@ -6,6 +6,8 @@ com.apple.security.device.audio-input + com.apple.security.device.camera + com.apple.security.files.user-selected.read-write com.apple.security.network.client From be369e8e60daf4c3f72dea7b5f1be5ac4dd9c8c3 Mon Sep 17 00:00:00 2001 From: Nathanael Date: Sun, 22 Mar 2026 14:42:06 +0000 Subject: [PATCH 05/10] feat: add HandGestureController with Vision hand pose detection Co-Authored-By: Claude Sonnet 4.6 --- Textream/Textream/HandGestureController.swift | 189 ++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 Textream/Textream/HandGestureController.swift diff --git a/Textream/Textream/HandGestureController.swift b/Textream/Textream/HandGestureController.swift new file mode 100644 index 0000000..e680abf --- /dev/null +++ b/Textream/Textream/HandGestureController.swift @@ -0,0 +1,189 @@
+import AVFoundation
+import Vision
+import AppKit
+
+/// Watches the camera for a raised hand and publishes the result for the overlay views.
+///
+/// Frames are analysed with `VNDetectHumanHandPoseRequest` on `processingQueue`; the highest
+/// confident wrist position is smoothed over the last `smoothingWindow` samples and published
+/// on the main queue as `isHandRaised` / `handHeight`.
+@Observable
+class HandGestureController: NSObject {
+    /// `true` while the smoothed wrist position is above `raiseThreshold`.
+    var isHandRaised: Bool = false
+    /// 0.0 = just above threshold, 1.0 = top of frame.
+    var handHeight: Float = 0.0
+
+    private var captureSession: AVCaptureSession?
+    private let videoOutput = AVCaptureVideoDataOutput()
+    /// Serial queue for the blocking `startRunning()` / `stopRunning()` calls, so they stay ordered.
+    private let sessionQueue = DispatchQueue(label: "com.textream.handgesture.session")
+    private let processingQueue = DispatchQueue(label: "com.textream.handgesture", qos: .userInteractive)
+    private let handPoseRequest = VNDetectHumanHandPoseRequest()
+
+    private let raiseThreshold: Float = 0.6 // wrist Y must exceed this to count as raised
+    private var recentWristY: [Float] = [] // rolling buffer for smoothing
+    private let smoothingWindow = 4
+    private let targetFrameRate: CMTimeScale = 15
+
+    private var isRunning = false
+    /// Set by `start()`, cleared by `stop()`, so a late permission callback can honor a `stop()`.
+    private var isStartRequested = false
+
+    override init() {
+        super.init()
+        handPoseRequest.maximumHandCount = 2
+    }
+
+    /// Starts camera capture, requesting camera permission first when it is undetermined.
+    /// Silently does nothing when already running, when access is denied, or when no camera is found.
+    func start() {
+        isStartRequested = true
+        guard !isRunning else { return }
+
+        switch AVCaptureDevice.authorizationStatus(for: .video) {
+        case .authorized:
+            setupAndStart()
+        case .notDetermined:
+            AVCaptureDevice.requestAccess(for: .video) { [weak self] granted in
+                guard granted else { return }
+                DispatchQueue.main.async {
+                    // stop() may have been called while the permission prompt was showing.
+                    guard let self, self.isStartRequested, !self.isRunning else { return }
+                    self.setupAndStart()
+                }
+            }
+        default:
+            // Permission denied or restricted — silently disable
+            return
+        }
+    }
+
+    /// Stops camera capture and resets the published hand state.
+    func stop() {
+        isStartRequested = false
+        guard isRunning else { return }
+        isRunning = false
+        if let session = captureSession {
+            // Same serial queue as startRunning(), so a still-pending start always runs first.
+            sessionQueue.async { session.stopRunning() }
+        }
+        isHandRaised = false
+        handHeight = 0.0
+        recentWristY = []
+    }
+
+    private func setupAndStart() {
+        // Reuse the session from an earlier start(): videoOutput stays attached to it,
+        // so building a second session around the same output would fail.
+        guard let session = captureSession ?? makeConfiguredSession() else { return }
+        captureSession = session
+        isRunning = true
+        sessionQueue.async { session.startRunning() }
+    }
+
+    /// Builds the capture session (camera input + video data output), or returns `nil`
+    /// when no usable camera is available.
+    private func makeConfiguredSession() -> AVCaptureSession? {
+        let session = AVCaptureSession()
+        session.sessionPreset = .low // lowest-quality preset, minimal resource usage
+
+        // Prefer the built-in front camera, fall back to the default video device
+        guard let camera = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .front)
+                ?? AVCaptureDevice.default(for: .video) else {
+            // No camera available — silently disable
+            return nil
+        }
+
+        guard let input = try? AVCaptureDeviceInput(device: camera),
+              session.canAddInput(input) else { return nil }
+        session.addInput(input)
+
+        videoOutput.setSampleBufferDelegate(self, queue: processingQueue)
+        videoOutput.alwaysDiscardsLateVideoFrames = true
+        guard session.canAddOutput(videoOutput) else { return nil }
+        session.addOutput(videoOutput)
+
+        limitFrameRate(of: camera)
+        return session
+    }
+
+    /// Caps capture at `targetFrameRate` fps to save CPU. Skipped when the device cannot be
+    /// locked or its active format does not support that rate, because assigning an
+    /// unsupported frame duration raises an exception.
+    private func limitFrameRate(of camera: AVCaptureDevice) {
+        let rate = Double(targetFrameRate)
+        let isSupported = camera.activeFormat.videoSupportedFrameRateRanges.contains {
+            $0.minFrameRate <= rate && rate <= $0.maxFrameRate
+        }
+        guard isSupported else { return }
+
+        do {
+            try camera.lockForConfiguration()
+        } catch {
+            return
+        }
+        defer { camera.unlockForConfiguration() }
+
+        let frameDuration = CMTime(value: 1, timescale: targetFrameRate)
+        camera.activeVideoMinFrameDuration = frameDuration
+        camera.activeVideoMaxFrameDuration = frameDuration
+    }
+}
+
+extension HandGestureController: AVCaptureVideoDataOutputSampleBufferDelegate {
+    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
+        guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
+
+        let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .up, options: [:])
+        do {
+            try handler.perform([handPoseRequest])
+        } catch {
+            // A failed frame is treated like a frame with no hand in it.
+            DispatchQueue.main.async { [weak self] in self?.updateWristPosition(nil) }
+            return
+        }
+
+        // Highest confident wrist among the detected hands (Vision coords: 0 = bottom, 1 = top)
+        let highestWristY = (handPoseRequest.results ?? [])
+            .compactMap { try? $0.recognizedPoint(.wrist) }
+            .filter { $0.confidence > 0.3 }
+            .map { Float($0.location.y) }
+            .filter { $0 > 0 }
+            .max()
+
+        DispatchQueue.main.async { [weak self] in
+            self?.updateWristPosition(highestWristY)
+        }
+    }
+
+    /// Folds one wrist sample (`nil` = no hand in frame) into the published state. Main queue only.
+    private func updateWristPosition(_ wristY: Float?) {
+        // Frames that were already queued when stop() ran must not switch the state back on.
+        guard isRunning else { return }
+
+        guard let y = wristY else {
+            // No hand detected — reset immediately
+            recentWristY = []
+            isHandRaised = false
+            handHeight = 0.0
+            return
+        }
+
+        // Smooth with rolling average
+        recentWristY.append(y)
+        if recentWristY.count > smoothingWindow {
+            recentWristY.removeFirst()
+        }
+        let smoothed = recentWristY.reduce(0, +) / Float(recentWristY.count)
+
+        if smoothed > raiseThreshold {
+            isHandRaised = true
+            // Map threshold..1.0 → 0.0..1.0
+            handHeight = min(1.0, (smoothed - raiseThreshold) / (1.0 - raiseThreshold))
+        } else {
+            isHandRaised = false
+            handHeight = 0.0
+        }
+    }
+}
From cb15bbac3c8aa00e9c64a1ade284f4efdda8f2ba Mon Sep 17 00:00:00 2001 From: Nathanael Date: Sun, 22 Mar 2026 14:42:24 +0000 Subject: [PATCH 06/10] feat: add pauseForRewind, rewindByWords, resumeAfterRewind to SpeechRecognizer --- Textream/Textream/SpeechRecognizer.swift | 38 ++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/Textream/Textream/SpeechRecognizer.swift b/Textream/Textream/SpeechRecognizer.swift index 0730b1c..fe53dc8 100644 --- a/Textream/Textream/SpeechRecognizer.swift +++ b/Textream/Textream/SpeechRecognizer.swift @@ -209,6 +209,44 @@ class SpeechRecognizer {
         beginRecognition()
     }
 
+    /// Pause speech recognition for gesture rewind without changing isListening state.
+    func pauseForRewind() {
+        cleanupRecognition()
+    }
+
+    /// Move recognizedCharCount backward by N words. Used during gesture rewind.
+    ///
+    /// Words are runs of non-whitespace characters, so line breaks separate words just like
+    /// spaces do. The result never goes below 0, and `matchStartOffset` is kept in sync.
+    func rewindByWords(_ count: Int) {
+        let chars = Array(sourceText)
+        // Clamp so a count past the end of the text cannot index out of bounds
+        var offset = min(max(0, recognizedCharCount), chars.count)
+        var remaining = count
+
+        while remaining > 0 && offset > 0 {
+            // Skip any whitespace at current position
+            while offset > 0 && chars[offset - 1].isWhitespace {
+                offset -= 1
+            }
+            // Skip to start of current word
+            while offset > 0 && !chars[offset - 1].isWhitespace {
+                offset -= 1
+            }
+            remaining -= 1
+        }
+
+        recognizedCharCount = offset
+        matchStartOffset = offset
+    }
+
+    /// Resume speech recognition after gesture rewind from current position.
+ func resumeAfterRewind() { + matchStartOffset = recognizedCharCount + retryCount = 0 + beginRecognition() + } + private func cleanupRecognition() { // Cancel any pending restart to prevent overlapping beginRecognition calls pendingRestart?.cancel() From 8b771ceba19f2b66479b59089515decfec0815eb Mon Sep 17 00:00:00 2001 From: Nathanael Date: Sun, 22 Mar 2026 14:46:12 +0000 Subject: [PATCH 07/10] feat: integrate hand gesture rewind into overlay views Start/stop camera with reading sessions. Both NotchOverlayView and FloatingOverlayView observe HandGestureController to drive rewind in all three listening modes. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../Textream/NotchOverlayController.swift | 131 +++++++++++++++++- 1 file changed, 130 insertions(+), 1 deletion(-) diff --git a/Textream/Textream/NotchOverlayController.swift b/Textream/Textream/NotchOverlayController.swift index 330d202..cb3f124 100644 --- a/Textream/Textream/NotchOverlayController.swift +++ b/Textream/Textream/NotchOverlayController.swift @@ -47,6 +47,7 @@ class OverlayContent { class NotchOverlayController: NSObject { private var panel: NSPanel? let speechRecognizer = SpeechRecognizer() + let handGestureController = HandGestureController() let overlayContent = OverlayContent() var onComplete: (() -> Void)? var onNextPage: (() -> Void)? @@ -117,6 +118,8 @@ class NotchOverlayController: NSObject { if settings.listeningMode != .classic { speechRecognizer.start(with: text) } + + handGestureController.start() } func updateContent(text: String, hasNextPage: Bool) { @@ -136,6 +139,8 @@ class NotchOverlayController: NSObject { if settings.listeningMode != .classic { speechRecognizer.start(with: text) } + + handGestureController.start() } private func screenUnderMouse() -> NSScreen? 
{ @@ -232,7 +237,7 @@ class NotchOverlayController: NSObject { self.frameTracker = tracker self.currentScreenID = screen.displayID - let overlayView = NotchOverlayView(content: overlayContent, speechRecognizer: speechRecognizer, menuBarHeight: menuBarHeight, baseTextHeight: textAreaHeight, maxExtraHeight: maxExtraHeight, frameTracker: tracker) + let overlayView = NotchOverlayView(content: overlayContent, speechRecognizer: speechRecognizer, handGesture: handGestureController, menuBarHeight: menuBarHeight, baseTextHeight: textAreaHeight, maxExtraHeight: maxExtraHeight, frameTracker: tracker) let contentView = NSHostingView(rootView: overlayView) // Start panel at full target size (SwiftUI animates the notch shape inside) @@ -277,6 +282,7 @@ class NotchOverlayController: NSObject { let floatingView = FloatingOverlayView( content: overlayContent, speechRecognizer: speechRecognizer, + handGesture: handGestureController, baseHeight: panelHeight, followingCursor: true ) @@ -345,6 +351,7 @@ class NotchOverlayController: NSObject { let floatingView = FloatingOverlayView( content: overlayContent, speechRecognizer: speechRecognizer, + handGesture: handGestureController, baseHeight: panelHeight ) let contentView = NSHostingView(rootView: floatingView) @@ -377,6 +384,7 @@ class NotchOverlayController: NSObject { // Trigger the shrink animation speechRecognizer.shouldDismiss = true speechRecognizer.forceStop() + handGestureController.stop() // Wait for animation, then remove panel DispatchQueue.main.asyncAfter(deadline: .now() + 0.4) { [weak self] in @@ -415,6 +423,7 @@ class NotchOverlayController: NSObject { removeEscMonitor() cancellables.removeAll() speechRecognizer.forceStop() + handGestureController.stop() speechRecognizer.recognizedCharCount = 0 panel?.orderOut(nil) panel = nil @@ -605,6 +614,7 @@ struct DynamicIslandShape: Shape { struct NotchOverlayView: View { @Bindable var content: OverlayContent @Bindable var speechRecognizer: SpeechRecognizer + var handGesture: 
HandGestureController let menuBarHeight: CGFloat let baseTextHeight: CGFloat let maxExtraHeight: CGFloat @@ -627,6 +637,10 @@ struct NotchOverlayView: View { @State private var isUserScrolling: Bool = false private let scrollTimer = Timer.publish(every: 0.05, on: .main, in: .common).autoconnect() + // Hand-gesture rewind state + @State private var rewindTimer: Timer? + @State private var resumeDelay: DispatchWorkItem? + // Auto next page countdown @State private var countdownRemaining: Int = 0 @State private var countdownTimer: Timer? = nil @@ -642,6 +656,12 @@ struct NotchOverlayView: View { NotchSettings.shared.listeningMode } + private func rewindWordsPerTick(handHeight: Float) -> Int { + if handHeight < 0.3 { return 1 } + if handHeight < 0.7 { return 2 } + return 4 + } + /// Convert fractional word index to char offset using actual word lengths private func charOffsetForWordProgress(_ progress: Double) -> Int { let wholeWord = Int(progress) @@ -798,6 +818,55 @@ struct NotchOverlayView: View { .onChange(of: content.totalCharCount) { _, _ in timerWordProgress = 0 } + .onChange(of: handGesture.isHandRaised) { _, raised in + if raised { + resumeDelay?.cancel() + resumeDelay = nil + + switch listeningMode { + case .wordTracking: + speechRecognizer.pauseForRewind() + case .classic: + isPaused = true + case .silencePaused: + speechRecognizer.pauseForRewind() + isPaused = true + } + + rewindTimer = Timer.scheduledTimer(withTimeInterval: 0.25, repeats: true) { _ in + let words = rewindWordsPerTick(handHeight: handGesture.handHeight) + switch listeningMode { + case .wordTracking: + speechRecognizer.rewindByWords(words) + case .classic, .silencePaused: + timerWordProgress = max(0, timerWordProgress - Double(words)) + } + } + } else { + rewindTimer?.invalidate() + rewindTimer = nil + + switch listeningMode { + case .wordTracking: + speechRecognizer.resumeAfterRewind() + case .classic: + let work = DispatchWorkItem { isPaused = false } + resumeDelay = work + 
DispatchQueue.main.asyncAfter(deadline: .now() + 1.5, execute: work) + case .silencePaused: + speechRecognizer.resumeAfterRewind() + let work = DispatchWorkItem { isPaused = false } + resumeDelay = work + DispatchQueue.main.asyncAfter(deadline: .now() + 1.5, execute: work) + } + } + } + .onDisappear { + rewindTimer?.invalidate() + rewindTimer = nil + resumeDelay?.cancel() + resumeDelay = nil + } } private func updateFrameTracker() { @@ -1136,6 +1205,7 @@ struct GlassEffectView: NSViewRepresentable { struct FloatingOverlayView: View { @Bindable var content: OverlayContent @Bindable var speechRecognizer: SpeechRecognizer + var handGesture: HandGestureController let baseHeight: CGFloat var followingCursor: Bool = false @@ -1155,10 +1225,20 @@ struct FloatingOverlayView: View { @State private var isUserScrolling: Bool = false private let scrollTimer = Timer.publish(every: 0.05, on: .main, in: .common).autoconnect() + // Hand-gesture rewind state + @State private var rewindTimer: Timer? + @State private var resumeDelay: DispatchWorkItem? 
+ private var listeningMode: ListeningMode { NotchSettings.shared.listeningMode } + private func rewindWordsPerTick(handHeight: Float) -> Int { + if handHeight < 0.3 { return 1 } + if handHeight < 0.7 { return 2 } + return 4 + } + /// Convert fractional word index to char offset using actual word lengths private func charOffsetForWordProgress(_ progress: Double) -> Int { let wholeWord = Int(progress) @@ -1292,6 +1372,55 @@ struct FloatingOverlayView: View { .onChange(of: content.totalCharCount) { _, _ in timerWordProgress = 0 } + .onChange(of: handGesture.isHandRaised) { _, raised in + if raised { + resumeDelay?.cancel() + resumeDelay = nil + + switch listeningMode { + case .wordTracking: + speechRecognizer.pauseForRewind() + case .classic: + isPaused = true + case .silencePaused: + speechRecognizer.pauseForRewind() + isPaused = true + } + + rewindTimer = Timer.scheduledTimer(withTimeInterval: 0.25, repeats: true) { _ in + let words = rewindWordsPerTick(handHeight: handGesture.handHeight) + switch listeningMode { + case .wordTracking: + speechRecognizer.rewindByWords(words) + case .classic, .silencePaused: + timerWordProgress = max(0, timerWordProgress - Double(words)) + } + } + } else { + rewindTimer?.invalidate() + rewindTimer = nil + + switch listeningMode { + case .wordTracking: + speechRecognizer.resumeAfterRewind() + case .classic: + let work = DispatchWorkItem { isPaused = false } + resumeDelay = work + DispatchQueue.main.asyncAfter(deadline: .now() + 1.5, execute: work) + case .silencePaused: + speechRecognizer.resumeAfterRewind() + let work = DispatchWorkItem { isPaused = false } + resumeDelay = work + DispatchQueue.main.asyncAfter(deadline: .now() + 1.5, execute: work) + } + } + } + .onDisappear { + rewindTimer?.invalidate() + rewindTimer = nil + resumeDelay?.cancel() + resumeDelay = nil + } } private var floatingPrompterView: some View { From 03777cf85b2cb4901d62c202fec8f1eb64e09b2b Mon Sep 17 00:00:00 2001 From: Nathanael Date: Sun, 22 Mar 2026 14:52:16 
+0000 Subject: [PATCH 08/10] fix: remove frame rate forcing that crashes on DAL cameras AVCaptureDALDevice does not support setActiveVideoMinFrameDuration. The .low session preset already limits resource usage. Co-Authored-By: Claude Opus 4.6 (1M context) --- Textream/Textream/HandGestureController.swift | 9 --------- 1 file changed, 9 deletions(-) diff --git a/Textream/Textream/HandGestureController.swift b/Textream/Textream/HandGestureController.swift index e680abf..9a490fd 100644 --- a/Textream/Textream/HandGestureController.swift +++ b/Textream/Textream/HandGestureController.swift @@ -71,15 +71,6 @@ class HandGestureController: NSObject { guard session.canAddOutput(videoOutput) else { return } session.addOutput(videoOutput) - // Limit frame rate to ~15fps to save CPU - if let connection = videoOutput.connection(with: .video) { - connection.isEnabled = true - } - try? camera.lockForConfiguration() - camera.activeVideoMinFrameDuration = CMTime(value: 1, timescale: 15) - camera.activeVideoMaxFrameDuration = CMTime(value: 1, timescale: 15) - camera.unlockForConfiguration() - captureSession = session processingQueue.async { From de0e85ef0229e3a8ca263ae1df5f229742ef1333 Mon Sep 17 00:00:00 2001 From: Nathanael Date: Sun, 22 Mar 2026 16:49:20 +0000 Subject: [PATCH 09/10] feat: hand gesture rewind with visual indicator Raise hand to rewind script, lower to resume. Green spinning circle appears when rewinding. Uses Vision framework hand pose detection with hysteresis thresholds to prevent flicker. Fixes: threshold tuning, camera DAL crash, indicator z-order, callback-based state change, session cleanup on stop. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- Textream/Textream/HandGestureController.swift | 101 ++++++-- .../Textream/NotchOverlayController.swift | 238 +++++++++--------- Textream/Textream/SpeechRecognizer.swift | 1 + 3 files changed, 206 insertions(+), 134 deletions(-) diff --git a/Textream/Textream/HandGestureController.swift b/Textream/Textream/HandGestureController.swift index 9a490fd..225391d 100644 --- a/Textream/Textream/HandGestureController.swift +++ b/Textream/Textream/HandGestureController.swift @@ -4,77 +4,126 @@ import AppKit @Observable class HandGestureController: NSObject { - var isHandRaised: Bool = false + private static let logFile: FileHandle? = { + let path = "/tmp/textream_hand.log" + FileManager.default.createFile(atPath: path, contents: nil) + return FileHandle(forWritingAtPath: path) + }() + + static func log(_ msg: String) { + let line = "\(Date()): \(msg)\n" + logFile?.seekToEndOfFile() + logFile?.write(line.data(using: .utf8)!) + } + var isHandRaised: Bool = false { + didSet { + if isHandRaised != oldValue { + onHandStateChanged?(isHandRaised, handHeight) + } + } + } var handHeight: Float = 0.0 // 0.0 = just above threshold, 1.0 = top of frame + /// Called on main thread when hand raise state changes. (raised, height) + var onHandStateChanged: ((Bool, Float) -> Void)? + private var captureSession: AVCaptureSession? 
- private let videoOutput = AVCaptureVideoDataOutput() + private var videoOutput = AVCaptureVideoDataOutput() private let processingQueue = DispatchQueue(label: "com.textream.handgesture", qos: .userInteractive) private let handPoseRequest = VNDetectHumanHandPoseRequest() - private let raiseThreshold: Float = 0.6 // wrist Y must exceed this to count as raised + private let raiseThreshold: Float = 0.25 // wrist Y must exceed this to trigger raise + private let lowerThreshold: Float = 0.20 // wrist Y must drop below this to trigger lower (hysteresis) private var recentWristY: [Float] = [] // rolling buffer for smoothing private let smoothingWindow = 4 private var isRunning = false + private var frameCount = 0 override init() { super.init() handPoseRequest.maximumHandCount = 2 + Self.log("[HandGesture] init()") } func start() { - guard !isRunning else { return } + guard !isRunning else { + Self.log("[HandGesture] start() skipped — already running") + return + } - // Check camera permission - switch AVCaptureDevice.authorizationStatus(for: .video) { + let status = AVCaptureDevice.authorizationStatus(for: .video) + Self.log("[HandGesture] start() called, auth status=\(status.rawValue)") + switch status { case .authorized: setupAndStart() case .notDetermined: AVCaptureDevice.requestAccess(for: .video) { [weak self] granted in + Self.log("[HandGesture] camera permission granted=\(granted)") if granted { DispatchQueue.main.async { self?.setupAndStart() } } } default: - // Permission denied or restricted — silently disable + Self.log("[HandGesture] camera permission denied/restricted") return } } func stop() { guard isRunning else { return } + Self.log("[HandGesture] stop()") + // Clear callback first to prevent triggering rewind logic during teardown + let savedCallback = onHandStateChanged + onHandStateChanged = nil + captureSession?.stopRunning() + captureSession = nil // release session so videoOutput can be re-added later isRunning = false isHandRaised = false 
handHeight = 0.0 recentWristY = [] + + // Restore callback for next start + onHandStateChanged = savedCallback } private func setupAndStart() { + Self.log("[HandGesture] setupAndStart()") let session = AVCaptureSession() - session.sessionPreset = .low // ~640x480, minimal resource usage + session.sessionPreset = .low - // Find front-facing camera guard let camera = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .front) ?? AVCaptureDevice.default(for: .video) else { - // No camera available — silently disable + Self.log("[HandGesture] No camera found") return } + Self.log("[HandGesture] Using camera: \(camera.localizedName)") - guard let input = try? AVCaptureDeviceInput(device: camera) else { return } - guard session.canAddInput(input) else { return } + guard let input = try? AVCaptureDeviceInput(device: camera) else { + Self.log("[HandGesture] Failed to create camera input") + return + } + guard session.canAddInput(input) else { + Self.log("[HandGesture] Cannot add input to session") + return + } session.addInput(input) + videoOutput = AVCaptureVideoDataOutput() videoOutput.setSampleBufferDelegate(self, queue: processingQueue) videoOutput.alwaysDiscardsLateVideoFrames = true - guard session.canAddOutput(videoOutput) else { return } + guard session.canAddOutput(videoOutput) else { + Self.log("[HandGesture] Cannot add output to session") + return + } session.addOutput(videoOutput) captureSession = session processingQueue.async { session.startRunning() + Self.log("[HandGesture] session.startRunning() completed") } isRunning = true } @@ -82,6 +131,10 @@ class HandGestureController: NSObject { extension HandGestureController: AVCaptureVideoDataOutputSampleBufferDelegate { func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + frameCount += 1 + if frameCount % 30 == 1 { + Self.log("[HandGesture] frame \(frameCount) received") + } guard let pixelBuffer = 
CMSampleBufferGetImageBuffer(sampleBuffer) else { return } let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .up, options: [:]) @@ -99,7 +152,8 @@ extension HandGestureController: AVCaptureVideoDataOutputSampleBufferDelegate { for hand in results { if let wrist = try? hand.recognizedPoint(.wrist), wrist.confidence > 0.3 { - let y = Float(wrist.location.y) // Vision coords: 0=bottom, 1=top + let y = Float(wrist.location.y) + Self.log("[HandGesture] wrist y=\(String(format: "%.3f", y)) conf=\(String(format: "%.2f", wrist.confidence))") if y > highestWristY { highestWristY = y } @@ -113,27 +167,30 @@ extension HandGestureController: AVCaptureVideoDataOutputSampleBufferDelegate { private func updateWristPosition(_ wristY: Float?) { guard let y = wristY else { - // No hand detected — decay smoothly recentWristY = [] - isHandRaised = false handHeight = 0.0 + isHandRaised = false // set after height so callback has correct height return } - // Smooth with rolling average recentWristY.append(y) if recentWristY.count > smoothingWindow { recentWristY.removeFirst() } let smoothed = recentWristY.reduce(0, +) / Float(recentWristY.count) - if smoothed > raiseThreshold { + // Hysteresis: raise at raiseThreshold, lower at lowerThreshold + if !isHandRaised && smoothed > raiseThreshold { + Self.log("[HandGesture] HAND RAISED (smoothed=\(String(format: "%.3f", smoothed)))") + handHeight = min(1.0, (smoothed - lowerThreshold) / (1.0 - lowerThreshold)) isHandRaised = true - // Map threshold..1.0 → 0.0..1.0 - handHeight = min(1.0, (smoothed - raiseThreshold) / (1.0 - raiseThreshold)) - } else { - isHandRaised = false + } else if isHandRaised && smoothed < lowerThreshold { + Self.log("[HandGesture] HAND LOWERED (smoothed=\(String(format: "%.3f", smoothed)))") handHeight = 0.0 + isHandRaised = false + } else if isHandRaised { + // Update height while raised (for speed control) + handHeight = min(1.0, (smoothed - lowerThreshold) / (1.0 - lowerThreshold)) } } } 
diff --git a/Textream/Textream/NotchOverlayController.swift b/Textream/Textream/NotchOverlayController.swift index cb3f124..cf6c450 100644 --- a/Textream/Textream/NotchOverlayController.swift +++ b/Textream/Textream/NotchOverlayController.swift @@ -48,6 +48,9 @@ class NotchOverlayController: NSObject { private var panel: NSPanel? let speechRecognizer = SpeechRecognizer() let handGestureController = HandGestureController() + private var rewindTimer: Timer? + private var indicatorWindow: NSWindow? + private var indicatorView: NSHostingView? let overlayContent = OverlayContent() var onComplete: (() -> Void)? var onNextPage: (() -> Void)? @@ -119,9 +122,89 @@ class NotchOverlayController: NSObject { speechRecognizer.start(with: text) } + handGestureController.onHandStateChanged = { [weak self] raised, height in + self?.handleHandGesture(raised: raised, height: height) + } handGestureController.start() } + private func handleHandGesture(raised: Bool, height: Float) { + let settings = NotchSettings.shared + HandGestureController.log("[Controller] handleHandGesture raised=\(raised) height=\(height) mode=\(settings.listeningMode.rawValue)") + + if raised { + showHandIndicator() + + // Pause current mode + speechRecognizer.pauseForRewind() + + // Start rewind timer + rewindTimer?.invalidate() + rewindTimer = Timer.scheduledTimer(withTimeInterval: 0.25, repeats: true) { [weak self] _ in + guard let self else { return } + let h = self.handGestureController.handHeight + let words: Int + if h < 0.3 { words = 1 } + else if h < 0.7 { words = 2 } + else { words = 4 } + + self.speechRecognizer.rewindByWords(words) + } + } else { + HandGestureController.log("[Controller] hiding indicator, window=\(indicatorWindow != nil)") + hideHandIndicator() + + // Stop rewind + rewindTimer?.invalidate() + rewindTimer = nil + + HandGestureController.log("[Controller] calling resumeAfterRewind, isListening=\(speechRecognizer.isListening)") + switch settings.listeningMode { + case .wordTracking: + 
speechRecognizer.resumeAfterRewind() + HandGestureController.log("[Controller] resumeAfterRewind called, isListening=\(speechRecognizer.isListening)") + case .classic, .silencePaused: + DispatchQueue.main.asyncAfter(deadline: .now() + 1.5) { [weak self] in + self?.speechRecognizer.resumeAfterRewind() + } + } + } + } + + private func showHandIndicator() { + guard indicatorWindow == nil else { return } + guard let screen = NSScreen.main ?? NSScreen.screens.first else { return } + + let size: CGFloat = 60 + let margin: CGFloat = 20 + let frame = NSRect( + x: screen.frame.maxX - size - margin, + y: screen.frame.maxY - size - margin - 30, // below menu bar + width: size, + height: size + ) + + let window = NSWindow(contentRect: frame, styleMask: .borderless, backing: .buffered, defer: false) + window.isOpaque = false + window.backgroundColor = .clear + window.level = NSWindow.Level(Int(CGShieldingWindowLevel()) + 1) + window.ignoresMouseEvents = true + window.hasShadow = false + + let hostView = NSHostingView(rootView: HandIndicatorView(isRewinding: true)) + window.contentView = hostView + window.orderFront(nil) + + indicatorWindow = window + indicatorView = hostView + } + + private func hideHandIndicator() { + indicatorWindow?.orderOut(nil) + indicatorWindow = nil + indicatorView = nil + } + func updateContent(text: String, hasNextPage: Bool) { let normalized = splitTextIntoWords(text) @@ -385,6 +468,9 @@ class NotchOverlayController: NSObject { speechRecognizer.shouldDismiss = true speechRecognizer.forceStop() handGestureController.stop() + hideHandIndicator() + rewindTimer?.invalidate() + rewindTimer = nil // Wait for animation, then remove panel DispatchQueue.main.asyncAfter(deadline: .now() + 0.4) { [weak self] in @@ -424,6 +510,9 @@ class NotchOverlayController: NSObject { cancellables.removeAll() speechRecognizer.forceStop() handGestureController.stop() + hideHandIndicator() + rewindTimer?.invalidate() + rewindTimer = nil speechRecognizer.recognizedCharCount = 0 
panel?.orderOut(nil) panel = nil @@ -609,6 +698,43 @@ struct DynamicIslandShape: Shape { } } +// MARK: - Hand Gesture Indicator + +struct HandIndicatorView: View { + let isRewinding: Bool + + @State private var rotation: Double = 0 + + var body: some View { + ZStack { + // Background circle + Circle() + .stroke(Color.white.opacity(0.3), lineWidth: 3) + .frame(width: 44, height: 44) + + // Animated arc + Circle() + .trim(from: 0, to: 0.7) + .stroke(Color.green, style: StrokeStyle(lineWidth: 3, lineCap: .round)) + .frame(width: 44, height: 44) + .rotationEffect(.degrees(rotation)) + + // Rewind icon + Image(systemName: "backward.fill") + .font(.system(size: 16, weight: .bold)) + .foregroundColor(.green) + } + .frame(width: 60, height: 60) + .background(Color.black.opacity(0.6)) + .clipShape(Circle()) + .onAppear { + withAnimation(.linear(duration: 1).repeatForever(autoreverses: false)) { + rotation = -360 + } + } + } +} + // MARK: - Overlay SwiftUI View struct NotchOverlayView: View { @@ -638,8 +764,6 @@ struct NotchOverlayView: View { private let scrollTimer = Timer.publish(every: 0.05, on: .main, in: .common).autoconnect() // Hand-gesture rewind state - @State private var rewindTimer: Timer? - @State private var resumeDelay: DispatchWorkItem? 
// Auto next page countdown @State private var countdownRemaining: Int = 0 @@ -656,11 +780,6 @@ struct NotchOverlayView: View { NotchSettings.shared.listeningMode } - private func rewindWordsPerTick(handHeight: Float) -> Int { - if handHeight < 0.3 { return 1 } - if handHeight < 0.7 { return 2 } - return 4 - } /// Convert fractional word index to char offset using actual word lengths private func charOffsetForWordProgress(_ progress: Double) -> Int { @@ -818,55 +937,6 @@ struct NotchOverlayView: View { .onChange(of: content.totalCharCount) { _, _ in timerWordProgress = 0 } - .onChange(of: handGesture.isHandRaised) { _, raised in - if raised { - resumeDelay?.cancel() - resumeDelay = nil - - switch listeningMode { - case .wordTracking: - speechRecognizer.pauseForRewind() - case .classic: - isPaused = true - case .silencePaused: - speechRecognizer.pauseForRewind() - isPaused = true - } - - rewindTimer = Timer.scheduledTimer(withTimeInterval: 0.25, repeats: true) { _ in - let words = rewindWordsPerTick(handHeight: handGesture.handHeight) - switch listeningMode { - case .wordTracking: - speechRecognizer.rewindByWords(words) - case .classic, .silencePaused: - timerWordProgress = max(0, timerWordProgress - Double(words)) - } - } - } else { - rewindTimer?.invalidate() - rewindTimer = nil - - switch listeningMode { - case .wordTracking: - speechRecognizer.resumeAfterRewind() - case .classic: - let work = DispatchWorkItem { isPaused = false } - resumeDelay = work - DispatchQueue.main.asyncAfter(deadline: .now() + 1.5, execute: work) - case .silencePaused: - speechRecognizer.resumeAfterRewind() - let work = DispatchWorkItem { isPaused = false } - resumeDelay = work - DispatchQueue.main.asyncAfter(deadline: .now() + 1.5, execute: work) - } - } - } - .onDisappear { - rewindTimer?.invalidate() - rewindTimer = nil - resumeDelay?.cancel() - resumeDelay = nil - } } private func updateFrameTracker() { @@ -1226,18 +1296,11 @@ struct FloatingOverlayView: View { private let scrollTimer 
= Timer.publish(every: 0.05, on: .main, in: .common).autoconnect() // Hand-gesture rewind state - @State private var rewindTimer: Timer? - @State private var resumeDelay: DispatchWorkItem? private var listeningMode: ListeningMode { NotchSettings.shared.listeningMode } - private func rewindWordsPerTick(handHeight: Float) -> Int { - if handHeight < 0.3 { return 1 } - if handHeight < 0.7 { return 2 } - return 4 - } /// Convert fractional word index to char offset using actual word lengths private func charOffsetForWordProgress(_ progress: Double) -> Int { @@ -1372,55 +1435,6 @@ struct FloatingOverlayView: View { .onChange(of: content.totalCharCount) { _, _ in timerWordProgress = 0 } - .onChange(of: handGesture.isHandRaised) { _, raised in - if raised { - resumeDelay?.cancel() - resumeDelay = nil - - switch listeningMode { - case .wordTracking: - speechRecognizer.pauseForRewind() - case .classic: - isPaused = true - case .silencePaused: - speechRecognizer.pauseForRewind() - isPaused = true - } - - rewindTimer = Timer.scheduledTimer(withTimeInterval: 0.25, repeats: true) { _ in - let words = rewindWordsPerTick(handHeight: handGesture.handHeight) - switch listeningMode { - case .wordTracking: - speechRecognizer.rewindByWords(words) - case .classic, .silencePaused: - timerWordProgress = max(0, timerWordProgress - Double(words)) - } - } - } else { - rewindTimer?.invalidate() - rewindTimer = nil - - switch listeningMode { - case .wordTracking: - speechRecognizer.resumeAfterRewind() - case .classic: - let work = DispatchWorkItem { isPaused = false } - resumeDelay = work - DispatchQueue.main.asyncAfter(deadline: .now() + 1.5, execute: work) - case .silencePaused: - speechRecognizer.resumeAfterRewind() - let work = DispatchWorkItem { isPaused = false } - resumeDelay = work - DispatchQueue.main.asyncAfter(deadline: .now() + 1.5, execute: work) - } - } - } - .onDisappear { - rewindTimer?.invalidate() - rewindTimer = nil - resumeDelay?.cancel() - resumeDelay = nil - } } private 
var floatingPrompterView: some View { diff --git a/Textream/Textream/SpeechRecognizer.swift b/Textream/Textream/SpeechRecognizer.swift index fe53dc8..9e01414 100644 --- a/Textream/Textream/SpeechRecognizer.swift +++ b/Textream/Textream/SpeechRecognizer.swift @@ -240,6 +240,7 @@ class SpeechRecognizer { func resumeAfterRewind() { matchStartOffset = recognizedCharCount retryCount = 0 + isListening = true beginRecognition() } From 2f3763ebc7d444893cf6d24923be1c89a3f45239 Mon Sep 17 00:00:00 2001 From: Nathanael Date: Sun, 22 Mar 2026 20:12:20 +0000 Subject: [PATCH 10/10] feat: add Hand Gesture Rewind toggle in Guidance settings Adds a discoverable toggle with description explaining the feature: raise hand to pause and rewind, height controls speed, lower to resume. Enabled by default. When disabled, camera is not started and no hand detection runs. Prevents surprise camera access for users who do not want it. Co-Authored-By: Claude Opus 4.6 (1M context) --- Textream/Textream/NotchOverlayController.swift | 12 ++++++++---- Textream/Textream/NotchSettings.swift | 5 +++++ Textream/Textream/SettingsView.swift | 13 +++++++++++++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/Textream/Textream/NotchOverlayController.swift b/Textream/Textream/NotchOverlayController.swift index cf6c450..b292c95 100644 --- a/Textream/Textream/NotchOverlayController.swift +++ b/Textream/Textream/NotchOverlayController.swift @@ -122,10 +122,12 @@ class NotchOverlayController: NSObject { speechRecognizer.start(with: text) } - handGestureController.onHandStateChanged = { [weak self] raised, height in - self?.handleHandGesture(raised: raised, height: height) + if settings.handGestureRewind { + handGestureController.onHandStateChanged = { [weak self] raised, height in + self?.handleHandGesture(raised: raised, height: height) + } + handGestureController.start() } - handGestureController.start() } private func handleHandGesture(raised: Bool, height: Float) { @@ -223,7 +225,9 @@ class 
NotchOverlayController: NSObject { speechRecognizer.start(with: text) } - handGestureController.start() + if NotchSettings.shared.handGestureRewind { + handGestureController.start() + } } private func screenUnderMouse() -> NSScreen? { diff --git a/Textream/Textream/NotchSettings.swift b/Textream/Textream/NotchSettings.swift index 1a02c9d..9b4f7a0 100644 --- a/Textream/Textream/NotchSettings.swift +++ b/Textream/Textream/NotchSettings.swift @@ -407,6 +407,10 @@ class NotchSettings { didSet { UserDefaults.standard.set(selectedMicUID, forKey: "selectedMicUID") } } + var handGestureRewind: Bool { + didSet { UserDefaults.standard.set(handGestureRewind, forKey: "handGestureRewind") } + } + var autoNextPage: Bool { didSet { UserDefaults.standard.set(autoNextPage, forKey: "autoNextPage") } } @@ -483,6 +487,7 @@ class NotchSettings { self.hideFromScreenShare = UserDefaults.standard.object(forKey: "hideFromScreenShare") as? Bool ?? true self.showElapsedTime = UserDefaults.standard.object(forKey: "showElapsedTime") as? Bool ?? true self.selectedMicUID = UserDefaults.standard.string(forKey: "selectedMicUID") ?? "" + self.handGestureRewind = UserDefaults.standard.object(forKey: "handGestureRewind") as? Bool ?? true self.autoNextPage = UserDefaults.standard.object(forKey: "autoNextPage") as? Bool ?? false let savedDelay = UserDefaults.standard.integer(forKey: "autoNextPageDelay") self.autoNextPageDelay = savedDelay > 0 ? savedDelay : 3 diff --git a/Textream/Textream/SettingsView.swift b/Textream/Textream/SettingsView.swift index 581526b..727a3c1 100644 --- a/Textream/Textream/SettingsView.swift +++ b/Textream/Textream/SettingsView.swift @@ -743,6 +743,19 @@ struct SettingsView: View { } } + Divider() + + Toggle(isOn: $settings.handGestureRewind) { + VStack(alignment: .leading, spacing: 2) { + Text("Hand Gesture Rewind") + .font(.system(size: 13, weight: .medium)) + Text("Raise your hand to pause and rewind. The higher you raise, the faster it rewinds. Lower your hand to resume. 
Uses your camera to detect hand position.") + .font(.system(size: 11)) + .foregroundStyle(.secondary) + } + } + .toggleStyle(.checkbox) + Spacer() } .padding(16)