diff --git a/Textream/Info.plist b/Textream/Info.plist index 41c3c5c..ece44f0 100644 --- a/Textream/Info.plist +++ b/Textream/Info.plist @@ -69,5 +69,7 @@ + NSCameraUsageDescription + Textream uses the camera to detect hand gestures for hands-free script control. diff --git a/Textream/Textream/HandGestureController.swift b/Textream/Textream/HandGestureController.swift new file mode 100644 index 0000000..225391d --- /dev/null +++ b/Textream/Textream/HandGestureController.swift @@ -0,0 +1,196 @@ +import AVFoundation +import Vision +import AppKit + +@Observable +class HandGestureController: NSObject { + private static let logFile: FileHandle? = { + let path = "/tmp/textream_hand.log" + FileManager.default.createFile(atPath: path, contents: nil) + return FileHandle(forWritingAtPath: path) + }() + + static func log(_ msg: String) { + let line = "\(Date()): \(msg)\n" + logFile?.seekToEndOfFile() + logFile?.write(line.data(using: .utf8)!) + } + var isHandRaised: Bool = false { + didSet { + if isHandRaised != oldValue { + onHandStateChanged?(isHandRaised, handHeight) + } + } + } + var handHeight: Float = 0.0 // 0.0 = just above threshold, 1.0 = top of frame + + /// Called on main thread when hand raise state changes. (raised, height) + var onHandStateChanged: ((Bool, Float) -> Void)? + + private var captureSession: AVCaptureSession? 
+ private var videoOutput = AVCaptureVideoDataOutput() + private let processingQueue = DispatchQueue(label: "com.textream.handgesture", qos: .userInteractive) + private let handPoseRequest = VNDetectHumanHandPoseRequest() + + private let raiseThreshold: Float = 0.25 // wrist Y must exceed this to trigger raise + private let lowerThreshold: Float = 0.20 // wrist Y must drop below this to trigger lower (hysteresis) + private var recentWristY: [Float] = [] // rolling buffer for smoothing + private let smoothingWindow = 4 + + private var isRunning = false + private var frameCount = 0 + + override init() { + super.init() + handPoseRequest.maximumHandCount = 2 + Self.log("[HandGesture] init()") + } + + func start() { + guard !isRunning else { + Self.log("[HandGesture] start() skipped — already running") + return + } + + let status = AVCaptureDevice.authorizationStatus(for: .video) + Self.log("[HandGesture] start() called, auth status=\(status.rawValue)") + switch status { + case .authorized: + setupAndStart() + case .notDetermined: + AVCaptureDevice.requestAccess(for: .video) { [weak self] granted in + Self.log("[HandGesture] camera permission granted=\(granted)") + if granted { + DispatchQueue.main.async { self?.setupAndStart() } + } + } + default: + Self.log("[HandGesture] camera permission denied/restricted") + return + } + } + + func stop() { + guard isRunning else { return } + Self.log("[HandGesture] stop()") + // Clear callback first to prevent triggering rewind logic during teardown + let savedCallback = onHandStateChanged + onHandStateChanged = nil + + captureSession?.stopRunning() + captureSession = nil // release session so videoOutput can be re-added later + isRunning = false + isHandRaised = false + handHeight = 0.0 + recentWristY = [] + + // Restore callback for next start + onHandStateChanged = savedCallback + } + + private func setupAndStart() { + Self.log("[HandGesture] setupAndStart()") + let session = AVCaptureSession() + session.sessionPreset = .low + 
+ guard let camera = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .front) + ?? AVCaptureDevice.default(for: .video) else { + Self.log("[HandGesture] No camera found") + return + } + Self.log("[HandGesture] Using camera: \(camera.localizedName)") + + guard let input = try? AVCaptureDeviceInput(device: camera) else { + Self.log("[HandGesture] Failed to create camera input") + return + } + guard session.canAddInput(input) else { + Self.log("[HandGesture] Cannot add input to session") + return + } + session.addInput(input) + + videoOutput = AVCaptureVideoDataOutput() + videoOutput.setSampleBufferDelegate(self, queue: processingQueue) + videoOutput.alwaysDiscardsLateVideoFrames = true + guard session.canAddOutput(videoOutput) else { + Self.log("[HandGesture] Cannot add output to session") + return + } + session.addOutput(videoOutput) + + captureSession = session + + processingQueue.async { + session.startRunning() + Self.log("[HandGesture] session.startRunning() completed") + } + isRunning = true + } +} + +extension HandGestureController: AVCaptureVideoDataOutputSampleBufferDelegate { + func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + frameCount += 1 + if frameCount % 30 == 1 { + Self.log("[HandGesture] frame \(frameCount) received") + } + guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return } + + let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .up, options: [:]) + try? handler.perform([handPoseRequest]) + + guard let results = handPoseRequest.results, !results.isEmpty else { + DispatchQueue.main.async { + self.updateWristPosition(nil) + } + return + } + + // Find the hand with the highest wrist Y + var highestWristY: Float = 0 + for hand in results { + if let wrist = try? 
hand.recognizedPoint(.wrist), + wrist.confidence > 0.3 { + let y = Float(wrist.location.y) + Self.log("[HandGesture] wrist y=\(String(format: "%.3f", y)) conf=\(String(format: "%.2f", wrist.confidence))") + if y > highestWristY { + highestWristY = y + } + } + } + + DispatchQueue.main.async { + self.updateWristPosition(highestWristY > 0 ? highestWristY : nil) + } + } + + private func updateWristPosition(_ wristY: Float?) { + guard let y = wristY else { + recentWristY = [] + handHeight = 0.0 + isHandRaised = false // set after height so callback has correct height + return + } + + recentWristY.append(y) + if recentWristY.count > smoothingWindow { + recentWristY.removeFirst() + } + let smoothed = recentWristY.reduce(0, +) / Float(recentWristY.count) + + // Hysteresis: raise at raiseThreshold, lower at lowerThreshold + if !isHandRaised && smoothed > raiseThreshold { + Self.log("[HandGesture] HAND RAISED (smoothed=\(String(format: "%.3f", smoothed)))") + handHeight = min(1.0, (smoothed - lowerThreshold) / (1.0 - lowerThreshold)) + isHandRaised = true + } else if isHandRaised && smoothed < lowerThreshold { + Self.log("[HandGesture] HAND LOWERED (smoothed=\(String(format: "%.3f", smoothed)))") + handHeight = 0.0 + isHandRaised = false + } else if isHandRaised { + // Update height while raised (for speed control) + handHeight = min(1.0, (smoothed - lowerThreshold) / (1.0 - lowerThreshold)) + } + } +} diff --git a/Textream/Textream/NotchOverlayController.swift b/Textream/Textream/NotchOverlayController.swift index 330d202..b292c95 100644 --- a/Textream/Textream/NotchOverlayController.swift +++ b/Textream/Textream/NotchOverlayController.swift @@ -47,6 +47,10 @@ class OverlayContent { class NotchOverlayController: NSObject { private var panel: NSPanel? let speechRecognizer = SpeechRecognizer() + let handGestureController = HandGestureController() + private var rewindTimer: Timer? + private var indicatorWindow: NSWindow? + private var indicatorView: NSHostingView? 
let overlayContent = OverlayContent() var onComplete: (() -> Void)? var onNextPage: (() -> Void)? @@ -117,6 +121,90 @@ class NotchOverlayController: NSObject { if settings.listeningMode != .classic { speechRecognizer.start(with: text) } + + if settings.handGestureRewind { + handGestureController.onHandStateChanged = { [weak self] raised, height in + self?.handleHandGesture(raised: raised, height: height) + } + handGestureController.start() + } + } + + private func handleHandGesture(raised: Bool, height: Float) { + let settings = NotchSettings.shared + HandGestureController.log("[Controller] handleHandGesture raised=\(raised) height=\(height) mode=\(settings.listeningMode.rawValue)") + + if raised { + showHandIndicator() + + // Pause current mode + speechRecognizer.pauseForRewind() + + // Start rewind timer + rewindTimer?.invalidate() + rewindTimer = Timer.scheduledTimer(withTimeInterval: 0.25, repeats: true) { [weak self] _ in + guard let self else { return } + let h = self.handGestureController.handHeight + let words: Int + if h < 0.3 { words = 1 } + else if h < 0.7 { words = 2 } + else { words = 4 } + + self.speechRecognizer.rewindByWords(words) + } + } else { + HandGestureController.log("[Controller] hiding indicator, window=\(indicatorWindow != nil)") + hideHandIndicator() + + // Stop rewind + rewindTimer?.invalidate() + rewindTimer = nil + + HandGestureController.log("[Controller] calling resumeAfterRewind, isListening=\(speechRecognizer.isListening)") + switch settings.listeningMode { + case .wordTracking: + speechRecognizer.resumeAfterRewind() + HandGestureController.log("[Controller] resumeAfterRewind called, isListening=\(speechRecognizer.isListening)") + case .classic, .silencePaused: + DispatchQueue.main.asyncAfter(deadline: .now() + 1.5) { [weak self] in + self?.speechRecognizer.resumeAfterRewind() + } + } + } + } + + private func showHandIndicator() { + guard indicatorWindow == nil else { return } + guard let screen = NSScreen.main ?? 
NSScreen.screens.first else { return } + + let size: CGFloat = 60 + let margin: CGFloat = 20 + let frame = NSRect( + x: screen.frame.maxX - size - margin, + y: screen.frame.maxY - size - margin - 30, // below menu bar + width: size, + height: size + ) + + let window = NSWindow(contentRect: frame, styleMask: .borderless, backing: .buffered, defer: false) + window.isOpaque = false + window.backgroundColor = .clear + window.level = NSWindow.Level(Int(CGShieldingWindowLevel()) + 1) + window.ignoresMouseEvents = true + window.hasShadow = false + + let hostView = NSHostingView(rootView: HandIndicatorView(isRewinding: true)) + window.contentView = hostView + window.orderFront(nil) + + indicatorWindow = window + indicatorView = hostView + } + + private func hideHandIndicator() { + indicatorWindow?.orderOut(nil) + indicatorWindow = nil + indicatorView = nil } func updateContent(text: String, hasNextPage: Bool) { @@ -136,6 +224,10 @@ class NotchOverlayController: NSObject { if settings.listeningMode != .classic { speechRecognizer.start(with: text) } + + if NotchSettings.shared.handGestureRewind { + handGestureController.start() + } } private func screenUnderMouse() -> NSScreen? 
{ @@ -232,7 +324,7 @@ class NotchOverlayController: NSObject { self.frameTracker = tracker self.currentScreenID = screen.displayID - let overlayView = NotchOverlayView(content: overlayContent, speechRecognizer: speechRecognizer, menuBarHeight: menuBarHeight, baseTextHeight: textAreaHeight, maxExtraHeight: maxExtraHeight, frameTracker: tracker) + let overlayView = NotchOverlayView(content: overlayContent, speechRecognizer: speechRecognizer, handGesture: handGestureController, menuBarHeight: menuBarHeight, baseTextHeight: textAreaHeight, maxExtraHeight: maxExtraHeight, frameTracker: tracker) let contentView = NSHostingView(rootView: overlayView) // Start panel at full target size (SwiftUI animates the notch shape inside) @@ -277,6 +369,7 @@ class NotchOverlayController: NSObject { let floatingView = FloatingOverlayView( content: overlayContent, speechRecognizer: speechRecognizer, + handGesture: handGestureController, baseHeight: panelHeight, followingCursor: true ) @@ -345,6 +438,7 @@ class NotchOverlayController: NSObject { let floatingView = FloatingOverlayView( content: overlayContent, speechRecognizer: speechRecognizer, + handGesture: handGestureController, baseHeight: panelHeight ) let contentView = NSHostingView(rootView: floatingView) @@ -377,6 +471,10 @@ class NotchOverlayController: NSObject { // Trigger the shrink animation speechRecognizer.shouldDismiss = true speechRecognizer.forceStop() + handGestureController.stop() + hideHandIndicator() + rewindTimer?.invalidate() + rewindTimer = nil // Wait for animation, then remove panel DispatchQueue.main.asyncAfter(deadline: .now() + 0.4) { [weak self] in @@ -415,6 +513,10 @@ class NotchOverlayController: NSObject { removeEscMonitor() cancellables.removeAll() speechRecognizer.forceStop() + handGestureController.stop() + hideHandIndicator() + rewindTimer?.invalidate() + rewindTimer = nil speechRecognizer.recognizedCharCount = 0 panel?.orderOut(nil) panel = nil @@ -600,11 +702,49 @@ struct DynamicIslandShape: Shape 
{ } } +// MARK: - Hand Gesture Indicator + +struct HandIndicatorView: View { + let isRewinding: Bool + + @State private var rotation: Double = 0 + + var body: some View { + ZStack { + // Background circle + Circle() + .stroke(Color.white.opacity(0.3), lineWidth: 3) + .frame(width: 44, height: 44) + + // Animated arc + Circle() + .trim(from: 0, to: 0.7) + .stroke(Color.green, style: StrokeStyle(lineWidth: 3, lineCap: .round)) + .frame(width: 44, height: 44) + .rotationEffect(.degrees(rotation)) + + // Rewind icon + Image(systemName: "backward.fill") + .font(.system(size: 16, weight: .bold)) + .foregroundColor(.green) + } + .frame(width: 60, height: 60) + .background(Color.black.opacity(0.6)) + .clipShape(Circle()) + .onAppear { + withAnimation(.linear(duration: 1).repeatForever(autoreverses: false)) { + rotation = -360 + } + } + } +} + // MARK: - Overlay SwiftUI View struct NotchOverlayView: View { @Bindable var content: OverlayContent @Bindable var speechRecognizer: SpeechRecognizer + var handGesture: HandGestureController let menuBarHeight: CGFloat let baseTextHeight: CGFloat let maxExtraHeight: CGFloat @@ -627,6 +767,8 @@ struct NotchOverlayView: View { @State private var isUserScrolling: Bool = false private let scrollTimer = Timer.publish(every: 0.05, on: .main, in: .common).autoconnect() + // Hand-gesture rewind state + // Auto next page countdown @State private var countdownRemaining: Int = 0 @State private var countdownTimer: Timer? 
= nil @@ -642,6 +784,7 @@ struct NotchOverlayView: View { NotchSettings.shared.listeningMode } + /// Convert fractional word index to char offset using actual word lengths private func charOffsetForWordProgress(_ progress: Double) -> Int { let wholeWord = Int(progress) @@ -1136,6 +1279,7 @@ struct GlassEffectView: NSViewRepresentable { struct FloatingOverlayView: View { @Bindable var content: OverlayContent @Bindable var speechRecognizer: SpeechRecognizer + var handGesture: HandGestureController let baseHeight: CGFloat var followingCursor: Bool = false @@ -1155,10 +1299,13 @@ struct FloatingOverlayView: View { @State private var isUserScrolling: Bool = false private let scrollTimer = Timer.publish(every: 0.05, on: .main, in: .common).autoconnect() + // Hand-gesture rewind state + private var listeningMode: ListeningMode { NotchSettings.shared.listeningMode } + /// Convert fractional word index to char offset using actual word lengths private func charOffsetForWordProgress(_ progress: Double) -> Int { let wholeWord = Int(progress) diff --git a/Textream/Textream/NotchSettings.swift b/Textream/Textream/NotchSettings.swift index 1a02c9d..9b4f7a0 100644 --- a/Textream/Textream/NotchSettings.swift +++ b/Textream/Textream/NotchSettings.swift @@ -407,6 +407,10 @@ class NotchSettings { didSet { UserDefaults.standard.set(selectedMicUID, forKey: "selectedMicUID") } } + var handGestureRewind: Bool { + didSet { UserDefaults.standard.set(handGestureRewind, forKey: "handGestureRewind") } + } + var autoNextPage: Bool { didSet { UserDefaults.standard.set(autoNextPage, forKey: "autoNextPage") } } @@ -483,6 +487,7 @@ class NotchSettings { self.hideFromScreenShare = UserDefaults.standard.object(forKey: "hideFromScreenShare") as? Bool ?? true self.showElapsedTime = UserDefaults.standard.object(forKey: "showElapsedTime") as? Bool ?? true self.selectedMicUID = UserDefaults.standard.string(forKey: "selectedMicUID") ?? 
"" + self.handGestureRewind = UserDefaults.standard.object(forKey: "handGestureRewind") as? Bool ?? true self.autoNextPage = UserDefaults.standard.object(forKey: "autoNextPage") as? Bool ?? false let savedDelay = UserDefaults.standard.integer(forKey: "autoNextPageDelay") self.autoNextPageDelay = savedDelay > 0 ? savedDelay : 3 diff --git a/Textream/Textream/SettingsView.swift b/Textream/Textream/SettingsView.swift index 581526b..727a3c1 100644 --- a/Textream/Textream/SettingsView.swift +++ b/Textream/Textream/SettingsView.swift @@ -743,6 +743,19 @@ struct SettingsView: View { } } + Divider() + + Toggle(isOn: $settings.handGestureRewind) { + VStack(alignment: .leading, spacing: 2) { + Text("Hand Gesture Rewind") + .font(.system(size: 13, weight: .medium)) + Text("Raise your hand to pause and rewind. The higher you raise, the faster it rewinds. Lower your hand to resume. Uses your camera to detect hand position.") + .font(.system(size: 11)) + .foregroundStyle(.secondary) + } + } + .toggleStyle(.checkbox) + Spacer() } .padding(16) diff --git a/Textream/Textream/SpeechRecognizer.swift b/Textream/Textream/SpeechRecognizer.swift index 0730b1c..9e01414 100644 --- a/Textream/Textream/SpeechRecognizer.swift +++ b/Textream/Textream/SpeechRecognizer.swift @@ -209,6 +209,41 @@ class SpeechRecognizer { beginRecognition() } + /// Pause speech recognition for gesture rewind without changing isListening state. + func pauseForRewind() { + cleanupRecognition() + } + + /// Move recognizedCharCount backward by N words. Used during gesture rewind. 
+ func rewindByWords(_ count: Int) { + let chars = Array(sourceText) + var remaining = count + var offset = recognizedCharCount + + while remaining > 0 && offset > 0 { + // Skip any spaces at current position + while offset > 0 && chars[offset - 1] == " " { + offset -= 1 + } + // Skip to start of current word + while offset > 0 && chars[offset - 1] != " " { + offset -= 1 + } + remaining -= 1 + } + + recognizedCharCount = max(0, offset) + matchStartOffset = recognizedCharCount + } + + /// Resume speech recognition after gesture rewind from current position. + func resumeAfterRewind() { + matchStartOffset = recognizedCharCount + retryCount = 0 + isListening = true + beginRecognition() + } + private func cleanupRecognition() { // Cancel any pending restart to prevent overlapping beginRecognition calls pendingRestart?.cancel() diff --git a/Textream/Textream/Textream.entitlements b/Textream/Textream/Textream.entitlements index e2726b6..7c2b752 100644 --- a/Textream/Textream/Textream.entitlements +++ b/Textream/Textream/Textream.entitlements @@ -6,6 +6,8 @@ com.apple.security.device.audio-input + com.apple.security.device.camera + com.apple.security.files.user-selected.read-write com.apple.security.network.client diff --git a/docs/superpowers/plans/2026-03-22-hand-gesture-rewind.md b/docs/superpowers/plans/2026-03-22-hand-gesture-rewind.md new file mode 100644 index 0000000..db29b4e --- /dev/null +++ b/docs/superpowers/plans/2026-03-22-hand-gesture-rewind.md @@ -0,0 +1,501 @@ +# Hand Gesture Rewind Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add hands-free script rewind via raised hand detection using the Mac's front-facing camera and Apple Vision framework. 
+ +**Architecture:** A new `HandGestureController` owns camera capture and Vision hand pose detection, publishing `isHandRaised` and `handHeight`. The overlay views observe these values and dispatch rewind to the appropriate scroll state (`recognizedCharCount` for wordTracking, `timerWordProgress` for classic/silencePaused). New methods on `SpeechRecognizer` handle pause/rewind/resume for wordTracking mode. + +**Tech Stack:** Swift, AVFoundation (camera capture), Vision framework (VNDetectHumanHandPoseRequest) + +**Spec:** `docs/superpowers/specs/2026-03-22-hand-gesture-rewind-design.md` + +**Note:** This project has no test target. All Swift files live in `Textream/Textream/`. The project builds via `xcodebuild` from `Textream/Textream.xcodeproj`. Use `CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY=""` when building. + +**Important — Adding files to Xcode project:** This is a pure Xcode project (no SPM). New `.swift` files are auto-discovered by the build system. Bundle resources and entitlements changes need manual verification. 
+ +--- + +## File Map + +### New Files + +| File | Responsibility | +|------|---------------| +| `Textream/Textream/HandGestureController.swift` | AVCaptureSession setup, VNDetectHumanHandPoseRequest processing, wrist Y smoothing, publishes `isHandRaised` and `handHeight` | + +### Modified Files + +| File | Change | +|------|--------| +| `Textream/Textream/SpeechRecognizer.swift` | Add `pauseForRewind()`, `rewindByWords(_:)`, `resumeAfterRewind()` methods | +| `Textream/Textream/NotchOverlayController.swift` | Create `HandGestureController`, observe hand state in both overlay views, manage rewind timer, dispatch to correct scroll state per mode | +| `Textream/Info.plist` | Add `NSCameraUsageDescription` | +| `Textream/Textream/Textream.entitlements` | Add `com.apple.security.device.camera` | + +--- + +### Task 1: Camera Permission and Entitlements + +**Files:** +- Modify: `Textream/Info.plist` +- Modify: `Textream/Textream/Textream.entitlements` + +- [ ] **Step 1: Add camera usage description to Info.plist** + +Add the following key/value pair inside the `` in `Textream/Info.plist`, after the existing `NSServices` block: + +```xml +NSCameraUsageDescription +Textream uses the camera to detect hand gestures for hands-free script control. 
+``` + +- [ ] **Step 2: Add camera entitlement** + +Add the following key/value pair inside the `` in `Textream/Textream/Textream.entitlements`, after the existing `com.apple.security.device.audio-input` entry: + +```xml +com.apple.security.device.camera + +``` + +- [ ] **Step 3: Verify build** + +```bash +cd Textream && xcodebuild -scheme Textream -configuration Debug CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY="" build 2>&1 | tail -5 +``` + +Expected: BUILD SUCCEEDED + +- [ ] **Step 4: Commit** + +```bash +git add Textream/Info.plist Textream/Textream/Textream.entitlements +git commit -m "feat: add camera permission and entitlement for hand gesture rewind" +``` + +--- + +### Task 2: HandGestureController + +**Files:** +- Create: `Textream/Textream/HandGestureController.swift` + +This is the core camera + Vision processing class. It owns the capture session, runs hand pose detection on each frame, smooths the wrist position, and publishes state. + +- [ ] **Step 1: Implement HandGestureController** + +```swift +import AVFoundation +import Vision +import AppKit + +@Observable +class HandGestureController: NSObject { + var isHandRaised: Bool = false + var handHeight: Float = 0.0 // 0.0 = just above threshold, 1.0 = top of frame + + private var captureSession: AVCaptureSession? 
+ private let videoOutput = AVCaptureVideoDataOutput() + private let processingQueue = DispatchQueue(label: "com.textream.handgesture", qos: .userInteractive) + private let handPoseRequest = VNDetectHumanHandPoseRequest() + + private let raiseThreshold: Float = 0.6 // wrist Y must exceed this to count as raised + private var recentWristY: [Float] = [] // rolling buffer for smoothing + private let smoothingWindow = 4 + + private var isRunning = false + + override init() { + super.init() + handPoseRequest.maximumHandCount = 2 + } + + func start() { + guard !isRunning else { return } + + // Check camera permission + switch AVCaptureDevice.authorizationStatus(for: .video) { + case .authorized: + setupAndStart() + case .notDetermined: + AVCaptureDevice.requestAccess(for: .video) { [weak self] granted in + if granted { + DispatchQueue.main.async { self?.setupAndStart() } + } + } + default: + // Permission denied or restricted — silently disable + return + } + } + + func stop() { + guard isRunning else { return } + captureSession?.stopRunning() + isRunning = false + isHandRaised = false + handHeight = 0.0 + recentWristY = [] + } + + private func setupAndStart() { + let session = AVCaptureSession() + session.sessionPreset = .low // ~640x480, minimal resource usage + + // Find front-facing camera + guard let camera = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .front) + ?? AVCaptureDevice.default(for: .video) else { + // No camera available — silently disable + return + } + + guard let input = try? 
AVCaptureDeviceInput(device: camera) else { return } + guard session.canAddInput(input) else { return } + session.addInput(input) + + videoOutput.setSampleBufferDelegate(self, queue: processingQueue) + videoOutput.alwaysDiscardsLateVideoFrames = true + guard session.canAddOutput(videoOutput) else { return } + session.addOutput(videoOutput) + + // Limit frame rate to ~15fps to save CPU + if let connection = videoOutput.connection(with: .video) { + connection.isEnabled = true + } + try? camera.lockForConfiguration() + camera.activeVideoMinFrameDuration = CMTime(value: 1, timescale: 15) + camera.activeVideoMaxFrameDuration = CMTime(value: 1, timescale: 15) + camera.unlockForConfiguration() + + captureSession = session + + processingQueue.async { + session.startRunning() + } + isRunning = true + } +} + +extension HandGestureController: AVCaptureVideoDataOutputSampleBufferDelegate { + func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return } + + let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .up, options: [:]) + try? handler.perform([handPoseRequest]) + + guard let results = handPoseRequest.results, !results.isEmpty else { + DispatchQueue.main.async { + self.updateWristPosition(nil) + } + return + } + + // Find the hand with the highest wrist Y + var highestWristY: Float = 0 + for hand in results { + if let wrist = try? hand.recognizedPoint(.wrist), + wrist.confidence > 0.3 { + let y = Float(wrist.location.y) // Vision coords: 0=bottom, 1=top + if y > highestWristY { + highestWristY = y + } + } + } + + DispatchQueue.main.async { + self.updateWristPosition(highestWristY > 0 ? highestWristY : nil) + } + } + + private func updateWristPosition(_ wristY: Float?) 
{ + guard let y = wristY else { + // No hand detected — decay smoothly + recentWristY = [] + isHandRaised = false + handHeight = 0.0 + return + } + + // Smooth with rolling average + recentWristY.append(y) + if recentWristY.count > smoothingWindow { + recentWristY.removeFirst() + } + let smoothed = recentWristY.reduce(0, +) / Float(recentWristY.count) + + if smoothed > raiseThreshold { + isHandRaised = true + // Map threshold..1.0 → 0.0..1.0 + handHeight = min(1.0, (smoothed - raiseThreshold) / (1.0 - raiseThreshold)) + } else { + isHandRaised = false + handHeight = 0.0 + } + } +} +``` + +- [ ] **Step 2: Verify build** + +```bash +cd Textream && xcodebuild -scheme Textream -configuration Debug CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY="" build 2>&1 | tail -5 +``` + +Expected: BUILD SUCCEEDED + +- [ ] **Step 3: Commit** + +```bash +git add Textream/Textream/HandGestureController.swift +git commit -m "feat: add HandGestureController with Vision hand pose detection" +``` + +--- + +### Task 3: SpeechRecognizer Rewind Methods + +**Files:** +- Modify: `Textream/Textream/SpeechRecognizer.swift` + +Add three new public methods that the overlay views will call during hand gesture rewind. These methods encapsulate access to the private `matchStartOffset`, `sourceText`, `cleanupRecognition()`, and `beginRecognition()`. + +- [ ] **Step 1: Add pauseForRewind()** + +Add the following method after `resume()` (after line 210 in SpeechRecognizer.swift): + +```swift + /// Pause speech recognition for gesture rewind without changing isListening state. + func pauseForRewind() { + cleanupRecognition() + } +``` + +- [ ] **Step 2: Add rewindByWords(\_:)** + +Add directly after `pauseForRewind()`: + +```swift + /// Move recognizedCharCount backward by N words. Used during gesture rewind. 
+ func rewindByWords(_ count: Int) { + // Work with the string as an array for O(1) indexing + let chars = Array(sourceText) + var remaining = count + var offset = recognizedCharCount + + while remaining > 0 && offset > 0 { + // Skip any spaces at current position + while offset > 0 && chars[offset - 1] == " " { + offset -= 1 + } + // Skip to start of current word + while offset > 0 && chars[offset - 1] != " " { + offset -= 1 + } + remaining -= 1 + } + + recognizedCharCount = max(0, offset) + matchStartOffset = recognizedCharCount + } +``` + +- [ ] **Step 3: Add resumeAfterRewind()** + +Add directly after `rewindByWords(_:)`: + +```swift + /// Resume speech recognition after gesture rewind from current position. + func resumeAfterRewind() { + matchStartOffset = recognizedCharCount + retryCount = 0 + beginRecognition() + } +``` + +- [ ] **Step 4: Verify build** + +```bash +cd Textream && xcodebuild -scheme Textream -configuration Debug CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY="" build 2>&1 | tail -5 +``` + +Expected: BUILD SUCCEEDED + +- [ ] **Step 5: Commit** + +```bash +git add Textream/Textream/SpeechRecognizer.swift +git commit -m "feat: add pauseForRewind, rewindByWords, resumeAfterRewind to SpeechRecognizer" +``` + +--- + +### Task 4: Integrate HandGestureController into Overlay Views + +**Files:** +- Modify: `Textream/Textream/NotchOverlayController.swift` + +This is the integration task. The `HandGestureController` needs to be: +1. Created and owned by `NotchOverlayController` +2. Started/stopped with reading sessions +3. Observed by both `NotchOverlayView` and `FloatingOverlayView` to drive rewind + +**Key context:** `timerWordProgress` is `@State private` on both `NotchOverlayView` (line 625) and `FloatingOverlayView` (line 1153). The rewind logic for classic/silencePaused must live inside these views since they own the state. The `HandGestureController` is passed to both views and observed via `onChange(of:)`. 
+ +- [ ] **Step 1: Add HandGestureController to NotchOverlayController** + +In the `NotchOverlayController` class (around line 47), add a property: + +```swift +let handGestureController = HandGestureController() +``` + +- [ ] **Step 2: Start/stop camera with reading sessions** + +Find `show(text:hasNextPage:onComplete:)` (line 62) — after the existing `speechRecognizer.start(with:)` call (line 118), add: + +```swift +handGestureController.start() +``` + +Find `updateContent(text:hasNextPage:)` (line 122) — similarly add `handGestureController.start()` after the recognizer start. + +Find `dismiss()` (line 376) and `forceClose()` (line 411) — add to both: + +```swift +handGestureController.stop() +``` + +- [ ] **Step 3: Pass HandGestureController to NotchOverlayView** + +The `NotchOverlayView` needs access to `handGestureController`. Add it as a parameter to the view's init. Find where `NotchOverlayView` is created in `NotchOverlayController` and pass `handGestureController`. + +In `NotchOverlayView`, add a property: + +```swift +var handGesture: HandGestureController +``` + +- [ ] **Step 4: Add rewind logic to NotchOverlayView** + +Add a rewind timer state and handler to `NotchOverlayView`. Add these properties near the other `@State` declarations (around line 625): + +```swift +@State private var rewindTimer: Timer? +@State private var resumeDelay: DispatchWorkItem? 
+``` + +Add a helper to compute words-per-tick from hand height: + +```swift +private func rewindWordsPerTick(handHeight: Float) -> Int { + if handHeight < 0.3 { return 1 } + if handHeight < 0.7 { return 2 } + return 4 +} +``` + +Add `onChange` handlers in the view body (inside the main container, near the existing `onChange` handlers): + +```swift +.onChange(of: handGesture.isHandRaised) { _, raised in + if raised { + // Cancel any pending resume delay + resumeDelay?.cancel() + resumeDelay = nil + + // Pause current mode + switch listeningMode { + case .wordTracking: + speechRecognizer.pauseForRewind() + case .classic: + isPaused = true + case .silencePaused: + speechRecognizer.pauseForRewind() + isPaused = true // also pause the scroll timer + } + + // Start rewind timer + rewindTimer = Timer.scheduledTimer(withTimeInterval: 0.25, repeats: true) { _ in + let words = rewindWordsPerTick(handHeight: handGesture.handHeight) + switch listeningMode { + case .wordTracking: + speechRecognizer.rewindByWords(words) + case .classic, .silencePaused: + timerWordProgress = max(0, timerWordProgress - Double(words)) + } + } + } else { + // Stop rewind timer + rewindTimer?.invalidate() + rewindTimer = nil + + // Resume based on mode + switch listeningMode { + case .wordTracking: + speechRecognizer.resumeAfterRewind() + case .classic: + let work = DispatchWorkItem { isPaused = false } + resumeDelay = work + DispatchQueue.main.asyncAfter(deadline: .now() + 1.5, execute: work) + case .silencePaused: + speechRecognizer.resumeAfterRewind() + let work = DispatchWorkItem { isPaused = false } + resumeDelay = work + DispatchQueue.main.asyncAfter(deadline: .now() + 1.5, execute: work) + } + } +} +.onDisappear { + rewindTimer?.invalidate() + rewindTimer = nil + resumeDelay?.cancel() + resumeDelay = nil +} +``` + +- [ ] **Step 5: Repeat for FloatingOverlayView** + +Apply the same changes to `FloatingOverlayView` (starts around line 1136): +- Add `handGesture: HandGestureController` property 
+- Add `rewindTimer`, `resumeDelay` state +- Add `rewindWordsPerTick` helper +- Add the same `onChange(of: handGesture.isHandRaised)` handler with `.onDisappear` cleanup +- Pass `handGestureController` from where `FloatingOverlayView` is created + +**Finding all view instantiation sites:** Search `NotchOverlayController.swift` for `NotchOverlayView(` and `FloatingOverlayView(` to find every place these views are created. Each call site must pass the `handGestureController`. There are typically 1-2 sites per view (in `showPinned`, `showFollowCursor`, `showFloating`, etc.). + +**Note:** Fullscreen mode uses `ExternalDisplayView` which is not addressed in this plan — gesture rewind in fullscreen is a future enhancement. + +- [ ] **Step 6: Verify build** + +```bash +cd Textream && xcodebuild -scheme Textream -configuration Debug CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY="" build 2>&1 | tail -5 +``` + +Expected: BUILD SUCCEEDED + +- [ ] **Step 7: Build and launch the app** + +```bash +cd Textream && xcodebuild -scheme Textream -configuration Debug CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY="" build 2>&1 | tail -5 +``` + +Then launch the app and test: +1. Load a script and start reading in Word Tracking mode +2. Raise your hand above shoulder height — script should start rewinding +3. Raise hand higher — rewind should speed up +4. Lower hand — speech recognition should resume from new position +5. Switch to Classic mode, start auto-scroll, raise hand — should rewind `timerWordProgress` +6. Lower hand — 1.5s pause, then auto-scroll resumes + +- [ ] **Step 8: Commit** + +```bash +git add Textream/Textream/NotchOverlayController.swift +git commit -m "feat: integrate hand gesture rewind into overlay views + +Start/stop camera with reading sessions. Both NotchOverlayView +and FloatingOverlayView observe HandGestureController to drive +rewind in all three listening modes." 
+``` diff --git a/docs/superpowers/specs/2026-03-22-hand-gesture-rewind-design.md b/docs/superpowers/specs/2026-03-22-hand-gesture-rewind-design.md new file mode 100644 index 0000000..bd02229 --- /dev/null +++ b/docs/superpowers/specs/2026-03-22-hand-gesture-rewind-design.md @@ -0,0 +1,99 @@ +# Hand Gesture Rewind — Design Spec + +## Problem + +When presenting with Textream, the speaker sometimes needs to go back a few sentences in the script — to re-read a section, correct a mistake, or recover after going off-script. Currently there is no hands-free way to rewind. The speaker would have to walk to the laptop and tap the screen. + +## Solution + +Use the Mac's front-facing camera and Apple's Vision framework to detect a raised hand. While the hand is raised, the script rewinds continuously. Hand height controls rewind speed — higher hand = faster rewind. Lowering the hand resumes normal operation. + +## Hand Detection Pipeline + +A new `HandGestureController` class owns camera capture and Vision processing: + +1. **AVCaptureSession** captures frames from the default front-facing camera at low resolution (~640x480) and low frame rate (~15fps) +2. Each frame is processed by **VNDetectHumanHandPoseRequest** which returns hand landmark coordinates +3. The **wrist Y-position** (0.0 = bottom of frame, 1.0 = top in Vision coordinates) is extracted and smoothed with a rolling average of the last 4 frames to reduce jitter +4. A **raise threshold** (wrist Y > 0.25) determines whether the hand is raised, with a slightly lower threshold (wrist Y < 0.20) required before the hand counts as lowered again — hysteresis that prevents flickering when the wrist hovers near the boundary. Below the raise threshold, the hand is in the speaker's lap or at their side and is ignored +5. If multiple hands are detected, use the one with the highest wrist Y-position +6. 
The controller publishes two observable values: + - `isHandRaised: Bool` + - `handHeight: Float` (0.0 = just above threshold, 1.0 = top of frame) + +### Lifecycle + +- Camera starts when a reading session begins — triggered by the overlay controller's `startReading()` flow, not tied to `SpeechRecognizer.start()` (since classic mode never calls it) +- Camera stops when the reading session ends +- Camera is NOT running when the app is idle +- If no camera is available (Mac Mini, Mac Pro, external displays without camera), the gesture feature is silently disabled + +### Camera Selection + +Uses the default front-facing camera. No settings UI for camera selection in this iteration. + +### Camera Permission + +If camera access is denied, the gesture feature is silently unavailable — no error is shown, no functionality is blocked. The rest of the app works normally. The `NSCameraUsageDescription` key in `Info.plist` provides the permission prompt text. + +## Rewind Behavior + +The scroll state lives in different places depending on the listening mode: +- **wordTracking:** `recognizedCharCount` on `SpeechRecognizer` +- **classic / silencePaused:** `timerWordProgress` on the overlay controller + +The `HandGestureController` publishes `isHandRaised` and `handHeight`. The overlay controller observes these and dispatches rewind to the appropriate state. + +### When hand is raised (`isHandRaised` becomes true): + +**In wordTracking mode:** +1. Pause speech recognition — call a new `pauseForRewind()` method on `SpeechRecognizer` that stops the audio engine and recognition task without setting `isListening = false` +2. Start a rewind timer (every 0.25 seconds) that calls a new `rewindByWords(_ count: Int)` method on `SpeechRecognizer`, which moves `recognizedCharCount` backward by N words (finding previous space characters in `sourceText`) and updates `matchStartOffset` to match + +**In classic / silencePaused mode:** +1. Pause the scroll timer +2. 
Start a rewind timer (every 0.25 seconds) that decrements `timerWordProgress` by N words + +**Speed (all modes):** The `handHeight` value controls how many words per tick: +- Low hand (0.0–0.3): 1 word per tick (~4 words/sec) +- Mid hand (0.3–0.7): 2 words per tick (~8 words/sec) +- High hand (0.7–1.0): 4 words per tick (~16 words/sec) + +Position is clamped to never go below 0. + +### When hand is lowered (`isHandRaised` becomes false): + +**In wordTracking mode:** Call a new `resumeAfterRewind()` method on `SpeechRecognizer` that sets `matchStartOffset = recognizedCharCount` and calls `beginRecognition()` to resume speech tracking from the new position. + +**In classic / silencePaused mode:** Wait 1.5 seconds, then resume the scroll timer from the current `timerWordProgress` position. + +### Visual Feedback + +In wordTracking mode, MarqueeTextView observes `recognizedCharCount` — rewind scrolls backward automatically. In classic/silencePaused modes, the view observes `timerWordProgress` — same effect. No new UI components needed. + +## File Organization + +### New File + +| File | Responsibility | +|------|---------------| +| `HandGestureController.swift` | AVCaptureSession setup, VNDetectHumanHandPoseRequest processing, wrist position smoothing, publishes `isHandRaised` and `handHeight` | + +### Modified Files + +| File | Change | +|------|--------| +| `SpeechRecognizer.swift` | Add `pauseForRewind()`, `rewindByWords(_:)`, and `resumeAfterRewind()` methods | +| `NotchOverlayController.swift` | Create and own `HandGestureController`, observe hand state, dispatch rewind to `SpeechRecognizer` or `timerWordProgress` depending on mode, manage rewind timer | +| `Info.plist` | Add `NSCameraUsageDescription` | + +### Unchanged + +MarqueeTextView, NotchSettings, ContentView, SettingsView, BrowserServer, ExternalDisplayController. 
+ +## Supported Listening Modes + +The gesture rewind works in all three listening modes: +- **Word Tracking** (speech recognition) — rewinds `recognizedCharCount` +- **Classic** (constant auto-scroll) — rewinds `timerWordProgress` +- **Voice-Activated** (silence-paused auto-scroll) — rewinds `timerWordProgress`