# Source: Desktop_AI_Agent/audio_record.py (Anish-CodeDev/Desktop_AI_Agent, main branch)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import torch
import numpy as np
import sounddevice as sd
import scipy.io.wavfile as wav
import io
from silero_vad import load_silero_vad, get_speech_timestamps, collect_chunks

# Capture settings consumed by record_speech().
SAMPLE_RATE = 16000  # passed as the sample rate to both sd.rec() and the VAD
CHANNELS = 1         # number of input channels requested from sd.rec()
MAX_DURATION = 20    # multiplied by SAMPLE_RATE to get the frame count to record

# Load the Silero VAD model and its helper tuple through torch.hub.
model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad', force_reload=False)

# Unpack the hub helpers. NOTE: get_speech_timestamps and collect_chunks
# rebind the names imported from the silero_vad package above, so the rest of
# this module uses the torch.hub versions.
get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils

def record_speech() -> bool:
    """Record a fixed-length clip and save only the parts that contain speech.

    Records ``MAX_DURATION * SAMPLE_RATE`` frames with ``CHANNELS`` channels
    via ``sounddevice``, runs the module-level Silero VAD ``model`` over the
    clip, concatenates the detected speech segments and writes them to
    ``recordings/recording.mp3``. The ``recordings`` directory is created when
    it does not exist yet.

    Returns:
        ``True`` when speech was detected and the file was written,
        ``False`` when no speech was detected (no file is written).
    """
    # Function-scope import so this block brings its own dependency into scope.
    import os

    output_path = 'recordings/recording.mp3'

    print('Started Recording')
    # Record from microphone, then block until the capture has finished.
    recording = sd.rec(
        int(MAX_DURATION * SAMPLE_RATE),
        samplerate=SAMPLE_RATE,
        channels=CHANNELS,
        dtype="float32",
    )
    sd.wait()

    # Drop the length-1 channel axis so the VAD receives a flat signal.
    audio = recording.squeeze()

    # Detect speech segments.
    torch_audio = torch.from_numpy(audio).float()
    speech_timestamps = get_speech_timestamps(torch_audio, model, sampling_rate=SAMPLE_RATE)

    if not speech_timestamps:
        print("❌ No speech detected.")
        return False

    # Keep only the speech parts (silence between segments is dropped).
    speech_audio = collect_chunks(speech_timestamps, torch_audio)

    # Encode to WAV in memory first, so the target file is only touched once
    # encoding has succeeded.
    buffer = io.BytesIO()
    wav.write(buffer, SAMPLE_RATE, speech_audio.numpy())

    # Previously a missing 'recordings' directory made open() raise
    # FileNotFoundError; create it on demand instead.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # NOTE(review): the payload is WAV data even though the file name ends in
    # ".mp3". The path is left unchanged because other code may read this exact
    # file name -- confirm with the consumers before renaming.
    with open(output_path, 'wb') as f:
        f.write(buffer.getvalue())

    print('Recording done')
    return True