diff --git a/sdk/rt/speechmatics/rt/_async_client.py b/sdk/rt/speechmatics/rt/_async_client.py
index 5e581e15..b409bfb7 100644
--- a/sdk/rt/speechmatics/rt/_async_client.py
+++ b/sdk/rt/speechmatics/rt/_async_client.py
@@ -12,6 +12,7 @@
 from ._exceptions import TimeoutError
 from ._exceptions import TranscriptionError
 from ._logging import get_logger
+from ._models import AudioEncoding
 from ._models import AudioEventsConfig
 from ._models import AudioFormat
 from ._models import ClientMessageType
@@ -97,6 +98,8 @@ def __init__(
         self.on(ServerMessageType.WARNING, self._on_warning)
         self.on(ServerMessageType.AUDIO_ADDED, self._on_audio_added)
 
+        self._audio_format = AudioFormat(encoding=AudioEncoding.PCM_S16LE, sample_rate=44100, chunk_size=4096)
+
         self._logger.debug("AsyncClient initialized (request_id=%s)", self._session.request_id)
 
     async def start_session(
@@ -133,6 +136,9 @@ async def start_session(
             ...     await client.start_session()
             ...     await client.send_audio(frame)
         """
+        if audio_format is not None:
+            self._audio_format = audio_format
+
         await self._start_recognition_session(
             transcription_config=transcription_config,
             audio_format=audio_format,
@@ -161,12 +167,18 @@ async def stop_session(self) -> None:
         await self._session_done_evt.wait()  # Wait for end of transcript event to indicate we can stop listening
         await self.close()
 
-    async def force_end_of_utterance(self) -> None:
+    async def force_end_of_utterance(self, timestamp: Optional[float] = None) -> None:
         """
         This method sends a ForceEndOfUtterance message to the server to signal
         the end of an utterance. Forcing end of utterance will cause the final
         transcript to be sent to the client early.
 
+        Takes an optional timestamp parameter to specify a marker for the engine
+        to use for timing of the end of the utterance.
+
+        Args:
+            timestamp: Optional timestamp for the request.
+
         Raises:
             ConnectionError: If the WebSocket connection fails.
             TranscriptionError: If the server reports an error during teardown.
@@ -179,7 +191,28 @@ async def force_end_of_utterance(self) -> None:
             ...     await client.send_audio(frame)
             ...     await client.force_end_of_utterance()
         """
-        await self.send_message({"message": ClientMessageType.FORCE_END_OF_UTTERANCE})
+
+        msg: dict = {"message": ClientMessageType.FORCE_END_OF_UTTERANCE}
+        if timestamp is not None:
+            msg["timestamp"] = timestamp
+
+        await self.send_message(msg)
+
+    @property
+    def audio_seconds_sent(self) -> float:
+        """Number of audio seconds sent to the server.
+
+        Derived from the byte counter and the tracked audio format as
+        ``bytes / (sample_rate * bytes_per_sample)``.
+
+        Raises:
+            ValueError: If the audio format does not have an encoding set,
+                or if its sample rate is not a positive number.
+        """
+        bytes_per_second = self._audio_format.sample_rate * self._audio_format.bytes_per_sample
+        if bytes_per_second <= 0:
+            raise ValueError(f"Invalid sample rate: {self._audio_format.sample_rate}")
+        return self._audio_bytes_sent / bytes_per_second
 
     async def transcribe(
         self,
diff --git a/sdk/rt/speechmatics/rt/_base_client.py b/sdk/rt/speechmatics/rt/_base_client.py
index 0ac6d085..89167e20 100644
--- a/sdk/rt/speechmatics/rt/_base_client.py
+++ b/sdk/rt/speechmatics/rt/_base_client.py
@@ -42,6 +42,7 @@ def __init__(self, transport: Transport) -> None:
         self._recv_task: Optional[asyncio.Task[None]] = None
         self._closed_evt = asyncio.Event()
         self._eos_sent = False
+        self._audio_bytes_sent = 0
         self._seq_no = 0
         self._logger = get_logger("speechmatics.rt.base_client")
 
@@ -122,11 +123,17 @@ async def send_audio(self, payload: bytes) -> None:
 
         try:
             await self._transport.send_message(payload)
+            self._audio_bytes_sent += len(payload)
             self._seq_no += 1
         except Exception:
             self._closed_evt.set()
             raise
 
+    @property
+    def audio_bytes_sent(self) -> int:
+        """Number of audio bytes sent to the server."""
+        return self._audio_bytes_sent
+
     async def send_message(self, message: dict[str, Any]) -> None:
         """
         Send a message through the WebSocket.
diff --git a/sdk/rt/speechmatics/rt/_models.py b/sdk/rt/speechmatics/rt/_models.py
index 84e57204..d1f6acbf 100644
--- a/sdk/rt/speechmatics/rt/_models.py
+++ b/sdk/rt/speechmatics/rt/_models.py
@@ -183,6 +183,31 @@ class AudioFormat:
     sample_rate: int = 44100
     chunk_size: int = 4096
 
+    # Width in bytes of one sample for each raw encoding.
+    _BYTES_PER_SAMPLE = {
+        AudioEncoding.PCM_F32LE: 4,
+        AudioEncoding.PCM_S16LE: 2,
+        AudioEncoding.MULAW: 1,
+    }
+
+    @property
+    def bytes_per_sample(self) -> int:
+        """Number of bytes per audio sample based on encoding.
+
+        Raises:
+            ValueError: If encoding is None (file type) or unrecognized.
+        """
+        if self.encoding is None:
+            raise ValueError(
+                "Cannot determine bytes per sample for file-type audio format. "
+                "Set an explicit encoding on AudioFormat."
+            )
+        try:
+            return self._BYTES_PER_SAMPLE[self.encoding]
+        except KeyError:
+            # The failed table lookup is an implementation detail; hide it from the traceback.
+            raise ValueError(f"Unknown encoding: {self.encoding}") from None
+
     def to_dict(self) -> dict[str, Any]:
         """
         Convert audio format to dictionary.