diff --git a/include/ctrlm_ipc.h b/include/ctrlm_ipc.h index 341d4ff9..2cfc02d3 100644 --- a/include/ctrlm_ipc.h +++ b/include/ctrlm_ipc.h @@ -302,7 +302,8 @@ typedef enum { CTRLM_RCU_IARM_EVENT_RF4CE_PAIRING_WINDOW_TIMEOUT = 35, ///< Indicates that a battery milestone event occured CTRLM_RCU_IARM_EVENT_FIRMWARE_UPDATE_PROGRESS = 36, ///< Generated when an milestone is reached for remote firmware upgrade CTRLM_RCU_IARM_EVENT_VALIDATION_STATUS = 37, ///< Generated when the validation status changes - CTRLM_MAIN_IARM_EVENT_MAX = 38 ///< Placeholder for the last event (used in registration) + CTRLM_VOICE_IARM_EVENT_SESSION_SILENT = 38, ///< Voice session was silent (no speech detected) + CTRLM_MAIN_IARM_EVENT_MAX = 39 ///< Placeholder for the last event (used in registration) } ctrlm_main_iarm_event_t; /// @brief Remote Control Key Status diff --git a/include/ctrlm_ipc_voice.h b/include/ctrlm_ipc_voice.h index ed96c951..4a912cad 100644 --- a/include/ctrlm_ipc_voice.h +++ b/include/ctrlm_ipc_voice.h @@ -281,6 +281,16 @@ typedef struct { long return_code_internal; ///< Internally generated return code } ctrlm_voice_iarm_event_session_short_t; +typedef struct { + unsigned char api_revision; ///< The revision of this API. + ctrlm_network_id_t network_id; ///< Identifier of network on which the controller is bound + ctrlm_network_type_t network_type; ///< Type of network on which the controller is bound + ctrlm_controller_id_t controller_id; ///< A unique identifier of the remote + unsigned long session_id; ///< A unique id for the voice session. 
+ ctrlm_voice_session_end_reason_t reason; ///< The reason that the voice session was silent + long return_code_internal; ///< Internally generated return code +} ctrlm_voice_iarm_event_session_silent_t; + typedef struct { unsigned char api_revision; ///< The revision of this API char media_service_url[2083]; ///< The url for the media service (null terminated string) @@ -376,6 +386,7 @@ typedef struct { /// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_SESSION_STATS | ctrlm_voice_iarm_event_session_stats_t * | Generated when the statistics of the voice session are available | /// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_SESSION_ABORT | ctrlm_voice_iarm_event_session_abort_t * | Generated when a voice session is aborted (denied) | /// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_SESSION_SHORT | ctrlm_voice_iarm_event_session_short_t * | Generated when a short voice session is detected | +/// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_SESSION_SILENT | ctrlm_voice_iarm_event_session_silent_t * | Generated when a silent voice session is detected | /// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_MEDIA_SERVICE | ctrlm_voice_iarm_event_media_service_t * | Generated when a media service response is received | /// /// IARM events are available on a subscription basis. 
In order to receive an event, a client must explicitly register to receive the event by calling diff --git a/src/ctrlm_config_default.json b/src/ctrlm_config_default.json index af65c196..21aa49c6 100755 --- a/src/ctrlm_config_default.json +++ b/src/ctrlm_config_default.json @@ -291,7 +291,9 @@ "par_voice_eos_method" : 1, "par_voice_eos_timeout" : 2500, "server_hosts" : [], - "telemetry_session_stats" : false + "telemetry_session_stats" : false, + "voice_activity_detection_mode" : "enabled" + }, "device_update" : { "dir_root" : "/srv/device_update/", diff --git a/src/telemetry/ctrlm_telemetry_markers.h b/src/telemetry/ctrlm_telemetry_markers.h index 23c0f775..31e19e00 100644 --- a/src/telemetry/ctrlm_telemetry_markers.h +++ b/src/telemetry/ctrlm_telemetry_markers.h @@ -93,7 +93,7 @@ // The Voice Session Statistics Marker reports statistics for the voice session(s). The format of the marker is a json array of arrays with each event in the format below: // // [[event1], [event2], [event3], ...] -// [,,,,,,,,,,,,,,,,,,,,] +// [,,,,,,,,,,,,,,,,,,,,,,,,,] // // - Version of the marker format. // - Name of the device that started the session. @@ -116,8 +116,14 @@ // - server success/error code. // - server message. // - flag to indicate if session was successful. +// - reason why the stream ended (if available). +// - protocol return code. +// - flag to indicate if voice was detected. +// - peak confidence level. +// - peak RMS level. 
+ #define MARKER_VOICE_SESSION_STATS "ctrlm.voice.session.stats" -#define MARKER_VOICE_SESSION_STATS_VERSION "2" +#define MARKER_VOICE_SESSION_STATS_VERSION "3" // End Voice Session Statistics diff --git a/src/voice/ctrlm_voice_obj.cpp b/src/voice/ctrlm_voice_obj.cpp index a6787b1e..aa0738fd 100644 --- a/src/voice/ctrlm_voice_obj.cpp +++ b/src/voice/ctrlm_voice_obj.cpp @@ -186,39 +186,40 @@ ctrlm_voice_t::ctrlm_voice_t() { #ifdef JSON_ARRAY_VAL_STR_VOICE_SERVER_HOSTS_3 this->url_hostname_pattern_add(JSON_ARRAY_VAL_STR_VOICE_SERVER_HOSTS_3); #endif - this->prefs.aspect_ratio = JSON_STR_VALUE_VOICE_ASPECT_RATIO; - this->prefs.guide_language = JSON_STR_VALUE_VOICE_LANGUAGE; - this->prefs.app_id_http = JSON_STR_VALUE_VOICE_APP_ID_HTTP; - this->prefs.app_id_ws = JSON_STR_VALUE_VOICE_APP_ID_WS; - this->prefs.timeout_vrex_connect = JSON_INT_VALUE_VOICE_VREX_REQUEST_TIMEOUT; - this->prefs.timeout_vrex_session = JSON_INT_VALUE_VOICE_VREX_RESPONSE_TIMEOUT; - this->prefs.timeout_stats = JSON_INT_VALUE_VOICE_TIMEOUT_STATS; - this->prefs.timeout_packet_initial = JSON_INT_VALUE_VOICE_TIMEOUT_PACKET_INITIAL; - this->prefs.timeout_packet_subsequent = JSON_INT_VALUE_VOICE_TIMEOUT_PACKET_SUBSEQUENT; - this->prefs.bitrate_minimum = JSON_INT_VALUE_VOICE_BITRATE_MINIMUM; - this->prefs.time_threshold = JSON_INT_VALUE_VOICE_TIME_THRESHOLD; - this->prefs.utterance_save = ctrlm_is_production_build() ? 
JSON_ARRAY_VAL_BOOL_VOICE_SAVE_LAST_UTTERANCE_0 : JSON_ARRAY_VAL_BOOL_VOICE_SAVE_LAST_UTTERANCE_1; - this->prefs.utterance_use_curtail = JSON_BOOL_VALUE_VOICE_UTTERANCE_USE_CURTAIL; - this->prefs.utterance_file_qty_max = JSON_INT_VALUE_VOICE_UTTERANCE_FILE_QTY_MAX; - this->prefs.utterance_file_size_max = JSON_INT_VALUE_VOICE_UTTERANCE_FILE_SIZE_MAX; - this->prefs.utterance_path = JSON_STR_VALUE_VOICE_UTTERANCE_PATH; - this->prefs.utterance_duration_min = JSON_INT_VALUE_VOICE_MINIMUM_DURATION; - this->prefs.ffv_leading_samples = JSON_INT_VALUE_VOICE_FFV_LEADING_SAMPLES; - this->prefs.force_voice_settings = JSON_BOOL_VALUE_VOICE_FORCE_VOICE_SETTINGS; - this->prefs.vrex_test_flag = JSON_BOOL_VALUE_VOICE_VREX_TEST_FLAG; - this->prefs.vrex_wuw_bypass_success_flag = JSON_BOOL_VALUE_VOICE_VREX_WUW_BYPASS_SUCCESS_FLAG; - this->prefs.vrex_wuw_bypass_failure_flag = JSON_BOOL_VALUE_VOICE_VREX_WUW_BYPASS_FAILURE_FLAG; - this->prefs.force_toggle_fallback = JSON_BOOL_VALUE_VOICE_FORCE_TOGGLE_FALLBACK; - this->prefs.telemetry_session_stats = JSON_BOOL_VALUE_VOICE_TELEMETRY_SESSION_STATS; - this->prefs.par_voice_enabled = false; - this->prefs.par_voice_eos_method = JSON_INT_VALUE_VOICE_PAR_VOICE_EOS_METHOD; - this->prefs.par_voice_eos_timeout = JSON_INT_VALUE_VOICE_PAR_VOICE_EOS_TIMEOUT; + this->prefs.aspect_ratio = JSON_STR_VALUE_VOICE_ASPECT_RATIO; + this->prefs.guide_language = JSON_STR_VALUE_VOICE_LANGUAGE; + this->prefs.app_id_http = JSON_STR_VALUE_VOICE_APP_ID_HTTP; + this->prefs.app_id_ws = JSON_STR_VALUE_VOICE_APP_ID_WS; + this->prefs.timeout_vrex_connect = JSON_INT_VALUE_VOICE_VREX_REQUEST_TIMEOUT; + this->prefs.timeout_vrex_session = JSON_INT_VALUE_VOICE_VREX_RESPONSE_TIMEOUT; + this->prefs.timeout_stats = JSON_INT_VALUE_VOICE_TIMEOUT_STATS; + this->prefs.timeout_packet_initial = JSON_INT_VALUE_VOICE_TIMEOUT_PACKET_INITIAL; + this->prefs.timeout_packet_subsequent = JSON_INT_VALUE_VOICE_TIMEOUT_PACKET_SUBSEQUENT; + this->prefs.bitrate_minimum = 
JSON_INT_VALUE_VOICE_BITRATE_MINIMUM; + this->prefs.time_threshold = JSON_INT_VALUE_VOICE_TIME_THRESHOLD; + this->prefs.utterance_save = ctrlm_is_production_build() ? JSON_ARRAY_VAL_BOOL_VOICE_SAVE_LAST_UTTERANCE_0 : JSON_ARRAY_VAL_BOOL_VOICE_SAVE_LAST_UTTERANCE_1; + this->prefs.utterance_use_curtail = JSON_BOOL_VALUE_VOICE_UTTERANCE_USE_CURTAIL; + this->prefs.utterance_file_qty_max = JSON_INT_VALUE_VOICE_UTTERANCE_FILE_QTY_MAX; + this->prefs.utterance_file_size_max = JSON_INT_VALUE_VOICE_UTTERANCE_FILE_SIZE_MAX; + this->prefs.utterance_path = JSON_STR_VALUE_VOICE_UTTERANCE_PATH; + this->prefs.utterance_duration_min = JSON_INT_VALUE_VOICE_MINIMUM_DURATION; + this->prefs.ffv_leading_samples = JSON_INT_VALUE_VOICE_FFV_LEADING_SAMPLES; + this->prefs.voice_activity_detection_mode = this->voice_activity_detection_mode_to_xrsr(JSON_STR_VALUE_VOICE_VOICE_ACTIVITY_DETECTION_MODE); + this->prefs.force_voice_settings = JSON_BOOL_VALUE_VOICE_FORCE_VOICE_SETTINGS; + this->prefs.vrex_test_flag = JSON_BOOL_VALUE_VOICE_VREX_TEST_FLAG; + this->prefs.vrex_wuw_bypass_success_flag = JSON_BOOL_VALUE_VOICE_VREX_WUW_BYPASS_SUCCESS_FLAG; + this->prefs.vrex_wuw_bypass_failure_flag = JSON_BOOL_VALUE_VOICE_VREX_WUW_BYPASS_FAILURE_FLAG; + this->prefs.force_toggle_fallback = JSON_BOOL_VALUE_VOICE_FORCE_TOGGLE_FALLBACK; + this->prefs.telemetry_session_stats = JSON_BOOL_VALUE_VOICE_TELEMETRY_SESSION_STATS; + this->prefs.par_voice_enabled = false; + this->prefs.par_voice_eos_method = JSON_INT_VALUE_VOICE_PAR_VOICE_EOS_METHOD; + this->prefs.par_voice_eos_timeout = JSON_INT_VALUE_VOICE_PAR_VOICE_EOS_TIMEOUT; this->voice_params_opus_encoder_default(); - this->xrsr_opened = false; - this->voice_ipc = NULL; - this->packet_loss_threshold = JSON_INT_VALUE_VOICE_PACKET_LOSS_THRESHOLD; - this->vsdk_config = NULL; - this->nsm_voice_session = false; + this->xrsr_opened = false; + this->voice_ipc = NULL; + this->packet_loss_threshold = JSON_INT_VALUE_VOICE_PACKET_LOSS_THRESHOLD; + this->vsdk_config = NULL; 
+ this->nsm_voice_session = false; #ifndef TELEMETRY_SUPPORT XLOGD_WARN("telemetry is not enabled"); @@ -261,6 +262,7 @@ ctrlm_voice_t::ctrlm_voice_t() { this->secure_url_required = JSON_BOOL_VALUE_VOICE_REQUIRE_SECURE_URL; XLOGD_TELEMETRY("require i_SAT <%s> i_MTLS <%s> i_secure_url <%s>", this->sat_token_required ? "YES" : "NO", this->mtls_required ? "YES" : "NO", this->secure_url_required ? "YES" : "NO"); + XLOGD_INFO("voice activity detection mode <%s>", xrsr_stream_voice_activity_mode_str(this->prefs.voice_activity_detection_mode)); errno_t safec_rc = memset_s(this->sat_token, sizeof(this->sat_token), 0, sizeof(this->sat_token)); ERR_CHK(safec_rc); @@ -425,6 +427,13 @@ bool ctrlm_voice_t::voice_configure_config_file_json(json_t *obj_voice, json_t * conf.config_value_get(JSON_STR_NAME_VOICE_URL_SRC_MIC_TAP, this->prefs.server_url_src_mic_tap); conf.config_value_get(JSON_STR_NAME_VOICE_LANGUAGE, this->prefs.guide_language); conf.config_value_get(JSON_INT_NAME_VOICE_MINIMUM_DURATION, this->prefs.utterance_duration_min); + + std::string voice_activity_detection_mode; + if(conf.config_value_get(JSON_STR_NAME_VOICE_VOICE_ACTIVITY_DETECTION_MODE, voice_activity_detection_mode)) { + this->prefs.voice_activity_detection_mode = this->voice_activity_detection_mode_to_xrsr(voice_activity_detection_mode); + XLOGD_INFO("voice activity detection mode <%s>", xrsr_stream_voice_activity_mode_str(this->prefs.voice_activity_detection_mode)); + } + if(conf.config_value_get(JSON_BOOL_NAME_VOICE_ENABLE_SAT, this->sat_token_required)) { ctrlm_sm_voice_sat_enable_write(this->sat_token_required); XLOGD_TELEMETRY("require c_SAT <%s>", this->sat_token_required ? 
"YES" : "NO"); @@ -1089,6 +1098,17 @@ void ctrlm_voice_t::voice_params_opus_encoder_default(void) { this->voice_params_opus_samples_per_packet_set(); } +xrsr_stream_voice_activity_mode_t ctrlm_voice_t::voice_activity_detection_mode_to_xrsr(std::string mode) { + // Convert the configured mode string to the corresponding xrsr voice activity mode + if(mode == "enabled") { // Voice activity detection will be used but not enforced + return(XRSR_STREAM_VOICE_ACTIVITY_MODE_ENABLED); + } else if(mode == "enforced") { // Voice session will only proceed if voice activity is detected + return(XRSR_STREAM_VOICE_ACTIVITY_MODE_ENFORCED); + } + // Voice activity detection is disabled (or invalid mode) + return(XRSR_STREAM_VOICE_ACTIVITY_MODE_DISABLED); +} + void ctrlm_voice_t::voice_params_opus_samples_per_packet_set(void) { guchar fr_dur = (this->prefs.opus_encoder_params[3] >> 4) & 0xF; switch(fr_dur) { @@ -2645,6 +2665,12 @@ void ctrlm_voice_t::voice_session_end_callback(ctrlm_voice_session_end_cb_t *ses end.result = SESSION_END_SHORT_UTTERANCE; end.reason = (int)session->end_reason_rcu; this->voice_ipc->session_end(end); + } else if(stats->session_end_reason == XRSR_SESSION_END_REASON_ERROR_AUDIO_SILENT) { + ctrlm_voice_ipc_event_session_end_t end; + end.common = session->ipc_common_data; + end.result = SESSION_END_SILENT_UTTERANCE; + end.reason = (int)session->end_reason_rcu; + this->voice_ipc->session_end(end); } else { ctrlm_voice_ipc_event_session_end_server_stats_t server_stats; ctrlm_voice_ipc_event_session_end_t end; @@ -2931,7 +2957,16 @@ void ctrlm_voice_t::voice_stream_end_callback(ctrlm_voice_stream_end_cb_t *strea #ifdef TELEMETRY_SUPPORT if(this->prefs.telemetry_session_stats) { uint32_t packets_total = session->packets_lost + session->packets_processed; - session->telemetry_session_stats.update_on_stream_end(stream_duration, packets_total, session->packets_lost, packets_total * samples_per_packet, session->packets_lost * samples_per_packet, decoder_failures, 0); + int32_t
voice_detected = -1; + uint32_t peak_confidence = 0; + int32_t peak_rms_level = 0; + if(stats->audio_stats.vad_frames_processed > 0) { + voice_detected = (stats->audio_stats.vad_voice_detected) ? 1 : 0; + peak_confidence = (stats->audio_stats.vad_confidence_peak * 100); + peak_rms_level = stats->audio_stats.vad_rms_level_peak; + } + + session->telemetry_session_stats.update_on_stream_end(stream_duration, packets_total, session->packets_lost, packets_total * samples_per_packet, session->packets_lost * samples_per_packet, decoder_failures, 0, voice_detected, peak_confidence, peak_rms_level); } #endif } else if(samples_processed > 0) { @@ -2939,7 +2974,7 @@ void ctrlm_voice_t::voice_stream_end_callback(ctrlm_voice_stream_end_cb_t *strea XLOGD_INFO("src <%s> Samples Lost/Total <%u/%u> %.02f%% buffered max <%u> duration <%u> ms", ctrlm_voice_device_str(session->voice_device), samples_lost, samples_lost + samples_processed, 100.0 * ((double)samples_lost / (double)(samples_lost + samples_processed)), samples_buffered_max, stream_duration); #ifdef TELEMETRY_SUPPORT if(this->prefs.telemetry_session_stats) { - session->telemetry_session_stats.update_on_stream_end(stream_duration, 0, 0, samples_lost + samples_processed, samples_lost, decoder_failures, samples_buffered_max); + session->telemetry_session_stats.update_on_stream_end(stream_duration, 0, 0, samples_lost + samples_processed, samples_lost, decoder_failures, samples_buffered_max, -1, 0, 0); } #endif } @@ -4052,7 +4087,6 @@ void ctrlm_voice_t::voice_rfc_retrieved_handler(const ctrlm_rfc_attr_t& attr) { attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_STANDBY_TIMEOUT_SESSION, this->prefs.dst_params_standby.timeout_session) | attr.get_rfc_value(JSON_BOOL_NAME_VOICE_DST_PARAMS_STANDBY_IPV4_FALLBACK, this->prefs.dst_params_standby.ipv4_fallback) | attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_STANDBY_BACKOFF_DELAY, this->prefs.dst_params_standby.backoff_delay) | - 
attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_LOW_LATENCY_CONNECT_CHECK_INTERVAL, this->prefs.dst_params_low_latency.connect_check_interval) | attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_LOW_LATENCY_TIMEOUT_CONNECT, this->prefs.dst_params_low_latency.timeout_connect) | attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_LOW_LATENCY_TIMEOUT_INACTIVITY, this->prefs.dst_params_low_latency.timeout_inactivity) | @@ -4062,6 +4096,13 @@ void ctrlm_voice_t::voice_rfc_retrieved_handler(const ctrlm_rfc_attr_t& attr) { reroute = true; } + std::string voice_activity_detection_mode; + if(attr.get_rfc_value(JSON_STR_NAME_VOICE_VOICE_ACTIVITY_DETECTION_MODE, voice_activity_detection_mode)) { + this->prefs.voice_activity_detection_mode = this->voice_activity_detection_mode_to_xrsr(voice_activity_detection_mode); + XLOGD_INFO("voice activity detection mode <%s>", xrsr_stream_voice_activity_mode_str(this->prefs.voice_activity_detection_mode)); + reroute = true; + } + std::vector obj_server_hosts; if(attr.get_rfc_value(JSON_ARRAY_NAME_VOICE_SERVER_HOSTS, obj_server_hosts)) { this->url_hostname_patterns(obj_server_hosts); diff --git a/src/voice/ctrlm_voice_obj.h b/src/voice/ctrlm_voice_obj.h index 3ec217b2..897c24c9 100644 --- a/src/voice/ctrlm_voice_obj.h +++ b/src/voice/ctrlm_voice_obj.h @@ -273,41 +273,42 @@ typedef struct { // End Event Callback Structures typedef struct { - std::string server_url_src_ptt; - std::string server_url_src_ff; - std::string server_url_src_mic_tap; - std::vector server_hosts; - std::string aspect_ratio; - std::string guide_language; - std::string app_id_http; - std::string app_id_ws; - unsigned long timeout_vrex_connect; - unsigned long timeout_vrex_session; - unsigned long timeout_stats; - unsigned long timeout_packet_initial; - unsigned long timeout_packet_subsequent; - guchar bitrate_minimum; - guint16 time_threshold; - bool utterance_save; - bool utterance_use_curtail; - unsigned long utterance_file_qty_max; - unsigned long 
utterance_file_size_max; - std::string utterance_path; - unsigned long utterance_duration_min; - unsigned long ffv_leading_samples; - bool force_voice_settings; - bool vrex_test_flag; - bool vrex_wuw_bypass_success_flag; - bool vrex_wuw_bypass_failure_flag; - std::string opus_encoder_params_str; - uint8_t opus_encoder_params[CTRLM_RCU_RIB_ATTR_LEN_OPUS_ENCODING_PARAMS]; - bool force_toggle_fallback; - bool telemetry_session_stats; - xrsr_dst_params_t dst_params_standby; - xrsr_dst_params_t dst_params_low_latency; - bool par_voice_enabled; - uint8_t par_voice_eos_method; - uint16_t par_voice_eos_timeout; + std::string server_url_src_ptt; + std::string server_url_src_ff; + std::string server_url_src_mic_tap; + std::vector server_hosts; + std::string aspect_ratio; + std::string guide_language; + std::string app_id_http; + std::string app_id_ws; + unsigned long timeout_vrex_connect; + unsigned long timeout_vrex_session; + unsigned long timeout_stats; + unsigned long timeout_packet_initial; + unsigned long timeout_packet_subsequent; + guchar bitrate_minimum; + guint16 time_threshold; + bool utterance_save; + bool utterance_use_curtail; + unsigned long utterance_file_qty_max; + unsigned long utterance_file_size_max; + std::string utterance_path; + unsigned long utterance_duration_min; + unsigned long ffv_leading_samples; + xrsr_stream_voice_activity_mode_t voice_activity_detection_mode; + bool force_voice_settings; + bool vrex_test_flag; + bool vrex_wuw_bypass_success_flag; + bool vrex_wuw_bypass_failure_flag; + std::string opus_encoder_params_str; + uint8_t opus_encoder_params[CTRLM_RCU_RIB_ATTR_LEN_OPUS_ENCODING_PARAMS]; + bool force_toggle_fallback; + bool telemetry_session_stats; + xrsr_dst_params_t dst_params_standby; + xrsr_dst_params_t dst_params_low_latency; + bool par_voice_enabled; + uint8_t par_voice_eos_method; + uint16_t par_voice_eos_timeout; } voice_session_prefss_t; typedef struct { @@ -637,6 +638,8 @@ class ctrlm_voice_t { bool 
voice_params_opus_encoder_validate(std::string &opus_encoder_params_str); void voice_params_opus_samples_per_packet_set(void); bool voice_params_hex_str_to_bytes(std::string hex_string, guchar *data, guint32 length); + + xrsr_stream_voice_activity_mode_t voice_activity_detection_mode_to_xrsr(std::string mode); bool voice_session_can_request(ctrlm_voice_device_t device); void voice_session_set_active(ctrlm_voice_device_t device); diff --git a/src/voice/ctrlm_voice_obj_generic.cpp b/src/voice/ctrlm_voice_obj_generic.cpp index 4cb35f99..948d6039 100644 --- a/src/voice/ctrlm_voice_obj_generic.cpp +++ b/src/voice/ctrlm_voice_obj_generic.cpp @@ -172,12 +172,17 @@ void ctrlm_voice_generic_t::voice_sdk_update_routes() { xrsr_src_t src = (xrsr_src_t)j; ctrlm_voice_device_t src_device = xrsr_to_voice_device(src); std::string *url = NULL; + + xrsr_stream_voice_activity_mode_t stream_vad_mode = XRSR_STREAM_VOICE_ACTIVITY_MODE_DISABLED; sem_wait(&this->device_status_semaphore); if(this->device_status[src_device] != CTRLM_VOICE_DEVICE_STATUS_DISABLED && this->device_status[src_device] != CTRLM_VOICE_DEVICE_STATUS_NOT_SUPPORTED) { switch(src_device) { case CTRLM_VOICE_DEVICE_PTT: { url = &this->prefs.server_url_src_ptt; + + // For PTT source, use the configured VAD mode which may be disabled, enabled, or enforced + stream_vad_mode = this->prefs.voice_activity_detection_mode; break; } case CTRLM_VOICE_DEVICE_MICROPHONE: @@ -232,15 +237,16 @@ void ctrlm_voice_generic_t::voice_sdk_update_routes() { if(!this->obj_ws_nextgen->get_handlers(&handlers_xrsr)) { XLOGD_ERROR("failed to get handlers ws"); } else { - routes[i].src = src; - routes[i].dst_qty = 1; - routes[i].dsts[0].url = urls_translated[translated_index].c_str(); - routes[i].dsts[0].handlers = handlers_xrsr; - routes[i].dsts[0].formats = XRSR_AUDIO_FORMAT_PCM; - routes[i].dsts[0].stream_time_min = this->prefs.utterance_duration_min; - routes[i].dsts[0].stream_from = stream_from; - routes[i].dsts[0].stream_offset = 
stream_offset; - routes[i].dsts[0].stream_until = stream_until; + routes[i].src = src; + routes[i].dst_qty = 1; + routes[i].dsts[0].url = urls_translated[translated_index].c_str(); + routes[i].dsts[0].handlers = handlers_xrsr; + routes[i].dsts[0].formats = XRSR_AUDIO_FORMAT_PCM; + routes[i].dsts[0].stream_time_min = this->prefs.utterance_duration_min; + routes[i].dsts[0].stream_vad_mode = stream_vad_mode; + routes[i].dsts[0].stream_from = stream_from; + routes[i].dsts[0].stream_offset = stream_offset; + routes[i].dsts[0].stream_until = stream_until; if(networked_standby_supported && (src == XRSR_SRC_MICROPHONE)) { routes[i].dsts[0].params[XRSR_POWER_MODE_LOW] = &this->prefs.dst_params_standby; } @@ -264,15 +270,16 @@ void ctrlm_voice_generic_t::voice_sdk_update_routes() { if(!this->obj_ws_nsp->get_handlers(&handlers_xrsr)) { XLOGD_ERROR("failed to get handlers ws"); } else { - routes[i].src = src; - routes[i].dst_qty = 1; - routes[i].dsts[0].url = urls_translated[translated_index].c_str(); - routes[i].dsts[0].handlers = handlers_xrsr; - routes[i].dsts[0].formats = XRSR_AUDIO_FORMAT_PCM | XRSR_AUDIO_FORMAT_PCM_32_BIT | XRSR_AUDIO_FORMAT_PCM_32_BIT_MULTI | XRSR_AUDIO_FORMAT_PCM_RAW; - routes[i].dsts[0].stream_time_min = 0; - routes[i].dsts[0].stream_from = XRSR_STREAM_FROM_LIVE; - routes[i].dsts[0].stream_offset = 0; - routes[i].dsts[0].stream_until = XRSR_STREAM_UNTIL_END_OF_STREAM; + routes[i].src = src; + routes[i].dst_qty = 1; + routes[i].dsts[0].url = urls_translated[translated_index].c_str(); + routes[i].dsts[0].handlers = handlers_xrsr; + routes[i].dsts[0].formats = XRSR_AUDIO_FORMAT_PCM | XRSR_AUDIO_FORMAT_PCM_32_BIT | XRSR_AUDIO_FORMAT_PCM_32_BIT_MULTI | XRSR_AUDIO_FORMAT_PCM_RAW; + routes[i].dsts[0].stream_time_min = 0; + routes[i].dsts[0].stream_vad_mode = XRSR_STREAM_VOICE_ACTIVITY_MODE_DISABLED; + routes[i].dsts[0].stream_from = XRSR_STREAM_FROM_LIVE; + routes[i].dsts[0].stream_offset = 0; + routes[i].dsts[0].stream_until = 
XRSR_STREAM_UNTIL_END_OF_STREAM; if(networked_standby_supported && (src == XRSR_SRC_MICROPHONE)) { routes[i].dsts[0].params[XRSR_POWER_MODE_LOW] = &this->prefs.dst_params_standby; } @@ -305,19 +312,20 @@ void ctrlm_voice_generic_t::voice_sdk_update_routes() { url->append("speech?"); } - routes[i].src = src; - routes[i].dst_qty = 1; - routes[i].dsts[0].url = url->c_str(); - routes[i].dsts[0].handlers = handlers_xrsr; + routes[i].src = src; + routes[i].dst_qty = 1; + routes[i].dsts[0].url = url->c_str(); + routes[i].dsts[0].handlers = handlers_xrsr; #ifdef AUDIO_DECODE - routes[i].dsts[0].formats = XRSR_AUDIO_FORMAT_PCM; + routes[i].dsts[0].formats = XRSR_AUDIO_FORMAT_PCM; #else - routes[i].dsts[0].formats = XRSR_AUDIO_FORMAT_PCM | XRSR_AUDIO_FORMAT_ADPCM; + routes[i].dsts[0].formats = XRSR_AUDIO_FORMAT_PCM | XRSR_AUDIO_FORMAT_ADPCM; #endif - routes[i].dsts[0].stream_time_min = this->prefs.utterance_duration_min; - routes[i].dsts[0].stream_from = stream_from; - routes[i].dsts[0].stream_offset = stream_offset; - routes[i].dsts[0].stream_until = stream_until; + routes[i].dsts[0].stream_time_min = this->prefs.utterance_duration_min; + routes[i].dsts[0].stream_vad_mode = stream_vad_mode; + routes[i].dsts[0].stream_from = stream_from; + routes[i].dsts[0].stream_offset = stream_offset; + routes[i].dsts[0].stream_until = stream_until; if(networked_standby_supported && (src == XRSR_SRC_MICROPHONE)) { routes[i].dsts[0].params[XRSR_POWER_MODE_LOW] = &this->prefs.dst_params_standby; } @@ -334,15 +342,16 @@ void ctrlm_voice_generic_t::voice_sdk_update_routes() { XLOGD_ERROR("failed to get handlers ws"); } else { - routes[i].src = src; - routes[i].dst_qty = 1; - routes[i].dsts[0].url = url->c_str(); - routes[i].dsts[0].handlers = handlers_xrsr; - routes[i].dsts[0].formats = XRSR_AUDIO_FORMAT_PCM; - routes[i].dsts[0].stream_time_min = this->prefs.utterance_duration_min; - routes[i].dsts[0].stream_from = stream_from; - routes[i].dsts[0].stream_offset = stream_offset; - 
routes[i].dsts[0].stream_until = stream_until; + routes[i].src = src; + routes[i].dst_qty = 1; + routes[i].dsts[0].url = url->c_str(); + routes[i].dsts[0].handlers = handlers_xrsr; + routes[i].dsts[0].formats = XRSR_AUDIO_FORMAT_PCM; + routes[i].dsts[0].stream_time_min = this->prefs.utterance_duration_min; + routes[i].dsts[0].stream_vad_mode = stream_vad_mode; + routes[i].dsts[0].stream_from = stream_from; + routes[i].dsts[0].stream_offset = stream_offset; + routes[i].dsts[0].stream_until = stream_until; if(networked_standby_supported && (src == XRSR_SRC_MICROPHONE)) { routes[i].dsts[0].params[XRSR_POWER_MODE_LOW] = &this->prefs.dst_params_standby; } @@ -357,15 +366,16 @@ void ctrlm_voice_generic_t::voice_sdk_update_routes() { XLOGD_ERROR("failed to get handlers ws"); } else { - routes[i].src = src; - routes[i].dst_qty = 1; - routes[i].dsts[0].url = urls_translated[translated_index].c_str(); - routes[i].dsts[0].handlers = handlers_xrsr; - routes[i].dsts[0].formats = XRSR_AUDIO_FORMAT_PCM; - routes[i].dsts[0].stream_time_min = this->prefs.utterance_duration_min; - routes[i].dsts[0].stream_from = stream_from; - routes[i].dsts[0].stream_offset = stream_offset; - routes[i].dsts[0].stream_until = stream_until; + routes[i].src = src; + routes[i].dst_qty = 1; + routes[i].dsts[0].url = urls_translated[translated_index].c_str(); + routes[i].dsts[0].handlers = handlers_xrsr; + routes[i].dsts[0].formats = XRSR_AUDIO_FORMAT_PCM; + routes[i].dsts[0].stream_time_min = this->prefs.utterance_duration_min; + routes[i].dsts[0].stream_vad_mode = stream_vad_mode; + routes[i].dsts[0].stream_from = stream_from; + routes[i].dsts[0].stream_offset = stream_offset; + routes[i].dsts[0].stream_until = stream_until; if(networked_standby_supported && (src == XRSR_SRC_MICROPHONE)) { routes[i].dsts[0].params[XRSR_POWER_MODE_LOW] = &this->prefs.dst_params_standby; } diff --git a/src/voice/ipc/ctrlm_voice_ipc.h b/src/voice/ipc/ctrlm_voice_ipc.h index 969a8380..db8dc582 100644 --- 
a/src/voice/ipc/ctrlm_voice_ipc.h +++ b/src/voice/ipc/ctrlm_voice_ipc.h @@ -92,7 +92,8 @@ typedef enum { SESSION_END_SUCCESS, SESSION_END_FAILURE, SESSION_END_ABORT, - SESSION_END_SHORT_UTTERANCE + SESSION_END_SHORT_UTTERANCE, + SESSION_END_SILENT_UTTERANCE } ctrlm_voice_ipc_event_session_end_result_t; typedef struct { diff --git a/src/voice/ipc/ctrlm_voice_ipc_iarm_legacy.cpp b/src/voice/ipc/ctrlm_voice_ipc_iarm_legacy.cpp index 6564ef4c..bdcc85d8 100644 --- a/src/voice/ipc/ctrlm_voice_ipc_iarm_legacy.cpp +++ b/src/voice/ipc/ctrlm_voice_ipc_iarm_legacy.cpp @@ -158,6 +158,18 @@ bool ctrlm_voice_ipc_iarm_legacy_t::session_end(const ctrlm_voice_ipc_event_sess ret = broadcast_iarm_event_legacy(CTRLM_MAIN_IARM_BUS_NAME, CTRLM_VOICE_IARM_EVENT_SESSION_SHORT, &event, sizeof(event)); break; } + case SESSION_END_SILENT_UTTERANCE: { + ctrlm_voice_iarm_event_session_silent_t event = {0}; + event.api_revision = CTRLM_VOICE_IARM_BUS_API_REVISION; + event.network_id = session_end.common.network_id; + event.network_type = session_end.common.network_type; + event.controller_id = session_end.common.controller_id; + event.session_id = session_end.common.session_id_ctrlm; + event.reason = (ctrlm_voice_session_end_reason_t)session_end.reason; + event.return_code_internal = session_end.return_code_internal; + ret = broadcast_iarm_event_legacy(CTRLM_MAIN_IARM_BUS_NAME, CTRLM_VOICE_IARM_EVENT_SESSION_SILENT, &event, sizeof(event)); + break; + } } } // Reset state diff --git a/src/voice/ipc/ctrlm_voice_ipc_iarm_thunder.cpp b/src/voice/ipc/ctrlm_voice_ipc_iarm_thunder.cpp index 6b407173..b148dda5 100644 --- a/src/voice/ipc/ctrlm_voice_ipc_iarm_thunder.cpp +++ b/src/voice/ipc/ctrlm_voice_ipc_iarm_thunder.cpp @@ -62,6 +62,7 @@ #define JSON_SESSION_END_RESULT_ERROR "error" #define JSON_SESSION_END_RESULT_ABORT "abort" #define JSON_SESSION_END_RESULT_SHORT "shortUtterance" +#define JSON_SESSION_END_RESULT_SILENT "silentUtterance" #define JSON_SESSION_END_TRANSCRIPTION "transcription" #define 
JSON_SESSION_END_PROTOCOL_ERROR "protocolErrorCode" #define JSON_SESSION_END_PROTOCOL_LIBRARY_ERROR "protocolLibraryErrorCode" @@ -71,6 +72,7 @@ #define JSON_SESSION_END_ERROR_REASON "reason" #define JSON_SESSION_END_ABORT_REASON "reason" #define JSON_SESSION_END_SHORT_REASON "reason" +#define JSON_SESSION_END_SILENT_REASON "reason" #define JSON_SESSION_END_STB_STATS "stbStats" #define JSON_SESSION_END_STB_STATS_TYPE "type" #define JSON_SESSION_END_STB_STATS_FIRMWARE "firmware" @@ -291,6 +293,20 @@ bool ctrlm_voice_ipc_iarm_thunder_t::session_end(const ctrlm_voice_ipc_event_ses } break; } + case SESSION_END_SILENT_UTTERANCE: { + int rc_silent; + rc |= json_object_set_new_nocheck(event_data, JSON_SESSION_END_RESULT, json_string(JSON_SESSION_END_RESULT_SILENT)); + + // Add Audio Silent Data to result object + rc_silent = json_object_set_new_nocheck(event_result, JSON_SESSION_END_SILENT_REASON, json_integer(session_end.reason)); + if(0 != rc_silent) { + XLOGD_ERROR("Error creating audio silent JSON subobject"); + JSON_DEREFERENCE(event_result); + } else { + rc |= json_object_set_new_nocheck(event_data, JSON_SESSION_END_RESULT_SILENT, event_result); + } + break; + } } if(session_end.stb_stats) { int stats_rc; diff --git a/src/voice/telemetry/ctrlm_voice_telemetry_events.cpp b/src/voice/telemetry/ctrlm_voice_telemetry_events.cpp index d5f6bba3..041d420a 100644 --- a/src/voice/telemetry/ctrlm_voice_telemetry_events.cpp +++ b/src/voice/telemetry/ctrlm_voice_telemetry_events.cpp @@ -212,7 +212,10 @@ bool ctrlm_voice_telemetry_session_t::event() { ss << "\"" << m_server_message << "\","; ss << m_result << ","; ss << m_end_reason_stream << ","; - ss << m_ret_code_protocol << "]]"; + ss << m_ret_code_protocol << ","; + ss << m_voice_detected << ","; + ss << m_peak_confidence << ","; + ss << m_peak_rms_level << "]]"; if(m_event_list.length() + ss.str().length() > m_event_list_max_size) { // Maximum data size exceeded XLOGD_WARN("telemetry event exceeds max size <%s,%s>", 
val_marker.c_str(), ss.str().c_str()); @@ -267,7 +270,7 @@ void ctrlm_voice_telemetry_session_t::update_on_key_release(int32_t time_start_l m_has_key_release = true; } -void ctrlm_voice_telemetry_session_t::update_on_stream_end(uint32_t time_stream_len_act, uint32_t packets_total, uint32_t packets_lost, uint32_t samples_total, uint32_t samples_lost, uint32_t decoder_failures, uint32_t samples_buffered_max) { +void ctrlm_voice_telemetry_session_t::update_on_stream_end(uint32_t time_stream_len_act, uint32_t packets_total, uint32_t packets_lost, uint32_t samples_total, uint32_t samples_lost, uint32_t decoder_failures, uint32_t samples_buffered_max, int32_t voice_detected, uint32_t peak_confidence, int32_t peak_rms_level) { m_time_stream_len_act = time_stream_len_act; m_packets_total = packets_total; m_packets_lost = packets_lost; @@ -276,6 +279,12 @@ void ctrlm_voice_telemetry_session_t::update_on_stream_end(uint32_t time_stream_ m_decoder_failures = decoder_failures; m_samples_buffered_max = samples_buffered_max; + if(voice_detected >= 0) { + m_voice_detected = voice_detected; + m_peak_confidence = peak_confidence; + m_peak_rms_level = peak_rms_level; + } + if(m_has_key_release) { m_time_stream_delta = m_time_stream_len_act - m_time_stream_len_exp; } @@ -320,6 +329,10 @@ void ctrlm_voice_telemetry_session_t::reset_stats() { m_result = false; m_ret_code_protocol = 0; + m_voice_detected = -1; + m_peak_confidence = 0; + m_peak_rms_level = 0; + m_server_message.clear(); m_device_type.clear(); m_device_version.clear(); diff --git a/src/voice/telemetry/ctrlm_voice_telemetry_events.h b/src/voice/telemetry/ctrlm_voice_telemetry_events.h index adadfde6..f0b6ccd9 100644 --- a/src/voice/telemetry/ctrlm_voice_telemetry_events.h +++ b/src/voice/telemetry/ctrlm_voice_telemetry_events.h @@ -75,7 +75,7 @@ class ctrlm_voice_telemetry_session_t : public ctrlm_telemetry_event_t