rdkcentral · dwolaver · Mar 23, 2026 · Mar 24, 2026 · Mar 24, 2026 · Mar 26, 2026
@@ -302,7 +302,8 @@ typedef enum {
    CTRLM_RCU_IARM_EVENT_RF4CE_PAIRING_WINDOW_TIMEOUT = 35, ///< Indicates that a battery milestone event occured
    CTRLM_RCU_IARM_EVENT_FIRMWARE_UPDATE_PROGRESS    = 36, ///< Generated when an milestone is reached for remote firmware upgrade 
    CTRLM_RCU_IARM_EVENT_VALIDATION_STATUS           = 37, ///< Generated when the validation status changes
-   CTRLM_MAIN_IARM_EVENT_MAX                        = 38  ///< Placeholder for the last event (used in registration)
+   CTRLM_VOICE_IARM_EVENT_SESSION_SILENT            = 38, ///< Voice session was silent (no speech detected)
+   CTRLM_MAIN_IARM_EVENT_MAX                        = 39  ///< Placeholder for the last event (used in registration)
 } ctrlm_main_iarm_event_t;
 
 /// @brief Remote Control Key Status

@@ -281,6 +281,16 @@ typedef struct {
    long                             return_code_internal; ///< Internally generated return code
 } ctrlm_voice_iarm_event_session_short_t;
 
+typedef struct {
+   unsigned char                    api_revision;         ///< The revision of this API
+   ctrlm_network_id_t               network_id;           ///< Identifier of the network on which the controller is bound
+   ctrlm_network_type_t             network_type;         ///< Type of the network on which the controller is bound
+   ctrlm_controller_id_t            controller_id;        ///< A unique identifier of the remote
+   unsigned long                    session_id;           ///< A unique id for the voice session
+   ctrlm_voice_session_end_reason_t reason;               ///< The reason that the voice session was silent
+   long                             return_code_internal; ///< Internally generated return code
+} ctrlm_voice_iarm_event_session_silent_t; ///< Payload of the CTRLM_VOICE_IARM_EVENT_SESSION_SILENT event (generated when a silent voice session is detected)
+
 typedef struct {
    unsigned char api_revision;            ///< The revision of this API
    char          media_service_url[2083]; ///< The url for the media service (null terminated string)
@@ -376,6 +386,7 @@ typedef struct {
 /// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_SESSION_STATS  | ctrlm_voice_iarm_event_session_stats_t *  | Generated when the statistics of the voice session are available |
 /// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_SESSION_ABORT  | ctrlm_voice_iarm_event_session_abort_t *  | Generated when a voice session is aborted (denied)               |
 /// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_SESSION_SHORT  | ctrlm_voice_iarm_event_session_short_t *  | Generated when a short voice session is detected                 |
+/// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_SESSION_SILENT | ctrlm_voice_iarm_event_session_silent_t * | Generated when a silent voice session is detected                |
 /// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_MEDIA_SERVICE  | ctrlm_voice_iarm_event_media_service_t *  | Generated when a media service response is received              |
 ///
 /// IARM events are available on a subscription basis. In order to receive an event, a client must explicitly register to receive the event by calling

@@ -291,7 +291,9 @@
       "par_voice_eos_method"             :         1,
       "par_voice_eos_timeout"            :      2500,
       "server_hosts"                     :        [],
-      "telemetry_session_stats"          :     false
+      "telemetry_session_stats"          :     false,
+      "voice_activity_detection_mode"    :  "enabled"
+
    },
    "device_update" : {
       "dir_root"                         : "/srv/device_update/",

@@ -93,7 +93,7 @@
 // The Voice Session Statistics Marker reports statistics for the voice session(s). The format of the marker is a json array of arrays with each event in the format below:
 //
 // [[event1], [event2], [event3], ...]
-// [<version>,<device_type>,<device_version>,<encoding>,<interaction_mode>,<time_prev_session>,<time_start_lag>,<time_stream_len_exp>,<time_stream_len_act>,<time_stream_delta>,<packets_total>,<packets_lost>,<samples_total>,<samples_lost>,<decoder_failures>,<samples_buffered_max>,<stream_ret_code>,<protocol_ret_code>,<server_ret_code>,<server_message>,<result>]
+// [<version>,<device_type>,<device_version>,<encoding>,<interaction_mode>,<time_prev_session>,<time_start_lag>,<time_stream_len_exp>,<time_stream_len_act>,<time_stream_delta>,<packets_total>,<packets_lost>,<samples_total>,<samples_lost>,<decoder_failures>,<samples_buffered_max>,<stream_ret_code>,<protocol_ret_code>,<server_ret_code>,<server_message>,<result>,<end_reason_stream>,<ret_code_protocol>,<voice_detected>,<peak_confidence>,<peak_rms_level>]
 //
 // <version>              - Version of the marker format.
 // <device_type>          - Name of the device that started the session.
@@ -116,8 +116,14 @@
 // <server_ret_code>      - server success/error code.
 // <server_message>       - server message.
 // <result>               - flag to indicate if session was successful.
+// <end_reason_stream>    - reason why the stream ended (if available).
+// <ret_code_protocol>    - protocol return code.
+// <voice_detected>       - flag to indicate if voice was detected.
+// <peak_confidence>      - peak confidence level.
+// <peak_rms_level>       - peak RMS level.
+
 #define MARKER_VOICE_SESSION_STATS                 "ctrlm.voice.session.stats"
-#define MARKER_VOICE_SESSION_STATS_VERSION         "2"
+#define MARKER_VOICE_SESSION_STATS_VERSION         "3"
 
 // End Voice Session Statistics
 

@@ -186,39 +186,40 @@ ctrlm_voice_t::ctrlm_voice_t() {
     #ifdef JSON_ARRAY_VAL_STR_VOICE_SERVER_HOSTS_3
     this->url_hostname_pattern_add(JSON_ARRAY_VAL_STR_VOICE_SERVER_HOSTS_3);
     #endif
-    this->prefs.aspect_ratio                 = JSON_STR_VALUE_VOICE_ASPECT_RATIO;
-    this->prefs.guide_language               = JSON_STR_VALUE_VOICE_LANGUAGE;
-    this->prefs.app_id_http                  = JSON_STR_VALUE_VOICE_APP_ID_HTTP;
-    this->prefs.app_id_ws                    = JSON_STR_VALUE_VOICE_APP_ID_WS;
-    this->prefs.timeout_vrex_connect         = JSON_INT_VALUE_VOICE_VREX_REQUEST_TIMEOUT;
-    this->prefs.timeout_vrex_session         = JSON_INT_VALUE_VOICE_VREX_RESPONSE_TIMEOUT;
-    this->prefs.timeout_stats                = JSON_INT_VALUE_VOICE_TIMEOUT_STATS;
-    this->prefs.timeout_packet_initial       = JSON_INT_VALUE_VOICE_TIMEOUT_PACKET_INITIAL;
-    this->prefs.timeout_packet_subsequent    = JSON_INT_VALUE_VOICE_TIMEOUT_PACKET_SUBSEQUENT;
-    this->prefs.bitrate_minimum              = JSON_INT_VALUE_VOICE_BITRATE_MINIMUM;
-    this->prefs.time_threshold               = JSON_INT_VALUE_VOICE_TIME_THRESHOLD;
-    this->prefs.utterance_save               = ctrlm_is_production_build() ? JSON_ARRAY_VAL_BOOL_VOICE_SAVE_LAST_UTTERANCE_0 : JSON_ARRAY_VAL_BOOL_VOICE_SAVE_LAST_UTTERANCE_1;
-    this->prefs.utterance_use_curtail        = JSON_BOOL_VALUE_VOICE_UTTERANCE_USE_CURTAIL;
-    this->prefs.utterance_file_qty_max       = JSON_INT_VALUE_VOICE_UTTERANCE_FILE_QTY_MAX;
-    this->prefs.utterance_file_size_max      = JSON_INT_VALUE_VOICE_UTTERANCE_FILE_SIZE_MAX;
-    this->prefs.utterance_path               = JSON_STR_VALUE_VOICE_UTTERANCE_PATH;
-    this->prefs.utterance_duration_min       = JSON_INT_VALUE_VOICE_MINIMUM_DURATION;
-    this->prefs.ffv_leading_samples          = JSON_INT_VALUE_VOICE_FFV_LEADING_SAMPLES;
-    this->prefs.force_voice_settings         = JSON_BOOL_VALUE_VOICE_FORCE_VOICE_SETTINGS;
-    this->prefs.vrex_test_flag               = JSON_BOOL_VALUE_VOICE_VREX_TEST_FLAG;
-    this->prefs.vrex_wuw_bypass_success_flag = JSON_BOOL_VALUE_VOICE_VREX_WUW_BYPASS_SUCCESS_FLAG;
-    this->prefs.vrex_wuw_bypass_failure_flag = JSON_BOOL_VALUE_VOICE_VREX_WUW_BYPASS_FAILURE_FLAG;
-    this->prefs.force_toggle_fallback        = JSON_BOOL_VALUE_VOICE_FORCE_TOGGLE_FALLBACK;
-    this->prefs.telemetry_session_stats      = JSON_BOOL_VALUE_VOICE_TELEMETRY_SESSION_STATS;
-    this->prefs.par_voice_enabled            = false;
-    this->prefs.par_voice_eos_method         = JSON_INT_VALUE_VOICE_PAR_VOICE_EOS_METHOD;
-    this->prefs.par_voice_eos_timeout        = JSON_INT_VALUE_VOICE_PAR_VOICE_EOS_TIMEOUT;
+    this->prefs.aspect_ratio                     = JSON_STR_VALUE_VOICE_ASPECT_RATIO;
+    this->prefs.guide_language                   = JSON_STR_VALUE_VOICE_LANGUAGE;
+    this->prefs.app_id_http                      = JSON_STR_VALUE_VOICE_APP_ID_HTTP;
+    this->prefs.app_id_ws                        = JSON_STR_VALUE_VOICE_APP_ID_WS;
+    this->prefs.timeout_vrex_connect             = JSON_INT_VALUE_VOICE_VREX_REQUEST_TIMEOUT;
+    this->prefs.timeout_vrex_session             = JSON_INT_VALUE_VOICE_VREX_RESPONSE_TIMEOUT;
+    this->prefs.timeout_stats                    = JSON_INT_VALUE_VOICE_TIMEOUT_STATS;
+    this->prefs.timeout_packet_initial           = JSON_INT_VALUE_VOICE_TIMEOUT_PACKET_INITIAL;
+    this->prefs.timeout_packet_subsequent        = JSON_INT_VALUE_VOICE_TIMEOUT_PACKET_SUBSEQUENT;
+    this->prefs.bitrate_minimum                  = JSON_INT_VALUE_VOICE_BITRATE_MINIMUM;
+    this->prefs.time_threshold                   = JSON_INT_VALUE_VOICE_TIME_THRESHOLD;
+    this->prefs.utterance_save                   = ctrlm_is_production_build() ? JSON_ARRAY_VAL_BOOL_VOICE_SAVE_LAST_UTTERANCE_0 : JSON_ARRAY_VAL_BOOL_VOICE_SAVE_LAST_UTTERANCE_1;
+    this->prefs.utterance_use_curtail            = JSON_BOOL_VALUE_VOICE_UTTERANCE_USE_CURTAIL;
+    this->prefs.utterance_file_qty_max           = JSON_INT_VALUE_VOICE_UTTERANCE_FILE_QTY_MAX;
+    this->prefs.utterance_file_size_max          = JSON_INT_VALUE_VOICE_UTTERANCE_FILE_SIZE_MAX;
+    this->prefs.utterance_path                   = JSON_STR_VALUE_VOICE_UTTERANCE_PATH;
+    this->prefs.utterance_duration_min           = JSON_INT_VALUE_VOICE_MINIMUM_DURATION;
+    this->prefs.ffv_leading_samples              = JSON_INT_VALUE_VOICE_FFV_LEADING_SAMPLES;
+    this->prefs.voice_activity_detection_mode    = this->voice_activity_detection_mode_to_xrsr(JSON_STR_VALUE_VOICE_VOICE_ACTIVITY_DETECTION_MODE);
+    this->prefs.force_voice_settings             = JSON_BOOL_VALUE_VOICE_FORCE_VOICE_SETTINGS;
+    this->prefs.vrex_test_flag                   = JSON_BOOL_VALUE_VOICE_VREX_TEST_FLAG;
+    this->prefs.vrex_wuw_bypass_success_flag     = JSON_BOOL_VALUE_VOICE_VREX_WUW_BYPASS_SUCCESS_FLAG;
+    this->prefs.vrex_wuw_bypass_failure_flag     = JSON_BOOL_VALUE_VOICE_VREX_WUW_BYPASS_FAILURE_FLAG;
+    this->prefs.force_toggle_fallback            = JSON_BOOL_VALUE_VOICE_FORCE_TOGGLE_FALLBACK;
+    this->prefs.telemetry_session_stats          = JSON_BOOL_VALUE_VOICE_TELEMETRY_SESSION_STATS;
+    this->prefs.par_voice_enabled                = false;
+    this->prefs.par_voice_eos_method             = JSON_INT_VALUE_VOICE_PAR_VOICE_EOS_METHOD;
+    this->prefs.par_voice_eos_timeout            = JSON_INT_VALUE_VOICE_PAR_VOICE_EOS_TIMEOUT;
     this->voice_params_opus_encoder_default();
-    this->xrsr_opened                        = false;
-    this->voice_ipc                          = NULL;
-    this->packet_loss_threshold              = JSON_INT_VALUE_VOICE_PACKET_LOSS_THRESHOLD;
-    this->vsdk_config                        = NULL;
-    this->nsm_voice_session                  = false;
+    this->xrsr_opened                            = false;
+    this->voice_ipc                              = NULL;
+    this->packet_loss_threshold                  = JSON_INT_VALUE_VOICE_PACKET_LOSS_THRESHOLD;
+    this->vsdk_config                            = NULL;
+    this->nsm_voice_session                      = false;
 
     #ifndef TELEMETRY_SUPPORT
     XLOGD_WARN("telemetry is not enabled");
@@ -261,6 +262,7 @@ ctrlm_voice_t::ctrlm_voice_t() {
     this->secure_url_required       = JSON_BOOL_VALUE_VOICE_REQUIRE_SECURE_URL;
 
     XLOGD_TELEMETRY("require i_SAT <%s> i_MTLS <%s> i_secure_url <%s>", this->sat_token_required ? "YES" : "NO", this->mtls_required ? "YES" : "NO", this->secure_url_required ? "YES" : "NO");
+    XLOGD_INFO("voice activity detection mode <%s>", xrsr_stream_voice_activity_mode_str(this->prefs.voice_activity_detection_mode));
 
     errno_t safec_rc = memset_s(this->sat_token, sizeof(this->sat_token), 0, sizeof(this->sat_token));
     ERR_CHK(safec_rc);
@@ -425,6 +427,13 @@ bool ctrlm_voice_t::voice_configure_config_file_json(json_t *obj_voice, json_t *
             conf.config_value_get(JSON_STR_NAME_VOICE_URL_SRC_MIC_TAP,              this->prefs.server_url_src_mic_tap);
             conf.config_value_get(JSON_STR_NAME_VOICE_LANGUAGE,                     this->prefs.guide_language);
             conf.config_value_get(JSON_INT_NAME_VOICE_MINIMUM_DURATION,             this->prefs.utterance_duration_min);
+
+            std::string voice_activity_detection_mode;
+            if(conf.config_value_get(JSON_STR_NAME_VOICE_VOICE_ACTIVITY_DETECTION_MODE, voice_activity_detection_mode)) {
+                this->prefs.voice_activity_detection_mode = this->voice_activity_detection_mode_to_xrsr(voice_activity_detection_mode);
+                XLOGD_INFO("voice activity detection mode <%s>", xrsr_stream_voice_activity_mode_str(this->prefs.voice_activity_detection_mode));
+            }
+
             if(conf.config_value_get(JSON_BOOL_NAME_VOICE_ENABLE_SAT,                  this->sat_token_required)) {
                 ctrlm_sm_voice_sat_enable_write(this->sat_token_required);
                 XLOGD_TELEMETRY("require c_SAT <%s>", this->sat_token_required ? "YES" : "NO");
@@ -1089,6 +1098,17 @@ void ctrlm_voice_t::voice_params_opus_encoder_default(void) {
    this->voice_params_opus_samples_per_packet_set();
 }
 
+xrsr_stream_voice_activity_mode_t ctrlm_voice_t::voice_activity_detection_mode_to_xrsr(std::string mode) {
+   // Map the mode string to its xrsr enum value; any string other than "enabled" or "enforced" (including an invalid one) disables detection
+   xrsr_stream_voice_activity_mode_t xrsr_mode = XRSR_STREAM_VOICE_ACTIVITY_MODE_DISABLED;
+   if(mode == "enforced") { // Voice session will only proceed if voice activity is detected
+      xrsr_mode = XRSR_STREAM_VOICE_ACTIVITY_MODE_ENFORCED;
+   } else if(mode == "enabled") { // Voice activity detection will be used but not enforced
+      xrsr_mode = XRSR_STREAM_VOICE_ACTIVITY_MODE_ENABLED;
+   }
+   return(xrsr_mode);
+}
+
 void ctrlm_voice_t::voice_params_opus_samples_per_packet_set(void) {
    guchar fr_dur = (this->prefs.opus_encoder_params[3] >> 4) & 0xF;
    switch(fr_dur) {
@@ -2645,6 +2665,12 @@ void ctrlm_voice_t::voice_session_end_callback(ctrlm_voice_session_end_cb_t *ses
             end.result = SESSION_END_SHORT_UTTERANCE;
             end.reason = (int)session->end_reason_rcu;
             this->voice_ipc->session_end(end);
+        } else if(stats->session_end_reason == XRSR_SESSION_END_REASON_ERROR_AUDIO_SILENT) {
+            ctrlm_voice_ipc_event_session_end_t end;
+            end.common = session->ipc_common_data;
+            end.result = SESSION_END_SILENT_UTTERANCE;
+            end.reason = (int)session->end_reason_rcu;
+            this->voice_ipc->session_end(end);
         } else {
             ctrlm_voice_ipc_event_session_end_server_stats_t server_stats;
             ctrlm_voice_ipc_event_session_end_t end;
@@ -2931,15 +2957,24 @@ void ctrlm_voice_t::voice_stream_end_callback(ctrlm_voice_stream_end_cb_t *strea
             #ifdef TELEMETRY_SUPPORT
             if(this->prefs.telemetry_session_stats) {
                 uint32_t packets_total = session->packets_lost + session->packets_processed;
-                session->telemetry_session_stats.update_on_stream_end(stream_duration, packets_total, session->packets_lost, packets_total * samples_per_packet, session->packets_lost * samples_per_packet, decoder_failures, 0);
+                int32_t  voice_detected  = -1;
+                uint32_t peak_confidence = 0;
+                int32_t  peak_rms_level  = 0;
+                if(stats->audio_stats.vad_frames_processed > 0) {
+                    voice_detected  = (stats->audio_stats.vad_voice_detected) ? 1 : 0;
+                    peak_confidence = (stats->audio_stats.vad_confidence_peak * 100);
+                    peak_rms_level  = stats->audio_stats.vad_rms_level_peak;
+                }
+
+                session->telemetry_session_stats.update_on_stream_end(stream_duration, packets_total, session->packets_lost, packets_total * samples_per_packet, session->packets_lost * samples_per_packet, decoder_failures, 0, voice_detected, peak_confidence, peak_rms_level);
             }
             #endif
         } else if(samples_processed > 0) {
             uint32_t stream_duration = (samples_processed / 16); // 16 kHz samples to ms
             XLOGD_INFO("src <%s> Samples Lost/Total <%u/%u> %.02f%% buffered max <%u> duration <%u> ms", ctrlm_voice_device_str(session->voice_device), samples_lost, samples_lost + samples_processed, 100.0 * ((double)samples_lost / (double)(samples_lost + samples_processed)), samples_buffered_max, stream_duration);
             #ifdef TELEMETRY_SUPPORT
             if(this->prefs.telemetry_session_stats) {
-                session->telemetry_session_stats.update_on_stream_end(stream_duration, 0, 0, samples_lost + samples_processed, samples_lost, decoder_failures, samples_buffered_max);
+                session->telemetry_session_stats.update_on_stream_end(stream_duration, 0, 0, samples_lost + samples_processed, samples_lost, decoder_failures, samples_buffered_max, -1, 0, 0);
             }
             #endif
         }
@@ -4052,7 +4087,6 @@ void ctrlm_voice_t::voice_rfc_retrieved_handler(const ctrlm_rfc_attr_t& attr) {
        attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_STANDBY_TIMEOUT_SESSION,            this->prefs.dst_params_standby.timeout_session) |
        attr.get_rfc_value(JSON_BOOL_NAME_VOICE_DST_PARAMS_STANDBY_IPV4_FALLBACK,             this->prefs.dst_params_standby.ipv4_fallback) |
        attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_STANDBY_BACKOFF_DELAY,              this->prefs.dst_params_standby.backoff_delay) |
-
        attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_LOW_LATENCY_CONNECT_CHECK_INTERVAL, this->prefs.dst_params_low_latency.connect_check_interval) |
        attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_LOW_LATENCY_TIMEOUT_CONNECT,        this->prefs.dst_params_low_latency.timeout_connect) |
        attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_LOW_LATENCY_TIMEOUT_INACTIVITY,     this->prefs.dst_params_low_latency.timeout_inactivity) |
@@ -4062,6 +4096,13 @@ void ctrlm_voice_t::voice_rfc_retrieved_handler(const ctrlm_rfc_attr_t& attr) {
         reroute = true;
     }
 
+    std::string voice_activity_detection_mode;
+    if(attr.get_rfc_value(JSON_STR_NAME_VOICE_VOICE_ACTIVITY_DETECTION_MODE, voice_activity_detection_mode)) {
+       this->prefs.voice_activity_detection_mode = this->voice_activity_detection_mode_to_xrsr(voice_activity_detection_mode);
+       XLOGD_INFO("voice activity detection mode <%s>", xrsr_stream_voice_activity_mode_str(this->prefs.voice_activity_detection_mode));
+       reroute = true;
+    }
+
     std::vector<std::string> obj_server_hosts;
     if(attr.get_rfc_value(JSON_ARRAY_NAME_VOICE_SERVER_HOSTS, obj_server_hosts)) {
         this->url_hostname_patterns(obj_server_hosts);