Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion include/ctrlm_ipc.h
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,8 @@ typedef enum {
CTRLM_RCU_IARM_EVENT_RF4CE_PAIRING_WINDOW_TIMEOUT = 35, ///< Indicates that a battery milestone event occurred
CTRLM_RCU_IARM_EVENT_FIRMWARE_UPDATE_PROGRESS = 36, ///< Generated when a milestone is reached for remote firmware upgrade
CTRLM_RCU_IARM_EVENT_VALIDATION_STATUS = 37, ///< Generated when the validation status changes
CTRLM_MAIN_IARM_EVENT_MAX = 38 ///< Placeholder for the last event (used in registration)
CTRLM_VOICE_IARM_EVENT_SESSION_SILENT = 38, ///< Voice session was silent (no speech detected)
CTRLM_MAIN_IARM_EVENT_MAX = 39 ///< Placeholder for the last event (used in registration)
} ctrlm_main_iarm_event_t;

/// @brief Remote Control Key Status
Expand Down
11 changes: 11 additions & 0 deletions include/ctrlm_ipc_voice.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,16 @@ typedef struct {
long return_code_internal; ///< Internally generated return code
} ctrlm_voice_iarm_event_session_short_t;

/// @brief Payload of the CTRLM_VOICE_IARM_EVENT_SESSION_SILENT event
/// @details Delivered to subscribers when a silent voice session is detected (no speech detected).
/// The members are declared in the same order as those of ctrlm_voice_iarm_event_session_short_t's visible tail (reason, return_code_internal).
typedef struct {
unsigned char api_revision; ///< The revision of this API.
ctrlm_network_id_t network_id; ///< Identifier of network on which the controller is bound
ctrlm_network_type_t network_type; ///< Type of network on which the controller is bound
ctrlm_controller_id_t controller_id; ///< A unique identifier of the remote
unsigned long session_id; ///< A unique id for the voice session.
ctrlm_voice_session_end_reason_t reason; ///< The reason that the voice session was silent
long return_code_internal; ///< Internally generated return code
} ctrlm_voice_iarm_event_session_silent_t;

typedef struct {
unsigned char api_revision; ///< The revision of this API
char media_service_url[2083]; ///< The url for the media service (null terminated string)
Expand Down Expand Up @@ -376,6 +386,7 @@ typedef struct {
/// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_SESSION_STATS | ctrlm_voice_iarm_event_session_stats_t * | Generated when the statistics of the voice session are available |
/// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_SESSION_ABORT | ctrlm_voice_iarm_event_session_abort_t * | Generated when a voice session is aborted (denied) |
/// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_SESSION_SHORT | ctrlm_voice_iarm_event_session_short_t * | Generated when a short voice session is detected |
/// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_SESSION_SILENT | ctrlm_voice_iarm_event_session_silent_t * | Generated when a silent voice session is detected |
/// | CTRLM_MAIN_IARM_BUS_NAME | CTRLM_VOICE_IARM_EVENT_MEDIA_SERVICE | ctrlm_voice_iarm_event_media_service_t * | Generated when a media service response is received |
///
/// IARM events are available on a subscription basis. In order to receive an event, a client must explicitly register to receive the event by calling
Expand Down
4 changes: 3 additions & 1 deletion src/ctrlm_config_default.json
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,9 @@
"par_voice_eos_method" : 1,
"par_voice_eos_timeout" : 2500,
"server_hosts" : [],
"telemetry_session_stats" : false
"telemetry_session_stats" : false,
"voice_activity_detection_mode" : "enabled"

},
"device_update" : {
"dir_root" : "/srv/device_update/",
Expand Down
10 changes: 8 additions & 2 deletions src/telemetry/ctrlm_telemetry_markers.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@
// The Voice Session Statistics Marker reports statistics for the voice session(s). The format of the marker is a json array of arrays with each event in the format below:
//
// [[event1], [event2], [event3], ...]
// [<version>,<device_type>,<device_version>,<encoding>,<interaction_mode>,<time_prev_session>,<time_start_lag>,<time_stream_len_exp>,<time_stream_len_act>,<time_stream_delta>,<packets_total>,<packets_lost>,<samples_total>,<samples_lost>,<decoder_failures>,<samples_buffered_max>,<stream_ret_code>,<protocol_ret_code>,<server_ret_code>,<server_message>,<result>]
// [<version>,<device_type>,<device_version>,<encoding>,<interaction_mode>,<time_prev_session>,<time_start_lag>,<time_stream_len_exp>,<time_stream_len_act>,<time_stream_delta>,<packets_total>,<packets_lost>,<samples_total>,<samples_lost>,<decoder_failures>,<samples_buffered_max>,<stream_ret_code>,<protocol_ret_code>,<server_ret_code>,<server_message>,<result>,<end_reason_stream>,<ret_code_protocol>,<voice_detected>,<peak_confidence>,<peak_rms_level>]
//
// <version> - Version of the marker format.
// <device_type> - Name of the device that started the session.
Expand All @@ -116,8 +116,14 @@
// <server_ret_code> - server success/error code.
// <server_message> - server message.
// <result> - flag to indicate if session was successful.
// <end_reason_stream> - reason why the stream ended (if available).
// <ret_code_protocol> - protocol return code.
// <voice_detected> - flag to indicate if voice was detected.
// <peak_confidence> - peak confidence level.
// <peak_rms_level> - peak RMS level.

#define MARKER_VOICE_SESSION_STATS "ctrlm.voice.session.stats"
#define MARKER_VOICE_SESSION_STATS_VERSION "2"
#define MARKER_VOICE_SESSION_STATS_VERSION "3"

// End Voice Session Statistics

Expand Down
111 changes: 76 additions & 35 deletions src/voice/ctrlm_voice_obj.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,39 +186,40 @@ ctrlm_voice_t::ctrlm_voice_t() {
#ifdef JSON_ARRAY_VAL_STR_VOICE_SERVER_HOSTS_3
this->url_hostname_pattern_add(JSON_ARRAY_VAL_STR_VOICE_SERVER_HOSTS_3);
#endif
this->prefs.aspect_ratio = JSON_STR_VALUE_VOICE_ASPECT_RATIO;
this->prefs.guide_language = JSON_STR_VALUE_VOICE_LANGUAGE;
this->prefs.app_id_http = JSON_STR_VALUE_VOICE_APP_ID_HTTP;
this->prefs.app_id_ws = JSON_STR_VALUE_VOICE_APP_ID_WS;
this->prefs.timeout_vrex_connect = JSON_INT_VALUE_VOICE_VREX_REQUEST_TIMEOUT;
this->prefs.timeout_vrex_session = JSON_INT_VALUE_VOICE_VREX_RESPONSE_TIMEOUT;
this->prefs.timeout_stats = JSON_INT_VALUE_VOICE_TIMEOUT_STATS;
this->prefs.timeout_packet_initial = JSON_INT_VALUE_VOICE_TIMEOUT_PACKET_INITIAL;
this->prefs.timeout_packet_subsequent = JSON_INT_VALUE_VOICE_TIMEOUT_PACKET_SUBSEQUENT;
this->prefs.bitrate_minimum = JSON_INT_VALUE_VOICE_BITRATE_MINIMUM;
this->prefs.time_threshold = JSON_INT_VALUE_VOICE_TIME_THRESHOLD;
this->prefs.utterance_save = ctrlm_is_production_build() ? JSON_ARRAY_VAL_BOOL_VOICE_SAVE_LAST_UTTERANCE_0 : JSON_ARRAY_VAL_BOOL_VOICE_SAVE_LAST_UTTERANCE_1;
this->prefs.utterance_use_curtail = JSON_BOOL_VALUE_VOICE_UTTERANCE_USE_CURTAIL;
this->prefs.utterance_file_qty_max = JSON_INT_VALUE_VOICE_UTTERANCE_FILE_QTY_MAX;
this->prefs.utterance_file_size_max = JSON_INT_VALUE_VOICE_UTTERANCE_FILE_SIZE_MAX;
this->prefs.utterance_path = JSON_STR_VALUE_VOICE_UTTERANCE_PATH;
this->prefs.utterance_duration_min = JSON_INT_VALUE_VOICE_MINIMUM_DURATION;
this->prefs.ffv_leading_samples = JSON_INT_VALUE_VOICE_FFV_LEADING_SAMPLES;
this->prefs.force_voice_settings = JSON_BOOL_VALUE_VOICE_FORCE_VOICE_SETTINGS;
this->prefs.vrex_test_flag = JSON_BOOL_VALUE_VOICE_VREX_TEST_FLAG;
this->prefs.vrex_wuw_bypass_success_flag = JSON_BOOL_VALUE_VOICE_VREX_WUW_BYPASS_SUCCESS_FLAG;
this->prefs.vrex_wuw_bypass_failure_flag = JSON_BOOL_VALUE_VOICE_VREX_WUW_BYPASS_FAILURE_FLAG;
this->prefs.force_toggle_fallback = JSON_BOOL_VALUE_VOICE_FORCE_TOGGLE_FALLBACK;
this->prefs.telemetry_session_stats = JSON_BOOL_VALUE_VOICE_TELEMETRY_SESSION_STATS;
this->prefs.par_voice_enabled = false;
this->prefs.par_voice_eos_method = JSON_INT_VALUE_VOICE_PAR_VOICE_EOS_METHOD;
this->prefs.par_voice_eos_timeout = JSON_INT_VALUE_VOICE_PAR_VOICE_EOS_TIMEOUT;
this->prefs.aspect_ratio = JSON_STR_VALUE_VOICE_ASPECT_RATIO;
this->prefs.guide_language = JSON_STR_VALUE_VOICE_LANGUAGE;
this->prefs.app_id_http = JSON_STR_VALUE_VOICE_APP_ID_HTTP;
this->prefs.app_id_ws = JSON_STR_VALUE_VOICE_APP_ID_WS;
this->prefs.timeout_vrex_connect = JSON_INT_VALUE_VOICE_VREX_REQUEST_TIMEOUT;
this->prefs.timeout_vrex_session = JSON_INT_VALUE_VOICE_VREX_RESPONSE_TIMEOUT;
this->prefs.timeout_stats = JSON_INT_VALUE_VOICE_TIMEOUT_STATS;
this->prefs.timeout_packet_initial = JSON_INT_VALUE_VOICE_TIMEOUT_PACKET_INITIAL;
this->prefs.timeout_packet_subsequent = JSON_INT_VALUE_VOICE_TIMEOUT_PACKET_SUBSEQUENT;
this->prefs.bitrate_minimum = JSON_INT_VALUE_VOICE_BITRATE_MINIMUM;
this->prefs.time_threshold = JSON_INT_VALUE_VOICE_TIME_THRESHOLD;
this->prefs.utterance_save = ctrlm_is_production_build() ? JSON_ARRAY_VAL_BOOL_VOICE_SAVE_LAST_UTTERANCE_0 : JSON_ARRAY_VAL_BOOL_VOICE_SAVE_LAST_UTTERANCE_1;
this->prefs.utterance_use_curtail = JSON_BOOL_VALUE_VOICE_UTTERANCE_USE_CURTAIL;
this->prefs.utterance_file_qty_max = JSON_INT_VALUE_VOICE_UTTERANCE_FILE_QTY_MAX;
this->prefs.utterance_file_size_max = JSON_INT_VALUE_VOICE_UTTERANCE_FILE_SIZE_MAX;
this->prefs.utterance_path = JSON_STR_VALUE_VOICE_UTTERANCE_PATH;
this->prefs.utterance_duration_min = JSON_INT_VALUE_VOICE_MINIMUM_DURATION;
this->prefs.ffv_leading_samples = JSON_INT_VALUE_VOICE_FFV_LEADING_SAMPLES;
this->prefs.voice_activity_detection_mode = this->voice_activity_detection_mode_to_xrsr(JSON_STR_VALUE_VOICE_VOICE_ACTIVITY_DETECTION_MODE);
this->prefs.force_voice_settings = JSON_BOOL_VALUE_VOICE_FORCE_VOICE_SETTINGS;
this->prefs.vrex_test_flag = JSON_BOOL_VALUE_VOICE_VREX_TEST_FLAG;
this->prefs.vrex_wuw_bypass_success_flag = JSON_BOOL_VALUE_VOICE_VREX_WUW_BYPASS_SUCCESS_FLAG;
this->prefs.vrex_wuw_bypass_failure_flag = JSON_BOOL_VALUE_VOICE_VREX_WUW_BYPASS_FAILURE_FLAG;
this->prefs.force_toggle_fallback = JSON_BOOL_VALUE_VOICE_FORCE_TOGGLE_FALLBACK;
this->prefs.telemetry_session_stats = JSON_BOOL_VALUE_VOICE_TELEMETRY_SESSION_STATS;
this->prefs.par_voice_enabled = false;
this->prefs.par_voice_eos_method = JSON_INT_VALUE_VOICE_PAR_VOICE_EOS_METHOD;
this->prefs.par_voice_eos_timeout = JSON_INT_VALUE_VOICE_PAR_VOICE_EOS_TIMEOUT;
this->voice_params_opus_encoder_default();
this->xrsr_opened = false;
this->voice_ipc = NULL;
this->packet_loss_threshold = JSON_INT_VALUE_VOICE_PACKET_LOSS_THRESHOLD;
this->vsdk_config = NULL;
this->nsm_voice_session = false;
this->xrsr_opened = false;
this->voice_ipc = NULL;
this->packet_loss_threshold = JSON_INT_VALUE_VOICE_PACKET_LOSS_THRESHOLD;
this->vsdk_config = NULL;
this->nsm_voice_session = false;

#ifndef TELEMETRY_SUPPORT
XLOGD_WARN("telemetry is not enabled");
Expand Down Expand Up @@ -261,6 +262,7 @@ ctrlm_voice_t::ctrlm_voice_t() {
this->secure_url_required = JSON_BOOL_VALUE_VOICE_REQUIRE_SECURE_URL;

XLOGD_TELEMETRY("require i_SAT <%s> i_MTLS <%s> i_secure_url <%s>", this->sat_token_required ? "YES" : "NO", this->mtls_required ? "YES" : "NO", this->secure_url_required ? "YES" : "NO");
XLOGD_INFO("voice activity detection mode <%s>", xrsr_stream_voice_activity_mode_str(this->prefs.voice_activity_detection_mode));

errno_t safec_rc = memset_s(this->sat_token, sizeof(this->sat_token), 0, sizeof(this->sat_token));
ERR_CHK(safec_rc);
Expand Down Expand Up @@ -425,6 +427,13 @@ bool ctrlm_voice_t::voice_configure_config_file_json(json_t *obj_voice, json_t *
conf.config_value_get(JSON_STR_NAME_VOICE_URL_SRC_MIC_TAP, this->prefs.server_url_src_mic_tap);
conf.config_value_get(JSON_STR_NAME_VOICE_LANGUAGE, this->prefs.guide_language);
conf.config_value_get(JSON_INT_NAME_VOICE_MINIMUM_DURATION, this->prefs.utterance_duration_min);

std::string voice_activity_detection_mode;
if(conf.config_value_get(JSON_STR_NAME_VOICE_VOICE_ACTIVITY_DETECTION_MODE, voice_activity_detection_mode)) {
this->prefs.voice_activity_detection_mode = this->voice_activity_detection_mode_to_xrsr(voice_activity_detection_mode);
XLOGD_INFO("voice activity detection mode <%s>", xrsr_stream_voice_activity_mode_str(this->prefs.voice_activity_detection_mode));
}

if(conf.config_value_get(JSON_BOOL_NAME_VOICE_ENABLE_SAT, this->sat_token_required)) {
ctrlm_sm_voice_sat_enable_write(this->sat_token_required);
XLOGD_TELEMETRY("require c_SAT <%s>", this->sat_token_required ? "YES" : "NO");
Expand Down Expand Up @@ -1089,6 +1098,17 @@ void ctrlm_voice_t::voice_params_opus_encoder_default(void) {
this->voice_params_opus_samples_per_packet_set();
}

// Translates the textual voice activity detection mode into its xrsr enumeration value.
//
//   "enabled"  -> XRSR_STREAM_VOICE_ACTIVITY_MODE_ENABLED  (detection is used but not enforced)
//   "enforced" -> XRSR_STREAM_VOICE_ACTIVITY_MODE_ENFORCED (session only proceeds if voice activity is detected)
//   any other  -> XRSR_STREAM_VOICE_ACTIVITY_MODE_DISABLED (detection disabled, or the mode string is invalid)
//
// The match is an exact, case-sensitive string comparison.
xrsr_stream_voice_activity_mode_t ctrlm_voice_t::voice_activity_detection_mode_to_xrsr(std::string mode) {
   // Start from the fallback so that unrecognized text leaves detection disabled
   xrsr_stream_voice_activity_mode_t xrsr_mode = XRSR_STREAM_VOICE_ACTIVITY_MODE_DISABLED;

   if(mode.compare("enforced") == 0) {
      xrsr_mode = XRSR_STREAM_VOICE_ACTIVITY_MODE_ENFORCED;
   } else if(mode.compare("enabled") == 0) {
      xrsr_mode = XRSR_STREAM_VOICE_ACTIVITY_MODE_ENABLED;
   }

   return(xrsr_mode);
}

void ctrlm_voice_t::voice_params_opus_samples_per_packet_set(void) {
guchar fr_dur = (this->prefs.opus_encoder_params[3] >> 4) & 0xF;
switch(fr_dur) {
Expand Down Expand Up @@ -2645,6 +2665,12 @@ void ctrlm_voice_t::voice_session_end_callback(ctrlm_voice_session_end_cb_t *ses
end.result = SESSION_END_SHORT_UTTERANCE;
end.reason = (int)session->end_reason_rcu;
this->voice_ipc->session_end(end);
} else if(stats->session_end_reason == XRSR_SESSION_END_REASON_ERROR_AUDIO_SILENT) {
ctrlm_voice_ipc_event_session_end_t end;
end.common = session->ipc_common_data;
end.result = SESSION_END_SILENT_UTTERANCE;
end.reason = (int)session->end_reason_rcu;
this->voice_ipc->session_end(end);
} else {
ctrlm_voice_ipc_event_session_end_server_stats_t server_stats;
ctrlm_voice_ipc_event_session_end_t end;
Expand Down Expand Up @@ -2931,15 +2957,24 @@ void ctrlm_voice_t::voice_stream_end_callback(ctrlm_voice_stream_end_cb_t *strea
#ifdef TELEMETRY_SUPPORT
if(this->prefs.telemetry_session_stats) {
uint32_t packets_total = session->packets_lost + session->packets_processed;
session->telemetry_session_stats.update_on_stream_end(stream_duration, packets_total, session->packets_lost, packets_total * samples_per_packet, session->packets_lost * samples_per_packet, decoder_failures, 0);
int32_t voice_detected = -1;
uint32_t peak_confidence = 0;
int32_t peak_rms_level = 0;
if(stats->audio_stats.vad_frames_processed > 0) {
voice_detected = (stats->audio_stats.vad_voice_detected) ? 1 : 0;
peak_confidence = (stats->audio_stats.vad_confidence_peak * 100);
peak_rms_level = stats->audio_stats.vad_rms_level_peak;
}

session->telemetry_session_stats.update_on_stream_end(stream_duration, packets_total, session->packets_lost, packets_total * samples_per_packet, session->packets_lost * samples_per_packet, decoder_failures, 0, voice_detected, peak_confidence, peak_rms_level);
}
#endif
} else if(samples_processed > 0) {
uint32_t stream_duration = (samples_processed / 16); // 16 kHz samples to ms
XLOGD_INFO("src <%s> Samples Lost/Total <%u/%u> %.02f%% buffered max <%u> duration <%u> ms", ctrlm_voice_device_str(session->voice_device), samples_lost, samples_lost + samples_processed, 100.0 * ((double)samples_lost / (double)(samples_lost + samples_processed)), samples_buffered_max, stream_duration);
#ifdef TELEMETRY_SUPPORT
if(this->prefs.telemetry_session_stats) {
session->telemetry_session_stats.update_on_stream_end(stream_duration, 0, 0, samples_lost + samples_processed, samples_lost, decoder_failures, samples_buffered_max);
session->telemetry_session_stats.update_on_stream_end(stream_duration, 0, 0, samples_lost + samples_processed, samples_lost, decoder_failures, samples_buffered_max, -1, 0, 0);
}
#endif
}
Expand Down Expand Up @@ -4052,7 +4087,6 @@ void ctrlm_voice_t::voice_rfc_retrieved_handler(const ctrlm_rfc_attr_t& attr) {
attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_STANDBY_TIMEOUT_SESSION, this->prefs.dst_params_standby.timeout_session) |
attr.get_rfc_value(JSON_BOOL_NAME_VOICE_DST_PARAMS_STANDBY_IPV4_FALLBACK, this->prefs.dst_params_standby.ipv4_fallback) |
attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_STANDBY_BACKOFF_DELAY, this->prefs.dst_params_standby.backoff_delay) |

attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_LOW_LATENCY_CONNECT_CHECK_INTERVAL, this->prefs.dst_params_low_latency.connect_check_interval) |
attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_LOW_LATENCY_TIMEOUT_CONNECT, this->prefs.dst_params_low_latency.timeout_connect) |
attr.get_rfc_value(JSON_INT_NAME_VOICE_DST_PARAMS_LOW_LATENCY_TIMEOUT_INACTIVITY, this->prefs.dst_params_low_latency.timeout_inactivity) |
Expand All @@ -4062,6 +4096,13 @@ void ctrlm_voice_t::voice_rfc_retrieved_handler(const ctrlm_rfc_attr_t& attr) {
reroute = true;
}

std::string voice_activity_detection_mode;
if(attr.get_rfc_value(JSON_STR_NAME_VOICE_VOICE_ACTIVITY_DETECTION_MODE, voice_activity_detection_mode)) {
this->prefs.voice_activity_detection_mode = this->voice_activity_detection_mode_to_xrsr(voice_activity_detection_mode);
XLOGD_INFO("voice activity detection mode <%s>", xrsr_stream_voice_activity_mode_str(this->prefs.voice_activity_detection_mode));
reroute = true;
}

std::vector<std::string> obj_server_hosts;
if(attr.get_rfc_value(JSON_ARRAY_NAME_VOICE_SERVER_HOSTS, obj_server_hosts)) {
this->url_hostname_patterns(obj_server_hosts);
Expand Down
Loading
Loading