Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion libdd-trace-obfuscation/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ authors.workspace = true
anyhow = "1.0"
regex = "1"
serde = { version = "1.0.145", features = ["derive"] }
serde_json = "1.0"
serde_json = { version = "1.0", features = ["preserve_order"] }
url = "^2.5.4"
percent-encoding = "2.1"
log = "0.4"
Expand Down
306 changes: 306 additions & 0 deletions libdd-trace-obfuscation/src/json.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,306 @@
// Copyright 2026-Present Datadog, Inc. https://www.datadoghq.com/
// SPDX-License-Identifier: Apache-2.0

use std::collections::HashSet;

use crate::json_scanner::{Op, Scanner};

type Transformer = Box<dyn Fn(&str) -> String + Send + Sync>;

/// Obfuscates a JSON string by replacing all leaf values with `"?"`, unless the value
/// belongs to a key listed in `keep_keys`, in which case it is left verbatim.
/// Keys in `transform_keys` have their string values passed through a transformer function
/// (e.g. SQL obfuscation) instead of being replaced with `"?"`.
///
/// Multiple concatenated JSON objects in the input are each obfuscated independently.
/// On a parse error the output so far is returned with `"..."` appended.
pub struct JsonObfuscator {
keep_keys: HashSet<String>,
transform_keys: HashSet<String>,
transformer: Option<Transformer>,
}

enum ClosureKind {
Array,
Object,
}

impl JsonObfuscator {
pub fn new(
keep_keys: impl IntoIterator<Item = String>,
transform_keys: impl IntoIterator<Item = String>,
transformer: Option<Transformer>,
) -> Self {
Self {
keep_keys: keep_keys.into_iter().collect(),
transform_keys: transform_keys.into_iter().collect(),
transformer,
}
}

/// Obfuscates json string and return an optional error on malformatted json
/// If an error occurs, an value is returned anyways which might be truncated (...)
pub fn obfuscate(&self, input: &str) -> (String, Option<String>) {
if input.is_empty() {
return (String::new(), None);
}

let mut out = String::with_capacity(input.len());
let mut scanner = Scanner::new();
let mut buf = String::new(); // accumulates key chars or transform-value chars
let mut closures: Vec<ClosureKind> = Vec::new();
let mut keep_depth: usize = 0;
let mut key = false;
let mut wiped = false;
let mut keeping = false;
let mut transforming_value = false;

for c in input.chars() {
let op = scanner.step(c);
let depth = closures.len(); // snapshot before any mutation

match op {
Op::BeginObject => {
closures.push(ClosureKind::Object);
set_key(&closures, &mut key, &mut wiped);
transforming_value = false;
}
Op::BeginArray => {
closures.push(ClosureKind::Array);
set_key(&closures, &mut key, &mut wiped);
transforming_value = false;
}
Op::EndArray | Op::EndObject => {
closures.pop();
set_key(&closures, &mut key, &mut wiped);
handle_value_done(
&mut out,
&mut buf,
&mut keeping,
&mut transforming_value,
&mut keep_depth,
depth,
self.transformer.as_deref(),
);
}
Op::ObjectValue | Op::ArrayValue => {
set_key(&closures, &mut key, &mut wiped);
handle_value_done(
&mut out,
&mut buf,
&mut keeping,
&mut transforming_value,
&mut keep_depth,
depth,
self.transformer.as_deref(),
);
}
Op::BeginLiteral | Op::Continue => {
if transforming_value {
buf.push(c);
continue;
} else if key {
buf.push(c);
} else if !keeping {
if !wiped {
out.push_str("\"?\"");
wiped = true;
}
continue;
}
Comment thread
Eldolfin marked this conversation as resolved.
}
Op::ObjectKey => {
let k = buf.trim_matches('"');
if !keeping && self.keep_keys.contains(k) {
keeping = true;
keep_depth = depth + 1;
} else if !transforming_value
&& self.transformer.is_some()
&& self.transform_keys.contains(k)
{
transforming_value = true;
}
buf.clear();
key = false;
}
Op::SkipSpace => continue,
Op::Error => {
out.push_str("...");
return (out, scanner.err);
}
Op::End => {} // whitespace between JSON objects — fall through to output char
}

out.push(c);
}

if scanner.eof() == Op::Error {
out.push_str("...");
}
(out, scanner.err)
}
}

/// Updates `key` and `wiped` based on the current closure stack.
/// `key` is true at top level or when inside an object (not an array).
fn set_key(closures: &[ClosureKind], key: &mut bool, wiped: &mut bool) {
let n = closures.len();
*key = n == 0 || matches!(closures[n - 1], ClosureKind::Object);
*wiped = false;
}

/// Handles the "value is done" logic after a value-ending opcode.
/// Writes the transformer result if applicable, or stops keeping if depth shrinks.
fn handle_value_done(
out: &mut String,
buf: &mut String,
keeping: &mut bool,
transforming_value: &mut bool,
keep_depth: &mut usize,
depth: usize,
transformer: Option<&(dyn Fn(&str) -> String + Send + Sync)>,
) {
if *transforming_value {
if let Some(t) = transformer {
// Unquote the collected JSON string literal (handles escape sequences).
let raw: String =
serde_json::from_str(buf).unwrap_or_else(|_| buf.trim_matches('"').to_string());
let result = t(&raw);
out.push('"');
out.push_str(&result);
out.push('"');
Comment thread
Eldolfin marked this conversation as resolved.
*transforming_value = false;
buf.clear();
}
} else if *keeping && depth < *keep_depth {
*keeping = false;
}
}

#[cfg(test)]
mod tests {
use duplicate::duplicate_item;
use serde_json::json;

use super::JsonObfuscator;
use crate::sql::obfuscate_sql_string;

fn obf(keep_keys: &[&str]) -> JsonObfuscator {
JsonObfuscator::new(keep_keys.iter().map(|s| s.to_string()), [], None)
}

fn obf_sql(keep_keys: &[&str], transform_keys: &[&str]) -> JsonObfuscator {
JsonObfuscator::new(
keep_keys.iter().map(|s| s.to_string()),
transform_keys.iter().map(|s| s.to_string()),
Some(Box::new(obfuscate_sql_string)),
)
}

fn assert_json_eq(result: &str, expected: &str) {
let result: serde_json::Value =
serde_json::from_str(result).expect("result is not valid JSON");
let expected: serde_json::Value =
serde_json::from_str(expected).expect("expected is not valid JSON");
assert_eq!(result, expected);
}

// Basic obfuscation tests — parametric over (keep_keys, input, expected).
// Uses assert_json_eq (structural comparison, whitespace-insensitive).
#[duplicate_item(
test_name keep_keys input expected;
[test_empty_object] [&[]] ["{}"] ["{}"];
[test_empty_array] [&[]] ["[]"] ["[]"];
[test_emoji_object] [&["🐵"]] [r#"{"🐵":"🙊"}"#] [r#"{"🐵":"🙊"}"#];
[test_nested_empty_objects] [&[]] [r#"{"a":{},"b":{"c":{}}}"#] [r#"{"a":{},"b":{"c":{}}}"#];
[test_boolean_and_null_obfuscated][&[]] [r#"{"a":true,"b":false,"c":null}"#] [r#"{"a":"?","b":"?","c":"?"}"#];
[test_all_values_obfuscated] [&[]] [r#"{"query":{"multi_match":{"query":"guide","fields":["_all",{"key":"value","other":["1","2",{"k":"v"}]},"2"]}}}"#] [r#"{"query":{"multi_match":{"query":"?","fields":["?",{"key":"?","other":["?","?",{"k":"?"}]},"?"]}}}"#];
[test_numbers_obfuscated] [&[]] [r#"{"highlight":{"pre_tags":["<em>"],"post_tags":["</em>"],"index":1}}"#] [r#"{"highlight":{"pre_tags":["?"],"post_tags":["?"],"index":"?"}}"#];
[test_keep_key_keeps_entire_value][&["other"]] [r#"{"query":{"multi_match":{"query":"guide","fields":["_all",{"key":"value","other":["1","2",{"k":"v"}]},"2"]}}}"#] [r#"{"query":{"multi_match":{"query":"?","fields":["?",{"key":"?","other":["1","2",{"k":"v"}]},"?"]}}}"#];
[test_keep_key_nested_array_fully_kept][&["fields"]] [r#"{"fields":["_all",{"key":"value","other":["1","2",{"k":"v"}]},"2"]}"#] [r#"{"fields":["_all",{"key":"value","other":["1","2",{"k":"v"}]},"2"]}"#];
[test_keep_key_deep_nested] [&["k"]] [r#"{"fields":["_all",{"key":"value","other":["1","2",{"k":"v"}]},"2"]}"#] [r#"{"fields":["?",{"key":"?","other":["?","?",{"k":"v"}]},"?"]}"#];
[test_keep_key_in_nested_object] [&["C"]] [r#"{"fields":[{"A":1,"B":{"C":3}},"2"]}"#] [r#"{"fields":[{"A":"?","B":{"C":3}},"?"]}"#];
[test_keep_key_large_nested_structure][&["hits"]] [r#"{"outer":{"total":2,"max_score":0.9105287,"hits":[{"_index":"bookdb_index","_score":0.9105287}]}}"#] [r#"{"outer":{"total":"?","max_score":"?","hits":[{"_index":"bookdb_index","_score":0.9105287}]}}"#];
[test_keep_multiple_keys] [&["_index","title"]][r#"{"hits":[{"_index":"bookdb_index","_type":"book","_score":0.9,"_source":{"summary":"text","title":"ES in Action","publish_date":"2015-12-03"},"highlight":{"title":["ES Action"]}}]}"#] [r#"{"hits":[{"_index":"bookdb_index","_type":"?","_score":"?","_source":{"summary":"?","title":"ES in Action","publish_date":"?"},"highlight":{"title":["ES Action"]}}]}"#];
[test_keep_key_wallet] [&["company_wallet_configuration_id"]] [r#"{"email":"dev@datadoghq.com","company_wallet_configuration_id":1}"#] [r#"{"email":"?","company_wallet_configuration_id":1}"#];
)]
#[test]
fn test_name() {
let (res, err) = obf(keep_keys).obfuscate(input);
assert_eq!(err, None);
assert_json_eq(&res, expected);
}

// Truncation / error tests — parametric over (input, expected_exact_string).
#[duplicate_item(
test_name input expected expected_error;
[test_empty_input] [""] [""] [None];
[test_invalid_emoji] ["🤨"] ["..."] [Some("invalid character '🤨' looking for beginning of value".to_owned())];
[test_invalid_unicode] ["ჸ"] ["..."] [Some("invalid character 'ჸ' looking for beginning of value".to_owned())];
[test_invalid_json_appends_ellipsis]["INVALID"] ["..."] [Some("invalid character 'I' looking for beginning of value".to_owned())];
[test_invalid_single_char] [")"] ["..."] [Some("invalid character ')' looking for beginning of value".to_owned())];
[test_truncated_open_value_string] [r#"{"query":""#] [r#"{"query":"?"..."#] [Some("unexpected end of JSON input at char position 11".to_owned())];
[test_truncated_multi_json] [r#"{"first json": "valid"} {"second json": "unfinished"#] [r#"{"first json":"?"} {"second json":"?"..."#] [Some("unexpected end of JSON input at char position 53".to_owned())];
)]
#[test]
fn test_name() {
let (res, err) = obf(&[]).obfuscate(input);
assert_eq!(res, expected);
assert_eq!(err, expected_error);
}

#[test]
fn test_multiple_json_objects() {
// Multiple concatenated JSON objects (elasticsearch bulk API pattern).
let input = r#"{"index":{"_index":"traces","_type":"trace"}} {"value":1,"name":"test"}"#;
let (result, err) = obf(&[]).obfuscate(input);
assert_eq!(err, None);
let mut stream =
serde_json::Deserializer::from_str(&result).into_iter::<serde_json::Value>();
let first = stream
.next()
.expect("first value")
.expect("first value is valid JSON");
let second = stream
.next()
.expect("second value")
.expect("second value is valid JSON");
assert_eq!(first, json!({"index":{"_index":"?","_type":"?"}}));
assert_eq!(second, json!({"value":"?","name":"?"}));
}

#[test]
fn test_transform_key_sql_basic() {
let input = r#"{"query":"select * from table where id = 2","hello":"world","hi":"there"}"#;
let (result, err) = obf_sql(&["hello"], &["query"]).obfuscate(input);
assert_eq!(err, None);

let val: serde_json::Value = serde_json::from_str(&result).unwrap();
assert_eq!(val["hello"], json!("world"));
assert_eq!(val["hi"], json!("?"));
assert!(
val["query"].as_str().unwrap().contains('?'),
"SQL value should be obfuscated"
);
}

#[test]
fn test_transform_key_with_object_value_falls_through() {
let input = r#"{"object":{"not a":"query"}}"#;
let expected = r#"{"object":{"not a":"?"}}"#;
let (res, err) = obf_sql(&[], &["object"]).obfuscate(input);
assert_eq!(err, None);

assert_json_eq(&res, expected);
}

#[test]
fn test_transform_key_with_array_value_falls_through() {
let input = r#"{"object":["not","a","query"]}"#;
let expected = r#"{"object":["?","?","?"]}"#;
let (res, err) = obf_sql(&[], &["object"]).obfuscate(input);
assert_eq!(err, None);

assert_json_eq(&res, expected);
}
}
Loading
Loading