From 1c953c7e804979279850905546276b5e8ac4ca27 Mon Sep 17 00:00:00 2001 From: LiaCastaneda Date: Wed, 18 Mar 2026 12:51:03 +0100 Subject: [PATCH 1/2] Fix tdigest --- .../tests/dataframe/dataframe_functions.rs | 2 +- .../functions-aggregate-common/src/tdigest.rs | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index 014f356cd64cd..c4feb89e33e70 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -510,7 +510,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { +-------------------------------------------------------------------------------------+ | approx_percentile_cont(Float64(0.1),Int32(2)) WITHIN GROUP [test.b DESC NULLS LAST] | +-------------------------------------------------------------------------------------+ - | 69 | + | 100 | +-------------------------------------------------------------------------------------+ "); diff --git a/datafusion/functions-aggregate-common/src/tdigest.rs b/datafusion/functions-aggregate-common/src/tdigest.rs index a7450f0eb52e9..da4c0c4e7452a 100644 --- a/datafusion/functions-aggregate-common/src/tdigest.rs +++ b/datafusion/functions-aggregate-common/src/tdigest.rs @@ -443,6 +443,13 @@ impl TDigest { return self.max(); } + // If rank reaches the last unit of weight, return max directly. + // Without this, interpolation at the last centroid boundary can + // produce p90 > p99 on sparse data (e.g. 10 values). + if rank >= self.count - 1.0 { + return self.max(); + } + pos = 0; t = self.count; @@ -735,6 +742,23 @@ mod tests { assert_state_roundtrip!(t); } + // On sparse data, higher quantiles must not return lower values than lower quantiles. + #[test] + fn test_sparse_dataset_quantile_ordering() { + let values = vec![1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0, 1000.0]; + let t = TDigest::new(100); + let t = t.merge_unsorted_f64(values); + + let p50 = t.estimate_quantile(0.5); + let p90 = t.estimate_quantile(0.9); + let p99 = t.estimate_quantile(0.99); + + assert!(p50 <= p90, "p50 ({p50}) should be <= p90 ({p90})"); + assert!(p90 <= p99, "p90 ({p90}) should be <= p99 ({p99})"); + assert_eq!(p90, 1000.0, "p90 should be max on boundary rank"); + assert_eq!(p99, 1000.0, "p99 should be max on boundary rank"); + } + #[test] fn test_size() { let t = TDigest::new(10); From c89eb6600cec7ae274ff4a6078be3cd2ace64e7e Mon Sep 17 00:00:00 2001 From: LiaCastaneda Date: Wed, 18 Mar 2026 13:56:09 +0100 Subject: [PATCH 2/2] Adjust sqllogictest --- datafusion/sqllogictest/test_files/aggregate.slt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index cf894a494ad90..a29cde3047c8f 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -2292,7 +2292,7 @@ SELECT c1, approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FRO a 73 b 68 c 122 -d 124 +d 125 e 115 @@ -2303,7 +2303,7 @@ SELECT c1, approx_percentile_cont(c3, 0.95) AS c3_p95 FROM aggregate_test_100 GR a 73 b 68 c 122 -d 124 +d 125 e 115 @@ -2314,7 +2314,7 @@ SELECT c1, approx_percentile_cont(c2, 0.95) AS c2, approx_percentile_cont(c3, 0. a 5 73 b 5 68 c 5 122 -d 5 124 +d 5 125 e 5 115 # error is unique to this UDAF @@ -2345,7 +2345,7 @@ SELECT c1, approx_percentile_cont_with_weight(1, 0.95) WITHIN GROUP (ORDER BY c3 a 73 b 68 c 122 -d 124 +d 125 e 115 # csv_query_approx_percentile_cont_with_weight alternate syntax @@ -2355,7 +2355,7 @@ SELECT c1, approx_percentile_cont_with_weight(c3, 1, 0.95) AS c3_p95 FROM aggreg a 73 b 68 c 122 -d 124 +d 125 e 115 @@ -2375,7 +2375,7 @@ SELECT c1, approx_percentile_cont(0.95, 200) WITHIN GROUP (ORDER BY c3) AS c3_p9 a 73 b 68 c 122 -d 124 +d 125 e 115 query TI