Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datafusion/core/tests/dataframe/dataframe_functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> {
+-------------------------------------------------------------------------------------+
| approx_percentile_cont(Float64(0.1),Int32(2)) WITHIN GROUP [test.b DESC NULLS LAST] |
+-------------------------------------------------------------------------------------+
| 69 |
| 100 |
+-------------------------------------------------------------------------------------+
");

Expand Down
24 changes: 24 additions & 0 deletions datafusion/functions-aggregate-common/src/tdigest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,13 @@ impl TDigest {
return self.max();
}

// If rank reaches the last unit of weight, return max directly.
// Without this, interpolation at the last centroid boundary can
// produce p90 > p99 on sparse data (e.g. 10 values).
if rank >= self.count - 1.0 {
return self.max();
}

pos = 0;
t = self.count;

Expand Down Expand Up @@ -735,6 +742,23 @@ mod tests {
assert_state_roundtrip!(t);
}

// Regression test for quantile estimates on a small (ten-value) dataset:
// a higher quantile must never be estimated below a lower one, and both
// p90 and p99 are expected to come back as exactly the maximum value.
#[test]
fn test_sparse_dataset_quantile_ordering() {
let digest = TDigest::new(100).merge_unsorted_f64(vec![
1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0, 1000.0,
]);

// Evaluate the three quantiles in ascending order and unpack them.
let [p50, p90, p99] = [0.5, 0.9, 0.99].map(|q| digest.estimate_quantile(q));

// Monotonicity: estimates must be non-decreasing in the quantile.
assert!(p50 <= p90, "p50 ({p50}) should be <= p90 ({p90})");
assert!(p90 <= p99, "p90 ({p90}) should be <= p99 ({p99})");

// Both upper quantiles are expected to equal the largest input value.
assert_eq!(p90, 1000.0, "p90 should be max on boundary rank");
assert_eq!(p99, 1000.0, "p99 should be max on boundary rank");
}

#[test]
fn test_size() {
let t = TDigest::new(10);
Expand Down
12 changes: 6 additions & 6 deletions datafusion/sqllogictest/test_files/aggregate.slt
Original file line number Diff line number Diff line change
Expand Up @@ -2292,7 +2292,7 @@ SELECT c1, approx_percentile_cont(0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FRO
a 73
b 68
c 122
d 124
d 125
e 115


Expand All @@ -2303,7 +2303,7 @@ SELECT c1, approx_percentile_cont(c3, 0.95) AS c3_p95 FROM aggregate_test_100 GR
a 73
b 68
c 122
d 124
d 125
e 115


Expand All @@ -2314,7 +2314,7 @@ SELECT c1, approx_percentile_cont(c2, 0.95) AS c2, approx_percentile_cont(c3, 0.
a 5 73
b 5 68
c 5 122
d 5 124
d 5 125
e 5 115

# error is unique to this UDAF
Expand Down Expand Up @@ -2345,7 +2345,7 @@ SELECT c1, approx_percentile_cont_with_weight(1, 0.95) WITHIN GROUP (ORDER BY c3
a 73
b 68
c 122
d 124
d 125
e 115

# csv_query_approx_percentile_cont_with_weight alternate syntax
Expand All @@ -2355,7 +2355,7 @@ SELECT c1, approx_percentile_cont_with_weight(c3, 1, 0.95) AS c3_p95 FROM aggreg
a 73
b 68
c 122
d 124
d 125
e 115


Expand All @@ -2375,7 +2375,7 @@ SELECT c1, approx_percentile_cont(0.95, 200) WITHIN GROUP (ORDER BY c3) AS c3_p9
a 73
b 68
c 122
d 124
d 125
e 115

query TI
Expand Down
Loading