diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index d099964a383..15f519615af 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -7591,7 +7591,6 @@ dependencies = [ "arrow", "async-trait", "once_cell", - "parquet", "prost 0.14.3", "quickwit-common", "quickwit-config", @@ -7611,10 +7610,14 @@ dependencies = [ name = "quickwit-parquet-engine" version = "0.8.0" dependencies = [ + "anyhow", "arrow", + "chrono", "parquet", "proptest", + "prost 0.14.3", "quickwit-common", + "quickwit-proto", "sea-query", "serde", "serde_json", diff --git a/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs b/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs index 9f79032d7c5..e29d4af2540 100644 --- a/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs +++ b/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs @@ -68,7 +68,7 @@ pub struct IndexingSchedulerState { /// /// Scheduling executes the following steps: /// 1. Builds a [`PhysicalIndexingPlan`] from the list of logical indexing tasks. See -/// [`build_physical_indexing_plan`] for the implementation details. +/// `build_physical_indexing_plan` for the implementation details. /// 2. Apply the [`PhysicalIndexingPlan`]: for each indexer, the scheduler send the indexing tasks /// by gRPC. An indexer immediately returns an Ok and apply asynchronously the received plan. Any /// errors (network) happening in this step are ignored. The scheduler runs a control loop that @@ -98,7 +98,7 @@ pub struct IndexingSchedulerState { /// Concretely, it will send the faulty nodes of the plan they are supposed to follow. // /// Finally, in order to give the time for each indexer to run their indexing tasks, the control -/// plane will wait at least [`MIN_DURATION_BETWEEN_SCHEDULING`] before comparing the desired +/// plane will wait at least `MIN_DURATION_BETWEEN_SCHEDULING` before comparing the desired /// plan with the running plan. 
pub struct IndexingScheduler { cluster_id: String, diff --git a/quickwit/quickwit-indexing/Cargo.toml b/quickwit/quickwit-indexing/Cargo.toml index 677b95e32aa..5d3a6504d29 100644 --- a/quickwit/quickwit-indexing/Cargo.toml +++ b/quickwit/quickwit-indexing/Cargo.toml @@ -123,6 +123,7 @@ sqlx = { workspace = true, features = ["runtime-tokio", "postgres"] } tempfile = { workspace = true } quickwit-actors = { workspace = true, features = ["testsuite"] } +quickwit-parquet-engine = { workspace = true, features = ["testsuite"] } quickwit-cluster = { workspace = true, features = ["testsuite"] } quickwit-common = { workspace = true, features = ["testsuite"] } quickwit-config = { workspace = true, features = ["testsuite"] } diff --git a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs index 16daf18102f..cfb2ade9361 100644 --- a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs @@ -637,10 +637,8 @@ impl IndexingPipeline { .spawn(parquet_uploader); // ParquetPackager - let parquet_schema = quickwit_parquet_engine::schema::ParquetSchema::new(); let writer_config = quickwit_parquet_engine::storage::ParquetWriterConfig::default(); let split_writer = quickwit_parquet_engine::storage::ParquetSplitWriter::new( - parquet_schema, writer_config, self.params.indexing_directory.path(), ); diff --git a/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs b/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs index 2a83c9610c9..eb51621a30f 100644 --- a/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs +++ b/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs @@ -24,7 +24,6 @@ use quickwit_common::rate_limited_tracing::rate_limited_warn; use quickwit_common::runtimes::RuntimeType; use quickwit_metastore::checkpoint::SourceCheckpointDelta; use quickwit_parquet_engine::ingest::{IngestError, 
ParquetIngestProcessor}; -use quickwit_parquet_engine::schema::ParquetSchema; use quickwit_proto::types::{IndexId, SourceId}; use serde::Serialize; use tokio::runtime::Handle; @@ -143,8 +142,7 @@ impl ParquetDocProcessor { source_id: SourceId, indexer_mailbox: Mailbox, ) -> Self { - let schema = ParquetSchema::new(); - let processor = ParquetIngestProcessor::new(schema); + let processor = ParquetIngestProcessor; let counters = ParquetDocProcessorCounters::new(index_id.clone(), source_id.clone()); info!( @@ -306,7 +304,7 @@ impl Handler for ParquetDocProcessor { // forever. if !checkpoint_forwarded && !checkpoint_delta.is_empty() { let empty_batch = - RecordBatch::new_empty(self.processor.schema().arrow_schema().clone()); + RecordBatch::new_empty(std::sync::Arc::new(arrow::datatypes::Schema::empty())); let processed_batch = ProcessedParquetBatch::new(empty_batch, checkpoint_delta, force_commit); ctx.send_message(&self.indexer_mailbox, processed_batch) @@ -399,14 +397,8 @@ mod tests { #[tokio::test] async fn test_metrics_doc_processor_valid_arrow_ipc() { - use std::sync::Arc as StdArc; + use quickwit_parquet_engine::test_helpers::create_test_batch_with_tags; - use arrow::array::{ - ArrayRef, BinaryViewArray, DictionaryArray, Float64Array, Int32Array, StringArray, - StructArray, UInt8Array, UInt64Array, - }; - use arrow::datatypes::{DataType, Field, Int32Type}; - use arrow::record_batch::RecordBatch; let universe = Universe::with_accelerated_time(); let (indexer_mailbox, _indexer_inbox) = universe.create_test_mailbox::(); @@ -419,103 +411,7 @@ mod tests { let (metrics_doc_processor_mailbox, metrics_doc_processor_handle) = universe.spawn_builder().spawn(metrics_doc_processor); - // Create a test batch matching the metrics schema - let schema = ParquetSchema::new(); - let num_rows = 3; - - // Helper to create dictionary arrays - fn create_dict_array(values: &[&str]) -> ArrayRef { - let keys: Vec = (0..values.len()).map(|i| i as i32).collect(); - let string_array = 
StringArray::from(values.to_vec()); - StdArc::new( - DictionaryArray::::try_new( - Int32Array::from(keys), - StdArc::new(string_array), - ) - .unwrap(), - ) - } - - fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef { - let keys: Vec> = values - .iter() - .enumerate() - .map(|(i, v)| v.map(|_| i as i32)) - .collect(); - let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect(); - let string_array = StringArray::from(string_values); - StdArc::new( - DictionaryArray::::try_new( - Int32Array::from(keys), - StdArc::new(string_array), - ) - .unwrap(), - ) - } - - let metric_name: ArrayRef = create_dict_array(&vec!["cpu.usage"; num_rows]); - let metric_type: ArrayRef = StdArc::new(UInt8Array::from(vec![0u8; num_rows])); - let metric_unit: ArrayRef = StdArc::new(StringArray::from(vec![Some("bytes"); num_rows])); - let timestamp_secs: ArrayRef = StdArc::new(UInt64Array::from(vec![100u64, 101u64, 102u64])); - let start_timestamp_secs: ArrayRef = - StdArc::new(UInt64Array::from(vec![None::; num_rows])); - let value: ArrayRef = StdArc::new(Float64Array::from(vec![42.0, 43.0, 44.0])); - let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]); - let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]); - let tag_datacenter: ArrayRef = - create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]); - let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]); - let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]); - - // Create empty Variant (Struct with metadata and value BinaryView fields) - let metadata_array = StdArc::new(BinaryViewArray::from(vec![b"" as &[u8]; num_rows])); - let value_array = StdArc::new(BinaryViewArray::from(vec![b"" as &[u8]; num_rows])); - let attributes: ArrayRef = StdArc::new(StructArray::from(vec![ - ( - StdArc::new(Field::new("metadata", DataType::BinaryView, false)), - metadata_array.clone() as ArrayRef, - ), - 
( - StdArc::new(Field::new("value", DataType::BinaryView, false)), - value_array.clone() as ArrayRef, - ), - ])); - - let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]); - - let resource_attributes: ArrayRef = StdArc::new(StructArray::from(vec![ - ( - StdArc::new(Field::new("metadata", DataType::BinaryView, false)), - metadata_array as ArrayRef, - ), - ( - StdArc::new(Field::new("value", DataType::BinaryView, false)), - value_array as ArrayRef, - ), - ])); - - let batch = RecordBatch::try_new( - schema.arrow_schema().clone(), - vec![ - metric_name, - metric_type, - metric_unit, - timestamp_secs, - start_timestamp_secs, - value, - tag_service, - tag_env, - tag_datacenter, - tag_region, - tag_host, - attributes, - service_name, - resource_attributes, - ], - ) - .unwrap(); - - // Serialize to Arrow IPC + let batch = create_test_batch_with_tags(3, &["service"]); let ipc_bytes = record_batch_to_ipc(&batch).unwrap(); // Create RawDocBatch with the IPC bytes @@ -624,13 +520,8 @@ mod tests { async fn test_metrics_doc_processor_with_indexer() { use std::sync::Arc as StdArc; - use arrow::array::{ - ArrayRef, BinaryViewArray, DictionaryArray, Float64Array, Int32Array, StringArray, - StructArray, UInt8Array, UInt64Array, - }; - use arrow::datatypes::{DataType, Field, Int32Type}; - use arrow::record_batch::RecordBatch; use quickwit_parquet_engine::storage::{ParquetSplitWriter, ParquetWriterConfig}; + use quickwit_parquet_engine::test_helpers::create_test_batch_with_tags; use quickwit_proto::metastore::MockMetastoreService; use quickwit_storage::RamStorage; @@ -657,9 +548,8 @@ mod tests { let (uploader_mailbox, _uploader_handle) = universe.spawn_builder().spawn(uploader); // Create ParquetPackager - let parquet_schema = ParquetSchema::new(); let writer_config = ParquetWriterConfig::default(); - let split_writer = ParquetSplitWriter::new(parquet_schema, writer_config, temp_dir.path()); + let split_writer = ParquetSplitWriter::new(writer_config, 
temp_dir.path()); let packager = ParquetPackager::new(split_writer, uploader_mailbox); let (packager_mailbox, packager_handle) = universe.spawn_builder().spawn(packager); @@ -681,104 +571,7 @@ mod tests { let (metrics_doc_processor_mailbox, metrics_doc_processor_handle) = universe.spawn_builder().spawn(metrics_doc_processor); - // Create a test batch - let schema = ParquetSchema::new(); - let num_rows = 5; - - fn create_dict_array(values: &[&str]) -> ArrayRef { - let keys: Vec = (0..values.len()).map(|i| i as i32).collect(); - let string_array = StringArray::from(values.to_vec()); - StdArc::new( - DictionaryArray::::try_new( - Int32Array::from(keys), - StdArc::new(string_array), - ) - .unwrap(), - ) - } - - fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef { - let keys: Vec> = values - .iter() - .enumerate() - .map(|(i, v)| v.map(|_| i as i32)) - .collect(); - let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect(); - let string_array = StringArray::from(string_values); - StdArc::new( - DictionaryArray::::try_new( - Int32Array::from(keys), - StdArc::new(string_array), - ) - .unwrap(), - ) - } - - let metric_name: ArrayRef = create_dict_array(&vec!["cpu.usage"; num_rows]); - let metric_type: ArrayRef = StdArc::new(UInt8Array::from(vec![0u8; num_rows])); - let metric_unit: ArrayRef = StdArc::new(StringArray::from(vec![Some("bytes"); num_rows])); - let timestamps: Vec = (0..num_rows).map(|i| 100 + i as u64).collect(); - let timestamp_secs: ArrayRef = StdArc::new(UInt64Array::from(timestamps)); - let start_timestamp_secs: ArrayRef = - StdArc::new(UInt64Array::from(vec![None::; num_rows])); - let values: Vec = (0..num_rows).map(|i| 42.0 + i as f64).collect(); - let value: ArrayRef = StdArc::new(Float64Array::from(values)); - let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]); - let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]); - let tag_datacenter: ArrayRef = - 
create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]); - let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]); - let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]); - - // Create empty Variant (Struct with metadata and value BinaryView fields) - let metadata_array = StdArc::new(BinaryViewArray::from(vec![b"" as &[u8]; num_rows])); - let value_array = StdArc::new(BinaryViewArray::from(vec![b"" as &[u8]; num_rows])); - let attributes: ArrayRef = StdArc::new(StructArray::from(vec![ - ( - StdArc::new(Field::new("metadata", DataType::BinaryView, false)), - metadata_array.clone() as ArrayRef, - ), - ( - StdArc::new(Field::new("value", DataType::BinaryView, false)), - value_array.clone() as ArrayRef, - ), - ])); - - let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]); - - let resource_attributes: ArrayRef = StdArc::new(StructArray::from(vec![ - ( - StdArc::new(Field::new("metadata", DataType::BinaryView, false)), - metadata_array as ArrayRef, - ), - ( - StdArc::new(Field::new("value", DataType::BinaryView, false)), - value_array as ArrayRef, - ), - ])); - - let batch = RecordBatch::try_new( - schema.arrow_schema().clone(), - vec![ - metric_name, - metric_type, - metric_unit, - timestamp_secs, - start_timestamp_secs, - value, - tag_service, - tag_env, - tag_datacenter, - tag_region, - tag_host, - attributes, - service_name, - resource_attributes, - ], - ) - .unwrap(); - - // Serialize to Arrow IPC + let batch = create_test_batch_with_tags(5, &["service"]); let ipc_bytes = record_batch_to_ipc(&batch).unwrap(); // Create RawDocBatch with force_commit to trigger split production diff --git a/quickwit/quickwit-indexing/src/actors/parquet_e2e_test.rs b/quickwit/quickwit-indexing/src/actors/parquet_e2e_test.rs index 0823ce4ad04..d613fc96003 100644 --- a/quickwit/quickwit-indexing/src/actors/parquet_e2e_test.rs +++ b/quickwit/quickwit-indexing/src/actors/parquet_e2e_test.rs @@ -24,15 
+24,13 @@ use std::time::Duration; use arrow::array::{ ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array, UInt64Array, }; -use arrow::datatypes::Int32Type; +use arrow::datatypes::{DataType, Field, Int32Type, Schema as ArrowSchema}; use arrow::record_batch::RecordBatch; use bytes::Bytes; -use parquet::variant::{VariantArrayBuilder, VariantBuilderExt}; use quickwit_actors::{ActorHandle, Universe}; use quickwit_common::test_utils::wait_until_predicate; use quickwit_metastore::checkpoint::SourceCheckpointDelta; use quickwit_parquet_engine::ingest::record_batch_to_ipc; -use quickwit_parquet_engine::schema::ParquetSchema; use quickwit_parquet_engine::storage::{ParquetSplitWriter, ParquetWriterConfig}; use quickwit_proto::metastore::{EmptyResponse, MockMetastoreService}; use quickwit_proto::types::IndexUid; @@ -65,48 +63,6 @@ async fn wait_for_published_splits( .map_err(|_| anyhow::anyhow!("Timeout waiting for {} published splits", expected_splits)) } -fn create_dict_array(values: &[&str]) -> ArrayRef { - let keys: Vec = (0..values.len()).map(|i| i as i32).collect(); - let string_array = StringArray::from(values.to_vec()); - Arc::new( - DictionaryArray::::try_new(Int32Array::from(keys), Arc::new(string_array)) - .unwrap(), - ) -} - -fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef { - let keys: Vec> = values - .iter() - .enumerate() - .map(|(i, v)| v.map(|_| i as i32)) - .collect(); - let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect(); - let string_array = StringArray::from(string_values); - Arc::new( - DictionaryArray::::try_new(Int32Array::from(keys), Arc::new(string_array)) - .unwrap(), - ) -} - -fn create_variant_array(num_rows: usize, fields: Option<&[(&str, &str)]>) -> ArrayRef { - let mut builder = VariantArrayBuilder::new(num_rows); - for _ in 0..num_rows { - match fields { - Some(kv_pairs) => { - let mut obj = builder.new_object(); - for &(key, value) in kv_pairs { - obj = obj.with_field(key, 
value); - } - obj.finish(); - } - None => { - builder.append_null(); - } - } - } - ArrayRef::from(builder.build()) -} - fn create_test_batch( num_rows: usize, metric_name: &str, @@ -114,44 +70,46 @@ fn create_test_batch( base_timestamp: u64, base_value: f64, ) -> RecordBatch { - let schema = ParquetSchema::new(); - - let metric_names: Vec<&str> = vec![metric_name; num_rows]; - let metric_name_arr: ArrayRef = create_dict_array(&metric_names); + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "metric_name", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + ), + Field::new("metric_type", DataType::UInt8, false), + Field::new("timestamp_secs", DataType::UInt64, false), + Field::new("value", DataType::Float64, false), + Field::new( + "service", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + true, + ), + ])); + + let metric_name_arr: ArrayRef = { + let keys = Int32Array::from(vec![0i32; num_rows]); + let vals = StringArray::from(vec![metric_name]); + Arc::new(DictionaryArray::::try_new(keys, Arc::new(vals)).unwrap()) + }; let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows])); - let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("count"); num_rows])); let timestamps: Vec = (0..num_rows).map(|i| base_timestamp + i as u64).collect(); let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps)); - let start_timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(vec![None::; num_rows])); let values: Vec = (0..num_rows).map(|i| base_value + i as f64).collect(); let value: ArrayRef = Arc::new(Float64Array::from(values)); - let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some(service); num_rows]); - let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]); - let tag_datacenter: ArrayRef = create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]); - let tag_region: ArrayRef = 
create_nullable_dict_array(&vec![None; num_rows]); - let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]); - let attributes: ArrayRef = create_variant_array(num_rows, None); - let service_names: Vec<&str> = vec![service; num_rows]; - let service_name: ArrayRef = create_dict_array(&service_names); - let resource_attributes: ArrayRef = create_variant_array(num_rows, None); + let service_arr: ArrayRef = { + let keys = Int32Array::from(vec![0i32; num_rows]); + let vals = StringArray::from(vec![service]); + Arc::new(DictionaryArray::::try_new(keys, Arc::new(vals)).unwrap()) + }; RecordBatch::try_new( - schema.arrow_schema().clone(), + schema, vec![ metric_name_arr, metric_type, - metric_unit, timestamp_secs, - start_timestamp_secs, value, - tag_service, - tag_env, - tag_datacenter, - tag_region, - tag_host, - attributes, - service_name, - resource_attributes, + service_arr, ], ) .unwrap() @@ -213,9 +171,8 @@ async fn test_metrics_pipeline_e2e() { let (uploader_mailbox, _uploader_handle) = universe.spawn_builder().spawn(uploader); // ParquetPackager between indexer and uploader - let parquet_schema = ParquetSchema::new(); let writer_config = ParquetWriterConfig::default(); - let split_writer = ParquetSplitWriter::new(parquet_schema, writer_config, temp_dir.path()); + let split_writer = ParquetSplitWriter::new(writer_config, temp_dir.path()); let packager = ParquetPackager::new(split_writer, uploader_mailbox); let (packager_mailbox, packager_handle) = universe.spawn_builder().spawn(packager); diff --git a/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs b/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs index 2738a42163f..254ff2dc719 100644 --- a/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs +++ b/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs @@ -530,16 +530,10 @@ mod tests { use std::sync::atomic::Ordering; use std::time::Duration; - use arrow::array::{ - ArrayRef, DictionaryArray, Float64Array, 
Int32Array, StringArray, UInt8Array, UInt64Array, - }; - use arrow::datatypes::Int32Type; - use arrow::record_batch::RecordBatch; - use parquet::variant::{VariantArrayBuilder, VariantBuilderExt}; use quickwit_actors::{ActorHandle, Universe}; use quickwit_common::test_utils::wait_until_predicate; - use quickwit_parquet_engine::schema::ParquetSchema; use quickwit_parquet_engine::storage::{ParquetSplitWriter, ParquetWriterConfig}; + use quickwit_parquet_engine::test_helpers::create_test_batch; use quickwit_proto::metastore::{EmptyResponse, MockMetastoreService}; use quickwit_storage::RamStorage; @@ -598,9 +592,8 @@ mod tests { temp_dir: &std::path::Path, uploader_mailbox: Mailbox, ) -> (Mailbox, ActorHandle) { - let schema = ParquetSchema::new(); let writer_config = ParquetWriterConfig::default(); - let split_writer = ParquetSplitWriter::new(schema, writer_config, temp_dir); + let split_writer = ParquetSplitWriter::new(writer_config, temp_dir); let packager = ParquetPackager::new(split_writer, uploader_mailbox); universe.spawn_builder().spawn(packager) @@ -624,97 +617,6 @@ mod tests { .map_err(|_| anyhow::anyhow!("Timeout waiting for {} staged splits", expected_splits)) } - /// Create dictionary array for string fields with Int32 keys. - fn create_dict_array(values: &[&str]) -> ArrayRef { - let keys: Vec = (0..values.len()).map(|i| i as i32).collect(); - let string_array = StringArray::from(values.to_vec()); - Arc::new( - DictionaryArray::::try_new(Int32Array::from(keys), Arc::new(string_array)) - .unwrap(), - ) - } - - /// Create nullable dictionary array for optional string fields. 
- fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef { - let keys: Vec> = values - .iter() - .enumerate() - .map(|(i, v)| v.map(|_| i as i32)) - .collect(); - let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect(); - let string_array = StringArray::from(string_values); - Arc::new( - DictionaryArray::::try_new(Int32Array::from(keys), Arc::new(string_array)) - .unwrap(), - ) - } - - /// Create a VARIANT array for testing with specified number of rows. - fn create_variant_array(num_rows: usize, fields: Option<&[(&str, &str)]>) -> ArrayRef { - let mut builder = VariantArrayBuilder::new(num_rows); - for _ in 0..num_rows { - match fields { - Some(kv_pairs) => { - let mut obj = builder.new_object(); - for &(key, value) in kv_pairs { - obj = obj.with_field(key, value); - } - obj.finish(); - } - None => { - builder.append_null(); - } - } - } - ArrayRef::from(builder.build()) - } - - /// Create a test batch matching the metrics schema. - fn create_test_batch(num_rows: usize) -> RecordBatch { - let schema = ParquetSchema::new(); - - let metric_names: Vec<&str> = vec!["cpu.usage"; num_rows]; - let metric_name: ArrayRef = create_dict_array(&metric_names); - let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows])); - let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes"); num_rows])); - let timestamps: Vec = (0..num_rows).map(|i| 100 + i as u64).collect(); - let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps)); - let start_timestamp_secs: ArrayRef = - Arc::new(UInt64Array::from(vec![None::; num_rows])); - let values: Vec = (0..num_rows).map(|i| 42.0 + i as f64).collect(); - let value: ArrayRef = Arc::new(Float64Array::from(values)); - let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]); - let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]); - let tag_datacenter: ArrayRef = - create_nullable_dict_array(&vec![Some("us-east-1"); 
num_rows]); - let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]); - let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]); - let attributes: ArrayRef = create_variant_array(num_rows, None); - let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]); - let resource_attributes: ArrayRef = create_variant_array(num_rows, None); - - RecordBatch::try_new( - schema.arrow_schema().clone(), - vec![ - metric_name, - metric_type, - metric_unit, - timestamp_secs, - start_timestamp_secs, - value, - tag_service, - tag_env, - tag_datacenter, - tag_region, - tag_host, - attributes, - service_name, - resource_attributes, - ], - ) - .unwrap() - } - #[tokio::test] async fn test_metrics_indexer_receives_batch() { let universe = Universe::with_accelerated_time(); diff --git a/quickwit/quickwit-indexing/src/actors/parquet_packager.rs b/quickwit/quickwit-indexing/src/actors/parquet_packager.rs index a23b88789e4..b0950141b10 100644 --- a/quickwit/quickwit-indexing/src/actors/parquet_packager.rs +++ b/quickwit/quickwit-indexing/src/actors/parquet_packager.rs @@ -235,110 +235,17 @@ mod tests { use std::sync::atomic::Ordering as AtomicOrdering; use std::time::Duration; - use arrow::array::{ - ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array, UInt64Array, - }; - use arrow::datatypes::Int32Type; - use arrow::record_batch::RecordBatch; - use parquet::variant::{VariantArrayBuilder, VariantBuilderExt}; use quickwit_actors::{ActorHandle, Universe}; use quickwit_common::test_utils::wait_until_predicate; use quickwit_metastore::checkpoint::{IndexCheckpointDelta, SourceCheckpointDelta}; - use quickwit_parquet_engine::schema::ParquetSchema; use quickwit_parquet_engine::storage::ParquetWriterConfig; + use quickwit_parquet_engine::test_helpers::create_test_batch; use quickwit_proto::metastore::{EmptyResponse, MockMetastoreService}; use quickwit_storage::RamStorage; use super::*; use 
crate::actors::{ParquetPublisher, SplitsUpdateMailbox, UploaderType}; - fn create_dict_array(values: &[&str]) -> ArrayRef { - let keys: Vec = (0..values.len()).map(|i| i as i32).collect(); - let string_array = StringArray::from(values.to_vec()); - Arc::new( - DictionaryArray::::try_new(Int32Array::from(keys), Arc::new(string_array)) - .unwrap(), - ) - } - - fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef { - let keys: Vec> = values - .iter() - .enumerate() - .map(|(i, v)| v.map(|_| i as i32)) - .collect(); - let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect(); - let string_array = StringArray::from(string_values); - Arc::new( - DictionaryArray::::try_new(Int32Array::from(keys), Arc::new(string_array)) - .unwrap(), - ) - } - - fn create_variant_array(num_rows: usize, fields: Option<&[(&str, &str)]>) -> ArrayRef { - let mut builder = VariantArrayBuilder::new(num_rows); - for _ in 0..num_rows { - match fields { - Some(kv_pairs) => { - let mut obj = builder.new_object(); - for &(key, value) in kv_pairs { - obj = obj.with_field(key, value); - } - obj.finish(); - } - None => { - builder.append_null(); - } - } - } - ArrayRef::from(builder.build()) - } - - fn create_test_batch(num_rows: usize) -> RecordBatch { - let schema = ParquetSchema::new(); - - let metric_names: Vec<&str> = vec!["cpu.usage"; num_rows]; - let metric_name: ArrayRef = create_dict_array(&metric_names); - let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows])); - let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes"); num_rows])); - let timestamps: Vec = (0..num_rows).map(|i| 100 + i as u64).collect(); - let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps)); - let start_timestamp_secs: ArrayRef = - Arc::new(UInt64Array::from(vec![None::; num_rows])); - let values: Vec = (0..num_rows).map(|i| 42.0 + i as f64).collect(); - let value: ArrayRef = Arc::new(Float64Array::from(values)); - let tag_service: ArrayRef = 
create_nullable_dict_array(&vec![Some("web"); num_rows]); - let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]); - let tag_datacenter: ArrayRef = - create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]); - let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]); - let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]); - let attributes: ArrayRef = create_variant_array(num_rows, None); - let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]); - let resource_attributes: ArrayRef = create_variant_array(num_rows, None); - - RecordBatch::try_new( - schema.arrow_schema().clone(), - vec![ - metric_name, - metric_type, - metric_unit, - timestamp_secs, - start_timestamp_secs, - value, - tag_service, - tag_env, - tag_datacenter, - tag_region, - tag_host, - attributes, - service_name, - resource_attributes, - ], - ) - .unwrap() - } - fn create_test_uploader( universe: &Universe, ) -> (Mailbox, ActorHandle) { @@ -366,9 +273,8 @@ mod tests { temp_dir: &std::path::Path, uploader_mailbox: Mailbox, ) -> (Mailbox, ActorHandle) { - let schema = ParquetSchema::new(); let writer_config = ParquetWriterConfig::default(); - let split_writer = ParquetSplitWriter::new(schema, writer_config, temp_dir); + let split_writer = ParquetSplitWriter::new(writer_config, temp_dir); let packager = ParquetPackager::new(split_writer, uploader_mailbox); universe.spawn_builder().spawn(packager) diff --git a/quickwit/quickwit-indexing/src/actors/uploader.rs b/quickwit/quickwit-indexing/src/actors/uploader.rs index 1827af7a153..448b7a4e312 100644 --- a/quickwit/quickwit-indexing/src/actors/uploader.rs +++ b/quickwit/quickwit-indexing/src/actors/uploader.rs @@ -66,7 +66,7 @@ pub enum UploaderType { /// [`SplitsUpdateMailbox`] wraps either a [`Mailbox>`] or [`Mailbox

`]. /// /// It makes it possible to send a splits update either to the [`Sequencer`] or directly -/// to the publisher actor `P`. It is used in combination with [`SplitsUpdateSender`] that +/// to the publisher actor `P`. It is used in combination with `SplitsUpdateSender` that /// will do the send. /// /// This is useful as we have different requirements between the indexing pipeline and diff --git a/quickwit/quickwit-indexing/src/models/processed_parquet_batch.rs b/quickwit/quickwit-indexing/src/models/processed_parquet_batch.rs index 0db83abcc02..c70afe35976 100644 --- a/quickwit/quickwit-indexing/src/models/processed_parquet_batch.rs +++ b/quickwit/quickwit-indexing/src/models/processed_parquet_batch.rs @@ -96,112 +96,10 @@ impl fmt::Debug for ProcessedParquetBatch { #[cfg(test)] mod tests { - use std::sync::Arc; - - use arrow::array::{ - ArrayRef, BinaryViewArray, DictionaryArray, Float64Array, Int32Array, StringArray, - StructArray, UInt8Array, UInt64Array, - }; - use arrow::datatypes::{DataType, Field, Int32Type}; - use quickwit_parquet_engine::schema::ParquetSchema; + use quickwit_parquet_engine::test_helpers::create_test_batch; use super::*; - /// Create dictionary array for string fields with Int32 keys. - fn create_dict_array(values: &[&str]) -> ArrayRef { - let keys: Vec = (0..values.len()).map(|i| i as i32).collect(); - let string_array = StringArray::from(values.to_vec()); - Arc::new( - DictionaryArray::::try_new(Int32Array::from(keys), Arc::new(string_array)) - .unwrap(), - ) - } - - /// Create nullable dictionary array for optional string fields. 
- fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef { - let keys: Vec> = values - .iter() - .enumerate() - .map(|(i, v)| v.map(|_| i as i32)) - .collect(); - let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect(); - let string_array = StringArray::from(string_values); - Arc::new( - DictionaryArray::::try_new(Int32Array::from(keys), Arc::new(string_array)) - .unwrap(), - ) - } - - /// Create a test batch matching the metrics schema. - fn create_test_batch(num_rows: usize) -> RecordBatch { - let schema = ParquetSchema::new(); - - let metric_names: Vec<&str> = vec!["cpu.usage"; num_rows]; - let metric_name: ArrayRef = create_dict_array(&metric_names); - let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows])); - let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes"); num_rows])); - let timestamps: Vec = (0..num_rows).map(|i| 100 + i as u64).collect(); - let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps)); - let start_timestamp_secs: ArrayRef = - Arc::new(UInt64Array::from(vec![None::; num_rows])); - let values: Vec = (0..num_rows).map(|i| 42.0 + i as f64).collect(); - let value: ArrayRef = Arc::new(Float64Array::from(values)); - let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]); - let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]); - let tag_datacenter: ArrayRef = - create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]); - let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]); - let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]); - - // Create empty Variant (Struct with metadata and value BinaryView fields) - let metadata_array = Arc::new(BinaryViewArray::from(vec![b"" as &[u8]; num_rows])); - let value_array = Arc::new(BinaryViewArray::from(vec![b"" as &[u8]; num_rows])); - let attributes: ArrayRef = Arc::new(StructArray::from(vec![ - ( - 
Arc::new(Field::new("metadata", DataType::BinaryView, false)), - metadata_array.clone() as ArrayRef, - ), - ( - Arc::new(Field::new("value", DataType::BinaryView, false)), - value_array.clone() as ArrayRef, - ), - ])); - - let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]); - - let resource_attributes: ArrayRef = Arc::new(StructArray::from(vec![ - ( - Arc::new(Field::new("metadata", DataType::BinaryView, false)), - metadata_array as ArrayRef, - ), - ( - Arc::new(Field::new("value", DataType::BinaryView, false)), - value_array as ArrayRef, - ), - ])); - - RecordBatch::try_new( - schema.arrow_schema().clone(), - vec![ - metric_name, - metric_type, - metric_unit, - timestamp_secs, - start_timestamp_secs, - value, - tag_service, - tag_env, - tag_datacenter, - tag_region, - tag_host, - attributes, - service_name, - resource_attributes, - ], - ) - .unwrap() - } - #[test] fn test_processed_parquet_batch_new() { let batch = create_test_batch(10); diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs index 6fd5ce244be..e35618b99f8 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! [`FileBackedIndex`] module. It is public so that the crate `quickwit-backward-compat` can -//! import [`FileBackedIndex`] and run backward-compatibility tests. You should not have to import +//! `FileBackedIndex` module. It is public so that the crate `quickwit-backward-compat` can +//! import `FileBackedIndex` and run backward-compatibility tests. You should not have to import //! anything from here directly. 
mod serialize; diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/lazy_file_backed_index.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/lazy_file_backed_index.rs index c13711de0b2..43979334f55 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/lazy_file_backed_index.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/lazy_file_backed_index.rs @@ -24,7 +24,7 @@ use tracing::error; use super::file_backed_index::FileBackedIndex; use super::store_operations::{METASTORE_FILE_NAME, load_index}; -/// Lazy [`FileBackedIndex`]. It loads a `FileBackedIndex` on demand. When the index is first +/// Lazy `FileBackedIndex`. It loads a `FileBackedIndex` on demand. When the index is first /// loaded, it optionally spawns a task to periodically poll the storage and update the index. pub(crate) struct LazyFileBackedIndex { index_id: IndexId, diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs index 00791488e65..74e0f05abae 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs @@ -13,7 +13,7 @@ // limitations under the License. //! Module for [`FileBackedMetastore`]. It is public so that the crate `quickwit-backward-compat` -//! can import [`FileBackedIndex`] and run backward-compatibility tests. You should not have to +//! can import `FileBackedIndex` and run backward-compatibility tests. You should not have to //! import anything from here directly. pub mod file_backed_index; @@ -116,9 +116,9 @@ impl From for MutationOccurred<()> { /// into as many files and stores a map of indexes /// (index_id, index_status) in a dedicated file `manifest.json`. 
/// -/// A [`LazyIndexStatus`] describes the lifecycle of an index: [`LazyIndexStatus::Creating`] and -/// [`LazyIndexStatus::Deleting`] are transitioning states that indicates that the index is not -/// yet available. On the contrary, the [`LazyIndexStatus::Active`] status indicates the index is +/// A `LazyIndexStatus` describes the lifecycle of an index: `LazyIndexStatus::Creating` and +/// `LazyIndexStatus::Deleting` are transitioning states that indicate that the index is not +/// yet available. On the contrary, the `LazyIndexStatus::Active` status indicates the index is /// ready to be fetched and updated. /// /// Transitioning states are useful to track inconsistencies between the in-memory and on-disk data diff --git a/quickwit/quickwit-metastore/src/metastore_resolver.rs b/quickwit/quickwit-metastore/src/metastore_resolver.rs index 7793fdbbf45..265d9069f3c 100644 --- a/quickwit/quickwit-metastore/src/metastore_resolver.rs +++ b/quickwit/quickwit-metastore/src/metastore_resolver.rs @@ -45,7 +45,7 @@ impl fmt::Debug for MetastoreResolver { } impl MetastoreResolver { - /// Creates an empty [`MetastoreResolverBuilder`]. + /// Creates an empty `MetastoreResolverBuilder`. 
pub fn builder() -> MetastoreResolverBuilder { MetastoreResolverBuilder::default() } diff --git a/quickwit/quickwit-opentelemetry/Cargo.toml b/quickwit/quickwit-opentelemetry/Cargo.toml index d6e897990a2..a7432ac5403 100644 --- a/quickwit/quickwit-opentelemetry/Cargo.toml +++ b/quickwit/quickwit-opentelemetry/Cargo.toml @@ -15,7 +15,6 @@ anyhow = { workspace = true } arrow = { workspace = true } async-trait = { workspace = true } once_cell = { workspace = true } -parquet = { workspace = true } prost = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/quickwit/quickwit-opentelemetry/src/otlp/arrow_metrics.rs b/quickwit/quickwit-opentelemetry/src/otlp/arrow_metrics.rs index 1811a63a909..d36c2f1b98b 100644 --- a/quickwit/quickwit-opentelemetry/src/otlp/arrow_metrics.rs +++ b/quickwit/quickwit-opentelemetry/src/otlp/arrow_metrics.rs @@ -12,272 +12,138 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! Arrow-based batch building for metrics with dictionary encoding. +//! Arrow-based batch building for metrics with dynamic schema discovery. //! //! This module provides Arrow RecordBatch construction with dictionary-encoded -//! string columns for efficient storage of metrics with low cardinality tags. +//! string columns for efficient storage of metrics with dynamic tag keys. +//! The schema is discovered at `finish()` time by scanning all accumulated +//! data points for the union of tag keys. 
+use std::collections::BTreeSet; use std::io::Cursor; use std::sync::Arc; use arrow::array::{ - ArrayBuilder, ArrayRef, Float64Builder, RecordBatch, StringBuilder, StringDictionaryBuilder, - UInt8Builder, UInt64Builder, + ArrayRef, Float64Builder, RecordBatch, StringDictionaryBuilder, UInt8Builder, UInt64Builder, }; -use arrow::datatypes::{DataType, Field, Fields, Int32Type, Schema as ArrowSchema}; +use arrow::datatypes::{DataType, Field, Int32Type, Schema as ArrowSchema}; use arrow::ipc::reader::StreamReader; use arrow::ipc::writer::StreamWriter; -use parquet::variant::{VariantArrayBuilder, VariantBuilderExt, VariantType}; use quickwit_proto::bytes::Bytes; use quickwit_proto::ingest::{DocBatchV2, DocFormat}; use quickwit_proto::types::DocUid; use super::otel_metrics::{MetricDataPoint, MetricType}; -/// Creates the Arrow schema for metrics with dictionary-encoded string columns. -/// -/// Dictionary encoding stores unique string values once and references them by -/// integer index, providing significant compression for low cardinality tag values. -pub fn metrics_arrow_schema() -> ArrowSchema { - ArrowSchema::new(vec![ - // Dictionary-encoded string columns for low cardinality fields - Field::new( - "metric_name", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - false, - ), - // MetricType enum stored as UInt8 (only ~5 possible values) - Field::new("metric_type", DataType::UInt8, false), - Field::new("metric_unit", DataType::Utf8, true), - // Measurement timestamp in seconds since Unix epoch. 
- Field::new("timestamp_secs", DataType::UInt64, false), - Field::new("start_timestamp_secs", DataType::UInt64, true), - Field::new("value", DataType::Float64, false), - // Dictionary-encoded tag columns (low cardinality expected) - Field::new( - "tag_service", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - true, - ), - Field::new( - "tag_env", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - true, - ), - Field::new( - "tag_datacenter", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - true, - ), - Field::new( - "tag_region", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - true, - ), - Field::new( - "tag_host", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - true, - ), - // VARIANT fields for semi-structured attributes - // VariantArrayBuilder produces BinaryView fields, not Binary - Field::new( - "attributes", - DataType::Struct(Fields::from(vec![ - Field::new("metadata", DataType::BinaryView, false), - Field::new("value", DataType::BinaryView, false), - ])), - true, - ) - .with_extension_type(VariantType), - // Service name (low cardinality) - Field::new( - "service_name", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - false, - ), - Field::new( - "resource_attributes", - DataType::Struct(Fields::from(vec![ - Field::new("metadata", DataType::BinaryView, false), - Field::new("value", DataType::BinaryView, false), - ])), - true, - ) - .with_extension_type(VariantType), - ]) -} - /// Builder for creating Arrow RecordBatch from MetricDataPoints. /// -/// Uses dictionary encoding for low cardinality string columns -/// (tags, service names, metric names) to achieve significant compression. -/// Uses VARIANT encoding for semi-structured attributes. +/// Accumulates data points and discovers the schema dynamically at `finish()` +/// time. 
Uses dictionary encoding for string columns (metric_name, all tags). pub struct ArrowMetricsBatchBuilder { - metric_name: StringDictionaryBuilder, - metric_type: UInt8Builder, - metric_unit: StringBuilder, - timestamp_secs: UInt64Builder, - start_timestamp_secs: UInt64Builder, - value: Float64Builder, - tag_service: StringDictionaryBuilder, - tag_env: StringDictionaryBuilder, - tag_datacenter: StringDictionaryBuilder, - tag_region: StringDictionaryBuilder, - tag_host: StringDictionaryBuilder, - attributes: VariantArrayBuilder, - service_name: StringDictionaryBuilder, - resource_attributes: VariantArrayBuilder, + data_points: Vec, } impl ArrowMetricsBatchBuilder { /// Creates a new builder with pre-allocated capacity. pub fn with_capacity(capacity: usize) -> Self { Self { - metric_name: StringDictionaryBuilder::new(), - metric_type: UInt8Builder::with_capacity(capacity), - metric_unit: StringBuilder::with_capacity(capacity, capacity * 8), - timestamp_secs: UInt64Builder::with_capacity(capacity), - start_timestamp_secs: UInt64Builder::with_capacity(capacity), - value: Float64Builder::with_capacity(capacity), - tag_service: StringDictionaryBuilder::new(), - tag_env: StringDictionaryBuilder::new(), - tag_datacenter: StringDictionaryBuilder::new(), - tag_region: StringDictionaryBuilder::new(), - tag_host: StringDictionaryBuilder::new(), - attributes: VariantArrayBuilder::new(capacity), - service_name: StringDictionaryBuilder::new(), - resource_attributes: VariantArrayBuilder::new(capacity), + data_points: Vec::with_capacity(capacity), } } /// Appends a MetricDataPoint to the batch. 
- pub fn append(&mut self, data_point: &MetricDataPoint) { - self.metric_name.append_value(&data_point.metric_name); - self.metric_type.append_value(data_point.metric_type as u8); + pub fn append(&mut self, data_point: MetricDataPoint) { + self.data_points.push(data_point); + } - match &data_point.metric_unit { - Some(unit) => self.metric_unit.append_value(unit), - None => self.metric_unit.append_null(), + /// Finalizes and returns the RecordBatch. + /// + /// Performs two passes: + /// 1. Schema discovery: scans all data points to collect the union of tag keys. + /// 2. Array building: creates per-column builders and populates them. + pub fn finish(self) -> RecordBatch { + let num_rows = self.data_points.len(); + + // Pass 1: discover all tag keys across all data points. + let mut tag_keys: BTreeSet<&str> = BTreeSet::new(); + for dp in &self.data_points { + for key in dp.tags.keys() { + tag_keys.insert(key.as_str()); + } } + let sorted_tag_keys: Vec<&str> = tag_keys.into_iter().collect(); - self.timestamp_secs.append_value(data_point.timestamp_secs); - match data_point.start_timestamp_secs { - Some(ts) => self.start_timestamp_secs.append_value(ts), - None => self.start_timestamp_secs.append_null(), + // Build the Arrow schema dynamically + let mut fields = Vec::with_capacity(4 + sorted_tag_keys.len()); + fields.push(Field::new( + "metric_name", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + )); + fields.push(Field::new("metric_type", DataType::UInt8, false)); + fields.push(Field::new("timestamp_secs", DataType::UInt64, false)); + fields.push(Field::new("value", DataType::Float64, false)); + + for &tag_key in &sorted_tag_keys { + fields.push(Field::new( + tag_key, + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + true, + )); } - self.value.append_value(data_point.value); - - append_optional_dict(&mut self.tag_service, &data_point.tag_service); - append_optional_dict(&mut self.tag_env, 
&data_point.tag_env); - append_optional_dict(&mut self.tag_datacenter, &data_point.tag_datacenter); - append_optional_dict(&mut self.tag_region, &data_point.tag_region); - append_optional_dict(&mut self.tag_host, &data_point.tag_host); - - if data_point.attributes.is_empty() { - self.attributes.append_null(); - } else { - append_variant_object(&mut self.attributes, &data_point.attributes); + + let schema = Arc::new(ArrowSchema::new(fields)); + + // Pass 2: build arrays + let mut metric_name_builder: StringDictionaryBuilder = + StringDictionaryBuilder::new(); + let mut metric_type_builder = UInt8Builder::with_capacity(num_rows); + let mut timestamp_secs_builder = UInt64Builder::with_capacity(num_rows); + let mut value_builder = Float64Builder::with_capacity(num_rows); + + let mut tag_builders: Vec> = sorted_tag_keys + .iter() + .map(|_| StringDictionaryBuilder::new()) + .collect(); + + for dp in &self.data_points { + metric_name_builder.append_value(&dp.metric_name); + metric_type_builder.append_value(dp.metric_type as u8); + timestamp_secs_builder.append_value(dp.timestamp_secs); + value_builder.append_value(dp.value); + + for (tag_idx, tag_key) in sorted_tag_keys.iter().enumerate() { + match dp.tags.get(*tag_key) { + Some(tag_val) => tag_builders[tag_idx].append_value(tag_val), + None => tag_builders[tag_idx].append_null(), + } + } } - self.service_name.append_value(&data_point.service_name); + let mut arrays: Vec = Vec::with_capacity(4 + sorted_tag_keys.len()); + arrays.push(Arc::new(metric_name_builder.finish())); + arrays.push(Arc::new(metric_type_builder.finish())); + arrays.push(Arc::new(timestamp_secs_builder.finish())); + arrays.push(Arc::new(value_builder.finish())); - if data_point.resource_attributes.is_empty() { - self.resource_attributes.append_null(); - } else { - append_variant_object( - &mut self.resource_attributes, - &data_point.resource_attributes, - ); + for tag_builder in &mut tag_builders { + arrays.push(Arc::new(tag_builder.finish())); } - } 
- /// Finalizes and returns the RecordBatch. - pub fn finish(mut self) -> RecordBatch { - // Build variant arrays and convert to ArrayRef - let attributes_array = self.attributes.build(); - let resource_attributes_array = self.resource_attributes.build(); - - let arrays: Vec = vec![ - Arc::new(self.metric_name.finish()), - Arc::new(self.metric_type.finish()), - Arc::new(self.metric_unit.finish()), - Arc::new(self.timestamp_secs.finish()), - Arc::new(self.start_timestamp_secs.finish()), - Arc::new(self.value.finish()), - Arc::new(self.tag_service.finish()), - Arc::new(self.tag_env.finish()), - Arc::new(self.tag_datacenter.finish()), - Arc::new(self.tag_region.finish()), - Arc::new(self.tag_host.finish()), - ArrayRef::from(attributes_array), - Arc::new(self.service_name.finish()), - ArrayRef::from(resource_attributes_array), - ]; - - RecordBatch::try_new(Arc::new(metrics_arrow_schema()), arrays) - .expect("record batch should match Arrow schema") + RecordBatch::try_new(schema, arrays).expect("record batch should match Arrow schema") } /// Returns the number of rows appended so far. pub fn len(&self) -> usize { - self.timestamp_secs.len() + self.data_points.len() } /// Returns true if no rows have been appended. pub fn is_empty(&self) -> bool { - self.len() == 0 + self.data_points.is_empty() } } -/// Helper to append optional string values to dictionary builder. -fn append_optional_dict(builder: &mut StringDictionaryBuilder, value: &Option) { - match value { - Some(s) => builder.append_value(s), - None => builder.append_null(), - } -} - -/// Helper to append a HashMap as a VARIANT object to the builder. 
-fn append_variant_object( - builder: &mut VariantArrayBuilder, - map: &std::collections::HashMap, -) { - // Use a macro-like approach with fold to build the object - // We need to chain with_field calls which consume and return the builder - let obj_builder = builder.new_object(); - - // Build object by folding over the map entries - let final_builder = map.iter().fold(obj_builder, |b, (key, value)| { - match value { - serde_json::Value::Null => b.with_field(key.as_str(), ()), - serde_json::Value::Bool(v) => b.with_field(key.as_str(), *v), - serde_json::Value::Number(n) => { - if let Some(i) = n.as_i64() { - b.with_field(key.as_str(), i) - } else if let Some(f) = n.as_f64() { - b.with_field(key.as_str(), f) - } else { - b.with_field(key.as_str(), ()) - } - } - serde_json::Value::String(s) => b.with_field(key.as_str(), s.as_str()), - serde_json::Value::Array(arr) => { - // For arrays, serialize to JSON string as fallback - let json_str = serde_json::to_string(arr).unwrap_or_default(); - b.with_field(key.as_str(), json_str.as_str()) - } - serde_json::Value::Object(obj) => { - // For nested objects, serialize to JSON string as fallback - let json_str = serde_json::to_string(obj).unwrap_or_default(); - b.with_field(key.as_str(), json_str.as_str()) - } - } - }); - - final_builder.finish(); -} - /// Error type for Arrow IPC operations. 
#[derive(Debug, thiserror::Error)] pub enum ArrowIpcError { @@ -462,32 +328,26 @@ impl ArrowDocBatchV2Builder { mod tests { use std::collections::HashMap; - use serde_json::Value as JsonValue; - use super::*; fn make_test_data_point() -> MetricDataPoint { + let mut tags = HashMap::new(); + tags.insert("service".to_string(), "api".to_string()); + tags.insert("env".to_string(), "prod".to_string()); + tags.insert("datacenter".to_string(), "us-east-1a".to_string()); + tags.insert("region".to_string(), "us-east-1".to_string()); + tags.insert("host".to_string(), "server-001".to_string()); + tags.insert("endpoint".to_string(), "/health".to_string()); + tags.insert("metric_unit".to_string(), "%".to_string()); + tags.insert("start_timestamp_secs".to_string(), "1704067190".to_string()); + tags.insert("service_name".to_string(), "api-service".to_string()); + MetricDataPoint { metric_name: "cpu.usage".to_string(), metric_type: MetricType::Gauge, - metric_unit: Some("%".to_string()), timestamp_secs: 1704067200, - start_timestamp_secs: Some(1704067190), value: 85.5, - tag_service: Some("api".to_string()), - tag_env: Some("prod".to_string()), - tag_datacenter: Some("us-east-1a".to_string()), - tag_region: Some("us-east-1".to_string()), - tag_host: Some("server-001".to_string()), - attributes: HashMap::from([( - "endpoint".to_string(), - JsonValue::String("/health".to_string()), - )]), - service_name: "api-service".to_string(), - resource_attributes: HashMap::from([( - "k8s.pod".to_string(), - JsonValue::String("pod-123".to_string()), - )]), + tags, } } @@ -495,38 +355,36 @@ mod tests { fn test_arrow_batch_builder_single_row() { let dp = make_test_data_point(); let mut builder = ArrowMetricsBatchBuilder::with_capacity(1); - builder.append(&dp); + builder.append(dp); assert_eq!(builder.len(), 1); assert!(!builder.is_empty()); let batch = builder.finish(); assert_eq!(batch.num_rows(), 1); - assert_eq!(batch.num_columns(), 14); + // 4 fixed columns + 9 tag columns + 
assert_eq!(batch.num_columns(), 13); } #[test] fn test_arrow_batch_builder_multiple_rows() { let mut builder = ArrowMetricsBatchBuilder::with_capacity(100); - for i in 0..100 { + for idx in 0..100 { + let mut tags = HashMap::new(); + tags.insert("service".to_string(), format!("service-{}", idx % 10)); + tags.insert("env".to_string(), "prod".to_string()); + tags.insert("host".to_string(), format!("host-{}", idx % 5)); + tags.insert("service_name".to_string(), "test-service".to_string()); + let dp = MetricDataPoint { metric_name: "test.metric".to_string(), metric_type: MetricType::Gauge, - metric_unit: None, - timestamp_secs: 1704067200 + i as u64, - start_timestamp_secs: None, - value: i as f64 * 0.1, - tag_service: Some(format!("service-{}", i % 10)), // 10 unique values - tag_env: Some("prod".to_string()), // 1 unique value - tag_datacenter: None, - tag_region: None, - tag_host: Some(format!("host-{}", i % 5)), // 5 unique values - attributes: HashMap::new(), - service_name: "test-service".to_string(), - resource_attributes: HashMap::new(), + timestamp_secs: 1704067200 + idx as u64, + value: idx as f64 * 0.1, + tags, }; - builder.append(&dp); + builder.append(dp); } assert_eq!(builder.len(), 100); @@ -539,89 +397,108 @@ mod tests { let mut builder = ArrowMetricsBatchBuilder::with_capacity(1000); // Create 1000 data points with only 10 unique service values - for i in 0..1000 { + for idx in 0..1000 { + let mut tags = HashMap::new(); + tags.insert("service".to_string(), format!("service-{}", idx % 10)); + tags.insert("env".to_string(), "prod".to_string()); + tags.insert("datacenter".to_string(), format!("dc-{}", idx % 4)); + tags.insert("service_name".to_string(), format!("svc-{}", idx % 5)); + let dp = MetricDataPoint { metric_name: "test.metric".to_string(), metric_type: MetricType::Gauge, - metric_unit: None, - timestamp_secs: 1704067200 + i as u64, - start_timestamp_secs: None, - value: i as f64, - tag_service: Some(format!("service-{}", i % 10)), - tag_env: 
Some("prod".to_string()), - tag_datacenter: Some(format!("dc-{}", i % 4)), - tag_region: None, - tag_host: None, - attributes: HashMap::new(), - service_name: format!("svc-{}", i % 5), - resource_attributes: HashMap::new(), + timestamp_secs: 1704067200 + idx as u64, + value: idx as f64, + tags, }; - builder.append(&dp); + builder.append(dp); } let batch = builder.finish(); assert_eq!(batch.num_rows(), 1000); // Verify the batch was created successfully with dictionary encoding - // The dictionary arrays should have far fewer unique values than rows let schema = batch.schema(); - // Check that tag_service uses dictionary encoding - let tag_service_field = schema.field_with_name("tag_service").unwrap(); + // Check that the service tag uses dictionary encoding + let service_field = schema.field_with_name("service").unwrap(); assert!(matches!( - tag_service_field.data_type(), + service_field.data_type(), DataType::Dictionary(_, _) )); } #[test] fn test_null_handling() { + let mut tags = HashMap::new(); + tags.insert("service_name".to_string(), "unknown".to_string()); + let dp = MetricDataPoint { metric_name: "minimal.metric".to_string(), metric_type: MetricType::Gauge, - metric_unit: None, // null timestamp_secs: 1704067200, - start_timestamp_secs: None, // null value: 0.0, - tag_service: None, // null - tag_env: None, // null - tag_datacenter: None, - tag_region: None, - tag_host: None, - attributes: HashMap::new(), // empty -> null - service_name: "unknown".to_string(), - resource_attributes: HashMap::new(), // empty -> null + tags, }; let mut builder = ArrowMetricsBatchBuilder::with_capacity(1); - builder.append(&dp); + builder.append(dp); let batch = builder.finish(); assert_eq!(batch.num_rows(), 1); - // The batch should handle nulls correctly + // 4 fixed columns + 1 tag column (service_name) + assert_eq!(batch.num_columns(), 5); } #[test] - fn test_schema_field_count() { - let schema = metrics_arrow_schema(); - assert_eq!(schema.fields().len(), 14); + fn 
test_dynamic_schema_discovery() { + let mut builder = ArrowMetricsBatchBuilder::with_capacity(2); - // Verify field names + // First data point has tags: env, host + let mut tags1 = HashMap::new(); + tags1.insert("env".to_string(), "prod".to_string()); + tags1.insert("host".to_string(), "server-1".to_string()); + + builder.append(MetricDataPoint { + metric_name: "metric.a".to_string(), + metric_type: MetricType::Gauge, + timestamp_secs: 1704067200, + value: 1.0, + tags: tags1, + }); + + // Second data point has tags: env, region (different set) + let mut tags2 = HashMap::new(); + tags2.insert("env".to_string(), "staging".to_string()); + tags2.insert("region".to_string(), "us-west".to_string()); + + builder.append(MetricDataPoint { + metric_name: "metric.b".to_string(), + metric_type: MetricType::Sum, + timestamp_secs: 1704067201, + value: 2.0, + tags: tags2, + }); + + let batch = builder.finish(); + assert_eq!(batch.num_rows(), 2); + // 4 fixed + 3 tag columns (env, host, region) - sorted alphabetically + assert_eq!(batch.num_columns(), 7); + + let schema = batch.schema(); let field_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect(); - assert!(field_names.contains(&"metric_name")); - assert!(field_names.contains(&"metric_type")); - assert!(field_names.contains(&"metric_unit")); - assert!(field_names.contains(&"timestamp_secs")); - assert!(field_names.contains(&"start_timestamp_secs")); - assert!(field_names.contains(&"value")); - assert!(field_names.contains(&"tag_service")); - assert!(field_names.contains(&"tag_env")); - assert!(field_names.contains(&"tag_datacenter")); - assert!(field_names.contains(&"tag_region")); - assert!(field_names.contains(&"tag_host")); - assert!(field_names.contains(&"attributes")); - assert!(field_names.contains(&"service_name")); - assert!(field_names.contains(&"resource_attributes")); + assert_eq!( + field_names, + vec![ + "metric_name", + "metric_type", + "timestamp_secs", + "value", + "env", + "host", + 
"region", + ] + ); } #[test] @@ -638,24 +515,20 @@ mod tests { fn test_ipc_round_trip() { // Build a RecordBatch let mut builder = ArrowMetricsBatchBuilder::with_capacity(10); - for i in 0..10 { + for idx in 0..10 { + let mut tags = HashMap::new(); + tags.insert("service".to_string(), format!("service-{}", idx % 3)); + tags.insert("env".to_string(), "prod".to_string()); + tags.insert("service_name".to_string(), "test-service".to_string()); + let dp = MetricDataPoint { metric_name: "test.metric".to_string(), metric_type: MetricType::Gauge, - metric_unit: None, - timestamp_secs: 1704067200 + i as u64, - start_timestamp_secs: None, - value: i as f64 * 0.1, - tag_service: Some(format!("service-{}", i % 3)), - tag_env: Some("prod".to_string()), - tag_datacenter: None, - tag_region: None, - tag_host: None, - attributes: HashMap::new(), - service_name: "test-service".to_string(), - resource_attributes: HashMap::new(), + timestamp_secs: 1704067200 + idx as u64, + value: idx as f64 * 0.1, + tags, }; - builder.append(&dp); + builder.append(dp); } let original_batch = builder.finish(); @@ -682,24 +555,18 @@ mod tests { let mut doc_uid_generator = DocUidGenerator::default(); let mut doc_uids = Vec::new(); - for i in 0..5 { + for idx in 0..5 { + let mut tags = HashMap::new(); + tags.insert("service_name".to_string(), "test".to_string()); + let dp = MetricDataPoint { metric_name: "test.metric".to_string(), metric_type: MetricType::Gauge, - metric_unit: None, - timestamp_secs: 1704067200 + i as u64, - start_timestamp_secs: None, - value: i as f64, - tag_service: None, - tag_env: None, - tag_datacenter: None, - tag_region: None, - tag_host: None, - attributes: HashMap::new(), - service_name: "test".to_string(), - resource_attributes: HashMap::new(), + timestamp_secs: 1704067200 + idx as u64, + value: idx as f64, + tags, }; - builder.append(&dp); + builder.append(dp); doc_uids.push(doc_uid_generator.next_doc_uid()); } let batch = builder.finish(); diff --git 
a/quickwit/quickwit-opentelemetry/src/otlp/mod.rs b/quickwit/quickwit-opentelemetry/src/otlp/mod.rs index 67abd6c1d68..9927c4fd800 100644 --- a/quickwit/quickwit-opentelemetry/src/otlp/mod.rs +++ b/quickwit/quickwit-opentelemetry/src/otlp/mod.rs @@ -37,7 +37,7 @@ mod traces; pub use arrow_metrics::{ ArrowDocBatchV2Builder, ArrowIpcError, ArrowMetricsBatchBuilder, ipc_to_json_values, - ipc_to_record_batch, metrics_arrow_schema, record_batch_to_ipc, + ipc_to_record_batch, record_batch_to_ipc, }; pub use logs::{ JsonLogIterator, OTEL_LOGS_INDEX_ID, OtlpGrpcLogsService, OtlpLogsError, parse_otlp_logs_json, diff --git a/quickwit/quickwit-opentelemetry/src/otlp/otel_metrics.rs b/quickwit/quickwit-opentelemetry/src/otlp/otel_metrics.rs index bcb67c5fadb..994bf7cb324 100644 --- a/quickwit/quickwit-opentelemetry/src/otlp/otel_metrics.rs +++ b/quickwit/quickwit-opentelemetry/src/otlp/otel_metrics.rs @@ -134,59 +134,14 @@ impl From for tonic::Status { } } -/// Represents a single metric data point document +/// Represents a single metric data point document. 
#[derive(Debug, Clone)] pub struct MetricDataPoint { - // Metric identity pub metric_name: String, pub metric_type: MetricType, - pub metric_unit: Option, - - // Timestamps (seconds granularity) pub timestamp_secs: u64, - pub start_timestamp_secs: Option, - - // Value (f64 only) pub value: f64, - - // Explicit tag columns - pub tag_service: Option, - pub tag_env: Option, - pub tag_datacenter: Option, - pub tag_region: Option, - pub tag_host: Option, - - // Dynamic tags (remaining attributes) - pub attributes: HashMap, - - // Resource metadata - pub service_name: String, - pub resource_attributes: HashMap, -} - -struct ExplicitTags { - service: Option, - env: Option, - datacenter: Option, - region: Option, - host: Option, -} - -fn extract_string_tag(attributes: &mut HashMap, key: &str) -> Option { - attributes.remove(key).and_then(|v| match v { - JsonValue::String(s) => Some(s), - _ => None, - }) -} - -fn extract_explicit_tags(attributes: &mut HashMap) -> ExplicitTags { - ExplicitTags { - service: extract_string_tag(attributes, "service"), - env: extract_string_tag(attributes, "env"), - datacenter: extract_string_tag(attributes, "datacenter"), - region: extract_string_tag(attributes, "region"), - host: extract_string_tag(attributes, "host"), - } + pub tags: HashMap, } /// Convert nanoseconds to seconds @@ -194,6 +149,16 @@ fn nanos_to_secs(nanos: u64) -> u64 { nanos / 1_000_000_000 } +/// Convert a `serde_json::Value` to a plain `String`. 
+fn json_value_to_string(value: JsonValue) -> String { + match value { + JsonValue::String(s) => s, + JsonValue::Number(n) => n.to_string(), + JsonValue::Bool(b) => b.to_string(), + other => serde_json::to_string(&other).unwrap_or_default(), + } +} + struct ParsedMetrics { doc_batch: DocBatchV2, num_data_points: u64, @@ -243,7 +208,7 @@ impl OtlpGrpcMetricsService { Status::internal("failed to parse metric records") })??; - if num_data_points == num_parse_errors { + if num_data_points > 0 && num_data_points == num_parse_errors { return Err(tonic::Status::internal(error_message)); } @@ -274,15 +239,18 @@ impl OtlpGrpcMetricsService { request: ExportMetricsServiceRequest, parent_span: RuntimeSpan, ) -> tonic::Result { - let data_points = parse_otlp_metrics(request)?; - let num_data_points = data_points.len() as u64; - - // Build Arrow RecordBatch from data points - let mut arrow_builder = ArrowMetricsBatchBuilder::with_capacity(num_data_points as usize); + let ParseOtlpResult { + data_points, + num_rejected, + } = parse_otlp_metrics(request); + let num_data_points = data_points.len() as u64 + num_rejected; + + // Build Arrow RecordBatch from valid data points + let mut arrow_builder = ArrowMetricsBatchBuilder::with_capacity(data_points.len()); let mut doc_uid_generator = DocUidGenerator::default(); - let mut doc_uids = Vec::with_capacity(num_data_points as usize); + let mut doc_uids = Vec::with_capacity(data_points.len()); - for data_point in &data_points { + for data_point in data_points { arrow_builder.append(data_point); doc_uids.push(doc_uid_generator.next_doc_uid()); } @@ -300,13 +268,22 @@ impl OtlpGrpcMetricsService { let current_span = RuntimeSpan::current(); current_span.record("num_data_points", num_data_points); current_span.record("num_bytes", doc_batch.num_bytes()); - current_span.record("num_parse_errors", 0u64); + current_span.record("num_parse_errors", num_rejected); + + let error_message = if num_rejected > 0 { + format!( + "{num_rejected} data 
point(s) rejected (unsupported temporality or missing \ + required fields)" + ) + } else { + String::new() + }; let parsed_metrics = ParsedMetrics { doc_batch, num_data_points, - num_parse_errors: 0, - error_message: String::new(), + num_parse_errors: num_rejected, + error_message, }; Ok(parsed_metrics) } @@ -381,16 +358,19 @@ impl MetricsService for OtlpGrpcMetricsService { } } -fn parse_otlp_metrics( - request: ExportMetricsServiceRequest, -) -> Result, OtlpMetricsError> { +struct ParseOtlpResult { + data_points: Vec, + num_rejected: u64, +} + +fn parse_otlp_metrics(request: ExportMetricsServiceRequest) -> ParseOtlpResult { let mut data_points = Vec::new(); + let mut num_rejected: u64 = 0; for resource_metrics in request.resource_metrics { let mut resource_attributes = extract_attributes( resource_metrics .resource - .clone() .map(|rsrc| rsrc.attributes) .unwrap_or_default(), ); @@ -401,25 +381,23 @@ fn parse_otlp_metrics( for scope_metrics in resource_metrics.scope_metrics { for metric in scope_metrics.metrics { - parse_metric( - &metric, - &service_name, - &resource_attributes, - &mut data_points, - )?; + parse_metric(&metric, &service_name, &mut data_points, &mut num_rejected); } } } - Ok(data_points) + ParseOtlpResult { + data_points, + num_rejected, + } } fn parse_metric( metric: &Metric, service_name: &str, - resource_attributes: &HashMap, data_points: &mut Vec, -) -> Result<(), OtlpMetricsError> { + num_rejected: &mut u64, +) { let metric_name = metric.name.clone(); let metric_unit = if metric.unit.is_empty() { None @@ -430,36 +408,47 @@ fn parse_metric( match &metric.data { Some(metric::Data::Gauge(gauge)) => { for dp in &gauge.data_points { - let data_point = create_number_data_point( + match create_number_data_point( &metric_name, MetricType::Gauge, &metric_unit, dp, service_name, - resource_attributes, - )?; - data_points.push(data_point); + ) { + Ok(Some(data_point)) => data_points.push(data_point), + Ok(None) => *num_rejected += 1, + Err(err) => { + 
warn!(error = %err, metric_name, "skipping invalid gauge data point"); + *num_rejected += 1; + } + } } } Some(metric::Data::Sum(sum)) => { - // Only support DELTA temporality if sum.aggregation_temporality == AggregationTemporality::Cumulative as i32 { - return Err(OtlpMetricsError::InvalidArgument( - "cumulative aggregation temporality is not supported, only delta is supported" - .to_string(), - )); + warn!( + metric_name, + "skipping sum metric with cumulative temporality (only delta is supported)" + ); + *num_rejected += sum.data_points.len() as u64; + return; } for dp in &sum.data_points { - let data_point = create_number_data_point( + match create_number_data_point( &metric_name, MetricType::Sum, &metric_unit, dp, service_name, - resource_attributes, - )?; - data_points.push(data_point); + ) { + Ok(Some(data_point)) => data_points.push(data_point), + Ok(None) => *num_rejected += 1, + Err(err) => { + warn!(error = %err, metric_name, "skipping invalid sum data point"); + *num_rejected += 1; + } + } } } Some(metric::Data::Histogram(_)) => { @@ -475,8 +464,6 @@ fn parse_metric( warn!("metric has no data, skipping"); } } - - Ok(()) } fn create_number_data_point( @@ -485,8 +472,20 @@ fn create_number_data_point( metric_unit: &Option, dp: &NumberDataPoint, service_name: &str, - resource_attributes: &HashMap, -) -> Result { +) -> Result, OtlpMetricsError> { + // Convert timestamps to seconds + let timestamp_secs = nanos_to_secs(dp.time_unix_nano); + + // Validate: skip data points with empty metric_name or zero timestamp + if metric_name.is_empty() { + warn!("skipping data point with empty metric_name"); + return Ok(None); + } + if timestamp_secs == 0 { + warn!("skipping data point with zero timestamp_secs"); + return Ok(None); + } + // Extract value as f64 let value = match &dp.value { Some( @@ -500,34 +499,39 @@ fn create_number_data_point( None => 0.0, }; - // Extract attributes and explicit tags - let mut attributes = extract_attributes(dp.attributes.clone()); - 
let explicit_tags = extract_explicit_tags(&mut attributes); + // Extract attributes and convert all values to strings for tags + let attributes = extract_attributes(dp.attributes.clone()); + let mut tags = HashMap::with_capacity(attributes.len() + 3); - // Convert timestamps to seconds - let timestamp_secs = nanos_to_secs(dp.time_unix_nano); - let start_timestamp_secs = if dp.start_time_unix_nano != 0 { - Some(nanos_to_secs(dp.start_time_unix_nano)) - } else { - None - }; + for (key, json_val) in attributes { + tags.insert(key, json_value_to_string(json_val)); + } + + // Add metric_unit and start_timestamp_secs using or_insert_with so a + // data-point attribute with the same name is not silently overwritten. + if let Some(unit) = metric_unit { + tags.entry("metric_unit".to_string()) + .or_insert_with(|| unit.clone()); + } - Ok(MetricDataPoint { + if dp.start_time_unix_nano != 0 { + let start_ts = nanos_to_secs(dp.start_time_unix_nano); + tags.entry("start_timestamp_secs".to_string()) + .or_insert_with(|| start_ts.to_string()); + } + + // Fall back to the resource-level service.name if no data-point-level + // "service" tag was set. Data-point attributes take precedence. 
+ tags.entry("service".to_string()) + .or_insert_with(|| service_name.to_string()); + + Ok(Some(MetricDataPoint { metric_name: metric_name.to_string(), metric_type, - metric_unit: metric_unit.clone(), timestamp_secs, - start_timestamp_secs, value, - tag_service: explicit_tags.service, - tag_env: explicit_tags.env, - tag_datacenter: explicit_tags.datacenter, - tag_region: explicit_tags.region, - tag_host: explicit_tags.host, - attributes, - service_name: service_name.to_string(), - resource_attributes: resource_attributes.clone(), - }) + tags, + })) } #[cfg(test)] @@ -562,42 +566,6 @@ mod tests { assert_eq!(nanos_to_secs(2_000_000_000), 2); } - #[test] - fn test_extract_explicit_tags() { - let mut attributes = HashMap::from([ - ("service".to_string(), JsonValue::String("api".to_string())), - ("env".to_string(), JsonValue::String("prod".to_string())), - ( - "datacenter".to_string(), - JsonValue::String("us-east".to_string()), - ), - ( - "region".to_string(), - JsonValue::String("us-east-1".to_string()), - ), - ( - "host".to_string(), - JsonValue::String("server-1".to_string()), - ), - ( - "custom_tag".to_string(), - JsonValue::String("custom_value".to_string()), - ), - ]); - - let explicit_tags = extract_explicit_tags(&mut attributes); - - assert_eq!(explicit_tags.service, Some("api".to_string())); - assert_eq!(explicit_tags.env, Some("prod".to_string())); - assert_eq!(explicit_tags.datacenter, Some("us-east".to_string())); - assert_eq!(explicit_tags.region, Some("us-east-1".to_string())); - assert_eq!(explicit_tags.host, Some("server-1".to_string())); - - // custom_tag should remain in attributes - assert_eq!(attributes.len(), 1); - assert!(attributes.contains_key("custom_tag")); - } - fn make_test_gauge_request() -> ExportMetricsServiceRequest { use quickwit_proto::opentelemetry::proto::common::v1::{AnyValue, KeyValue, any_value}; use quickwit_proto::opentelemetry::proto::metrics::v1::{ @@ -747,35 +715,41 @@ mod tests { #[test] fn test_parse_gauge_metrics() { let 
request = make_test_gauge_request(); - let data_points = parse_otlp_metrics(request).unwrap(); + let data_points = parse_otlp_metrics(request).data_points; assert_eq!(data_points.len(), 1); let dp = &data_points[0]; assert_eq!(dp.metric_name, "cpu.usage"); assert_eq!(dp.metric_type, MetricType::Gauge); - assert_eq!(dp.metric_unit, Some("%".to_string())); + assert_eq!(dp.tags.get("metric_unit").map(|s| s.as_str()), Some("%")); assert_eq!(dp.timestamp_secs, 2); - assert_eq!(dp.start_timestamp_secs, Some(1)); + assert_eq!( + dp.tags.get("start_timestamp_secs").map(|s| s.as_str()), + Some("1") + ); assert_eq!(dp.value, 85.5); - assert_eq!(dp.tag_service, Some("api".to_string())); - assert_eq!(dp.tag_env, Some("prod".to_string())); - assert_eq!(dp.service_name, "test-service"); + // Data-point attribute "service" takes precedence over resource-level service.name. + assert_eq!(dp.tags.get("service").map(|s| s.as_str()), Some("api")); + assert_eq!(dp.tags.get("env").map(|s| s.as_str()), Some("prod")); } #[test] fn test_parse_sum_delta_metrics() { let request = make_test_sum_delta_request(); - let data_points = parse_otlp_metrics(request).unwrap(); + let data_points = parse_otlp_metrics(request).data_points; assert_eq!(data_points.len(), 1); let dp = &data_points[0]; assert_eq!(dp.metric_name, "http.requests"); assert_eq!(dp.metric_type, MetricType::Sum); - assert_eq!(dp.metric_unit, Some("1".to_string())); + assert_eq!(dp.tags.get("metric_unit").map(|s| s.as_str()), Some("1")); assert_eq!(dp.timestamp_secs, 2); assert_eq!(dp.value, 100.0); // int converted to f64 - assert_eq!(dp.tag_host, Some("server-1".to_string())); - assert_eq!(dp.service_name, "counter-service"); + assert_eq!(dp.tags.get("host").map(|s| s.as_str()), Some("server-1")); + assert_eq!( + dp.tags.get("service").map(|s| s.as_str()), + Some("counter-service") + ); } #[test] @@ -783,11 +757,10 @@ mod tests { let request = make_test_sum_cumulative_request(); let result = parse_otlp_metrics(request); - 
assert!(result.is_err()); - match result.unwrap_err() { - OtlpMetricsError::InvalidArgument(_) => {} - err => panic!("unexpected error type: {:?}", err), - } + // Cumulative sums are skipped (not a hard error) so other metrics in the same + // request can still be processed. The rejected count is incremented instead. + assert_eq!(result.data_points.len(), 0); + assert_eq!(result.num_rejected, 1); } /// Test parsing metrics with various attribute types @@ -860,19 +833,14 @@ mod tests { }], }; - let data_points = parse_otlp_metrics(request).unwrap(); + let data_points = parse_otlp_metrics(request).data_points; assert_eq!(data_points.len(), 1); let dp = &data_points[0]; - assert_eq!(dp.service_name, "test"); - - // Verify resource attributes contain the non-service.name attributes - assert!(dp.resource_attributes.contains_key("int_attr")); - assert!(dp.resource_attributes.contains_key("bool_attr")); - assert!(dp.resource_attributes.contains_key("double_attr")); + assert_eq!(dp.tags.get("service").map(|s| s.as_str()), Some("test")); - // Verify data point attributes - assert!(dp.attributes.contains_key("string_tag")); + // Verify data point attributes are in tags as strings + assert_eq!(dp.tags.get("string_tag").map(|s| s.as_str()), Some("value")); } /// Test metrics with empty and missing values @@ -908,17 +876,94 @@ mod tests { }], }; - let data_points = parse_otlp_metrics(request).unwrap(); + let data_points = parse_otlp_metrics(request).data_points; assert_eq!(data_points.len(), 1); let dp = &data_points[0]; assert_eq!(dp.metric_name, "minimal.metric"); - assert_eq!(dp.service_name, "unknown_service"); // Default value - assert!(dp.metric_unit.is_none()); - assert!(dp.start_timestamp_secs.is_none()); - assert!(dp.tag_service.is_none()); - assert!(dp.tag_env.is_none()); - assert!(dp.attributes.is_empty()); - assert!(dp.resource_attributes.is_empty()); + assert_eq!( + dp.tags.get("service").map(|s| s.as_str()), + Some("unknown_service") + ); + // No metric_unit tag 
when unit is empty + assert!(!dp.tags.contains_key("metric_unit")); + // No start_timestamp_secs tag when start time is 0 + assert!(!dp.tags.contains_key("start_timestamp_secs")); + // Only "service" should be in tags (no attributes, no unit, no start time) + assert_eq!(dp.tags.len(), 1); + } + + /// Test that data points with empty metric_name are skipped + #[test] + fn test_skip_empty_metric_name() { + use quickwit_proto::opentelemetry::proto::metrics::v1::{ + Gauge, ResourceMetrics, ScopeMetrics, number_data_point, + }; + + let request = ExportMetricsServiceRequest { + resource_metrics: vec![ResourceMetrics { + resource: None, + scope_metrics: vec![ScopeMetrics { + scope: None, + metrics: vec![Metric { + name: String::new(), // Empty name + description: String::new(), + unit: String::new(), + data: Some(metric::Data::Gauge(Gauge { + data_points: vec![NumberDataPoint { + attributes: Vec::new(), + start_time_unix_nano: 0, + time_unix_nano: 1_000_000_000, + exemplars: Vec::new(), + flags: 0, + value: Some(number_data_point::Value::AsDouble(1.0)), + }], + })), + }], + schema_url: String::new(), + }], + schema_url: String::new(), + }], + }; + + let data_points = parse_otlp_metrics(request).data_points; + assert_eq!(data_points.len(), 0); + } + + /// Test that data points with zero timestamp are skipped + #[test] + fn test_skip_zero_timestamp() { + use quickwit_proto::opentelemetry::proto::metrics::v1::{ + Gauge, ResourceMetrics, ScopeMetrics, number_data_point, + }; + + let request = ExportMetricsServiceRequest { + resource_metrics: vec![ResourceMetrics { + resource: None, + scope_metrics: vec![ScopeMetrics { + scope: None, + metrics: vec![Metric { + name: "test.metric".to_string(), + description: String::new(), + unit: String::new(), + data: Some(metric::Data::Gauge(Gauge { + data_points: vec![NumberDataPoint { + attributes: Vec::new(), + start_time_unix_nano: 0, + time_unix_nano: 0, // Zero timestamp + exemplars: Vec::new(), + flags: 0, + value: 
Some(number_data_point::Value::AsDouble(1.0)), + }], + })), + }], + schema_url: String::new(), + }], + schema_url: String::new(), + }], + }; + + let data_points = parse_otlp_metrics(request).data_points; + assert_eq!(data_points.len(), 0); } } diff --git a/quickwit/quickwit-parquet-engine/Cargo.toml b/quickwit/quickwit-parquet-engine/Cargo.toml index f886083eded..9744dbcc2bc 100644 --- a/quickwit/quickwit-parquet-engine/Cargo.toml +++ b/quickwit/quickwit-parquet-engine/Cargo.toml @@ -11,9 +11,12 @@ authors.workspace = true license.workspace = true [dependencies] +anyhow = { workspace = true } arrow = { workspace = true } +chrono = { workspace = true } parquet = { workspace = true } quickwit-common = { workspace = true } +quickwit-proto = { workspace = true } sea-query = { workspace = true, optional = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/quickwit/quickwit-parquet-engine/src/index/accumulator.rs b/quickwit/quickwit-parquet-engine/src/index/accumulator.rs index 29065847136..817f3f16d93 100644 --- a/quickwit/quickwit-parquet-engine/src/index/accumulator.rs +++ b/quickwit/quickwit-parquet-engine/src/index/accumulator.rs @@ -14,15 +14,18 @@ //! Batch accumulator for producing splits from RecordBatches. +use std::collections::BTreeMap; +use std::sync::Arc; use std::time::Instant; +use arrow::array::{ArrayRef, new_null_array}; use arrow::compute::concat_batches; +use arrow::datatypes::{DataType, Field, Schema as ArrowSchema, SchemaRef}; use arrow::record_batch::RecordBatch; use tracing::{debug, info}; use super::config::ParquetIndexingConfig; use crate::metrics::PARQUET_ENGINE_METRICS; -use crate::schema::ParquetSchema; /// Error type for index operations. 
#[derive(Debug, thiserror::Error)] @@ -38,11 +41,14 @@ pub enum IndexingError { /// Batches are accumulated until either `max_rows` or `max_bytes` threshold is reached, /// at which point they are concatenated and returned for downstream processing (writing to /// Parquet by the ParquetPackager actor). +/// +/// Consecutive batches may have different column sets. The accumulator tracks the union +/// schema incrementally and aligns all batches to it on flush. pub struct ParquetBatchAccumulator { /// Configuration for accumulation thresholds. config: ParquetIndexingConfig, - /// Metrics schema for concatenation. - schema: ParquetSchema, + /// Union of all fields seen across pending batches. + union_fields: BTreeMap, /// Pending batches waiting to be flushed. pending_batches: Vec, /// Total rows in pending batches. @@ -57,11 +63,9 @@ impl ParquetBatchAccumulator { /// # Arguments /// * `config` - Configuration for accumulation thresholds pub fn new(config: ParquetIndexingConfig) -> Self { - let schema = ParquetSchema::new(); - Self { config, - schema, + union_fields: BTreeMap::new(), pending_batches: Vec::new(), pending_rows: 0, pending_bytes: 0, @@ -83,6 +87,13 @@ impl ParquetBatchAccumulator { .index_rows_total .inc_by(batch_rows as u64); + // Merge fields into union schema before pushing (we need the schema reference) + for field in batch.schema().fields() { + self.union_fields + .entry(field.name().clone()) + .or_insert_with(|| (field.data_type().clone(), field.is_nullable())); + } + self.pending_batches.push(batch); self.pending_rows += batch_rows; self.pending_bytes += batch_bytes; @@ -92,7 +103,7 @@ impl ParquetBatchAccumulator { batch_bytes, total_pending_rows = self.pending_rows, total_pending_bytes = self.pending_bytes, - "Added batch to accumulator" + "added batch to accumulator" ); let flushed = if self.should_flush() { @@ -101,7 +112,7 @@ impl ParquetBatchAccumulator { pending_bytes = self.pending_bytes, max_rows = self.config.max_rows, max_bytes = 
self.config.max_bytes, - "Threshold exceeded, triggering flush" + "threshold exceeded, triggering flush" ); self.flush_internal()? } else { @@ -119,6 +130,7 @@ impl ParquetBatchAccumulator { /// Discard all pending data without producing output. pub fn discard(&mut self) { self.pending_batches.clear(); + self.union_fields.clear(); self.pending_rows = 0; self.pending_bytes = 0; } @@ -136,14 +148,33 @@ impl ParquetBatchAccumulator { return Ok(None); } - // Concatenate all pending batches into one - let combined = concat_batches(self.schema.arrow_schema(), self.pending_batches.iter())?; + // Build the union schema from accumulated fields. + // All fields are marked nullable=true regardless of their source schema: + // any field that appears in some batches but not others will be null-filled + // for the missing batches, so non-nullable would cause Arrow to reject the concat. + let fields: Vec = self + .union_fields + .iter() + .map(|(name, (data_type, _nullable))| Field::new(name, data_type.clone(), true)) + .collect(); + let union_schema: SchemaRef = Arc::new(ArrowSchema::new(fields)); + + // Align each pending batch to the union schema + let aligned: Vec = self + .pending_batches + .iter() + .map(|batch| align_batch_to_schema(batch, &union_schema)) + .collect::, _>>()?; + + // Concatenate all aligned batches + let combined = concat_batches(&union_schema, aligned.iter())?; let num_rows = combined.num_rows(); - info!(num_rows, "Flushed accumulated batches"); + info!(num_rows, "flushed accumulated batches"); // Reset state self.pending_batches.clear(); + self.union_fields.clear(); self.pending_rows = 0; self.pending_bytes = 0; @@ -173,6 +204,24 @@ impl ParquetBatchAccumulator { } } +/// Align a RecordBatch to a target schema, inserting null columns where needed. 
+fn align_batch_to_schema( + batch: &RecordBatch, + target_schema: &SchemaRef, +) -> Result { + let num_rows = batch.num_rows(); + let batch_schema = batch.schema(); + let columns: Vec = target_schema + .fields() + .iter() + .map(|field| match batch_schema.index_of(field.name()) { + Ok(idx) => Arc::clone(batch.column(idx)), + Err(_) => new_null_array(field.data_type(), num_rows), + }) + .collect(); + RecordBatch::try_new(target_schema.clone(), columns) +} + /// Estimate the memory size of a RecordBatch. fn estimate_batch_bytes(batch: &RecordBatch) -> usize { batch @@ -184,106 +233,8 @@ fn estimate_batch_bytes(batch: &RecordBatch) -> usize { #[cfg(test)] mod tests { - use std::sync::Arc; - - use arrow::array::{ - ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array, UInt64Array, - }; - use arrow::datatypes::Int32Type; - use parquet::variant::{VariantArrayBuilder, VariantBuilderExt}; - use super::*; - - /// Create dictionary array for string fields with Int32 keys. - fn create_dict_array(values: &[&str]) -> ArrayRef { - let keys: Vec = (0..values.len()).map(|i| i as i32).collect(); - let string_array = StringArray::from(values.to_vec()); - Arc::new( - DictionaryArray::::try_new(Int32Array::from(keys), Arc::new(string_array)) - .unwrap(), - ) - } - - /// Create nullable dictionary array for optional string fields. - fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef { - let keys: Vec> = values - .iter() - .enumerate() - .map(|(i, v)| v.map(|_| i as i32)) - .collect(); - let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect(); - let string_array = StringArray::from(string_values); - Arc::new( - DictionaryArray::::try_new(Int32Array::from(keys), Arc::new(string_array)) - .unwrap(), - ) - } - - /// Create a VARIANT array for testing with specified number of rows. 
- fn create_variant_array(num_rows: usize, fields: Option<&[(&str, &str)]>) -> ArrayRef { - let mut builder = VariantArrayBuilder::new(num_rows); - for _ in 0..num_rows { - match fields { - Some(kv_pairs) => { - let mut obj = builder.new_object(); - for (key, value) in kv_pairs { - obj = obj.with_field(key, *value); - } - obj.finish(); - } - None => { - builder.append_null(); - } - } - } - ArrayRef::from(builder.build()) - } - - /// Create a test batch matching the metrics schema. - fn create_test_batch(num_rows: usize) -> RecordBatch { - let schema = ParquetSchema::new(); - - let metric_names: Vec<&str> = vec!["cpu.usage"; num_rows]; - let metric_name: ArrayRef = create_dict_array(&metric_names); - let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows])); - let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes"); num_rows])); - let timestamps: Vec = (0..num_rows).map(|i| 100 + i as u64).collect(); - let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps)); - let start_timestamp_secs: ArrayRef = - Arc::new(UInt64Array::from(vec![None::; num_rows])); - let values: Vec = (0..num_rows).map(|i| 42.0 + i as f64).collect(); - let value: ArrayRef = Arc::new(Float64Array::from(values)); - let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]); - let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]); - let tag_datacenter: ArrayRef = - create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]); - let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]); - let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]); - let attributes: ArrayRef = create_variant_array(num_rows, None); - let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]); - let resource_attributes: ArrayRef = create_variant_array(num_rows, None); - - RecordBatch::try_new( - schema.arrow_schema().clone(), - vec![ - metric_name, - 
metric_type, - metric_unit, - timestamp_secs, - start_timestamp_secs, - value, - tag_service, - tag_env, - tag_datacenter, - tag_region, - tag_host, - attributes, - service_name, - resource_attributes, - ], - ) - .unwrap() - } + use crate::test_helpers::{create_test_batch, create_test_batch_with_tags}; #[test] fn test_accumulator_below_threshold() { @@ -351,6 +302,93 @@ mod tests { assert_eq!(combined.num_rows(), 50); } + #[test] + fn test_accumulator_merges_different_tag_sets() { + let config = ParquetIndexingConfig::default().with_max_rows(1000); + let mut accumulator = ParquetBatchAccumulator::new(config); + + // First batch has "service" tag + let batch1 = create_test_batch_with_tags(3, &["service"]); + let _ = accumulator.add_batch(batch1).unwrap(); + + // Second batch has "host" tag + let batch2 = create_test_batch_with_tags(2, &["host"]); + let _ = accumulator.add_batch(batch2).unwrap(); + + let combined = accumulator.flush().unwrap().unwrap(); + assert_eq!(combined.num_rows(), 5); + + // Union schema should have all 4 required fields + both tags + let schema = combined.schema(); + assert!(schema.index_of("metric_name").is_ok()); + assert!(schema.index_of("metric_type").is_ok()); + assert!(schema.index_of("timestamp_secs").is_ok()); + assert!(schema.index_of("value").is_ok()); + assert!(schema.index_of("service").is_ok()); + assert!(schema.index_of("host").is_ok()); + assert_eq!(schema.fields().len(), 6); + + // First 3 rows should have null "host", last 2 rows should have null "service" + let host_idx = schema.index_of("host").unwrap(); + let host_col = combined.column(host_idx); + assert_eq!(host_col.null_count(), 3); // first batch had no host + + let service_idx = schema.index_of("service").unwrap(); + let service_col = combined.column(service_idx); + assert_eq!(service_col.null_count(), 2); // second batch had no service + + // No duplicate column names — each name appears exactly once. 
+ let field_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect(); + let unique_count = field_names + .iter() + .collect::>() + .len(); + assert_eq!( + unique_count, + field_names.len(), + "duplicate columns in union schema: {field_names:?}" + ); + } + + #[test] + fn test_accumulator_no_duplicates_with_overlapping_tags() { + let config = ParquetIndexingConfig::default().with_max_rows(1000); + let mut accumulator = ParquetBatchAccumulator::new(config); + + // Both batches share "service"; second also has "host". + // "service" must appear exactly once in the flushed schema. + let batch1 = create_test_batch_with_tags(3, &["service"]); + let batch2 = create_test_batch_with_tags(2, &["service", "host"]); + let _ = accumulator.add_batch(batch1).unwrap(); + let _ = accumulator.add_batch(batch2).unwrap(); + + let combined = accumulator.flush().unwrap().unwrap(); + assert_eq!(combined.num_rows(), 5); + + let schema = combined.schema(); + let field_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect(); + let unique_count = field_names + .iter() + .collect::>() + .len(); + assert_eq!( + unique_count, + field_names.len(), + "duplicate columns in union schema: {field_names:?}" + ); + + // 4 required + service + host = 6 + assert_eq!(schema.fields().len(), 6); + + // Rows from batch1 have no "host" → 3 nulls; batch2 has "host" for all 2 rows → 0 nulls. + let host_idx = schema.index_of("host").unwrap(); + assert_eq!(combined.column(host_idx).null_count(), 3); + + // "service" present in both batches → 0 nulls total. 
+ let service_idx = schema.index_of("service").unwrap(); + assert_eq!(combined.column(service_idx).null_count(), 0); + } + #[test] fn test_estimate_batch_bytes() { let batch = create_test_batch(100); diff --git a/quickwit/quickwit-parquet-engine/src/ingest/processor.rs b/quickwit/quickwit-parquet-engine/src/ingest/processor.rs index 80dd4f322e5..21412414b2b 100644 --- a/quickwit/quickwit-parquet-engine/src/ingest/processor.rs +++ b/quickwit/quickwit-parquet-engine/src/ingest/processor.rs @@ -21,7 +21,7 @@ use arrow::record_batch::RecordBatch; use tracing::{debug, instrument, warn}; use crate::metrics::PARQUET_ENGINE_METRICS; -use crate::schema::ParquetSchema; +use crate::schema::validate_required_fields; /// Error type for ingest operations. #[derive(Debug, thiserror::Error)] @@ -34,9 +34,9 @@ pub enum IngestError { #[error("IPC decode error: {0}")] IpcDecode(#[from] arrow::error::ArrowError), - /// RecordBatch schema doesn't match expected metrics schema. - #[error("Schema mismatch: expected {expected} fields, got {actual}")] - SchemaMismatch { expected: usize, actual: usize }, + /// RecordBatch schema validation failed. + #[error("Schema validation failed: {0}")] + SchemaValidation(String), /// Expected exactly one RecordBatch in IPC stream. #[error("Expected 1 RecordBatch in IPC stream, got {0}")] @@ -53,17 +53,10 @@ pub enum IngestError { /// Processor that converts Arrow IPC bytes to RecordBatch. /// -/// Validates that the decoded batch matches the expected metrics schema. -pub struct ParquetIngestProcessor { - schema: ParquetSchema, -} +/// Validates that the decoded batch contains all required fields. +pub struct ParquetIngestProcessor; impl ParquetIngestProcessor { - /// Create a new ParquetIngestProcessor. - pub fn new(schema: ParquetSchema) -> Self { - Self { schema } - } - /// Convert Arrow IPC bytes to RecordBatch. /// /// Returns error if IPC is malformed or schema doesn't match. 
@@ -95,65 +88,15 @@ impl ParquetIngestProcessor { debug!( num_rows = batch.num_rows(), - "Successfully decoded IPC to RecordBatch" + "successfully decoded IPC to RecordBatch" ); Ok(batch) } - /// Validate that the RecordBatch schema matches expected metrics schema. + /// Validate that the RecordBatch schema contains all required fields. fn validate_schema(&self, batch: &RecordBatch) -> Result<(), IngestError> { - let expected_fields = self.schema.num_fields(); - let actual_fields = batch.schema().fields().len(); - - if expected_fields != actual_fields { - warn!( - expected = expected_fields, - actual = actual_fields, - "Schema mismatch: field count differs" - ); - return Err(IngestError::SchemaMismatch { - expected: expected_fields, - actual: actual_fields, - }); - } - - // Verify field names match (in order) - let schema_fields = self.schema.arrow_schema().fields(); - let batch_schema = batch.schema(); - let batch_fields = batch_schema.fields(); - - for (expected, actual) in schema_fields.iter().zip(batch_fields.iter()) { - if expected.name() != actual.name() { - warn!( - expected_name = expected.name(), - actual_name = actual.name(), - "Schema mismatch: field name differs" - ); - return Err(IngestError::SchemaMismatch { - expected: expected_fields, - actual: actual_fields, - }); - } - if expected.data_type() != actual.data_type() { - warn!( - field = expected.name(), - expected_type = ?expected.data_type(), - actual_type = ?actual.data_type(), - "Schema mismatch: field type differs" - ); - return Err(IngestError::SchemaMismatch { - expected: expected_fields, - actual: actual_fields, - }); - } - } - - Ok(()) - } - - /// Get a reference to the schema. 
- pub fn schema(&self) -> &ParquetSchema { - &self.schema + validate_required_fields(batch.schema().as_ref()) + .map_err(|e| IngestError::SchemaValidation(e.to_string())) } } @@ -168,11 +111,10 @@ fn ipc_to_record_batch(ipc_bytes: &[u8]) -> Result { return Err(IngestError::UnexpectedBatchCount(batches.len())); } - // Safe: we verified exactly 1 batch above, but use ok_or for defensive programming - batches + Ok(batches .into_iter() .next() - .ok_or(IngestError::UnexpectedBatchCount(0)) + .expect("len verified to be 1 above")) } /// Serialize a RecordBatch to Arrow IPC stream format. @@ -190,100 +132,12 @@ pub fn record_batch_to_ipc(batch: &RecordBatch) -> Result, IngestError> #[cfg(test)] mod tests { - use std::sync::Arc; - - use arrow::array::{ - ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array, UInt64Array, - }; - use arrow::datatypes::Int32Type; - use parquet::variant::VariantArrayBuilder; - use super::*; - - /// Create dictionary array for string fields with Int32 keys. - fn create_dict_array(values: &[&str]) -> ArrayRef { - let keys: Vec = (0..values.len()).map(|i| i as i32).collect(); - let string_array = StringArray::from(values.to_vec()); - Arc::new( - DictionaryArray::::try_new(Int32Array::from(keys), Arc::new(string_array)) - .unwrap(), - ) - } - - /// Create nullable dictionary array for optional string fields. - fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef { - let keys: Vec> = values - .iter() - .enumerate() - .map(|(i, v)| v.map(|_| i as i32)) - .collect(); - let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect(); - let string_array = StringArray::from(string_values); - Arc::new( - DictionaryArray::::try_new(Int32Array::from(keys), Arc::new(string_array)) - .unwrap(), - ) - } - - /// Create a VARIANT array for testing with specified number of rows (all nulls). 
- fn create_variant_array(num_rows: usize) -> ArrayRef { - let mut builder = VariantArrayBuilder::new(num_rows); - for _ in 0..num_rows { - builder.append_null(); - } - ArrayRef::from(builder.build()) - } - - /// Create a test batch matching the metrics schema. - fn create_test_batch(num_rows: usize) -> RecordBatch { - let schema = ParquetSchema::new(); - - let metric_names: Vec<&str> = vec!["cpu.usage"; num_rows]; - let metric_name: ArrayRef = create_dict_array(&metric_names); - let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows])); - let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes"); num_rows])); - let timestamps: Vec = (0..num_rows).map(|i| 100 + i as u64).collect(); - let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps)); - let start_timestamp_secs: ArrayRef = - Arc::new(UInt64Array::from(vec![None::; num_rows])); - let values: Vec = (0..num_rows).map(|i| 42.0 + i as f64).collect(); - let value: ArrayRef = Arc::new(Float64Array::from(values)); - let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]); - let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]); - let tag_datacenter: ArrayRef = - create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]); - let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]); - let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]); - let attributes: ArrayRef = create_variant_array(num_rows); - let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]); - let resource_attributes: ArrayRef = create_variant_array(num_rows); - - RecordBatch::try_new( - schema.arrow_schema().clone(), - vec![ - metric_name, - metric_type, - metric_unit, - timestamp_secs, - start_timestamp_secs, - value, - tag_service, - tag_env, - tag_datacenter, - tag_region, - tag_host, - attributes, - service_name, - resource_attributes, - ], - ) - .unwrap() - } + use 
crate::test_helpers::create_test_batch; #[test] fn test_process_ipc() { - let schema = ParquetSchema::new(); - let processor = ParquetIngestProcessor::new(schema); + let processor = ParquetIngestProcessor; // Create a valid batch let batch = create_test_batch(10); @@ -295,13 +149,12 @@ mod tests { let recovered = result.unwrap(); assert_eq!(recovered.num_rows(), 10); - assert_eq!(recovered.num_columns(), 14); + assert_eq!(recovered.num_columns(), 6); // 4 required + 2 tags } #[test] fn test_process_ipc_invalid_bytes() { - let schema = ParquetSchema::new(); - let processor = ParquetIngestProcessor::new(schema); + let processor = ParquetIngestProcessor; let result = processor.process_ipc(&[0u8; 10]); assert!(result.is_err()); diff --git a/quickwit/quickwit-parquet-engine/src/lib.rs b/quickwit/quickwit-parquet-engine/src/lib.rs index eac11bfa8e9..d34c67c665d 100644 --- a/quickwit/quickwit-parquet-engine/src/lib.rs +++ b/quickwit/quickwit-parquet-engine/src/lib.rs @@ -24,5 +24,10 @@ pub mod index; pub mod ingest; pub mod metrics; pub mod schema; +pub mod sort_fields; pub mod split; pub mod storage; +pub mod table_config; + +#[cfg(any(test, feature = "testsuite"))] +pub mod test_helpers; diff --git a/quickwit/quickwit-parquet-engine/src/schema/fields.rs b/quickwit/quickwit-parquet-engine/src/schema/fields.rs index b6a52c738aa..9f46dcf3b8c 100644 --- a/quickwit/quickwit-parquet-engine/src/schema/fields.rs +++ b/quickwit/quickwit-parquet-engine/src/schema/fields.rs @@ -12,151 +12,57 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! Parquet field definitions with column metadata. +//! Parquet field definitions with sort order constants and validation. -use arrow::datatypes::{DataType, Field, Fields}; -use parquet::variant::VariantType; +use anyhow::{Result, bail}; +use arrow::datatypes::DataType; -/// All fields in the parquet schema. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum ParquetField { - MetricName, - MetricType, - MetricUnit, - TimestampSecs, - StartTimestampSecs, - Value, - TagService, - TagEnv, - TagDatacenter, - TagRegion, - TagHost, - Attributes, - ServiceName, - ResourceAttributes, -} +/// Required field names that must exist in every batch. +pub const REQUIRED_FIELDS: &[&str] = &["metric_name", "metric_type", "timestamp_secs", "value"]; -impl ParquetField { - /// Field name as stored in Parquet. - pub fn name(&self) -> &'static str { - match self { - Self::MetricName => "metric_name", - Self::MetricType => "metric_type", - Self::MetricUnit => "metric_unit", - Self::TimestampSecs => "timestamp_secs", - Self::StartTimestampSecs => "start_timestamp_secs", - Self::Value => "value", - Self::TagService => "tag_service", - Self::TagEnv => "tag_env", - Self::TagDatacenter => "tag_datacenter", - Self::TagRegion => "tag_region", - Self::TagHost => "tag_host", - Self::Attributes => "attributes", - Self::ServiceName => "service_name", - Self::ResourceAttributes => "resource_attributes", - } - } +/// Sort order column names. Columns not present in a batch are skipped. +pub const SORT_ORDER: &[&str] = &[ + "metric_name", + "service", + "env", + "datacenter", + "region", + "host", + "timestamp_secs", +]; - /// Whether this field is nullable. - pub fn is_nullable(&self) -> bool { - matches!( - self, - Self::MetricUnit - | Self::StartTimestampSecs - | Self::TagService - | Self::TagEnv - | Self::TagDatacenter - | Self::TagRegion - | Self::TagHost - | Self::Attributes - | Self::ResourceAttributes - ) +/// Arrow type for required fields by name. 
+pub fn required_field_type(name: &str) -> Option { + match name { + "metric_name" => Some(DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Utf8), + )), + "metric_type" => Some(DataType::UInt8), + "timestamp_secs" => Some(DataType::UInt64), + "value" => Some(DataType::Float64), + _ => None, } +} - /// Arrow DataType for this field. - /// Use dictionary encoding for high-cardinality strings. - pub fn arrow_type(&self) -> DataType { - match self { - // Dictionary-encoded strings for high cardinality - Self::MetricName | Self::ServiceName => { - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)) +/// Validate that a batch schema contains all required fields with correct types. +pub fn validate_required_fields(schema: &arrow::datatypes::Schema) -> Result<()> { + for &name in REQUIRED_FIELDS { + match schema.index_of(name) { + Ok(idx) => { + let expected_type = required_field_type(name).unwrap(); + let actual_type = schema.field(idx).data_type(); + if *actual_type != expected_type { + bail!( + "field '{}' has type {:?}, expected {:?}", + name, + actual_type, + expected_type + ); + } } - // Dictionary-encoded optional tags - Self::TagService - | Self::TagEnv - | Self::TagDatacenter - | Self::TagRegion - | Self::TagHost => { - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)) - } - // Enum stored as UInt8 - Self::MetricType => DataType::UInt8, - // Timestamps as UInt64 seconds - Self::TimestampSecs | Self::StartTimestampSecs => DataType::UInt64, - // Metric value - Self::Value => DataType::Float64, - // Plain string for metric unit - Self::MetricUnit => DataType::Utf8, - // VARIANT type for semi-structured attributes - // Uses the Parquet Variant binary encoding format - Self::Attributes | Self::ResourceAttributes => { - // VARIANT is stored as a struct with metadata and value BinaryView fields - // VariantArrayBuilder produces BinaryView, not Binary - DataType::Struct(Fields::from(vec![ - 
Field::new("metadata", DataType::BinaryView, false), - Field::new("value", DataType::BinaryView, false), - ])) - } - } - } - - /// Convert to Arrow Field. - pub fn to_arrow_field(&self) -> Field { - let field = Field::new(self.name(), self.arrow_type(), self.is_nullable()); - - // Add VARIANT extension type metadata for attributes fields - match self { - Self::Attributes | Self::ResourceAttributes => field.with_extension_type(VariantType), - _ => field, + Err(_) => bail!("missing required field '{}'", name), } } - - /// All fields in schema order. - pub fn all() -> &'static [ParquetField] { - &[ - Self::MetricName, - Self::MetricType, - Self::MetricUnit, - Self::TimestampSecs, - Self::StartTimestampSecs, - Self::Value, - Self::TagService, - Self::TagEnv, - Self::TagDatacenter, - Self::TagRegion, - Self::TagHost, - Self::Attributes, - Self::ServiceName, - Self::ResourceAttributes, - ] - } - - /// Sort order for metrics data (used for pruning). - /// Order: metric_name, common tags (service, env, datacenter, region, host), timestamp. - pub fn sort_order() -> &'static [ParquetField] { - &[ - Self::MetricName, - Self::TagService, - Self::TagEnv, - Self::TagDatacenter, - Self::TagRegion, - Self::TagHost, - Self::TimestampSecs, - ] - } - - /// Get the column index in the schema. 
- pub fn column_index(&self) -> usize { - Self::all().iter().position(|f| f == self).unwrap() - } + Ok(()) } diff --git a/quickwit/quickwit-parquet-engine/src/schema/mod.rs b/quickwit/quickwit-parquet-engine/src/schema/mod.rs index 4cbe31d357d..f9b5c06d9c4 100644 --- a/quickwit/quickwit-parquet-engine/src/schema/mod.rs +++ b/quickwit/quickwit-parquet-engine/src/schema/mod.rs @@ -20,5 +20,5 @@ mod fields; mod parquet; -pub use fields::ParquetField; +pub use fields::{REQUIRED_FIELDS, SORT_ORDER, required_field_type, validate_required_fields}; pub use parquet::ParquetSchema; diff --git a/quickwit/quickwit-parquet-engine/src/schema/parquet.rs b/quickwit/quickwit-parquet-engine/src/schema/parquet.rs index 83ef99e351c..d0991f89807 100644 --- a/quickwit/quickwit-parquet-engine/src/schema/parquet.rs +++ b/quickwit/quickwit-parquet-engine/src/schema/parquet.rs @@ -14,13 +14,7 @@ //! Parquet schema construction for metrics. -use std::sync::Arc; - -use arrow::datatypes::{Schema as ArrowSchema, SchemaRef}; -use parquet::arrow::ArrowSchemaConverter; -use parquet::schema::types::SchemaDescriptor; - -use super::fields::ParquetField; +use arrow::datatypes::SchemaRef; /// Parquet schema for storage. #[derive(Debug, Clone)] @@ -29,15 +23,11 @@ pub struct ParquetSchema { } impl ParquetSchema { - /// Create a new ParquetSchema. - pub fn new() -> Self { - let fields: Vec<_> = ParquetField::all() - .iter() - .map(|f| f.to_arrow_field()) - .collect(); - - let arrow_schema = Arc::new(ArrowSchema::new(fields)); - Self { arrow_schema } + /// Create a ParquetSchema from an Arrow schema. + pub fn from_arrow_schema(schema: SchemaRef) -> Self { + Self { + arrow_schema: schema, + } } /// Get the Arrow schema. @@ -45,11 +35,6 @@ impl ParquetSchema { &self.arrow_schema } - /// Convert to Parquet schema descriptor. - pub fn parquet_schema(&self) -> Result { - ArrowSchemaConverter::new().convert(&self.arrow_schema) - } - /// Get field by name. 
pub fn field(&self, name: &str) -> Option<&arrow::datatypes::Field> { self.arrow_schema.field_with_name(name).ok() @@ -61,33 +46,42 @@ impl ParquetSchema { } } -impl Default for ParquetSchema { - fn default() -> Self { - Self::new() - } -} - #[cfg(test)] mod tests { + use std::sync::Arc; + + use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use super::*; + fn create_test_schema() -> SchemaRef { + Arc::new(ArrowSchema::new(vec![ + Field::new( + "metric_name", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + ), + Field::new("metric_type", DataType::UInt8, false), + Field::new("timestamp_secs", DataType::UInt64, false), + Field::new("value", DataType::Float64, false), + Field::new( + "service", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + true, + ), + ])) + } + #[test] fn test_schema_creation() { - let schema = ParquetSchema::new(); - assert_eq!(schema.num_fields(), 14); + let schema = ParquetSchema::from_arrow_schema(create_test_schema()); + assert_eq!(schema.num_fields(), 5); } #[test] fn test_field_lookup() { - let schema = ParquetSchema::new(); + let schema = ParquetSchema::from_arrow_schema(create_test_schema()); let field = schema.field("metric_name").unwrap(); assert!(!field.is_nullable()); } - - #[test] - fn test_parquet_conversion() { - let schema = ParquetSchema::new(); - let parquet_schema = schema.parquet_schema(); - assert!(parquet_schema.is_ok()); - } } diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/column_type.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/column_type.rs new file mode 100644 index 00000000000..21690f01e0e --- /dev/null +++ b/quickwit/quickwit-parquet-engine/src/sort_fields/column_type.rs @@ -0,0 +1,240 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Column type identification from name suffixes and string names. +//! +//! Type can be specified via Husky-convention suffixes (`__s`, `__i`, `__nf`) +//! or inferred from well-known bare names. The discriminant values match +//! the Go iota exactly for cross-system interoperability. + +use std::str::FromStr; + +use super::SortFieldsError; + +/// Well-known column name for timestamps. +pub const TIMESTAMP: &str = "timestamp"; + +/// Well-known column name for tiebreaker. +pub const TIEBREAKER: &str = "tiebreaker"; + +/// Well-known column name for timeseries ID hash. +pub const TIMESERIES_ID: &str = "timeseries_id"; + +/// Well-known column name for metric value. +pub const METRIC_VALUE: &str = "metric_value"; + +/// Column type IDs matching Go `types.TypeID` iota values. +/// +/// Only the types that appear in sort schemas are included here. +/// The discriminant values MUST match Go exactly for cross-system interop. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[repr(u64)] +pub enum ColumnTypeId { + Int64 = 2, + Float64 = 10, + String = 14, + Sketch = 17, + CpcSketch = 20, + ItemSketch = 22, +} + +impl ColumnTypeId { + /// The Husky-convention suffix for this column type. + /// + /// Used when serializing back to the string format with explicit types. + pub fn suffix(self) -> &'static str { + match self { + Self::Int64 => "__i", + Self::Float64 => "__nf", + Self::String => "__s", + Self::Sketch => "__sk", + Self::CpcSketch => "__cpcsk", + Self::ItemSketch => "__isk", + } + } + + /// Human-readable type name matching Go `TypeID.String()`. 
+ pub fn as_str(self) -> &'static str { + match self { + Self::Int64 => "dense-int64", + Self::Float64 => "dense-float64", + Self::String => "dense-string", + Self::Sketch => "dense-sketch", + Self::CpcSketch => "dense-cpc-sketch", + Self::ItemSketch => "dense-item-sketch", + } + } + + /// Resolve column type from a column name, stripping any type suffix. + /// + /// Returns `(bare_name, type)`. Type resolution order: + /// 1. Explicit suffix (`__s`, `__i`, `__nf`, etc.) — stripped, type from suffix + /// 2. Well-known bare name defaults: + /// - `timestamp`, `tiebreaker`, `timeseries_id` → Int64 + /// - `metric_value` → Float64 + /// - everything else → String + pub fn from_column_name(name: &str) -> Result<(&str, Self), SortFieldsError> { + // Try explicit suffixes first (longest match first to avoid ambiguity). + if let Some(bare) = name.strip_suffix("__isk") { + return Ok((bare, Self::ItemSketch)); + } + if let Some(bare) = name.strip_suffix("__cpcsk") { + return Ok((bare, Self::CpcSketch)); + } + if let Some(bare) = name.strip_suffix("__sk") { + return Ok((bare, Self::Sketch)); + } + if let Some(bare) = name.strip_suffix("__nf") { + return Ok((bare, Self::Float64)); + } + if let Some(bare) = name.strip_suffix("__i") { + return Ok((bare, Self::Int64)); + } + if let Some(bare) = name.strip_suffix("__s") { + return Ok((bare, Self::String)); + } + + // No suffix — use well-known name defaults. + Ok((name, default_type_for_name(name))) + } +} + +/// Default column type and sort direction for a bare column name. +/// +/// This is the single source of truth for well-known column defaults. +/// Used by the parser (type inference, default direction), display +/// (suffix omission, direction omission), and validation. +pub struct ColumnDefaults { + pub column_type: ColumnTypeId, + /// True if the default sort direction is descending. + pub descending: bool, +} + +/// Well-known name → default type and sort direction lookup table. 
+/// +/// Columns not in this table default to String, ascending. +static WELL_KNOWN_COLUMNS: &[(&str, ColumnDefaults)] = &[ + ( + TIMESTAMP, + ColumnDefaults { + column_type: ColumnTypeId::Int64, + descending: true, + }, + ), + ( + "timestamp_secs", + ColumnDefaults { + column_type: ColumnTypeId::Int64, + descending: true, + }, + ), + ( + TIEBREAKER, + ColumnDefaults { + column_type: ColumnTypeId::Int64, + descending: false, + }, + ), + ( + TIMESERIES_ID, + ColumnDefaults { + column_type: ColumnTypeId::Int64, + descending: false, + }, + ), + ( + METRIC_VALUE, + ColumnDefaults { + column_type: ColumnTypeId::Float64, + descending: false, + }, + ), + ( + "value", + ColumnDefaults { + column_type: ColumnTypeId::Float64, + descending: false, + }, + ), +]; + +const DEFAULT_COLUMN: ColumnDefaults = ColumnDefaults { + column_type: ColumnTypeId::String, + descending: false, +}; + +/// Look up default type and direction for a bare column name. +pub fn column_defaults(name: &str) -> &'static ColumnDefaults { + WELL_KNOWN_COLUMNS + .iter() + .find(|(n, _)| *n == name) + .map(|(_, d)| d) + .unwrap_or(&DEFAULT_COLUMN) +} + +/// Default column type for a bare name (convenience wrapper). +pub fn default_type_for_name(name: &str) -> ColumnTypeId { + column_defaults(name).column_type +} + +/// Whether this bare name defaults to descending sort. +pub fn default_is_descending(name: &str) -> bool { + column_defaults(name).descending +} + +impl std::fmt::Display for ColumnTypeId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +/// Parse a type name string (e.g., "dense-int64") into a `ColumnTypeId`. 
+impl FromStr for ColumnTypeId {
+    type Err = SortFieldsError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "dense-int64" => Ok(Self::Int64),
+            "dense-float64" => Ok(Self::Float64),
+            "dense-string" => Ok(Self::String),
+            "dense-sketch" => Ok(Self::Sketch),
+            "dense-cpc-sketch" => Ok(Self::CpcSketch),
+            "dense-item-sketch" => Ok(Self::ItemSketch),
+            _ => Err(SortFieldsError::UnknownColumnType(format!(
+                "unknown column type '{}'",
+                s
+            ))),
+        }
+    }
+}
+
+/// Convert a proto `column_type` u64 back to a `ColumnTypeId`.
+impl TryFrom<u64> for ColumnTypeId {
+    type Error = SortFieldsError;
+
+    fn try_from(value: u64) -> Result<Self, Self::Error> {
+        match value {
+            2 => Ok(Self::Int64),
+            10 => Ok(Self::Float64),
+            14 => Ok(Self::String),
+            17 => Ok(Self::Sketch),
+            20 => Ok(Self::CpcSketch),
+            22 => Ok(Self::ItemSketch),
+            _ => Err(SortFieldsError::UnknownColumnType(format!(
+                "unknown column type id: {}",
+                value
+            ))),
+        }
+    }
+}
diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/display.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/display.rs
new file mode 100644
index 00000000000..8a6d74f482a
--- /dev/null
+++ b/quickwit/quickwit-parquet-engine/src/sort_fields/display.rs
@@ -0,0 +1,124 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Sort schema to string serialization -- direct port of Go `SchemaToString` and
+//! `SchemaToStringShort`.
+//!
+//!
The proto `SortColumn.name` stores the bare Parquet column name (no type suffix). +//! These functions reconstruct the Husky-format suffixed name for serialization +//! using `SortColumn.column_type` to determine the suffix. + +use quickwit_proto::sortschema::{SortColumn, SortColumnDirection, SortSchema}; + +use super::column_type::{ColumnTypeId, default_is_descending, default_type_for_name}; + +fn direction_str(sort_direction: i32) -> &'static str { + match SortColumnDirection::try_from(sort_direction) { + Ok(SortColumnDirection::SortDirectionAscending) => ":+", + Ok(SortColumnDirection::SortDirectionDescending) => ":-", + _ => ":???", + } +} + +fn type_str(column_type: u64) -> &'static str { + match ColumnTypeId::try_from(column_type) { + Ok(ct) => ct.as_str(), + Err(_) => "unknown", + } +} + +/// Reconstruct the column name for the Husky string format. +/// +/// Only appends the type suffix when the column's type differs from the +/// default for its bare name. This keeps the string short and readable: +/// `metric_name` (default String) → no suffix needed +/// `timestamp` (default Int64) → no suffix needed +/// `my_counter__i` → suffix needed (Int64 differs from default String) +fn display_name(col: &SortColumn) -> String { + let col_type = match ColumnTypeId::try_from(col.column_type) { + Ok(ct) => ct, + Err(_) => return col.name.clone(), + }; + let default_type = default_type_for_name(&col.name); + if col_type == default_type { + col.name.clone() + } else { + format!("{}{}", col.name, col_type.suffix()) + } +} + +/// Convert a `SortSchema` to its full string representation. +/// +/// Format: `[name=]column__suffix:type:+/-[|...][/V#]` +/// +/// Direct port of Go `SchemaToString`. +pub fn schema_to_string(schema: &SortSchema) -> String { + schema_to_string_inner(schema, true) +} + +/// Convert a `SortSchema` to its short string representation. 
+/// +/// Format: `[name=]column__suffix[|...][/V#]` +/// +/// Omits the explicit type and skips the sort direction when it matches the +/// default (ascending for non-timestamp, descending for timestamp). +/// +/// Direct port of Go `SchemaToStringShort`. +pub fn schema_to_string_short(schema: &SortSchema) -> String { + schema_to_string_inner(schema, false) +} + +/// Shared implementation for both full and short schema string formats. +/// +/// When `verbose` is true, includes the explicit type and always emits direction. +/// When `verbose` is false, omits type and skips direction when it matches the default. +fn schema_to_string_inner(schema: &SortSchema, verbose: bool) -> String { + let mut rv = String::new(); + + if !schema.name.is_empty() { + rv.push_str(&schema.name); + rv.push('='); + } + + for (i, col) in schema.column.iter().enumerate() { + if i > 0 { + rv.push('|'); + } + if schema.lsm_comparison_cutoff > 0 && i == schema.lsm_comparison_cutoff as usize { + rv.push('&'); + } + rv.push_str(&display_name(col)); + + if verbose { + rv.push(':'); + rv.push_str(type_str(col.column_type)); + rv.push_str(direction_str(col.sort_direction)); + } else { + let is_default_direction = if default_is_descending(&col.name) { + col.sort_direction == SortColumnDirection::SortDirectionDescending as i32 + } else { + col.sort_direction == SortColumnDirection::SortDirectionAscending as i32 + }; + if !is_default_direction { + rv.push_str(direction_str(col.sort_direction)); + } + } + } + + if schema.sort_version > 0 { + rv.push_str(&format!("/V{}", schema.sort_version)); + } + + rv +} diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/equivalence.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/equivalence.rs new file mode 100644 index 00000000000..403ec9fd135 --- /dev/null +++ b/quickwit/quickwit-parquet-engine/src/sort_fields/equivalence.rs @@ -0,0 +1,62 @@ +// Copyright 2021-Present Datadog, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Sort schema equivalence comparison -- direct port of Go `EquivalentSchemas` +//! and `EquivalentSchemasForCompaction`. + +use quickwit_proto::sortschema::SortSchema; + +/// Base comparison: checks sort_version and all columns match (name, type, direction). +/// +/// Hand-rolled comparison (not proto equality) because the Go compactor calls this +/// in tight loops on 10s-100s of thousands of fragments and proto.Equal allocates. +fn equivalent_schemas_base(a: &SortSchema, b: &SortSchema) -> bool { + if a.sort_version != b.sort_version { + return false; + } + if a.column.len() != b.column.len() { + return false; + } + for (a_col, b_col) in a.column.iter().zip(b.column.iter()) { + if a_col.name != b_col.name { + return false; + } + if a_col.column_type != b_col.column_type { + return false; + } + if a_col.sort_direction != b_col.sort_direction { + return false; + } + } + true +} + +/// Check if two schemas are equivalent, ignoring names and versioned schema. +/// +/// Compares columns, sort_version, and `lsm_comparison_cutoff`. +/// +/// Direct port of Go `EquivalentSchemas`. +pub fn equivalent_schemas(a: &SortSchema, b: &SortSchema) -> bool { + equivalent_schemas_base(a, b) && a.lsm_comparison_cutoff == b.lsm_comparison_cutoff +} + +/// Check if two schemas are equivalent for compaction purposes. 
+/// +/// Same as `equivalent_schemas` but ignores `lsm_comparison_cutoff`, providing +/// backward compatibility when old cplanners send steps without LSM cutoff info. +/// +/// Direct port of Go `EquivalentSchemasForCompaction`. +pub fn equivalent_schemas_for_compaction(a: &SortSchema, b: &SortSchema) -> bool { + equivalent_schemas_base(a, b) +} diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/mod.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/mod.rs new file mode 100644 index 00000000000..3043531c557 --- /dev/null +++ b/quickwit/quickwit-parquet-engine/src/sort_fields/mod.rs @@ -0,0 +1,52 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Sort fields types, parsing, and time-window arithmetic for metrics compaction. +//! +//! The sort fields define how rows are ordered within a Parquet split, +//! which determines merge and compaction behavior. The parser is a direct +//! port of Husky's Go `schemautils.go` for cross-system interoperability. + +// TableConfig.effective_sort_fields() is wired into ParquetWriter at construction +// time. The writer resolves sort field names to physical ParquetField columns; +// columns not yet in the schema (e.g., timeseries_id) are skipped during sort +// but recorded in metadata. +// +// TODO(Phase 32): Wire per-index TableConfig into IndexConfig so each index can +// override the default sort fields. 
Currently all metrics indexes use +// ProductType::Metrics default. +// +// When accepting user-supplied sort_fields for metrics indexes, validation MUST +// reject schemas that do not include timeseries_id__i immediately before timestamp. +// The timeseries_id tiebreaker is mandatory for metrics to ensure deterministic +// sort order across splits with identical tag combinations. + +pub mod column_type; +pub mod display; +pub mod equivalence; +pub mod parser; +pub mod validation; +pub mod window; + +#[cfg(test)] +mod tests; + +// Public API re-exports. +pub use column_type::ColumnTypeId; +pub use display::{schema_to_string, schema_to_string_short}; +pub use equivalence::{equivalent_schemas, equivalent_schemas_for_compaction}; +pub use parser::parse_sort_fields; +pub use quickwit_proto::SortFieldsError; +pub use validation::validate_schema; +pub use window::{validate_window_duration, window_start}; diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/parser.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/parser.rs new file mode 100644 index 00000000000..77b11757b29 --- /dev/null +++ b/quickwit/quickwit-parquet-engine/src/sort_fields/parser.rs @@ -0,0 +1,338 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Sort schema string parser -- direct port of Go `StringToSchema`. +//! +//! Parses Husky-style sort schema strings like: +//! `"metric_name|host|env|timeseries_id|timestamp/V2"` +//! 
into a `SortSchema` proto with correct column names, types, directions, and version. + +use quickwit_proto::sortschema::{SortColumn, SortColumnDirection, SortSchema}; + +use super::SortFieldsError; +use super::column_type::{ColumnTypeId, default_is_descending}; +use super::validation::validate_schema; + +/// The minimum sort version we accept. V0 (INCORRECT_TRIM) and V1 (TRIMMED_WITH_BUDGET) +/// are rejected per the strict V2-only decision. +const MINIMUM_SORT_VERSION: i32 = 2; + +/// Parse a sort schema string into a `SortSchema` proto. +/// +/// Direct port of Go `StringToSchema`. Accepts the format: +/// `[name=]column[|column...][/V#]` +/// +/// Each column can be: +/// - `name` (1-part): infer type from suffix, default direction +/// - `+name` or `name+` (1-part with prefix/suffix direction) +/// - `name:+/-` (2-part): infer type from suffix, explicit direction +/// - `name:type:+/-` (3-part): explicit type + verify matches suffix +/// +/// Direction (`+`/`-`) may appear as a prefix (`+name`), suffix (`name+`), +/// or after a colon (`name:+`). It is an error for direction to appear in +/// more than one position for a given column (e.g., `+name+` or `+name:+`). +/// +/// The `&` marker before a column name indicates the LSM comparison cutoff. +/// +/// **V2-only enforcement**: Only sort_version >= 2 is accepted. Unversioned +/// strings (default to 0), V0, and V1 are rejected. +pub fn parse_sort_fields(s: &str) -> Result { + let mut schema = SortSchema::default(); + let mut input = s; + + // Split on `=` for optional name prefix. + if let Some((name, rest)) = split_once_max2(input, '=', s)? { + schema.name = name.to_string(); + input = rest; + } + + // Split on `/` for version suffix. 
+ let sort_version = parse_version_suffix(&mut input, s)?; + if sort_version < MINIMUM_SORT_VERSION { + return Err(SortFieldsError::UnsupportedVersion { + version: sort_version, + minimum: MINIMUM_SORT_VERSION, + }); + } + schema.sort_version = sort_version; + + // Parse columns. + let mut cutoff_marker_count = 0; + + for (i, col_str) in input.split('|').enumerate() { + let col_remaining = parse_cutoff_marker(col_str, i, &mut cutoff_marker_count, &mut schema)?; + + let (prefix_dir, after_prefix) = strip_direction_prefix(col_remaining); + let parts: Vec<&str> = after_prefix.split(':').collect(); + + let column = match parts.len() { + 3 => parse_3part(parts, prefix_dir, col_str)?, + 2 => parse_2part(parts, prefix_dir, col_str)?, + 1 => { + schema + .column + .push(parse_1part(parts[0], prefix_dir, col_str)?); + continue; + } + _ => { + return Err(SortFieldsError::InvalidColumnFormat(format!( + "columns should be of the form 'name:type:+/-' or 'name:+/-' or 'name', \ + found: {}", + col_str + ))); + } + }; + + schema.column.push(column); + } + + if cutoff_marker_count > 0 && schema.column.len() < 2 { + return Err(SortFieldsError::InvalidCutoffPlacement( + "LSM cutoff marker (&) requires at least 2 columns".to_string(), + )); + } + + validate_schema(&schema)?; + Ok(schema) +} + +/// Parse the `/V#` version suffix, updating `input` to point at the columns portion. +/// Returns 0 if no version suffix is present. +fn parse_version_suffix(input: &mut &str, original: &str) -> Result { + let Some((columns, version_str)) = split_once_max2(input, '/', original)? 
else { + return Ok(0); + }; + let version_str = version_str.strip_prefix('V').ok_or_else(|| { + SortFieldsError::BadSortVersion(format!( + "mal-formatted sort schema '{}' -- bad sort version", + *input + )) + })?; + let version = version_str.parse::().map_err(|_| { + SortFieldsError::BadSortVersion(format!( + "mal-formatted sort schema '{}' parsing sort version", + *input + )) + })?; + *input = columns; + Ok(version) +} + +/// Handle the `&` LSM cutoff marker at the start of a column string. +/// Returns the remaining column string after stripping `&`. +fn parse_cutoff_marker<'a>( + col_str: &'a str, + column_index: usize, + cutoff_count: &mut usize, + schema: &mut SortSchema, +) -> Result<&'a str, SortFieldsError> { + let Some(rest) = col_str.strip_prefix('&') else { + if col_str.contains('&') { + return Err(SortFieldsError::MalformedSchema(format!( + "LSM cutoff marker (&) must appear at the beginning of column name, found in \ + middle of: {}", + col_str + ))); + } + return Ok(col_str); + }; + + *cutoff_count += 1; + if *cutoff_count > 1 { + return Err(SortFieldsError::InvalidCutoffPlacement( + "only one LSM cutoff marker (&) is allowed per schema".to_string(), + )); + } + schema.lsm_comparison_cutoff = column_index as i32; + if rest.is_empty() { + return Err(SortFieldsError::InvalidCutoffPlacement( + "LSM cutoff marker (&) must be followed by a valid column name".to_string(), + )); + } + if column_index == 0 { + return Err(SortFieldsError::InvalidCutoffPlacement( + "LSM cutoff marker (&) cannot be used on the first column as it would ignore all \ + columns" + .to_string(), + )); + } + if rest.contains('&') { + return Err(SortFieldsError::MalformedSchema(format!( + "LSM cutoff marker (&) must appear at the beginning of column name, found in middle \ + of: {}", + rest + ))); + } + Ok(rest) +} + +/// Resolve bare name and type from a column name string via `ColumnTypeId::from_column_name`. 
+fn resolve_name_type(name: &str) -> Result<(&str, ColumnTypeId), SortFieldsError> { + ColumnTypeId::from_column_name(name).map_err(|_| { + SortFieldsError::UnknownColumnType(format!( + "error determining type for column {} from suffix", + name + )) + }) +} + +/// Parse a 3-part column: `name__suffix:type:+/-`. +fn parse_3part( + parts: Vec<&str>, + prefix_dir: Option, + col_str: &str, +) -> Result { + let explicit_type: ColumnTypeId = parts[1].parse().map_err(|_| { + SortFieldsError::UnknownColumnType(format!( + "error determining type for column {}: unknown type '{}'", + parts[0], parts[1] + )) + })?; + let (bare_name, suffix_type) = resolve_name_type(parts[0])?; + if explicit_type != suffix_type { + return Err(SortFieldsError::TypeMismatch { + column: parts[0].to_string(), + from_suffix: suffix_type.to_string(), + explicit: explicit_type.to_string(), + }); + } + let colon_dir = parse_direction(parts[2])?; + if prefix_dir.is_some() { + return Err(SortFieldsError::DuplicateDirection(col_str.to_string())); + } + Ok(SortColumn { + name: bare_name.to_string(), + column_type: explicit_type as u64, + sort_direction: colon_dir, + }) +} + +/// Parse a 2-part column: `name__suffix:+/-`. +fn parse_2part( + parts: Vec<&str>, + prefix_dir: Option, + col_str: &str, +) -> Result { + // Reject direction suffix embedded in the name part: `name-:-` has + // direction in both the name suffix and the colon position. 
+ let (embedded_suffix_dir, name_without_suffix) = strip_direction_suffix(parts[0]); + if embedded_suffix_dir.is_some() { + return Err(SortFieldsError::DuplicateDirection(col_str.to_string())); + } + let (bare_name, col_type) = resolve_name_type(name_without_suffix)?; + let colon_dir = parse_direction(parts[1])?; + if prefix_dir.is_some() { + return Err(SortFieldsError::DuplicateDirection(col_str.to_string())); + } + Ok(SortColumn { + name: bare_name.to_string(), + column_type: col_type as u64, + sort_direction: colon_dir, + }) +} + +/// Parse a 1-part column: `name__suffix` with optional direction prefix/suffix. +fn parse_1part( + part: &str, + prefix_dir: Option, + col_str: &str, +) -> Result { + let (suffix_dir, suffixed_name) = strip_direction_suffix(part); + if prefix_dir.is_some() && suffix_dir.is_some() { + return Err(SortFieldsError::DuplicateDirection(col_str.to_string())); + } + let (bare_name, col_type) = resolve_name_type(suffixed_name)?; + let direction = prefix_dir.or(suffix_dir).unwrap_or_else(|| { + if default_is_descending(bare_name) { + SortColumnDirection::SortDirectionDescending as i32 + } else { + SortColumnDirection::SortDirectionAscending as i32 + } + }); + Ok(SortColumn { + name: bare_name.to_string(), + column_type: col_type as u64, + sort_direction: direction, + }) +} + +/// Split `input` on the first `sep`, returning None if no separator. +/// Errors if there are more than 2 parts (i.e., multiple separators). 
+fn split_once_max2<'a>( + input: &'a str, + sep: char, + original: &str, +) -> Result, SortFieldsError> { + let mut iter = input.splitn(3, sep); + let first = iter.next().unwrap(); // always present + let second = match iter.next() { + Some(s) => s, + None => return Ok(None), + }; + if iter.next().is_some() { + return Err(SortFieldsError::MalformedSchema(format!( + "mal-formatted sort schema '{}'", + original + ))); + } + Ok(Some((first, second))) +} + +/// Strip a leading `+` or `-` from a string, returning the direction and remainder. +fn strip_direction_prefix(s: &str) -> (Option, &str) { + if let Some(rest) = s.strip_prefix('+') { + ( + Some(SortColumnDirection::SortDirectionAscending as i32), + rest, + ) + } else if let Some(rest) = s.strip_prefix('-') { + ( + Some(SortColumnDirection::SortDirectionDescending as i32), + rest, + ) + } else { + (None, s) + } +} + +/// Strip a trailing `+` or `-` from a string, returning the direction and trimmed name. +fn strip_direction_suffix(s: &str) -> (Option, &str) { + if s.len() > 1 { + if let Some(rest) = s.strip_suffix('+') { + return ( + Some(SortColumnDirection::SortDirectionAscending as i32), + rest, + ); + } + if let Some(rest) = s.strip_suffix('-') { + return ( + Some(SortColumnDirection::SortDirectionDescending as i32), + rest, + ); + } + } + (None, s) +} + +/// Parse a sort direction string ("+" or "-") into the proto enum value. 
+fn parse_direction(s: &str) -> Result { + match s { + "+" => Ok(SortColumnDirection::SortDirectionAscending as i32), + "-" => Ok(SortColumnDirection::SortDirectionDescending as i32), + _ => Err(SortFieldsError::UnknownSortDirection(format!( + "unknown sort direction '{}'", + s + ))), + } +} diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/tests.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/tests.rs new file mode 100644 index 00000000000..ee8e188916f --- /dev/null +++ b/quickwit/quickwit-parquet-engine/src/sort_fields/tests.rs @@ -0,0 +1,963 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Test suite ported from Go `schemautils_test.go` plus strict V2-only enforcement tests. 
+ +use quickwit_proto::sortschema::{SortColumnDirection, SortSchema}; + +use super::column_type::ColumnTypeId; +use super::display::{schema_to_string, schema_to_string_short}; +use super::equivalence::{equivalent_schemas, equivalent_schemas_for_compaction}; +use super::parser::parse_sort_fields; + +// --------------------------------------------------------------------------- +// Helper +// --------------------------------------------------------------------------- + +fn must_parse(s: &str) -> SortSchema { + parse_sort_fields(s).unwrap_or_else(|e| panic!("failed to parse '{}': {}", s, e)) +} + +// --------------------------------------------------------------------------- +// Strict V2-only enforcement tests +// --------------------------------------------------------------------------- + +#[test] +fn test_v2_only_rejects_unversioned() { + // No version suffix defaults to version 0 -> rejected. + let err = parse_sort_fields("timestamp").unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("unsupported") || msg.contains("version"), + "expected unsupported version error, got: {}", + msg + ); +} + +#[test] +fn test_v2_only_rejects_v0() { + let err = parse_sort_fields("timestamp/V0").unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("unsupported") || msg.contains("version"), + "expected unsupported version error, got: {}", + msg + ); +} + +#[test] +fn test_v2_only_rejects_v1() { + let err = parse_sort_fields("timestamp/V1").unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("unsupported") || msg.contains("version"), + "expected unsupported version error, got: {}", + msg + ); +} + +#[test] +fn test_v2_only_accepts_v2() { + let schema = must_parse("timestamp/V2"); + assert_eq!(schema.sort_version, 2); + assert_eq!(schema.column.len(), 1); + assert_eq!(schema.column[0].name, "timestamp"); +} + +#[test] +fn test_v2_only_accepts_v3() { + let schema = must_parse("timestamp/V3"); + assert_eq!(schema.sort_version, 3); +} + 
+// --------------------------------------------------------------------------- +// Port of Go TestStringToSchema -- error paths +// --------------------------------------------------------------------------- + +#[test] +fn test_string_to_schema_mangled() { + // Mangled schema with multiple `=` + assert!(parse_sort_fields("a=b=c/V2").is_err(), "must disallow >1 ="); +} + +#[test] +fn test_string_to_schema_invalid_column_format() { + assert!( + parse_sort_fields("name:dense-int64:+:what-is-this?/V2").is_err(), + "must disallow invalid column formats" + ); + assert!( + parse_sort_fields("name:dense-int64:+:what-is-this?:really?/V2").is_err(), + "must disallow invalid column formats" + ); +} + +#[test] +fn test_string_to_schema_3part_errors() { + // Invalid type name. + assert!( + parse_sort_fields("name__i:invalid-type:+|timestamp/V2").is_err(), + "must disallow an invalid type" + ); + // Type mismatch: suffix says int64 but explicit says string. + assert!( + parse_sort_fields("name__i:dense-string:+|timestamp/V2").is_err(), + "must disallow mismatch between type suffix and explicit type" + ); + // Invalid sort direction. + assert!( + parse_sort_fields("name__i:dense-int64:invalid-sort-direction|timestamp/V2").is_err(), + "must disallow an invalid sort direction" + ); +} + +#[test] +fn test_string_to_schema_2part_errors() { + // `name__x` no longer errors: unknown suffixes are treated as bare column names + // with default type (String), so `name__x` is a valid column name. + assert!( + parse_sort_fields("name__x:+|timestamp/V2").is_ok(), + "bare column names with unknown suffix-like patterns are now valid" + ); + // Invalid sort direction. + assert!( + parse_sort_fields("name__i:invalid-sort-direction|timestamp/V2").is_err(), + "must disallow an invalid sort direction" + ); +} + +#[test] +fn test_string_to_schema_missing_timestamp() { + // Semantically invalid: missing timestamp column. 
+ assert!( + parse_sort_fields("name__i:dense-int64:+/V2").is_err(), + "must disallow schema with missing timestamp column" + ); +} + +#[test] +fn test_string_to_schema_bad_version() { + // `/X` is not a valid version specification. + assert!( + parse_sort_fields("timestamp/X").is_err(), + "/X isn't a valid version specification" + ); + // `/VX` -- V followed by non-numeric. + assert!( + parse_sort_fields("timestamp/VX").is_err(), + "/VX isn't a valid version specification" + ); +} + +// --------------------------------------------------------------------------- +// LSM cutoff marker error paths (from Go TestStringToSchema) +// --------------------------------------------------------------------------- + +#[test] +fn test_lsm_cutoff_multiple_markers() { + assert!( + parse_sort_fields("service__s|&env__s|&source__s|timestamp/V2").is_err(), + "must disallow multiple LSM cutoff markers" + ); +} + +#[test] +fn test_lsm_cutoff_double_ampersand() { + assert!( + parse_sort_fields("service__s|&&env__s|timestamp/V2").is_err(), + "must disallow multiple consecutive LSM cutoff markers" + ); +} + +#[test] +fn test_lsm_cutoff_empty_after_marker() { + assert!( + parse_sort_fields("service__s|&|timestamp/V2").is_err(), + "must disallow empty column name after LSM cutoff marker" + ); +} + +#[test] +fn test_lsm_cutoff_in_middle() { + assert!( + parse_sort_fields("service__s|env&__s|timestamp/V2").is_err(), + "must disallow LSM cutoff marker in middle of column name" + ); +} + +#[test] +fn test_lsm_cutoff_single_column() { + assert!( + parse_sort_fields("×tamp/V2").is_err(), + "must disallow LSM cutoff marker on single column schema" + ); +} + +#[test] +fn test_lsm_cutoff_first_column() { + assert!( + parse_sort_fields("&service__s|env__s|timestamp/V2").is_err(), + "must disallow LSM cutoff marker on first column" + ); +} + +// --------------------------------------------------------------------------- +// Port of Go TestStringToSchema -- valid schemas +// 
--------------------------------------------------------------------------- + +#[test] +fn test_string_to_schema_timestamp_only() { + let s = must_parse("timestamp/V2"); + assert_eq!(s.column.len(), 1); + assert_eq!(s.column[0].name, "timestamp"); + assert_eq!(s.column[0].column_type, ColumnTypeId::Int64 as u64); + assert_eq!( + s.column[0].sort_direction, + SortColumnDirection::SortDirectionDescending as i32 + ); +} + +#[test] +fn test_string_to_schema_named_timestamp() { + let s = must_parse("defaultTimestampSchema=timestamp/V2"); + assert_eq!(s.name, "defaultTimestampSchema"); + assert_eq!(s.column.len(), 1); + assert_eq!(s.column[0].name, "timestamp"); + assert_eq!( + s.column[0].sort_direction, + SortColumnDirection::SortDirectionDescending as i32 + ); +} + +#[test] +fn test_string_to_schema_named_timestamp_explicit_direction() { + let s = must_parse("defaultTimestampSchema=timestamp:-/V2"); + assert_eq!(s.name, "defaultTimestampSchema"); + assert_eq!(s.column[0].name, "timestamp"); + assert_eq!( + s.column[0].sort_direction, + SortColumnDirection::SortDirectionDescending as i32 + ); +} + +#[test] +fn test_string_to_schema_named_timestamp_explicit_type() { + let s = must_parse("defaultTimestampSchema=timestamp:dense-int64:-/V2"); + assert_eq!(s.name, "defaultTimestampSchema"); + assert_eq!(s.column[0].name, "timestamp"); + assert_eq!(s.column[0].column_type, ColumnTypeId::Int64 as u64); + assert_eq!( + s.column[0].sort_direction, + SortColumnDirection::SortDirectionDescending as i32 + ); +} + +#[test] +fn test_string_to_schema_multi_column() { + let s = must_parse("testSchema=columnA__s|columnB__i:-|timestamp/V2"); + assert_eq!(s.name, "testSchema"); + assert_eq!(s.sort_version, 2); + assert_eq!(s.column.len(), 3); + + // columnA__s: string, ascending (default) + assert_eq!(s.column[0].name, "columnA"); + assert_eq!(s.column[0].column_type, ColumnTypeId::String as u64); + assert_eq!( + s.column[0].sort_direction, + SortColumnDirection::SortDirectionAscending 
as i32 + ); + + // columnB__i: int64, descending (explicit) + assert_eq!(s.column[1].name, "columnB"); + assert_eq!(s.column[1].column_type, ColumnTypeId::Int64 as u64); + assert_eq!( + s.column[1].sort_direction, + SortColumnDirection::SortDirectionDescending as i32 + ); + + // timestamp: int64, descending (default) + assert_eq!(s.column[2].name, "timestamp"); + assert_eq!(s.column[2].column_type, ColumnTypeId::Int64 as u64); + assert_eq!( + s.column[2].sort_direction, + SortColumnDirection::SortDirectionDescending as i32 + ); +} + +#[test] +fn test_string_to_schema_multi_column_explicit_type() { + let s = must_parse("testSchema=columnA__s:dense-string:+|columnB__i:-|timestamp/V2"); + assert_eq!(s.column[0].column_type, ColumnTypeId::String as u64); + assert_eq!( + s.column[0].sort_direction, + SortColumnDirection::SortDirectionAscending as i32 + ); +} + +// --------------------------------------------------------------------------- +// SchemaToString and SchemaToStringShort +// --------------------------------------------------------------------------- + +#[test] +fn test_schema_to_string_full() { + let s = must_parse("testSchema=columnA__s|columnB__i:-|timestamp/V2"); + let full = schema_to_string(&s); + // columnA has type String == default for a generic name, so no suffix in output. + // columnB has type Int64 != default String, so __i suffix is preserved. + assert_eq!( + full, + "testSchema=columnA:dense-string:+|columnB__i:dense-int64:-|timestamp:dense-int64:-/V2" + ); +} + +#[test] +fn test_schema_to_string_short() { + let s = must_parse("testSchema=columnA__s|columnB__i:-|timestamp/V2"); + let short = schema_to_string_short(&s); + // columnA has type String == default, so no suffix. columnB keeps __i (Int64 != default). 
+ assert_eq!(short, "testSchema=columnA|columnB__i:-|timestamp/V2"); +} + +// --------------------------------------------------------------------------- +// Round-trip tests +// --------------------------------------------------------------------------- + +#[test] +fn test_round_trip_short_form() { + // Inputs that round-trip to themselves: bare names (no suffix) and non-default + // typed columns already serialize without a suffix change. + let exact_round_trip_cases = [ + "timestamp/V2", + // `service` with no suffix: default String, serializes as `service` (no suffix). + "service|timestamp/V2", + "service|env|timestamp/V2", + // columnB__i: Int64 != default String, keeps __i suffix. + "testSchema=columnA|columnB__i:-|timestamp/V2", + ]; + for input in exact_round_trip_cases { + let parsed = must_parse(input); + let short = schema_to_string_short(&parsed); + assert_eq!(short, input, "short round-trip failed for '{}'", input); + + // Also verify full round-trip: parse(to_string(schema)) == schema. + let full = schema_to_string(&parsed); + let reparsed = must_parse(&full); + assert_eq!( + parsed.column.len(), + reparsed.column.len(), + "column count mismatch after full round-trip for '{}'", + input + ); + for (a, b) in parsed.column.iter().zip(reparsed.column.iter()) { + assert_eq!(a.name, b.name); + assert_eq!(a.column_type, b.column_type); + assert_eq!(a.sort_direction, b.sort_direction); + } + assert_eq!(parsed.sort_version, reparsed.sort_version); + assert_eq!(parsed.lsm_comparison_cutoff, reparsed.lsm_comparison_cutoff); + } + + // Inputs with explicit __s suffix on default-String columns: the short form drops + // the suffix (it is not needed), but the semantic content is preserved. 
+ let semantic_round_trip_cases = [ + ("service__s|timestamp/V2", "service|timestamp/V2"), + ("service__s|env__s|timestamp/V2", "service|env|timestamp/V2"), + ( + "testSchema=columnA__s|columnB__i:-|timestamp/V2", + "testSchema=columnA|columnB__i:-|timestamp/V2", + ), + ]; + for (input, expected_short) in semantic_round_trip_cases { + let parsed = must_parse(input); + let short = schema_to_string_short(&parsed); + assert_eq!(short, expected_short, "short form mismatch for '{}'", input); + + // Verify semantic round-trip: parse the short output and compare protos. + let reparsed = must_parse(&short); + assert_eq!( + parsed.column.len(), + reparsed.column.len(), + "column count mismatch after semantic round-trip for '{}'", + input + ); + for (a, b) in parsed.column.iter().zip(reparsed.column.iter()) { + assert_eq!(a.name, b.name); + assert_eq!(a.column_type, b.column_type); + assert_eq!(a.sort_direction, b.sort_direction); + } + assert_eq!(parsed.sort_version, reparsed.sort_version); + assert_eq!(parsed.lsm_comparison_cutoff, reparsed.lsm_comparison_cutoff); + } +} + +// --------------------------------------------------------------------------- +// Port of Go TestEquivalentSchemas +// --------------------------------------------------------------------------- + +#[test] +fn test_equivalent_schemas_identical() { + let a = must_parse("timestamp/V2"); + let b = must_parse("timestamp/V2"); + assert!(equivalent_schemas(&a, &b)); + assert!(equivalent_schemas_for_compaction(&a, &b)); +} + +#[test] +fn test_equivalent_schemas_different_column_counts() { + let a = must_parse("service__s|timestamp/V2"); + let b = must_parse("timestamp/V2"); + assert!(!equivalent_schemas(&a, &b)); + assert!(!equivalent_schemas_for_compaction(&a, &b)); +} + +#[test] +fn test_equivalent_schemas_same_columns_different_names() { + let a = must_parse("service__s|timestamp/V2"); + let b = must_parse("serviceSchema=service__s|timestamp/V2"); + assert!(equivalent_schemas(&a, &b)); + 
assert!(equivalent_schemas_for_compaction(&a, &b)); +} + +#[test] +fn test_equivalent_schemas_different_column_order() { + let a = must_parse("env__s|service__s|timestamp/V2"); + let b = must_parse("serviceSchema=service__s|timestamp/V2"); + assert!(!equivalent_schemas(&a, &b)); + assert!(!equivalent_schemas_for_compaction(&a, &b)); +} + +#[test] +fn test_equivalent_schemas_different_versions() { + let a = must_parse("service__s|timestamp/V2"); + let b = must_parse("serviceSchema=service__s|timestamp/V3"); + assert!(!equivalent_schemas(&a, &b)); + assert!(!equivalent_schemas_for_compaction(&a, &b)); +} + +#[test] +fn test_equivalent_schemas_different_lsm_cutoffs() { + // Different LSM cutoffs: affects EquivalentSchemas but NOT EquivalentSchemasForCompaction. + let a = must_parse("service__s|&env__s|timestamp/V2"); + let b = must_parse("service__s|env__s|timestamp/V2"); + assert!( + !equivalent_schemas(&a, &b), + "different LSM cutoffs should not be equivalent" + ); + assert!( + equivalent_schemas_for_compaction(&a, &b), + "different LSM cutoffs should be equivalent for compaction" + ); +} + +#[test] +fn test_equivalent_schemas_different_cutoff_positions() { + let a = must_parse("service__s|&env__s|timestamp/V2"); + let b = must_parse("service__s|env__s|×tamp/V2"); + assert!(!equivalent_schemas(&a, &b)); + assert!(equivalent_schemas_for_compaction(&a, &b)); +} + +#[test] +fn test_equivalent_schemas_identical_lsm_cutoffs() { + let a = must_parse("service__s|&env__s|timestamp/V2"); + let b = must_parse("service__s|&env__s|timestamp/V2"); + assert!(equivalent_schemas(&a, &b)); + assert!(equivalent_schemas_for_compaction(&a, &b)); +} + +// --------------------------------------------------------------------------- +// Port of Go TestStringToSchemaWithLSMCutoff +// --------------------------------------------------------------------------- + +#[test] +fn test_lsm_cutoff_after_first_column() { + let s = must_parse("service__s|&env__s|timestamp/V2"); + 
assert_eq!(s.lsm_comparison_cutoff, 1); + assert_eq!(s.column[1].name, "env"); // "&" stripped +} + +#[test] +fn test_lsm_cutoff_after_second_column() { + let s = must_parse("service__s|env__s|&source__s|timestamp/V2"); + assert_eq!(s.lsm_comparison_cutoff, 2); + assert_eq!(s.column[2].name, "source"); +} + +#[test] +fn test_lsm_cutoff_before_timestamp() { + let s = must_parse("service__s|env__s|source__s|×tamp/V2"); + assert_eq!(s.lsm_comparison_cutoff, 3); + assert_eq!(s.column[3].name, "timestamp"); +} + +#[test] +fn test_lsm_cutoff_named_schema() { + let s = must_parse("testSchema=service__s|&env__s|timestamp/V2"); + assert_eq!(s.name, "testSchema"); + assert_eq!(s.lsm_comparison_cutoff, 1); +} + +#[test] +fn test_lsm_cutoff_with_version() { + let s = must_parse("service__s|&env__s|timestamp/V2"); + assert_eq!(s.lsm_comparison_cutoff, 1); + assert_eq!(s.sort_version, 2); +} + +#[test] +fn test_lsm_cutoff_with_explicit_type_direction() { + let s = must_parse("service__s:dense-string:+|&env__s:dense-string:+|timestamp/V2"); + assert_eq!(s.lsm_comparison_cutoff, 1); + assert_eq!( + s.column[0].sort_direction, + SortColumnDirection::SortDirectionAscending as i32 + ); +} + +#[test] +fn test_no_lsm_cutoff() { + let s = must_parse("service__s|env__s|timestamp/V2"); + assert_eq!(s.lsm_comparison_cutoff, 0); +} + +// --------------------------------------------------------------------------- +// Port of Go TestSchemaToStringWithLSMCutoff +// --------------------------------------------------------------------------- + +#[test] +fn test_schema_to_string_preserves_cutoff_marker_short() { + let s = must_parse("service__s|&env__s|timestamp/V2"); + // String columns with default type serialize without the __s suffix. 
+ assert_eq!(schema_to_string_short(&s), "service|&env|timestamp/V2"); +} + +#[test] +fn test_schema_to_string_preserves_cutoff_marker_full() { + let s = must_parse("service__s|&env__s|timestamp/V2"); + // String columns with default type serialize without the __s suffix in full form too. + let expected = "service:dense-string:+|&env:dense-string:+|timestamp:dense-int64:-/V2"; + assert_eq!(schema_to_string(&s), expected); +} + +#[test] +fn test_schema_to_string_named_with_cutoff() { + let s = must_parse("testSchema=service__s|&env__s|timestamp/V2"); + // String columns with default type serialize without the __s suffix. + assert_eq!( + schema_to_string_short(&s), + "testSchema=service|&env|timestamp/V2" + ); +} + +#[test] +fn test_schema_to_string_no_cutoff() { + let s = must_parse("service__s|env__s|timestamp/V2"); + // String columns with default type serialize without the __s suffix. + assert_eq!(schema_to_string_short(&s), "service|env|timestamp/V2"); +} + +// --------------------------------------------------------------------------- +// Port of Go TestLSMCutoffRoundTrip +// --------------------------------------------------------------------------- + +#[test] +fn test_lsm_cutoff_round_trip() { + // Pairs of (input, expected_short_output). + // String columns with default type serialize without the __s suffix, so inputs + // using __s produce shorter output that is semantically equivalent. 
+ let test_cases = [ + ( + "service__s|&env__s|timestamp/V2", + "service|&env|timestamp/V2", + ), + ( + "service__s|env__s|&source__s|timestamp/V2", + "service|env|&source|timestamp/V2", + ), + ( + "service__s|env__s|source__s|×tamp/V2", + "service|env|source|×tamp/V2", + ), + ( + "testSchema=service__s|&env__s|timestamp/V2", + "testSchema=service|&env|timestamp/V2", + ), + ("service__s|env__s|timestamp/V2", "service|env|timestamp/V2"), + ]; + for (input, expected_short) in test_cases { + let parsed = must_parse(input); + let result = schema_to_string_short(&parsed); + assert_eq!(result, expected_short, "round-trip failed for '{}'", input); + + // Verify the output parses back to the same proto (semantic round-trip). + let reparsed = must_parse(&result); + assert_eq!( + parsed.column.len(), + reparsed.column.len(), + "column count mismatch after round-trip for '{}'", + input + ); + for (a, b) in parsed.column.iter().zip(reparsed.column.iter()) { + assert_eq!(a.name, b.name); + assert_eq!(a.column_type, b.column_type); + assert_eq!(a.sort_direction, b.sort_direction); + } + assert_eq!( + parsed.lsm_comparison_cutoff, reparsed.lsm_comparison_cutoff, + "LSM cutoff mismatch for '{}'", + input + ); + } +} + +// --------------------------------------------------------------------------- +// Direction prefix/suffix tests +// --------------------------------------------------------------------------- + +#[test] +fn test_direction_prefix_ascending() { + let s = must_parse("+service__s|timestamp/V2"); + assert_eq!(s.column[0].name, "service"); + assert_eq!( + s.column[0].sort_direction, + SortColumnDirection::SortDirectionAscending as i32 + ); +} + +#[test] +fn test_direction_prefix_descending() { + let s = must_parse("-service__s|timestamp/V2"); + assert_eq!(s.column[0].name, "service"); + assert_eq!( + s.column[0].sort_direction, + SortColumnDirection::SortDirectionDescending as i32 + ); +} + +#[test] +fn test_direction_suffix_ascending() { + let s = 
must_parse("service__s+|timestamp/V2"); + assert_eq!(s.column[0].name, "service"); + assert_eq!( + s.column[0].sort_direction, + SortColumnDirection::SortDirectionAscending as i32 + ); +} + +#[test] +fn test_direction_suffix_descending() { + let s = must_parse("service__s-|timestamp/V2"); + assert_eq!(s.column[0].name, "service"); + assert_eq!( + s.column[0].sort_direction, + SortColumnDirection::SortDirectionDescending as i32 + ); +} + +#[test] +fn test_direction_prefix_descending_on_timestamp() { + // Explicit descending prefix on timestamp matches the default and is accepted. + let s = must_parse("service__s|-timestamp/V2"); + assert_eq!(s.column[1].name, "timestamp"); + assert_eq!( + s.column[1].sort_direction, + SortColumnDirection::SortDirectionDescending as i32 + ); +} + +#[test] +fn test_direction_prefix_ascending_on_timestamp_rejected() { + // Ascending timestamp is rejected by validation (timestamp must be descending). + assert!( + parse_sort_fields("service__s|+timestamp/V2").is_err(), + "ascending timestamp must be rejected by validation" + ); +} + +#[test] +fn test_direction_suffix_on_timestamp() { + let s = must_parse("service__s|timestamp-/V2"); + assert_eq!(s.column[1].name, "timestamp"); + assert_eq!( + s.column[1].sort_direction, + SortColumnDirection::SortDirectionDescending as i32 + ); +} + +#[test] +fn test_direction_prefix_and_suffix_is_error() { + // +name+ : direction on both sides + assert!( + parse_sort_fields("+service__s+|timestamp/V2").is_err(), + "must reject direction on both prefix and suffix" + ); + // -name- : same direction both sides, still error + assert!( + parse_sort_fields("-service__s-|timestamp/V2").is_err(), + "must reject direction on both prefix and suffix (same direction)" + ); + // +name- : conflicting directions + assert!( + parse_sort_fields("+service__s-|timestamp/V2").is_err(), + "must reject conflicting direction prefix and suffix" + ); +} + +#[test] +fn test_direction_prefix_with_colon_direction_is_error() { + // 
+name:+ : prefix direction + colon direction + assert!( + parse_sort_fields("+service__s:+|timestamp/V2").is_err(), + "must reject direction in both prefix and colon form" + ); +} + +#[test] +fn test_direction_suffix_with_colon_direction_is_error() { + // name-:- : suffix direction + colon direction + assert!( + parse_sort_fields("service__s-:-|timestamp/V2").is_err(), + "must reject direction in both suffix and colon form" + ); +} + +#[test] +fn test_direction_prefix_multi_column() { + let s = must_parse("-metric_name__s|+host__s|timestamp/V2"); + assert_eq!( + s.column[0].sort_direction, + SortColumnDirection::SortDirectionDescending as i32 + ); + assert_eq!( + s.column[1].sort_direction, + SortColumnDirection::SortDirectionAscending as i32 + ); + assert_eq!( + s.column[2].sort_direction, + SortColumnDirection::SortDirectionDescending as i32 // default for timestamp + ); +} + +#[test] +fn test_direction_prefix_with_lsm_cutoff() { + // &+env__s : cutoff marker then direction prefix + let s = must_parse("service__s|&+env__s|timestamp/V2"); + assert_eq!(s.lsm_comparison_cutoff, 1); + assert_eq!(s.column[1].name, "env"); + assert_eq!( + s.column[1].sort_direction, + SortColumnDirection::SortDirectionAscending as i32 + ); +} + +#[test] +fn test_direction_suffix_with_lsm_cutoff() { + let s = must_parse("service__s|&env__s-|timestamp/V2"); + assert_eq!(s.lsm_comparison_cutoff, 1); + assert_eq!(s.column[1].name, "env"); + assert_eq!( + s.column[1].sort_direction, + SortColumnDirection::SortDirectionDescending as i32 + ); +} + +// --------------------------------------------------------------------------- +// Bare name parsing (no type suffix — uses defaults) +// --------------------------------------------------------------------------- + +#[test] +fn test_bare_names_default_to_string() { + let s = must_parse("service|env|host|timestamp/V2"); + assert_eq!(s.column[0].name, "service"); + assert_eq!(s.column[0].column_type, ColumnTypeId::String as u64); + 
assert_eq!(s.column[1].name, "env"); + assert_eq!(s.column[1].column_type, ColumnTypeId::String as u64); + assert_eq!(s.column[2].name, "host"); + assert_eq!(s.column[2].column_type, ColumnTypeId::String as u64); +} + +#[test] +fn test_bare_timestamp_defaults_to_int64() { + let s = must_parse("service|timestamp/V2"); + assert_eq!(s.column[1].name, "timestamp"); + assert_eq!(s.column[1].column_type, ColumnTypeId::Int64 as u64); +} + +#[test] +fn test_bare_timeseries_id_defaults_to_int64() { + let s = must_parse("service|timeseries_id|timestamp/V2"); + assert_eq!(s.column[1].name, "timeseries_id"); + assert_eq!(s.column[1].column_type, ColumnTypeId::Int64 as u64); +} + +#[test] +fn test_bare_tiebreaker_defaults_to_int64() { + let s = must_parse("service|timestamp|tiebreaker/V2"); + assert_eq!(s.column[2].name, "tiebreaker"); + assert_eq!(s.column[2].column_type, ColumnTypeId::Int64 as u64); +} + +#[test] +fn test_bare_metric_value_defaults_to_float64() { + let s = must_parse("metric_value|timestamp/V2"); + assert_eq!(s.column[0].name, "metric_value"); + assert_eq!(s.column[0].column_type, ColumnTypeId::Float64 as u64); +} + +#[test] +fn test_bare_and_suffixed_produce_same_proto() { + let bare = must_parse("metric_name|host|timeseries_id|timestamp/V2"); + let suffixed = must_parse("metric_name__s|host__s|timeseries_id__i|timestamp/V2"); + assert_eq!(bare.column.len(), suffixed.column.len()); + for (a, b) in bare.column.iter().zip(suffixed.column.iter()) { + assert_eq!(a.name, b.name, "names should match"); + assert_eq!( + a.column_type, b.column_type, + "types should match for {}", + a.name + ); + assert_eq!( + a.sort_direction, b.sort_direction, + "directions should match for {}", + a.name + ); + } +} + +#[test] +fn test_suffix_overrides_default() { + // metric_value defaults to Float64, but __i suffix forces Int64 + let s = must_parse("metric_value__i|timestamp/V2"); + assert_eq!(s.column[0].name, "metric_value"); + assert_eq!(s.column[0].column_type, 
ColumnTypeId::Int64 as u64); +} + +#[test] +fn test_display_omits_default_suffix() { + let s = must_parse("metric_name|host|timestamp/V2"); + let short = schema_to_string_short(&s); + // All columns use default types, so no suffixes in output. + assert_eq!(short, "metric_name|host|timestamp/V2"); +} + +#[test] +fn test_display_includes_non_default_suffix() { + // Force host to Int64 (non-default for bare "host" which defaults to String) + let s = must_parse("metric_name|host__i|timestamp/V2"); + let short = schema_to_string_short(&s); + assert_eq!(short, "metric_name|host__i|timestamp/V2"); +} + +// --------------------------------------------------------------------------- +// timeseries_id handling +// --------------------------------------------------------------------------- + +#[test] +fn test_timeseries_id_as_int64() { + let s = must_parse("metric_name__s|host__s|timeseries_id__i|timestamp/V2"); + assert_eq!(s.column.len(), 4); + + // timeseries_id__i should be TypeIDInt64, ascending. 
+ let ts_id_col = &s.column[2]; + assert_eq!(ts_id_col.name, "timeseries_id"); + assert_eq!(ts_id_col.column_type, ColumnTypeId::Int64 as u64); + assert_eq!( + ts_id_col.sort_direction, + SortColumnDirection::SortDirectionAscending as i32 + ); +} + +// --------------------------------------------------------------------------- +// Float and sketch column types +// --------------------------------------------------------------------------- + +#[test] +fn test_float_column() { + let s = must_parse("value__nf|timestamp/V2"); + assert_eq!(s.column[0].column_type, ColumnTypeId::Float64 as u64); + assert_eq!( + s.column[0].sort_direction, + SortColumnDirection::SortDirectionAscending as i32 + ); +} + +#[test] +fn test_sketch_column() { + let s = must_parse("latency__sk|timestamp/V2"); + assert_eq!(s.column[0].column_type, ColumnTypeId::Sketch as u64); +} + +// --------------------------------------------------------------------------- +// SchemasToString / SchemasToStringShort multi-schema (convenience) +// --------------------------------------------------------------------------- + +#[test] +fn test_schemas_to_string() { + let schema1 = must_parse("test=key1__s|timestamp/V2"); + let schema2 = must_parse("key2__i|timestamp/V2"); + + let schemas = [schema1, schema2]; + let strings: Vec<String> = schemas.iter().map(schema_to_string).collect(); + let actual = strings.join(","); + + // key1 has type String == default, so no suffix. key2 has type Int64 != default, keeps __i. + assert_eq!( + actual, + "test=key1:dense-string:+|timestamp:dense-int64:-/V2,key2__i:dense-int64:+|timestamp:\ + dense-int64:-/V2" + ); +} + +#[test] +fn test_schemas_to_string_short() { + let schema1 = must_parse("test=key1__s|timestamp/V2"); + let schema2 = must_parse("key2__i|timestamp/V2"); + + let schemas = [schema1, schema2]; + let strings: Vec<String> = schemas.iter().map(schema_to_string_short).collect(); + let actual = strings.join(","); + + // key1 has type String == default, so no suffix.
key2 has type Int64 != default, keeps __i. + assert_eq!(actual, "test=key1|timestamp/V2,key2__i|timestamp/V2"); +} + +// --------------------------------------------------------------------------- +// ColumnTypeId TryFrom (proto deserialization path) +// --------------------------------------------------------------------------- + +#[test] +fn test_column_type_try_from_u64_valid() { + assert_eq!(ColumnTypeId::try_from(2u64).unwrap(), ColumnTypeId::Int64); + assert_eq!( + ColumnTypeId::try_from(10u64).unwrap(), + ColumnTypeId::Float64 + ); + assert_eq!(ColumnTypeId::try_from(14u64).unwrap(), ColumnTypeId::String); + assert_eq!(ColumnTypeId::try_from(17u64).unwrap(), ColumnTypeId::Sketch); + assert_eq!( + ColumnTypeId::try_from(20u64).unwrap(), + ColumnTypeId::CpcSketch + ); + assert_eq!( + ColumnTypeId::try_from(22u64).unwrap(), + ColumnTypeId::ItemSketch + ); +} + +#[test] +fn test_column_type_try_from_u64_invalid() { + assert!(ColumnTypeId::try_from(0u64).is_err()); + assert!(ColumnTypeId::try_from(1u64).is_err()); + assert!(ColumnTypeId::try_from(99u64).is_err()); +} diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/validation.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/validation.rs new file mode 100644 index 00000000000..3a33db82f6f --- /dev/null +++ b/quickwit/quickwit-parquet-engine/src/sort_fields/validation.rs @@ -0,0 +1,114 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! 
Sort schema validation -- direct port of Go `ValidateSchema`. + +use std::collections::HashSet; + +use quickwit_proto::sortschema::{SortColumnDirection, SortSchema}; + +use super::SortFieldsError; +use super::column_type::{ColumnTypeId, TIEBREAKER, default_is_descending}; + +/// Name used for the special skip-builder schema that does not require timestamp. +const DEFAULT_SKIP_BUILDER_SCHEMA_NAME: &str = "defaultSkipBuilderSchema"; + +/// Check if a bare column name is a timestamp column (defaults to descending). +fn is_timestamp_column(name: &str) -> bool { + default_is_descending(name) +} + +/// Validate a sort schema, enforcing all rules from Go `ValidateSchema`. +/// +/// Rules: +/// - Schema must have at least one column. +/// - No duplicate column names. +/// - Sort direction must not be Unknown. +/// - `timestamp` must be present (unless schema name is `defaultSkipBuilderSchema`). +/// - `timestamp` must be Int64 and descending (unless it's a msgid schema). +/// - `timestamp` must come before `tiebreaker`. +/// - No non-tiebreaker columns may appear after `timestamp`. 
+pub fn validate_schema(schema: &SortSchema) -> Result<(), SortFieldsError> { + if schema.column.is_empty() { + return Err(SortFieldsError::ValidationError("empty schema".to_string())); + } + + let mut seen: HashSet<&str> = HashSet::new(); + let is_msgid = schema.version == 2 || schema.name == "defaultMsgIDsSchema"; + + for col in &schema.column { + let name = col.name.as_str(); + + if seen.contains(name) { + return Err(SortFieldsError::ValidationError(format!( + "column {} is duplicated in schema", + name + ))); + } + seen.insert(name); + + if col.sort_direction == SortColumnDirection::SortDirectionUnknown as i32 { + return Err(SortFieldsError::ValidationError(format!( + "column {} does not specify a sort direction in schema", + name + ))); + } + + let has_seen_timestamp = seen.iter().any(|s| is_timestamp_column(s)); + + if is_timestamp_column(name) { + if seen.contains(TIEBREAKER) { + return Err(SortFieldsError::ValidationError(format!( + "{} column must come before {} in schema", + name, TIEBREAKER + ))); + } + if col.sort_direction != SortColumnDirection::SortDirectionDescending as i32 + && !is_msgid + { + return Err(SortFieldsError::ValidationError(format!( + "{} column must sorted in descending order in schema", + name + ))); + } + if col.column_type != ColumnTypeId::Int64 as u64 { + return Err(SortFieldsError::ValidationError(format!( + "{} column must be of type int64 in schema", + name + ))); + } + } else if name == TIEBREAKER { + if !has_seen_timestamp { + return Err(SortFieldsError::ValidationError(format!( + "timestamp column must come before {} in schema", + TIEBREAKER + ))); + } + } else if has_seen_timestamp && !is_msgid { + return Err(SortFieldsError::ValidationError(format!( + "column {} is after timestamp but timestamp must be the last schema column", + name + ))); + } + } + + let has_timestamp = schema.column.iter().any(|c| is_timestamp_column(&c.name)); + if !has_timestamp && schema.name != DEFAULT_SKIP_BUILDER_SCHEMA_NAME { + return 
Err(SortFieldsError::ValidationError( + "timestamp column is required, but is missing from schema".to_string(), + )); + } + + Ok(()) +} diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/window.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/window.rs new file mode 100644 index 00000000000..b1ad896e31f --- /dev/null +++ b/quickwit/quickwit-parquet-engine/src/sort_fields/window.rs @@ -0,0 +1,240 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Canonical time-window functions for metrics compaction. +//! +//! `window_start` is the foundational time-partitioning function used by +//! ingestion (Phase 32), merge policy (Phase 33), and compaction planning +//! (Phase 35). Correctness at boundary conditions -- especially negative +//! timestamps and zero-crossing -- is critical because an off-by-one error +//! silently misroutes data to wrong windows. +//! +//! The implementation uses `rem_euclid` instead of the `%` operator to handle +//! negative timestamps correctly. Standard `%` truncates toward zero, which +//! gives wrong results for negative inputs: +//! - `-1 % 900 = -1` (wrong: would compute window_start as 0) +//! - `(-1i64).rem_euclid(900) = 899` (correct: window_start = -1 - 899 = -900) + +use chrono::{DateTime, Utc}; + +use super::SortFieldsError; + +/// Validate that a window duration evenly divides one hour (3600 seconds). 
+/// +/// This is ADR-003 invariant TW-2: all windows within an hour must have +/// identical boundaries regardless of when counting starts. 3600 has 45 +/// positive divisors (1, 2, 3, ..., 1800, 3600). Any of these are accepted. +/// In practice, metrics systems use durations >= 60s: 60, 120, 180, 240, +/// 300, 360, 600, 720, 900, 1200, 1800, 3600. +/// +/// A duration of 0 is rejected as nonsensical. Durations that do not evenly +/// divide 3600 are rejected because they would produce inconsistent window +/// boundaries across different starting points within an hour. +pub fn validate_window_duration(duration_secs: u32) -> Result<(), SortFieldsError> { + if duration_secs == 0 { + return Err(SortFieldsError::InvalidWindowDuration { + duration_secs, + reason: "must be positive", + }); + } + if 3600 % duration_secs != 0 { + return Err(SortFieldsError::InvalidWindowDuration { + duration_secs, + reason: "must evenly divide 3600 (one hour)", + }); + } + Ok(()) +} + +/// Compute the start of the time window containing the given timestamp. +/// +/// Uses `rem_euclid` for correct handling of negative timestamps (before Unix +/// epoch). Standard `%` truncates toward zero: `-1 % 900 = -1` (wrong). +/// `rem_euclid` always returns non-negative: `(-1i64).rem_euclid(900) = 899`. +/// So `window_start(-1, 900) = -1 - 899 = -900` (correct: timestamp -1 is in +/// window [-900, 0)). +/// +/// # Invariants (verified by proptest) +/// - Window start is aligned: `window_start % duration == 0` +/// - Timestamp is contained: `window_start <= timestamp < window_start + duration` +/// - Deterministic: same inputs always produce same output +/// +/// # Errors +/// Returns `SortFieldsError::WindowStartOutOfRange` if the computed start +/// timestamp cannot be represented as a `DateTime`. 
+pub fn window_start( + timestamp_secs: i64, + duration_secs: i64, +) -> Result<DateTime<Utc>, SortFieldsError> { + debug_assert!(duration_secs > 0, "window duration must be positive"); + // TW-2 (ADR-003): window duration must evenly divide one hour. + // This ensures window boundaries align across hours and days. + debug_assert!( + 3600 % duration_secs == 0, + "TW-2 violated: duration_secs={} does not divide 3600", + duration_secs + ); + let remainder = timestamp_secs.rem_euclid(duration_secs); + let start_secs = timestamp_secs - remainder; + DateTime::from_timestamp(start_secs, 0).ok_or(SortFieldsError::WindowStartOutOfRange { + timestamp_secs: start_secs, + }) +} + +#[cfg(test)] +mod tests { + use proptest::prelude::*; + + use super::*; + + // ----------------------------------------------------------------------- + // Proptest properties + // ----------------------------------------------------------------------- + + proptest! { + #[test] + fn window_start_is_aligned( + ts in -1_000_000_000i64..2_000_000_000i64, + dur in prop::sample::select(vec![60i64, 120, 180, 240, 300, 360, + 600, 720, 900, 1200, 1800, 3600]) + ) { + let ws = window_start(ts, dur).unwrap(); + let ws_secs = ws.timestamp(); + // window_start is aligned to duration + prop_assert_eq!(ws_secs.rem_euclid(dur), 0); + // timestamp is within [window_start, window_start + duration) + prop_assert!(ws_secs <= ts); + prop_assert!(ts < ws_secs + dur); + } + + #[test] + fn window_start_is_deterministic( + ts in -1_000_000_000i64..2_000_000_000i64, + dur in prop::sample::select(vec![60i64, 300, 900, 3600]) + ) { + let ws1 = window_start(ts, dur).unwrap(); + let ws2 = window_start(ts, dur).unwrap(); + prop_assert_eq!(ws1, ws2); + } + + #[test] + fn adjacent_windows_do_not_overlap( + ts in 0i64..1_000_000_000i64, + dur in prop::sample::select(vec![60i64, 300, 900, 3600]) + ) { + let ws = window_start(ts, dur).unwrap(); + let next_ws = window_start(ws.timestamp() + dur, dur).unwrap(); + // Next window starts exactly at
current window end + prop_assert_eq!(next_ws.timestamp(), ws.timestamp() + dur); + } + } + + // ----------------------------------------------------------------------- + // Unit tests: edge cases + // ----------------------------------------------------------------------- + + #[test] + fn test_negative_timestamp_crossing() { + let ws = window_start(-1, 900).unwrap(); + assert_eq!(ws.timestamp(), -900); + } + + #[test] + fn test_zero_timestamp() { + let ws = window_start(0, 900).unwrap(); + assert_eq!(ws.timestamp(), 0); + } + + #[test] + fn test_exactly_on_boundary() { + let ws = window_start(900, 900).unwrap(); + assert_eq!(ws.timestamp(), 900); + } + + #[test] + fn test_one_before_boundary() { + let ws = window_start(899, 900).unwrap(); + assert_eq!(ws.timestamp(), 0); + } + + #[test] + fn test_large_negative_timestamp() { + let ws = window_start(-3601, 3600).unwrap(); + assert_eq!(ws.timestamp(), -7200); + } + + #[test] + fn test_60s_window() { + let ws = window_start(1_700_000_042, 60).unwrap(); + assert_eq!(ws.timestamp(), 1_700_000_040); + } + + // ----------------------------------------------------------------------- + // Validation tests + // ----------------------------------------------------------------------- + + #[test] + fn test_valid_window_durations() { + let valid = [60, 120, 180, 240, 300, 360, 600, 720, 900, 1200, 1800, 3600]; + for dur in valid { + assert!( + validate_window_duration(dur).is_ok(), + "duration {} should be valid", + dur + ); + } + } + + #[test] + fn test_invalid_window_durations() { + // None of these evenly divide 3600. + let invalid = [0, 7, 11, 13, 17, 700, 1000, 1500, 2000, 2400, 7200]; + for dur in invalid { + assert!( + validate_window_duration(dur).is_err(), + "duration {} should be invalid", + dur + ); + } + } + + #[test] + fn test_small_valid_divisors_also_accepted() { + // The function accepts all positive divisors of 3600, not just >= 60. 
+ let small_valid = [ + 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 15, 16, 18, 20, 24, 25, 30, 36, 40, 45, 48, 50, + ]; + for dur in small_valid { + assert!( + validate_window_duration(dur).is_ok(), + "duration {} should be valid (divides 3600)", + dur + ); + } + } + + #[test] + fn test_zero_duration_error_message() { + let err = validate_window_duration(0).unwrap_err(); + let msg = err.to_string(); + assert!(msg.contains("must be positive"), "got: {msg}"); + } + + #[test] + fn test_non_divisor_error_message() { + let err = validate_window_duration(7).unwrap_err(); + let msg = err.to_string(); + assert!(msg.contains("must evenly divide 3600"), "got: {msg}"); + } +} diff --git a/quickwit/quickwit-parquet-engine/src/storage/config.rs b/quickwit/quickwit-parquet-engine/src/storage/config.rs index f87845d2ba9..2eb63d73510 100644 --- a/quickwit/quickwit-parquet-engine/src/storage/config.rs +++ b/quickwit/quickwit-parquet-engine/src/storage/config.rs @@ -14,12 +14,13 @@ //! Parquet writer configuration for metrics storage. +use arrow::datatypes::{DataType, Schema as ArrowSchema}; use parquet::basic::Compression as ParquetCompression; use parquet::file::metadata::SortingColumn; use parquet::file::properties::{EnabledStatistics, WriterProperties, WriterPropertiesBuilder}; use parquet::schema::types::ColumnPath; -use crate::schema::ParquetField; +use crate::schema::SORT_ORDER; /// Default row group size: 128K rows for efficient columnar scans. const DEFAULT_ROW_GROUP_SIZE: usize = 128 * 1024; @@ -117,8 +118,9 @@ impl ParquetWriterConfig { self } - /// Convert to Parquet WriterProperties. - pub fn to_writer_properties(&self) -> WriterProperties { + /// Convert to Parquet WriterProperties using the given Arrow schema to configure + /// per-column settings like dictionary encoding and bloom filters. 
+ pub fn to_writer_properties(&self, schema: &ArrowSchema) -> WriterProperties { let mut builder = WriterProperties::builder() .set_max_row_group_size(self.row_group_size) .set_data_page_size_limit(self.data_page_size) @@ -126,7 +128,7 @@ impl ParquetWriterConfig { // Enable column index for efficient pruning on sorted data (64 bytes default) .set_column_index_truncate_length(Some(64)) // Set sorting columns metadata for readers to use during pruning - .set_sorting_columns(Some(Self::sorting_columns())) + .set_sorting_columns(Some(Self::sorting_columns(schema))) // Enable row group level statistics (min/max/null_count) for query pruning // This allows DataFusion to skip row groups based on timestamp ranges .set_statistics_enabled(EnabledStatistics::Chunk); @@ -142,77 +144,57 @@ impl ParquetWriterConfig { Compression::Uncompressed => builder.set_compression(ParquetCompression::UNCOMPRESSED), }; - // Apply RLE_DICTIONARY encoding and bloom filters for dictionary columns - builder = Self::configure_dictionary_columns(builder); + // Apply dictionary encoding and bloom filters based on schema column types + builder = Self::configure_columns(builder, schema); builder.build() } - /// Configure dictionary encoding and bloom filters for high-cardinality columns. + /// Configure dictionary encoding and bloom filters based on the Arrow schema. /// - /// Dictionary-encoded columns benefit from: - /// - Dictionary encoding: Enabled by default, uses RLE for dictionary indices - /// - Bloom filters: Enable efficient equality filtering without scanning - /// - /// Note: Dictionary encoding is ON by default in Parquet. When enabled, the dictionary - /// indices are automatically encoded using RLE (run-length encoding), which efficiently - /// compresses runs of repeated values. This is ideal for sorted data where consecutive - /// rows often share the same dictionary index. 
- fn configure_dictionary_columns( + /// - Dictionary encoding is enabled on all Dictionary(Int32, Utf8) columns. + /// - Bloom filters are enabled on metric_name and sort order tag columns. + fn configure_columns( mut builder: WriterPropertiesBuilder, + schema: &ArrowSchema, ) -> WriterPropertiesBuilder { - // Dictionary-encoded columns - ensure dictionary encoding is explicitly enabled - // (default is true, but being explicit documents intent) - let dictionary_columns = [ - ParquetField::MetricName, - ParquetField::TagService, - ParquetField::TagEnv, - ParquetField::TagDatacenter, - ParquetField::TagRegion, - ParquetField::TagHost, - ParquetField::ServiceName, - ]; - - // Columns that benefit from bloom filters (used in WHERE clauses) - // Note: We enable bloom filters on filtering columns, not timestamp_secs or value - let bloom_filter_columns = [ - (ParquetField::MetricName, BLOOM_FILTER_NDV_METRIC_NAME), - (ParquetField::TagService, BLOOM_FILTER_NDV_TAGS), - (ParquetField::TagEnv, BLOOM_FILTER_NDV_TAGS), - (ParquetField::TagDatacenter, BLOOM_FILTER_NDV_TAGS), - (ParquetField::TagHost, BLOOM_FILTER_NDV_TAGS), - (ParquetField::ServiceName, BLOOM_FILTER_NDV_TAGS), - ]; - - // Ensure dictionary encoding is enabled on dictionary columns - // (dictionary encoding uses RLE for indices automatically) - for field in dictionary_columns { + for field in schema.fields() { let col_path = ColumnPath::new(vec![field.name().to_string()]); - builder = builder.set_column_dictionary_enabled(col_path, true); - } - // Enable bloom filters on filtering columns - for (field, ndv) in bloom_filter_columns { - let col_path = ColumnPath::new(vec![field.name().to_string()]); - builder = builder - .set_column_bloom_filter_enabled(col_path.clone(), true) - .set_column_bloom_filter_fpp(col_path.clone(), BLOOM_FILTER_FPP) - .set_column_bloom_filter_ndv(col_path, ndv); - } + // Enable dictionary encoding on all Dictionary(_, _) columns + if matches!(field.data_type(), DataType::Dictionary(_, 
_)) { + builder = builder.set_column_dictionary_enabled(col_path.clone(), true); + } + // Enable bloom filters on dictionary-typed metric_name and sort order tag columns. + // Exclude non-dictionary columns, like timestamp_secs. + let is_bloom_column = matches!(field.data_type(), DataType::Dictionary(_, _)) + && (field.name() == "metric_name" || SORT_ORDER.contains(&field.name().as_str())); + if is_bloom_column { + let ndv = if field.name() == "metric_name" { + BLOOM_FILTER_NDV_METRIC_NAME + } else { + BLOOM_FILTER_NDV_TAGS + }; + builder = builder + .set_column_bloom_filter_enabled(col_path.clone(), true) + .set_column_bloom_filter_fpp(col_path.clone(), BLOOM_FILTER_FPP) + .set_column_bloom_filter_ndv(col_path, ndv); + } + } builder } - /// Get the sorting columns for parquet metadata. - /// Order: metric_name, tag_service, tag_env, tag_datacenter, tag_region, tag_host, - /// timestamp_secs. - fn sorting_columns() -> Vec<SortingColumn> { - ParquetField::sort_order() + /// Get the sorting columns for parquet metadata, computed from the schema + /// and SORT_ORDER. Only columns present in the schema are included. + fn sorting_columns(schema: &ArrowSchema) -> Vec<SortingColumn> { + SORT_ORDER .iter() - .map(|field| SortingColumn { - column_idx: field.column_index() as i32, + .filter_map(|name| schema.index_of(name).ok()) + .map(|idx| SortingColumn { + column_idx: idx as i32, descending: false, - nulls_first: true, + nulls_first: false, }) .collect() } @@ -220,8 +202,39 @@ impl ParquetWriterConfig { #[cfg(test)] mod tests { + use arrow::datatypes::Field; + use super::*; + /// Create a test schema with required fields + some tag columns.
+ fn create_test_schema() -> ArrowSchema { + ArrowSchema::new(vec![ + Field::new( + "metric_name", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + ), + Field::new("metric_type", DataType::UInt8, false), + Field::new("timestamp_secs", DataType::UInt64, false), + Field::new("value", DataType::Float64, false), + Field::new( + "service", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + true, + ), + Field::new( + "env", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + true, + ), + Field::new( + "host", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + true, + ), + ]) + } + #[test] fn test_default_config() { let config = ParquetWriterConfig::default(); @@ -245,93 +258,89 @@ mod tests { #[test] fn test_to_writer_properties_zstd() { let config = ParquetWriterConfig::default(); - let props = config.to_writer_properties(); - // WriterProperties doesn't expose compression directly, but we can verify it builds + let schema = create_test_schema(); + let props = config.to_writer_properties(&schema); assert!(props.max_row_group_size() == 128 * 1024); } #[test] fn test_to_writer_properties_snappy() { let config = ParquetWriterConfig::new().with_compression(Compression::Snappy); - let props = config.to_writer_properties(); + let schema = create_test_schema(); + let props = config.to_writer_properties(&schema); assert!(props.max_row_group_size() == 128 * 1024); } #[test] fn test_to_writer_properties_uncompressed() { let config = ParquetWriterConfig::new().with_compression(Compression::Uncompressed); - let props = config.to_writer_properties(); + let schema = create_test_schema(); + let props = config.to_writer_properties(&schema); assert!(props.max_row_group_size() == 128 * 1024); } #[test] fn test_bloom_filter_configuration() { let config = ParquetWriterConfig::default(); - let props = config.to_writer_properties(); + let schema = create_test_schema(); + 
let props = config.to_writer_properties(&schema); // Verify bloom filter is enabled on metric_name column let metric_name_path = ColumnPath::new(vec!["metric_name".to_string()]); let bloom_props = props.bloom_filter_properties(&metric_name_path); assert!( bloom_props.is_some(), - "Bloom filter should be enabled for metric_name" + "bloom filter should be enabled for metric_name" ); let bloom_props = bloom_props.unwrap(); assert!( (bloom_props.fpp - BLOOM_FILTER_FPP).abs() < 0.001, - "Bloom filter FPP should be {}", + "bloom filter FPP should be {}", BLOOM_FILTER_FPP ); assert_eq!( bloom_props.ndv, BLOOM_FILTER_NDV_METRIC_NAME, - "Bloom filter NDV for metric_name should be {}", + "bloom filter NDV for metric_name should be {}", BLOOM_FILTER_NDV_METRIC_NAME ); - // Verify bloom filter is enabled on tag columns - let tag_service_path = ColumnPath::new(vec!["tag_service".to_string()]); - let bloom_props = props.bloom_filter_properties(&tag_service_path); + // Verify bloom filter is enabled on service tag column (in SORT_ORDER) + let service_path = ColumnPath::new(vec!["service".to_string()]); + let bloom_props = props.bloom_filter_properties(&service_path); assert!( bloom_props.is_some(), - "Bloom filter should be enabled for tag_service" + "bloom filter should be enabled for service" ); let bloom_props = bloom_props.unwrap(); assert_eq!( bloom_props.ndv, BLOOM_FILTER_NDV_TAGS, - "Bloom filter NDV for tag columns should be {}", + "bloom filter NDV for tag columns should be {}", BLOOM_FILTER_NDV_TAGS ); - // Verify bloom filter is NOT enabled on timestamp_secs (not a filtering column) - let timestamp_path = ColumnPath::new(vec!["timestamp_secs".to_string()]); - let bloom_props = props.bloom_filter_properties(&timestamp_path); - assert!( - bloom_props.is_none(), - "Bloom filter should NOT be enabled for timestamp_secs" - ); - // Verify bloom filter is NOT enabled on value column let value_path = ColumnPath::new(vec!["value".to_string()]); let bloom_props = 
props.bloom_filter_properties(&value_path); assert!( bloom_props.is_none(), - "Bloom filter should NOT be enabled for value" + "bloom filter should NOT be enabled for value" ); } #[test] fn test_statistics_enabled() { let config = ParquetWriterConfig::default(); - let props = config.to_writer_properties(); + let schema = create_test_schema(); + let props = config.to_writer_properties(&schema); // Verify statistics are enabled at Chunk (row group) level let metric_name_path = ColumnPath::new(vec!["metric_name".to_string()]); assert_eq!( props.statistics_enabled(&metric_name_path), EnabledStatistics::Chunk, - "Statistics should be enabled at Chunk level" + "statistics should be enabled at Chunk level" ); // Verify for timestamp column as well (important for time range pruning) @@ -339,31 +348,24 @@ mod tests { assert_eq!( props.statistics_enabled(&timestamp_path), EnabledStatistics::Chunk, - "Statistics should be enabled at Chunk level for timestamp" + "statistics should be enabled at Chunk level for timestamp" ); } #[test] fn test_dictionary_encoding_enabled() { let config = ParquetWriterConfig::default(); - let props = config.to_writer_properties(); - - // Verify dictionary encoding is enabled for dictionary columns - let dictionary_columns = [ - "metric_name", - "tag_service", - "tag_env", - "tag_datacenter", - "tag_region", - "tag_host", - "service_name", - ]; + let schema = create_test_schema(); + let props = config.to_writer_properties(&schema); + + // Verify dictionary encoding is enabled for dictionary-typed columns + let dictionary_columns = ["metric_name", "service", "env", "host"]; for col_name in dictionary_columns { let col_path = ColumnPath::new(vec![col_name.to_string()]); assert!( props.dictionary_enabled(&col_path), - "Dictionary encoding should be enabled for {}", + "dictionary encoding should be enabled for {}", col_name ); } @@ -371,37 +373,31 @@ mod tests { #[test] fn test_sorting_columns_order() { - let sorting_cols = 
ParquetWriterConfig::sorting_columns(); + let schema = create_test_schema(); + let sorting_cols = ParquetWriterConfig::sorting_columns(&schema); - // Verify we have the expected number of sorting columns + // The test schema has metric_name (idx 0), timestamp_secs (idx 2), + // service (idx 4), env (idx 5), host (idx 6). + // SORT_ORDER is: metric_name, service, env, datacenter, region, host, timestamp_secs + // Only present columns are included, so: metric_name, service, env, host, timestamp_secs assert_eq!( sorting_cols.len(), - 7, - "Should have 7 sorting columns: metric_name, 5 tags, timestamp" + 5, + "should have 5 sorting columns from the test schema" ); - // Verify sort order matches expected: metric_name, tag_service, tag_env, - // tag_datacenter, tag_region, tag_host, timestamp_secs - let expected_order = [ - ParquetField::MetricName, - ParquetField::TagService, - ParquetField::TagEnv, - ParquetField::TagDatacenter, - ParquetField::TagRegion, - ParquetField::TagHost, - ParquetField::TimestampSecs, - ]; - - for (i, expected_field) in expected_order.iter().enumerate() { - assert_eq!( - sorting_cols[i].column_idx, - expected_field.column_index() as i32, - "Sorting column {} should be {}", - i, - expected_field.name() - ); - assert!(!sorting_cols[i].descending, "Sorting should be ascending"); - assert!(sorting_cols[i].nulls_first, "Nulls should be first"); + // Verify all are ascending with nulls first + for col in &sorting_cols { + assert!(!col.descending, "sorting should be ascending"); + assert!(!col.nulls_first, "nulls should be last"); } + + // Verify order matches SORT_ORDER filtered by schema presence: + // metric_name (idx 0), service (idx 4), env (idx 5), host (idx 6), timestamp_secs (idx 2) + assert_eq!(sorting_cols[0].column_idx, 0); // metric_name + assert_eq!(sorting_cols[1].column_idx, 4); // service + assert_eq!(sorting_cols[2].column_idx, 5); // env + assert_eq!(sorting_cols[3].column_idx, 6); // host + assert_eq!(sorting_cols[4].column_idx, 
2); // timestamp_secs } } diff --git a/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs b/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs index 8505eb0f75c..466e2c9cdcb 100644 --- a/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs +++ b/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs @@ -25,7 +25,6 @@ use tracing::{debug, info, instrument}; use super::config::ParquetWriterConfig; use super::writer::{ParquetWriteError, ParquetWriter}; -use crate::schema::{ParquetField, ParquetSchema}; use crate::split::{MetricsSplitMetadata, ParquetSplit, SplitId, TAG_SERVICE, TimeRange}; /// Writer that produces complete ParquetSplit with metadata from RecordBatch data. @@ -40,16 +39,11 @@ impl ParquetSplitWriter { /// Create a new ParquetSplitWriter. /// /// # Arguments - /// * `schema` - The metrics schema for validation /// * `config` - Parquet writer configuration /// * `base_path` - Directory where split files will be written - pub fn new( - schema: ParquetSchema, - config: ParquetWriterConfig, - base_path: impl Into, - ) -> Self { + pub fn new(config: ParquetWriterConfig, base_path: impl Into) -> Self { Self { - writer: ParquetWriter::new(schema, config), + writer: ParquetWriter::new(config), base_path: base_path.into(), } } @@ -89,7 +83,7 @@ impl ParquetSplitWriter { debug!( start_secs = time_range.start_secs, end_secs = time_range.end_secs, - "Extracted time range from batch" + "extracted time range from batch" ); // Extract distinct metric names from batch @@ -122,7 +116,7 @@ impl ParquetSplitWriter { split_id = %split_id, file_path = %file_path.display(), size_bytes, - "Split file written successfully" + "split file written successfully" ); Ok(ParquetSplit::new(metadata)) @@ -131,7 +125,11 @@ impl ParquetSplitWriter { /// Extracts the time range (min/max timestamp_secs) from a RecordBatch. 
fn extract_time_range(batch: &RecordBatch) -> Result<TimeRange, ParquetWriteError> { - let timestamp_col = batch.column(ParquetField::TimestampSecs.column_index()); + let timestamp_idx = batch + .schema() + .index_of("timestamp_secs") + .map_err(|_| ParquetWriteError::SchemaValidation("missing timestamp_secs column".into()))?; + let timestamp_col = batch.column(timestamp_idx); let timestamp_array = timestamp_col.as_primitive::<UInt64Type>(); let min_val = min(timestamp_array); @@ -151,7 +149,11 @@ fn extract_time_range(batch: &RecordBatch) -> Result<TimeRange, ParquetWriteError> /// Extracts distinct metric names from a RecordBatch. fn extract_metric_names(batch: &RecordBatch) -> Result<HashSet<String>, ParquetWriteError> { - let metric_col = batch.column(ParquetField::MetricName.column_index()); + let metric_idx = batch + .schema() + .index_of("metric_name") + .map_err(|_| ParquetWriteError::SchemaValidation("missing metric_name column".into()))?; + let metric_col = batch.column(metric_idx); let mut names = HashSet::new(); // The column is Dictionary(Int32, Utf8) @@ -180,7 +182,11 @@ fn extract_metric_names(batch: &RecordBatch) -> Result<HashSet<String>, ParquetW /// Extracts distinct service names from a RecordBatch. 
fn extract_service_names(batch: &RecordBatch) -> Result, ParquetWriteError> { - let service_col = batch.column(ParquetField::ServiceName.column_index()); + let service_idx = match batch.schema().index_of("service").ok() { + Some(idx) => idx, + None => return Ok(HashSet::new()), + }; + let service_col = batch.column(service_idx); let mut names = HashSet::new(); // The column is Dictionary(Int32, Utf8) @@ -211,84 +217,59 @@ fn extract_service_names(batch: &RecordBatch) -> Result, Parquet mod tests { use std::sync::Arc; - use arrow::array::{ - ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array, UInt64Array, - }; - use arrow::datatypes::Int32Type; - use parquet::variant::{VariantArrayBuilder, VariantBuilderExt}; + use arrow::array::{ArrayRef, Float64Array, UInt8Array, UInt64Array}; + use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; use super::*; - - #[test] - fn test_column_indices_match_schema() { - let schema = ParquetSchema::new(); - for field in ParquetField::all() { - let expected: usize = - schema - .arrow_schema() - .index_of(field.name()) - .unwrap_or_else(|_| { - panic!("field {:?} should exist in arrow schema", field.name()) - }); - assert_eq!( - field.column_index(), - expected, - "column_index() for {:?} does not match arrow schema position", - field.name() - ); + use crate::test_helpers::{create_dict_array, create_nullable_dict_array}; + + /// Create a test batch with required fields, optional service column, and specified tag + /// columns. 
+ fn create_test_batch_with_options( + num_rows: usize, + metric_names: &[&str], + timestamps: &[u64], + service_names: Option<&[&str]>, + tags: &[&str], + ) -> RecordBatch { + let dict_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); + + let mut fields = vec![ + Field::new("metric_name", dict_type.clone(), false), + Field::new("metric_type", DataType::UInt8, false), + Field::new("timestamp_secs", DataType::UInt64, false), + Field::new("value", DataType::Float64, false), + ]; + if service_names.is_some() { + fields.push(Field::new("service", dict_type.clone(), true)); } - } + for tag in tags { + fields.push(Field::new(*tag, dict_type.clone(), true)); + } + let schema = Arc::new(ArrowSchema::new(fields)); - /// Create a VARIANT array for testing with specified number of rows. - fn create_variant_array(num_rows: usize, fields: Option<&[(&str, &str)]>) -> ArrayRef { - let mut builder = VariantArrayBuilder::new(num_rows); - for _ in 0..num_rows { - match fields { - Some(kv_pairs) => { - let mut obj = builder.new_object(); - for (key, value) in kv_pairs { - obj = obj.with_field(key, *value); - } - obj.finish(); - } - None => { - builder.append_null(); - } - } + let metric_name: ArrayRef = create_dict_array(metric_names); + let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows])); + let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps.to_vec())); + let values: Vec = (0..num_rows).map(|i| 42.0 + i as f64).collect(); + let value: ArrayRef = Arc::new(Float64Array::from(values)); + + let mut columns: Vec = vec![metric_name, metric_type, timestamp_secs, value]; + + if let Some(svc_names) = service_names { + columns.push(create_dict_array(svc_names)); } - ArrayRef::from(builder.build()) - } - /// Create dictionary array for string fields with Int32 keys. 
- fn create_dict_array(values: &[&str]) -> ArrayRef { - let keys: Vec = (0..values.len()).map(|i| i as i32).collect(); - let string_array = StringArray::from(values.to_vec()); - Arc::new( - DictionaryArray::::try_new(Int32Array::from(keys), Arc::new(string_array)) - .unwrap(), - ) - } + for tag in tags { + let tag_values: Vec> = vec![Some(tag); num_rows]; + columns.push(create_nullable_dict_array(&tag_values)); + } - /// Create nullable dictionary array for optional string fields. - fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef { - let keys: Vec> = values - .iter() - .enumerate() - .map(|(i, v)| v.map(|_| i as i32)) - .collect(); - let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect(); - let string_array = StringArray::from(string_values); - Arc::new( - DictionaryArray::::try_new(Int32Array::from(keys), Arc::new(string_array)) - .unwrap(), - ) + RecordBatch::try_new(schema, columns).unwrap() } - /// Create a test batch with specified number of rows and test data. + /// Create a simple test batch with default values. 
fn create_test_batch(num_rows: usize) -> RecordBatch { - let schema = ParquetSchema::new(); - - // Generate test data let metric_names: Vec<&str> = (0..num_rows) .map(|i| { if i % 2 == 0 { @@ -299,82 +280,23 @@ mod tests { }) .collect(); let timestamps: Vec = (0..num_rows).map(|i| 100 + i as u64 * 10).collect(); - - // MetricName: Dictionary(Int32, Utf8) - let metric_name: ArrayRef = create_dict_array(&metric_names); - - // MetricType: UInt8 - let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows])); - - // MetricUnit: Utf8 (nullable) - let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes"); num_rows])); - - // TimestampSecs: UInt64 - let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps)); - - // StartTimestampSecs: UInt64 (nullable) - let start_timestamp_secs: ArrayRef = - Arc::new(UInt64Array::from(vec![None::; num_rows])); - - // Value: Float64 - let values: Vec = (0..num_rows).map(|i| 42.0 + i as f64).collect(); - let value: ArrayRef = Arc::new(Float64Array::from(values)); - - // TagService: Dictionary(Int32, Utf8) (nullable) - let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]); - - // TagEnv: Dictionary(Int32, Utf8) (nullable) - let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]); - - // TagDatacenter: Dictionary(Int32, Utf8) (nullable) - let tag_datacenter: ArrayRef = - create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]); - - // TagRegion: Dictionary(Int32, Utf8) (nullable) - let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]); - - // TagHost: Dictionary(Int32, Utf8) (nullable) - let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]); - - // Attributes: VARIANT (nullable) - let attributes: ArrayRef = create_variant_array(num_rows, Some(&[("key", "value")])); - - // ServiceName: Dictionary(Int32, Utf8) - let service_names: Vec<&str> = (0..num_rows).map(|_| 
"my-service").collect(); - let service_name: ArrayRef = create_dict_array(&service_names); - - // ResourceAttributes: VARIANT (nullable) - let resource_attributes: ArrayRef = create_variant_array(num_rows, None); - - RecordBatch::try_new( - schema.arrow_schema().clone(), - vec![ - metric_name, - metric_type, - metric_unit, - timestamp_secs, - start_timestamp_secs, - value, - tag_service, - tag_env, - tag_datacenter, - tag_region, - tag_host, - attributes, - service_name, - resource_attributes, - ], + let service_names: Vec<&str> = vec!["my-service"; num_rows]; + + create_test_batch_with_options( + num_rows, + &metric_names, + &timestamps, + Some(&service_names), + &["service", "host"], ) - .unwrap() } #[test] fn test_write_split_creates_file() { - let schema = ParquetSchema::new(); let config = ParquetWriterConfig::default(); let temp_dir = tempfile::tempdir().unwrap(); - let writer = ParquetSplitWriter::new(schema, config, temp_dir.path()); + let writer = ParquetSplitWriter::new(config, temp_dir.path()); let batch = create_test_batch(10); let split = writer.write_split(&batch, "test-index").unwrap(); @@ -393,14 +315,19 @@ mod tests { #[test] fn test_write_split_extracts_time_range() { - let schema = ParquetSchema::new(); let config = ParquetWriterConfig::default(); let temp_dir = tempfile::tempdir().unwrap(); - let writer = ParquetSplitWriter::new(schema, config, temp_dir.path()); + let writer = ParquetSplitWriter::new(config, temp_dir.path()); // Create batch with timestamps [100, 150, 200] - let batch = create_test_batch_with_timestamps(&[100, 150, 200]); + let batch = create_test_batch_with_options( + 3, + &["test.metric", "test.metric", "test.metric"], + &[100, 150, 200], + Some(&["my-service", "my-service", "my-service"]), + &[], + ); let split = writer.write_split(&batch, "test-index").unwrap(); // Verify time range @@ -410,14 +337,19 @@ #[test] fn test_write_split_extracts_metric_names() { - let schema = ParquetSchema::new(); let config = 
ParquetWriterConfig::default(); let temp_dir = tempfile::tempdir().unwrap(); - let writer = ParquetSplitWriter::new(schema, config, temp_dir.path()); + let writer = ParquetSplitWriter::new(config, temp_dir.path()); // Create batch with specific metric names - let batch = create_test_batch_with_metric_names(&["cpu.usage", "memory.used", "cpu.usage"]); + let batch = create_test_batch_with_options( + 3, + &["cpu.usage", "memory.used", "cpu.usage"], + &[100, 100, 100], + Some(&["my-service", "my-service", "my-service"]), + &[], + ); let split = writer.write_split(&batch, "test-index").unwrap(); // Verify metric names (distinct values) @@ -425,93 +357,4 @@ mod tests { assert!(split.metadata.metric_names.contains("memory.used")); assert_eq!(split.metadata.metric_names.len(), 2); } - - /// Create a test batch with specific timestamps. - fn create_test_batch_with_timestamps(timestamps: &[u64]) -> RecordBatch { - let schema = ParquetSchema::new(); - let num_rows = timestamps.len(); - - let metric_names: Vec<&str> = vec!["test.metric"; num_rows]; - let metric_name: ArrayRef = create_dict_array(&metric_names); - let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows])); - let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes"); num_rows])); - let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps.to_vec())); - let start_timestamp_secs: ArrayRef = - Arc::new(UInt64Array::from(vec![None::; num_rows])); - let value: ArrayRef = Arc::new(Float64Array::from(vec![42.0; num_rows])); - let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]); - let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]); - let tag_datacenter: ArrayRef = - create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]); - let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]); - let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]); - let attributes: 
ArrayRef = create_variant_array(num_rows, None); - let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]); - let resource_attributes: ArrayRef = create_variant_array(num_rows, None); - - RecordBatch::try_new( - schema.arrow_schema().clone(), - vec![ - metric_name, - metric_type, - metric_unit, - timestamp_secs, - start_timestamp_secs, - value, - tag_service, - tag_env, - tag_datacenter, - tag_region, - tag_host, - attributes, - service_name, - resource_attributes, - ], - ) - .unwrap() - } - - /// Create a test batch with specific metric names. - fn create_test_batch_with_metric_names(metric_names: &[&str]) -> RecordBatch { - let schema = ParquetSchema::new(); - let num_rows = metric_names.len(); - - let metric_name: ArrayRef = create_dict_array(metric_names); - let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows])); - let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes"); num_rows])); - let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(vec![100u64; num_rows])); - let start_timestamp_secs: ArrayRef = - Arc::new(UInt64Array::from(vec![None::; num_rows])); - let value: ArrayRef = Arc::new(Float64Array::from(vec![42.0; num_rows])); - let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]); - let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]); - let tag_datacenter: ArrayRef = - create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]); - let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]); - let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]); - let attributes: ArrayRef = create_variant_array(num_rows, None); - let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]); - let resource_attributes: ArrayRef = create_variant_array(num_rows, None); - - RecordBatch::try_new( - schema.arrow_schema().clone(), - vec![ - metric_name, - metric_type, - metric_unit, - 
timestamp_secs, - start_timestamp_secs, - value, - tag_service, - tag_env, - tag_datacenter, - tag_region, - tag_host, - attributes, - service_name, - resource_attributes, - ], - ) - .unwrap() - } } diff --git a/quickwit/quickwit-parquet-engine/src/storage/writer.rs b/quickwit/quickwit-parquet-engine/src/storage/writer.rs index 50a375f42b8..6f29c0be4cc 100644 --- a/quickwit/quickwit-parquet-engine/src/storage/writer.rs +++ b/quickwit/quickwit-parquet-engine/src/storage/writer.rs @@ -27,7 +27,7 @@ use thiserror::Error; use tracing::{debug, instrument}; use super::config::ParquetWriterConfig; -use crate::schema::{ParquetField, ParquetSchema}; +use crate::schema::{SORT_ORDER, validate_required_fields}; /// Errors that can occur during parquet writing. #[derive(Debug, Error)] @@ -44,21 +44,20 @@ pub enum ParquetWriteError { #[error("Arrow error: {0}")] ArrowError(#[from] arrow::error::ArrowError), - /// Schema mismatch between RecordBatch and ParquetSchema. - #[error("Schema mismatch: expected {expected} fields, got {got}")] - SchemaMismatch { expected: usize, got: usize }, + /// Schema validation failed. + #[error("Schema validation failed: {0}")] + SchemaValidation(String), } /// Writer for metrics data to Parquet format. pub struct ParquetWriter { config: ParquetWriterConfig, - schema: ParquetSchema, } impl ParquetWriter { /// Create a new ParquetWriter. - pub fn new(schema: ParquetSchema, config: ParquetWriterConfig) -> Self { - Self { config, schema } + pub fn new(config: ParquetWriterConfig) -> Self { + Self { config } } /// Get the writer configuration. @@ -66,58 +65,58 @@ impl ParquetWriter { &self.config } - /// Get the metrics schema. - pub fn schema(&self) -> &ParquetSchema { - &self.schema - } - - /// Validate that a RecordBatch matches the expected schema. 
- fn validate_batch(&self, batch: &RecordBatch) -> Result<(), ParquetWriteError> { - let expected = self.schema.num_fields(); - let got = batch.num_columns(); - if expected != got { - return Err(ParquetWriteError::SchemaMismatch { expected, got }); - } - Ok(()) - } - /// Sort a RecordBatch by the metrics sort order. - /// Order: metric_name, tag_service, tag_env, tag_datacenter, tag_region, tag_host, - /// timestamp_secs. This sorting enables efficient pruning during query execution. + /// Columns from SORT_ORDER that are present in the batch schema are used; + /// missing columns are skipped. fn sort_batch(&self, batch: &RecordBatch) -> Result { - // Build sort columns from the defined sort order - let sort_columns: Vec = ParquetField::sort_order() + let schema = batch.schema(); + let mut sort_columns: Vec = SORT_ORDER .iter() - .map(|field| { - let col_idx = field.column_index(); - SortColumn { - values: Arc::clone(batch.column(col_idx)), - options: Some(SortOptions { - descending: false, - nulls_first: true, - }), - } + .filter_map(|name| schema.index_of(name).ok()) + .map(|idx| SortColumn { + values: Arc::clone(batch.column(idx)), + options: Some(SortOptions { + descending: false, + nulls_first: false, + }), }) .collect(); - // Compute sorted indices - let indices = lexsort_to_indices(&sort_columns, None)?; + if sort_columns.is_empty() { + return Ok(batch.clone()); + } + + // Append the original row index as a tiebreaker so that rows with + // identical sort keys keep their arrival order (stable sort semantics). + // lexsort_to_indices uses an unstable sort internally; the tiebreaker + // makes it behave stably at a small cost (one u32 comparison per + // equal-key pair, 4 bytes × num_rows of extra allocation). 
+ let row_indices = Arc::new(arrow::array::UInt32Array::from_iter_values( + 0..batch.num_rows() as u32, + )); + sort_columns.push(SortColumn { + values: row_indices, + options: Some(SortOptions { + descending: false, + nulls_first: false, + }), + }); - // Reorder the batch using the sorted indices - let sorted_batch = take_record_batch(batch, &indices)?; - Ok(sorted_batch) + let indices = lexsort_to_indices(&sort_columns, None)?; + Ok(take_record_batch(batch, &indices)?) } /// Write a RecordBatch to Parquet bytes in memory. /// The batch is sorted before writing by: metric_name, common tags, timestamp. #[instrument(skip(self, batch), fields(batch_rows = batch.num_rows()))] pub fn write_to_bytes(&self, batch: &RecordBatch) -> Result, ParquetWriteError> { - self.validate_batch(batch)?; + validate_required_fields(&batch.schema()) + .map_err(|e| ParquetWriteError::SchemaValidation(e.to_string()))?; // Sort the batch before writing for efficient pruning let sorted_batch = self.sort_batch(batch)?; - let props = self.config.to_writer_properties(); + let props = self.config.to_writer_properties(&sorted_batch.schema()); let buffer = Cursor::new(Vec::new()); let mut writer = ArrowWriter::try_new(buffer, sorted_batch.schema(), Some(props))?; @@ -125,7 +124,7 @@ impl ParquetWriter { let buffer = writer.into_inner()?; let bytes = buffer.into_inner(); - debug!(bytes_written = bytes.len(), "Completed write to bytes"); + debug!(bytes_written = bytes.len(), "completed write to bytes"); Ok(bytes) } @@ -139,12 +138,13 @@ impl ParquetWriter { batch: &RecordBatch, path: &Path, ) -> Result { - self.validate_batch(batch)?; + validate_required_fields(&batch.schema()) + .map_err(|e| ParquetWriteError::SchemaValidation(e.to_string()))?; // Sort the batch before writing for efficient pruning let sorted_batch = self.sort_batch(batch)?; - let props = self.config.to_writer_properties(); + let props = self.config.to_writer_properties(&sorted_batch.schema()); let file = File::create(path)?; let 
mut writer = ArrowWriter::try_new(file, sorted_batch.schema(), Some(props))?; @@ -152,7 +152,7 @@ impl ParquetWriter { let file = writer.into_inner()?; let bytes_written = file.metadata()?.len(); - debug!(bytes_written, "Completed write to file"); + debug!(bytes_written, "completed write to file"); Ok(bytes_written) } } @@ -165,148 +165,24 @@ mod tests { ArrayRef, DictionaryArray, Float64Array, StringArray, UInt8Array, UInt64Array, }; use arrow::datatypes::{DataType, Field, Int32Type, Schema}; - use parquet::variant::{VariantArrayBuilder, VariantBuilderExt}; use super::*; - - /// Create dictionary array for string fields with Int32 keys. - fn create_dict_array(values: Vec<&str>) -> ArrayRef { - let string_array = StringArray::from(values); - Arc::new( - DictionaryArray::::try_new( - arrow::array::Int32Array::from(vec![0i32]), - Arc::new(string_array), - ) - .unwrap(), - ) - } - - /// Create nullable dictionary array for optional string fields. - fn create_nullable_dict_array(value: Option<&str>) -> ArrayRef { - match value { - Some(v) => { - let string_array = StringArray::from(vec![v]); - Arc::new( - DictionaryArray::::try_new( - arrow::array::Int32Array::from(vec![0i32]), - Arc::new(string_array), - ) - .unwrap(), - ) - } - None => { - let string_array = StringArray::from(vec![None::<&str>]); - Arc::new( - DictionaryArray::::try_new( - arrow::array::Int32Array::from(vec![None::]), - Arc::new(string_array), - ) - .unwrap(), - ) - } - } - } - - /// Create a VARIANT array for testing. 
- fn create_variant_array(fields: Option<&[(&str, &str)]>) -> ArrayRef { - let mut builder = VariantArrayBuilder::new(1); - match fields { - Some(kv_pairs) => { - let mut obj = builder.new_object(); - for (key, value) in kv_pairs { - obj = obj.with_field(key, *value); - } - obj.finish(); - } - None => { - builder.append_null(); - } - } - ArrayRef::from(builder.build()) - } + use crate::test_helpers::create_test_batch_with_tags; fn create_test_batch() -> RecordBatch { - let schema = ParquetSchema::new(); - - // Create arrays for all 14 fields in ParquetSchema matching fields.rs: - // MetricName: Dictionary(Int32, Utf8) - let metric_name: ArrayRef = create_dict_array(vec!["test.metric"]); - - // MetricType: UInt8 - let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8])); // gauge - - // MetricUnit: Utf8 (nullable) - let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes")])); - - // TimestampSecs: UInt64 - let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(vec![1704067200u64])); - - // StartTimestampSecs: UInt64 (nullable) - let start_timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(vec![None::])); - - // Value: Float64 - let value: ArrayRef = Arc::new(Float64Array::from(vec![42.0])); - - // TagService: Dictionary(Int32, Utf8) (nullable) - let tag_service: ArrayRef = create_nullable_dict_array(Some("web")); - - // TagEnv: Dictionary(Int32, Utf8) (nullable) - let tag_env: ArrayRef = create_nullable_dict_array(Some("prod")); - - // TagDatacenter: Dictionary(Int32, Utf8) (nullable) - let tag_datacenter: ArrayRef = create_nullable_dict_array(Some("us-east-1")); - - // TagRegion: Dictionary(Int32, Utf8) (nullable) - let tag_region: ArrayRef = create_nullable_dict_array(None); - - // TagHost: Dictionary(Int32, Utf8) (nullable) - let tag_host: ArrayRef = create_nullable_dict_array(Some("host-001")); - - // Attributes: VARIANT (nullable) - let attributes: ArrayRef = create_variant_array(Some(&[("key", "value")])); - - // ServiceName: 
Dictionary(Int32, Utf8) - let service_name: ArrayRef = create_dict_array(vec!["my-service"]); - - // ResourceAttributes: VARIANT (nullable) - let resource_attributes: ArrayRef = create_variant_array(None); - - RecordBatch::try_new( - schema.arrow_schema().clone(), - vec![ - metric_name, - metric_type, - metric_unit, - timestamp_secs, - start_timestamp_secs, - value, - tag_service, - tag_env, - tag_datacenter, - tag_region, - tag_host, - attributes, - service_name, - resource_attributes, - ], - ) - .unwrap() + create_test_batch_with_tags(1, &["service", "env"]) } #[test] fn test_writer_creation() { - let schema = ParquetSchema::new(); let config = ParquetWriterConfig::default(); - let writer = ParquetWriter::new(schema, config); - - assert_eq!(writer.schema().num_fields(), 14); + let _writer = ParquetWriter::new(config); } #[test] fn test_write_to_bytes() { - let schema = ParquetSchema::new(); let config = ParquetWriterConfig::default(); - let writer = ParquetWriter::new(schema, config); + let writer = ParquetWriter::new(config); let batch = create_test_batch(); let bytes = writer.write_to_bytes(&batch).unwrap(); @@ -318,9 +194,8 @@ mod tests { #[test] fn test_write_to_file() { - let schema = ParquetSchema::new(); let config = ParquetWriterConfig::default(); - let writer = ParquetWriter::new(schema, config); + let writer = ParquetWriter::new(config); let batch = create_test_batch(); let temp_dir = std::env::temp_dir(); @@ -334,12 +209,11 @@ mod tests { } #[test] - fn test_schema_mismatch() { - let schema = ParquetSchema::new(); + fn test_schema_validation_missing_field() { let config = ParquetWriterConfig::default(); - let writer = ParquetWriter::new(schema, config); + let writer = ParquetWriter::new(config); - // Create a batch with wrong number of columns + // Create a batch missing required fields let wrong_schema = Arc::new(Schema::new(vec![Field::new( "single_field", DataType::Utf8, @@ -354,10 +228,37 @@ mod tests { let result = 
writer.write_to_bytes(&wrong_batch); assert!(matches!( result, - Err(ParquetWriteError::SchemaMismatch { - expected: 14, - got: 1 - }) + Err(ParquetWriteError::SchemaValidation(_)) + )); + } + + #[test] + fn test_schema_validation_wrong_type() { + let config = ParquetWriterConfig::default(); + let writer = ParquetWriter::new(config); + + // Create a batch where metric_name has wrong type (Utf8 instead of Dictionary) + let wrong_schema = Arc::new(Schema::new(vec![ + Field::new("metric_name", DataType::Utf8, false), + Field::new("metric_type", DataType::UInt8, false), + Field::new("timestamp_secs", DataType::UInt64, false), + Field::new("value", DataType::Float64, false), + ])); + let wrong_batch = RecordBatch::try_new( + wrong_schema, + vec![ + Arc::new(StringArray::from(vec!["test"])) as ArrayRef, + Arc::new(UInt8Array::from(vec![0u8])) as ArrayRef, + Arc::new(UInt64Array::from(vec![100u64])) as ArrayRef, + Arc::new(Float64Array::from(vec![1.0])) as ArrayRef, + ], + ) + .unwrap(); + + let result = writer.write_to_bytes(&wrong_batch); + assert!(matches!( + result, + Err(ParquetWriteError::SchemaValidation(_)) )); } @@ -365,9 +266,8 @@ mod tests { fn test_write_with_snappy_compression() { use super::super::config::Compression; - let schema = ParquetSchema::new(); let config = ParquetWriterConfig::new().with_compression(Compression::Snappy); - let writer = ParquetWriter::new(schema, config); + let writer = ParquetWriter::new(config); let batch = create_test_batch(); let bytes = writer.write_to_bytes(&batch).unwrap(); @@ -382,9 +282,25 @@ mod tests { use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; - let schema = ParquetSchema::new(); let config = ParquetWriterConfig::default(); - let writer = ParquetWriter::new(schema.clone(), config); + let writer = ParquetWriter::new(config); + + // Create a schema with required fields + service tag for sort verification + let schema = Arc::new(Schema::new(vec![ + Field::new( + "metric_name", + 
DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                false,
+            ),
+            Field::new("metric_type", DataType::UInt8, false),
+            Field::new("timestamp_secs", DataType::UInt64, false),
+            Field::new("value", DataType::Float64, false),
+            Field::new(
+                "service",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                true,
+            ),
+        ]));

         // Create unsorted batch with multiple rows:
         // Row 0: metric_b, service_a, timestamp=300
@@ -393,11 +309,6 @@ mod tests {
         // Expected sorted order: metric_a/service_a/200, metric_a/service_b/100,
         // metric_b/service_a/300

-        // Build arrays for 3 rows (original unsorted order in comments above)
-        let timestamps = [300u64, 100u64, 200u64];
-        let values = [1.0, 2.0, 3.0];
-
-        // metric_name: Dictionary(Int32, Utf8)
         let metric_name: ArrayRef = {
             let keys = arrow::array::Int32Array::from(vec![0i32, 1, 1]);
             let values = StringArray::from(vec!["metric_b", "metric_a"]);
@@ -405,88 +316,18 @@ mod tests {
         };

         let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8, 0, 0]));
-        let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![
-            Some("bytes"),
-            Some("bytes"),
-            Some("bytes"),
-        ]));
-        let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps.to_vec()));
-        let start_timestamp_secs: ArrayRef =
-            Arc::new(UInt64Array::from(vec![None::<u64>, None, None]));
-        let value: ArrayRef = Arc::new(Float64Array::from(values.to_vec()));
+        let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(vec![300u64, 100u64, 200u64]));
+        let value: ArrayRef = Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0]));

-        // tag_service: Dictionary(Int32, Utf8) (nullable)
-        let tag_service: ArrayRef = {
+        let service: ArrayRef = {
             let keys = arrow::array::Int32Array::from(vec![Some(0i32), Some(1), Some(0)]);
             let values = StringArray::from(vec!["service_a", "service_b"]);
             Arc::new(DictionaryArray::<Int32Type>::try_new(keys, Arc::new(values)).unwrap())
         };

-        let tag_env: ArrayRef = {
-            let keys =
arrow::array::Int32Array::from(vec![Some(0i32), Some(0), Some(0)]); - let values = StringArray::from(vec!["prod"]); - Arc::new(DictionaryArray::::try_new(keys, Arc::new(values)).unwrap()) - }; - - let tag_datacenter: ArrayRef = { - let keys = arrow::array::Int32Array::from(vec![None::, None, None]); - let values = StringArray::from(vec![None::<&str>]); - Arc::new(DictionaryArray::::try_new(keys, Arc::new(values)).unwrap()) - }; - - let tag_region: ArrayRef = { - let keys = arrow::array::Int32Array::from(vec![None::, None, None]); - let values = StringArray::from(vec![None::<&str>]); - Arc::new(DictionaryArray::::try_new(keys, Arc::new(values)).unwrap()) - }; - - let tag_host: ArrayRef = { - let keys = arrow::array::Int32Array::from(vec![None::, None, None]); - let values = StringArray::from(vec![None::<&str>]); - Arc::new(DictionaryArray::::try_new(keys, Arc::new(values)).unwrap()) - }; - - // Build VARIANT arrays for 3 rows - let attributes: ArrayRef = { - let mut builder = VariantArrayBuilder::new(3); - for _ in 0..3 { - builder.append_null(); - } - ArrayRef::from(builder.build()) - }; - - let service_name: ArrayRef = { - let keys = arrow::array::Int32Array::from(vec![0i32, 1, 0]); - let values = StringArray::from(vec!["service_a", "service_b"]); - Arc::new(DictionaryArray::::try_new(keys, Arc::new(values)).unwrap()) - }; - - let resource_attributes: ArrayRef = { - let mut builder = VariantArrayBuilder::new(3); - for _ in 0..3 { - builder.append_null(); - } - ArrayRef::from(builder.build()) - }; - let batch = RecordBatch::try_new( - schema.arrow_schema().clone(), - vec![ - metric_name, - metric_type, - metric_unit, - timestamp_secs, - start_timestamp_secs, - value, - tag_service, - tag_env, - tag_datacenter, - tag_region, - tag_host, - attributes, - service_name, - resource_attributes, - ], + schema, + vec![metric_name, metric_type, timestamp_secs, value, service], ) .unwrap(); @@ -509,25 +350,29 @@ mod tests { assert_eq!(result.num_rows(), 3); // Extract metric 
names and timestamps to verify sort order
+        let metric_idx = result.schema().index_of("metric_name").unwrap();
+        let ts_idx = result.schema().index_of("timestamp_secs").unwrap();
+        let service_idx = result.schema().index_of("service").unwrap();
+
         let metric_col = result
-            .column(ParquetField::MetricName.column_index())
+            .column(metric_idx)
             .as_any()
             .downcast_ref::<DictionaryArray<Int32Type>>()
             .unwrap();
         let ts_col = result
-            .column(ParquetField::TimestampSecs.column_index())
+            .column(ts_idx)
             .as_any()
             .downcast_ref::<UInt64Array>()
             .unwrap();
         let service_col = result
-            .column(ParquetField::TagService.column_index())
+            .column(service_idx)
             .as_any()
             .downcast_ref::<DictionaryArray<Int32Type>>()
             .unwrap();

         // Get string values from dictionary
-        let get_metric = |i: usize| -> &str {
-            let key = metric_col.keys().value(i);
+        let get_metric = |row: usize| -> &str {
+            let key = metric_col.keys().value(row);
             metric_col
                 .values()
                 .as_any()
@@ -535,8 +380,8 @@ mod tests {
                 .unwrap()
                 .value(key as usize)
         };
-        let get_service = |i: usize| -> &str {
-            let key = service_col.keys().value(i);
+        let get_service = |row: usize| -> &str {
+            let key = service_col.keys().value(row);
             service_col
                 .values()
                 .as_any()
@@ -545,7 +390,7 @@ mod tests {
                 .value(key as usize)
         };

-        // Expected sort order: metric_name ASC, tag_service ASC, timestamp ASC
+        // Expected sort order: metric_name ASC, service ASC, timestamp_secs ASC
         // Row 0: metric_a, service_a, 200 (original row 2)
         // Row 1: metric_a, service_b, 100 (original row 1)
         // Row 2: metric_b, service_a, 300 (original row 0)
diff --git a/quickwit/quickwit-parquet-engine/src/table_config.rs b/quickwit/quickwit-parquet-engine/src/table_config.rs
new file mode 100644
index 00000000000..002d99246bc
--- /dev/null
+++ b/quickwit/quickwit-parquet-engine/src/table_config.rs
@@ -0,0 +1,191 @@
+// Copyright 2021-Present Datadog, Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Default table configuration for Parquet pipeline product types. +//! +//! When a table (index) does not specify an explicit sort fields configuration, +//! these defaults are applied based on the product type. The `ParquetWriter` +//! resolves these sort field names to physical `ParquetField` columns at +//! construction time; columns not yet in the schema (e.g., `timeseries_id`) +//! are recorded in metadata but skipped during physical sort. + +use serde::{Deserialize, Serialize}; + +/// Product types supported by the Parquet pipeline. +/// +/// Each product type has a default sort fields schema that matches the common +/// query predicates for that signal type. See ADR-002 for the rationale. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ProductType { + Metrics, + Logs, + Traces, +} + +impl ProductType { + /// Default sort fields string for this product type. + /// + /// The metrics default includes `timeseries_id` as a tiebreaker before + /// `timestamp_secs`. Since `timeseries_id` is not yet a physical column, + /// the writer skips it during sort but records it in the metadata string. + /// When the column is added to the schema, sorting will include it + /// automatically. + /// + /// Logs and traces defaults are placeholders — they will be refined + /// when the Parquet pipeline is extended to those signal types. 
+ pub fn default_sort_fields(self) -> &'static str { + match self { + Self::Metrics => "metric_name|service|env|datacenter|region|host|timeseries_id|timestamp_secs/V2", + // Placeholder: column names TBD when logs Parquet schema is defined. + Self::Logs => "service_name|level|host|timestamp_secs/V2", + // Placeholder: column names TBD when traces Parquet schema is defined. + Self::Traces => "service_name|operation_name|trace_id|timestamp_secs/V2", + } + } +} + +/// Table-level configuration for the Parquet pipeline. +/// +/// Stored per-index. When `sort_fields` is `None`, the default for the +/// product type is used (see `ProductType::default_sort_fields()`). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TableConfig { + /// The product type determines schema defaults. + pub product_type: ProductType, + + /// Explicit sort fields override. When `None`, the product-type default + /// is used. When `Some`, this exact schema string is applied. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub sort_fields: Option, + + /// Window duration in seconds for time-windowed compaction. + /// Default: 900 (15 minutes). Must divide 3600. + #[serde(default = "default_window_duration_secs")] + pub window_duration_secs: u32, +} + +fn default_window_duration_secs() -> u32 { + 900 +} + +impl TableConfig { + /// Create a new TableConfig for the given product type with defaults. + pub fn new(product_type: ProductType) -> Self { + Self { + product_type, + sort_fields: None, + window_duration_secs: default_window_duration_secs(), + } + } + + /// Get the effective sort fields string for this table. + /// + /// Returns the explicit override if set, otherwise the product-type default. 
+ pub fn effective_sort_fields(&self) -> &str { + match &self.sort_fields { + Some(sf) => sf.as_str(), + None => self.product_type.default_sort_fields(), + } + } +} + +impl Default for TableConfig { + fn default() -> Self { + Self::new(ProductType::Metrics) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::sort_fields::parse_sort_fields; + + #[test] + fn test_metrics_default_sort_fields_parses() { + let schema = parse_sort_fields(ProductType::Metrics.default_sort_fields()) + .expect("metrics default sort fields must parse"); + assert_eq!(schema.column.len(), 8); + // Proto names are bare (suffixes stripped by parser). + assert_eq!(schema.column[0].name, "metric_name"); + assert_eq!(schema.column[1].name, "service"); + assert_eq!(schema.column[6].name, "timeseries_id"); + assert_eq!(schema.column[7].name, "timestamp_secs"); + } + + #[test] + fn test_logs_default_sort_fields_parses() { + let schema = parse_sort_fields(ProductType::Logs.default_sort_fields()) + .expect("logs default sort fields must parse"); + assert_eq!(schema.column.len(), 4); + assert_eq!(schema.column[0].name, "service_name"); + } + + #[test] + fn test_traces_default_sort_fields_parses() { + let schema = parse_sort_fields(ProductType::Traces.default_sort_fields()) + .expect("traces default sort fields must parse"); + assert_eq!(schema.column.len(), 4); + assert_eq!(schema.column[0].name, "service_name"); + } + + #[test] + fn test_effective_sort_fields_uses_default() { + let config = TableConfig::new(ProductType::Metrics); + assert_eq!( + config.effective_sort_fields(), + ProductType::Metrics.default_sort_fields() + ); + } + + #[test] + fn test_effective_sort_fields_uses_override() { + let mut config = TableConfig::new(ProductType::Metrics); + config.sort_fields = Some("custom__s|timestamp/V2".to_string()); + assert_eq!(config.effective_sort_fields(), "custom__s|timestamp/V2"); + } + + #[test] + fn test_default_window_duration() { + let config = TableConfig::default(); + 
assert_eq!(config.window_duration_secs, 900); + } + + #[test] + fn test_table_config_serde_roundtrip() { + let config = TableConfig::new(ProductType::Traces); + let json = serde_json::to_string(&config).unwrap(); + let recovered: TableConfig = serde_json::from_str(&json).unwrap(); + assert_eq!(recovered.product_type, ProductType::Traces); + assert!(recovered.sort_fields.is_none()); + assert_eq!(recovered.window_duration_secs, 900); + } + + #[test] + fn test_table_config_serde_with_override() { + let mut config = TableConfig::new(ProductType::Metrics); + config.sort_fields = Some("host__s|timestamp/V2".to_string()); + config.window_duration_secs = 3600; + + let json = serde_json::to_string(&config).unwrap(); + assert!(json.contains("host__s|timestamp/V2")); + + let recovered: TableConfig = serde_json::from_str(&json).unwrap(); + assert_eq!( + recovered.sort_fields.as_deref(), + Some("host__s|timestamp/V2") + ); + assert_eq!(recovered.window_duration_secs, 3600); + } +} diff --git a/quickwit/quickwit-parquet-engine/src/test_helpers.rs b/quickwit/quickwit-parquet-engine/src/test_helpers.rs new file mode 100644 index 00000000000..dd6892dbe23 --- /dev/null +++ b/quickwit/quickwit-parquet-engine/src/test_helpers.rs @@ -0,0 +1,96 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Shared test helpers for building Arrow RecordBatches in unit tests. 
+
+use std::sync::Arc;
+
+use arrow::array::{
+    ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array, UInt64Array,
+};
+use arrow::datatypes::{DataType, Field, Int32Type, Schema as ArrowSchema};
+use arrow::record_batch::RecordBatch;
+
+/// Creates a dictionary-encoded string array with compact 0-based keys.
+pub fn create_dict_array(values: &[&str]) -> ArrayRef {
+    let keys: Vec<i32> = (0..values.len()).map(|i| i as i32).collect();
+    let string_array = StringArray::from(values.to_vec());
+    Arc::new(
+        DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
+            .unwrap(),
+    )
+}
+
+/// Creates a nullable dictionary-encoded string array.
+///
+/// Each `Some(value)` gets the next key into the filtered dictionary
+/// values; `None` entries produce a null key.
+pub fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef {
+    let mut next_key = -1i32;
+    let keys: Vec<Option<i32>> = values
+        .iter()
+        .map(|v| v.map(|_| { next_key += 1; next_key }))
+        .collect();
+    let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect();
+    let string_array = StringArray::from(string_values);
+    Arc::new(
+        DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
+            .unwrap(),
+    )
+}
+
+/// Creates a RecordBatch with the 4 required fields plus the specified
+/// nullable dictionary-encoded tag columns.
+/// +/// - `metric_name`: all rows set to `"cpu.usage"` +/// - `metric_type`: all rows `0` (Gauge) +/// - `timestamp_secs`: sequential, starting at `100` +/// - `value`: sequential `f64` starting at `42.0` +/// - each tag column: all rows set to the column name as the value +pub fn create_test_batch_with_tags(num_rows: usize, tags: &[&str]) -> RecordBatch { + let dict_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); + + let mut fields = vec![ + Field::new("metric_name", dict_type.clone(), false), + Field::new("metric_type", DataType::UInt8, false), + Field::new("timestamp_secs", DataType::UInt64, false), + Field::new("value", DataType::Float64, false), + ]; + for tag in tags { + fields.push(Field::new(*tag, dict_type.clone(), true)); + } + let schema = Arc::new(ArrowSchema::new(fields)); + + let metric_names: Vec<&str> = vec!["cpu.usage"; num_rows]; + let metric_name: ArrayRef = create_dict_array(&metric_names); + let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows])); + let timestamps: Vec = (0..num_rows).map(|i| 100 + i as u64).collect(); + let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps)); + let values: Vec = (0..num_rows).map(|i| 42.0 + i as f64).collect(); + let value: ArrayRef = Arc::new(Float64Array::from(values)); + + let mut columns: Vec = vec![metric_name, metric_type, timestamp_secs, value]; + for tag in tags { + let tag_values: Vec> = vec![Some(tag); num_rows]; + columns.push(create_nullable_dict_array(&tag_values)); + } + + RecordBatch::try_new(schema, columns).unwrap() +} + +/// Creates a RecordBatch with the 4 required fields and default tags +/// (`service`, `host`). 
+pub fn create_test_batch(num_rows: usize) -> RecordBatch { + create_test_batch_with_tags(num_rows, &["service", "host"]) +} diff --git a/quickwit/quickwit-proto/build.rs b/quickwit/quickwit-proto/build.rs index 569d9b5315b..37e7d7c8cb1 100644 --- a/quickwit/quickwit-proto/build.rs +++ b/quickwit/quickwit-proto/build.rs @@ -223,6 +223,19 @@ fn main() -> Result<(), Box> { ], )?; + // Event Store sort schema proto (vendored from dd-source). + let sortschema_prost_config = prost_build::Config::default(); + tonic_prost_build::configure() + .type_attribute(".", "#[derive(serde::Serialize, serde::Deserialize)]") + .out_dir("src/codegen/sortschema") + .compile_with_config( + sortschema_prost_config, + &[std::path::PathBuf::from( + "protos/event_store_sortschema/event_store_sortschema.proto", + )], + &[std::path::PathBuf::from("protos/event_store_sortschema")], + )?; + // OTEL proto let mut prost_config = prost_build::Config::default(); prost_config.protoc_arg("--experimental_allow_proto3_optional"); diff --git a/quickwit/quickwit-proto/protos/event_store_sortschema/event_store_sortschema.proto b/quickwit/quickwit-proto/protos/event_store_sortschema/event_store_sortschema.proto new file mode 100644 index 00000000000..a2e1d245b6d --- /dev/null +++ b/quickwit/quickwit-proto/protos/event_store_sortschema/event_store_sortschema.proto @@ -0,0 +1,231 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +syntax = "proto3"; + +// Vendored from dd-source: domains/event-platform/shared/libs/event-store-proto/protos/event_store_sortschema/ +// Keep identical to upstream -- do NOT strip unused fields. + +package sortschema; + +option java_package = "com.dd.event.store.sortschema.proto"; +option java_multiple_files = true; +option go_package = "github.com/DataDog/dd-source/domains/event-platform/shared/libs/event-store-proto/protos/event_store_sortschema"; + +// NOTE: Be thoughtful making changes to this file. Everything included here is +// included in the metadata of every fragment, and in each fragment header. + +enum SortColumnDirection { + // SortDirectionUnknown will pop up if someone forgets to initialize a field. + SortDirectionUnknown = 0; + + // SortDirectionAscending sorts columns in ascending value order. + SortDirectionAscending = 1; + + // SortDirectionDescending sorts columns in descending value order. + SortDirectionDescending = 2; +} + +// SortColumn represents a single column that's participating in the sort order +// for rows in the fragment files. +// +// NOTE: Carefully consider whether you need to modify the implementation of +// schemautils.go/EquivalentSchemas function whenever this struct is modified. +message SortColumn { + string name = 1; + // This integer corresponds 1:1 with the application-level column type which is also a number. + // We do that instead of breaking out a separate ENUM here to avoid having to maintain a bunch + // of mapping code that converts from column types to protobuf ENUMs and back again for no reason. + uint64 column_type = 2; + SortColumnDirection sort_direction = 3; + + // NOTE: If we decide later to form lexicographic keys from the schema, + // it may be useful to supply an integer here which would indicate + // the sort precedence and could be used for tagging for e.g. + // orderedcode lexicographic keys. 
+} + +// SortSchema represents a set of column names and precedence values which define +// the partial lexicographical sort order of rows in a fragment file, built from the +// individual column orderings. Precedence of the columns for sorting is given by +// the order of the columns in the SortSchema. "timestamp" must appear, with +// SortColumnType INT64, SortDirection DESCENDING, last. +// +// NOTE: Carefully consider whether you need to modify the implementation of +// schemautils.go/EquivalentSchemas function whenever this struct is modified. +message SortSchema { + // Very common schemas used all over the place are assigned a unique version number (reference + // in dd-go/logs/apps/logs-event-store/storage/fragment/schema_versions.go). + // + // This version number can be used in place of the complete schema description in the fragment + // metadata entries, to reduce their size. When this value is not null, the other fields in the + // SortSchema are not used and don't need to be set. + uint64 version = 4; + + repeated SortColumn column = 1; + + // Used in metrics, etc. + string name = 2; + + // SortVersion specifies what type of sorting of the data has been done. + // Fragments with different SortVersion numbers are *never* merged by the + // compactor. + enum SortVersion { + // The initial version of per-file locality sorting had a bug where it sorted + // the rows by the full values of string fields in the sort columns, but reported + // trimmed values in the RowKeys. This could yield what appeared to be + // "out of order" keys because keys that only differ in the portion that was + // trimmed could sort differently than trimmed values, and this would yield + // what looked like overlapping fragments after m:n merges. Fragments produced + // this way are not compatible with the LSM algorithm and must be excluded. 
+ INCORRECT_TRIM = 0; + + // Files marked with this version use LessTrimmed both in their production + // in the writer, and in the merges produced by the compactor. Trimming has + // a "budget" that allows the leading string fields to take up "more" + // characters if they need to without overflowing into huge values in the + // metadata server. This budgeting is intended to cover when e.g. "service" + // is a leading tag with long service names. + TRIMMED_WITH_BUDGET = 1; + } + + SortVersion sort_version = 3; + + // Cutoff position for LSM compaction comparisons. Only columns 0 through + // lsm_comparison_cutoff-1 are considered for fragment locality decisions. + // Allows sorting by columns that improve compression without creating + // unnecessary extra compaction work. + // This cutoff is represented in the sort schema string format with an `&` + // at the beginning of the name of the first column that should be + // ignored, like "service__s|&env__s|timestamp/V1" which would only use + // "service__s" for locality comparisons. + // When unset, or explicitly set to 0, the schema utils will use the + // legacy logic of ignoring timestamp and tiebreaker, if they are present. + int32 lsm_comparison_cutoff = 5; +} + +// A ColumnValue is a string, int, uint or float which corresponds to a sort column. +message ColumnValue { + oneof value { + bytes type_string = 1; + int64 type_int = 2; + double type_float = 3; + } +} + +// ColumnValues represent a set of column values that correspond with a particular +// SortSchema. +message ColumnValues { + repeated ColumnValue column = 1; +} + +// RowKeys represent the "row keys" of the first and last rows of a fragment file. +// The values are the values of the sort columns at the first and last row, respectively. +message RowKeys { + // These are the values of the sort colums at the first row, thus defining the + // lower inclusive boundary of the fragment file, i.e. 
using the minimum values of any + // multi-values in the column values(1), which is what the sorting operates upon. + ColumnValues min_row_values = 1; + + // These are the values of the sort columns at the last row, thus defining the + // upper boundary of the sorted rows of the fragment file, i.e. using the minimum + // values of any multi-values in the column values(1), which is what the sorting + // operates upon. + // + // Note that the string fields in min_row_values and max_row_values are trimmed + // to avoid storing "too large" values in the metadata server, and the sort order + // of the rows in the fragment files are determined using these trimmed keys to + // match. + // + // During m:n merging of fragments, when boundaries of fragments are selected, + // they are selected at transitions of key values of the trimmed keys so that the + // [min_row_values, max_row_values] ranges of the output fragments do not overlap. + // This property is important for two reasons. First, we want to prune fragments + // from queries based on the values of the sort schema columns. By making the + // fragments non-overlapping, we enable this pruning. Secondly, we don't want + // compaction strategies like the classic LSM strategy to re-merge the same + // fragments over and over. + // + // An interesting consequence of the key trimming is that a fragment may have + // a row in it with values that come "after" the max_row_values keys outside + // of the "trimmed" range of the column values. For example, if the trim budget + // for a column is four characters, and the fragment ends at "abcd", "abcd" and + // "abcd123" would yield the same trimmed key value and would both be stored in + // the same fragment. However "abce" would be stored in a different fragment. + ColumnValues max_row_values = 2; + + // This set of values defines the all-inclusive range boundary of the file considering + // the max values of any multi-values in the column values(2). 
This boundary is the + // boundary w.r.t. queries. + ColumnValues all_inclusive_max_row_values = 3; + + + // For track table expirations (splitting M fragments into N1 expired fragments and N2 live fragments), + // there is a potential for output fragments to overlap in key range. This is because the sort schema of track + // table starts with retention at intake + scopeID, and retention can change between the moment the fragment + // is created and the moment it is expired. To give an example, say a fragment has scopeIDs 1, 2, 3, 4, 5, + // all with a retention at intake of 7 days. If the retention for 3 and 5 is changed to 15 days, + // then 7 days later at expiration the fragment will be split into two fragments, + // one "expired" with scopeIDs 1, 2 and 4 and one "live" with scopeIDs 3 and 5. + // The row keys will be (7:1..., 7:4...) for the first fragment, and (7:3...,7:5...) for the second, + // which overlap. To avoid this, we add an `expired` boolean to the row keys, so we allow a live and an expired + // fragment to overlap in key range. + bool expired = 4; + + // Footnotes: + // + // (1) For min_row_values and max_row_values with multi-values in the column + // values, the sort order is by the minimum value of a multi-value for an + // ascending-sort-order column and the maximum value for a descending-sort-order + // column. + // (2) For all_inclusive_max_row_values, the value of any multi-values encountered + // that is used is the maximum value for an ascending-sort-order column and + // the minimum value for a descending-sort-order column. + + + + // Example: + // + // Let's say we have the following columns. Values at each row enclosed with [brackets]. + // Column B is sorted in descending order. The others are sorted in ascending order. 
+ // + // Column A Column B Column C + // ======== ======== ======== + // 1: [1, 2, 99] [B] [x, y] + // 2: [1, 4] [A, C] [y] + // 3: [2] [B, C] [z] + // 4: [2] [A, B, C] [x] + // 5: [2] [A] [y, z] + // 6: [3] [B] [x, y, z] + // + // min_row_key is {A=1, B=B, C=x} + // max_row_key is {A=3, B=B, C=x} + // all_inclusive_max_row_key is {A=99, B=B, C=y} + // + // The idea with these keys is that [min_row_key, max_row_key] defines the range that + // covers a single fragment file for the purposes of computing overlaps for deciding + // what to perform an m:n merge to "flatten" files that overlap in key range so that + // queries don't have to visit all the files. + // + // If there are no multi-valued sort columns, this is the end of the story. However + // if there are multi-valued sort columns, the range that a given file covers in + // terms of queries is given by [min_row_key, all_inclusive_max_row_key]. A lot of + // overlaps with these ranges indicate "atomic rows" for a table that cause queries + // to hit more files, degrading performance. + // + // Initially the compactor will only consider [min_row_key, max_row_key], with the + // all_inclusive_max_row_key present for diagnostics. In the future we may use + // this key to determine whether we want to do anything "special" with files that + // have atomic rows, such as duplicating the rows at each value of the multi-value. +} diff --git a/quickwit/quickwit-proto/src/codegen/sortschema/sortschema.rs b/quickwit/quickwit-proto/src/codegen/sortschema/sortschema.rs new file mode 100644 index 00000000000..2550c847794 --- /dev/null +++ b/quickwit/quickwit-proto/src/codegen/sortschema/sortschema.rs @@ -0,0 +1,230 @@ +// This file is @generated by prost-build. +/// SortColumn represents a single column that's participating in the sort order +/// for rows in the fragment files. 
+/// +/// NOTE: Carefully consider whether you need to modify the implementation of +/// schemautils.go/EquivalentSchemas function whenever this struct is modified. +#[derive(serde::Serialize, serde::Deserialize)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct SortColumn { + #[prost(string, tag = "1")] + pub name: ::prost::alloc::string::String, + /// This integer corresponds 1:1 with the application-level column type which is also a number. + /// We do that instead of breaking out a separate ENUM here to avoid having to maintain a bunch + /// of mapping code that converts from column types to protobuf ENUMs and back again for no reason. + #[prost(uint64, tag = "2")] + pub column_type: u64, + #[prost(enumeration = "SortColumnDirection", tag = "3")] + pub sort_direction: i32, +} +/// SortSchema represents a set of column names and precedence values which define +/// the partial lexicographical sort order of rows in a fragment file, built from the +/// individual column orderings. Precedence of the columns for sorting is given by +/// the order of the columns in the SortSchema. "timestamp" must appear, with +/// SortColumnType INT64, SortDirection DESCENDING, last. +/// +/// NOTE: Carefully consider whether you need to modify the implementation of +/// schemautils.go/EquivalentSchemas function whenever this struct is modified. +#[derive(serde::Serialize, serde::Deserialize)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct SortSchema { + /// Very common schemas used all over the place are assigned a unique version number (reference + /// in dd-go/logs/apps/logs-event-store/storage/fragment/schema_versions.go). + /// + /// This version number can be used in place of the complete schema description in the fragment + /// metadata entries, to reduce their size. When this value is not null, the other fields in the + /// SortSchema are not used and don't need to be set. 
+ #[prost(uint64, tag = "4")] + pub version: u64, + #[prost(message, repeated, tag = "1")] + pub column: ::prost::alloc::vec::Vec, + /// Used in metrics, etc. + #[prost(string, tag = "2")] + pub name: ::prost::alloc::string::String, + #[prost(enumeration = "sort_schema::SortVersion", tag = "3")] + pub sort_version: i32, + /// Cutoff position for LSM compaction comparisons. Only columns 0 through + /// lsm_comparison_cutoff-1 are considered for fragment locality decisions. + /// Allows sorting by columns that improve compression without creating + /// unnecessary extra compaction work. + /// This cutoff is represented in the sort schema string format with an `&` + /// at the beginning of the name of the first column that should be + /// ignored, like "service__s|&env__s|timestamp/V1" which would only use + /// "service__s" for locality comparisons. + /// When unset, or explicitly set to 0, the schema utils will use the + /// legacy logic of ignoring timestamp and tiebreaker, if they are present. + #[prost(int32, tag = "5")] + pub lsm_comparison_cutoff: i32, +} +/// Nested message and enum types in `SortSchema`. +pub mod sort_schema { + /// SortVersion specifies what type of sorting of the data has been done. + /// Fragments with different SortVersion numbers are *never* merged by the + /// compactor. + #[derive(serde::Serialize, serde::Deserialize)] + #[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + Hash, + PartialOrd, + Ord, + ::prost::Enumeration + )] + #[repr(i32)] + pub enum SortVersion { + /// The initial version of per-file locality sorting had a bug where it sorted + /// the rows by the full values of string fields in the sort columns, but reported + /// trimmed values in the RowKeys. This could yield what appeared to be + /// "out of order" keys because keys that only differ in the portion that was + /// trimmed could sort differently than trimmed values, and this would yield + /// what looked like overlapping fragments after m:n merges. 
Fragments produced + /// this way are not compatible with the LSM algorithm and must be excluded. + IncorrectTrim = 0, + /// Files marked with this version use LessTrimmed both in their production + /// in the writer, and in the merges produced by the compactor. Trimming has + /// a "budget" that allows the leading string fields to take up "more" + /// characters if they need to without overflowing into huge values in the + /// metadata server. This budgeting is intended to cover when e.g. "service" + /// is a leading tag with long service names. + TrimmedWithBudget = 1, + } + impl SortVersion { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + Self::IncorrectTrim => "INCORRECT_TRIM", + Self::TrimmedWithBudget => "TRIMMED_WITH_BUDGET", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "INCORRECT_TRIM" => Some(Self::IncorrectTrim), + "TRIMMED_WITH_BUDGET" => Some(Self::TrimmedWithBudget), + _ => None, + } + } + } +} +/// A ColumnValue is a string, int, uint or float which corresponds to a sort column. +#[derive(serde::Serialize, serde::Deserialize)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ColumnValue { + #[prost(oneof = "column_value::Value", tags = "1, 2, 3")] + pub value: ::core::option::Option, +} +/// Nested message and enum types in `ColumnValue`. 
+pub mod column_value { + #[derive(serde::Serialize, serde::Deserialize)] + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum Value { + #[prost(bytes, tag = "1")] + TypeString(::prost::alloc::vec::Vec), + #[prost(int64, tag = "2")] + TypeInt(i64), + #[prost(double, tag = "3")] + TypeFloat(f64), + } +} +/// ColumnValues represent a set of column values that correspond with a particular +/// SortSchema. +#[derive(serde::Serialize, serde::Deserialize)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ColumnValues { + #[prost(message, repeated, tag = "1")] + pub column: ::prost::alloc::vec::Vec, +} +/// RowKeys represent the "row keys" of the first and last rows of a fragment file. +/// The values are the values of the sort columns at the first and last row, respectively. +#[derive(serde::Serialize, serde::Deserialize)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct RowKeys { + /// These are the values of the sort columns at the first row, thus defining the + /// lower inclusive boundary of the fragment file, i.e. using the minimum values of any + /// multi-values in the column values(1), which is what the sorting operates upon. + #[prost(message, optional, tag = "1")] + pub min_row_values: ::core::option::Option, + /// These are the values of the sort columns at the last row, thus defining the + /// upper boundary of the sorted rows of the fragment file, i.e. using the minimum + /// values of any multi-values in the column values(1), which is what the sorting + /// operates upon. + /// + /// Note that the string fields in min_row_values and max_row_values are trimmed + /// to avoid storing "too large" values in the metadata server, and the sort order + /// of the rows in the fragment files are determined using these trimmed keys to + /// match. 
+ /// + /// During m:n merging of fragments, when boundaries of fragments are selected, + /// they are selected at transitions of key values of the trimmed keys so that the + /// \[min_row_values, max_row_values\] ranges of the output fragments do not overlap. + /// This property is important for two reasons. First, we want to prune fragments + /// from queries based on the values of the sort schema columns. By making the + /// fragments non-overlapping, we enable this pruning. Secondly, we don't want + /// compaction strategies like the classic LSM strategy to re-merge the same + /// fragments over and over. + /// + /// An interesting consequence of the key trimming is that a fragment may have + /// a row in it with values that come "after" the max_row_values keys outside + /// of the "trimmed" range of the column values. For example, if the trim budget + /// for a column is four characters, and the fragment ends at "abcd", "abcd" and + /// "abcd123" would yield the same trimmed key value and would both be stored in + /// the same fragment. However "abce" would be stored in a different fragment. + #[prost(message, optional, tag = "2")] + pub max_row_values: ::core::option::Option, + /// This set of values defines the all-inclusive range boundary of the file considering + /// the max values of any multi-values in the column values(2). This boundary is the + /// boundary w.r.t. queries. + #[prost(message, optional, tag = "3")] + pub all_inclusive_max_row_values: ::core::option::Option, + /// For track table expirations (splitting M fragments into N1 expired fragments and N2 live fragments), + /// there is a potential for output fragments to overlap in key range. This is because the sort schema of track + /// table starts with retention at intake + scopeID, and retention can change between the moment the fragment + /// is created and the moment it is expired. 
To give an example, say a fragment has scopeIDs 1, 2, 3, 4, 5, + /// all with a retention at intake of 7 days. If the retention for 3 and 5 is changed to 15 days, + /// then 7 days later at expiration the fragment will be split into two fragments, + /// one "expired" with scopeIDs 1, 2 and 4 and one "live" with scopeIDs 3 and 5. + /// The row keys will be (7:1..., 7:4...) for the first fragment, and (7:3...,7:5...) for the second, + /// which overlap. To avoid this, we add an `expired` boolean to the row keys, so we allow a live and an expired + /// fragment to overlap in key range. + #[prost(bool, tag = "4")] + pub expired: bool, +} +#[derive(serde::Serialize, serde::Deserialize)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] +#[repr(i32)] +pub enum SortColumnDirection { + /// SortDirectionUnknown will pop up if someone forgets to initialize a field. + SortDirectionUnknown = 0, + /// SortDirectionAscending sorts columns in ascending value order. + SortDirectionAscending = 1, + /// SortDirectionDescending sorts columns in descending value order. + SortDirectionDescending = 2, +} +impl SortColumnDirection { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + Self::SortDirectionUnknown => "SortDirectionUnknown", + Self::SortDirectionAscending => "SortDirectionAscending", + Self::SortDirectionDescending => "SortDirectionDescending", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. 
+ pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SortDirectionUnknown" => Some(Self::SortDirectionUnknown), + "SortDirectionAscending" => Some(Self::SortDirectionAscending), + "SortDirectionDescending" => Some(Self::SortDirectionDescending), + _ => None, + } + } +} diff --git a/quickwit/quickwit-proto/src/lib.rs b/quickwit/quickwit-proto/src/lib.rs index dbe850b55b7..3a7f7ea8992 100644 --- a/quickwit/quickwit-proto/src/lib.rs +++ b/quickwit/quickwit-proto/src/lib.rs @@ -15,6 +15,7 @@ #![allow(clippy::derive_partial_eq_without_eq)] #![allow(clippy::disallowed_methods)] #![allow(clippy::doc_lazy_continuation)] +#![allow(deprecated)] // prost::DecodeError::new is deprecated but used in generated decode impls #![allow(rustdoc::invalid_html_tags)] use std::cmp::Ordering; @@ -37,10 +38,16 @@ pub mod indexing; pub mod ingest; pub mod metastore; pub mod search; +pub mod sort_fields_error; pub mod types; pub use error::{GrpcServiceError, ServiceError, ServiceErrorCode}; use search::ReportSplitsRequest; +pub use sort_fields_error::SortFieldsError; + +pub mod sortschema { + include!("codegen/sortschema/sortschema.rs"); +} pub mod jaeger { pub mod api_v2 { @@ -123,7 +130,8 @@ impl TryFrom for search::SearchRequest { pub struct MutMetadataMap<'a>(&'a mut tonic::metadata::MetadataMap); impl Injector for MutMetadataMap<'_> { - /// Sets a key-value pair in the [`MetadataMap`]. No-op if the key or value is invalid. + /// Sets a key-value pair in the [`tonic::metadata::MetadataMap`]. No-op if the key or value + /// is invalid. fn set(&mut self, key: &str, value: String) { if let Ok(metadata_key) = tonic::metadata::MetadataKey::from_bytes(key.as_bytes()) && let Ok(metadata_value) = tonic::metadata::MetadataValue::try_from(&value) @@ -134,13 +142,13 @@ impl Injector for MutMetadataMap<'_> { } impl Extractor for MutMetadataMap<'_> { - /// Gets a value for a key from the MetadataMap. 
If the value can't be converted to &str, + /// Gets a value for a key from the `MetadataMap`. If the value can't be converted to &str, /// returns None. fn get(&self, key: &str) -> Option<&str> { self.0.get(key).and_then(|metadata| metadata.to_str().ok()) } - /// Collect all the keys from the MetadataMap. + /// Collect all the keys from the `MetadataMap`. fn keys(&self) -> Vec<&str> { self.0 .keys() @@ -174,13 +182,13 @@ impl Interceptor for SpanContextInterceptor { struct MetadataMap<'a>(&'a tonic::metadata::MetadataMap); impl Extractor for MetadataMap<'_> { - /// Gets a value for a key from the MetadataMap. If the value can't be converted to &str, + /// Gets a value for a key from the `MetadataMap`. If the value can't be converted to &str, /// returns None. fn get(&self, key: &str) -> Option<&str> { self.0.get(key).and_then(|metadata| metadata.to_str().ok()) } - /// Collect all the keys from the MetadataMap. + /// Collect all the keys from the `MetadataMap`. fn keys(&self) -> Vec<&str> { self.0 .keys() diff --git a/quickwit/quickwit-proto/src/metastore/mod.rs b/quickwit/quickwit-proto/src/metastore/mod.rs index ba371c13d4a..4f53b9abb5c 100644 --- a/quickwit/quickwit-proto/src/metastore/mod.rs +++ b/quickwit/quickwit-proto/src/metastore/mod.rs @@ -193,6 +193,15 @@ impl From for MetastoreError { } } +impl From for MetastoreError { + fn from(err: crate::SortFieldsError) -> Self { + MetastoreError::Internal { + message: "sort fields error".to_string(), + cause: err.to_string(), + } + } +} + impl ServiceError for MetastoreError { fn error_code(&self) -> ServiceErrorCode { match self { diff --git a/quickwit/quickwit-proto/src/sort_fields_error.rs b/quickwit/quickwit-proto/src/sort_fields_error.rs new file mode 100644 index 00000000000..8fa6ebc15b3 --- /dev/null +++ b/quickwit/quickwit-proto/src/sort_fields_error.rs @@ -0,0 +1,78 @@ +// Copyright 2021-Present Datadog, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Error types for sort fields parsing and validation. + +/// Errors arising from sort fields parsing, validation, and time-window arithmetic. +#[derive(Debug, thiserror::Error)] +pub enum SortFieldsError { + /// Window duration does not meet the divisibility constraint. + #[error("invalid window duration {duration_secs}s: {reason}")] + InvalidWindowDuration { + duration_secs: u32, + reason: &'static str, + }, + + /// Schema string is syntactically malformed. + #[error("{0}")] + MalformedSchema(String), + + /// Version suffix could not be parsed. + #[error("{0}")] + BadSortVersion(String), + + /// Sort version is below the minimum accepted (V2-only enforcement). + #[error("unsupported sort version {version}, minimum is {minimum}")] + UnsupportedVersion { version: i32, minimum: i32 }, + + /// Invalid placement or usage of the `&` LSM cutoff marker. + #[error("{0}")] + InvalidCutoffPlacement(String), + + /// Column specification has wrong number of parts. + #[error("{0}")] + InvalidColumnFormat(String), + + /// Unknown column type (from suffix or explicit name). + #[error("{0}")] + UnknownColumnType(String), + + /// Explicit column type does not match the type inferred from the suffix. 
+ #[error( + "column type doesn't match type deduced from suffix for {column}, deduced={from_suffix}, \ + explicit={explicit}" + )] + TypeMismatch { + column: String, + from_suffix: String, + explicit: String, + }, + + /// Unrecognized sort direction string. + #[error("{0}")] + UnknownSortDirection(String), + + /// Sort direction specified in multiple places (e.g., both prefix and suffix, + /// or prefix/suffix combined with colon-separated direction). + #[error("sort direction specified multiple times for column '{0}'")] + DuplicateDirection(String), + + /// Schema is structurally invalid (missing timestamp, wrong order, etc.). + #[error("{0}")] + ValidationError(String), + + /// window_start timestamp cannot be represented as a DateTime. + #[error("window_start timestamp {timestamp_secs} is out of representable range")] + WindowStartOutOfRange { timestamp_secs: i64 }, +} diff --git a/quickwit/quickwit-search/src/cluster_client.rs b/quickwit/quickwit-search/src/cluster_client.rs index 79f6ba81702..0a4518174ce 100644 --- a/quickwit/quickwit-search/src/cluster_client.rs +++ b/quickwit/quickwit-search/src/cluster_client.rs @@ -138,7 +138,7 @@ impl ClusterClient { /// Attempts to store a given key value pair within the cluster. /// - /// Tries to replicate the pair to [`TARGET_NUM_REPLICATION`] nodes, but this function may fail + /// Tries to replicate the pair to `TARGET_NUM_REPLICATION` nodes, but this function may fail /// silently (e.g if no client was available). Even in case of success, this storage is not /// persistent. For instance during a rolling upgrade, all replicas will be lost as there is no /// mechanism to maintain the replication count. 
diff --git a/quickwit/quickwit-storage/src/storage_resolver.rs b/quickwit/quickwit-storage/src/storage_resolver.rs index 6203d6a8d02..85329c19a86 100644 --- a/quickwit/quickwit-storage/src/storage_resolver.rs +++ b/quickwit/quickwit-storage/src/storage_resolver.rs @@ -43,7 +43,7 @@ impl fmt::Debug for StorageResolver { } impl StorageResolver { - /// Creates an empty [`StorageResolverBuilder`]. + /// Creates an empty `StorageResolverBuilder`. pub fn builder() -> StorageResolverBuilder { StorageResolverBuilder::default() }