From b11fb06d73e1d636cfc52f9778c26539c04b63ff Mon Sep 17 00:00:00 2001 From: wecharyu Date: Tue, 17 Mar 2026 00:57:49 +0800 Subject: [PATCH 1/3] GH-48467: [C++][Parquet] Add total_buffered_bytes() API for RowGroupWriter --- cpp/src/parquet/column_writer.cc | 4 ++++ cpp/src/parquet/column_writer.h | 3 +++ cpp/src/parquet/file_writer.cc | 19 +++++++++++++++++++ cpp/src/parquet/file_writer.h | 5 +++++ 4 files changed, 31 insertions(+) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 94b67dfa807e..69425cc80594 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1480,6 +1480,10 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, return current_encoder_->EstimatedDataEncodedSize(); } + int64_t EstimatedBufferedLevelsBytes() const override { + return definition_levels_sink_.length() + repetition_levels_sink_.length(); + } + protected: std::shared_ptr GetValuesBuffer() override { return current_encoder_->FlushValues(); diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 5b56eb010a24..523486aa77c8 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -164,6 +164,9 @@ class PARQUET_EXPORT ColumnWriter { /// \brief Estimated size of the values that are not written to a page yet. virtual int64_t estimated_buffered_value_bytes() const = 0; + /// \brief Estimated size of the levels that are not written to a page yet. + virtual int64_t EstimatedBufferedLevelsBytes() const = 0; + /// \brief The file-level writer properties virtual const WriterProperties* properties() = 0; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index ddec2c0a5602..1251ce9467b8 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -68,6 +68,10 @@ int64_t RowGroupWriter::total_compressed_bytes_written() const { return contents_->total_compressed_bytes_written(); } +int64_t RowGroupWriter::total_buffered_bytes() const { + return contents_->EstimatedBufferedBytes(); +} + bool RowGroupWriter::buffered() const { return contents_->buffered(); } int RowGroupWriter::current_column() { return contents_->current_column(); } @@ -195,6 +199,21 @@ class RowGroupSerializer : public RowGroupWriter::Contents { return total_compressed_bytes_written; } + int64_t EstimatedBufferedBytes() const override { + if (closed_) { + return 0; + } + int64_t estimated_buffered_value_bytes = 0; + for (size_t i = 0; i < column_writers_.size(); i++) { + if (column_writers_[i]) { + estimated_buffered_value_bytes += + column_writers_[i]->estimated_buffered_value_bytes() + + column_writers_[i]->EstimatedBufferedLevelsBytes(); + } + } + return estimated_buffered_value_bytes; + } + bool buffered() const override { return buffered_row_group_; } void Close() override { diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index d5ea1d7c98a0..8525af37a76c 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -58,6 +58,9 @@ class PARQUET_EXPORT RowGroupWriter { virtual int64_t total_compressed_bytes() const = 0; /// \brief total compressed bytes written by the page writer virtual int64_t total_compressed_bytes_written() const = 0; + /// \brief Estimated bytes of values and levels that are buffered by the page writer + /// but not written to a page yet + virtual int64_t EstimatedBufferedBytes() const = 0; virtual bool buffered() const = 0; }; @@ -99,6 +102,8 @@ class PARQUET_EXPORT RowGroupWriter { int64_t total_compressed_bytes() const; /// \brief total compressed bytes written by the page writer int64_t total_compressed_bytes_written() const; + /// \brief total bytes of values and levels that are buffered by the page writer + int64_t total_buffered_bytes() const; /// Returns whether the current RowGroupWriter is in the buffered mode and is created /// by calling ParquetFileWriter::AppendBufferedRowGroup. From dfd872a729a48bfd91c6f2c909f7f403499e4177 Mon Sep 17 00:00:00 2001 From: wecharyu Date: Thu, 19 Mar 2026 01:22:50 +0800 Subject: [PATCH 2/3] use BufferedStats for buffered data --- cpp/src/parquet/column_writer.cc | 15 +++++++++++++-- cpp/src/parquet/column_writer.h | 10 ++++++++-- cpp/src/parquet/file_writer.cc | 19 ++++++++++--------- cpp/src/parquet/file_writer.h | 18 +++++++++++++----- 4 files changed, 44 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 69425cc80594..795ce679cb1f 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1480,8 +1480,19 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, return current_encoder_->EstimatedDataEncodedSize(); } - int64_t EstimatedBufferedLevelsBytes() const override { - return definition_levels_sink_.length() + repetition_levels_sink_.length(); + int64_t estimated_buffered_def_level_bytes() const override { + return definition_levels_sink_.length(); + } + + int64_t estimated_buffered_rep_level_bytes() const override { + return repetition_levels_sink_.length(); + } + + int64_t estimated_buffered_dict_bytes() const override { + if (current_dict_encoder_) { + return current_dict_encoder_->dict_encoded_size(); + } + return 0; } protected: diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 523486aa77c8..0516d7937cba 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -164,8 +164,14 @@ class PARQUET_EXPORT ColumnWriter { /// \brief Estimated size of the values that are not written to a page yet. virtual int64_t estimated_buffered_value_bytes() const = 0; - /// \brief Estimated size of the levels that are not written to a page yet. - virtual int64_t EstimatedBufferedLevelsBytes() const = 0; + /// \brief Estimated size of the definition levels that are not written to a page yet. + virtual int64_t estimated_buffered_def_level_bytes() const = 0; + + /// \brief Estimated size of the repetition levels that are not written to a page yet. + virtual int64_t estimated_buffered_rep_level_bytes() const = 0; + + /// \brief Estimated size of the dictionary that are not written to a page yet. + virtual int64_t estimated_buffered_dict_bytes() const = 0; /// \brief The file-level writer properties virtual const WriterProperties* properties() = 0; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 1251ce9467b8..23ac9d8ddbf5 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -68,8 +68,8 @@ int64_t RowGroupWriter::total_compressed_bytes_written() const { return contents_->total_compressed_bytes_written(); } -int64_t RowGroupWriter::total_buffered_bytes() const { - return contents_->EstimatedBufferedBytes(); +BufferedStats RowGroupWriter::estimated_buffered_stats() const { + return contents_->EstimatedBufferedStats(); } bool RowGroupWriter::buffered() const { return contents_->buffered(); } @@ -199,19 +199,20 @@ class RowGroupSerializer : public RowGroupWriter::Contents { return total_compressed_bytes_written; } - int64_t EstimatedBufferedBytes() const override { + BufferedStats EstimatedBufferedStats() const override { + BufferedStats stats; if (closed_) { - return 0; + return stats; } - int64_t estimated_buffered_value_bytes = 0; for (size_t i = 0; i < column_writers_.size(); i++) { if (column_writers_[i]) { - estimated_buffered_value_bytes += - column_writers_[i]->estimated_buffered_value_bytes() + - column_writers_[i]->EstimatedBufferedLevelsBytes(); + stats.def_level_bytes += column_writers_[i]->estimated_buffered_def_level_bytes(); + stats.rep_level_bytes += column_writers_[i]->estimated_buffered_rep_level_bytes(); + stats.value_bytes += column_writers_[i]->estimated_buffered_value_bytes(); + stats.dict_bytes += column_writers_[i]->estimated_buffered_dict_bytes(); } } - return estimated_buffered_value_bytes; + return stats; } bool buffered() const override { return buffered_row_group_; } diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index 8525af37a76c..d5d753bfd0b9 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -34,6 +34,13 @@ class ColumnWriter; static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'}; static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'}; +struct PARQUET_EXPORT BufferedStats { + int64_t def_level_bytes = 0; + int64_t rep_level_bytes = 0; + int64_t value_bytes = 0; + int64_t dict_bytes = 0; +}; + class PARQUET_EXPORT RowGroupWriter { public: // Forward declare a virtual class 'Contents' to aid dependency injection and more @@ -58,9 +65,9 @@ class PARQUET_EXPORT RowGroupWriter { virtual int64_t total_compressed_bytes() const = 0; /// \brief total compressed bytes written by the page writer virtual int64_t total_compressed_bytes_written() const = 0; - /// \brief Estimated bytes of values and levels that are buffered by the page writer - /// but not written to a page yet - virtual int64_t EstimatedBufferedBytes() const = 0; + /// \brief Estimated sizes of buffered data (levels, values, dict) not yet + /// written to a page. + virtual BufferedStats EstimatedBufferedStats() const = 0; virtual bool buffered() const = 0; }; @@ -102,8 +109,9 @@ class PARQUET_EXPORT RowGroupWriter { int64_t total_compressed_bytes() const; /// \brief total compressed bytes written by the page writer int64_t total_compressed_bytes_written() const; - /// \brief total bytes of values and levels that are buffered by the page writer - int64_t total_buffered_bytes() const; + /// \brief Estimated sizes of buffered data (levels, values, dict) not yet + /// written to a page. + BufferedStats estimated_buffered_stats() const; /// Returns whether the current RowGroupWriter is in the buffered mode and is created /// by calling ParquetFileWriter::AppendBufferedRowGroup. From 63137251852d33c96b7b25cd793d2ea2a9c6dbbd Mon Sep 17 00:00:00 2001 From: wecharyu Date: Sun, 22 Mar 2026 15:44:47 +0800 Subject: [PATCH 3/3] add comment for BufferedStats --- cpp/src/parquet/file_writer.cc | 6 +++--- cpp/src/parquet/file_writer.h | 16 +++++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 23ac9d8ddbf5..b8b1d06ea3ad 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -68,7 +68,7 @@ int64_t RowGroupWriter::total_compressed_bytes_written() const { return contents_->total_compressed_bytes_written(); } -BufferedStats RowGroupWriter::estimated_buffered_stats() const { +RowGroupWriter::BufferedStats RowGroupWriter::estimated_buffered_stats() const { return contents_->EstimatedBufferedStats(); } @@ -199,8 +199,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { return total_compressed_bytes_written; } - BufferedStats EstimatedBufferedStats() const override { - BufferedStats stats; + RowGroupWriter::BufferedStats EstimatedBufferedStats() const override { + RowGroupWriter::BufferedStats stats; if (closed_) { return stats; } diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index d5d753bfd0b9..3ca71875d9db 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -34,15 +34,17 @@ class ColumnWriter; static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'}; static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'}; -struct PARQUET_EXPORT BufferedStats { - int64_t def_level_bytes = 0; - int64_t rep_level_bytes = 0; - int64_t value_bytes = 0; - int64_t dict_bytes = 0; -}; - class PARQUET_EXPORT RowGroupWriter { public: + // Estimated uncompressed byte sizes of data buffered by column writers + // that have not yet been serialized into data pages. + struct BufferedStats { + int64_t def_level_bytes = 0; + int64_t rep_level_bytes = 0; + int64_t value_bytes = 0; + int64_t dict_bytes = 0; + }; + // Forward declare a virtual class 'Contents' to aid dependency injection and more // easily create test fixtures // An implementation of the Contents class is defined in the .cc file