Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1480,6 +1480,21 @@ class TypedColumnWriterImpl : public ColumnWriterImpl,
return current_encoder_->EstimatedDataEncodedSize();
}

// Reports how many bytes currently sit in the definition level sink, i.e. the
// definition levels accumulated so far that have not been emitted in a page.
int64_t estimated_buffered_def_level_bytes() const override {
  const int64_t pending_def_level_bytes = definition_levels_sink_.length();
  return pending_def_level_bytes;
}

// Reports how many bytes currently sit in the repetition level sink, i.e. the
// repetition levels accumulated so far that have not been emitted in a page.
int64_t estimated_buffered_rep_level_bytes() const override {
  const int64_t pending_rep_level_bytes = repetition_levels_sink_.length();
  return pending_rep_level_bytes;
}

// Reports the encoded size of the dictionary held by the active dictionary
// encoder. When no dictionary encoder is set there is nothing buffered, so the
// estimate is zero.
int64_t estimated_buffered_dict_bytes() const override {
  if (!current_dict_encoder_) {
    return 0;
  }
  return current_dict_encoder_->dict_encoded_size();
}

protected:
std::shared_ptr<Buffer> GetValuesBuffer() override {
return current_encoder_->FlushValues();
Expand Down
9 changes: 9 additions & 0 deletions cpp/src/parquet/column_writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,15 @@ class PARQUET_EXPORT ColumnWriter {
/// \brief Estimated size of the values that are not written to a page yet.
virtual int64_t estimated_buffered_value_bytes() const = 0;

/// \brief Estimated size of the definition levels that are not written to a page yet.
///
/// \return the estimated number of buffered definition level bytes
virtual int64_t estimated_buffered_def_level_bytes() const = 0;

/// \brief Estimated size of the repetition levels that are not written to a page yet.
///
/// \return the estimated number of buffered repetition level bytes
virtual int64_t estimated_buffered_rep_level_bytes() const = 0;

/// \brief Estimated size of the dictionary that is not written to a page yet.
///
/// \return the estimated number of buffered dictionary bytes
virtual int64_t estimated_buffered_dict_bytes() const = 0;

/// \brief The file-level writer properties
virtual const WriterProperties* properties() = 0;

Expand Down
20 changes: 20 additions & 0 deletions cpp/src/parquet/file_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ int64_t RowGroupWriter::total_compressed_bytes_written() const {
return contents_->total_compressed_bytes_written();
}

// Delegates to the underlying Contents implementation and hands back its
// estimate of the data that is buffered but not yet written to a page.
RowGroupWriter::BufferedStats RowGroupWriter::estimated_buffered_stats() const {
  RowGroupWriter::BufferedStats buffered_estimate = contents_->EstimatedBufferedStats();
  return buffered_estimate;
}

// Forwards the buffered-mode query to the underlying Contents implementation.
bool RowGroupWriter::buffered() const {
  const bool is_buffered = contents_->buffered();
  return is_buffered;
}

// Forwards the current-column query to the underlying Contents implementation.
int RowGroupWriter::current_column() {
  const int column_index = contents_->current_column();
  return column_index;
}
Expand Down Expand Up @@ -195,6 +199,22 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
return total_compressed_bytes_written;
}

// Aggregates the buffered-size estimates reported by each column writer.
//
// A closed row group contributes nothing, so the default-constructed
// (all-zero) stats are returned. Null slots in column_writers_ are skipped.
RowGroupWriter::BufferedStats EstimatedBufferedStats() const override {
  RowGroupWriter::BufferedStats totals;
  if (!closed_) {
    for (const auto& writer : column_writers_) {
      if (!writer) {
        continue;
      }
      totals.def_level_bytes += writer->estimated_buffered_def_level_bytes();
      totals.rep_level_bytes += writer->estimated_buffered_rep_level_bytes();
      totals.value_bytes += writer->estimated_buffered_value_bytes();
      totals.dict_bytes += writer->estimated_buffered_dict_bytes();
    }
  }
  return totals;
}

// Reports the value of the buffered_row_group_ flag for this row group.
bool buffered() const override {
  return buffered_row_group_;
}

void Close() override {
Expand Down
15 changes: 15 additions & 0 deletions cpp/src/parquet/file_writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,15 @@ static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'};

class PARQUET_EXPORT RowGroupWriter {
public:
/// \brief Estimated uncompressed byte sizes of data that column writers have
/// buffered but not yet serialized into data pages.
///
/// Every field starts at zero, so a default-constructed instance means
/// "nothing buffered".
struct BufferedStats {
  /// Buffered definition level bytes.
  int64_t def_level_bytes{0};
  /// Buffered repetition level bytes.
  int64_t rep_level_bytes{0};
  /// Buffered value bytes.
  int64_t value_bytes{0};
  /// Buffered dictionary bytes.
  int64_t dict_bytes{0};
};

// Forward declare a virtual class 'Contents' to aid dependency injection and more
// easily create test fixtures
// An implementation of the Contents class is defined in the .cc file
Expand All @@ -58,6 +67,9 @@ class PARQUET_EXPORT RowGroupWriter {
virtual int64_t total_compressed_bytes() const = 0;
/// \brief total compressed bytes written by the page writer
virtual int64_t total_compressed_bytes_written() const = 0;
/// \brief Estimated sizes of buffered data (levels, values, dict) not yet
/// written to a page.
///
/// \return a BufferedStats carrying one byte estimate per category
virtual BufferedStats EstimatedBufferedStats() const = 0;

virtual bool buffered() const = 0;
};
Expand Down Expand Up @@ -99,6 +111,9 @@ class PARQUET_EXPORT RowGroupWriter {
int64_t total_compressed_bytes() const;
/// \brief total compressed bytes written by the page writer
int64_t total_compressed_bytes_written() const;
/// \brief Estimated sizes of buffered data (levels, values, dict) not yet
/// written to a page.
///
/// \return a BufferedStats carrying one byte estimate per category
BufferedStats estimated_buffered_stats() const;

/// Returns whether the current RowGroupWriter is in the buffered mode and is created
/// by calling ParquetFileWriter::AppendBufferedRowGroup.
Expand Down