3 changes: 3 additions & 0 deletions cpp/src/parquet/CMakeLists.txt
@@ -172,6 +172,7 @@ set(PARQUET_SRCS
encryption/internal_file_encryptor.cc
exception.cc
file_reader.cc
file_rewriter.cc
file_writer.cc
geospatial/statistics.cc
geospatial/util_internal.cc
@@ -406,6 +407,8 @@ add_parquet_test(arrow-reader-writer-test

add_parquet_test(arrow-index-test SOURCES arrow/index_test.cc)

add_parquet_test(arrow-rewriter-test SOURCES arrow/arrow_rewriter_test.cc)

add_parquet_test(arrow-internals-test SOURCES arrow/path_internal_test.cc
arrow/reconstruct_internal_test.cc)

276 changes: 276 additions & 0 deletions cpp/src/parquet/arrow/arrow_rewriter_test.cc
@@ -0,0 +1,276 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <memory>

#include "arrow/io/memory.h"
#include "arrow/testing/gtest_util.h"
#include "parquet/arrow/reader.h"
#include "parquet/arrow/test_util.h"
#include "parquet/file_reader.h"
#include "parquet/file_rewriter.h"
#include "parquet/platform.h"
#include "parquet/properties.h"
#include "parquet/test_util.h"

using arrow::Table;
using arrow::io::BufferReader;

namespace parquet::arrow {

TEST(ParquetRewriterTest, SimpleRoundTrip) {
auto rewriter_properties =
RewriterProperties::Builder()
.writer_properties(
WriterProperties::Builder().disable_write_page_index()->build())
->build();

auto schema = ::arrow::schema(
{::arrow::field("a", ::arrow::int32()), ::arrow::field("b", ::arrow::utf8())});

std::shared_ptr<Buffer> buffer;

WriteFile(rewriter_properties->writer_properties(),
::arrow::TableFromJSON(schema, {R"([[1, "a"], [2, "b"]])"}), buffer);

auto sink = CreateOutputStream();
auto rewriter =
ParquetFileRewriter::Open({{std::make_shared<BufferReader>(buffer)}}, sink,
{{NULLPTR}}, NULLPTR, rewriter_properties);
rewriter->Rewrite();
rewriter->Close();

ASSERT_OK_AND_ASSIGN(auto out_buffer, sink->Finish());
auto file_reader = ParquetFileReader::Open(std::make_shared<BufferReader>(out_buffer));
ASSERT_OK_AND_ASSIGN(auto reader, FileReader::Make(::arrow::default_memory_pool(),
std::move(file_reader)));

ASSERT_OK_AND_ASSIGN(auto table, reader->ReadTable());
ASSERT_OK(table->ValidateFull());

auto expected_table = ::arrow::TableFromJSON(schema, {R"([[1, "a"], [2, "b"]])"});
AssertTablesEqual(*expected_table, *table);
}
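
The third argument mirrors the sources grid entry for entry; every test in this file passes `NULLPTR`, which we read as "let the rewriter parse each source's footer itself". A caller that already holds a parsed footer could presumably hand it over instead. A sketch under that assumption (the element type `std::shared_ptr<FileMetaData>` is inferred from usage here, not taken from `file_rewriter.h`):

```cpp
// Sketch: supply a pre-parsed footer instead of NULLPTR. The metadata grid is
// assumed to mirror the sources grid, with std::shared_ptr<FileMetaData> cells.
std::shared_ptr<FileMetaData> md =
    ParquetFileReader::Open(std::make_shared<BufferReader>(buffer))->metadata();
auto rewriter = ParquetFileRewriter::Open(
    {{std::make_shared<BufferReader>(buffer)}}, sink, {{md}}, NULLPTR,
    rewriter_properties);
```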

TEST(ParquetRewriterTest, ConcatRoundTrip) {
auto rewriter_properties =
RewriterProperties::Builder()
.writer_properties(
WriterProperties::Builder().enable_write_page_index()->build())
->build();

auto schema = ::arrow::schema(
{::arrow::field("a", ::arrow::int32()), ::arrow::field("b", ::arrow::utf8())});

std::shared_ptr<Buffer> buffer_up;
std::shared_ptr<Buffer> buffer_down;

WriteFile(rewriter_properties->writer_properties(),
::arrow::TableFromJSON(schema, {R"([[1, "a"], [2, "b"]])"}), buffer_up);
WriteFile(rewriter_properties->writer_properties(),
::arrow::TableFromJSON(schema, {R"([[3, "c"]])"}), buffer_down);

auto sink = CreateOutputStream();
auto rewriter =
ParquetFileRewriter::Open({{std::make_shared<BufferReader>(buffer_up),
std::make_shared<BufferReader>(buffer_down)}},
sink, {{NULLPTR, NULLPTR}}, NULLPTR, rewriter_properties);
rewriter->Rewrite();
rewriter->Close();

ASSERT_OK_AND_ASSIGN(auto out_buffer, sink->Finish());
auto file_reader = ParquetFileReader::Open(std::make_shared<BufferReader>(out_buffer));
ASSERT_OK_AND_ASSIGN(auto reader, FileReader::Make(::arrow::default_memory_pool(),
std::move(file_reader)));

ASSERT_OK_AND_ASSIGN(auto table, reader->ReadTable());
ASSERT_OK(table->ValidateFull());

auto expected_table =
::arrow::TableFromJSON(schema, {R"([[1, "a"], [2, "b"], [3, "c"]])"});
AssertTablesEqual(*expected_table, *table);
}

TEST(ParquetRewriterTest, JoinRoundTrip) {
auto rewriter_properties =
RewriterProperties::Builder()
.writer_properties(
WriterProperties::Builder().enable_write_page_index()->build())
->build();

auto left_schema = ::arrow::schema(
{::arrow::field("a", ::arrow::int32()), ::arrow::field("b", ::arrow::utf8())});
auto right_schema = ::arrow::schema({::arrow::field("c", ::arrow::int64())});

std::shared_ptr<Buffer> buffer_left;
std::shared_ptr<Buffer> buffer_right;

WriteFile(rewriter_properties->writer_properties(),
::arrow::TableFromJSON(left_schema, {R"([[1, "a"], [2, "b"], [3, "c"]])"}),
buffer_left);
WriteFile(rewriter_properties->writer_properties(),
::arrow::TableFromJSON(right_schema, {R"([[10], [20], [30]])"}),
buffer_right);

auto sink = CreateOutputStream();
auto rewriter = ParquetFileRewriter::Open(
{{std::make_shared<BufferReader>(buffer_left)},
{std::make_shared<BufferReader>(buffer_right)}},
sink, {{NULLPTR}, {NULLPTR}}, NULLPTR, rewriter_properties);
rewriter->Rewrite();
rewriter->Close();

ASSERT_OK_AND_ASSIGN(auto out_buffer, sink->Finish());
auto file_reader = ParquetFileReader::Open(std::make_shared<BufferReader>(out_buffer));
ASSERT_OK_AND_ASSIGN(auto reader, FileReader::Make(::arrow::default_memory_pool(),
std::move(file_reader)));

ASSERT_OK_AND_ASSIGN(auto table, reader->ReadTable());
ASSERT_OK(table->ValidateFull());

auto expected_schema = ::arrow::schema({::arrow::field("a", ::arrow::int32()),
::arrow::field("b", ::arrow::utf8()),
::arrow::field("c", ::arrow::int64())});
auto expected_table = ::arrow::TableFromJSON(
expected_schema, {R"([[1, "a", 10], [2, "b", 20], [3, "c", 30]])"});
AssertTablesEqual(*expected_table, *table);
}

TEST(ParquetRewriterTest, ConcatJoinRoundTrip) {
auto rewriter_properties = RewriterProperties::Builder()
.writer_properties(WriterProperties::Builder()
.enable_write_page_index()
->max_row_group_length(2)
->build())
->build();

auto left_schema = ::arrow::schema(
{::arrow::field("a", ::arrow::int32()), ::arrow::field("b", ::arrow::utf8())});
auto right_schema = ::arrow::schema({::arrow::field("c", ::arrow::int64())});

std::shared_ptr<Buffer> buffer_left_up;
std::shared_ptr<Buffer> buffer_left_down;
std::shared_ptr<Buffer> buffer_right;

WriteFile(rewriter_properties->writer_properties(),
::arrow::TableFromJSON(left_schema, {R"([[1, "a"], [2, "b"]])"}),
buffer_left_up);
WriteFile(rewriter_properties->writer_properties(),
::arrow::TableFromJSON(left_schema, {R"([[3, "c"]])"}), buffer_left_down);
WriteFile(rewriter_properties->writer_properties(),
::arrow::TableFromJSON(right_schema, {R"([[10], [20], [30]])"}),
buffer_right);

auto sink = CreateOutputStream();
auto rewriter = ParquetFileRewriter::Open(
{{std::make_shared<BufferReader>(buffer_left_up),
std::make_shared<BufferReader>(buffer_left_down)},
{std::make_shared<BufferReader>(buffer_right)}},
sink, {{NULLPTR, NULLPTR}, {NULLPTR}}, NULLPTR, rewriter_properties);
rewriter->Rewrite();
rewriter->Close();

ASSERT_OK_AND_ASSIGN(std::shared_ptr<Buffer> out_buffer, sink->Finish());
auto file_reader = ParquetFileReader::Open(std::make_shared<BufferReader>(out_buffer));
ASSERT_OK_AND_ASSIGN(auto reader, FileReader::Make(::arrow::default_memory_pool(),
std::move(file_reader)));

ASSERT_OK_AND_ASSIGN(std::shared_ptr<Table> table, reader->ReadTable());
ASSERT_OK(table->ValidateFull());

auto expected_schema = ::arrow::schema({::arrow::field("a", ::arrow::int32()),
::arrow::field("b", ::arrow::utf8()),
::arrow::field("c", ::arrow::int64())});
auto expected_table = ::arrow::TableFromJSON(
expected_schema, {R"([[1, "a", 10], [2, "b", 20], [3, "c", 30]])"});
AssertTablesEqual(*expected_table, *table);
}
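
Taken together, the four round-trip tests pin down the layout of the first argument to `ParquetFileRewriter::Open`: the outer vector holds column groups that are joined side by side, and each inner vector holds files that are concatenated row-wise within its group. A schematic recap; the element type is an assumption (`BufferReader` converts to `::arrow::io::RandomAccessFile`, which `parquet::ArrowInputFile` aliases):

```cpp
// Schematic only; a1, a2, b stand for the BufferReaders opened in the tests.
using Source = std::shared_ptr<::arrow::io::RandomAccessFile>;
Source a1, a2, b;

std::vector<std::vector<Source>> concat = {{a1, a2}};     // rows of a2 after a1
std::vector<std::vector<Source>> join = {{a1}, {b}};      // columns of b beside a1
std::vector<std::vector<Source>> both = {{a1, a2}, {b}};  // concat within a group,
                                                          // then join across groups
```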

TEST(ParquetRewriterTest, JoinRowCountsMismatch) {
auto rewriter_properties =
RewriterProperties::Builder()
.writer_properties(
WriterProperties::Builder().enable_write_page_index()->build())
->build();

auto schema1 = ::arrow::schema({::arrow::field("a", ::arrow::int32())});
auto schema2 = ::arrow::schema({::arrow::field("b", ::arrow::int32())});

std::shared_ptr<Buffer> buffer1;
std::shared_ptr<Buffer> buffer2;

WriteFile(rewriter_properties->writer_properties(),
::arrow::TableFromJSON(schema1, {R"([[1], [2]])"}), buffer1);
WriteFile(rewriter_properties->writer_properties(),
::arrow::TableFromJSON(schema2, {R"([[3], [4], [5]])"}), buffer2);

auto sink = CreateOutputStream();

EXPECT_THROW_THAT(
[&]() {
ParquetFileRewriter::Open({{std::make_shared<BufferReader>(buffer1)},
{std::make_shared<BufferReader>(buffer2)}},
sink, {{NULLPTR}, {NULLPTR}}, NULLPTR,
rewriter_properties);
},
ParquetException,
::testing::Property(
&ParquetException::what,
::testing::HasSubstr("The number of rows in each block must match")));
}

TEST(ParquetRewriterTest, InvalidInputDimensions) {
auto rewriter_properties =
RewriterProperties::Builder()
.writer_properties(
WriterProperties::Builder().enable_write_page_index()->build())
->build();

auto schema = ::arrow::schema({::arrow::field("a", ::arrow::int32())});
std::shared_ptr<Buffer> buffer;
WriteFile(rewriter_properties->writer_properties(),
::arrow::TableFromJSON(schema, {R"([[1]])"}), buffer);

auto sink = CreateOutputStream();

EXPECT_THROW_THAT(
[&]() {
ParquetFileRewriter::Open({{std::make_shared<BufferReader>(buffer)}}, sink, {},
NULLPTR, rewriter_properties);
},
ParquetException,
::testing::Property(
&ParquetException::what,
::testing::HasSubstr(
"The number of sources and sources_metadata must be the same")));

EXPECT_THROW_THAT(
[&]() {
ParquetFileRewriter::Open({{std::make_shared<BufferReader>(buffer)}}, sink, {{}},
NULLPTR, rewriter_properties);
},
ParquetException,
::testing::Property(
&ParquetException::what,
::testing::HasSubstr(
"The number of sources and sources_metadata must be the same")));
}

} // namespace parquet::arrow
28 changes: 28 additions & 0 deletions cpp/src/parquet/arrow/test_util.h
@@ -28,18 +28,23 @@
#include "arrow/array/builder_binary.h"
#include "arrow/array/builder_decimal.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/table.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/decimal.h"
#include "arrow/util/float16.h"
#include "parquet/arrow/schema.h"
#include "parquet/arrow/writer.h"
#include "parquet/column_reader.h"
#include "parquet/file_writer.h"
#include "parquet/test_util.h"

namespace parquet {

using internal::RecordReader;
using schema::GroupNode;

namespace arrow {

@@ -482,6 +487,29 @@ void ExpectArrayT<::arrow::BooleanType>(void* expected, Array* result) {
EXPECT_TRUE(result->Equals(*expected_array));
}

void WriteFile(const std::shared_ptr<WriterProperties>& writer_properties,
const std::shared_ptr<::arrow::Table>& table,
std::shared_ptr<Buffer>& buffer) {
// Convert the table's Arrow schema to a Parquet schema.
auto schema = table->schema();
std::shared_ptr<SchemaDescriptor> parquet_schema;
auto arrow_writer_properties = default_arrow_writer_properties();
ASSERT_OK_NO_THROW(ToParquetSchema(schema.get(), *writer_properties,
*arrow_writer_properties, &parquet_schema));
auto schema_node = std::static_pointer_cast<GroupNode>(parquet_schema->schema_root());

// Write table to buffer.
auto sink = CreateOutputStream();
auto pool = ::arrow::default_memory_pool();
auto writer = ParquetFileWriter::Open(sink, schema_node, writer_properties);
std::unique_ptr<FileWriter> arrow_writer;
ASSERT_OK(FileWriter::Make(pool, std::move(writer), schema, arrow_writer_properties,
&arrow_writer));
ASSERT_OK_NO_THROW(arrow_writer->WriteTable(*table));
ASSERT_OK_NO_THROW(arrow_writer->Close());
ASSERT_OK_AND_ASSIGN(buffer, sink->Finish());
}
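
A minimal use of the helper, for orientation; it has to run inside a GTest test body because it relies on `ASSERT_*` macros:

```cpp
// Sketch: write a one-column table to an in-memory Parquet file.
std::shared_ptr<Buffer> buffer;
auto schema = ::arrow::schema({::arrow::field("x", ::arrow::int32())});
WriteFile(WriterProperties::Builder().build(),
          ::arrow::TableFromJSON(schema, {R"([[1], [2]])"}), buffer);
// On success, buffer holds a complete Parquet file, ready for a BufferReader.
```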

} // namespace arrow

} // namespace parquet
23 changes: 23 additions & 0 deletions cpp/src/parquet/bloom_filter_writer.cc
@@ -162,6 +162,9 @@ class BloomFilterBuilderImpl : public BloomFilterBuilder {

BloomFilter* CreateBloomFilter(int32_t column_ordinal) override;

void InsertBloomFilter(int32_t column_ordinal,
std::unique_ptr<BloomFilter> bloom_filter) override;

IndexLocations WriteTo(::arrow::io::OutputStream* sink) override;

private:
@@ -219,6 +222,26 @@ BloomFilterBuilderImpl::CreateBloomFilter(int32_t column_ordinal) {
return curr_rg_bfs.emplace(column_ordinal, std::move(bf)).first->second.get();
}

void BloomFilterBuilderImpl::InsertBloomFilter(
int32_t column_ordinal, std::unique_ptr<BloomFilter> bloom_filter) {
auto opts = properties_->bloom_filter_options(schema_->Column(column_ordinal)->path());
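// Nothing to do if bloom filters are disabled for this column, or if the
// caller has no filter to hand over.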
if (!opts.has_value() || bloom_filter == nullptr) {
return;
}

CheckState(column_ordinal);

auto& curr_rg_bfs = *bloom_filters_.rbegin();
if (curr_rg_bfs.find(column_ordinal) != curr_rg_bfs.cend()) {
std::stringstream ss;
ss << "Bloom filter already exists for column: " << column_ordinal
<< ", row group: " << (bloom_filters_.size() - 1);
throw ParquetException(ss.str());
}

curr_rg_bfs.emplace(column_ordinal, std::move(bloom_filter));
}
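
`InsertBloomFilter` complements `CreateBloomFilter`: rather than building a filter value by value, a caller that is copying row groups (such as the rewriter) can transplant a filter that already exists in a source file, avoiding a re-hash of every value. A hedged sketch of the intended pattern; the reader-side accessors are assumed from the existing `BloomFilterReader` API, and `TransplantBloomFilter` is a hypothetical helper:

```cpp
// Sketch, not code from this PR: move row group 0 / column 0's bloom filter
// from a source file into the builder for the output file.
void TransplantBloomFilter(ParquetFileReader& source, BloomFilterBuilder& builder) {
  auto row_group = source.GetBloomFilterReader().RowGroup(/*i=*/0);
  std::unique_ptr<BloomFilter> bf = row_group->GetColumnBloomFilter(/*i=*/0);
  builder.AppendRowGroup();  // open the matching output row group
  // InsertBloomFilter quietly ignores a null filter (see above), so no null
  // check is needed here.
  builder.InsertBloomFilter(/*column_ordinal=*/0, std::move(bf));
}
```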

IndexLocations BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink) {
if (finished_) {
throw ParquetException("Cannot write a finished BloomFilterBuilder");