diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp
index 9fe9d9d1b319..f6e0d3064d59 100644
--- a/c_glib/arrow-glib/reader.cpp
+++ b/c_glib/arrow-glib/reader.cpp
@@ -668,6 +668,39 @@ garrow_record_batch_file_reader_read_record_batch(GArrowRecordBatchFileReader *r
   }
 }
 
+/**
+ * garrow_record_batch_file_reader_get_metadata:
+ * @reader: A #GArrowRecordBatchFileReader.
+ *
+ * Returns: (nullable) (element-type utf8 utf8) (transfer full):
+ *   The custom metadata in the footer, or %NULL if the underlying
+ *   reader has none. Keys and values are copies owned by the returned
+ *   table; release it with g_hash_table_unref().
+ *
+ * Since: 24.0.0
+ */
+GHashTable *
+garrow_record_batch_file_reader_get_metadata(GArrowRecordBatchFileReader *reader)
+{
+  auto arrow_reader = garrow_record_batch_file_reader_get_raw(reader);
+  auto arrow_metadata = arrow_reader->metadata();
+
+  if (!arrow_metadata) {
+    return nullptr;
+  }
+
+  // (transfer full): the table must own its strings. Borrowing c_str()
+  // pointers would leave them dangling once the Arrow metadata is released.
+  auto metadata = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, g_free);
+  const auto n = arrow_metadata->size();
+  for (int64_t i = 0; i < n; ++i) {
+    g_hash_table_insert(metadata,
+                        g_strdup(arrow_metadata->key(i).c_str()),
+                        g_strdup(arrow_metadata->value(i).c_str()));
+  }
+  return metadata;
+}
+
 struct GArrowFeatherFileReaderPrivate
 {
   std::shared_ptr feather_reader;
diff --git a/c_glib/arrow-glib/reader.h b/c_glib/arrow-glib/reader.h
index 5401aa3bb1fc..1e896fd09fd2 100644
--- a/c_glib/arrow-glib/reader.h
+++ b/c_glib/arrow-glib/reader.h
@@ -166,6 +166,10 @@ garrow_record_batch_file_reader_read_record_batch(GArrowRecordBatchFileReader *r
                                                   guint i,
                                                   GError **error);
 
+GARROW_AVAILABLE_IN_24_0
+GHashTable *
+garrow_record_batch_file_reader_get_metadata(GArrowRecordBatchFileReader *reader);
+
 #define GARROW_TYPE_FEATHER_FILE_READER (garrow_feather_file_reader_get_type())
 GARROW_AVAILABLE_IN_ALL
 G_DECLARE_DERIVABLE_TYPE(GArrowFeatherFileReader,
diff --git a/c_glib/arrow-glib/writer.cpp b/c_glib/arrow-glib/writer.cpp
index 4228b6091072..0cbd88a769d4 100644
--- a/c_glib/arrow-glib/writer.cpp
+++ b/c_glib/arrow-glib/writer.cpp
@@ -20,6 +20,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -288,16 +290,50 @@ GArrowRecordBatchFileWriter *
 garrow_record_batch_file_writer_new(GArrowOutputStream *sink,
                                     GArrowSchema *schema,
                                     GError **error)
+{
+  return garrow_record_batch_file_writer_new_full(sink, schema, nullptr, nullptr, error);
+}
+
+/**
+ * garrow_record_batch_file_writer_new_full:
+ * @sink: The output of the writer.
+ * @schema: The schema of the writer.
+ * @options: (nullable): The options for serialization. Defaults if %NULL.
+ * @metadata: (nullable) (element-type utf8 utf8): The custom metadata in
+ *   the footer. Left unset if %NULL.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (nullable): A newly created #GArrowRecordBatchFileWriter
+ *   or %NULL on error.
+ *
+ * Since: 24.0.0
+ */
+GArrowRecordBatchFileWriter *
+garrow_record_batch_file_writer_new_full(GArrowOutputStream *sink,
+                                         GArrowSchema *schema,
+                                         GArrowWriteOptions *options,
+                                         GHashTable *metadata,
+                                         GError **error)
 {
   auto arrow_sink = garrow_output_stream_get_raw(sink);
   auto arrow_schema = garrow_schema_get_raw(schema);
+  arrow::ipc::IpcWriteOptions arrow_options = arrow::ipc::IpcWriteOptions::Defaults();
+  if (options) {
+    arrow_options = *garrow_write_options_get_raw(options);
+  }
+  std::shared_ptr arrow_metadata; // NOTE(review): template argument lost in extraction
+  if (metadata) {
+    arrow_metadata = garrow_internal_hash_table_to_metadata(metadata);
+  }
+
   std::shared_ptr arrow_writer;
-  auto arrow_writer_result = arrow::ipc::MakeFileWriter(arrow_sink, arrow_schema);
-  if (garrow::check(error, arrow_writer_result, "[record-batch-file-writer][open]")) {
+  auto arrow_writer_result =
+    arrow::ipc::MakeFileWriter(arrow_sink, arrow_schema, arrow_options, arrow_metadata);
+  if (garrow::check(error, arrow_writer_result, "[record-batch-file-writer][new]")) {
     auto arrow_writer = *arrow_writer_result;
     return garrow_record_batch_file_writer_new_raw(&arrow_writer);
   } else {
-    return NULL;
+    return nullptr;
   }
 }
 
diff --git a/c_glib/arrow-glib/writer.h b/c_glib/arrow-glib/writer.h
index fc5fe0c2c738..e02da0e30d19 100644
--- a/c_glib/arrow-glib/writer.h
+++ b/c_glib/arrow-glib/writer.h
@@ -20,6 +20,7 @@
 #pragma once
 
 #include
+#include
 #include
 #include
 
@@ -94,6 +95,14 @@ garrow_record_batch_file_writer_new(GArrowOutputStream *sink,
                                     GArrowSchema *schema,
                                     GError **error);
 
+GARROW_AVAILABLE_IN_24_0
+GArrowRecordBatchFileWriter *
+garrow_record_batch_file_writer_new_full(GArrowOutputStream *sink,
+                                         GArrowSchema *schema,
+                                         GArrowWriteOptions *options,
+                                         GHashTable *metadata,
+                                         GError **error);
+
 /**
  * GArrowCSVQuotingStyle:
  * @GARROW_CSV_QUOTING_STYLE_NEEDED: Only enclose values in quotes which need them.
diff --git a/c_glib/test/test-file-writer.rb b/c_glib/test/test-file-writer.rb
index 06c9dfa25c7f..41fd00cee4ee 100644
--- a/c_glib/test/test-file-writer.rb
+++ b/c_glib/test/test-file-writer.rb
@@ -88,4 +88,36 @@ def test_write_table
       input.close
     end
   end
+
+  def test_footer_custom_metadata
+    tempfile = Tempfile.open("arrow-ipc-file-writer")
+    output = Arrow::FileOutputStream.new(tempfile.path, false)
+
+    array = build_boolean_array([true, false, true])
+    field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
+    schema = Arrow::Schema.new([field])
+
+    options = Arrow::WriteOptions.new
+    metadata = {"key1" => "value1", "key2" => "value2"} # compared with the reader's metadata below
+    begin
+      file_writer = Arrow::RecordBatchFileWriter.new(output,
+                                                     schema,
+                                                     options,
+                                                     metadata)
+      file_writer.close
+      assert do
+        file_writer.closed?
+      end
+    ensure
+      output.close
+    end
+
+    input = Arrow::MemoryMappedInputStream.new(tempfile.path)
+    begin
+      file_reader = Arrow::RecordBatchFileReader.new(input)
+      assert_equal(metadata, file_reader.metadata) # footer metadata must match what was written
+    ensure
+      input.close
+    end
+  end
 end
diff --git a/ruby/red-arrow-format/Gemfile b/ruby/red-arrow-format/Gemfile
index 296a7b44358f..34c981237c59 100644
--- a/ruby/red-arrow-format/Gemfile
+++ b/ruby/red-arrow-format/Gemfile
@@ -26,5 +26,6 @@ gem "red-arrow", path: "../red-arrow"
 group :development do
   gem "benchmark-driver"
   gem "rake"
+  gem "stringio" # required by test/helper.rb; the writer tests use StringIO output
   gem "test-unit"
 end
diff --git a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
index 7c749e5fbf8e..cec371109622 100644
--- a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
@@ -35,6 +35,7 @@ class FileReader
     FOOTER_SIZE_SIZE = IO::Buffer.size_of(FOOTER_SIZE_FORMAT)
 
     attr_reader :schema
+    attr_reader :metadata # result of read_custom_metadata on the footer's custom_metadata
     def initialize(input)
       case input
       when IO
@@ -47,6 +48,7 @@ def initialize(input)
 
       validate
       @footer = read_footer
+      @metadata = read_custom_metadata(@footer.custom_metadata) # exposed via attr_reader :metadata
       @record_batch_blocks = @footer.record_batches || []
       @schema = read_schema(@footer.schema)
       @dictionaries = read_dictionaries
diff --git a/ruby/red-arrow-format/lib/arrow-format/file-writer.rb b/ruby/red-arrow-format/lib/arrow-format/file-writer.rb
index 27b6b55bbf9a..2ac469518082 100644
--- a/ruby/red-arrow-format/lib/arrow-format/file-writer.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/file-writer.rb
@@ -29,26 +29,33 @@ def start(schema)
       super
     end
 
-    def finish
-      super
-      write_footer
+    def finish(metadata=nil) # metadata: optional key/value pairs stored in the footer
+      super() # explicit (): a bare super would forward metadata to the parent's finish
+      write_footer(metadata)
       write_data(MAGIC)
       @output
     end
 
     private
-    def build_footer
+    def build_footer(metadata) # returns the serialized footer
       fb_footer = FB::Footer::Data.new
       fb_footer.version = FB::MetadataVersion::V5
       fb_footer.schema = @fb_schema
       fb_footer.dictionaries = @fb_dictionary_blocks
       fb_footer.record_batches = @fb_record_batch_blocks
-      # fb_footer.custom_metadata = ... # TODO
+      if metadata # custom_metadata is left unset when no metadata was given
+        fb_footer.custom_metadata = metadata.collect do |key, value|
+          fb_key_value = FB::KeyValue::Data.new # one KeyValue entry per pair
+          fb_key_value.key = key
+          fb_key_value.value = value
+          fb_key_value
+        end
+      end
       FB::Footer.serialize(fb_footer)
     end
 
-    def write_footer
-      footer = build_footer
+    def write_footer(metadata) # writes the footer followed by its byte size
+      footer = build_footer(metadata)
       write_data(footer)
       write_data([footer.bytesize].pack("l<"))
     end
diff --git a/ruby/red-arrow-format/test/helper.rb b/ruby/red-arrow-format/test/helper.rb
index 394d92d0dd4c..29fbfaec4c5f 100644
--- a/ruby/red-arrow-format/test/helper.rb
+++ b/ruby/red-arrow-format/test/helper.rb
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+require "stringio" # StringIO is used as an in-memory output in test-writer.rb
 require "tmpdir"
 
 require "test-unit"
diff --git a/ruby/red-arrow-format/test/test-reader.rb b/ruby/red-arrow-format/test/test-reader.rb
index c1c6b2628863..1d3202f4f720 100644
--- a/ruby/red-arrow-format/test/test-reader.rb
+++ b/ruby/red-arrow-format/test/test-reader.rb
@@ -675,18 +675,36 @@ def test_dictionary
   end
 end
 
+module FileReaderTests # file-format-only tests; included by the TestFileReader* classes
+  def test_custom_metadata_footer
+    Dir.mktmpdir do |tmp_dir|
+      table = Arrow::Table.new(value: Arrow::Int8Array.new([1, 2, 3]))
+      metadata = {
+        "key1" => "value1",
+        "key2" => "value2",
+      }
+      open_input(table, tmp_dir, metadata: metadata) do |input| # metadata: is forwarded to table.save
+        reader = reader_class.new(input)
+        assert_equal(metadata, reader.metadata)
+      end
+    ensure
+      GC.start # NOTE(review): presumably releases the input before tmp_dir is removed - confirm
+    end
+  end
+end
+
 module FileInput
-  def open_input(table, tmp_dir, &block)
+  def open_input(table, tmp_dir, **options, &block) # options are passed through to table.save
     path = File.join(tmp_dir, "data.#{file_extension}")
-    table.save(path)
+    table.save(path, **options)
     File.open(path, "rb", &block)
   end
 end
 
 module PipeInput
-  def open_input(table, tmp_dir, &block)
+  def open_input(table, tmp_dir, **options) # NOTE(review): &block was dropped; confirm the body (not shown) uses yield
     buffer = Arrow::ResizableBuffer.new(4096)
-    table.save(buffer, format: format)
+    table.save(buffer, format: format, **options)
     IO.pipe do |input, output|
       write_thread = Thread.new do
         output.write(buffer.data.to_s)
@@ -701,15 +719,16 @@ def open_input(table, tmp_dir, &block)
 end
 
 module StringInput
-  def open_input(table, tmp_dir)
+  def open_input(table, tmp_dir, **options) # options are passed through to table.save
     buffer = Arrow::ResizableBuffer.new(4096)
-    table.save(buffer, format: format)
+    table.save(buffer, format: format, **options)
     yield(buffer.data.to_s)
   end
 end
 
 class TestFileReaderFileInput < Test::Unit::TestCase
   include ReaderTests
+  include FileReaderTests
   include FileInput
 
   def file_extension
@@ -723,6 +742,7 @@ def reader_class
 
 class TestFileReaderStringInput < Test::Unit::TestCase
   include ReaderTests
+  include FileReaderTests
   include StringInput
 
   def format
diff --git a/ruby/red-arrow-format/test/test-writer.rb b/ruby/red-arrow-format/test/test-writer.rb
index 72776f01ab8e..55b3c22b7a96 100644
--- a/ruby/red-arrow-format/test/test-writer.rb
+++ b/ruby/red-arrow-format/test/test-writer.rb
@@ -924,6 +924,26 @@ def test_dictionary
   end
 end
 
+module FileWriterTests # file-format-only writer tests
+  def test_custom_metadata_footer
+    output = StringIO.new(+"".b) # unfrozen binary string as the in-memory sink
+    writer = writer_class.new(output)
+    field = ArrowFormat::Field.new("value", ArrowFormat::BooleanType.new)
+    schema = ArrowFormat::Schema.new([field])
+    writer.start(schema)
+    metadata = {
+      "key1" => "value1",
+      "key2" => "value2",
+    }
+    writer.finish(metadata) # no record batches are written; only the footer metadata is checked
+    buffer = Arrow::Buffer.new(output.string)
+    Arrow::BufferInputStream.open(buffer) do |input|
+      reader = Arrow::RecordBatchFileReader.new(input) # read back with the red-arrow reader
+      assert_equal(metadata, reader.metadata)
+    end
+  end
+end
+
 module WriterDictionaryDeltaTests
   def build_schema(value_type)
     index_type = ArrowFormat::Int32Type.singleton
@@ -1513,6 +1533,7 @@ def read(path)
 
   sub_test_case("Basic") do
     include WriterTests
+    include FileWriterTests
   end
 
   sub_test_case("Dictionary: delta") do
diff --git a/ruby/red-arrow/lib/arrow/table-saver.rb b/ruby/red-arrow/lib/arrow/table-saver.rb
index c33e64143873..d456f235e5c9 100644
--- a/ruby/red-arrow/lib/arrow/table-saver.rb
+++ b/ruby/red-arrow/lib/arrow/table-saver.rb
@@ -130,9 +130,9 @@ def open_output_stream(&block)
       end
     end
 
-    def save_raw(writer_class)
+    def save_raw(writer_class, *args) # extra args are appended to writer_class.open
       open_output_stream do |output|
-        writer_class.open(output, @table.schema) do |writer|
+        writer_class.open(output, @table.schema, *args) do |writer|
           writer.write_table(@table)
         end
       end
@@ -144,7 +144,7 @@ def save_as_arrow
 
     # @since 1.0.0
     def save_as_arrow_file
-      save_raw(RecordBatchFileWriter)
+      save_raw(RecordBatchFileWriter, nil, @options[:metadata]) # presumably nil = write options, then footer metadata - confirm
     end
 
     # @deprecated Use `format: :arrow_batch` instead.