Skip to content

Commit c24bc29

Browse files
authored
GH-49576: [Ruby] Add support for custom metadata in Footer (#49577)
### Rationale for this change

In the file format, the Footer can have custom metadata.

### What changes are included in this PR?

* Add `garrow_record_batch_file_reader_get_metadata()`
* Add `garrow_record_batch_file_writer_new_full()`
* Add `ArrowFormat::FileReader#metadata`
* Add `metadata` to `ArrowFormat::FileWriter#finish`
* Add `metadata:` to `Arrow::Table#save`

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.

* GitHub Issue: #49576

Authored-by: Sutou Kouhei <kou@clear-code.com>
Signed-off-by: Sutou Kouhei <kou@clear-code.com>
1 parent 10eaafd commit c24bc29

12 files changed

Lines changed: 181 additions & 19 deletions

File tree

c_glib/arrow-glib/reader.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -668,6 +668,35 @@ garrow_record_batch_file_reader_read_record_batch(GArrowRecordBatchFileReader *r
668668
}
669669
}
670670

671+
/**
672+
* garrow_record_batch_file_reader_get_metadata:
673+
* @reader: A #GArrowRecordBatchFileReader.
674+
*
675+
* Returns: (nullable) (element-type utf8 utf8) (transfer full):
676+
* The metadata in the footer.
677+
*
678+
* Since: 24.0.0
679+
*/
680+
GHashTable *
681+
garrow_record_batch_file_reader_get_metadata(GArrowRecordBatchFileReader *reader)
682+
{
683+
auto arrow_reader = garrow_record_batch_file_reader_get_raw(reader);
684+
auto arrow_metadata = arrow_reader->metadata();
685+
686+
if (!arrow_metadata) {
687+
return nullptr;
688+
}
689+
690+
auto metadata = g_hash_table_new(g_str_hash, g_str_equal);
691+
const auto n = arrow_metadata->size();
692+
for (int64_t i = 0; i < n; ++i) {
693+
g_hash_table_insert(metadata,
694+
const_cast<gchar *>(arrow_metadata->key(i).c_str()),
695+
const_cast<gchar *>(arrow_metadata->value(i).c_str()));
696+
}
697+
return metadata;
698+
}
699+
671700
struct GArrowFeatherFileReaderPrivate
672701
{
673702
std::shared_ptr<arrow::ipc::feather::Reader> feather_reader;

c_glib/arrow-glib/reader.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,10 @@ garrow_record_batch_file_reader_read_record_batch(GArrowRecordBatchFileReader *r
166166
guint i,
167167
GError **error);
168168

169+
GARROW_AVAILABLE_IN_24_0
170+
GHashTable *
171+
garrow_record_batch_file_reader_get_metadata(GArrowRecordBatchFileReader *reader);
172+
169173
#define GARROW_TYPE_FEATHER_FILE_READER (garrow_feather_file_reader_get_type())
170174
GARROW_AVAILABLE_IN_ALL
171175
G_DECLARE_DERIVABLE_TYPE(GArrowFeatherFileReader,

c_glib/arrow-glib/writer.cpp

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
#include <arrow-glib/array.hpp>
2121
#include <arrow-glib/enums.h>
2222
#include <arrow-glib/error.hpp>
23+
#include <arrow-glib/internal-hash-table.hpp>
24+
#include <arrow-glib/ipc-options.hpp>
2325
#include <arrow-glib/record-batch.hpp>
2426
#include <arrow-glib/schema.hpp>
2527
#include <arrow-glib/table.hpp>
@@ -288,16 +290,50 @@ GArrowRecordBatchFileWriter *
288290
garrow_record_batch_file_writer_new(GArrowOutputStream *sink,
289291
GArrowSchema *schema,
290292
GError **error)
293+
{
294+
return garrow_record_batch_file_writer_new_full(sink, schema, nullptr, nullptr, error);
295+
}
296+
297+
/**
298+
* garrow_record_batch_file_writer_new_full:
299+
* @sink: The output of the writer.
300+
* @schema: The schema of the writer.
301+
* @options: (nullable): The options for serialization.
302+
* @metadata: (nullable) (element-type utf8 utf8): The custom metadata in
303+
* the footer.
304+
* @error: (nullable): Return location for a #GError or %NULL.
305+
*
306+
* Returns: (nullable): A newly created #GArrowRecordBatchFileWriter
307+
* or %NULL on error.
308+
*
309+
* Since: 24.0.0
310+
*/
311+
GArrowRecordBatchFileWriter *
312+
garrow_record_batch_file_writer_new_full(GArrowOutputStream *sink,
313+
GArrowSchema *schema,
314+
GArrowWriteOptions *options,
315+
GHashTable *metadata,
316+
GError **error)
291317
{
292318
auto arrow_sink = garrow_output_stream_get_raw(sink);
293319
auto arrow_schema = garrow_schema_get_raw(schema);
320+
arrow::ipc::IpcWriteOptions arrow_options = arrow::ipc::IpcWriteOptions::Defaults();
321+
if (options) {
322+
arrow_options = *garrow_write_options_get_raw(options);
323+
}
324+
std::shared_ptr<arrow::KeyValueMetadata> arrow_metadata;
325+
if (metadata) {
326+
arrow_metadata = garrow_internal_hash_table_to_metadata(metadata);
327+
}
328+
294329
std::shared_ptr<arrow::ipc::RecordBatchWriter> arrow_writer;
295-
auto arrow_writer_result = arrow::ipc::MakeFileWriter(arrow_sink, arrow_schema);
296-
if (garrow::check(error, arrow_writer_result, "[record-batch-file-writer][open]")) {
330+
auto arrow_writer_result =
331+
arrow::ipc::MakeFileWriter(arrow_sink, arrow_schema, arrow_options, arrow_metadata);
332+
if (garrow::check(error, arrow_writer_result, "[record-batch-file-writer][new]")) {
297333
auto arrow_writer = *arrow_writer_result;
298334
return garrow_record_batch_file_writer_new_raw(&arrow_writer);
299335
} else {
300-
return NULL;
336+
return nullptr;
301337
}
302338
}
303339

c_glib/arrow-glib/writer.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#pragma once
2121

2222
#include <arrow-glib/array.h>
23+
#include <arrow-glib/ipc-options.h>
2324
#include <arrow-glib/record-batch.h>
2425
#include <arrow-glib/schema.h>
2526

@@ -94,6 +95,14 @@ garrow_record_batch_file_writer_new(GArrowOutputStream *sink,
9495
GArrowSchema *schema,
9596
GError **error);
9697

98+
GARROW_AVAILABLE_IN_24_0
99+
GArrowRecordBatchFileWriter *
100+
garrow_record_batch_file_writer_new_full(GArrowOutputStream *sink,
101+
GArrowSchema *schema,
102+
GArrowWriteOptions *options,
103+
GHashTable *metadata,
104+
GError **error);
105+
97106
/**
98107
* GArrowCSVQuotingStyle:
99108
* @GARROW_CSV_QUOTING_STYLE_NEEDED: Only enclose values in quotes which need them.

c_glib/test/test-file-writer.rb

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,36 @@ def test_write_table
8888
input.close
8989
end
9090
end
91+
92+
# Writes an IPC file with custom footer metadata and no record batches,
# then reopens it and checks that the reader reports the same metadata.
def test_footer_custom_metadata
  tempfile = Tempfile.open("arrow-ipc-file-writer")
  output = Arrow::FileOutputStream.new(tempfile.path, false)

  field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
  schema = Arrow::Schema.new([field])

  options = Arrow::WriteOptions.new
  metadata = {"key1" => "value1", "key2" => "value2"}
  begin
    # Only the schema and footer are needed here, so the writer is
    # closed without writing any record batch.
    file_writer = Arrow::RecordBatchFileWriter.new(output,
                                                   schema,
                                                   options,
                                                   metadata)
    file_writer.close
    assert do
      file_writer.closed?
    end
  ensure
    output.close
  end

  input = Arrow::MemoryMappedInputStream.new(tempfile.path)
  begin
    file_reader = Arrow::RecordBatchFileReader.new(input)
    assert_equal(metadata, file_reader.metadata)
  ensure
    input.close
  end
end
91123
end

ruby/red-arrow-format/Gemfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,5 +26,6 @@ gem "red-arrow", path: "../red-arrow"
2626
group :development do
2727
gem "benchmark-driver"
2828
gem "rake"
29+
gem "stringio"
2930
gem "test-unit"
3031
end

ruby/red-arrow-format/lib/arrow-format/file-reader.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ class FileReader
3535
FOOTER_SIZE_SIZE = IO::Buffer.size_of(FOOTER_SIZE_FORMAT)
3636

3737
attr_reader :schema
38+
attr_reader :metadata
3839
def initialize(input)
3940
case input
4041
when IO
@@ -47,6 +48,7 @@ def initialize(input)
4748

4849
validate
4950
@footer = read_footer
51+
@metadata = read_custom_metadata(@footer.custom_metadata)
5052
@record_batch_blocks = @footer.record_batches || []
5153
@schema = read_schema(@footer.schema)
5254
@dictionaries = read_dictionaries

ruby/red-arrow-format/lib/arrow-format/file-writer.rb

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,26 +29,33 @@ def start(schema)
2929
super
3030
end
3131

32-
# Finishes the file: runs the parent's finish step, appends the footer
# (with the optional custom +metadata+), appends the trailing MAGIC and
# returns the output object.
#
# +metadata+ is handed to the footer writer unchanged; nil means no
# custom metadata is set.
def finish(metadata=nil)
  # Explicit empty parentheses: the parent's finish must be called
  # without arguments, so metadata is not forwarded implicitly.
  super()
  write_footer(metadata)
  write_data(MAGIC)
  @output
end

3939
private
40-
# Builds the footer from the schema, dictionary blocks and record batch
# blocks collected so far, and returns the serialized result.
#
# When +metadata+ is truthy, each of its key/value pairs becomes one
# FB::KeyValue::Data entry in the footer's custom metadata.
def build_footer(metadata)
  footer_data = FB::Footer::Data.new
  footer_data.version = FB::MetadataVersion::V5
  footer_data.schema = @fb_schema
  footer_data.dictionaries = @fb_dictionary_blocks
  footer_data.record_batches = @fb_record_batch_blocks
  if metadata
    footer_data.custom_metadata = metadata.map do |key, value|
      FB::KeyValue::Data.new.tap do |entry|
        entry.key = key
        entry.value = value
      end
    end
  end
  FB::Footer.serialize(footer_data)
end
4956

50-
# Writes the serialized footer, followed by its size in bytes packed as
# a 32-bit little-endian signed integer ("l<").
def write_footer(metadata)
  serialized_footer = build_footer(metadata)
  write_data(serialized_footer)
  footer_size = [serialized_footer.bytesize].pack("l<")
  write_data(footer_size)
end

ruby/red-arrow-format/test/helper.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18+
require "stringio"
1819
require "tmpdir"
1920

2021
require "test-unit"

ruby/red-arrow-format/test/test-reader.rb

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -675,18 +675,36 @@ def test_dictionary
675675
end
676676
end
677677

678+
# Tests that apply only to file-format readers. The including test case
# is expected to provide +open_input+ and +reader_class+.
module FileReaderTests
  # Saves a table with custom footer metadata and checks that the reader
  # exposes the same key/value pairs through #metadata.
  def test_custom_metadata_footer
    Dir.mktmpdir do |tmp_dir|
      expected_metadata = {
        "key1" => "value1",
        "key2" => "value2",
      }
      table = Arrow::Table.new(value: Arrow::Int8Array.new([1, 2, 3]))
      open_input(table, tmp_dir, metadata: expected_metadata) do |input|
        assert_equal(expected_metadata, reader_class.new(input).metadata)
      end
    ensure
      # NOTE(review): presumably forces release of objects that still
      # reference files in tmp_dir before it is removed -- confirm.
      GC.start
    end
  end
end
695+
678696
# Input provider that hands the reader an opened File.
module FileInput
  # Saves +table+ to "data.<file_extension>" inside +tmp_dir+, passing
  # +options+ through to Table#save, then opens the file in binary read
  # mode and passes it to the given block.
  def open_input(table, tmp_dir, **options, &block)
    data_path = File.join(tmp_dir, "data.#{file_extension}")
    table.save(data_path, **options)
    File.open(data_path, "rb", &block)
  end
end
685703

686704
module PipeInput
687-
def open_input(table, tmp_dir, &block)
705+
def open_input(table, tmp_dir, **options)
688706
buffer = Arrow::ResizableBuffer.new(4096)
689-
table.save(buffer, format: format)
707+
table.save(buffer, format: format, **options)
690708
IO.pipe do |input, output|
691709
write_thread = Thread.new do
692710
output.write(buffer.data.to_s)
@@ -701,15 +719,16 @@ def open_input(table, tmp_dir, &block)
701719
end
702720

703721
# Input provider that hands the reader the serialized data as a String.
module StringInput
  # Saves +table+ into a resizable buffer using the including class's
  # +format+ plus any extra +options+, then yields the buffer's contents
  # converted to a String. +tmp_dir+ is not used here.
  def open_input(table, tmp_dir, **options)
    resizable_buffer = Arrow::ResizableBuffer.new(4096)
    table.save(resizable_buffer, format: format, **options)
    serialized = resizable_buffer.data.to_s
    yield(serialized)
  end
end
710728

711729
class TestFileReaderFileInput < Test::Unit::TestCase
712730
include ReaderTests
731+
include FileReaderTests
713732
include FileInput
714733

715734
def file_extension
@@ -723,6 +742,7 @@ def reader_class
723742

724743
class TestFileReaderStringInput < Test::Unit::TestCase
725744
include ReaderTests
745+
include FileReaderTests
726746
include StringInput
727747

728748
def format

0 commit comments

Comments
 (0)