From 78bfbc0859dcc6329736e4575c323a941175fe5e Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Tue, 14 Oct 2025 13:33:10 +0300 Subject: [PATCH 1/5] Fix overflow bug with child_fields.size() = 128 (kMaxTypeCode + 1) --- cpp/src/arrow/type.cc | 8 ++++---- cpp/src/arrow/util/range.h | 8 ++++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 2e9d860a8d6..cba4a0ecd3a 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -3270,7 +3270,7 @@ std::shared_ptr run_end_encoded(std::shared_ptr run_end_type std::shared_ptr sparse_union(FieldVector child_fields, std::vector type_codes) { if (type_codes.empty()) { - type_codes = internal::Iota(static_cast(child_fields.size())); + type_codes = internal::Iota(0, child_fields.size()); } return std::make_shared(std::move(child_fields), std::move(type_codes)); @@ -3278,7 +3278,7 @@ std::shared_ptr sparse_union(FieldVector child_fields, std::shared_ptr dense_union(FieldVector child_fields, std::vector type_codes) { if (type_codes.empty()) { - type_codes = internal::Iota(static_cast(child_fields.size())); + type_codes = internal::Iota(0, child_fields.size()); } return std::make_shared(std::move(child_fields), std::move(type_codes)); } @@ -3310,7 +3310,7 @@ std::shared_ptr sparse_union(const ArrayVector& children, std::vector field_names, std::vector type_codes) { if (type_codes.empty()) { - type_codes = internal::Iota(static_cast(children.size())); + type_codes = internal::Iota(0, children.size()); } auto fields = FieldsFromArraysAndNames(std::move(field_names), children); return sparse_union(std::move(fields), std::move(type_codes)); @@ -3320,7 +3320,7 @@ std::shared_ptr dense_union(const ArrayVector& children, std::vector field_names, std::vector type_codes) { if (type_codes.empty()) { - type_codes = internal::Iota(static_cast(children.size())); + type_codes = internal::Iota(0, children.size()); } auto fields = FieldsFromArraysAndNames(std::move(field_names), children); return dense_union(std::move(fields), std::move(type_codes)); diff --git a/cpp/src/arrow/util/range.h b/cpp/src/arrow/util/range.h index 20553287985..035e9b2414b 100644 --- a/cpp/src/arrow/util/range.h +++ b/cpp/src/arrow/util/range.h @@ -44,6 +44,14 @@ std::vector Iota(T length) { return Iota(static_cast(0), length); } +/// Create a vector containing the values from start with length elements +template +std::vector Iota(T start, size_t length) { + std::vector result(length); + std::iota(result.begin(), result.end(), start); + return result; +} + /// Create a range from a callable which takes a single index parameter /// and returns the value of iterator on each call and a length. /// Only iterators obtained from the same range should be compared, the From 3e8f571b2f74f6211e5ad16b44a3eb79dc9b82a2 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Tue, 14 Oct 2025 13:33:40 +0300 Subject: [PATCH 2/5] Impl test for union types with fields.size() = 128 (kMaxTypeCode + 1) --- cpp/src/arrow/type_test.cc | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index cb17f1ac3fc..ba70d004f1f 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -2191,6 +2191,36 @@ TEST(TestUnionType, Basics) { ASSERT_EQ(ty6->child_ids(), child_ids2); } +TEST(TestUnionType, MaxTypeCode) { + std::vector> fields; + for (int32_t i = 0; i <= UnionType::kMaxTypeCode; i++) { + fields.push_back(field(std::to_string(i), int32())); + } + + std::vector type_codes(fields.size()); + std::iota(type_codes.begin(), type_codes.end(), 0); + + auto t1 = checked_pointer_cast(dense_union(fields, type_codes)); + ASSERT_EQ(t1->type_codes().size(), UnionType::kMaxTypeCode + 1); + ASSERT_EQ(t1->child_ids().size(), UnionType::kMaxTypeCode + 1); + + auto t2 = checked_pointer_cast(dense_union(fields)); + ASSERT_EQ(t2->type_codes().size(), UnionType::kMaxTypeCode + 1); + ASSERT_EQ(t2->child_ids().size(), UnionType::kMaxTypeCode + 1); + + AssertTypeEqual(*t1, *t2); + + auto t3 = checked_pointer_cast(sparse_union(fields, type_codes)); + ASSERT_EQ(t3->type_codes().size(), UnionType::kMaxTypeCode + 1); + ASSERT_EQ(t3->child_ids().size(), UnionType::kMaxTypeCode + 1); + + auto t4 = checked_pointer_cast(sparse_union(fields)); + ASSERT_EQ(t4->type_codes().size(), UnionType::kMaxTypeCode + 1); + ASSERT_EQ(t4->child_ids().size(), UnionType::kMaxTypeCode + 1); + + AssertTypeEqual(*t3, *t4); +} + TEST(TestDictionaryType, Basics) { auto value_type = int32(); From 6624e61430cff2163592bee1ab4f653861f0536e Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Tue, 18 Nov 2025 16:46:12 +0300 Subject: [PATCH 3/5] Fix number of possible types for unions --- docs/source/format/Columnar.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 5002e805247..cf7ff0c3dec 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -880,7 +880,7 @@ each value. Its physical layout is as follows: * One child array for each type * Types buffer: A buffer of 8-bit signed integers. Each type in the union has a corresponding type id whose values are found in this - buffer. A union with more than 127 possible types can be modeled as + buffer. A union with more than 128 possible types can be modeled as a union of unions. * Offsets buffer: A buffer of signed Int32 values indicating the relative offset into the respective child array for the type in a From 0f87d4be3f024e81a07384e142a9f4a075579e67 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Sat, 22 Nov 2025 20:27:52 +0300 Subject: [PATCH 4/5] Try to fix test --- cpp/src/arrow/type_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index ba70d004f1f..e9b1d30e6e7 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include From a6e076f203d45800e221bf6cf5ce28d654c96218 Mon Sep 17 00:00:00 2001 From: Daniil Timizhev Date: Sat, 22 Nov 2025 20:28:01 +0300 Subject: [PATCH 5/5] Try to change iota --- cpp/src/arrow/util/range.h | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/util/range.h b/cpp/src/arrow/util/range.h index 035e9b2414b..55155b7e34d 100644 --- a/cpp/src/arrow/util/range.h +++ b/cpp/src/arrow/util/range.h @@ -27,15 +27,21 @@ namespace arrow::internal { +/// Create a vector containing the values from start with length elements +template +std::vector Iota(T start, size_t length) { + std::vector result(length); + std::iota(result.begin(), result.end(), start); + return result; +} + /// Create a vector containing the values from start up to stop template std::vector Iota(T start, T stop) { if (start > stop) { return {}; } - std::vector result(static_cast(stop - start)); - std::iota(result.begin(), result.end(), start); - return result; + return Iota(start, static_cast(stop - start)); } /// Create a vector containing the values from 0 up to length @@ -44,14 +50,6 @@ std::vector Iota(T length) { return Iota(static_cast(0), length); } -/// Create a vector containing the values from start with length elements -template -std::vector Iota(T start, size_t length) { - std::vector result(length); - std::iota(result.begin(), result.end(), start); - return result; -} - /// Create a range from a callable which takes a single index parameter /// and returns the value of iterator on each call and a length. /// Only iterators obtained from the same range should be compared, the