Skip to content

Commit b6d502c

Browse files
committed
Merge remote-tracking branch 'public/main' into support-parse-list
2 parents 49495a6 + 0745bb4 commit b6d502c

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

66 files changed

+7982
-2753
lines changed

.github/workflows/docs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ jobs:
7171
steps:
7272
- uses: actions/checkout@v5
7373
- name: Download crate docs
74-
uses: actions/download-artifact@v5
74+
uses: actions/download-artifact@v6
7575
with:
7676
name: crate-docs
7777
path: website/build

CHANGELOG-old.md

Lines changed: 414 additions & 0 deletions
Large diffs are not rendered by default.

CHANGELOG.md

Lines changed: 228 additions & 124 deletions
Large diffs are not rendered by default.

Cargo.toml

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ exclude = [
6868
]
6969

7070
[workspace.package]
71-
version = "56.2.0"
71+
version = "57.0.0"
7272
homepage = "https://github.com/apache/arrow-rs"
7373
repository = "https://github.com/apache/arrow-rs"
7474
authors = ["Apache Arrow <dev@arrow.apache.org>"]
@@ -85,22 +85,22 @@ edition = "2024"
8585
rust-version = "1.85"
8686

8787
[workspace.dependencies]
88-
arrow = { version = "56.2.0", path = "./arrow", default-features = false }
89-
arrow-arith = { version = "56.2.0", path = "./arrow-arith" }
90-
arrow-array = { version = "56.2.0", path = "./arrow-array" }
91-
arrow-buffer = { version = "56.2.0", path = "./arrow-buffer" }
92-
arrow-cast = { version = "56.2.0", path = "./arrow-cast" }
93-
arrow-csv = { version = "56.2.0", path = "./arrow-csv" }
94-
arrow-data = { version = "56.2.0", path = "./arrow-data" }
95-
arrow-ipc = { version = "56.2.0", path = "./arrow-ipc" }
96-
arrow-json = { version = "56.2.0", path = "./arrow-json" }
97-
arrow-ord = { version = "56.2.0", path = "./arrow-ord" }
98-
arrow-pyarrow = { version = "56.2.0", path = "./arrow-pyarrow" }
99-
arrow-row = { version = "56.2.0", path = "./arrow-row" }
100-
arrow-schema = { version = "56.2.0", path = "./arrow-schema" }
101-
arrow-select = { version = "56.2.0", path = "./arrow-select" }
102-
arrow-string = { version = "56.2.0", path = "./arrow-string" }
103-
parquet = { version = "56.2.0", path = "./parquet", default-features = false }
88+
arrow = { version = "57.0.0", path = "./arrow", default-features = false }
89+
arrow-arith = { version = "57.0.0", path = "./arrow-arith" }
90+
arrow-array = { version = "57.0.0", path = "./arrow-array" }
91+
arrow-buffer = { version = "57.0.0", path = "./arrow-buffer" }
92+
arrow-cast = { version = "57.0.0", path = "./arrow-cast" }
93+
arrow-csv = { version = "57.0.0", path = "./arrow-csv" }
94+
arrow-data = { version = "57.0.0", path = "./arrow-data" }
95+
arrow-ipc = { version = "57.0.0", path = "./arrow-ipc" }
96+
arrow-json = { version = "57.0.0", path = "./arrow-json" }
97+
arrow-ord = { version = "57.0.0", path = "./arrow-ord" }
98+
arrow-pyarrow = { version = "57.0.0", path = "./arrow-pyarrow" }
99+
arrow-row = { version = "57.0.0", path = "./arrow-row" }
100+
arrow-schema = { version = "57.0.0", path = "./arrow-schema" }
101+
arrow-select = { version = "57.0.0", path = "./arrow-select" }
102+
arrow-string = { version = "57.0.0", path = "./arrow-string" }
103+
parquet = { version = "57.0.0", path = "./parquet", default-features = false }
104104

105105
# These crates have not yet been released and thus do not use the workspace version
106106
parquet-geospatial = { version = "0.1.0", path = "./parquet-geospatial" }

arrow-array/src/array/list_array.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ impl<OffsetSize: OffsetSizeTrait> From<FixedSizeListArray> for GenericListArray<
466466
_ => unreachable!(),
467467
};
468468

469-
let offsets = OffsetBuffer::from_lengths(std::iter::repeat_n(size, value.len()));
469+
let offsets = OffsetBuffer::from_repeated_length(size, value.len());
470470

471471
Self {
472472
data_type: Self::DATA_TYPE_CONSTRUCTOR(field.clone()),

arrow-array/src/builder/generic_bytes_view_builder.rs

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -306,15 +306,30 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
306306
/// - String length exceeds `u32::MAX`
307307
#[inline]
308308
pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
309+
self.try_append_value(value).unwrap()
310+
}
311+
312+
/// Appends a value into the builder, returning an error instead of panicking
313+
///
314+
/// # Errors
315+
///
316+
/// Returns an error if:
317+
/// - String buffer count exceeds `u32::MAX`
318+
/// - String length exceeds `u32::MAX`
319+
#[inline]
320+
pub fn try_append_value(&mut self, value: impl AsRef<T::Native>) -> Result<(), ArrowError> {
309321
let v: &[u8] = value.as_ref().as_ref();
310-
let length: u32 = v.len().try_into().unwrap();
322+
let length: u32 = v.len().try_into().map_err(|_| {
323+
ArrowError::InvalidArgumentError(format!("String length {} exceeds u32::MAX", v.len()))
324+
})?;
325+
311326
if length <= MAX_INLINE_VIEW_LEN {
312327
let mut view_buffer = [0; 16];
313328
view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
314329
view_buffer[4..4 + v.len()].copy_from_slice(v);
315330
self.views_buffer.push(u128::from_le_bytes(view_buffer));
316331
self.null_buffer_builder.append_non_null();
317-
return;
332+
return Ok(());
318333
}
319334

320335
// Deduplication if:
@@ -339,7 +354,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
339354
self.views_buffer.push(self.views_buffer[*idx]);
340355
self.null_buffer_builder.append_non_null();
341356
self.string_tracker = Some((ht, hasher));
342-
return;
357+
return Ok(());
343358
}
344359
Entry::Vacant(vacant) => {
345360
// o.w. we insert the (string hash -> view index)
@@ -356,17 +371,28 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
356371
let to_reserve = v.len().max(self.block_size.next_size() as usize);
357372
self.in_progress.reserve(to_reserve);
358373
};
374+
359375
let offset = self.in_progress.len() as u32;
360376
self.in_progress.extend_from_slice(v);
361377

378+
let buffer_index: u32 = self.completed.len().try_into().map_err(|_| {
379+
ArrowError::InvalidArgumentError(format!(
380+
"Buffer count {} exceeds u32::MAX",
381+
self.completed.len()
382+
))
383+
})?;
384+
362385
let view = ByteView {
363386
length,
387+
// This won't panic: values no longer than MAX_INLINE_VIEW_LEN returned early above, so `v` is long enough to hold a 4-byte prefix.
364388
prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()),
365-
buffer_index: self.completed.len() as u32,
389+
buffer_index,
366390
offset,
367391
};
368392
self.views_buffer.push(view.into());
369393
self.null_buffer_builder.append_non_null();
394+
395+
Ok(())
370396
}
371397

372398
/// Append an `Option` value into the builder
@@ -581,7 +607,6 @@ mod tests {
581607
use core::str;
582608

583609
use super::*;
584-
use crate::Array;
585610

586611
#[test]
587612
fn test_string_view_deduplicate() {

arrow-array/src/types.rs

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1324,7 +1324,7 @@ pub trait DecimalType:
13241324
/// Maximum number of digits after the decimal point (note the scale can be negative)
13251325
const MAX_SCALE: i8;
13261326
/// The maximum value for each precision in `0..=MAX_PRECISION`: [0, 9, 99, ...]
1327-
const MAX_FOR_EACH_PRECISION: &[Self::Native];
1327+
const MAX_FOR_EACH_PRECISION: &'static [Self::Native];
13281328
/// fn to create its [`DataType`]
13291329
const TYPE_CONSTRUCTOR: fn(u8, i8) -> DataType;
13301330
/// Default values for [`DataType`]
@@ -1395,7 +1395,8 @@ impl DecimalType for Decimal32Type {
13951395
const BYTE_LENGTH: usize = 4;
13961396
const MAX_PRECISION: u8 = DECIMAL32_MAX_PRECISION;
13971397
const MAX_SCALE: i8 = DECIMAL32_MAX_SCALE;
1398-
const MAX_FOR_EACH_PRECISION: &[i32] = &arrow_data::decimal::MAX_DECIMAL32_FOR_EACH_PRECISION;
1398+
const MAX_FOR_EACH_PRECISION: &'static [i32] =
1399+
&arrow_data::decimal::MAX_DECIMAL32_FOR_EACH_PRECISION;
13991400
const TYPE_CONSTRUCTOR: fn(u8, i8) -> DataType = DataType::Decimal32;
14001401
const DEFAULT_TYPE: DataType =
14011402
DataType::Decimal32(DECIMAL32_MAX_PRECISION, DECIMAL32_DEFAULT_SCALE);
@@ -1430,7 +1431,8 @@ impl DecimalType for Decimal64Type {
14301431
const BYTE_LENGTH: usize = 8;
14311432
const MAX_PRECISION: u8 = DECIMAL64_MAX_PRECISION;
14321433
const MAX_SCALE: i8 = DECIMAL64_MAX_SCALE;
1433-
const MAX_FOR_EACH_PRECISION: &[i64] = &arrow_data::decimal::MAX_DECIMAL64_FOR_EACH_PRECISION;
1434+
const MAX_FOR_EACH_PRECISION: &'static [i64] =
1435+
&arrow_data::decimal::MAX_DECIMAL64_FOR_EACH_PRECISION;
14341436
const TYPE_CONSTRUCTOR: fn(u8, i8) -> DataType = DataType::Decimal64;
14351437
const DEFAULT_TYPE: DataType =
14361438
DataType::Decimal64(DECIMAL64_MAX_PRECISION, DECIMAL64_DEFAULT_SCALE);
@@ -1465,7 +1467,8 @@ impl DecimalType for Decimal128Type {
14651467
const BYTE_LENGTH: usize = 16;
14661468
const MAX_PRECISION: u8 = DECIMAL128_MAX_PRECISION;
14671469
const MAX_SCALE: i8 = DECIMAL128_MAX_SCALE;
1468-
const MAX_FOR_EACH_PRECISION: &[i128] = &arrow_data::decimal::MAX_DECIMAL128_FOR_EACH_PRECISION;
1470+
const MAX_FOR_EACH_PRECISION: &'static [i128] =
1471+
&arrow_data::decimal::MAX_DECIMAL128_FOR_EACH_PRECISION;
14691472
const TYPE_CONSTRUCTOR: fn(u8, i8) -> DataType = DataType::Decimal128;
14701473
const DEFAULT_TYPE: DataType =
14711474
DataType::Decimal128(DECIMAL128_MAX_PRECISION, DECIMAL_DEFAULT_SCALE);
@@ -1500,7 +1503,8 @@ impl DecimalType for Decimal256Type {
15001503
const BYTE_LENGTH: usize = 32;
15011504
const MAX_PRECISION: u8 = DECIMAL256_MAX_PRECISION;
15021505
const MAX_SCALE: i8 = DECIMAL256_MAX_SCALE;
1503-
const MAX_FOR_EACH_PRECISION: &[i256] = &arrow_data::decimal::MAX_DECIMAL256_FOR_EACH_PRECISION;
1506+
const MAX_FOR_EACH_PRECISION: &'static [i256] =
1507+
&arrow_data::decimal::MAX_DECIMAL256_FOR_EACH_PRECISION;
15041508
const TYPE_CONSTRUCTOR: fn(u8, i8) -> DataType = DataType::Decimal256;
15051509
const DEFAULT_TYPE: DataType =
15061510
DataType::Decimal256(DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE);

arrow-avro/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,14 +44,14 @@ This crate provides:
4444

4545
```toml
4646
[dependencies]
47-
arrow-avro = "56"
47+
arrow-avro = "57.0.0"
4848
````
4949

5050
Disable defaults and pick only what you need (see **Feature Flags**):
5151

5252
```toml
5353
[dependencies]
54-
arrow-avro = { version = "56", default-features = false, features = ["deflate", "snappy"] }
54+
arrow-avro = { version = "57.0.0", default-features = false, features = ["deflate", "snappy"] }
5555
```
5656

5757
---

arrow-buffer/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,8 @@ harness = false
5959
[[bench]]
6060
name = "offset"
6161
harness = false
62+
63+
[[bench]]
64+
name = "mutable_buffer_repeat_slice"
65+
harness = false
66+
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow_buffer::Buffer;
19+
use criterion::*;
20+
use rand::distr::Alphanumeric;
21+
use rand::rngs::StdRng;
22+
use rand::{Rng, SeedableRng};
23+
use std::hint;
24+
25+
fn criterion_benchmark(c: &mut Criterion) {
26+
let mut group = c.benchmark_group("MutableBuffer repeat slice");
27+
let mut rng = StdRng::seed_from_u64(42);
28+
29+
for slice_length in [3, 20, 100] {
30+
let slice_to_repeat: Vec<u8> = hint::black_box(
31+
(&mut rng)
32+
.sample_iter(&Alphanumeric)
33+
.take(slice_length)
34+
.collect(),
35+
);
36+
let slice_to_repeat: &[u8] = slice_to_repeat.as_ref();
37+
38+
for repeat_count in [3, 64, 1024, 8192] {
39+
let parameter_string = format!("slice_len={slice_length} n={repeat_count}");
40+
41+
group.bench_with_input(
42+
BenchmarkId::new("repeat_slice_n_times", &parameter_string),
43+
&(repeat_count),
44+
|b, &repeat_count| {
45+
b.iter(|| {
46+
let mut mutable_buffer = arrow_buffer::MutableBuffer::with_capacity(0);
47+
48+
mutable_buffer.repeat_slice_n_times(slice_to_repeat, repeat_count);
49+
50+
let buffer: Buffer = mutable_buffer.into();
51+
52+
buffer
53+
})
54+
},
55+
);
56+
group.bench_with_input(
57+
BenchmarkId::new("extend_from_slice loop", &parameter_string),
58+
&(repeat_count),
59+
|b, &repeat_count| {
60+
b.iter(|| {
61+
let mut mutable_buffer = arrow_buffer::MutableBuffer::with_capacity(
62+
size_of_val(slice_to_repeat) * repeat_count,
63+
);
64+
65+
for _ in 0..repeat_count {
66+
mutable_buffer.extend_from_slice(slice_to_repeat);
67+
}
68+
69+
let buffer: Buffer = mutable_buffer.into();
70+
71+
buffer
72+
})
73+
},
74+
);
75+
}
76+
}
77+
}
78+
79+
criterion_group!(benches, criterion_benchmark);
80+
criterion_main!(benches);

0 commit comments

Comments
 (0)