Skip to content
Open
26 changes: 13 additions & 13 deletions arrow-json/src/reader/list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@

use crate::reader::tape::{Tape, TapeElement};
use crate::reader::{ArrayDecoder, DecoderContext};
use arrow_array::OffsetSizeTrait;
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
use arrow_buffer::buffer::NullBuffer;
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_array::{Array, GenericListArray, OffsetSizeTrait, make_array};
use arrow_buffer::{OffsetBuffer, ScalarBuffer, buffer::NullBuffer};
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
use std::marker::PhantomData;

Expand Down Expand Up @@ -91,17 +91,17 @@ impl<O: OffsetSizeTrait> ArrayDecoder for ListArrayDecoder<O> {
offsets.append(offset)
}

let child_data = self.decoder.decode(tape, &child_pos)?;
let field = match &self.data_type {
DataType::List(f) | DataType::LargeList(f) => f.clone(),
_ => unreachable!(),
};
// SAFETY: offsets are built monotonically starting from 0
let offsets =
unsafe { OffsetBuffer::<O>::new_unchecked(ScalarBuffer::from(offsets.finish())) };
let values = make_array(self.decoder.decode(tape, &child_pos)?);
let nulls = nulls.as_mut().map(|x| NullBuffer::new(x.finish()));

let data = ArrayDataBuilder::new(self.data_type.clone())
.len(pos.len())
.nulls(nulls)
.add_buffer(offsets.finish())
.child_data(vec![child_data]);

// Safety
// Validated lengths above
Ok(unsafe { data.build_unchecked() })
let array = GenericListArray::<O>::try_new(field, offsets, values, nulls)?;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does try_new validate the offsets? That could be a significant performance hit.

Basically as long as this doesn't do additional validation I think it looks good to me

Copy link
Contributor Author

@liamzwbao liamzwbao Mar 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like it will do validation, but I think it's cheap, and the local benchmark does not show a regression. BTW, we don't have an unchecked API for ListArray; the new API simply unwraps try_new.

/// * `offsets.len() - 1 != nulls.len()`
/// * `offsets.last() > values.len()`
/// * `!field.is_nullable() && values.is_nullable()`
/// * `field.data_type() != values.data_type()`
pub fn try_new(
field: FieldRef,
offsets: OffsetBuffer<OffsetSize>,
values: ArrayRef,
nulls: Option<NullBuffer>,
) -> Result<Self, ArrowError> {
let len = offsets.len() - 1; // Offsets guaranteed to not be empty
let end_offset = offsets.last().unwrap().as_usize();
// don't need to check other values of `offsets` because they are checked
// during construction of `OffsetBuffer`
if end_offset > values.len() {
return Err(ArrowError::InvalidArgumentError(format!(
"Max offset of {end_offset} exceeds length of values {}",
values.len()
)));
}
if let Some(n) = nulls.as_ref() {
if n.len() != len {
return Err(ArrowError::InvalidArgumentError(format!(
"Incorrect length of null buffer for {}ListArray, expected {len} got {}",
OffsetSize::PREFIX,
n.len(),
)));
}
}
if !field.is_nullable() && values.is_nullable() {
return Err(ArrowError::InvalidArgumentError(format!(
"Non-nullable field of {}ListArray {:?} cannot contain nulls",
OffsetSize::PREFIX,
field.name()
)));
}
if field.data_type() != values.data_type() {
return Err(ArrowError::InvalidArgumentError(format!(
"{}ListArray expected data type {} got {} for {:?}",
OffsetSize::PREFIX,
field.data_type(),
values.data_type(),
field.name()
)));
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The benchmarks on our runner seem to suggest that this PR is slower for some reason 🤔

Ok(array.into_data())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this call simply creates an ArrayData (which is necessary given the API) so I am not sure it actually avoids any ArrayDatas

In order to avoid ArrayData we would probably need to change the signature of decode to return an ArrayRef directly (rather than ArrayData)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense. I can change the signature of all the decoders if you think it's worth a try

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I do think it makes sense — it would be nice to find some way to do that incrementally, but if not then we may just have to do one big PR 🤔

}
}
Loading