Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 52 additions & 14 deletions arrow-json/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,17 +179,17 @@ impl JsonSerializable for f64 {

#[cfg(test)]
mod tests {
use std::sync::Arc;

use crate::writer::JsonArray;

use super::*;

use crate::writer::JsonArray;
use crate::writer::LineDelimited;
use arrow_array::{
ArrayRef, GenericBinaryArray, GenericByteViewArray, RecordBatch, RecordBatchWriter,
builder::FixedSizeBinaryBuilder, types::BinaryViewType,
ArrayRef, GenericBinaryArray, GenericByteViewArray, GenericListViewArray, RecordBatch,
RecordBatchWriter, builder::FixedSizeBinaryBuilder, types::BinaryViewType,
};
use arrow_schema::{DataType, Field, Fields, Schema};
use serde_json::Value::{Bool, Number as VNumber, String as VString};
use std::io::Cursor;
use std::sync::Arc;

#[test]
fn test_arrow_native_type_to_json() {
Expand All @@ -216,13 +216,6 @@ mod tests {

#[test]
fn test_json_roundtrip_structs() {
use crate::writer::LineDelimited;
use arrow_schema::DataType;
use arrow_schema::Field;
use arrow_schema::Fields;
use arrow_schema::Schema;
use std::sync::Arc;

let schema = Arc::new(Schema::new(vec![
Field::new(
"c1",
Expand Down Expand Up @@ -352,4 +345,49 @@ mod tests {

assert_eq!(batch, decoded);
}

/// Round-trips flat and nested list-view JSON through the reader and the
/// line-delimited writer, asserting the serialized bytes equal the input.
fn assert_list_view_roundtrip<O: arrow_array::OffsetSizeTrait>() {
    // Column 1: a list-view of nullable Int32 values.
    let item = Arc::new(Field::new("item", DataType::Int32, true));
    let flat_type = GenericListViewArray::<O>::DATA_TYPE_CONSTRUCTOR(item);

    // Column 2: a nullable list-view whose elements are list-views of
    // non-nullable Int32 values.
    let inner_item = Arc::new(Field::new("item", DataType::Int32, false));
    let inner_type = GenericListViewArray::<O>::DATA_TYPE_CONSTRUCTOR(inner_item);
    let outer_item = Arc::new(Field::new("item", inner_type, true));
    let outer_type = GenericListViewArray::<O>::DATA_TYPE_CONSTRUCTOR(outer_item);

    let schema = Arc::new(Schema::new(vec![
        Field::new("flat", flat_type, true),
        Field::new("nested", outer_type, true),
    ]));

    // Covers populated rows, a null element, missing fields, an entirely
    // empty object, and an empty list.
    let input = r#"{"flat":[1,2,3],"nested":[[1,2],[3]]}
{"flat":[4,null]}
{}
{"flat":[6],"nested":[[4,5,6]]}
{"flat":[]}
"#
    .as_bytes();

    // Decode the full document into record batches.
    let reader = ReaderBuilder::new(schema.clone())
        .with_batch_size(1024)
        .build(Cursor::new(input))
        .unwrap();
    let batches: Vec<RecordBatch> = reader.collect::<Result<Vec<_>, _>>().unwrap();

    // Re-encode the batches and compare byte-for-byte with the input.
    let mut encoded = Vec::new();
    let mut writer = WriterBuilder::new().build::<_, LineDelimited>(&mut encoded);
    batches.iter().try_for_each(|batch| writer.write(batch)).unwrap();
    writer.finish().unwrap();

    assert_eq!(input, &encoded);
}

// Exercises the round-trip helper for both offset widths:
// `ListView` (i32) and `LargeListView` (i64).
#[test]
fn test_json_roundtrip_list_view() {
assert_list_view_roundtrip::<i32>();
assert_list_view_roundtrip::<i64>();
}
}
43 changes: 30 additions & 13 deletions arrow-json/src/reader/list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,28 +18,33 @@
use crate::reader::tape::{Tape, TapeElement};
use crate::reader::{ArrayDecoder, DecoderContext};
use arrow_array::OffsetSizeTrait;
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
use arrow_buffer::buffer::NullBuffer;
use arrow_array::builder::BooleanBufferBuilder;
use arrow_buffer::{Buffer, buffer::NullBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType};
use std::marker::PhantomData;

pub struct ListArrayDecoder<O> {
/// Decoder for `List`/`LargeList` arrays (single offsets buffer).
pub type ListArrayDecoder<O> = ListLikeArrayDecoder<O, false>;
/// Decoder for `ListView`/`LargeListView` arrays (offsets plus sizes buffers).
pub type ListViewArrayDecoder<O> = ListLikeArrayDecoder<O, true>;

/// JSON decoder shared by the list and list-view layouts; the `IS_VIEW`
/// const parameter selects which buffer layout `decode` produces.
pub struct ListLikeArrayDecoder<O, const IS_VIEW: bool> {
// Arrow type of the decoded array; validated against `O`/`IS_VIEW` in `new`
data_type: DataType,
// Decoder for the child element values
decoder: Box<dyn ArrayDecoder>,
// Ties the offset native type `O` (i32/i64) to this decoder without storing one
phantom: PhantomData<O>,
// When true, a null buffer is accumulated for top-level list nulls
is_nullable: bool,
}

impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
impl<O: OffsetSizeTrait, const IS_VIEW: bool> ListLikeArrayDecoder<O, IS_VIEW> {
pub fn new(
ctx: &DecoderContext,
data_type: &DataType,
is_nullable: bool,
) -> Result<Self, ArrowError> {
let field = match data_type {
DataType::List(f) if !O::IS_LARGE => f,
DataType::LargeList(f) if O::IS_LARGE => f,
let field = match (IS_VIEW, data_type) {
(false, DataType::List(f)) if !O::IS_LARGE => f,
(false, DataType::LargeList(f)) if O::IS_LARGE => f,
(true, DataType::ListView(f)) if !O::IS_LARGE => f,
(true, DataType::LargeListView(f)) if O::IS_LARGE => f,
_ => unreachable!(),
};
let decoder = ctx.make_decoder(field.data_type(), field.is_nullable())?;
Expand All @@ -53,11 +58,11 @@ impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
}
}

impl<O: OffsetSizeTrait> ArrayDecoder for ListArrayDecoder<O> {
impl<O: OffsetSizeTrait, const IS_VIEW: bool> ArrayDecoder for ListLikeArrayDecoder<O, IS_VIEW> {
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
let mut child_pos = Vec::with_capacity(pos.len());
let mut offsets = BufferBuilder::<O>::new(pos.len() + 1);
offsets.append(O::from_usize(0).unwrap());
let mut offsets = Vec::with_capacity(pos.len() + 1);
offsets.push(O::from_usize(0).unwrap());

let mut nulls = self
.is_nullable
Expand Down Expand Up @@ -88,18 +93,30 @@ impl<O: OffsetSizeTrait> ArrayDecoder for ListArrayDecoder<O> {
let offset = O::from_usize(child_pos.len()).ok_or_else(|| {
ArrowError::JsonError(format!("offset overflow decoding {}", self.data_type))
})?;
offsets.append(offset)
offsets.push(offset);
}

let child_data = self.decoder.decode(tape, &child_pos)?;
let nulls = nulls.as_mut().map(|x| NullBuffer::new(x.finish()));

let data = ArrayDataBuilder::new(self.data_type.clone())
let mut data = ArrayDataBuilder::new(self.data_type.clone())
.len(pos.len())
.nulls(nulls)
.add_buffer(offsets.finish())
.child_data(vec![child_data]);

if IS_VIEW {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do wonder if there's a cleaner way to handle this rather than calculating sizes at the end, though I assume the performance impact is probably minimal.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could calculate the size on the fly, but I would prefer calculating here and keeping the main loop clean. This implementation follows the same thought as the variant_to_arrow list builder:

if IS_VIEW {
// NOTE: `offsets` is never empty (constructor pushes an entry)
let mut sizes = Vec::with_capacity(self.offsets.len() - 1);
for i in 1..self.offsets.len() {
sizes.push(self.offsets[i] - self.offsets[i - 1]);
}
self.offsets.pop();
let list_view_array = GenericListViewArray::<O>::new(
field,
ScalarBuffer::from(self.offsets),
ScalarBuffer::from(sizes),
ArrayRef::from(element_array),
self.nulls.finish(),
);
Ok(Arc::new(list_view_array))
} else {
let list_array = GenericListArray::<O>::new(
field,
OffsetBuffer::<O>::new(ScalarBuffer::from(self.offsets)),
ArrayRef::from(element_array),
self.nulls.finish(),
);
Ok(Arc::new(list_array))
}

let mut sizes = Vec::with_capacity(offsets.len() - 1);
for i in 1..offsets.len() {
sizes.push(offsets[i] - offsets[i - 1]);
}
offsets.pop();
data = data
.add_buffer(Buffer::from_vec(offsets))
.add_buffer(Buffer::from_vec(sizes));
} else {
data = data.add_buffer(Buffer::from_vec(offsets));
}

// Safety
// Validated lengths above
Ok(unsafe { data.build_unchecked() })
Expand Down
80 changes: 78 additions & 2 deletions arrow-json/src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ pub use value_iter::ValueIter;

use crate::reader::boolean_array::BooleanArrayDecoder;
use crate::reader::decimal_array::DecimalArrayDecoder;
use crate::reader::list_array::ListArrayDecoder;
use crate::reader::list_array::{ListArrayDecoder, ListViewArrayDecoder};
use crate::reader::map_array::MapArrayDecoder;
use crate::reader::null_array::NullArrayDecoder;
use crate::reader::primitive_array::PrimitiveArrayDecoder;
Expand Down Expand Up @@ -792,6 +792,8 @@ fn make_decoder(
DataType::LargeUtf8 => Ok(Box::new(StringArrayDecoder::<i64>::new(coerce_primitive))),
DataType::List(_) => Ok(Box::new(ListArrayDecoder::<i32>::new(ctx, data_type, is_nullable)?)),
DataType::LargeList(_) => Ok(Box::new(ListArrayDecoder::<i64>::new(ctx, data_type, is_nullable)?)),
DataType::ListView(_) => Ok(Box::new(ListViewArrayDecoder::<i32>::new(ctx, data_type, is_nullable)?)),
DataType::LargeListView(_) => Ok(Box::new(ListViewArrayDecoder::<i64>::new(ctx, data_type, is_nullable)?)),
DataType::Struct(_) => Ok(Box::new(StructArrayDecoder::new(ctx, data_type, is_nullable)?)),
DataType::Binary => Ok(Box::new(BinaryArrayDecoder::<i32>::default())),
DataType::LargeBinary => Ok(Box::new(BinaryArrayDecoder::<i64>::default())),
Expand All @@ -815,7 +817,10 @@ mod tests {
use std::io::{BufReader, Cursor, Seek};

use arrow_array::cast::AsArray;
use arrow_array::{Array, BooleanArray, Float64Array, ListArray, StringArray, StringViewArray};
use arrow_array::{
Array, BooleanArray, Float64Array, GenericListViewArray, ListArray, OffsetSizeTrait,
StringArray, StringViewArray,
};
use arrow_buffer::{ArrowNativeType, Buffer};
use arrow_cast::display::{ArrayFormatter, FormatOptions};
use arrow_data::ArrayDataBuilder;
Expand Down Expand Up @@ -2192,6 +2197,77 @@ mod tests {
assert_eq!(read, expected);
}

/// Parses line-delimited JSON into a single `GenericListViewArray<O>` column
/// and verifies the resulting offsets, sizes, validity bitmap, and values.
fn assert_read_list_view<O: OffsetSizeTrait>() {
    let item = Arc::new(Field::new("item", DataType::Int32, true));
    let list_type = GenericListViewArray::<O>::DATA_TYPE_CONSTRUCTOR(item);
    let schema = Arc::new(Schema::new(vec![Field::new("lv", list_type, true)]));

    let json = r#"
{"lv": [1, 2, 3]}
{"lv": [4, null]}
{"lv": null}
{"lv": [6]}
{"lv": []}
"#;

    let batches = do_read(json, 1024, false, false, schema);
    assert_eq!(batches.len(), 1);

    let column = batches[0].column(0);
    let list_view = column
        .as_any()
        .downcast_ref::<GenericListViewArray<O>>()
        .unwrap();
    assert_eq!(list_view.len(), 5);

    // Null and empty rows consume no child slots, so consecutive offsets repeat.
    let to_native = |v: usize| O::usize_as(v);
    let expected_offsets: Vec<O> = [0, 3, 5, 5, 6].into_iter().map(to_native).collect();
    let expected_sizes: Vec<O> = [3, 2, 0, 1, 0].into_iter().map(to_native).collect();
    assert_eq!(list_view.value_offsets(), &expected_offsets);
    assert_eq!(list_view.value_sizes(), &expected_sizes);

    // Row 0: [1, 2, 3]
    assert!(list_view.is_valid(0));
    let row0 = list_view.value(0);
    assert_eq!(row0.as_primitive::<Int32Type>().values(), &[1, 2, 3]);

    // Row 1: [4, null] — the list is valid, its second element is null
    assert!(list_view.is_valid(1));
    let row1 = list_view.value(1);
    let row1 = row1.as_primitive::<Int32Type>();
    assert_eq!(row1.len(), 2);
    assert_eq!(row1.value(0), 4);
    assert!(row1.is_null(1));

    // Row 2: the list itself is null
    assert!(list_view.is_null(2));

    // Row 3: [6]
    assert!(list_view.is_valid(3));
    let row3 = list_view.value(3);
    assert_eq!(row3.as_primitive::<Int32Type>().values(), &[6]);

    // Row 4: [] — valid but empty
    assert!(list_view.is_valid(4));
    assert_eq!(list_view.value(4).len(), 0);
}

// Runs the read-path checks for both offset widths:
// `ListView` (i32) and `LargeListView` (i64).
#[test]
fn test_read_list_view() {
assert_read_list_view::<i32>();
assert_read_list_view::<i64>();
}

#[test]
fn test_skip_empty_lines() {
let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
Expand Down
Loading
Loading