From 88fc3f41658ce88bcb641f0827e7e6a4c8de648b Mon Sep 17 00:00:00 2001 From: Alexander Rafferty Date: Sat, 28 Feb 2026 22:58:34 +1100 Subject: [PATCH 01/10] Rewrite JSON schema inference logic, and move it into its own module (`infer.rs`); remove obsolete tests --- arrow-json/Cargo.toml | 1 + arrow-json/src/reader/schema.rs | 368 ++------------------------ arrow-json/src/reader/schema/infer.rs | 297 +++++++++++++++++++++ 3 files changed, 322 insertions(+), 344 deletions(-) create mode 100644 arrow-json/src/reader/schema/infer.rs diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 851f0a244f53..8faae63b8de4 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -52,6 +52,7 @@ memchr = "2.7.4" simdutf8 = { workspace = true } ryu = "1.0" itoa = "1.0" +bumpalo = "3.20.2" [dev-dependencies] flate2 = { version = "1", default-features = false, features = ["rust_backend"] } diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs index 524e6b2aa560..785e43adacea 100644 --- a/arrow-json/src/reader/schema.rs +++ b/arrow-json/src/reader/schema.rs @@ -15,118 +15,17 @@ // specific language governing permissions and limitations // under the License. -use super::ValueIter; -use arrow_schema::{ArrowError, DataType, Field, Fields, Schema}; -use indexmap::map::IndexMap as HashMap; -use indexmap::set::IndexSet as HashSet; -use serde_json::Value; use std::borrow::Borrow; use std::io::{BufRead, Seek}; -use std::sync::Arc; - -#[derive(Debug, Clone)] -enum InferredType { - Scalar(HashSet), - Array(Box), - Object(HashMap), - Any, -} - -impl InferredType { - fn merge(&mut self, other: InferredType) -> Result<(), ArrowError> { - match (self, other) { - (InferredType::Array(s), InferredType::Array(o)) => { - s.merge(*o)?; - } - (InferredType::Scalar(self_hs), InferredType::Scalar(other_hs)) => { - other_hs.into_iter().for_each(|v| { - self_hs.insert(v); - }); - } - (InferredType::Object(self_map), InferredType::Object(other_map)) => { - for (k, v) in other_map { - self_map.entry(k).or_insert(InferredType::Any).merge(v)?; - } - } - (s @ InferredType::Any, v) => { - *s = v; - } - (_, InferredType::Any) => {} - // convert a scalar type to a single-item scalar array type. - (InferredType::Array(self_inner_type), other_scalar @ InferredType::Scalar(_)) => { - self_inner_type.merge(other_scalar)?; - } - (s @ InferredType::Scalar(_), InferredType::Array(mut other_inner_type)) => { - other_inner_type.merge(s.clone())?; - *s = InferredType::Array(other_inner_type); - } - // incompatible types - (s, o) => { - return Err(ArrowError::JsonError(format!( - "Incompatible type found during schema inference: {s:?} v.s. {o:?}", - ))); - } - } - - Ok(()) - } - - fn is_none_or_any(ty: Option<&Self>) -> bool { - matches!(ty, Some(Self::Any) | None) - } -} -/// Shorthand for building list data type of `ty` -fn list_type_of(ty: DataType) -> DataType { - DataType::List(Arc::new(Field::new_list_field(ty, true))) -} - -/// Coerce data type during inference -/// -/// * `Int64` and `Float64` should be `Float64` -/// * Lists and scalars are coerced to a list of a compatible scalar -/// * All other types are coerced to `Utf8` -fn coerce_data_type(dt: Vec<&DataType>) -> DataType { - let mut dt_iter = dt.into_iter().cloned(); - let dt_init = dt_iter.next().unwrap_or(DataType::Utf8); - - dt_iter.fold(dt_init, |l, r| match (l, r) { - (DataType::Null, o) | (o, DataType::Null) => o, - (DataType::Boolean, DataType::Boolean) => DataType::Boolean, - (DataType::Int64, DataType::Int64) => DataType::Int64, - (DataType::Float64, DataType::Float64) - | (DataType::Float64, DataType::Int64) - | (DataType::Int64, DataType::Float64) => DataType::Float64, - (DataType::List(l), DataType::List(r)) => { - list_type_of(coerce_data_type(vec![l.data_type(), r.data_type()])) - } - // coerce scalar and scalar array into scalar array - (DataType::List(e), not_list) | (not_list, DataType::List(e)) => { - list_type_of(coerce_data_type(vec![e.data_type(), ¬_list])) - } - _ => DataType::Utf8, - }) -} - -fn generate_datatype(t: &InferredType) -> Result { - Ok(match t { - InferredType::Scalar(hs) => coerce_data_type(hs.iter().collect()), - InferredType::Object(spec) => DataType::Struct(generate_fields(spec)?), - InferredType::Array(ele_type) => list_type_of(generate_datatype(ele_type)?), - InferredType::Any => DataType::Null, - }) -} +use arrow_schema::{ArrowError, Schema}; +use bumpalo::Bump; +use serde_json::Value; -fn generate_fields(spec: &HashMap) -> Result { - spec.iter() - .map(|(k, types)| Ok(Field::new(k, generate_datatype(types)?, true))) - .collect() -} +use self::infer::{EMPTY_OBJECT_TY, infer_json_type}; +use super::ValueIter; -/// Generate schema from JSON field names and inferred data types -fn generate_schema(spec: HashMap) -> Result { - Ok(Schema::new(generate_fields(&spec)?)) -} +mod infer; /// Infer the fields of a JSON file by reading the first n records of the file, with /// `max_read_records` controlling the maximum number of records to read. @@ -209,205 +108,6 @@ pub fn infer_json_schema( Ok((schema, values.record_count())) } -fn set_object_scalar_field_type( - field_types: &mut HashMap, - key: &str, - ftype: DataType, -) -> Result<(), ArrowError> { - if InferredType::is_none_or_any(field_types.get(key)) { - field_types.insert(key.to_string(), InferredType::Scalar(HashSet::new())); - } - - match field_types.get_mut(key).unwrap() { - InferredType::Scalar(hs) => { - hs.insert(ftype); - Ok(()) - } - // in case of column contains both scalar type and scalar array type, we convert type of - // this column to scalar array. - scalar_array @ InferredType::Array(_) => { - let mut hs = HashSet::new(); - hs.insert(ftype); - scalar_array.merge(InferredType::Scalar(hs))?; - Ok(()) - } - t => Err(ArrowError::JsonError(format!( - "Expected scalar or scalar array JSON type, found: {t:?}", - ))), - } -} - -fn infer_scalar_array_type(array: &[Value]) -> Result { - let mut hs = HashSet::new(); - - for v in array { - match v { - Value::Null => {} - Value::Number(n) => { - if n.is_i64() { - hs.insert(DataType::Int64); - } else { - hs.insert(DataType::Float64); - } - } - Value::Bool(_) => { - hs.insert(DataType::Boolean); - } - Value::String(_) => { - hs.insert(DataType::Utf8); - } - Value::Array(_) | Value::Object(_) => { - return Err(ArrowError::JsonError(format!( - "Expected scalar value for scalar array, got: {v:?}" - ))); - } - } - } - - Ok(InferredType::Scalar(hs)) -} - -fn infer_nested_array_type(array: &[Value]) -> Result { - let mut inner_ele_type = InferredType::Any; - - for v in array { - match v { - Value::Array(inner_array) => { - inner_ele_type.merge(infer_array_element_type(inner_array)?)?; - } - x => { - return Err(ArrowError::JsonError(format!( - "Got non array element in nested array: {x:?}" - ))); - } - } - } - - Ok(InferredType::Array(Box::new(inner_ele_type))) -} - -fn infer_struct_array_type(array: &[Value]) -> Result { - let mut field_types = HashMap::new(); - - for v in array { - match v { - Value::Object(map) => { - collect_field_types_from_object(&mut field_types, map)?; - } - _ => { - return Err(ArrowError::JsonError(format!( - "Expected struct value for struct array, got: {v:?}" - ))); - } - } - } - - Ok(InferredType::Object(field_types)) -} - -fn infer_array_element_type(array: &[Value]) -> Result { - match array.iter().take(1).next() { - None => Ok(InferredType::Any), // empty array, return any type that can be updated later - Some(a) => match a { - Value::Array(_) => infer_nested_array_type(array), - Value::Object(_) => infer_struct_array_type(array), - _ => infer_scalar_array_type(array), - }, - } -} - -fn collect_field_types_from_object( - field_types: &mut HashMap, - map: &serde_json::map::Map, -) -> Result<(), ArrowError> { - for (k, v) in map { - match v { - Value::Array(array) => { - let ele_type = infer_array_element_type(array)?; - - if InferredType::is_none_or_any(field_types.get(k)) { - match ele_type { - InferredType::Scalar(_) => { - field_types.insert( - k.to_string(), - InferredType::Array(Box::new(InferredType::Scalar(HashSet::new()))), - ); - } - InferredType::Object(_) => { - field_types.insert( - k.to_string(), - InferredType::Array(Box::new(InferredType::Object(HashMap::new()))), - ); - } - InferredType::Any | InferredType::Array(_) => { - // set inner type to any for nested array as well - // so it can be updated properly from subsequent type merges - field_types.insert( - k.to_string(), - InferredType::Array(Box::new(InferredType::Any)), - ); - } - } - } - - match field_types.get_mut(k).unwrap() { - InferredType::Array(inner_type) => { - inner_type.merge(ele_type)?; - } - // in case of column contains both scalar type and scalar array type, we - // convert type of this column to scalar array. - field_type @ InferredType::Scalar(_) => { - field_type.merge(ele_type)?; - *field_type = InferredType::Array(Box::new(field_type.clone())); - } - t => { - return Err(ArrowError::JsonError(format!( - "Expected array json type, found: {t:?}", - ))); - } - } - } - Value::Bool(_) => { - set_object_scalar_field_type(field_types, k, DataType::Boolean)?; - } - Value::Null => { - // we treat json as nullable by default when inferring, so just - // mark existence of a field if it wasn't known before - if !field_types.contains_key(k) { - field_types.insert(k.to_string(), InferredType::Any); - } - } - Value::Number(n) => { - if n.is_i64() { - set_object_scalar_field_type(field_types, k, DataType::Int64)?; - } else { - set_object_scalar_field_type(field_types, k, DataType::Float64)?; - } - } - Value::String(_) => { - set_object_scalar_field_type(field_types, k, DataType::Utf8)?; - } - Value::Object(inner_map) => { - if let InferredType::Any = field_types.get(k).unwrap_or(&InferredType::Any) { - field_types.insert(k.to_string(), InferredType::Object(HashMap::new())); - } - match field_types.get_mut(k).unwrap() { - InferredType::Object(inner_field_types) => { - collect_field_types_from_object(inner_field_types, inner_map)?; - } - t => { - return Err(ArrowError::JsonError(format!( - "Expected object json type, found: {t:?}", - ))); - } - } - } - } - } - - Ok(()) -} - /// Infer the fields of a JSON file by reading all items from the JSON Value Iterator. /// /// The following type coercion logic is implemented: @@ -426,30 +126,31 @@ where I: Iterator>, V: Borrow, { - let mut field_types: HashMap = HashMap::new(); - - for record in value_iter { - match record?.borrow() { - Value::Object(map) => { - collect_field_types_from_object(&mut field_types, map)?; - } - value => { - return Err(ArrowError::JsonError(format!( - "Expected JSON record to be an object, found {value:?}" - ))); - } - }; - } - - generate_schema(field_types) + let arena = &Bump::new(); + + value_iter + .into_iter() + .try_fold(EMPTY_OBJECT_TY, |ty, record| { + infer_json_type(record?.borrow(), ty, arena) + })? + .into_schema() } #[cfg(test)] mod tests { use super::*; - use flate2::read::GzDecoder; + use std::fs::File; use std::io::{BufReader, Cursor}; + use std::sync::Arc; + + use arrow_schema::{DataType, Field, Fields}; + use flate2::read::GzDecoder; + + /// Shorthand for building list data type of `ty` + fn list_type_of(ty: DataType) -> DataType { + DataType::List(Arc::new(Field::new_list_field(ty, true))) + } #[test] fn test_json_infer_schema() { @@ -600,27 +301,6 @@ mod tests { assert_eq!(small_field.data_type(), &DataType::Float64); } - #[test] - fn test_coercion_scalar_and_list() { - assert_eq!( - list_type_of(DataType::Float64), - coerce_data_type(vec![&DataType::Float64, &list_type_of(DataType::Float64)]) - ); - assert_eq!( - list_type_of(DataType::Float64), - coerce_data_type(vec![&DataType::Float64, &list_type_of(DataType::Int64)]) - ); - assert_eq!( - list_type_of(DataType::Int64), - coerce_data_type(vec![&DataType::Int64, &list_type_of(DataType::Int64)]) - ); - // boolean and number are incompatible, return utf8 - assert_eq!( - list_type_of(DataType::Utf8), - coerce_data_type(vec![&DataType::Boolean, &list_type_of(DataType::Float64)]) - ); - } - #[test] fn test_invalid_json_infer_schema() { let re = infer_json_schema_from_seekable(Cursor::new(b"}"), None); diff --git a/arrow-json/src/reader/schema/infer.rs b/arrow-json/src/reader/schema/infer.rs new file mode 100644 index 000000000000..bac3b70ff140 --- /dev/null +++ b/arrow-json/src/reader/schema/infer.rs @@ -0,0 +1,297 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.kind() (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.kind() +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::HashMap; + +use arrow_schema::{ArrowError, DataType, Field, Fields, Schema}; +use bumpalo::Bump; + +#[derive(Clone, Copy, Debug)] +pub struct InferredType<'t>(&'t TyKind<'t>); + +#[derive(Clone, Copy, Debug)] +enum TyKind<'t> { + Any, + Scalar(ScalarTy), + Array(InferredType<'t>), + Object(&'t [(&'t str, InferredType<'t>)]), +} + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +enum ScalarTy { + Bool, + Int64, + Float64, + String, + // NOTE: Null isn't needed because it's absorbed into Never +} + +pub static ANY_TY: InferredType<'static> = InferredType(&TyKind::Any); +pub static BOOL_TY: InferredType<'static> = InferredType(&TyKind::Scalar(ScalarTy::Bool)); +pub static INT64_TY: InferredType<'static> = InferredType(&TyKind::Scalar(ScalarTy::Int64)); +pub static FLOAT64_TY: InferredType<'static> = InferredType(&TyKind::Scalar(ScalarTy::Float64)); +pub static STRING_TY: InferredType<'static> = InferredType(&TyKind::Scalar(ScalarTy::String)); +pub static ARRAY_OF_ANY_TY: InferredType<'static> = InferredType(&TyKind::Array(ANY_TY)); +pub static EMPTY_OBJECT_TY: InferredType<'static> = InferredType(&TyKind::Object(&[])); + +/// Infers the type of the provided JSON value, given an expected type. +pub fn infer_json_type<'a, 't>( + value: impl JsonValue<'a>, + expected: InferredType<'t>, + arena: &'t Bump, +) -> Result, ArrowError> { + let make_err = |got| { + let expected = match expected.kind() { + TyKind::Any => unreachable!(), + TyKind::Scalar(_) => "a scalar value", + TyKind::Array(_) => "an array", + TyKind::Object(_) => "an object", + }; + let msg = format!("Expected {expected}, found {got}"); + ArrowError::JsonError(msg) + }; + + let infer_scalar = |scalar: ScalarTy, got: &'static str| { + Ok(match expected.kind() { + TyKind::Any => match scalar { + ScalarTy::Bool => BOOL_TY, + ScalarTy::Int64 => INT64_TY, + ScalarTy::Float64 => FLOAT64_TY, + ScalarTy::String => STRING_TY, + }, + TyKind::Scalar(expect) => match (expect, &scalar) { + (ScalarTy::Bool, ScalarTy::Bool) => BOOL_TY, + (ScalarTy::Int64, ScalarTy::Int64) => INT64_TY, + // Mixed numbers coerce to f64 + (ScalarTy::Int64 | ScalarTy::Float64, ScalarTy::Int64 | ScalarTy::Float64) => { + FLOAT64_TY + } + // Any other combination coerces to string + _ => STRING_TY, + }, + _ => Err(make_err(got))?, + }) + }; + + match value.get() { + JsonType::Null => Ok(expected), + JsonType::Bool => infer_scalar(ScalarTy::Bool, "a boolean"), + JsonType::Int64 => infer_scalar(ScalarTy::Int64, "a number"), + JsonType::Float64 => infer_scalar(ScalarTy::Float64, "a number"), + JsonType::String => infer_scalar(ScalarTy::String, "a string"), + JsonType::Array => { + let (expected, expected_elem) = match *expected.kind() { + TyKind::Any => (ARRAY_OF_ANY_TY, ANY_TY), + TyKind::Array(inner) => (expected, inner), + _ => Err(make_err("an array"))?, + }; + + let elem = value + .elements() + .try_fold(expected_elem, |expected, value| { + let result = infer_json_type(value, expected, arena); + result + })?; + + Ok(if elem.ptr_eq(expected_elem) { + expected + } else { + InferredType::new_array(elem, arena) + }) + } + JsonType::Object => { + let (expected, expected_fields) = match *expected.kind() { + TyKind::Any => (EMPTY_OBJECT_TY, &[] as &[_]), + TyKind::Object(fields) => (expected, fields), + _ => Err(make_err("an object"))?, + }; + + let mut num_fields = expected_fields.len(); + let mut substs = HashMap::)>::new(); + + for (key, value) in value.fields() { + let existing_field = expected_fields + .iter() + .copied() + .enumerate() + .find(|(_, (existing_key, _))| *existing_key == key); + + match existing_field { + Some((field_idx, (key, expect_ty))) => { + let ty = infer_json_type(value, expect_ty, arena)?; + if !ty.ptr_eq(expect_ty) { + substs.insert(field_idx, (key, ty)); + } + } + None => { + let field_idx = num_fields; + num_fields += 1; + let key = arena.alloc_str(key); + let ty = infer_json_type(value, ANY_TY, arena)?; + substs.insert(field_idx, (key, ty)); + } + }; + } + + if substs.is_empty() { + return Ok(expected); + } + + let fields = (0..num_fields).map(|idx| match substs.get(&idx) { + Some(subst) => *subst, + None => expected_fields[idx], + }); + + Ok(InferredType::new_object(fields, arena)) + } + } +} + +impl<'t> InferredType<'t> { + fn new_array(inner: InferredType<'t>, arena: &'t Bump) -> Self { + Self(arena.alloc(TyKind::Array(inner))) + } + + fn new_object(fields: F, arena: &'t Bump) -> Self + where + F: IntoIterator)>, + F::IntoIter: ExactSizeIterator, + { + let fields = arena.alloc_slice_fill_iter(fields); + Self(arena.alloc(TyKind::Object(fields))) + } + + fn kind(self) -> &'t TyKind<'t> { + self.0 + } + + fn ptr_eq(self, other: Self) -> bool { + std::ptr::eq(self.kind(), other.kind()) + } + + pub fn into_datatype(self) -> DataType { + match self.kind() { + TyKind::Any => DataType::Null, + TyKind::Scalar(s) => s.into_datatype(), + TyKind::Array(elem) => DataType::List(elem.into_list_field().into()), + TyKind::Object(fields) => { + DataType::Struct(fields.iter().map(|(key, ty)| ty.into_field(*key)).collect()) + } + } + } + + pub fn into_field(self, name: impl Into) -> Field { + Field::new(name, self.into_datatype(), true) + } + + pub fn into_list_field(self) -> Field { + Field::new_list_field(self.into_datatype(), true) + } + + pub fn into_schema(self) -> Result { + let TyKind::Object(fields) = self.kind() else { + Err(ArrowError::JsonError(format!( + "Expected JSON object, found {self:?}", + )))? + }; + + let fields = fields + .iter() + .map(|(key, ty)| ty.into_field(*key)) + .collect::(); + + Ok(Schema::new(fields)) + } +} + +impl ScalarTy { + fn into_datatype(self) -> DataType { + match self { + Self::Bool => DataType::Boolean, + Self::Int64 => DataType::Int64, + Self::Float64 => DataType::Float64, + Self::String => DataType::Utf8, + } + } +} + +/// Abstraction of a JSON value that is only concerned with the +/// type of the value rather than the value itself. +pub trait JsonValue<'a> { + /// Gets the type of this JSON value. + fn get(&self) -> JsonType; + + /// Returns an iterator over the elements of this JSON array. + /// + /// Panics if the JSON value is not an array. + fn elements(&self) -> impl Iterator; + + /// Returns an iterator over the fields of this JSON object. + /// + /// Panics if the JSON value is not an object. + fn fields(&self) -> impl Iterator; +} + +/// The type of a JSON value +pub enum JsonType { + Null, + Bool, + Int64, + Float64, + String, + Array, + Object, +} + +impl<'a> JsonValue<'a> for &'a serde_json::Value { + fn get(&self) -> JsonType { + use serde_json::Value; + + match self { + Value::Null => JsonType::Null, + Value::Bool(_) => JsonType::Bool, + Value::Number(n) => { + if n.is_i64() { + JsonType::Int64 + } else { + JsonType::Float64 + } + } + Value::String(_) => JsonType::String, + Value::Array(_) => JsonType::Array, + Value::Object(_) => JsonType::Object, + } + } + + fn elements(&self) -> impl Iterator { + use serde_json::Value; + + match self { + Value::Array(elements) => elements.iter(), + _ => panic!("Expected an array"), + } + } + + fn fields(&self) -> impl Iterator { + use serde_json::Value; + + match self { + Value::Object(fields) => fields.iter().map(|(key, value)| (key.as_str(), value)), + _ => panic!("Expected an object"), + } + } +} From 27b07c33ff93d4b40c7c55f0c0733b0f9a76297d Mon Sep 17 00:00:00 2001 From: Alexander Rafferty Date: Sat, 28 Feb 2026 23:02:10 +1100 Subject: [PATCH 02/10] Remove "mixed_arrays.json" as the reader can't actually handle "scalar-to-array promotion", and adjust tests. --- arrow-json/src/reader/schema.rs | 14 +++++++------- arrow-json/src/reader/value_iter.rs | 2 +- arrow-json/test/data/arrays.json.gz | Bin 0 -> 133 bytes arrow-json/test/data/mixed_arrays.json | 4 ---- arrow-json/test/data/mixed_arrays.json.gz | Bin 141 -> 0 bytes 5 files changed, 8 insertions(+), 12 deletions(-) create mode 100644 arrow-json/test/data/arrays.json.gz delete mode 100644 arrow-json/test/data/mixed_arrays.json delete mode 100644 arrow-json/test/data/mixed_arrays.json.gz diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs index 785e43adacea..904f9ca583b1 100644 --- a/arrow-json/src/reader/schema.rs +++ b/arrow-json/src/reader/schema.rs @@ -43,7 +43,7 @@ mod infer; /// use std::io::BufReader; /// use arrow_json::reader::infer_json_schema_from_seekable; /// -/// let file = File::open("test/data/mixed_arrays.json").unwrap(); +/// let file = File::open("test/data/arrays.json").unwrap(); /// // file's cursor's offset at 0 /// let mut reader = BufReader::new(file); /// let inferred_schema = infer_json_schema_from_seekable(&mut reader, None).unwrap(); @@ -89,7 +89,7 @@ pub fn infer_json_schema_from_seekable( /// use flate2::read::GzDecoder; /// use arrow_json::reader::infer_json_schema; /// -/// let mut file = File::open("test/data/mixed_arrays.json.gz").unwrap(); +/// let mut file = File::open("test/data/arrays.json.gz").unwrap(); /// /// // file's cursor's offset at 0 /// let mut reader = BufReader::new(GzDecoder::new(&file)); @@ -158,21 +158,21 @@ mod tests { Field::new("a", DataType::Int64, true), Field::new("b", list_type_of(DataType::Float64), true), Field::new("c", list_type_of(DataType::Boolean), true), - Field::new("d", list_type_of(DataType::Utf8), true), + Field::new("d", DataType::Utf8, true), ]); - let mut reader = BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); + let mut reader = BufReader::new(File::open("test/data/arrays.json").unwrap()); let (inferred_schema, n_rows) = infer_json_schema_from_seekable(&mut reader, None).unwrap(); assert_eq!(inferred_schema, schema); - assert_eq!(n_rows, 4); + assert_eq!(n_rows, 3); - let file = File::open("test/data/mixed_arrays.json.gz").unwrap(); + let file = File::open("test/data/arrays.json.gz").unwrap(); let mut reader = BufReader::new(GzDecoder::new(&file)); let (inferred_schema, n_rows) = infer_json_schema(&mut reader, None).unwrap(); assert_eq!(inferred_schema, schema); - assert_eq!(n_rows, 4); + assert_eq!(n_rows, 3); } #[test] diff --git a/arrow-json/src/reader/value_iter.rs b/arrow-json/src/reader/value_iter.rs index f70b893f52a0..350c2290fd78 100644 --- a/arrow-json/src/reader/value_iter.rs +++ b/arrow-json/src/reader/value_iter.rs @@ -30,7 +30,7 @@ use serde_json::Value; /// use arrow_json::reader::ValueIter; /// /// let mut reader = -/// BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); +/// BufReader::new(File::open("test/data/arrays.json").unwrap()); /// let mut value_reader = ValueIter::new(&mut reader, None); /// for value in value_reader { /// println!("JSON value: {}", value.unwrap()); diff --git a/arrow-json/test/data/arrays.json.gz b/arrow-json/test/data/arrays.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..c584e16847ec2ab371fde227b781291f8deba7fc GIT binary patch literal 133 zcmV;00DAu)iwFpZ51?rP17UJ^dYIARH0IOC?RI)PEQBX=!vWhm+Gtf~m)HBvm z&^6ODj0K7&14YvkbBa@S6iSLpQ$a#0N>)lHO0~RP)gZ09h6Z>wgLL82YiMR Date: Sat, 28 Feb 2026 23:44:01 +1100 Subject: [PATCH 03/10] Re-implement `infer_json_schema` to make use of `TapeDecoder`, removing the need to parse rows into `serde_json::Value`s first. --- arrow-json/src/reader/schema.rs | 93 +++++++++++++++++++++++---- arrow-json/src/reader/schema/infer.rs | 50 ++++++++++++++ arrow-json/src/reader/tape.rs | 49 ++++++++++++++ 3 files changed, 181 insertions(+), 11 deletions(-) diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs index 904f9ca583b1..d1cd94012411 100644 --- a/arrow-json/src/reader/schema.rs +++ b/arrow-json/src/reader/schema.rs @@ -22,8 +22,8 @@ use arrow_schema::{ArrowError, Schema}; use bumpalo::Bump; use serde_json::Value; -use self::infer::{EMPTY_OBJECT_TY, infer_json_type}; -use super::ValueIter; +use super::tape::TapeDecoder; +use infer::{ANY_TY, EMPTY_OBJECT_TY, InferredType, TapeValue, infer_json_type}; mod infer; @@ -100,12 +100,29 @@ pub fn infer_json_schema_from_seekable( /// file.seek(SeekFrom::Start(0)).unwrap(); /// ``` pub fn infer_json_schema( - reader: R, + mut reader: R, max_read_records: Option, ) -> Result<(Schema, usize), ArrowError> { - let mut values = ValueIter::new(reader, max_read_records); - let schema = infer_json_schema_from_iterator(&mut values)?; - Ok((schema, values.record_count())) + let arena = Bump::new(); + let mut decoder = SchemaDecoder::new(max_read_records, &arena); + + loop { + let buf = reader.fill_buf()?; + let read = buf.len(); + + if read == 0 { + break; + } + + let decoded = decoder.decode(buf)?; + reader.consume(decoded); + + if decoded != read { + break; + } + } + + decoder.finish() } /// Infer the fields of a JSON file by reading all items from the JSON Value Iterator. @@ -136,6 +153,60 @@ where .into_schema() } +struct SchemaDecoder<'a> { + decoder: TapeDecoder, + max_read_records: Option, + record_count: usize, + schema: InferredType<'a>, + arena: &'a Bump, +} + +impl<'a> SchemaDecoder<'a> { + pub fn new(max_read_records: Option, arena: &'a Bump) -> Self { + Self { + decoder: TapeDecoder::new(1024, 8), + max_read_records, + record_count: 0, + schema: ANY_TY, + arena, + } + } + + pub fn decode(&mut self, buf: &[u8]) -> Result { + let read = self.decoder.decode(buf)?; + if read != buf.len() { + self.infer_batch()?; + } + Ok(read) + } + + pub fn finish(mut self) -> Result<(Schema, usize), ArrowError> { + self.infer_batch()?; + Ok((self.schema.into_schema()?, self.record_count)) + } + + fn infer_batch(&mut self) -> Result<(), ArrowError> { + let tape = self.decoder.finish()?; + + let remaining_records = self + .max_read_records + .map_or(usize::MAX, |max| max - self.record_count); + + let records = tape + .iter_rows() + .map(|idx| TapeValue::new(&tape, idx)) + .take(remaining_records); + + for record in records { + self.schema = infer_json_type(record, self.schema, self.arena)?; + self.record_count += 1; + } + + self.decoder.clear(); + Ok(()) + } +} + #[cfg(test)] mod tests { use super::*; @@ -306,7 +377,7 @@ mod tests { let re = infer_json_schema_from_seekable(Cursor::new(b"}"), None); assert_eq!( re.err().unwrap().to_string(), - "Json error: Not valid JSON: expected value at line 1 column 1", + "Json error: Encountered unexpected '}' whilst parsing value" ); } @@ -320,14 +391,14 @@ mod tests { let (inferred_schema, _) = infer_json_schema_from_seekable(Cursor::new(data), None).expect("infer"); let schema = Schema::new(vec![ - Field::new("an", list_type_of(DataType::Null), true), Field::new("in", DataType::Int64, true), - Field::new("n", DataType::Null, true), - Field::new("na", list_type_of(DataType::Null), true), - Field::new("nas", list_type_of(DataType::Utf8), true), Field::new("ni", DataType::Int64, true), Field::new("ns", DataType::Utf8, true), Field::new("sn", DataType::Utf8, true), + Field::new("n", DataType::Null, true), + Field::new("an", list_type_of(DataType::Null), true), + Field::new("na", list_type_of(DataType::Null), true), + Field::new("nas", list_type_of(DataType::Utf8), true), ]); assert_eq!(inferred_schema, schema); } diff --git a/arrow-json/src/reader/schema/infer.rs b/arrow-json/src/reader/schema/infer.rs index bac3b70ff140..038c8d38f80e 100644 --- a/arrow-json/src/reader/schema/infer.rs +++ b/arrow-json/src/reader/schema/infer.rs @@ -20,6 +20,8 @@ use std::collections::HashMap; use arrow_schema::{ArrowError, DataType, Field, Fields, Schema}; use bumpalo::Bump; +use crate::reader::tape::{Tape, TapeElement}; + #[derive(Clone, Copy, Debug)] pub struct InferredType<'t>(&'t TyKind<'t>); @@ -257,6 +259,54 @@ pub enum JsonType { Object, } +#[derive(Copy, Clone, Debug)] +pub struct TapeValue<'a> { + tape: &'a Tape<'a>, + idx: u32, +} + +impl<'a> TapeValue<'a> { + pub fn new(tape: &'a Tape<'a>, idx: u32) -> Self { + Self { tape, idx } + } +} + +impl<'a> JsonValue<'a> for TapeValue<'a> { + fn get(&self) -> JsonType { + match self.tape.get(self.idx) { + TapeElement::Null => JsonType::Null, + TapeElement::False => JsonType::Bool, + TapeElement::True => JsonType::Bool, + TapeElement::I64(_) | TapeElement::I32(_) => JsonType::Int64, + TapeElement::F64(_) | TapeElement::F32(_) => JsonType::Float64, + TapeElement::Number(s) => { + if self.tape.get_string(s).parse::().is_ok() { + JsonType::Int64 + } else { + JsonType::Float64 + } + } + TapeElement::String(_) => JsonType::String, + TapeElement::StartList(_) => JsonType::Array, + TapeElement::EndList(_) => unreachable!(), + TapeElement::StartObject(_) => JsonType::Object, + TapeElement::EndObject(_) => unreachable!(), + } + } + + fn elements(&self) -> impl Iterator { + self.tape + .iter_elements(self.idx) + .map(move |idx| Self { idx, ..*self }) + } + + fn fields(&self) -> impl Iterator { + self.tape + .iter_fields(self.idx) + .map(move |(key, idx)| (key, Self { idx, ..*self })) + } +} + impl<'a> JsonValue<'a> for &'a serde_json::Value { fn get(&self) -> JsonType { use serde_json::Value; diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs index 89ee3f778765..4ea8dce2f668 100644 --- a/arrow-json/src/reader/tape.rs +++ b/arrow-json/src/reader/tape.rs @@ -142,6 +142,55 @@ impl<'a> Tape<'a> { self.num_rows } + /// Iterates over the rows of the tape + pub fn iter_rows(&self) -> impl Iterator { + self.iter_values(0, self.elements.len() as u32) + } + + /// Iterates over the elements of the array starting at `idx` + pub fn iter_elements(&self, idx: u32) -> impl Iterator { + let end = match self.get(idx) { + TapeElement::StartList(end) => end, + elem => panic!("Expected the start of a list, found {:?}", elem), + }; + + self.iter_values(idx, end) + } + + /// Iterates over the fields of the objected starting at `idx` + pub fn iter_fields(&self, idx: u32) -> impl Iterator { + let end = match self.get(idx) { + TapeElement::StartObject(end) => end, + elem => panic!("Expected the start of an object, found {:?}", elem), + }; + + let mut idx = idx + 1; + + std::iter::from_fn(move || { + (idx < end).then(|| { + let key = match self.get(idx) { + TapeElement::String(s) => self.get_string(s), + elem => panic!("Expected a string, found {:?}", elem), + }; + let value_idx = idx + 1; + idx = self.next(value_idx, "field value").unwrap(); + (key, value_idx) + }) + }) + } + + fn iter_values(&self, start: u32, end: u32) -> impl Iterator { + let mut idx = start + 1; + + std::iter::from_fn(move || { + (idx < end).then(|| { + let value_id = idx; + idx = self.next(idx, "value").unwrap(); + value_id + }) + }) + } + /// Serialize the tape element at index `idx` to `out` returning the next field index fn serialize(&self, out: &mut String, idx: u32) -> u32 { match self.get(idx) { From e82462abfaac3e0a4d559ad3a73aaec0466ba066 Mon Sep 17 00:00:00 2001 From: Alexander Rafferty Date: Sun, 1 Mar 2026 00:00:59 +1100 Subject: [PATCH 04/10] Fix licenses, clippy fixes, remove unused `indexmap` crate --- arrow-json/Cargo.toml | 1 - arrow-json/src/reader/schema/infer.rs | 17 ++++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 8faae63b8de4..d533d9537534 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -42,7 +42,6 @@ arrow-cast = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } half = { version = "2.1", default-features = false } -indexmap = { version = "2.0", default-features = false, features = ["std"] } num-traits = { version = "0.2.19", default-features = false, features = ["std"] } serde_core = { version = "1.0", default-features = false } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/arrow-json/src/reader/schema/infer.rs b/arrow-json/src/reader/schema/infer.rs index 038c8d38f80e..ee944dbab968 100644 --- a/arrow-json/src/reader/schema/infer.rs +++ b/arrow-json/src/reader/schema/infer.rs @@ -2,11 +2,11 @@ // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.kind() (the +// to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.kind() +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an @@ -105,15 +105,14 @@ pub fn infer_json_type<'a, 't>( let elem = value .elements() .try_fold(expected_elem, |expected, value| { - let result = infer_json_type(value, expected, arena); - result + infer_json_type(value, expected, arena) })?; - Ok(if elem.ptr_eq(expected_elem) { - expected - } else { - InferredType::new_array(elem, arena) - }) + if elem.ptr_eq(expected_elem) { + return Ok(expected); + } + + Ok(InferredType::new_array(elem, arena)) } JsonType::Object => { let (expected, expected_fields) = match *expected.kind() { From 52f21396d3f6dc5b8dabb1fefa31059eeb11bd1e Mon Sep 17 00:00:00 2001 From: Alexander Rafferty Date: Tue, 17 Mar 2026 21:30:37 +1100 Subject: [PATCH 05/10] Removed `bumpalo`, and use `Arc`s instead --- arrow-json/Cargo.toml | 1 - arrow-json/src/reader/schema.rs | 26 ++--- arrow-json/src/reader/schema/infer.rs | 155 ++++++++++++++------------ 3 files changed, 94 insertions(+), 88 deletions(-) diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index d533d9537534..bb5f772d09bf 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -51,7 +51,6 @@ memchr = "2.7.4" simdutf8 = { workspace = true } ryu = "1.0" itoa = "1.0" -bumpalo = "3.20.2" [dev-dependencies] flate2 = { version = "1", default-features = false, features = ["rust_backend"] } diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs index d1cd94012411..f37cbff1cdf5 100644 --- a/arrow-json/src/reader/schema.rs +++ b/arrow-json/src/reader/schema.rs @@ -19,11 +19,10 @@ use std::borrow::Borrow; use std::io::{BufRead, Seek}; use arrow_schema::{ArrowError, Schema}; -use bumpalo::Bump; use serde_json::Value; use super::tape::TapeDecoder; -use infer::{ANY_TY, EMPTY_OBJECT_TY, InferredType, TapeValue, infer_json_type}; +use infer::{ANY_TY, EMPTY_OBJECT_TY, InferTy, TapeValue, infer_json_type}; mod infer; @@ -103,8 +102,7 @@ pub fn infer_json_schema( mut reader: R, max_read_records: Option, ) -> Result<(Schema, usize), ArrowError> { - let arena = Bump::new(); - let mut decoder = SchemaDecoder::new(max_read_records, &arena); + let mut decoder = SchemaDecoder::new(max_read_records); loop { let buf = reader.fill_buf()?; @@ -143,32 +141,28 @@ where I: Iterator>, V: Borrow, { - let arena = &Bump::new(); - value_iter .into_iter() - .try_fold(EMPTY_OBJECT_TY, |ty, record| { - infer_json_type(record?.borrow(), ty, arena) + .try_fold(EMPTY_OBJECT_TY.clone(), |ty, record| { + infer_json_type(record?.borrow(), ty) })? .into_schema() } -struct SchemaDecoder<'a> { +struct SchemaDecoder { decoder: TapeDecoder, max_read_records: Option, record_count: usize, - schema: InferredType<'a>, - arena: &'a Bump, + schema: InferTy, } -impl<'a> SchemaDecoder<'a> { - pub fn new(max_read_records: Option, arena: &'a Bump) -> Self { +impl SchemaDecoder { + pub fn new(max_read_records: Option) -> Self { Self { decoder: TapeDecoder::new(1024, 8), max_read_records, record_count: 0, - schema: ANY_TY, - arena, + schema: ANY_TY.clone(), } } @@ -198,7 +192,7 @@ impl<'a> SchemaDecoder<'a> { .take(remaining_records); for record in records { - self.schema = infer_json_type(record, self.schema, self.arena)?; + self.schema = infer_json_type(record, self.schema.clone())?; self.record_count += 1; } diff --git a/arrow-json/src/reader/schema/infer.rs b/arrow-json/src/reader/schema/infer.rs index ee944dbab968..2649cd893d2e 100644 --- a/arrow-json/src/reader/schema/infer.rs +++ b/arrow-json/src/reader/schema/infer.rs @@ -16,21 +16,21 @@ // under the License. use std::collections::HashMap; +use std::sync::{Arc, LazyLock}; use arrow_schema::{ArrowError, DataType, Field, Fields, Schema}; -use bumpalo::Bump; use crate::reader::tape::{Tape, TapeElement}; -#[derive(Clone, Copy, Debug)] -pub struct InferredType<'t>(&'t TyKind<'t>); +#[derive(Clone, Debug)] +pub struct InferTy(Arc); -#[derive(Clone, Copy, Debug)] -enum TyKind<'t> { +#[derive(Clone, Debug)] +enum TyKind { Any, Scalar(ScalarTy), - Array(InferredType<'t>), - Object(&'t [(&'t str, InferredType<'t>)]), + Array(InferTy), + Object(Arc<[(Arc, InferTy)]>), } #[derive(Clone, Copy, PartialEq, Eq, Debug)] @@ -39,23 +39,24 @@ enum ScalarTy { Int64, Float64, String, - // NOTE: Null isn't needed because it's absorbed into Never + // NOTE: Null isn't needed because it's absorbed into Any } -pub static ANY_TY: InferredType<'static> = InferredType(&TyKind::Any); -pub static BOOL_TY: InferredType<'static> = InferredType(&TyKind::Scalar(ScalarTy::Bool)); -pub static INT64_TY: InferredType<'static> = InferredType(&TyKind::Scalar(ScalarTy::Int64)); -pub static FLOAT64_TY: InferredType<'static> = InferredType(&TyKind::Scalar(ScalarTy::Float64)); -pub static STRING_TY: InferredType<'static> = InferredType(&TyKind::Scalar(ScalarTy::String)); -pub static ARRAY_OF_ANY_TY: InferredType<'static> = InferredType(&TyKind::Array(ANY_TY)); -pub static EMPTY_OBJECT_TY: InferredType<'static> = InferredType(&TyKind::Object(&[])); +pub static ANY_TY: LazyLock = LazyLock::new(|| InferTy::new_any()); +pub static BOOL_TY: LazyLock = LazyLock::new(|| InferTy::new_scalar(ScalarTy::Bool)); +pub static INT64_TY: LazyLock = LazyLock::new(|| InferTy::new_scalar(ScalarTy::Int64)); +pub static FLOAT64_TY: LazyLock = LazyLock::new(|| InferTy::new_scalar(ScalarTy::Float64)); +pub static STRING_TY: LazyLock = LazyLock::new(|| InferTy::new_scalar(ScalarTy::String)); +pub static ARRAY_OF_ANY_TY: LazyLock = LazyLock::new(|| InferTy::new_array(&*ANY_TY)); +pub static EMPTY_FIELDS: LazyLock, InferTy)]>> = LazyLock::new(|| Arc::new([])); +pub static EMPTY_OBJECT_TY: LazyLock = + LazyLock::new(|| InferTy::new_object(EMPTY_FIELDS.clone())); /// Infers the type of the provided JSON value, given an expected type. -pub fn infer_json_type<'a, 't>( +pub fn infer_json_type<'a>( value: impl JsonValue<'a>, - expected: InferredType<'t>, - arena: &'t Bump, -) -> Result, ArrowError> { + expected: InferTy, +) -> Result { let make_err = |got| { let expected = match expected.kind() { TyKind::Any => unreachable!(), @@ -70,20 +71,20 @@ pub fn infer_json_type<'a, 't>( let infer_scalar = |scalar: ScalarTy, got: &'static str| { Ok(match expected.kind() { TyKind::Any => match scalar { - ScalarTy::Bool => BOOL_TY, - ScalarTy::Int64 => INT64_TY, - ScalarTy::Float64 => FLOAT64_TY, - ScalarTy::String => STRING_TY, + ScalarTy::Bool => BOOL_TY.clone(), + ScalarTy::Int64 => INT64_TY.clone(), + ScalarTy::Float64 => FLOAT64_TY.clone(), + ScalarTy::String => STRING_TY.clone(), }, TyKind::Scalar(expect) => match (expect, &scalar) { - (ScalarTy::Bool, ScalarTy::Bool) => BOOL_TY, - (ScalarTy::Int64, ScalarTy::Int64) => INT64_TY, + (ScalarTy::Bool, ScalarTy::Bool) => BOOL_TY.clone(), + (ScalarTy::Int64, ScalarTy::Int64) => INT64_TY.clone(), // Mixed numbers coerce to f64 (ScalarTy::Int64 | ScalarTy::Float64, ScalarTy::Int64 | ScalarTy::Float64) => { - FLOAT64_TY + FLOAT64_TY.clone() } // Any other combination coerces to string - _ => STRING_TY, + _ => STRING_TY.clone(), }, _ => Err(make_err(got))?, }) @@ -96,54 +97,52 @@ pub fn infer_json_type<'a, 't>( JsonType::Float64 => infer_scalar(ScalarTy::Float64, "a number"), JsonType::String => infer_scalar(ScalarTy::String, "a string"), JsonType::Array => { - let (expected, expected_elem) = match *expected.kind() { - TyKind::Any => (ARRAY_OF_ANY_TY, ANY_TY), - TyKind::Array(inner) => (expected, inner), + let (expected_elem, expected) = match expected.kind() { + TyKind::Any => (ANY_TY.clone(), ARRAY_OF_ANY_TY.clone()), + TyKind::Array(inner) => (inner.clone(), expected), _ => Err(make_err("an array"))?, }; let elem = value .elements() - .try_fold(expected_elem, |expected, value| { - infer_json_type(value, expected, arena) + .try_fold(expected_elem.clone(), |expected, value| { + infer_json_type(value, expected) })?; - if elem.ptr_eq(expected_elem) { + if elem.ptr_eq(&expected_elem) { return Ok(expected); } - Ok(InferredType::new_array(elem, arena)) + Ok(InferTy::new_array(elem)) } JsonType::Object => { - let (expected, expected_fields) = match *expected.kind() { - TyKind::Any => (EMPTY_OBJECT_TY, &[] as &[_]), - TyKind::Object(fields) => (expected, fields), + let (expected_fields, expected) = match expected.kind() { + TyKind::Any => (EMPTY_FIELDS.clone(), EMPTY_OBJECT_TY.clone()), + TyKind::Object(fields) => (fields.clone(), expected), _ => Err(make_err("an object"))?, }; let mut num_fields = expected_fields.len(); - let mut substs = HashMap::)>::new(); + let mut substs = HashMap::, InferTy)>::new(); for (key, value) in value.fields() { let existing_field = expected_fields .iter() - .copied() .enumerate() - .find(|(_, (existing_key, _))| *existing_key == key); + .find(|(_, (existing_key, _))| &**existing_key == key); match existing_field { Some((field_idx, (key, expect_ty))) => { - let ty = infer_json_type(value, expect_ty, arena)?; + let ty = infer_json_type(value, expect_ty.clone())?; if !ty.ptr_eq(expect_ty) { - substs.insert(field_idx, (key, ty)); + substs.insert(field_idx, (key.clone(), ty)); } } None => { let field_idx = num_fields; num_fields += 1; - let key = arena.alloc_str(key); - let ty = infer_json_type(value, ANY_TY, arena)?; - substs.insert(field_idx, (key, ty)); + let ty = infer_json_type(value, ANY_TY.clone())?; + substs.insert(field_idx, (key.into(), ty)); } }; } @@ -152,54 +151,62 @@ pub fn infer_json_type<'a, 't>( return Ok(expected); } - let fields = (0..num_fields).map(|idx| match substs.get(&idx) { - Some(subst) => *subst, - None => expected_fields[idx], - }); + let fields = (0..num_fields) + .map(|idx| match substs.remove(&idx) { + Some(subst) => subst, + None => expected_fields[idx].clone(), + }) + .collect(); - Ok(InferredType::new_object(fields, arena)) + Ok(InferTy::new_object(fields)) } } } -impl<'t> InferredType<'t> { - fn new_array(inner: InferredType<'t>, arena: &'t Bump) -> Self { - Self(arena.alloc(TyKind::Array(inner))) +impl InferTy { + fn new_any() -> Self { + Self(TyKind::Any.into()) } - fn new_object(fields: F, arena: &'t Bump) -> Self - where - F: IntoIterator)>, - F::IntoIter: ExactSizeIterator, - { - let fields = arena.alloc_slice_fill_iter(fields); - Self(arena.alloc(TyKind::Object(fields))) + fn new_scalar(scalar: ScalarTy) -> Self { + Self(TyKind::Scalar(scalar).into()) } - fn kind(self) -> &'t TyKind<'t> { - self.0 + fn new_array(inner: impl Into) -> Self { + Self(TyKind::Array(inner.into()).into()) } - fn ptr_eq(self, other: Self) -> bool { - std::ptr::eq(self.kind(), other.kind()) + fn new_object(fields: Arc<[(Arc, InferTy)]>) -> Self { + Self(TyKind::Object(fields).into()) } - pub fn into_datatype(self) -> DataType { + fn kind(&self) -> &TyKind { + &*self.0 + } + + fn ptr_eq(&self, other: &Self) -> bool { + Arc::ptr_eq(&self.0, &other.0) + } + + pub fn into_datatype(&self) -> DataType { match self.kind() { TyKind::Any => DataType::Null, TyKind::Scalar(s) => s.into_datatype(), TyKind::Array(elem) => DataType::List(elem.into_list_field().into()), - TyKind::Object(fields) => { - DataType::Struct(fields.iter().map(|(key, ty)| ty.into_field(*key)).collect()) - } + TyKind::Object(fields) => DataType::Struct( + fields + .iter() + .map(|(key, ty)| ty.into_field(&**key)) + .collect(), + ), } } - pub fn into_field(self, name: impl Into) -> Field { + pub fn into_field(&self, name: impl Into) -> Field { Field::new(name, self.into_datatype(), true) } - pub fn into_list_field(self) -> Field { + pub fn into_list_field(&self) -> Field { Field::new_list_field(self.into_datatype(), true) } @@ -212,13 +219,19 @@ impl<'t> InferredType<'t> { let fields = fields .iter() - .map(|(key, ty)| ty.into_field(*key)) + .map(|(key, ty)| ty.into_field(&**key)) .collect::(); Ok(Schema::new(fields)) } } +impl From<&InferTy> for InferTy { + fn from(value: &InferTy) -> Self { + Self(value.0.clone()) + } +} + impl ScalarTy { fn into_datatype(self) -> DataType { match self { From 1b5d16b1593b0c215bd7d98829fcc21330ea8ea3 Mon Sep 17 00:00:00 2001 From: Alexander Rafferty Date: Tue, 17 Mar 2026 21:35:52 +1100 Subject: [PATCH 06/10] Fix clippy errors --- arrow-json/src/reader/schema/infer.rs | 33 +++++++++++++-------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/arrow-json/src/reader/schema/infer.rs b/arrow-json/src/reader/schema/infer.rs index 2649cd893d2e..f4717eb41013 100644 --- a/arrow-json/src/reader/schema/infer.rs +++ b/arrow-json/src/reader/schema/infer.rs @@ -30,9 +30,11 @@ enum TyKind { Any, Scalar(ScalarTy), Array(InferTy), - Object(Arc<[(Arc, InferTy)]>), + Object(Arc<[InferField]>), } +pub type InferField = (Arc, InferTy); + #[derive(Clone, Copy, PartialEq, Eq, Debug)] enum ScalarTy { Bool, @@ -42,13 +44,13 @@ enum ScalarTy { // NOTE: Null isn't needed because it's absorbed into Any } -pub static ANY_TY: LazyLock = LazyLock::new(|| InferTy::new_any()); +pub static ANY_TY: LazyLock = LazyLock::new(InferTy::new_any); pub static BOOL_TY: LazyLock = LazyLock::new(|| InferTy::new_scalar(ScalarTy::Bool)); pub static INT64_TY: LazyLock = LazyLock::new(|| InferTy::new_scalar(ScalarTy::Int64)); pub static FLOAT64_TY: LazyLock = LazyLock::new(|| InferTy::new_scalar(ScalarTy::Float64)); pub static STRING_TY: LazyLock = LazyLock::new(|| InferTy::new_scalar(ScalarTy::String)); pub static ARRAY_OF_ANY_TY: LazyLock = LazyLock::new(|| InferTy::new_array(&*ANY_TY)); -pub static EMPTY_FIELDS: LazyLock, InferTy)]>> = LazyLock::new(|| Arc::new([])); +pub static EMPTY_FIELDS: LazyLock> = LazyLock::new(|| Arc::new([])); pub static EMPTY_OBJECT_TY: LazyLock = LazyLock::new(|| InferTy::new_object(EMPTY_FIELDS.clone())); @@ -181,33 +183,30 @@ impl InferTy { } fn kind(&self) -> &TyKind { - &*self.0 + &self.0 } fn ptr_eq(&self, other: &Self) -> bool { Arc::ptr_eq(&self.0, &other.0) } - pub fn into_datatype(&self) -> DataType { + pub fn to_datatype(&self) -> DataType { match self.kind() { TyKind::Any => DataType::Null, TyKind::Scalar(s) => s.into_datatype(), - TyKind::Array(elem) => DataType::List(elem.into_list_field().into()), - TyKind::Object(fields) => DataType::Struct( - fields - .iter() - .map(|(key, ty)| ty.into_field(&**key)) - .collect(), - ), + TyKind::Array(elem) => DataType::List(elem.to_list_field().into()), + TyKind::Object(fields) => { + DataType::Struct(fields.iter().map(|(key, ty)| ty.to_field(&**key)).collect()) + } } } - pub fn into_field(&self, name: impl Into) -> Field { - Field::new(name, self.into_datatype(), true) + pub fn to_field(&self, name: impl Into) -> Field { + Field::new(name, self.to_datatype(), true) } - pub fn into_list_field(&self) -> Field { - Field::new_list_field(self.into_datatype(), true) + pub fn to_list_field(&self) -> Field { + Field::new_list_field(self.to_datatype(), true) } pub fn into_schema(self) -> Result { @@ -219,7 +218,7 @@ impl InferTy { let fields = fields .iter() - .map(|(key, ty)| ty.into_field(&**key)) + .map(|(key, ty)| ty.to_field(&**key)) .collect::(); Ok(Schema::new(fields)) From 6ee6985fdeb8e1d6d5be694ca9e84db61b5594c6 Mon Sep 17 00:00:00 2001 From: Alexander Rafferty Date: Sat, 21 Mar 2026 12:17:41 +1100 Subject: [PATCH 07/10] PR feedback and refactoring --- arrow-json/src/reader/schema.rs | 16 +- arrow-json/src/reader/schema/infer.rs | 396 +++++++++------------- arrow-json/src/reader/schema/json_type.rs | 133 ++++++++ 3 files changed, 313 insertions(+), 232 deletions(-) create mode 100644 arrow-json/src/reader/schema/json_type.rs diff --git a/arrow-json/src/reader/schema.rs b/arrow-json/src/reader/schema.rs index f37cbff1cdf5..1558ac132581 100644 --- a/arrow-json/src/reader/schema.rs +++ b/arrow-json/src/reader/schema.rs @@ -21,10 +21,12 @@ use std::io::{BufRead, Seek}; use arrow_schema::{ArrowError, Schema}; use serde_json::Value; +use self::infer::{InferTy, infer_json_type}; +use self::json_type::TapeValue; use super::tape::TapeDecoder; -use infer::{ANY_TY, EMPTY_OBJECT_TY, InferTy, TapeValue, infer_json_type}; mod infer; +mod json_type; /// Infer the fields of a JSON file by reading the first n records of the file, with /// `max_read_records` controlling the maximum number of records to read. @@ -143,7 +145,7 @@ where { value_iter .into_iter() - .try_fold(EMPTY_OBJECT_TY.clone(), |ty, record| { + .try_fold(InferTy::any(), |ty, record| { infer_json_type(record?.borrow(), ty) })? .into_schema() @@ -162,7 +164,7 @@ impl SchemaDecoder { decoder: TapeDecoder::new(1024, 8), max_read_records, record_count: 0, - schema: ANY_TY.clone(), + schema: InferTy::empty_object(), } } @@ -217,6 +219,14 @@ mod tests { DataType::List(Arc::new(Field::new_list_field(ty, true))) } + #[test] + fn test_empty_input_infers_empty_schema() { + let (inferred_schema, n_rows) = infer_json_schema_from_seekable(Cursor::new(b""), None) + .expect("failed to infer schema"); + assert_eq!(inferred_schema, Schema::empty()); + assert_eq!(n_rows, 0); + } + #[test] fn test_json_infer_schema() { let schema = Schema::new(vec![ diff --git a/arrow-json/src/reader/schema/infer.rs b/arrow-json/src/reader/schema/infer.rs index f4717eb41013..121f3dd725f9 100644 --- a/arrow-json/src/reader/schema/infer.rs +++ b/arrow-json/src/reader/schema/infer.rs @@ -16,15 +16,18 @@ // under the License. use std::collections::HashMap; +use std::fmt::Display; use std::sync::{Arc, LazyLock}; use arrow_schema::{ArrowError, DataType, Field, Fields, Schema}; -use crate::reader::tape::{Tape, TapeElement}; +use super::json_type::{JsonType, JsonValue}; +/// Represents an inferred JSON type. #[derive(Clone, Debug)] -pub struct InferTy(Arc); +pub(crate) struct InferTy(Arc); +/// The possible variants of an `InferTy`. #[derive(Clone, Debug)] enum TyKind { Any, @@ -33,145 +36,169 @@ enum TyKind { Object(Arc<[InferField]>), } -pub type InferField = (Arc, InferTy); +/// A field in an inferred object type. +type InferField = (Arc, InferTy); +/// An inferred scalar type. #[derive(Clone, Copy, PartialEq, Eq, Debug)] enum ScalarTy { Bool, Int64, Float64, String, - // NOTE: Null isn't needed because it's absorbed into Any } -pub static ANY_TY: LazyLock = LazyLock::new(InferTy::new_any); -pub static BOOL_TY: LazyLock = LazyLock::new(|| InferTy::new_scalar(ScalarTy::Bool)); -pub static INT64_TY: LazyLock = LazyLock::new(|| InferTy::new_scalar(ScalarTy::Int64)); -pub static FLOAT64_TY: LazyLock = LazyLock::new(|| InferTy::new_scalar(ScalarTy::Float64)); -pub static STRING_TY: LazyLock = LazyLock::new(|| InferTy::new_scalar(ScalarTy::String)); -pub static ARRAY_OF_ANY_TY: LazyLock = LazyLock::new(|| InferTy::new_array(&*ANY_TY)); -pub static EMPTY_FIELDS: LazyLock> = LazyLock::new(|| Arc::new([])); -pub static EMPTY_OBJECT_TY: LazyLock = - LazyLock::new(|| InferTy::new_object(EMPTY_FIELDS.clone())); - -/// Infers the type of the provided JSON value, given an expected type. -pub fn infer_json_type<'a>( - value: impl JsonValue<'a>, - expected: InferTy, -) -> Result { - let make_err = |got| { - let expected = match expected.kind() { - TyKind::Any => unreachable!(), - TyKind::Scalar(_) => "a scalar value", - TyKind::Array(_) => "an array", - TyKind::Object(_) => "an object", - }; - let msg = format!("Expected {expected}, found {got}"); - ArrowError::JsonError(msg) - }; - - let infer_scalar = |scalar: ScalarTy, got: &'static str| { - Ok(match expected.kind() { - TyKind::Any => match scalar { - ScalarTy::Bool => BOOL_TY.clone(), - ScalarTy::Int64 => INT64_TY.clone(), - ScalarTy::Float64 => FLOAT64_TY.clone(), - ScalarTy::String => STRING_TY.clone(), - }, - TyKind::Scalar(expect) => match (expect, &scalar) { - (ScalarTy::Bool, ScalarTy::Bool) => BOOL_TY.clone(), - (ScalarTy::Int64, ScalarTy::Int64) => INT64_TY.clone(), - // Mixed numbers coerce to f64 - (ScalarTy::Int64 | ScalarTy::Float64, ScalarTy::Int64 | ScalarTy::Float64) => { - FLOAT64_TY.clone() - } - // Any other combination coerces to string - _ => STRING_TY.clone(), - }, - _ => Err(make_err(got))?, - }) - }; +/// During the process of schema inference, types are frequently produced and discarded. +/// As an optimisation to avoid excess heap allocations, a few common types +/// are instantiated here so they can be reused. +static COMMON_TYS: LazyLock = LazyLock::new(|| CommonTypes { + any: InferTy(TyKind::Any.into()), + bool: InferTy(TyKind::Scalar(ScalarTy::Bool).into()), + int64: InferTy(TyKind::Scalar(ScalarTy::Int64).into()), + float64: InferTy(TyKind::Scalar(ScalarTy::Float64).into()), + string: InferTy(TyKind::Scalar(ScalarTy::String).into()), + array_of_any: InferTy::new_array(InferTy(TyKind::Any.into())), + empty_object: InferTy::new_object(Arc::new([])), +}); + +struct CommonTypes { + any: InferTy, + bool: InferTy, + int64: InferTy, + float64: InferTy, + string: InferTy, + array_of_any: InferTy, + empty_object: InferTy, +} +/// Infers the type of a JSON value, given an expected type. +pub fn infer_json_type<'a, T>(value: T, expected: InferTy) -> Result +where + T: JsonValue<'a>, +{ match value.get() { JsonType::Null => Ok(expected), - JsonType::Bool => infer_scalar(ScalarTy::Bool, "a boolean"), - JsonType::Int64 => infer_scalar(ScalarTy::Int64, "a number"), - JsonType::Float64 => infer_scalar(ScalarTy::Float64, "a number"), - JsonType::String => infer_scalar(ScalarTy::String, "a string"), - JsonType::Array => { - let (expected_elem, expected) = match expected.kind() { - TyKind::Any => (ANY_TY.clone(), ARRAY_OF_ANY_TY.clone()), - TyKind::Array(inner) => (inner.clone(), expected), - _ => Err(make_err("an array"))?, - }; - - let elem = value - .elements() - .try_fold(expected_elem.clone(), |expected, value| { - infer_json_type(value, expected) - })?; - - if elem.ptr_eq(&expected_elem) { - return Ok(expected); - } + JsonType::Bool => infer_scalar(ScalarTy::Bool, expected), + JsonType::Int64 => infer_scalar(ScalarTy::Int64, expected), + JsonType::Float64 => infer_scalar(ScalarTy::Float64, expected), + JsonType::String => infer_scalar(ScalarTy::String, expected), + JsonType::Array => infer_array(value.elements(), expected), + JsonType::Object => infer_object(value.fields(), expected), + } +} - Ok(InferTy::new_array(elem)) - } - JsonType::Object => { - let (expected_fields, expected) = match expected.kind() { - TyKind::Any => (EMPTY_FIELDS.clone(), EMPTY_OBJECT_TY.clone()), - TyKind::Object(fields) => (fields.clone(), expected), - _ => Err(make_err("an object"))?, - }; - - let mut num_fields = expected_fields.len(); - let mut substs = HashMap::, InferTy)>::new(); - - for (key, value) in value.fields() { - let existing_field = expected_fields - .iter() - .enumerate() - .find(|(_, (existing_key, _))| &**existing_key == key); - - match existing_field { - Some((field_idx, (key, expect_ty))) => { - let ty = infer_json_type(value, expect_ty.clone())?; - if !ty.ptr_eq(expect_ty) { - substs.insert(field_idx, (key.clone(), ty)); - } - } - None => { - let field_idx = num_fields; - num_fields += 1; - let ty = infer_json_type(value, ANY_TY.clone())?; - substs.insert(field_idx, (key.into(), ty)); - } - }; +/// Infers the type of a scalar JSON value, given an expected type. +fn infer_scalar(scalar: ScalarTy, expected: InferTy) -> Result { + Ok(match expected.kind() { + TyKind::Any => match scalar { + ScalarTy::Bool => COMMON_TYS.bool.clone(), + ScalarTy::Int64 => COMMON_TYS.int64.clone(), + ScalarTy::Float64 => COMMON_TYS.float64.clone(), + ScalarTy::String => COMMON_TYS.string.clone(), + }, + TyKind::Scalar(expect) => match (expect, &scalar) { + (ScalarTy::Bool, ScalarTy::Bool) => COMMON_TYS.bool.clone(), + (ScalarTy::Int64, ScalarTy::Int64) => COMMON_TYS.int64.clone(), + // Mixed numbers coerce to f64 + (ScalarTy::Int64 | ScalarTy::Float64, ScalarTy::Int64 | ScalarTy::Float64) => { + COMMON_TYS.float64.clone() } + // Any other combination coerces to string + _ => COMMON_TYS.string.clone(), + }, + _ => Err(ArrowError::JsonError(format!( + "Expected {expected}, found {scalar}" + )))?, + }) +} - if substs.is_empty() { - return Ok(expected); - } +/// Infers the type of a JSON array, given an expected type. +fn infer_array<'a, I>(mut elements: I, mut expected: InferTy) -> Result +where + I: Iterator, + I::Item: JsonValue<'a>, +{ + if let TyKind::Any = expected.kind() { + expected = COMMON_TYS.array_of_any.clone(); + } + + let (expected_elem, expected) = match expected.kind() { + TyKind::Array(inner) => (inner.clone(), expected), + _ => Err(ArrowError::JsonError(format!( + "Expected {expected}, found an array" + )))?, + }; - let fields = (0..num_fields) - .map(|idx| match substs.remove(&idx) { - Some(subst) => subst, - None => expected_fields[idx].clone(), - }) - .collect(); + let elem = elements.try_fold(expected_elem.clone(), |expected, value| { + infer_json_type(value, expected) + })?; - Ok(InferTy::new_object(fields)) - } + if elem.ptr_eq(&expected_elem) { + return Ok(expected); } + + Ok(InferTy::new_array(elem)) } -impl InferTy { - fn new_any() -> Self { - Self(TyKind::Any.into()) +/// Infers the type of a JSON object, given an expected type. +fn infer_object<'a, I, T>(fields: I, mut expected: InferTy) -> Result +where + I: Iterator, + T: JsonValue<'a>, +{ + if let TyKind::Any = expected.kind() { + expected = COMMON_TYS.empty_object.clone(); + } + + let (expected_fields, expected) = match expected.kind() { + TyKind::Object(fields) => (fields.clone(), expected), + _ => Err(ArrowError::JsonError(format!( + "Expected {expected}, found an object" + )))?, + }; + + let mut num_fields = expected_fields.len(); + let mut substs = HashMap::, InferTy)>::new(); + + for (key, value) in fields { + let existing_field = expected_fields + .iter() + .enumerate() + .find(|(_, (existing_key, _))| &**existing_key == key); + + match existing_field { + Some((field_idx, (key, expect_ty))) => { + let ty = infer_json_type(value, expect_ty.clone())?; + if !ty.ptr_eq(expect_ty) { + substs.insert(field_idx, (key.clone(), ty)); + } + } + None => { + let field_idx = num_fields; + num_fields += 1; + let ty = infer_json_type(value, COMMON_TYS.any.clone())?; + substs.insert(field_idx, (key.into(), ty)); + } + }; + } + + if substs.is_empty() { + return Ok(expected); } - fn new_scalar(scalar: ScalarTy) -> Self { - Self(TyKind::Scalar(scalar).into()) + let fields = (0..num_fields) + .map(|idx| match substs.remove(&idx) { + Some(subst) => subst, + None => expected_fields[idx].clone(), + }) + .collect(); + + Ok(InferTy::new_object(fields)) +} + +impl InferTy { + pub fn any() -> Self { + COMMON_TYS.any.clone() } fn new_array(inner: impl Into) -> Self { @@ -182,6 +209,10 @@ impl InferTy { Self(TyKind::Object(fields).into()) } + pub fn empty_object() -> Self { + COMMON_TYS.empty_object.clone() + } + fn kind(&self) -> &TyKind { &self.0 } @@ -231,6 +262,17 @@ impl From<&InferTy> for InferTy { } } +impl Display for InferTy { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self.kind() { + TyKind::Any => write!(f, "any value"), + TyKind::Scalar(s) => write!(f, "{}", s), + TyKind::Array(_) => write!(f, "an array"), + TyKind::Object(_) => write!(f, "an object"), + } + } +} + impl ScalarTy { fn into_datatype(self) -> DataType { match self { @@ -242,117 +284,13 @@ impl ScalarTy { } } -/// Abstraction of a JSON value that is only concerned with the -/// type of the value rather than the value itself. -pub trait JsonValue<'a> { - /// Gets the type of this JSON value. - fn get(&self) -> JsonType; - - /// Returns an iterator over the elements of this JSON array. - /// - /// Panics if the JSON value is not an array. - fn elements(&self) -> impl Iterator; - - /// Returns an iterator over the fields of this JSON object. - /// - /// Panics if the JSON value is not an object. - fn fields(&self) -> impl Iterator; -} - -/// The type of a JSON value -pub enum JsonType { - Null, - Bool, - Int64, - Float64, - String, - Array, - Object, -} - -#[derive(Copy, Clone, Debug)] -pub struct TapeValue<'a> { - tape: &'a Tape<'a>, - idx: u32, -} - -impl<'a> TapeValue<'a> { - pub fn new(tape: &'a Tape<'a>, idx: u32) -> Self { - Self { tape, idx } - } -} - -impl<'a> JsonValue<'a> for TapeValue<'a> { - fn get(&self) -> JsonType { - match self.tape.get(self.idx) { - TapeElement::Null => JsonType::Null, - TapeElement::False => JsonType::Bool, - TapeElement::True => JsonType::Bool, - TapeElement::I64(_) | TapeElement::I32(_) => JsonType::Int64, - TapeElement::F64(_) | TapeElement::F32(_) => JsonType::Float64, - TapeElement::Number(s) => { - if self.tape.get_string(s).parse::().is_ok() { - JsonType::Int64 - } else { - JsonType::Float64 - } - } - TapeElement::String(_) => JsonType::String, - TapeElement::StartList(_) => JsonType::Array, - TapeElement::EndList(_) => unreachable!(), - TapeElement::StartObject(_) => JsonType::Object, - TapeElement::EndObject(_) => unreachable!(), - } - } - - fn elements(&self) -> impl Iterator { - self.tape - .iter_elements(self.idx) - .map(move |idx| Self { idx, ..*self }) - } - - fn fields(&self) -> impl Iterator { - self.tape - .iter_fields(self.idx) - .map(move |(key, idx)| (key, Self { idx, ..*self })) - } -} - -impl<'a> JsonValue<'a> for &'a serde_json::Value { - fn get(&self) -> JsonType { - use serde_json::Value; - - match self { - Value::Null => JsonType::Null, - Value::Bool(_) => JsonType::Bool, - Value::Number(n) => { - if n.is_i64() { - JsonType::Int64 - } else { - JsonType::Float64 - } - } - Value::String(_) => JsonType::String, - Value::Array(_) => JsonType::Array, - Value::Object(_) => JsonType::Object, - } - } - - fn elements(&self) -> impl Iterator { - use serde_json::Value; - - match self { - Value::Array(elements) => elements.iter(), - _ => panic!("Expected an array"), - } - } - - fn fields(&self) -> impl Iterator { - use serde_json::Value; - +impl Display for ScalarTy { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Value::Object(fields) => fields.iter().map(|(key, value)| (key.as_str(), value)), - _ => panic!("Expected an object"), + ScalarTy::Bool => write!(f, "a boolean"), + ScalarTy::Int64 => write!(f, "an integer"), + ScalarTy::Float64 => write!(f, "a number"), + ScalarTy::String => write!(f, "a string"), } } } diff --git a/arrow-json/src/reader/schema/json_type.rs b/arrow-json/src/reader/schema/json_type.rs new file mode 100644 index 000000000000..266bc00685fe --- /dev/null +++ b/arrow-json/src/reader/schema/json_type.rs @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::reader::tape::{Tape, TapeElement}; + +/// Abstraction of a JSON value that is only concerned with the +/// type of the value rather than the value itself. +pub trait JsonValue<'a> { + /// Gets the type of this JSON value. + fn get(&self) -> JsonType; + + /// Returns an iterator over the elements of this JSON array. + /// + /// Panics if the JSON value is not an array. + fn elements(&self) -> impl Iterator; + + /// Returns an iterator over the fields of this JSON object. + /// + /// Panics if the JSON value is not an object. + fn fields(&self) -> impl Iterator; +} + +/// The type of a JSON value +pub enum JsonType { + Null, + Bool, + Int64, + Float64, + String, + Array, + Object, +} + +#[derive(Copy, Clone, Debug)] +pub struct TapeValue<'a> { + tape: &'a Tape<'a>, + idx: u32, +} + +impl<'a> TapeValue<'a> { + pub fn new(tape: &'a Tape<'a>, idx: u32) -> Self { + Self { tape, idx } + } +} + +impl<'a> JsonValue<'a> for TapeValue<'a> { + fn get(&self) -> JsonType { + match self.tape.get(self.idx) { + TapeElement::Null => JsonType::Null, + TapeElement::False => JsonType::Bool, + TapeElement::True => JsonType::Bool, + TapeElement::I64(_) | TapeElement::I32(_) => JsonType::Int64, + TapeElement::F64(_) | TapeElement::F32(_) => JsonType::Float64, + TapeElement::Number(s) => { + if self.tape.get_string(s).parse::().is_ok() { + JsonType::Int64 + } else { + JsonType::Float64 + } + } + TapeElement::String(_) => JsonType::String, + TapeElement::StartList(_) => JsonType::Array, + TapeElement::EndList(_) => unreachable!(), + TapeElement::StartObject(_) => JsonType::Object, + TapeElement::EndObject(_) => unreachable!(), + } + } + + fn elements(&self) -> impl Iterator { + self.tape + .iter_elements(self.idx) + .map(move |idx| Self { idx, ..*self }) + } + + fn fields(&self) -> impl Iterator { + self.tape + .iter_fields(self.idx) + .map(move |(key, idx)| (key, Self { idx, ..*self })) + } +} + +impl<'a> JsonValue<'a> for &'a serde_json::Value { + fn get(&self) -> JsonType { + use serde_json::Value; + + match self { + Value::Null => JsonType::Null, + Value::Bool(_) => JsonType::Bool, + Value::Number(n) => { + if n.is_i64() { + JsonType::Int64 + } else { + JsonType::Float64 + } + } + Value::String(_) => JsonType::String, + Value::Array(_) => JsonType::Array, + Value::Object(_) => JsonType::Object, + } + } + + fn elements(&self) -> impl Iterator { + use serde_json::Value; + + match self { + Value::Array(elements) => elements.iter(), + _ => panic!("Expected an array"), + } + } + + fn fields(&self) -> impl Iterator { + use serde_json::Value; + + match self { + Value::Object(fields) => fields.iter().map(|(key, value)| (key.as_str(), value)), + _ => panic!("Expected an object"), + } + } +} From cfba9882ddfb5a6bc8e158e042dc01a237c7b422 Mon Sep 17 00:00:00 2001 From: Alexander Rafferty Date: Sat, 21 Mar 2026 12:54:54 +1100 Subject: [PATCH 08/10] Make `InferTy` an enum directly rather than a newtyped `Arc`, and refactor `infer_json_type` into helper functions. --- arrow-json/src/reader/schema/infer.rs | 178 +++++++++++--------------- 1 file changed, 73 insertions(+), 105 deletions(-) diff --git a/arrow-json/src/reader/schema/infer.rs b/arrow-json/src/reader/schema/infer.rs index 121f3dd725f9..69524c3e4281 100644 --- a/arrow-json/src/reader/schema/infer.rs +++ b/arrow-json/src/reader/schema/infer.rs @@ -25,51 +25,24 @@ use super::json_type::{JsonType, JsonValue}; /// Represents an inferred JSON type. #[derive(Clone, Debug)] -pub(crate) struct InferTy(Arc); - -/// The possible variants of an `InferTy`. -#[derive(Clone, Debug)] -enum TyKind { +pub(crate) enum InferTy { Any, Scalar(ScalarTy), - Array(InferTy), - Object(Arc<[InferField]>), + Array(Arc), + Object(InferFields), } -/// A field in an inferred object type. -type InferField = (Arc, InferTy); - /// An inferred scalar type. #[derive(Clone, Copy, PartialEq, Eq, Debug)] -enum ScalarTy { +pub(crate) enum ScalarTy { Bool, Int64, Float64, String, } -/// During the process of schema inference, types are frequently produced and discarded. -/// As an optimisation to avoid excess heap allocations, a few common types -/// are instantiated here so they can be reused. -static COMMON_TYS: LazyLock = LazyLock::new(|| CommonTypes { - any: InferTy(TyKind::Any.into()), - bool: InferTy(TyKind::Scalar(ScalarTy::Bool).into()), - int64: InferTy(TyKind::Scalar(ScalarTy::Int64).into()), - float64: InferTy(TyKind::Scalar(ScalarTy::Float64).into()), - string: InferTy(TyKind::Scalar(ScalarTy::String).into()), - array_of_any: InferTy::new_array(InferTy(TyKind::Any.into())), - empty_object: InferTy::new_object(Arc::new([])), -}); - -struct CommonTypes { - any: InferTy, - bool: InferTy, - int64: InferTy, - float64: InferTy, - string: InferTy, - array_of_any: InferTy, - empty_object: InferTy, -} +/// A field in an inferred object type. +pub(crate) type InferFields = Arc<[(Arc, InferTy)]>; /// Infers the type of a JSON value, given an expected type. pub fn infer_json_type<'a, T>(value: T, expected: InferTy) -> Result @@ -88,70 +61,71 @@ where } /// Infers the type of a scalar JSON value, given an expected type. +/// +/// Mixed integer and float types will coerce to float, and any other +/// mixtures will coerce to string. fn infer_scalar(scalar: ScalarTy, expected: InferTy) -> Result { - Ok(match expected.kind() { - TyKind::Any => match scalar { - ScalarTy::Bool => COMMON_TYS.bool.clone(), - ScalarTy::Int64 => COMMON_TYS.int64.clone(), - ScalarTy::Float64 => COMMON_TYS.float64.clone(), - ScalarTy::String => COMMON_TYS.string.clone(), - }, - TyKind::Scalar(expect) => match (expect, &scalar) { - (ScalarTy::Bool, ScalarTy::Bool) => COMMON_TYS.bool.clone(), - (ScalarTy::Int64, ScalarTy::Int64) => COMMON_TYS.int64.clone(), - // Mixed numbers coerce to f64 - (ScalarTy::Int64 | ScalarTy::Float64, ScalarTy::Int64 | ScalarTy::Float64) => { - COMMON_TYS.float64.clone() - } - // Any other combination coerces to string - _ => COMMON_TYS.string.clone(), + use ScalarTy::*; + + Ok(InferTy::Scalar(match expected { + InferTy::Any => scalar, + InferTy::Scalar(expect) => match (expect, &scalar) { + (Bool, Bool) => Bool, + (Int64, Int64) => Int64, + (Int64 | Float64, Int64 | Float64) => Float64, + _ => String, }, _ => Err(ArrowError::JsonError(format!( "Expected {expected}, found {scalar}" )))?, - }) + })) } /// Infers the type of a JSON array, given an expected type. -fn infer_array<'a, I>(mut elements: I, mut expected: InferTy) -> Result +fn infer_array<'a, I>(mut elements: I, expected: InferTy) -> Result where I: Iterator, I::Item: JsonValue<'a>, { - if let TyKind::Any = expected.kind() { - expected = COMMON_TYS.array_of_any.clone(); - } - - let (expected_elem, expected) = match expected.kind() { - TyKind::Array(inner) => (inner.clone(), expected), + let expected_elem = match expected { + InferTy::Any => { + // Memoize to avoid excess heap allocations + static ANY_TY: LazyLock> = LazyLock::new(|| InferTy::Any.into()); + ANY_TY.clone() + } + InferTy::Array(inner) => inner.clone(), _ => Err(ArrowError::JsonError(format!( "Expected {expected}, found an array" )))?, }; - let elem = elements.try_fold(expected_elem.clone(), |expected, value| { + let elem = elements.try_fold((*expected_elem).clone(), |expected, value| { infer_json_type(value, expected) })?; - if elem.ptr_eq(&expected_elem) { - return Ok(expected); + // If the inferred type is the same as the expected type, + // reuse the expected type and thus any inner `Arc`s it contains, + // to avoid excess heap allocations. + if elem.ptr_eq(&*expected_elem) { + Ok(InferTy::Array(expected_elem)) + } else { + Ok(InferTy::Array(elem.into())) } - - Ok(InferTy::new_array(elem)) } /// Infers the type of a JSON object, given an expected type. -fn infer_object<'a, I, T>(fields: I, mut expected: InferTy) -> Result +fn infer_object<'a, I, T>(fields: I, expected: InferTy) -> Result where I: Iterator, T: JsonValue<'a>, { - if let TyKind::Any = expected.kind() { - expected = COMMON_TYS.empty_object.clone(); - } - - let (expected_fields, expected) = match expected.kind() { - TyKind::Object(fields) => (fields.clone(), expected), + let expected_fields = match expected { + InferTy::Any => { + // Memoize to avoid excess heap allocations + static EMPTY_FIELDS: LazyLock = LazyLock::new(|| Arc::new([])); + EMPTY_FIELDS.clone() + } + InferTy::Object(fields) => fields.clone(), _ => Err(ArrowError::JsonError(format!( "Expected {expected}, found an object" )))?, @@ -176,14 +150,14 @@ where None => { let field_idx = num_fields; num_fields += 1; - let ty = infer_json_type(value, COMMON_TYS.any.clone())?; + let ty = infer_json_type(value, InferTy::Any)?; substs.insert(field_idx, (key.into(), ty)); } }; } if substs.is_empty() { - return Ok(expected); + return Ok(InferTy::Object(expected_fields)); } let fields = (0..num_fields) @@ -193,40 +167,40 @@ where }) .collect(); - Ok(InferTy::new_object(fields)) + Ok(InferTy::Object(fields)) } impl InferTy { + /// Returns an `InferTy` that represents any possible JSON type. pub fn any() -> Self { - COMMON_TYS.any.clone() - } - - fn new_array(inner: impl Into) -> Self { - Self(TyKind::Array(inner.into()).into()) - } - - fn new_object(fields: Arc<[(Arc, InferTy)]>) -> Self { - Self(TyKind::Object(fields).into()) + Self::Any } + /// Returns an `InferTy` that represents an empty JSON object. pub fn empty_object() -> Self { - COMMON_TYS.empty_object.clone() - } - - fn kind(&self) -> &TyKind { - &self.0 + // Memoised to avoid excess heap allocations + static EMPTY_OBJECT: LazyLock = LazyLock::new(|| InferTy::Object(Arc::new([]))); + EMPTY_OBJECT.clone() } - fn ptr_eq(&self, other: &Self) -> bool { - Arc::ptr_eq(&self.0, &other.0) + /// Performs a shallow comparison between two types, only checking for + /// pointer equality on any nested `Arc`s. + pub fn ptr_eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::Any, Self::Any) => true, + (Self::Scalar(lhs), Self::Scalar(rhs)) => lhs == rhs, + (Self::Array(lhs), Self::Array(rhs)) => Arc::ptr_eq(lhs, rhs), + (Self::Object(lhs), Self::Object(rhs)) => Arc::ptr_eq(lhs, rhs), + _ => false, + } } pub fn to_datatype(&self) -> DataType { - match self.kind() { - TyKind::Any => DataType::Null, - TyKind::Scalar(s) => s.into_datatype(), - TyKind::Array(elem) => DataType::List(elem.to_list_field().into()), - TyKind::Object(fields) => { + match self { + InferTy::Any => DataType::Null, + InferTy::Scalar(s) => s.into_datatype(), + InferTy::Array(elem) => DataType::List(elem.to_list_field().into()), + InferTy::Object(fields) => { DataType::Struct(fields.iter().map(|(key, ty)| ty.to_field(&**key)).collect()) } } @@ -241,7 +215,7 @@ impl InferTy { } pub fn into_schema(self) -> Result { - let TyKind::Object(fields) = self.kind() else { + let InferTy::Object(fields) = self else { Err(ArrowError::JsonError(format!( "Expected JSON object, found {self:?}", )))? @@ -256,19 +230,13 @@ impl InferTy { } } -impl From<&InferTy> for InferTy { - fn from(value: &InferTy) -> Self { - Self(value.0.clone()) - } -} - impl Display for InferTy { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self.kind() { - TyKind::Any => write!(f, "any value"), - TyKind::Scalar(s) => write!(f, "{}", s), - TyKind::Array(_) => write!(f, "an array"), - TyKind::Object(_) => write!(f, "an object"), + match self { + InferTy::Any => write!(f, "any value"), + InferTy::Scalar(s) => write!(f, "{}", s), + InferTy::Array(_) => write!(f, "an array"), + InferTy::Object(_) => write!(f, "an object"), } } } From b5df8cadd221c8604aef3341a31106aadcdf6734 Mon Sep 17 00:00:00 2001 From: Alexander Rafferty Date: Sat, 21 Mar 2026 13:40:16 +1100 Subject: [PATCH 09/10] Memoize common types --- arrow-json/src/reader/schema/infer.rs | 56 ++++++++++++++++++--------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/arrow-json/src/reader/schema/infer.rs b/arrow-json/src/reader/schema/infer.rs index 69524c3e4281..78fe80465ea7 100644 --- a/arrow-json/src/reader/schema/infer.rs +++ b/arrow-json/src/reader/schema/infer.rs @@ -29,7 +29,7 @@ pub(crate) enum InferTy { Any, Scalar(ScalarTy), Array(Arc), - Object(InferFields), + Object(Arc), } /// An inferred scalar type. @@ -42,7 +42,7 @@ pub(crate) enum ScalarTy { } /// A field in an inferred object type. -pub(crate) type InferFields = Arc<[(Arc, InferTy)]>; +pub(crate) type InferFields = [(Arc, InferTy)]; /// Infers the type of a JSON value, given an expected type. pub fn infer_json_type<'a, T>(value: T, expected: InferTy) -> Result @@ -88,11 +88,7 @@ where I::Item: JsonValue<'a>, { let expected_elem = match expected { - InferTy::Any => { - // Memoize to avoid excess heap allocations - static ANY_TY: LazyLock> = LazyLock::new(|| InferTy::Any.into()); - ANY_TY.clone() - } + InferTy::Any => InferTy::Any.to_arc(), InferTy::Array(inner) => inner.clone(), _ => Err(ArrowError::JsonError(format!( "Expected {expected}, found an array" @@ -109,7 +105,7 @@ where if elem.ptr_eq(&*expected_elem) { Ok(InferTy::Array(expected_elem)) } else { - Ok(InferTy::Array(elem.into())) + Ok(InferTy::Array(elem.to_arc())) } } @@ -120,11 +116,7 @@ where T: JsonValue<'a>, { let expected_fields = match expected { - InferTy::Any => { - // Memoize to avoid excess heap allocations - static EMPTY_FIELDS: LazyLock = LazyLock::new(|| Arc::new([])); - EMPTY_FIELDS.clone() - } + InferTy::Any => InferTy::empty_fields(), InferTy::Object(fields) => fields.clone(), _ => Err(ArrowError::JsonError(format!( "Expected {expected}, found an object" @@ -170,22 +162,50 @@ where Ok(InferTy::Object(fields)) } +macro_rules! memoize { + ($ty:ty, $value:expr) => {{ + const VALUE: LazyLock> = LazyLock::new(|| Arc::new($value)); + VALUE.clone() + }}; +} + impl InferTy { /// Returns an `InferTy` that represents any possible JSON type. - pub fn any() -> Self { + pub const fn any() -> Self { Self::Any } /// Returns an `InferTy` that represents an empty JSON object. pub fn empty_object() -> Self { - // Memoised to avoid excess heap allocations - static EMPTY_OBJECT: LazyLock = LazyLock::new(|| InferTy::Object(Arc::new([]))); - EMPTY_OBJECT.clone() + Self::Object(Self::empty_fields()) + } + + /// Returns an `InferFields` with no fields. + fn empty_fields() -> Arc { + memoize!(InferFields, []) + } + + /// Returns this `InferTy` as an `Arc`. + /// + /// To avoid excess heaps allocations, common types are memoized in static variables. + fn to_arc(&self) -> Arc { + use ScalarTy::*; + + match self { + // Use a memoized `Arc` if possible + Self::Any => memoize!(InferTy, InferTy::Any), + Self::Scalar(Bool) => memoize!(InferTy, InferTy::Scalar(Bool)), + Self::Scalar(Int64) => memoize!(InferTy, InferTy::Scalar(Int64)), + Self::Scalar(Float64) => memoize!(InferTy, InferTy::Scalar(Float64)), + Self::Scalar(String) => memoize!(InferTy, InferTy::Scalar(String)), + // Otherwise allocate a new `Arc` + _ => Arc::new(self.clone()), + } } /// Performs a shallow comparison between two types, only checking for /// pointer equality on any nested `Arc`s. - pub fn ptr_eq(&self, other: &Self) -> bool { + fn ptr_eq(&self, other: &Self) -> bool { match (self, other) { (Self::Any, Self::Any) => true, (Self::Scalar(lhs), Self::Scalar(rhs)) => lhs == rhs, From 469b437f5acc6055f72e097a9257e88eb2818a65 Mon Sep 17 00:00:00 2001 From: Alexander Rafferty Date: Sat, 21 Mar 2026 13:42:53 +1100 Subject: [PATCH 10/10] Fix accidental `const` instead of `static` --- arrow-json/src/reader/schema/infer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-json/src/reader/schema/infer.rs b/arrow-json/src/reader/schema/infer.rs index 78fe80465ea7..911be595e8b2 100644 --- a/arrow-json/src/reader/schema/infer.rs +++ b/arrow-json/src/reader/schema/infer.rs @@ -102,7 +102,7 @@ where // If the inferred type is the same as the expected type, // reuse the expected type and thus any inner `Arc`s it contains, // to avoid excess heap allocations. - if elem.ptr_eq(&*expected_elem) { + if elem.ptr_eq(&expected_elem) { Ok(InferTy::Array(expected_elem)) } else { Ok(InferTy::Array(elem.to_arc())) @@ -164,7 +164,7 @@ where macro_rules! memoize { ($ty:ty, $value:expr) => {{ - const VALUE: LazyLock> = LazyLock::new(|| Arc::new($value)); + static VALUE: LazyLock> = LazyLock::new(|| Arc::new($value)); VALUE.clone() }}; }