From bf74dce1e35886a75b2bf84af4846425d08d64bb Mon Sep 17 00:00:00 2001 From: Yan Tingwang Date: Sat, 21 Mar 2026 08:48:52 +0800 Subject: [PATCH] Support extracting struct fields as Variant using ExtensionType --- parquet-variant-compute/src/variant_get.rs | 251 +++++++++++++++++++-- 1 file changed, 227 insertions(+), 24 deletions(-) diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 9204dcf7085..89366eb13b3 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -24,6 +24,7 @@ use arrow_schema::{ArrowError, DataType, FieldRef}; use parquet_variant::{VariantPath, VariantPathElement}; use crate::VariantArray; +use crate::VariantType; use crate::variant_array::BorrowedShreddingState; use crate::variant_to_arrow::make_variant_to_arrow_row_builder; @@ -86,15 +87,21 @@ pub(crate) fn follow_shredded_path_element<'a>( return Ok(missing_path_step()); }; - let struct_array = field.as_struct_opt().ok_or_else(|| { - // TODO: Should we blow up? Or just end the traversal and let the normal - // variant pathing code sort out the mess that it must anyway be - // prepared to handle? - ArrowError::InvalidArgumentError(format!( - "Expected Struct array while following path, got {}", - field.data_type(), - )) - })?; + // The field might be a VariantArray (StructArray) if shredded, + // or it might be a primitive array. Only proceed if it's a StructArray. + let Some(struct_array) = field.as_struct_opt() else { + // Field exists but is not a StructArray (VariantArray), + // which means it's not shredded further. + if !cast_options.safe { + return Err(ArrowError::CastError(format!( + "Expected Struct array while following path, got {}", + field.data_type(), + ))); + } + // In safe mode, return NotShredded to let the caller + // handle it via value column. + return Ok(ShreddedPathStep::NotShredded); + }; let state = BorrowedShreddingState::try_from(struct_array)?; Ok(ShreddedPathStep::Success(state)) @@ -216,22 +223,63 @@ fn shredded_get_path( // Structs are special. Recurse into each field separately, hoping to follow the shredding even // further, and build up the final struct from those individually shredded results. if let DataType::Struct(fields) = as_field.data_type() { - let children = fields + let mut updated_fields = Vec::with_capacity(fields.len()); + let children: Result> = fields .iter() .map(|field| { - shredded_get_path( + // If the field has VariantType extension metadata, extract it as a + // VariantArray instead of casting to the declared data type. This allows + // callers to request structs where some fields remain as variants. + // See test_struct_extraction_with_variant_fields for usage example. + let is_variant_field = field.try_extension_type::().is_ok(); + let field_as_type: Option<&Field> = if is_variant_field { + None + } else { + Some(field.as_ref()) + }; + let child = shredded_get_path( &target, &[VariantPathElement::from(field.name().as_str())], - Some(field), + field_as_type, cast_options, - ) + )?; + + // Update field type if it was a Variant marker (extracted as VariantArray). + // The actual data type will be the internal structure of VariantArray. + // Preserve VariantType extension metadata so downstream consumers + // can recognize this field as a Variant column. + // + // When the field is entirely absent in the data, shredded_get_path + // returns a NullArray (DataType::Null). VariantType only supports + // Struct storage, so we must skip the extension in that case. + let updated_field = + if is_variant_field && matches!(child.data_type(), DataType::Struct(_)) { + field + .as_ref() + .clone() + .with_data_type(child.data_type().clone()) + .with_extension_type(VariantType) + } else if is_variant_field { + // Field was requested as Variant but data is all-null; + // preserve the data type from the child without extension metadata. + field + .as_ref() + .clone() + .with_data_type(child.data_type().clone()) + } else { + field.as_ref().clone() + }; + updated_fields.push(updated_field); + + Ok(child) }) - .collect::>>()?; + .collect(); + let children = children?; let struct_nulls = target.nulls().cloned(); return Ok(Arc::new(StructArray::try_new( - fields.clone(), + updated_fields.into(), children, struct_nulls, )?)); @@ -252,9 +300,9 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti .value_field() .is_none_or(|v| v.null_count() == v.len()) { - // Here we need to gate against the case where the `typed_value` is null but data is in the `value` column. - // 1. If the `value` column is null, or - // 2. If every row in the `value` column is null + // When shredding is partial, some values may remain in the `value` column + // (as raw variant binary) while `typed_value` is null. Only return the + // typed value if the `value` column is entirely null (complete shredding). // This is a perfect shredding, where the value is entirely shredded out, // so we can just return the typed value. @@ -265,15 +313,30 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti /// Returns an array with the specified path extracted from the variant values. /// -/// The return array type depends on the `as_type` field of the options parameter +/// The return array type depends on the `as_type` field of the options parameter: /// 1. `as_type: None`: a VariantArray is returned. The values in this new VariantArray will point /// to the specified path. /// 2. `as_type: Some()`: an array of the specified type is returned. /// -/// TODO: How would a caller request a struct or list type where the fields/elements can be any -/// variant? Caller can pass None as the requested type to fetch a specific path, but it would -/// quickly become annoying (and inefficient) to call `variant_get` for each leaf value in a struct or -/// list and then try to assemble the results. +/// When extracting a struct type (`DataType::Struct`), you can mix typed fields with variant fields +/// by marking fields with the [`VariantType`] extension type. Fields with `VariantType` metadata +/// will be extracted as VariantArrays, preserving the original variant representation. +/// +/// Example: +/// ```rust,ignore +/// use parquet_variant_compute::VariantType; +/// use arrow_schema::extension::ExtensionType; +/// +/// // Extract a struct where "name" is converted to Int32, but "data" remains a Variant +/// let fields = Fields::from(vec![ +/// Field::new("name", DataType::Int32, true), +/// // Use VariantType extension metadata to request extraction as VariantArray +/// Field::new("data", DataType::Struct(Fields::empty()), true) +/// .with_extension_type(VariantType), +/// ]); +/// let options = GetOptions::new() +/// .with_as_type(Some(Arc::new(Field::new("result", DataType::Struct(fields), true)))); +/// ``` pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result { let variant_array = VariantArray::try_new(input)?; @@ -335,7 +398,8 @@ mod test { use super::{GetOptions, variant_get}; use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; use crate::{ - VariantArray, VariantArrayBuilder, cast_to_variant, json_to_variant, shred_variant, + VariantArray, VariantArrayBuilder, VariantType, cast_to_variant, json_to_variant, + shred_variant, }; use arrow::array::{ Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, @@ -4237,4 +4301,143 @@ mod test { ); } } + + /// Test extracting a struct with mixed typed and variant fields. + /// Fields with VariantType extension metadata should be extracted as VariantArrays. + #[test] + fn test_struct_extraction_with_variant_fields() { + // Create test data: [{"id": 1, "name": "Alice", "data": {"score": 95}}, + // {"id": 2, "name": "Bob", "data": null}] + let json_strings = vec![ + r#"{"id": 1, "name": "Alice", "data": {"score": 95}}"#, + r#"{"id": 2, "name": "Bob", "data": null}"#, + r#"{"id": 3, "name": null, "data": {"level": 5}}"#, + ]; + let string_array: Arc = Arc::new(StringArray::from(json_strings)); + let variant_array = json_to_variant(&string_array).unwrap(); + + // Request struct where: + // - "id" is extracted as Int32 + // - "name" is extracted as String (Utf8) + // - "data" is extracted as Variant (using VariantType extension metadata) + let struct_fields = Fields::from(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), + // Use VariantType extension metadata to request extraction as VariantArray. + // The data type must be Struct to satisfy VariantType::supports_data_type. + Field::new("data", DataType::Struct(Fields::empty()), true) + .with_extension_type(VariantType), + ]); + let struct_type = DataType::Struct(struct_fields); + + let options = GetOptions { + path: VariantPath::default(), + as_type: Some(Arc::new(Field::new("result", struct_type, true))), + cast_options: CastOptions::default(), + }; + + let variant_array_ref = ArrayRef::from(variant_array); + let result = variant_get(&variant_array_ref, options).unwrap(); + + // Verify the result is a StructArray with 3 fields + let struct_result = result.as_any().downcast_ref::().unwrap(); + assert_eq!(struct_result.len(), 3); + assert_eq!(struct_result.num_columns(), 3); + + // Verify "id" field (Int32) + let id_field = struct_result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(id_field.value(0), 1); + assert_eq!(id_field.value(1), 2); + assert_eq!(id_field.value(2), 3); + + // Verify "name" field (String/Utf8) + let name_field = struct_result + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(name_field.value(0), "Alice"); + assert_eq!(name_field.value(1), "Bob"); + assert!(name_field.is_null(2)); // null name in row 2 + + // Verify "data" field schema has VariantType extension metadata + let data_schema_field = struct_result + .fields() + .iter() + .find(|f| f.name() == "data") + .unwrap(); + assert!( + data_schema_field + .try_extension_type::() + .is_ok(), + "data field should have VariantType extension metadata" + ); + + // Verify "data" field (VariantArray) + let data_field = struct_result.column(2); + // The data field should be a StructArray representing VariantArray's internal structure + // It has columns: metadata, value (optional), typed_value (optional) + let data_as_struct = data_field.as_any().downcast_ref::(); + assert!( + data_as_struct.is_some(), + "data field should be a VariantArray (represented as StructArray)" + ); + + // Verify we can access the variant values + let data_variant_array = VariantArray::try_new(data_field).unwrap(); + assert_eq!(data_variant_array.len(), 3); + + // Row 0: data = {"score": 95} + let data0 = data_variant_array.value(0); + assert!(matches!(data0, Variant::Object(_))); + + // Row 1: data = null + assert!( + data_variant_array.is_null(1) || matches!(data_variant_array.value(1), Variant::Null) + ); + + // Row 2: data = {"level": 5} + let data2 = data_variant_array.value(2); + assert!(matches!(data2, Variant::Object(_))); + } + + /// Test that requesting a variant field absent in all rows does not panic. + /// Regression test: with_extension_type(VariantType) used to panic on NullArray. + #[test] + fn test_struct_extraction_missing_variant_field_no_panic() { + // Data has "id" but NOT "missing_field" + let json_strings = vec![r#"{"id": 1}"#, r#"{"id": 2}"#]; + let string_array: Arc = Arc::new(StringArray::from(json_strings)); + let variant_array = json_to_variant(&string_array).unwrap(); + + // Request struct with a variant field that doesn't exist in any row + let struct_fields = Fields::from(vec![ + Field::new("id", DataType::Int32, true), + Field::new("missing_field", DataType::Struct(Fields::empty()), true) + .with_extension_type(VariantType), + ]); + let struct_type = DataType::Struct(struct_fields); + + let options = GetOptions { + path: VariantPath::default(), + as_type: Some(Arc::new(Field::new("result", struct_type, true))), + cast_options: CastOptions::default(), + }; + + let variant_array_ref = ArrayRef::from(variant_array); + // This should not panic + let result = variant_get(&variant_array_ref, options).unwrap(); + + let struct_result = result.as_any().downcast_ref::().unwrap(); + assert_eq!(struct_result.len(), 2); + assert_eq!(struct_result.num_columns(), 2); + + // The missing variant field should be all nulls + let missing_col = struct_result.column(1); + assert_eq!(missing_col.null_count(), missing_col.len()); + } }