-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Support extracting struct fields as Variant using ExtensionType #9598
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,6 +24,7 @@ use arrow_schema::{ArrowError, DataType, FieldRef}; | |
| use parquet_variant::{VariantPath, VariantPathElement}; | ||
|
|
||
| use crate::VariantArray; | ||
| use crate::VariantType; | ||
| use crate::variant_array::BorrowedShreddingState; | ||
| use crate::variant_to_arrow::make_variant_to_arrow_row_builder; | ||
|
|
||
|
|
@@ -86,15 +87,21 @@ pub(crate) fn follow_shredded_path_element<'a>( | |
| return Ok(missing_path_step()); | ||
| }; | ||
|
|
||
| let struct_array = field.as_struct_opt().ok_or_else(|| { | ||
| // TODO: Should we blow up? Or just end the traversal and let the normal | ||
| // variant pathing code sort out the mess that it must anyway be | ||
| // prepared to handle? | ||
| ArrowError::InvalidArgumentError(format!( | ||
| "Expected Struct array while following path, got {}", | ||
| field.data_type(), | ||
| )) | ||
| })?; | ||
| // The field might be a VariantArray (StructArray) if shredded, | ||
| // or it might be a primitive array. Only proceed if it's a StructArray. | ||
| let Some(struct_array) = field.as_struct_opt() else { | ||
| // Field exists but is not a StructArray (VariantArray), | ||
| // which means it's not shredded further. | ||
| if !cast_options.safe { | ||
| return Err(ArrowError::CastError(format!( | ||
| "Expected Struct array while following path, got {}", | ||
| field.data_type(), | ||
| ))); | ||
| } | ||
| // In safe mode, return NotShredded to let the caller | ||
| // handle it via value column. | ||
| return Ok(ShreddedPathStep::NotShredded); | ||
| }; | ||
|
|
||
| let state = BorrowedShreddingState::try_from(struct_array)?; | ||
| Ok(ShreddedPathStep::Success(state)) | ||
|
|
@@ -216,22 +223,63 @@ fn shredded_get_path( | |
| // Structs are special. Recurse into each field separately, hoping to follow the shredding even | ||
| // further, and build up the final struct from those individually shredded results. | ||
| if let DataType::Struct(fields) = as_field.data_type() { | ||
| let children = fields | ||
| let mut updated_fields = Vec::with_capacity(fields.len()); | ||
| let children: Result<Vec<_>> = fields | ||
| .iter() | ||
| .map(|field| { | ||
| shredded_get_path( | ||
| // If the field has VariantType extension metadata, extract it as a | ||
| // VariantArray instead of casting to the declared data type. This allows | ||
| // callers to request structs where some fields remain as variants. | ||
| // See test_struct_extraction_with_variant_fields for usage example. | ||
| let is_variant_field = field.try_extension_type::<VariantType>().is_ok(); | ||
| let field_as_type: Option<&Field> = if is_variant_field { | ||
| None | ||
| } else { | ||
| Some(field.as_ref()) | ||
| }; | ||
|
Comment on lines
+234
to
+239
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a bit of a double negative... trying to decide if it could be clearer with inverted logic? let is_strongly_typed = field.try_extension_type::<VariantType>().is_err();
let field_as_type = is_strongly_typed.then(|| field.as_ref()); |
||
| let child = shredded_get_path( | ||
| &target, | ||
| &[VariantPathElement::from(field.name().as_str())], | ||
| Some(field), | ||
| field_as_type, | ||
| cast_options, | ||
| ) | ||
| )?; | ||
|
|
||
| // Update field type if it was a Variant marker (extracted as VariantArray). | ||
| // The actual data type will be the internal structure of VariantArray. | ||
| // Preserve VariantType extension metadata so downstream consumers | ||
| // can recognize this field as a Variant column. | ||
| // | ||
| // When the field is entirely absent in the data, shredded_get_path | ||
| // returns a NullArray (DataType::Null). VariantType only supports | ||
| // Struct storage, so we must skip the extension in that case. | ||
| let updated_field = | ||
| if is_variant_field && matches!(child.data_type(), DataType::Struct(_)) { | ||
| field | ||
| .as_ref() | ||
| .clone() | ||
|
Comment on lines
+257
to
+259
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This part is the same in all cases. Can we simplify by hoisting out a let updated_field = if is_strongly_typed {
new_field
} else {
// handle a variant field that may or may not exist in the data (see PR comment below)
}; |
||
| .with_data_type(child.data_type().clone()) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why needed? Doesn't it already have that type? |
||
| .with_extension_type(VariantType) | ||
| } else if is_variant_field { | ||
| // Field was requested as Variant but data is all-null; | ||
| // preserve the data type from the child without extension metadata. | ||
|
Comment on lines
+263
to
+264
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this intentional, to strip away e.g. JSON or other extensions that might be present? Also, why is it correct to change the data type merely because the data are all-null? |
||
| field | ||
| .as_ref() | ||
| .clone() | ||
| .with_data_type(child.data_type().clone()) | ||
| } else { | ||
| field.as_ref().clone() | ||
| }; | ||
| updated_fields.push(updated_field); | ||
|
|
||
| Ok(child) | ||
| }) | ||
| .collect::<Result<Vec<_>>>()?; | ||
| .collect(); | ||
| let children = children?; | ||
|
Comment on lines
-229
to
+277
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This bit seems like a gratuitous change? let children = fields...collect::<Result<Vec_>>>()?;becomes let children: Result<Vec<_>> = fields...collect();
let children = children?; |
||
|
|
||
| let struct_nulls = target.nulls().cloned(); | ||
|
|
||
| return Ok(Arc::new(StructArray::try_new( | ||
| fields.clone(), | ||
| updated_fields.into(), | ||
| children, | ||
| struct_nulls, | ||
| )?)); | ||
|
|
@@ -252,9 +300,9 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti | |
| .value_field() | ||
| .is_none_or(|v| v.null_count() == v.len()) | ||
| { | ||
| // Here we need to gate against the case where the `typed_value` is null but data is in the `value` column. | ||
| // 1. If the `value` column is null, or | ||
| // 2. If every row in the `value` column is null | ||
| // When shredding is partial, some values may remain in the `value` column | ||
| // (as raw variant binary) while `typed_value` is null. Only return the | ||
| // typed value if the `value` column is entirely null (complete shredding). | ||
|
|
||
| // This is a perfect shredding, where the value is entirely shredded out, | ||
| // so we can just return the typed value. | ||
|
|
@@ -265,15 +313,30 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti | |
|
|
||
| /// Returns an array with the specified path extracted from the variant values. | ||
| /// | ||
| /// The return array type depends on the `as_type` field of the options parameter | ||
| /// The return array type depends on the `as_type` field of the options parameter: | ||
| /// 1. `as_type: None`: a VariantArray is returned. The values in this new VariantArray will point | ||
| /// to the specified path. | ||
| /// 2. `as_type: Some(<specific field>)`: an array of the specified type is returned. | ||
| /// | ||
| /// TODO: How would a caller request a struct or list type where the fields/elements can be any | ||
| /// variant? Caller can pass None as the requested type to fetch a specific path, but it would | ||
| /// quickly become annoying (and inefficient) to call `variant_get` for each leaf value in a struct or | ||
| /// list and then try to assemble the results. | ||
| /// When extracting a struct type (`DataType::Struct`), you can mix typed fields with variant fields | ||
| /// by marking fields with the [`VariantType`] extension type. Fields with `VariantType` metadata | ||
| /// will be extracted as VariantArrays, preserving the original variant representation. | ||
| /// | ||
| /// Example: | ||
| /// ```rust,ignore | ||
| /// use parquet_variant_compute::VariantType; | ||
| /// use arrow_schema::extension::ExtensionType; | ||
| /// | ||
| /// // Extract a struct where "name" is converted to Int32, but "data" remains a Variant | ||
| /// let fields = Fields::from(vec![ | ||
| /// Field::new("name", DataType::Int32, true), | ||
| /// // Use VariantType extension metadata to request extraction as VariantArray | ||
| /// Field::new("data", DataType::Struct(Fields::empty()), true) | ||
| /// .with_extension_type(VariantType), | ||
| /// ]); | ||
| /// let options = GetOptions::new() | ||
| /// .with_as_type(Some(Arc::new(Field::new("result", DataType::Struct(fields), true)))); | ||
| /// ``` | ||
| pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> { | ||
| let variant_array = VariantArray::try_new(input)?; | ||
|
|
||
|
|
@@ -335,7 +398,8 @@ mod test { | |
| use super::{GetOptions, variant_get}; | ||
| use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; | ||
| use crate::{ | ||
| VariantArray, VariantArrayBuilder, cast_to_variant, json_to_variant, shred_variant, | ||
| VariantArray, VariantArrayBuilder, VariantType, cast_to_variant, json_to_variant, | ||
| shred_variant, | ||
| }; | ||
| use arrow::array::{ | ||
| Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, | ||
|
|
@@ -4237,4 +4301,143 @@ mod test { | |
| ); | ||
| } | ||
| } | ||
|
|
||
| /// Test extracting a struct with mixed typed and variant fields. | ||
| /// Fields with VariantType extension metadata should be extracted as VariantArrays. | ||
| #[test] | ||
| fn test_struct_extraction_with_variant_fields() { | ||
| // Create test data: [{"id": 1, "name": "Alice", "data": {"score": 95}}, | ||
| // {"id": 2, "name": "Bob", "data": null}] | ||
| let json_strings = vec![ | ||
| r#"{"id": 1, "name": "Alice", "data": {"score": 95}}"#, | ||
| r#"{"id": 2, "name": "Bob", "data": null}"#, | ||
| r#"{"id": 3, "name": null, "data": {"level": 5}}"#, | ||
| ]; | ||
| let string_array: Arc<dyn Array> = Arc::new(StringArray::from(json_strings)); | ||
| let variant_array = json_to_variant(&string_array).unwrap(); | ||
|
|
||
| // Request struct where: | ||
| // - "id" is extracted as Int32 | ||
| // - "name" is extracted as String (Utf8) | ||
| // - "data" is extracted as Variant (using VariantType extension metadata) | ||
| let struct_fields = Fields::from(vec![ | ||
| Field::new("id", DataType::Int32, true), | ||
| Field::new("name", DataType::Utf8, true), | ||
| // Use VariantType extension metadata to request extraction as VariantArray. | ||
| // The data type must be Struct to satisfy VariantType::supports_data_type. | ||
| Field::new("data", DataType::Struct(Fields::empty()), true) | ||
| .with_extension_type(VariantType), | ||
| ]); | ||
| let struct_type = DataType::Struct(struct_fields); | ||
|
|
||
| let options = GetOptions { | ||
| path: VariantPath::default(), | ||
| as_type: Some(Arc::new(Field::new("result", struct_type, true))), | ||
| cast_options: CastOptions::default(), | ||
| }; | ||
|
|
||
| let variant_array_ref = ArrayRef::from(variant_array); | ||
| let result = variant_get(&variant_array_ref, options).unwrap(); | ||
|
|
||
| // Verify the result is a StructArray with 3 fields | ||
| let struct_result = result.as_any().downcast_ref::<StructArray>().unwrap(); | ||
| assert_eq!(struct_result.len(), 3); | ||
| assert_eq!(struct_result.num_columns(), 3); | ||
|
|
||
| // Verify "id" field (Int32) | ||
| let id_field = struct_result | ||
| .column(0) | ||
| .as_any() | ||
| .downcast_ref::<Int32Array>() | ||
| .unwrap(); | ||
| assert_eq!(id_field.value(0), 1); | ||
| assert_eq!(id_field.value(1), 2); | ||
| assert_eq!(id_field.value(2), 3); | ||
|
|
||
| // Verify "name" field (String/Utf8) | ||
| let name_field = struct_result | ||
| .column(1) | ||
| .as_any() | ||
| .downcast_ref::<StringArray>() | ||
| .unwrap(); | ||
| assert_eq!(name_field.value(0), "Alice"); | ||
| assert_eq!(name_field.value(1), "Bob"); | ||
| assert!(name_field.is_null(2)); // null name in row 2 | ||
|
|
||
| // Verify "data" field schema has VariantType extension metadata | ||
| let data_schema_field = struct_result | ||
| .fields() | ||
| .iter() | ||
| .find(|f| f.name() == "data") | ||
| .unwrap(); | ||
| assert!( | ||
| data_schema_field | ||
| .try_extension_type::<VariantType>() | ||
| .is_ok(), | ||
| "data field should have VariantType extension metadata" | ||
| ); | ||
|
|
||
| // Verify "data" field (VariantArray) | ||
| let data_field = struct_result.column(2); | ||
| // The data field should be a StructArray representing VariantArray's internal structure | ||
| // It has columns: metadata, value (optional), typed_value (optional) | ||
| let data_as_struct = data_field.as_any().downcast_ref::<StructArray>(); | ||
| assert!( | ||
| data_as_struct.is_some(), | ||
| "data field should be a VariantArray (represented as StructArray)" | ||
| ); | ||
|
|
||
| // Verify we can access the variant values | ||
| let data_variant_array = VariantArray::try_new(data_field).unwrap(); | ||
| assert_eq!(data_variant_array.len(), 3); | ||
|
|
||
| // Row 0: data = {"score": 95} | ||
| let data0 = data_variant_array.value(0); | ||
| assert!(matches!(data0, Variant::Object(_))); | ||
|
|
||
| // Row 1: data = null | ||
| assert!( | ||
| data_variant_array.is_null(1) || matches!(data_variant_array.value(1), Variant::Null) | ||
| ); | ||
|
|
||
| // Row 2: data = {"level": 5} | ||
| let data2 = data_variant_array.value(2); | ||
| assert!(matches!(data2, Variant::Object(_))); | ||
| } | ||
|
|
||
| /// Test that requesting a variant field absent in all rows does not panic. | ||
| /// Regression test: with_extension_type(VariantType) used to panic on NullArray. | ||
| #[test] | ||
| fn test_struct_extraction_missing_variant_field_no_panic() { | ||
| // Data has "id" but NOT "missing_field" | ||
| let json_strings = vec![r#"{"id": 1}"#, r#"{"id": 2}"#]; | ||
| let string_array: Arc<dyn Array> = Arc::new(StringArray::from(json_strings)); | ||
| let variant_array = json_to_variant(&string_array).unwrap(); | ||
|
|
||
| // Request struct with a variant field that doesn't exist in any row | ||
| let struct_fields = Fields::from(vec![ | ||
| Field::new("id", DataType::Int32, true), | ||
| Field::new("missing_field", DataType::Struct(Fields::empty()), true) | ||
| .with_extension_type(VariantType), | ||
| ]); | ||
| let struct_type = DataType::Struct(struct_fields); | ||
|
|
||
| let options = GetOptions { | ||
| path: VariantPath::default(), | ||
| as_type: Some(Arc::new(Field::new("result", struct_type, true))), | ||
| cast_options: CastOptions::default(), | ||
| }; | ||
|
|
||
| let variant_array_ref = ArrayRef::from(variant_array); | ||
| // This should not panic | ||
| let result = variant_get(&variant_array_ref, options).unwrap(); | ||
|
|
||
| let struct_result = result.as_any().downcast_ref::<StructArray>().unwrap(); | ||
| assert_eq!(struct_result.len(), 2); | ||
| assert_eq!(struct_result.num_columns(), 2); | ||
|
|
||
| // The missing variant field should be all nulls | ||
| let missing_col = struct_result.column(1); | ||
| assert_eq!(missing_col.null_count(), missing_col.len()); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(but maybe we want to fix that everywhere in a separate dedicated PR)