Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
251 changes: 227 additions & 24 deletions parquet-variant-compute/src/variant_get.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use arrow_schema::{ArrowError, DataType, FieldRef};
use parquet_variant::{VariantPath, VariantPathElement};

use crate::VariantArray;
use crate::VariantType;
use crate::variant_array::BorrowedShreddingState;
use crate::variant_to_arrow::make_variant_to_arrow_row_builder;

Expand Down Expand Up @@ -86,15 +87,21 @@ pub(crate) fn follow_shredded_path_element<'a>(
return Ok(missing_path_step());
};

let struct_array = field.as_struct_opt().ok_or_else(|| {
// TODO: Should we blow up? Or just end the traversal and let the normal
// variant pathing code sort out the mess that it must anyway be
// prepared to handle?
ArrowError::InvalidArgumentError(format!(
"Expected Struct array while following path, got {}",
field.data_type(),
))
})?;
// The field might be a VariantArray (StructArray) if shredded,
// or it might be a primitive array. Only proceed if it's a StructArray.
let Some(struct_array) = field.as_struct_opt() else {
// Field exists but is not a StructArray (VariantArray),
// which means it's not shredded further.
if !cast_options.safe {
return Err(ArrowError::CastError(format!(
Comment on lines +93 to +96
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(but maybe we want to fix that everywhere in a separate dedicated PR)

"Expected Struct array while following path, got {}",
field.data_type(),
)));
}
// In safe mode, return NotShredded to let the caller
// handle it via value column.
return Ok(ShreddedPathStep::NotShredded);
};

let state = BorrowedShreddingState::try_from(struct_array)?;
Ok(ShreddedPathStep::Success(state))
Expand Down Expand Up @@ -216,22 +223,63 @@ fn shredded_get_path(
// Structs are special. Recurse into each field separately, hoping to follow the shredding even
// further, and build up the final struct from those individually shredded results.
if let DataType::Struct(fields) = as_field.data_type() {
let children = fields
let mut updated_fields = Vec::with_capacity(fields.len());
let children: Result<Vec<_>> = fields
.iter()
.map(|field| {
shredded_get_path(
// If the field has VariantType extension metadata, extract it as a
// VariantArray instead of casting to the declared data type. This allows
// callers to request structs where some fields remain as variants.
// See test_struct_extraction_with_variant_fields for usage example.
let is_variant_field = field.try_extension_type::<VariantType>().is_ok();
let field_as_type: Option<&Field> = if is_variant_field {
None
} else {
Some(field.as_ref())
};
Comment on lines +234 to +239
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bit of a double negative... trying to decide if it could be clearer with inverted logic?

let is_strongly_typed = field.try_extension_type::<VariantType>().is_err();
let field_as_type = is_strongly_typed.then(|| field.as_ref());

let child = shredded_get_path(
&target,
&[VariantPathElement::from(field.name().as_str())],
Some(field),
field_as_type,
cast_options,
)
)?;

// Update field type if it was a Variant marker (extracted as VariantArray).
// The actual data type will be the internal structure of VariantArray.
// Preserve VariantType extension metadata so downstream consumers
// can recognize this field as a Variant column.
//
// When the field is entirely absent in the data, shredded_get_path
// returns a NullArray (DataType::Null). VariantType only supports
// Struct storage, so we must skip the extension in that case.
let updated_field =
if is_variant_field && matches!(child.data_type(), DataType::Struct(_)) {
field
.as_ref()
.clone()
Comment on lines +257 to +259
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part is the same in all cases. Can we simplify by hoisting out a let mut new_field and then:

let updated_field = if is_strongly_typed {
    new_field
} else {
    // handle a variant field that may or may not exist in the data (see PR comment below)
};

.with_data_type(child.data_type().clone())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why needed? Doesn't it already have that type?

.with_extension_type(VariantType)
} else if is_variant_field {
// Field was requested as Variant but data is all-null;
// preserve the data type from the child without extension metadata.
Comment on lines +263 to +264
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this intentional, to strip away e.g. JSON or other extensions that might be present?

Also, why is it correct to change the data type merely because the data are all-null?
Shouldn't it just come back as an all-null StructArray with variant annotations?

field
.as_ref()
.clone()
.with_data_type(child.data_type().clone())
} else {
field.as_ref().clone()
};
updated_fields.push(updated_field);

Ok(child)
})
.collect::<Result<Vec<_>>>()?;
.collect();
let children = children?;
Comment on lines -229 to +277
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This bit seems like a gratuitous change?

let children = fields...collect::<Result<Vec_>>>()?;

becomes

let children: Result<Vec<_>> = fields...collect();
let children = children?;


let struct_nulls = target.nulls().cloned();

return Ok(Arc::new(StructArray::try_new(
fields.clone(),
updated_fields.into(),
children,
struct_nulls,
)?));
Expand All @@ -252,9 +300,9 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti
.value_field()
.is_none_or(|v| v.null_count() == v.len())
{
// Here we need to gate against the case where the `typed_value` is null but data is in the `value` column.
// 1. If the `value` column is null, or
// 2. If every row in the `value` column is null
// When shredding is partial, some values may remain in the `value` column
// (as raw variant binary) while `typed_value` is null. Only return the
// typed value if the `value` column is entirely null (complete shredding).

// This is a perfect shredding, where the value is entirely shredded out,
// so we can just return the typed value.
Expand All @@ -265,15 +313,30 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti

/// Returns an array with the specified path extracted from the variant values.
///
/// The return array type depends on the `as_type` field of the options parameter
/// The return array type depends on the `as_type` field of the options parameter:
/// 1. `as_type: None`: a VariantArray is returned. The values in this new VariantArray will point
/// to the specified path.
/// 2. `as_type: Some(<specific field>)`: an array of the specified type is returned.
///
/// TODO: How would a caller request a struct or list type where the fields/elements can be any
/// variant? Caller can pass None as the requested type to fetch a specific path, but it would
/// quickly become annoying (and inefficient) to call `variant_get` for each leaf value in a struct or
/// list and then try to assemble the results.
/// When extracting a struct type (`DataType::Struct`), you can mix typed fields with variant fields
/// by marking fields with the [`VariantType`] extension type. Fields with `VariantType` metadata
/// will be extracted as VariantArrays, preserving the original variant representation.
///
/// Example:
/// ```rust,ignore
/// use parquet_variant_compute::VariantType;
/// use arrow_schema::extension::ExtensionType;
///
/// // Extract a struct where "name" is converted to Int32, but "data" remains a Variant
/// let fields = Fields::from(vec![
/// Field::new("name", DataType::Int32, true),
/// // Use VariantType extension metadata to request extraction as VariantArray
/// Field::new("data", DataType::Struct(Fields::empty()), true)
/// .with_extension_type(VariantType),
/// ]);
/// let options = GetOptions::new()
/// .with_as_type(Some(Arc::new(Field::new("result", DataType::Struct(fields), true))));
/// ```
pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> {
let variant_array = VariantArray::try_new(input)?;

Expand Down Expand Up @@ -335,7 +398,8 @@ mod test {
use super::{GetOptions, variant_get};
use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
use crate::{
VariantArray, VariantArrayBuilder, cast_to_variant, json_to_variant, shred_variant,
VariantArray, VariantArrayBuilder, VariantType, cast_to_variant, json_to_variant,
shred_variant,
};
use arrow::array::{
Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray, Date32Array,
Expand Down Expand Up @@ -4237,4 +4301,143 @@ mod test {
);
}
}

/// Test extracting a struct with mixed typed and variant fields.
/// Fields with VariantType extension metadata should be extracted as VariantArrays.
#[test]
fn test_struct_extraction_with_variant_fields() {
// Create test data: [{"id": 1, "name": "Alice", "data": {"score": 95}},
// {"id": 2, "name": "Bob", "data": null}]
let json_strings = vec![
r#"{"id": 1, "name": "Alice", "data": {"score": 95}}"#,
r#"{"id": 2, "name": "Bob", "data": null}"#,
r#"{"id": 3, "name": null, "data": {"level": 5}}"#,
];
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(json_strings));
let variant_array = json_to_variant(&string_array).unwrap();

// Request struct where:
// - "id" is extracted as Int32
// - "name" is extracted as String (Utf8)
// - "data" is extracted as Variant (using VariantType extension metadata)
let struct_fields = Fields::from(vec![
Field::new("id", DataType::Int32, true),
Field::new("name", DataType::Utf8, true),
// Use VariantType extension metadata to request extraction as VariantArray.
// The data type must be Struct to satisfy VariantType::supports_data_type.
Field::new("data", DataType::Struct(Fields::empty()), true)
.with_extension_type(VariantType),
]);
let struct_type = DataType::Struct(struct_fields);

let options = GetOptions {
path: VariantPath::default(),
as_type: Some(Arc::new(Field::new("result", struct_type, true))),
cast_options: CastOptions::default(),
};

let variant_array_ref = ArrayRef::from(variant_array);
let result = variant_get(&variant_array_ref, options).unwrap();

// Verify the result is a StructArray with 3 fields
let struct_result = result.as_any().downcast_ref::<StructArray>().unwrap();
assert_eq!(struct_result.len(), 3);
assert_eq!(struct_result.num_columns(), 3);

// Verify "id" field (Int32)
let id_field = struct_result
.column(0)
.as_any()
.downcast_ref::<Int32Array>()
.unwrap();
assert_eq!(id_field.value(0), 1);
assert_eq!(id_field.value(1), 2);
assert_eq!(id_field.value(2), 3);

// Verify "name" field (String/Utf8)
let name_field = struct_result
.column(1)
.as_any()
.downcast_ref::<StringArray>()
.unwrap();
assert_eq!(name_field.value(0), "Alice");
assert_eq!(name_field.value(1), "Bob");
assert!(name_field.is_null(2)); // null name in row 2

// Verify "data" field schema has VariantType extension metadata
let data_schema_field = struct_result
.fields()
.iter()
.find(|f| f.name() == "data")
.unwrap();
assert!(
data_schema_field
.try_extension_type::<VariantType>()
.is_ok(),
"data field should have VariantType extension metadata"
);

// Verify "data" field (VariantArray)
let data_field = struct_result.column(2);
// The data field should be a StructArray representing VariantArray's internal structure
// It has columns: metadata, value (optional), typed_value (optional)
let data_as_struct = data_field.as_any().downcast_ref::<StructArray>();
assert!(
data_as_struct.is_some(),
"data field should be a VariantArray (represented as StructArray)"
);

// Verify we can access the variant values
let data_variant_array = VariantArray::try_new(data_field).unwrap();
assert_eq!(data_variant_array.len(), 3);

// Row 0: data = {"score": 95}
let data0 = data_variant_array.value(0);
assert!(matches!(data0, Variant::Object(_)));

// Row 1: data = null
assert!(
data_variant_array.is_null(1) || matches!(data_variant_array.value(1), Variant::Null)
);

// Row 2: data = {"level": 5}
let data2 = data_variant_array.value(2);
assert!(matches!(data2, Variant::Object(_)));
}

/// Test that requesting a variant field absent in all rows does not panic.
/// Regression test: with_extension_type(VariantType) used to panic on NullArray.
#[test]
fn test_struct_extraction_missing_variant_field_no_panic() {
// Data has "id" but NOT "missing_field"
let json_strings = vec![r#"{"id": 1}"#, r#"{"id": 2}"#];
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(json_strings));
let variant_array = json_to_variant(&string_array).unwrap();

// Request struct with a variant field that doesn't exist in any row
let struct_fields = Fields::from(vec![
Field::new("id", DataType::Int32, true),
Field::new("missing_field", DataType::Struct(Fields::empty()), true)
.with_extension_type(VariantType),
]);
let struct_type = DataType::Struct(struct_fields);

let options = GetOptions {
path: VariantPath::default(),
as_type: Some(Arc::new(Field::new("result", struct_type, true))),
cast_options: CastOptions::default(),
};

let variant_array_ref = ArrayRef::from(variant_array);
// This should not panic
let result = variant_get(&variant_array_ref, options).unwrap();

let struct_result = result.as_any().downcast_ref::<StructArray>().unwrap();
assert_eq!(struct_result.len(), 2);
assert_eq!(struct_result.num_columns(), 2);

// The missing variant field should be all nulls
let missing_col = struct_result.column(1);
assert_eq!(missing_col.null_count(), missing_col.len());
}
}
Loading