diff --git a/parquet/benches/metadata.rs b/parquet/benches/metadata.rs index 949e0d98ea39..ec6bf93963a5 100644 --- a/parquet/benches/metadata.rs +++ b/parquet/benches/metadata.rs @@ -70,6 +70,7 @@ fn encoded_meta() -> Vec { distinct_count: None, max_value: Some(vec![rng.random(); 8]), min_value: Some(vec![rng.random(); 8]), + nan_count: None, is_max_value_exact: Some(true), is_min_value_exact: Some(true), }; diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index 2deb3c535a12..c0acb74a5937 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -287,6 +287,7 @@ impl FallbackEncoder { encoding, min_value, max_value, + nan_count: None, variable_length_bytes, }) } @@ -409,6 +410,7 @@ impl DictEncoder { encoding: Encoding::RLE_DICTIONARY, min_value, max_value, + nan_count: None, variable_length_bytes, } } diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index 7371c72a5896..6e2af3190726 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -63,6 +63,7 @@ pub struct DataPageValues { pub encoding: Encoding, pub min_value: Option, pub max_value: Option, + pub nan_count: Option, pub variable_length_bytes: Option, } @@ -131,6 +132,7 @@ pub struct ColumnValueEncoderImpl { statistics_enabled: EnabledStatistics, min_value: Option, max_value: Option, + nan_count: Option, bloom_filter: Option, variable_length_bytes: Option, } @@ -148,6 +150,17 @@ impl ColumnValueEncoderImpl { // INTERVAL has undefined sort order, so don't write min/max stats for it && self.descr.converted_type() != ConvertedType::INTERVAL { + // Count NaN values for floating point types + if matches!(T::T::PHYSICAL_TYPE, Type::FLOAT | Type::DOUBLE) + || (T::T::PHYSICAL_TYPE == Type::FIXED_LEN_BYTE_ARRAY + && self.descr.logical_type() == Some(LogicalType::Float16)) + { + let nan_count = slice.iter().filter(|v| 
is_nan(&self.descr, *v)).count() as u64; + if nan_count > 0 { + *self.nan_count.get_or_insert(0) += nan_count; + } + } + if let Some((min, max)) = self.min_max(slice, None) { update_min(&self.descr, &min, &mut self.min_value); update_max(&self.descr, &max, &mut self.max_value); @@ -210,6 +223,7 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { bloom_filter, min_value: None, max_value: None, + nan_count: None, variable_length_bytes: None, }) } @@ -304,6 +318,7 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { num_values: std::mem::take(&mut self.num_values), min_value: self.min_value.take(), max_value: self.max_value.take(), + nan_count: self.nan_count.take(), variable_length_bytes: self.variable_length_bytes.take(), }) } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 9374e226b87f..88fc356ff483 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -196,6 +196,7 @@ struct PageMetrics { num_buffered_values: u32, num_buffered_rows: u32, num_page_nulls: u64, + num_page_nans: Option, repetition_level_histogram: Option, definition_level_histogram: Option, } @@ -223,6 +224,7 @@ impl PageMetrics { self.num_buffered_values = 0; self.num_buffered_rows = 0; self.num_page_nulls = 0; + self.num_page_nans = None; self.repetition_level_histogram .as_mut() .map(LevelHistogram::reset); @@ -259,6 +261,7 @@ struct ColumnMetrics { min_column_value: Option, max_column_value: Option, num_column_nulls: u64, + num_column_nans: Option, column_distinct_count: Option, variable_length_bytes: Option, repetition_level_histogram: Option, @@ -770,17 +773,31 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { page_statistics: Option<&ValueStatistics>, page_variable_length_bytes: Option, ) { + // Determine if this is a floating-point column + let is_float_column = matches!(self.descr.physical_type(), Type::FLOAT | Type::DOUBLE) + || (self.descr.physical_type() == Type::FIXED_LEN_BYTE_ARRAY + && 
self.descr.logical_type() == Some(LogicalType::Float16)); + // update the column index let null_page = (self.page_metrics.num_buffered_rows as u64) == self.page_metrics.num_page_nulls; // a page contains only null values, // and writers have to set the corresponding entries in min_values and max_values to byte[0] if null_page && self.column_index_builder.valid() { + // For float columns, always provide Some(n), even if n is 0 + // For non-float columns, always provide None + let nan_count = if is_float_column { + Some(self.page_metrics.num_page_nans.unwrap_or(0) as i64) + } else { + None + }; + self.column_index_builder.append( null_page, vec![], vec![], self.page_metrics.num_page_nulls as i64, + nan_count, ); } else if self.column_index_builder.valid() { // from page statistics @@ -814,6 +831,14 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } self.last_non_null_data_page_min_max = Some((new_min.clone(), new_max.clone())); + // For float columns, always provide Some(n), even if n is 0 + // For non-float columns, always provide None + let nan_count = if is_float_column { + Some(stat.nan_count_opt().unwrap_or(0) as i64) + } else { + None + }; + if self.can_truncate_value() { self.column_index_builder.append( null_page, @@ -828,6 +853,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { ) .0, self.page_metrics.num_page_nulls as i64, + nan_count, ); } else { self.column_index_builder.append( @@ -835,6 +861,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { stat.min_bytes_opt().unwrap().to_vec(), stat.max_bytes_opt().unwrap().to_vec(), self.page_metrics.num_page_nulls as i64, + nan_count, ); } } @@ -1003,6 +1030,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { self.column_metrics.num_column_nulls += self.page_metrics.num_page_nulls; + if let Some(nan_count) = values_data.nan_count { + *self.column_metrics.num_column_nans.get_or_insert(0) += nan_count; + self.page_metrics.num_page_nans = Some(nan_count); + } 
+ let page_statistics = match (values_data.min_value, values_data.max_value) { (Some(min), Some(max)) => { // Update chunk level statistics @@ -1016,7 +1048,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { None, Some(self.page_metrics.num_page_nulls), false, - ), + ) + .with_nan_count(values_data.nan_count), ) } _ => None, @@ -1190,6 +1223,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { Some(self.column_metrics.num_column_nulls), false, ) + .with_nan_count(self.column_metrics.num_column_nans) .with_backwards_compatible_min_max(backwards_compatible_min_max) .into(); @@ -1395,7 +1429,24 @@ fn update_stat( /// Evaluate `a > b` according to underlying logical type. fn compare_greater(descr: &ColumnDescriptor, a: &T, b: &T) -> bool { + use crate::util::ieee754; + use std::cmp::Ordering; + match T::PHYSICAL_TYPE { + Type::FLOAT => { + // Use IEEE 754 total order for float comparisons + // SAFETY: We know T is f32 when PHYSICAL_TYPE is FLOAT + let a_f32 = unsafe { *(a as *const T as *const f32) }; + let b_f32 = unsafe { *(b as *const T as *const f32) }; + return ieee754::compare_f32(a_f32, b_f32) == Ordering::Greater; + } + Type::DOUBLE => { + // Use IEEE 754 total order for double comparisons + // SAFETY: We know T is f64 when PHYSICAL_TYPE is DOUBLE + let a_f64 = unsafe { *(a as *const T as *const f64) }; + let b_f64 = unsafe { *(b as *const T as *const f64) }; + return ieee754::compare_f64(a_f64, b_f64) == Ordering::Greater; + } Type::INT32 | Type::INT64 => { if let Some(LogicalType::Integer { is_signed: false, .. 
@@ -1471,9 +1522,12 @@ fn compare_greater_unsigned_int(a: &T, b: &T) -> bool { #[inline] fn compare_greater_f16(a: &[u8], b: &[u8]) -> bool { + use crate::util::ieee754; + use std::cmp::Ordering; + let a = f16::from_le_bytes(a.try_into().unwrap()); let b = f16::from_le_bytes(b.try_into().unwrap()); - a > b + ieee754::compare_f16(a, b) == Ordering::Greater } /// Signed comparison of bytes arrays @@ -2619,6 +2673,77 @@ mod tests { ); } + #[test] + fn test_ieee754_total_order_float() { + // Test IEEE 754 total order for f32 + // Order should be: -NaN < -Inf < -1.0 < -0.0 < +0.0 < 1.0 < +Inf < +NaN + let neg_nan = f32::from_bits(0xffc00000); + let neg_inf = f32::NEG_INFINITY; + let neg_one = -1.0_f32; + let neg_zero = -0.0_f32; + let pos_zero = 0.0_f32; + let pos_one = 1.0_f32; + let pos_inf = f32::INFINITY; + let pos_nan = f32::from_bits(0x7fc00000); + + let values = vec![ + pos_nan, neg_zero, pos_inf, neg_one, neg_nan, pos_one, neg_inf, pos_zero, + ]; + + let stats = statistics_roundtrip::(&values); + if let Statistics::Float(stats) = stats { + // With IEEE 754 total order, min should be -NaN, max should be +NaN + // But since we filter out NaN values, min should be -Inf, max should be +Inf + assert_eq!(stats.min_opt().unwrap(), &neg_inf); + assert_eq!(stats.max_opt().unwrap(), &pos_inf); + assert_eq!(stats.nan_count_opt(), Some(2)); // neg_nan and pos_nan + } else { + panic!("Expected float statistics"); + } + } + + #[test] + fn test_ieee754_total_order_double() { + // Test IEEE 754 total order for f64 + let neg_nan = f64::from_bits(0xfff8000000000000); + let neg_inf = f64::NEG_INFINITY; + let neg_one = -1.0_f64; + let neg_zero = -0.0_f64; + let pos_zero = 0.0_f64; + let pos_one = 1.0_f64; + let pos_inf = f64::INFINITY; + let pos_nan = f64::from_bits(0x7ff8000000000000); + + let values = vec![ + pos_nan, neg_zero, pos_inf, neg_one, neg_nan, pos_one, neg_inf, pos_zero, + ]; + + let stats = statistics_roundtrip::(&values); + if let Statistics::Double(stats) = stats { 
+ // With IEEE 754 total order, and NaN filtering + assert_eq!(stats.min_opt().unwrap(), &neg_inf); + assert_eq!(stats.max_opt().unwrap(), &pos_inf); + assert_eq!(stats.nan_count_opt(), Some(2)); + } else { + panic!("Expected double statistics"); + } + } + + #[test] + fn test_ieee754_total_order_zeros() { + // Test that -0.0 and +0.0 are handled correctly + let values = vec![-0.0_f32, 0.0_f32, -0.0_f32, 0.0_f32]; + + let stats = statistics_roundtrip::(&values); + if let Statistics::Float(stats) = stats { + // With IEEE 754 total order, -0.0 < +0.0 + assert_eq!(stats.min_opt().unwrap().to_bits(), (-0.0_f32).to_bits()); + assert_eq!(stats.max_opt().unwrap().to_bits(), 0.0_f32.to_bits()); + } else { + panic!("Expected float statistics"); + } + } + #[test] fn test_column_writer_check_float16_nan_middle() { let input = [f16::ONE, f16::NAN, f16::ONE + f16::ONE] @@ -2649,6 +2774,7 @@ mod tests { stats.max_opt().unwrap(), &ByteArray::from(f16::ONE + f16::ONE) ); + assert_eq!(stats.nan_count_opt(), Some(1)); } #[test] @@ -2665,6 +2791,7 @@ mod tests { stats.max_opt().unwrap(), &ByteArray::from(f16::ONE + f16::ONE) ); + assert_eq!(stats.nan_count_opt(), Some(1)); } #[test] @@ -2678,6 +2805,7 @@ mod tests { assert!(stats.min_bytes_opt().is_none()); assert!(stats.max_bytes_opt().is_none()); assert!(stats.is_min_max_backwards_compatible()); + assert_eq!(stats.nan_count_opt(), Some(2)); } #[test] @@ -2736,24 +2864,26 @@ mod tests { fn test_float_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f32::NAN, 2.0]); assert!(stats.is_min_max_backwards_compatible()); - if let Statistics::Float(stats) = stats { + if let Statistics::Float(stats) = &stats { assert_eq!(stats.min_opt().unwrap(), &1.0); assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Float"); } + assert_eq!(stats.nan_count_opt(), Some(1)); } #[test] fn test_float_statistics_nan_start() { let stats = statistics_roundtrip::(&[f32::NAN, 1.0, 2.0]); 
assert!(stats.is_min_max_backwards_compatible()); - if let Statistics::Float(stats) = stats { + if let Statistics::Float(stats) = &stats { assert_eq!(stats.min_opt().unwrap(), &1.0); assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Float"); } + assert_eq!(stats.nan_count_opt(), Some(1)); } #[test] @@ -2763,6 +2893,7 @@ mod tests { assert!(stats.max_bytes_opt().is_none()); assert!(stats.is_min_max_backwards_compatible()); assert!(matches!(stats, Statistics::Float(_))); + assert_eq!(stats.nan_count_opt(), Some(2)); } #[test] @@ -2823,24 +2954,26 @@ mod tests { fn test_double_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f64::NAN, 2.0]); assert!(stats.is_min_max_backwards_compatible()); - if let Statistics::Double(stats) = stats { + if let Statistics::Double(stats) = &stats { assert_eq!(stats.min_opt().unwrap(), &1.0); assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Double"); } + assert_eq!(stats.nan_count_opt(), Some(1)); } #[test] fn test_double_statistics_nan_start() { let stats = statistics_roundtrip::(&[f64::NAN, 1.0, 2.0]); assert!(stats.is_min_max_backwards_compatible()); - if let Statistics::Double(stats) = stats { + if let Statistics::Double(stats) = &stats { assert_eq!(stats.min_opt().unwrap(), &1.0); assert_eq!(stats.max_opt().unwrap(), &2.0); } else { panic!("expecting Statistics::Double"); } + assert_eq!(stats.nan_count_opt(), Some(1)); } #[test] @@ -2850,6 +2983,7 @@ mod tests { assert!(stats.max_bytes_opt().is_none()); assert!(matches!(stats, Statistics::Double(_))); assert!(stats.is_min_max_backwards_compatible()); + assert_eq!(stats.nan_count_opt(), Some(2)); } #[test] @@ -2906,6 +3040,108 @@ mod tests { } } + #[test] + fn test_float_statistics_infinity_with_nan() { + // Test column with Infinity and NaN values + let stats = + statistics_roundtrip::(&[1.0, f32::INFINITY, f32::NAN, 2.0, f32::NAN]); + assert!(stats.is_min_max_backwards_compatible()); + if let 
Statistics::Float(stats) = &stats { + assert_eq!(stats.min_opt().unwrap(), &1.0); + assert_eq!(stats.max_opt().unwrap(), &f32::INFINITY); + } else { + panic!("expecting Statistics::Float"); + } + assert_eq!(stats.nan_count_opt(), Some(2)); + } + + #[test] + fn test_float_statistics_neg_infinity_with_nan() { + // Test column with -Infinity and NaN values + let stats = statistics_roundtrip::(&[ + f32::NEG_INFINITY, + -1.0, + f32::NAN, + 0.0, + f32::NAN, + 1.0, + ]); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Float(stats) = &stats { + assert_eq!(stats.min_opt().unwrap(), &f32::NEG_INFINITY); + assert_eq!(stats.max_opt().unwrap(), &1.0); + } else { + panic!("expecting Statistics::Float"); + } + assert_eq!(stats.nan_count_opt(), Some(2)); + } + + #[test] + fn test_float_statistics_both_infinities_with_nan() { + // Test column with both +Infinity, -Infinity and NaN values + let stats = statistics_roundtrip::(&[ + f32::NEG_INFINITY, + f32::NAN, + 0.0, + f32::INFINITY, + f32::NAN, + ]); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Float(stats) = &stats { + assert_eq!(stats.min_opt().unwrap(), &f32::NEG_INFINITY); + assert_eq!(stats.max_opt().unwrap(), &f32::INFINITY); + } else { + panic!("expecting Statistics::Float"); + } + assert_eq!(stats.nan_count_opt(), Some(2)); + } + + #[test] + fn test_double_statistics_infinity_with_nan() { + // Test with f64 (double) type + let stats = statistics_roundtrip::(&[ + 1.0, + f64::INFINITY, + f64::NAN, + f64::NEG_INFINITY, + f64::NAN, + 2.0, + ]); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Double(stats) = &stats { + assert_eq!(stats.min_opt().unwrap(), &f64::NEG_INFINITY); + assert_eq!(stats.max_opt().unwrap(), &f64::INFINITY); + } else { + panic!("expecting Statistics::Double"); + } + + assert_eq!(stats.nan_count_opt(), Some(2)); + } + + #[test] + fn test_float16_statistics_infinity_with_nan() { + // Test Float16 with Infinity and NaN + let input = 
[ + f16::ONE, + f16::INFINITY, + f16::NAN, + f16::NEG_INFINITY, + f16::NAN, + ] + .into_iter() + .map(|s| ByteArray::from(s).into()) + .collect::>(); + + let stats = float16_statistics_roundtrip(&input); + assert!(stats.is_min_max_backwards_compatible()); + assert_eq!( + stats.min_opt().unwrap(), + &ByteArray::from(f16::NEG_INFINITY) + ); + assert_eq!(stats.max_opt().unwrap(), &ByteArray::from(f16::INFINITY)); + assert_eq!(stats.nan_count_opt(), Some(2)); + } + #[test] fn test_compare_greater_byte_array_decimals() { assert!(!compare_greater_byte_array_decimals(&[], &[],),); diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index c33198809297..f20f8e4fef63 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -1522,6 +1522,7 @@ pub struct ColumnIndexBuilder { min_values: Vec>, max_values: Vec>, null_counts: Vec, + nan_counts: Vec>, boundary_order: BoundaryOrder, /// contains the concatenation of the histograms of all pages repetition_level_histograms: Option>, @@ -1551,6 +1552,7 @@ impl ColumnIndexBuilder { min_values: Vec::new(), max_values: Vec::new(), null_counts: Vec::new(), + nan_counts: Vec::new(), boundary_order: BoundaryOrder::UNORDERED, repetition_level_histograms: None, definition_level_histograms: None, @@ -1559,17 +1561,24 @@ impl ColumnIndexBuilder { } /// Append statistics for the next page + /// + /// For floating-point columns (FLOAT, DOUBLE, or FLOAT16), `nan_count` must always + /// be `Some(n)`, even if n is 0. For non-floating-point columns, `nan_count` must + /// always be `None`. This requirement ensures correct serialization according to + /// the Parquet specification. 
pub fn append( &mut self, null_page: bool, min_value: Vec, max_value: Vec, null_count: i64, + nan_count: Option, ) { self.null_pages.push(null_page); self.min_values.push(min_value); self.max_values.push(max_value); self.null_counts.push(null_count); + self.nan_counts.push(nan_count); } /// Append the given page-level histograms to the [`ColumnIndex`] histograms. @@ -1613,6 +1622,34 @@ impl ColumnIndexBuilder { /// /// Note: callers should check [`Self::valid`] before calling this method pub fn build_to_thrift(self) -> ColumnIndex { + // Parquet spec requires nan_counts to be either present for all pages or absent entirely. + // Callers must ensure consistency: + // - For floating-point columns: all pages must have Some(n) + // - For non-floating-point columns: all pages must have None + let nan_counts = if !self.nan_counts.is_empty() { + let has_some = self.nan_counts.iter().any(|x| x.is_some()); + let has_none = self.nan_counts.iter().any(|x| x.is_none()); + + if has_some && !has_none { + Some(self.nan_counts.into_iter().map(|x| x.unwrap()).collect()) + } else if !has_some && has_none { + None + } else { + debug_assert!( + false, + "Mixed Some/None in nan_counts - caller should provide consistent values" + ); + Some( + self.nan_counts + .into_iter() + .map(|x| x.unwrap_or(0)) + .collect(), + ) + } + } else { + None + }; + ColumnIndex::new( self.null_pages, self.min_values, @@ -1621,6 +1658,7 @@ impl ColumnIndexBuilder { self.null_counts, self.repetition_level_histograms, self.definition_level_histograms, + nan_counts, ) } } @@ -1968,14 +2006,14 @@ mod tests { .build(); #[cfg(not(feature = "encryption"))] - let base_expected_size = 2312; + let base_expected_size = 2376; #[cfg(feature = "encryption")] - let base_expected_size = 2648; + let base_expected_size = 2712; assert_eq!(parquet_meta.memory_size(), base_expected_size); let mut column_index = ColumnIndexBuilder::new(); - column_index.append(false, vec![1u8], vec![2u8, 3u8], 4); + column_index.append(false, 
vec![1u8], vec![2u8, 3u8], 4, None); let column_index = column_index.build_to_thrift(); let native_index = NativeIndex::::try_new(column_index).unwrap(); @@ -1998,9 +2036,9 @@ mod tests { .build(); #[cfg(not(feature = "encryption"))] - let bigger_expected_size = 2816; + let bigger_expected_size = 2944; #[cfg(feature = "encryption")] - let bigger_expected_size = 3152; + let bigger_expected_size = 3280; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 4b97b5fc55b5..0ca4843414c0 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -17,7 +17,7 @@ use std::{io::Read, ops::Range, sync::Arc}; -use crate::basic::ColumnOrder; +use crate::basic::{ColumnOrder, SortOrder}; #[cfg(feature = "encryption")] use crate::encryption::{ decrypt::{FileDecryptionProperties, FileDecryptor}, @@ -1106,6 +1106,9 @@ impl ParquetMetaDataReader { ); res.push(ColumnOrder::TYPE_DEFINED_ORDER(sort_order)); } + TColumnOrder::IEEE754TOTALORDER(_) => { + res.push(ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED)); + } } } Ok(Some(res)) diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index a66509e14c7a..9b66fedd77b7 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -36,6 +36,8 @@ pub struct PageIndex { pub max: Option, /// Null values in the page pub null_count: Option, + /// NaN values in the page (only for floating point types) + pub nan_count: Option, /// Repetition level histogram for the page /// /// `repetition_level_histogram[i]` is a count of how many values are at repetition level `i`. 
@@ -69,6 +71,11 @@ impl PageIndex { self.null_count } + /// Returns the number of NaN values in the page + pub fn nan_count(&self) -> Option { + self.nan_count + } + /// Returns the repetition level histogram for the page pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> { self.repetition_level_histogram.as_ref() @@ -194,6 +201,11 @@ impl NativeIndex { .map(|x| x.into_iter().map(Some).collect::>()) .unwrap_or_else(|| vec![None; len]); + let nan_counts = index + .nan_counts + .map(|x| x.into_iter().map(Some).collect::>()) + .unwrap_or_else(|| vec![None; len]); + // histograms are a 1D array encoding a 2D num_pages X num_levels matrix. let to_page_histograms = |opt_hist: Option>| { if let Some(hist) = opt_hist { @@ -222,11 +234,12 @@ impl NativeIndex { .zip(index.max_values.iter()) .zip(index.null_pages.into_iter()) .zip(null_counts.into_iter()) + .zip(nan_counts.into_iter()) .zip(rep_hists.into_iter()) .zip(def_hists.into_iter()) .map( |( - ((((min, max), is_null), null_count), repetition_level_histogram), + (((((min, max), is_null), null_count), nan_count), repetition_level_histogram), definition_level_histogram, )| { let (min, max) = if is_null { @@ -241,6 +254,7 @@ impl NativeIndex { min, max, null_count, + nan_count, repetition_level_histogram, definition_level_histogram, }) @@ -273,6 +287,12 @@ impl NativeIndex { .map(|x| x.null_count()) .collect::>>(); + let nan_counts = self + .indexes + .iter() + .map(|x| x.nan_count()) + .collect::>>(); + // Concatenate page histograms into a single Option let repetition_level_histograms = self .indexes @@ -296,6 +316,7 @@ impl NativeIndex { null_counts, repetition_level_histograms, definition_level_histograms, + nan_counts, ) } } @@ -310,6 +331,7 @@ mod tests { min: Some(-123), max: Some(234), null_count: Some(0), + nan_count: None, repetition_level_histogram: Some(LevelHistogram::from(vec![1, 2])), definition_level_histogram: Some(LevelHistogram::from(vec![1, 2, 3])), }; @@ -335,6 +357,7 @@ mod tests { 
min: None, max: None, null_count: None, + nan_count: None, repetition_level_histogram: None, definition_level_histogram: None, }; @@ -363,6 +386,7 @@ mod tests { null_counts: None, repetition_level_histograms: None, definition_level_histograms: None, + nan_counts: None, boundary_order: BoundaryOrder::UNORDERED, }; diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 02729a5016bb..3ad57b681492 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -141,6 +141,8 @@ pub fn from_thrift( let null_count = Some(null_count as u64); // Generic distinct count (count of distinct values occurring) let distinct_count = stats.distinct_count.map(|value| value as u64); + // Generic nan count for floating point types + let nan_count = stats.nan_count.map(|value| value as u64); // Whether or not statistics use deprecated min/max fields. let old_format = stats.min_value.is_none() && stats.max_value.is_none(); // Generic min value as bytes. @@ -224,19 +226,29 @@ pub fn from_thrift( }; Statistics::int96(min, max, distinct_count, null_count, old_format) } - Type::FLOAT => Statistics::float( - min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())), - max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())), - distinct_count, - null_count, - old_format, + Type::FLOAT => Statistics::Float( + ValueStatistics::new( + min.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())), + max.map(|data| f32::from_le_bytes(data[..4].try_into().unwrap())), + distinct_count, + null_count, + old_format, + ) + .with_nan_count(nan_count) + .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false)) + .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)), ), - Type::DOUBLE => Statistics::double( - min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())), - max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())), - distinct_count, - null_count, - old_format, + Type::DOUBLE => Statistics::Double( + 
ValueStatistics::new( + min.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())), + max.map(|data| f64::from_le_bytes(data[..8].try_into().unwrap())), + distinct_count, + null_count, + old_format, + ) + .with_nan_count(nan_count) + .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false)) + .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)), ), Type::BYTE_ARRAY => Statistics::ByteArray( ValueStatistics::new( @@ -257,6 +269,12 @@ pub fn from_thrift( null_count, old_format, ) + // Note: We set nan_count here even though we can't verify if this is Float16. + // The spec says nan_count should only be set for Float16 logical type, + // but from_thrift doesn't have access to logical type information. + // Writers should only set nan_count for Float16, and readers should + // handle this gracefully. + .with_nan_count(nan_count) .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false)) .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)), ), @@ -282,6 +300,11 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option { .distinct_count_opt() .and_then(|value| i64::try_from(value).ok()); + // record nan count if it can fit in i64 + let nan_count = stats + .nan_count_opt() + .and_then(|value| i64::try_from(value).ok()); + let mut thrift_stats = TStatistics { max: None, min: None, @@ -291,6 +314,7 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option { min_value: None, is_max_value_exact: None, is_min_value_exact: None, + nan_count, }; // Get min/max if set. @@ -432,6 +456,11 @@ impl Statistics { statistics_enum_func![self, null_count_opt] } + /// Returns NaN count for floating point types, if known. + pub fn nan_count_opt(&self) -> Option { + statistics_enum_func![self, nan_count_opt] + } + /// Returns `true` if the min value is set, and is an exact min value. 
pub fn min_is_exact(&self) -> bool { statistics_enum_func![self, min_is_exact] @@ -495,6 +524,8 @@ pub struct ValueStatistics { // Distinct count could be omitted in some cases distinct_count: Option, null_count: Option, + // NaN count for floating point types + nan_count: Option, // Whether or not the min or max values are exact, or truncated. is_max_value_exact: bool, @@ -525,6 +556,7 @@ impl ValueStatistics { max, distinct_count, null_count, + nan_count: None, is_min_max_deprecated, is_min_max_backwards_compatible: is_min_max_deprecated, } @@ -610,6 +642,16 @@ impl ValueStatistics { self.null_count } + /// Returns NaN count for floating point types. + pub fn nan_count_opt(&self) -> Option { + self.nan_count + } + + /// Set the NaN count for floating point types. + pub fn with_nan_count(self, nan_count: Option) -> Self { + Self { nan_count, ..self } + } + /// Returns `true` if statistics were created using old min/max fields. fn is_min_max_deprecated(&self) -> bool { self.is_min_max_deprecated @@ -711,6 +753,7 @@ mod tests { min_value: None, is_max_value_exact: None, is_min_value_exact: None, + nan_count: None, }; from_thrift(Type::INT32, Some(thrift_stats)).unwrap(); @@ -1075,4 +1118,138 @@ mod tests { is_min_max_deprecated, )) } + + #[test] + fn test_nan_count_float() { + // Test NaN count for f32 + let stats = Statistics::Float( + ValueStatistics::new(Some(1.0_f32), Some(5.0_f32), None, Some(0), false) + .with_nan_count(Some(3)), + ); + + assert_eq!(stats.nan_count_opt(), Some(3)); + + // Verify round-trip through thrift + let thrift_stats = to_thrift(Some(&stats)).unwrap(); + assert_eq!(thrift_stats.nan_count, Some(3)); + + let round_tripped = from_thrift(Type::FLOAT, Some(thrift_stats)) + .unwrap() + .unwrap(); + assert_eq!(round_tripped.nan_count_opt(), Some(3)); + } + + #[test] + fn test_nan_count_double() { + // Test NaN count for f64 + let stats = Statistics::Double( + ValueStatistics::new(Some(1.0_f64), Some(5.0_f64), None, Some(0), false) + 
.with_nan_count(Some(5)), + ); + + assert_eq!(stats.nan_count_opt(), Some(5)); + + // Verify round-trip through thrift + let thrift_stats = to_thrift(Some(&stats)).unwrap(); + assert_eq!(thrift_stats.nan_count, Some(5)); + + let round_tripped = from_thrift(Type::DOUBLE, Some(thrift_stats)) + .unwrap() + .unwrap(); + assert_eq!(round_tripped.nan_count_opt(), Some(5)); + } + + #[test] + fn test_nan_count_none_for_non_float() { + // NaN count should not be set for non-floating point types + let stats = Statistics::int32(Some(1), Some(100), None, Some(0), false); + assert_eq!(stats.nan_count_opt(), None); + + let thrift_stats = to_thrift(Some(&stats)).unwrap(); + assert_eq!(thrift_stats.nan_count, None); + } + + #[test] + fn test_nan_count_backwards_compatible() { + // Test that missing nan_count field is handled correctly + let thrift_stats = TStatistics { + min_value: Some(vec![0, 0, 0, 0]), // 0.0_f32 in bytes + max_value: Some(vec![0, 0, 128, 63]), // 1.0_f32 in bytes + null_count: Some(0), + distinct_count: None, + nan_count: None, // Not set + ..Default::default() + }; + + let stats = from_thrift(Type::FLOAT, Some(thrift_stats)) + .unwrap() + .unwrap(); + + // nan_count should be None when not provided + assert_eq!(stats.nan_count_opt(), None); + } + + #[test] + fn test_statistics_with_nan_min_max() { + // Test that when there are only NaN values, min/max should be None + let stats = Statistics::Float( + ValueStatistics::new( + None, // No min (all NaN) + None, // No max (all NaN) + None, + Some(0), + false, + ) + .with_nan_count(Some(10)), // All values are NaN + ); + + assert_eq!(stats.min_bytes_opt(), None); + assert_eq!(stats.max_bytes_opt(), None); + assert_eq!(stats.nan_count_opt(), Some(10)); + + // Verify serialization handles this case + let thrift_stats = to_thrift(Some(&stats)).unwrap(); + assert_eq!(thrift_stats.min_value, None); + assert_eq!(thrift_stats.max_value, None); + assert_eq!(thrift_stats.nan_count, Some(10)); + } + + #[test] + fn 
test_nan_count_too_large() { + // Test that nan_count larger than i64::MAX is not serialized + let stats = Statistics::Float( + ValueStatistics::new(Some(1.0_f32), Some(2.0_f32), None, Some(0), false) + .with_nan_count(Some(u64::MAX)), + ); + + let thrift_stats = to_thrift(Some(&stats)).unwrap(); + // u64::MAX can't fit in i64, so it should be None + assert_eq!(thrift_stats.nan_count, None); + } + + #[test] + fn test_nan_counts_in_column_index() { + // Test that nan_counts are properly collected in page index + use crate::file::metadata::ColumnIndexBuilder; + + // Test for floating-point column - all pages must have Some(n) + let mut float_builder = ColumnIndexBuilder::new(); + float_builder.append(false, vec![0u8; 4], vec![255u8; 4], 0, Some(5)); + float_builder.append(false, vec![0u8; 4], vec![255u8; 4], 2, Some(3)); + float_builder.append(false, vec![0u8; 4], vec![255u8; 4], 0, Some(0)); // No NaN but still Some(0) + + let float_column_index = float_builder.build_to_thrift(); + // Verify nan_counts field is properly set for float column + assert_eq!(float_column_index.nan_counts, Some(vec![5, 3, 0])); + + // Test for non-floating-point column - all pages must have None + let mut int_builder = ColumnIndexBuilder::new(); + int_builder.append(false, vec![0u8; 4], vec![255u8; 4], 0, None); + int_builder.append(false, vec![0u8; 4], vec![255u8; 4], 2, None); + int_builder.append(false, vec![0u8; 4], vec![255u8; 4], 0, None); + + let int_column_index = int_builder.build_to_thrift(); + // Verify nan_counts field is None for non-float column + assert_eq!(int_column_index.nan_counts, None); + } } diff --git a/parquet/src/format.rs b/parquet/src/format.rs index 101799d00350..5ab5d409b5a6 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -1110,10 +1110,15 @@ pub struct Statistics { pub is_max_value_exact: Option, /// If true, min_value is the actual minimum value for a column pub is_min_value_exact: Option, + /// Count of NaN values in the column. 
+ /// + /// This field is only set for columns with floating point type + /// (float, double, or Float16). NaN values are not included in min/max statistics. + pub nan_count: Option, } impl Statistics { - pub fn new(max: F1, min: F2, null_count: F3, distinct_count: F4, max_value: F5, min_value: F6, is_max_value_exact: F7, is_min_value_exact: F8) -> Statistics where F1: Into>>, F2: Into>>, F3: Into>, F4: Into>, F5: Into>>, F6: Into>>, F7: Into>, F8: Into> { + pub fn new(max: F1, min: F2, null_count: F3, distinct_count: F4, max_value: F5, min_value: F6, is_max_value_exact: F7, is_min_value_exact: F8, nan_count: F9) -> Statistics where F1: Into>>, F2: Into>>, F3: Into>, F4: Into>, F5: Into>>, F6: Into>>, F7: Into>, F8: Into>, F9: Into> { Statistics { max: max.into(), min: min.into(), @@ -1123,6 +1128,7 @@ impl Statistics { min_value: min_value.into(), is_max_value_exact: is_max_value_exact.into(), is_min_value_exact: is_min_value_exact.into(), + nan_count: nan_count.into(), } } } @@ -1138,6 +1144,7 @@ impl crate::thrift::TSerializable for Statistics { let mut f_6: Option> = None; let mut f_7: Option = None; let mut f_8: Option = None; + let mut f_9: Option = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -1177,6 +1184,10 @@ impl crate::thrift::TSerializable for Statistics { let val = i_prot.read_bool()?; f_8 = Some(val); }, + 9 => { + let val = i_prot.read_i64()?; + f_9 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -1193,6 +1204,7 @@ impl crate::thrift::TSerializable for Statistics { min_value: f_6, is_max_value_exact: f_7, is_min_value_exact: f_8, + nan_count: f_9, }; Ok(ret) } @@ -1239,6 +1251,11 @@ impl crate::thrift::TSerializable for Statistics { o_prot.write_bool(fld_var)?; o_prot.write_field_end()? } + if let Some(fld_var) = self.nan_count { + o_prot.write_field_begin(&TFieldIdentifier::new("nan_count", TType::I64, 9))?; + o_prot.write_i64(fld_var)?; + o_prot.write_field_end()? 
+ } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -4980,6 +4997,43 @@ impl crate::thrift::TSerializable for TypeDefinedOrder { } } +// +// IEEE754TotalOrder +// + +/// IEEE 754 total order for floating point values +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct IEEE754TotalOrder { +} + +impl IEEE754TotalOrder { + pub fn new() -> IEEE754TotalOrder { + IEEE754TotalOrder {} + } +} + +impl crate::thrift::TSerializable for IEEE754TotalOrder { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { + i_prot.read_struct_begin()?; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + i_prot.skip(field_ident.field_type)?; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + Ok(IEEE754TotalOrder {}) + } + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("IEEE754TotalOrder"); + o_prot.write_struct_begin(&struct_ident)?; + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + // // ColumnOrder // @@ -4987,6 +5041,7 @@ impl crate::thrift::TSerializable for TypeDefinedOrder { #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub enum ColumnOrder { TYPEORDER(TypeDefinedOrder), + IEEE754TOTALORDER(IEEE754TotalOrder), } impl crate::thrift::TSerializable for ColumnOrder { @@ -5008,6 +5063,13 @@ impl crate::thrift::TSerializable for ColumnOrder { } received_field_count += 1; }, + 2 => { + let val = IEEE754TotalOrder::read_from_in_protocol(i_prot)?; + if ret.is_none() { + ret = Some(ColumnOrder::IEEE754TOTALORDER(val)); + } + received_field_count += 1; + }, _ => { i_prot.skip(field_ident.field_type)?; received_field_count += 1; @@ -5047,6 +5109,11 @@ impl crate::thrift::TSerializable for ColumnOrder { f.write_to_out_protocol(o_prot)?; o_prot.write_field_end()?; }, + ColumnOrder::IEEE754TOTALORDER(ref f) => { + 
o_prot.write_field_begin(&TFieldIdentifier::new("IEEE_754_TOTAL_ORDER", TType::Struct, 2))?; + f.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()?; + }, } o_prot.write_field_stop()?; o_prot.write_struct_end() @@ -5298,10 +5365,17 @@ pub struct ColumnIndex { /// Same as repetition_level_histograms except for definitions levels. /// pub definition_level_histograms: Option>, + /// A list containing the number of NaN values for each page + /// Only present for columns of physical type FLOAT, DOUBLE, or logical type FLOAT16. + /// + /// If nan_counts are not present, readers MUST NOT assume all + /// NaN counts are 0. Writers SHOULD write this field if the column + /// is of applicable type, even if no NaN values are present. + pub nan_counts: Option>, } impl ColumnIndex { - pub fn new(null_pages: Vec, min_values: Vec>, max_values: Vec>, boundary_order: BoundaryOrder, null_counts: F5, repetition_level_histograms: F6, definition_level_histograms: F7) -> ColumnIndex where F5: Into>>, F6: Into>>, F7: Into>> { + pub fn new(null_pages: Vec, min_values: Vec>, max_values: Vec>, boundary_order: BoundaryOrder, null_counts: F5, repetition_level_histograms: F6, definition_level_histograms: F7, nan_counts: F8) -> ColumnIndex where F5: Into>>, F6: Into>>, F7: Into>>, F8: Into>> { ColumnIndex { null_pages, min_values, @@ -5310,6 +5384,7 @@ impl ColumnIndex { null_counts: null_counts.into(), repetition_level_histograms: repetition_level_histograms.into(), definition_level_histograms: definition_level_histograms.into(), + nan_counts: nan_counts.into(), } } } @@ -5324,6 +5399,7 @@ impl crate::thrift::TSerializable for ColumnIndex { let mut f_5: Option> = None; let mut f_6: Option> = None; let mut f_7: Option> = None; + let mut f_8: Option> = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -5395,6 +5471,16 @@ impl crate::thrift::TSerializable for ColumnIndex { i_prot.read_list_end()?; f_7 = Some(val); }, + 8 => { + let 
list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_18 = i_prot.read_i64()?; + val.push(list_elem_18); + } + i_prot.read_list_end()?; + f_8 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -5414,6 +5500,7 @@ impl crate::thrift::TSerializable for ColumnIndex { null_counts: f_5, repetition_level_histograms: f_6, definition_level_histograms: f_7, + nan_counts: f_8, }; Ok(ret) } @@ -5471,6 +5558,15 @@ impl crate::thrift::TSerializable for ColumnIndex { o_prot.write_list_end()?; o_prot.write_field_end()? } + if let Some(ref fld_var) = self.nan_counts { + o_prot.write_field_begin(&TFieldIdentifier::new("nan_counts", TType::List, 8))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } o_prot.write_field_stop()?; o_prot.write_struct_end() } diff --git a/parquet/src/thrift.rs b/parquet/src/thrift.rs index fc391abe87d7..a703674328c0 100644 --- a/parquet/src/thrift.rs +++ b/parquet/src/thrift.rs @@ -343,6 +343,7 @@ mod tests { null_counts: None, repetition_level_histograms: None, definition_level_histograms: None, + nan_counts: None, }; assert_eq!(&index, &expected); @@ -364,6 +365,7 @@ mod tests { null_counts: None, repetition_level_histograms: None, definition_level_histograms: None, + nan_counts: None, }; assert_eq!(&index, &expected); diff --git a/parquet/src/util/ieee754.rs b/parquet/src/util/ieee754.rs new file mode 100644 index 000000000000..449de1bdcb1e --- /dev/null +++ b/parquet/src/util/ieee754.rs @@ -0,0 +1,150 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! IEEE 754 total order comparison functions for floating point types. +//! +//! According to the IEEE 754 specification, the total order is: +//! -NaN < -Infinity < -x < -0 < +0 < +x < +Infinity < +NaN +//! where x represents any finite non-zero value. +//! +//! Within NaN values, the order is determined by the payload bits. + +use half::f16; +use std::cmp::Ordering; + +/// Converts a float to its total order representation. +/// Comparing the returned integers as unsigned values yields the IEEE 754 total order. +#[inline] +fn total_order_f32(f: f32) -> u32 { + let bits = f.to_bits(); + if bits & 0x8000_0000 != 0 { + // Negative number: flip every bit (sign included) so larger magnitudes sort lower + !bits + } else { + // Positive number: set the sign bit so it sorts above all negatives + bits | 0x8000_0000 + } +} + +/// Converts a double to its total order representation. +#[inline] +fn total_order_f64(f: f64) -> u64 { + let bits = f.to_bits(); + if bits & 0x8000_0000_0000_0000 != 0 { + // Negative number: flip every bit (sign included) so larger magnitudes sort lower + !bits + } else { + // Positive number: set the sign bit so it sorts above all negatives + bits | 0x8000_0000_0000_0000 + } +} + +/// Converts an f16 to its total order representation. 
+#[inline] +fn total_order_f16(f: f16) -> u16 { + let bits = f.to_bits(); + if bits & 0x8000 != 0 { + // Negative number: flip every bit (sign included) so larger magnitudes sort lower + !bits + } else { + // Positive number: set the sign bit so it sorts above all negatives + bits | 0x8000 + } +} + +/// Compare two f32 values using IEEE 754 total order (NaNs are ordered; -0.0 sorts below +0.0). +pub fn compare_f32(a: f32, b: f32) -> Ordering { + total_order_f32(a).cmp(&total_order_f32(b)) +} + +/// Compare two f64 values using IEEE 754 total order (NaNs are ordered; -0.0 sorts below +0.0). +pub fn compare_f64(a: f64, b: f64) -> Ordering { + total_order_f64(a).cmp(&total_order_f64(b)) +} + +/// Compare two f16 values using IEEE 754 total order (NaNs are ordered; -0.0 sorts below +0.0). +pub fn compare_f16(a: f16, b: f16) -> Ordering { + total_order_f16(a).cmp(&total_order_f16(b)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_f32_total_order() { + // Test the order: -NaN < -Inf < -1.0 < -0.0 < +0.0 < 1.0 < +Inf < +NaN + let neg_nan = f32::from_bits(0xffc00000); + let neg_inf = f32::NEG_INFINITY; + let neg_one = -1.0_f32; + let neg_zero = -0.0_f32; + let pos_zero = 0.0_f32; + let pos_one = 1.0_f32; + let pos_inf = f32::INFINITY; + let pos_nan = f32::from_bits(0x7fc00000); + + assert!(compare_f32(neg_nan, neg_inf) == Ordering::Less); + assert!(compare_f32(neg_inf, neg_one) == Ordering::Less); + assert!(compare_f32(neg_one, neg_zero) == Ordering::Less); + assert!(compare_f32(neg_zero, pos_zero) == Ordering::Less); + assert!(compare_f32(pos_zero, pos_one) == Ordering::Less); + assert!(compare_f32(pos_one, pos_inf) == Ordering::Less); + assert!(compare_f32(pos_inf, pos_nan) == Ordering::Less); + } + + #[test] + fn test_f64_total_order() { + // Test the order: -NaN < -Inf < -1.0 < -0.0 < +0.0 < 1.0 < +Inf < +NaN + let neg_nan = f64::from_bits(0xfff8000000000000); + let neg_inf = f64::NEG_INFINITY; + let neg_one = -1.0_f64; + let neg_zero = -0.0_f64; + let pos_zero = 0.0_f64; + let pos_one = 1.0_f64; + let pos_inf = f64::INFINITY; + let pos_nan = f64::from_bits(0x7ff8000000000000); + + assert!(compare_f64(neg_nan, neg_inf) == 
Ordering::Less); + assert!(compare_f64(neg_inf, neg_one) == Ordering::Less); + assert!(compare_f64(neg_one, neg_zero) == Ordering::Less); + assert!(compare_f64(neg_zero, pos_zero) == Ordering::Less); + assert!(compare_f64(pos_zero, pos_one) == Ordering::Less); + assert!(compare_f64(pos_one, pos_inf) == Ordering::Less); + assert!(compare_f64(pos_inf, pos_nan) == Ordering::Less); + } + + #[test] + fn test_f16_total_order() { + // Test the order: -NaN < -Inf < -1.0 < -0.0 < +0.0 < 1.0 < +Inf < +NaN + let neg_nan = f16::from_bits(0xfe00); + let neg_inf = f16::NEG_INFINITY; + let neg_one = f16::from_f32(-1.0); + let neg_zero = f16::NEG_ZERO; + let pos_zero = f16::ZERO; + let pos_one = f16::from_f32(1.0); + let pos_inf = f16::INFINITY; + let pos_nan = f16::from_bits(0x7e00); + + assert!(compare_f16(neg_nan, neg_inf) == Ordering::Less); + assert!(compare_f16(neg_inf, neg_one) == Ordering::Less); + assert!(compare_f16(neg_one, neg_zero) == Ordering::Less); + assert!(compare_f16(neg_zero, pos_zero) == Ordering::Less); + assert!(compare_f16(pos_zero, pos_one) == Ordering::Less); + assert!(compare_f16(pos_one, pos_inf) == Ordering::Less); + assert!(compare_f16(pos_inf, pos_nan) == Ordering::Less); + } +} diff --git a/parquet/src/util/mod.rs b/parquet/src/util/mod.rs index 1431132473e9..140542e204f2 100644 --- a/parquet/src/util/mod.rs +++ b/parquet/src/util/mod.rs @@ -19,6 +19,7 @@ pub mod bit_util; mod bit_pack; pub(crate) mod interner; +pub mod ieee754; #[cfg(any(test, feature = "test_common"))] pub(crate) mod test_common;