diff --git a/CHANGES.md b/CHANGES.md index 2ae0554..f9d7046 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,4 +1,10 @@ +## 0.11.2 (2025-09-09) + +* fix statistics for Raster with Nan/Inf values and missing Nodata +* add `histogram_bins` and `histogram_range` options for histogram configuration +* add fallback for histogram calculation when data is homogenous + ## 0.11.1 (2025-08-28) * avoid serialization issue for statistics diff --git a/rio_stac/stac.py b/rio_stac/stac.py index 1745713..20dc8c3 100644 --- a/rio_stac/stac.py +++ b/rio_stac/stac.py @@ -5,7 +5,7 @@ import os import warnings from contextlib import ExitStack -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Sequence, Tuple, Union import numpy import pystac @@ -156,13 +156,16 @@ def get_eobands_info( return eo_bands -def _get_stats(arr: numpy.ma.MaskedArray, **kwargs: Any) -> Dict: +def _get_stats( + arr: numpy.ma.MaskedArray, + bins: Union[int, str, Sequence] = 10, + range: Optional[Tuple[float, float]] = None, +) -> Dict: """Calculate array statistics.""" # Avoid non masked nan/inf values arr = numpy.ma.fix_invalid(arr, copy=True) - sample, edges = numpy.histogram(arr[~arr.mask]) - return { + stats = { "statistics": { "mean": arr.mean().item(), "minimum": arr.min().item(), @@ -171,19 +174,38 @@ def _get_stats(arr: numpy.ma.MaskedArray, **kwargs: Any) -> Dict: "valid_percent": float( numpy.count_nonzero(~arr.mask) / float(arr.data.size) * 100 ), - }, - "histogram": { - "count": len(edges), - "min": float(edges.min()), - "max": float(edges.max()), - "buckets": sample.tolist(), - }, + } } + try: + sample, edges = numpy.histogram(arr[~arr.mask], bins=bins, range=range) + + except ValueError as e: + if "Too many bins for data range." in str(e): + _, counts = numpy.unique(arr[~arr.mask], return_counts=True) + warnings.warn( + f"Could not calculate the histogram, fall back to automatic bin={len(counts) + 1}.", + UserWarning, + ) + sample, edges = numpy.histogram(arr[~arr.mask], bins=len(counts) + 1) + else: + raise e + + stats["histogram"] = { + "count": len(edges), + "min": float(edges.min()), + "max": float(edges.max()), + "buckets": sample.tolist(), + } + + return stats + def get_raster_info( # noqa: C901 src_dst: Union[DatasetReader, DatasetWriter, WarpedVRT, MemoryFile], max_size: int = 1024, + histogram_bins: Union[int, str, Sequence] = 10, + histogram_range: Optional[Tuple[float, float]] = None, ) -> List[Dict]: """Get raster metadata. @@ -231,7 +253,11 @@ def get_raster_info( # noqa: C901 value["unit"] = src_dst.units[band - 1] value.update( - _get_stats(src_dst.read(indexes=band, out_shape=(height, width), masked=True)) + _get_stats( + src_dst.read(indexes=band, out_shape=(height, width), masked=True), + bins=histogram_bins, + range=histogram_range, + ) ) meta.append(value) @@ -272,7 +298,7 @@ def get_media_type( elif driver == "PNG": return pystac.MediaType.PNG - warnings.warn("Could not determine the media type from GDAL driver.") + warnings.warn("Could not determine the media type from GDAL driver.", UserWarning) return None @@ -296,6 +322,8 @@ def create_stac_item( geom_densify_pts: int = 0, geom_precision: int = -1, geographic_crs: rasterio.crs.CRS = EPSG_4326, + histogram_bins: Union[int, str, Sequence] = 10, + histogram_range: Optional[Tuple[float, float]] = None, ) -> pystac.Item: """Create a Stac Item. @@ -393,7 +421,12 @@ def create_stac_item( ) raster_info = { - "raster:bands": get_raster_info(dataset, max_size=raster_max_size) + "raster:bands": get_raster_info( + dataset, + max_size=raster_max_size, + histogram_bins=histogram_bins, + histogram_range=histogram_range, + ) } eo_info: Dict[str, List] = {} diff --git a/tests/fixtures/S2A_MSIL2A_20220722T105631_N0400_R094_T31TCN_20220722T171159.tif b/tests/fixtures/S2A_MSIL2A_20220722T105631_N0400_R094_T31TCN_20220722T171159.tif new file mode 100644 index 0000000..330d01f Binary files /dev/null and b/tests/fixtures/S2A_MSIL2A_20220722T105631_N0400_R094_T31TCN_20220722T171159.tif differ diff --git a/tests/test_create_item.py b/tests/test_create_item.py index 01c3cd4..8c41a7b 100644 --- a/tests/test_create_item.py +++ b/tests/test_create_item.py @@ -3,6 +3,8 @@ import datetime import json import os +import sys +import warnings import numpy import pystac @@ -359,6 +361,55 @@ def test_json_serialization(): assert json.dumps(item_dict) +@pytest.mark.xfail(sys.version_info < (3, 10), reason="Old numpy do not raise error") +def test_stats_unique_values(): + """issue 68 - + ref: https://github.com/developmentseed/rio-stac/issues/68 + """ + src_path = os.path.join( + PREFIX, "S2A_MSIL2A_20220722T105631_N0400_R094_T31TCN_20220722T171159.tif" + ) + with pytest.warns(UserWarning): + item = create_stac_item(src_path, with_raster=True) + assert item.validate() + item_dict = item.to_dict() + assert "raster:bands" in item_dict["assets"]["asset"] + stats = item_dict["assets"]["asset"]["raster:bands"][9]["histogram"] + assert len(stats["buckets"]) == 3 + + # Should not raise warnings when bins is good + with warnings.catch_warnings(): + warnings.simplefilter("error") + item = create_stac_item(src_path, with_raster=True, histogram_bins=3) + + with pytest.raises(ValueError): + create_stac_item( + src_path, with_raster=True, histogram_bins=3, histogram_range=(0, -1) + ) + + with pytest.raises(ValueError): + create_stac_item(src_path, with_raster=True, histogram_range=(0, -1)) + + +def test_create_item_raster_custom_histogram(): + """Should return a valid item with raster properties.""" + src_path = os.path.join(PREFIX, "dataset_cog.tif") + item = create_stac_item( + src_path, + input_datetime=input_date, + with_raster=True, + raster_max_size=128, + histogram_bins=5, + histogram_range=[0, 10], + ) + assert item.validate() + item_dict = item.to_dict() + stats = item_dict["assets"]["asset"]["raster:bands"][0]["histogram"] + assert len(stats["buckets"]) == 5 + assert stats["max"] == 10 + assert stats["min"] == 0 + + def test_stats_with_nan_missing_nodata(): """Stats should ignore nan/inf values.