Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@

## 0.11.2 (2025-09-09)

* fix statistics for Raster with Nan/Inf values and missing Nodata
* add `histogram_bins` and `histogram_range` options for histogram configuration
* add fallback for histogram calculation when data is homogenous

## 0.11.1 (2025-08-28)

* avoid serialization issue for statistics
Expand Down
61 changes: 47 additions & 14 deletions rio_stac/stac.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import os
import warnings
from contextlib import ExitStack
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Dict, List, Optional, Sequence, Tuple, Union

import numpy
import pystac
Expand Down Expand Up @@ -156,13 +156,16 @@ def get_eobands_info(
return eo_bands


def _get_stats(arr: numpy.ma.MaskedArray, **kwargs: Any) -> Dict:
def _get_stats(
arr: numpy.ma.MaskedArray,
bins: Union[int, str, Sequence] = 10,
range: Optional[Tuple[float, float]] = None,
) -> Dict:
"""Calculate array statistics."""
# Avoid non masked nan/inf values
arr = numpy.ma.fix_invalid(arr, copy=True)

sample, edges = numpy.histogram(arr[~arr.mask])
return {
stats = {
"statistics": {
"mean": arr.mean().item(),
"minimum": arr.min().item(),
Expand All @@ -171,19 +174,38 @@ def _get_stats(arr: numpy.ma.MaskedArray, **kwargs: Any) -> Dict:
"valid_percent": float(
numpy.count_nonzero(~arr.mask) / float(arr.data.size) * 100
),
},
"histogram": {
"count": len(edges),
"min": float(edges.min()),
"max": float(edges.max()),
"buckets": sample.tolist(),
},
}
}

try:
sample, edges = numpy.histogram(arr[~arr.mask], bins=bins, range=range)

except ValueError as e:
if "Too many bins for data range." in str(e):
_, counts = numpy.unique(arr[~arr.mask], return_counts=True)
warnings.warn(
f"Could not calculate the histogram, fall back to automatic bin={len(counts) + 1}.",
UserWarning,
)
sample, edges = numpy.histogram(arr[~arr.mask], bins=len(counts) + 1)
else:
raise e

stats["histogram"] = {
"count": len(edges),
"min": float(edges.min()),
"max": float(edges.max()),
"buckets": sample.tolist(),
}

return stats


def get_raster_info( # noqa: C901
src_dst: Union[DatasetReader, DatasetWriter, WarpedVRT, MemoryFile],
max_size: int = 1024,
histogram_bins: Union[int, str, Sequence] = 10,
histogram_range: Optional[Tuple[float, float]] = None,
) -> List[Dict]:
"""Get raster metadata.

Expand Down Expand Up @@ -231,7 +253,11 @@ def get_raster_info( # noqa: C901
value["unit"] = src_dst.units[band - 1]

value.update(
_get_stats(src_dst.read(indexes=band, out_shape=(height, width), masked=True))
_get_stats(
src_dst.read(indexes=band, out_shape=(height, width), masked=True),
bins=histogram_bins,
range=histogram_range,
)
)
meta.append(value)

Expand Down Expand Up @@ -272,7 +298,7 @@ def get_media_type(
elif driver == "PNG":
return pystac.MediaType.PNG

warnings.warn("Could not determine the media type from GDAL driver.")
warnings.warn("Could not determine the media type from GDAL driver.", UserWarning)
return None


Expand All @@ -296,6 +322,8 @@ def create_stac_item(
geom_densify_pts: int = 0,
geom_precision: int = -1,
geographic_crs: rasterio.crs.CRS = EPSG_4326,
histogram_bins: Union[int, str, Sequence] = 10,
histogram_range: Optional[Tuple[float, float]] = None,
) -> pystac.Item:
"""Create a Stac Item.

Expand Down Expand Up @@ -393,7 +421,12 @@ def create_stac_item(
)

raster_info = {
"raster:bands": get_raster_info(dataset, max_size=raster_max_size)
"raster:bands": get_raster_info(
dataset,
max_size=raster_max_size,
histogram_bins=histogram_bins,
histogram_range=histogram_range,
)
}

eo_info: Dict[str, List] = {}
Expand Down
Binary file not shown.
51 changes: 51 additions & 0 deletions tests/test_create_item.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import datetime
import json
import os
import sys
import warnings

import numpy
import pystac
Expand Down Expand Up @@ -359,6 +361,55 @@ def test_json_serialization():
assert json.dumps(item_dict)


@pytest.mark.xfail(sys.version_info < (3, 10), reason="Old numpy do not raise error")
def test_stats_unique_values():
"""issue 68 -
ref: https://github.com/developmentseed/rio-stac/issues/68
"""
src_path = os.path.join(
PREFIX, "S2A_MSIL2A_20220722T105631_N0400_R094_T31TCN_20220722T171159.tif"
)
with pytest.warns(UserWarning):
item = create_stac_item(src_path, with_raster=True)
assert item.validate()
item_dict = item.to_dict()
assert "raster:bands" in item_dict["assets"]["asset"]
stats = item_dict["assets"]["asset"]["raster:bands"][9]["histogram"]
assert len(stats["buckets"]) == 3

# Should not raise warnings when bins is good
with warnings.catch_warnings():
warnings.simplefilter("error")
item = create_stac_item(src_path, with_raster=True, histogram_bins=3)

with pytest.raises(ValueError):
create_stac_item(
src_path, with_raster=True, histogram_bins=3, histogram_range=(0, -1)
)

with pytest.raises(ValueError):
create_stac_item(src_path, with_raster=True, histogram_range=(0, -1))


def test_create_item_raster_custom_histogram():
"""Should return a valid item with raster properties."""
src_path = os.path.join(PREFIX, "dataset_cog.tif")
item = create_stac_item(
src_path,
input_datetime=input_date,
with_raster=True,
raster_max_size=128,
histogram_bins=5,
histogram_range=[0, 10],
)
assert item.validate()
item_dict = item.to_dict()
stats = item_dict["assets"]["asset"]["raster:bands"][0]["histogram"]
assert len(stats["buckets"]) == 5
assert stats["max"] == 10
assert stats["min"] == 0


def test_stats_with_nan_missing_nodata():
"""Stats should ignore nan/inf values.

Expand Down