From 679a57adbafbf886b4ad5fe618e1de730dbf663a Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 23 Mar 2026 15:57:25 +0100 Subject: [PATCH 1/5] Refactor get_dimensions function in IDS tensorization More readable code, and less repetition: ```python # before self.ncmeta.get_dimensions(..., self.homogeneous_time) # after self.get_dimensions(...) ``` --- imas/backends/netcdf/ids2nc.py | 9 ++++----- imas/backends/netcdf/ids_tensorizer.py | 25 ++++++++++++++---------- imas/backends/netcdf/nc2ids.py | 27 +++++++++++++------------- 3 files changed, 32 insertions(+), 29 deletions(-) diff --git a/imas/backends/netcdf/ids2nc.py b/imas/backends/netcdf/ids2nc.py index 531c7ac2..b970279c 100644 --- a/imas/backends/netcdf/ids2nc.py +++ b/imas/backends/netcdf/ids2nc.py @@ -47,7 +47,6 @@ def create_dimensions(self) -> None: def create_variables(self) -> None: """Create netCDF variables.""" - get_dimensions = self.ncmeta.get_dimensions for path in self.filled_data: metadata = self.ids.metadata[path] var_name = path.replace("/", ".") @@ -75,7 +74,7 @@ def create_variables(self) -> None: if dtype is not dtypes[IDSDataType.CPX]: # Set fillvalue kwargs.update(fill_value=default_fillvals[metadata.data_type]) # Create variable - dimensions = get_dimensions(path, self.homogeneous_time) + dimensions = self.get_dimensions(path) var = self.group.createVariable(var_name, dtype, dimensions, **kwargs) # Fill metadata attributes @@ -108,9 +107,9 @@ def create_variables(self) -> None: var.sparse = f"Sparse data, data shapes are stored in {shape_name}" # Create variable to store data shape - dimensions = get_dimensions( - self.ncmeta.aos.get(path), self.homogeneous_time - ) + (f"{metadata.ndim}D",) + dimensions = self.get_dimensions(self.ncmeta.aos.get(path)) + ( + f"{metadata.ndim}D", + ) shape_var = self.group.createVariable( shape_name, SHAPE_DTYPE, diff --git a/imas/backends/netcdf/ids_tensorizer.py b/imas/backends/netcdf/ids_tensorizer.py index 7e9e33ec..daac49c2 100644 --- 
a/imas/backends/netcdf/ids_tensorizer.py +++ b/imas/backends/netcdf/ids_tensorizer.py @@ -3,7 +3,7 @@ """Tensorization logic to convert IDSs to netCDF files and/or xarray Datasets.""" from collections import deque -from typing import List +from typing import List, Tuple import numpy @@ -47,13 +47,21 @@ def __init__(self, ids: IDSToplevel, paths_to_tensorize: List[str]) -> None: """Map of IDS paths to filled data nodes.""" self.filled_variables = set() """Set of filled IDS variables""" - self.homogeneous_time = ( + self.homogeneous_time = bool( ids.ids_properties.homogeneous_time == IDS_TIME_MODE_HOMOGENEOUS ) """True iff the IDS time mode is homogeneous.""" self.shapes = {} """Map of IDS paths to data shape arrays.""" + def get_dimensions(self, path: str) -> Tuple[str, ...]: + """Get the dimensions for a netCDF variable. + + Args: + path: Data Dictionary path to the variable, e.g. ``ids_properties/comment``. + """ + return self.ncmeta.get_dimensions(path, self.homogeneous_time) + def include_coordinate_paths(self) -> None: """Append all paths that are coordinates of self.paths_to_tensorize""" # Use a queue so we can also take coordinates of coordinates into account @@ -62,7 +70,7 @@ def include_coordinate_paths(self) -> None: for path in self.paths_to_tensorize: while path: path, _, _ = path.rpartition("/") - if self.ncmeta.get_dimensions(path, self.homogeneous_time): + if self.get_dimensions(path): queue.append(path) self.paths_to_tensorize = [] @@ -82,7 +90,6 @@ def collect_filled_data(self) -> None: # Initialize dictionary with all paths that could exist in this IDS filled_data = {path: {} for path in self.ncmeta.paths} dimension_size = {} - get_dimensions = self.ncmeta.get_dimensions if self.paths_to_tensorize: # Restrict tensorization to provided paths @@ -102,7 +109,7 @@ def collect_filled_data(self) -> None: ndim = node.metadata.ndim if not ndim: continue - dimensions = get_dimensions(path, self.homogeneous_time) + dimensions = self.get_dimensions(path) # 
We're only interested in the non-tensorized dimensions: [-ndim:] for dim_name, size in zip(dimensions[-ndim:], node.shape): dimension_size[dim_name] = max(dimension_size.get(dim_name, 0), size) @@ -115,15 +122,13 @@ def collect_filled_data(self) -> None: def determine_data_shapes(self) -> None: """Determine tensorized data shapes and sparsity, save in :attr:`shapes`.""" - get_dimensions = self.ncmeta.get_dimensions - for path, nodes_dict in self.filled_data.items(): metadata = self.ids.metadata[path] # Structures don't have a size if metadata.data_type is IDSDataType.STRUCTURE: continue ndim = metadata.ndim - dimensions = get_dimensions(path, self.homogeneous_time) + dimensions = self.get_dimensions(path) # node shape if it is completely filled full_shape = tuple(self.dimension_size[dim] for dim in dimensions[-ndim:]) @@ -137,7 +142,7 @@ def determine_data_shapes(self) -> None: else: # Data is tensorized, determine if it is homogeneously shaped - aos_dims = get_dimensions(self.ncmeta.aos[path], self.homogeneous_time) + aos_dims = self.get_dimensions(self.ncmeta.aos[path]) shapes_shape = [self.dimension_size[dim] for dim in aos_dims] if ndim: shapes_shape.append(ndim) @@ -180,7 +185,7 @@ def tensorize(self, path, fillvalue): Returns: A tensor filled with the data from the specified path. 
""" - dimensions = self.ncmeta.get_dimensions(path, self.homogeneous_time) + dimensions = self.get_dimensions(path) shape = tuple(self.dimension_size[dim] for dim in dimensions) # TODO: depending on the data, tmp_var may be HUGE, we may need a more diff --git a/imas/backends/netcdf/nc2ids.py b/imas/backends/netcdf/nc2ids.py index 564d5210..5688d5aa 100644 --- a/imas/backends/netcdf/nc2ids.py +++ b/imas/backends/netcdf/nc2ids.py @@ -1,6 +1,6 @@ import logging import os -from typing import Optional +from typing import Optional, Tuple import netCDF4 import numpy as np @@ -80,6 +80,14 @@ def __init__( ) self.homogeneous_time = var[()] == IDS_TIME_MODE_HOMOGENEOUS + def get_dimensions(self, path: str) -> Tuple[str, ...]: + """Get the dimensions for a netCDF variable. + + Args: + path: Data Dictionary path to the variable, e.g. ``ids_properties/comment``. + """ + return self.ncmeta.get_dimensions(path, self.homogeneous_time) + def run(self, lazy: bool) -> None: """Load the data from the netCDF group into the IDS.""" self.variables.sort() @@ -130,9 +138,7 @@ def run(self, lazy: bool) -> None: else: # FIXME: extract dimension name from nc file? 
- dim = self.ncmeta.get_dimensions( - metadata.path_string, self.homogeneous_time - )[-1] + dim = self.get_dimensions(metadata.path_string)[-1] size = self.group.dimensions[dim].size for _, node in indexed_tree_iter(self.ids, target_metadata): node.resize(size) @@ -235,9 +241,7 @@ def _validate_variable(self, var: netCDF4.Variable, metadata: IDSMetadata) -> No raise variable_error(var, "data type", var.dtype, expected_dtype) # Dimensions - expected_dims = self.ncmeta.get_dimensions( - metadata.path_string, self.homogeneous_time - ) + expected_dims = self.get_dimensions(metadata.path_string) if var.dimensions != expected_dims: raise variable_error(var, "dimensions", var.dimensions, expected_dims) @@ -298,9 +302,7 @@ def _validate_sparsity( return # Sparsity is stored with _Fillvalue, nothing to validate # Dimensions - aos_dimensions = self.ncmeta.get_dimensions( - self.ncmeta.aos.get(metadata.path_string), self.homogeneous_time - ) + aos_dimensions = self.get_dimensions(self.ncmeta.aos.get(metadata.path_string)) shape_dimensions = shape_var.dimensions if ( len(shape_dimensions) != len(aos_dimensions) + 1 @@ -331,7 +333,6 @@ def get_child(self, child): Args: child: The child IDS node which should be lazy loaded. - """ metadata = child.metadata path = metadata.path_string @@ -347,9 +348,7 @@ def get_child(self, child): size = nc2ids.group[var.name + ":shape"][self.index][0] else: # FIXME: extract dimension name from nc file? 
- dim = nc2ids.ncmeta.get_dimensions( - metadata.path_string, nc2ids.homogeneous_time - )[-1] + dim = nc2ids.get_dimensions(metadata.path_string)[-1] size = nc2ids.group.dimensions[dim].size child._set_lazy_context(LazyArrayStructContext(nc2ids, self.index, size)) From 2accf646f9151a94d64de34e2c5378b0cc1cd540 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 23 Mar 2026 16:04:16 +0100 Subject: [PATCH 2/5] Include metadata variables for (arrays of) structures in `to_xarray` This allows reading back data in the following scenario: ```python ids = imas.IDSFactory().core_profiles() ... # fill IDS xrds = imas.util.to_xarray(ids) xrds.to_netcdf("data.nc", group="core_profiles/0") with imas.DBEntry("data.nc", "r") as entry: ids2 = entry.get("core_profiles") ``` N.B. `to_xarray()` doesn't include metadata for inhomogeneously sized arrays of structures, so the two IDSs in above example could be different. --- imas/_to_xarray.py | 21 +++++++++++++-------- imas/test/test_to_xarray.py | 4 +++- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/imas/_to_xarray.py b/imas/_to_xarray.py index 13525c82..9deed01a 100644 --- a/imas/_to_xarray.py +++ b/imas/_to_xarray.py @@ -50,18 +50,23 @@ def to_xarray(ids: IDSToplevel, *paths: str) -> xarray.Dataset: var_name = path.replace("/", ".") metadata = ids.metadata[path] if metadata.data_type in (IDSDataType.STRUCTURE, IDSDataType.STRUCT_ARRAY): - continue # We don't store these in xarray - - dimensions = tensorizer.ncmeta.get_dimensions(path, tensorizer.homogeneous_time) - data = tensorizer.tensorize(path, fillvals[metadata.data_type]) + # Metadata variables for (arrays of) structures + if paths and path not in paths: + continue + dimensions = () + data = "" + else: + dimensions = tensorizer.get_dimensions(path) + data = tensorizer.tensorize(path, fillvals[metadata.data_type]) attrs = dict(documentation=metadata.documentation) if metadata.units: attrs["units"] = metadata.units - coordinates = 
tensorizer.filter_coordinates(path) - if coordinates: - coordinate_names.update(coordinates.split(" ")) - attrs["coordinates"] = coordinates + if dimensions: + coordinates = tensorizer.filter_coordinates(path) + if coordinates: + coordinate_names.update(coordinates.split(" ")) + attrs["coordinates"] = coordinates data_vars[var_name] = (dimensions, data, attrs) diff --git a/imas/test/test_to_xarray.py b/imas/test/test_to_xarray.py index a5df6a1e..a1ff8bdc 100644 --- a/imas/test/test_to_xarray.py +++ b/imas/test/test_to_xarray.py @@ -87,7 +87,9 @@ def test_to_xarray(): ids.profiles_1d[0].time = 0.0 # These should all be identical: - ds1 = to_xarray(ids) + ds1 = to_xarray(ids).drop_vars( + ["profiles_1d", "profiles_1d.electrons", "profiles_1d.grid"] + ) ds2 = to_xarray(ids, "profiles_1d.electrons.temperature") ds3 = to_xarray(ids, "profiles_1d/electrons/temperature") assert ds1.equals(ds2) From 71639932d36f8d721c7954cb3eb814faeaf0a9a1 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 27 Mar 2026 13:58:30 +0100 Subject: [PATCH 3/5] Add shape information in to_xarray - Centralize attribute metadata logic - Include :shape arrays in imas.util.to_xarray - Test that netCDF files written with xarray can be read as DBEntry --- imas/_to_xarray.py | 24 +++++------ imas/backends/netcdf/ids2nc.py | 55 ++++++-------------------- imas/backends/netcdf/ids_tensorizer.py | 55 +++++++++++++++++++++++++- imas/test/test_to_xarray.py | 23 ++++++++++- 4 files changed, 100 insertions(+), 57 deletions(-) diff --git a/imas/_to_xarray.py b/imas/_to_xarray.py index 9deed01a..979b126f 100644 --- a/imas/_to_xarray.py +++ b/imas/_to_xarray.py @@ -8,7 +8,7 @@ from imas.ids_data_type import IDSDataType fillvals = { - IDSDataType.INT: -(2**31) + 1, + IDSDataType.INT: numpy.int32(-(2**31) + 1), IDSDataType.STR: "", IDSDataType.FLT: numpy.nan, IDSDataType.CPX: numpy.nan * (1 + 1j), @@ -54,22 +54,24 @@ def to_xarray(ids: IDSToplevel, *paths: str) -> xarray.Dataset: if paths and path not in 
paths: continue dimensions = () - data = "" + data = b"" else: dimensions = tensorizer.get_dimensions(path) data = tensorizer.tensorize(path, fillvals[metadata.data_type]) - attrs = dict(documentation=metadata.documentation) - if metadata.units: - attrs["units"] = metadata.units - if dimensions: - coordinates = tensorizer.filter_coordinates(path) - if coordinates: - coordinate_names.update(coordinates.split(" ")) - attrs["coordinates"] = coordinates - + attrs = tensorizer.get_attributes(path, fillvals) + if "coordinates" in attrs: + coordinate_names.update(attrs["coordinates"].split(" ")) data_vars[var_name] = (dimensions, data, attrs) + # :shape array for sparse data + if path in tensorizer.shapes and metadata.ndim: + shape_name = f"{var_name}:shape" + dimensions = tensorizer.get_shape_dimensions(path) + data = tensorizer.shapes[path] + attrs = tensorizer.get_shape_attributes(var_name) + data_vars[shape_name] = (dimensions, data, attrs) + # Remove coordinates from data_vars and put in coordinates mapping: coordinates = {} for coordinate_name in coordinate_names: diff --git a/imas/backends/netcdf/ids2nc.py b/imas/backends/netcdf/ids2nc.py index b970279c..a01cc6ea 100644 --- a/imas/backends/netcdf/ids2nc.py +++ b/imas/backends/netcdf/ids2nc.py @@ -78,50 +78,17 @@ def create_variables(self) -> None: var = self.group.createVariable(var_name, dtype, dimensions, **kwargs) # Fill metadata attributes - var.documentation = metadata.documentation - if metadata.units: - var.units = metadata.units - - ancillary_variables = " ".join( - error_var - for error_var in [f"{var_name}_error_upper", f"{var_name}_error_lower"] - if error_var in self.filled_variables - ) - if ancillary_variables: - var.ancillary_variables = ancillary_variables - - if metadata.data_type is not IDSDataType.STRUCT_ARRAY: - coordinates = self.filter_coordinates(path) - if coordinates: - var.coordinates = coordinates - - # Sparsity and :shape array - if path in self.shapes: - if not metadata.ndim: - # 
Doesn't need a :shape array: - var.sparse = "Sparse data, missing data is filled with _FillValue" - var.sparse += f" ({default_fillvals[metadata.data_type]})" - - else: - shape_name = f"{var_name}:shape" - var.sparse = f"Sparse data, data shapes are stored in {shape_name}" - - # Create variable to store data shape - dimensions = self.get_dimensions(self.ncmeta.aos.get(path)) + ( - f"{metadata.ndim}D", - ) - shape_var = self.group.createVariable( - shape_name, - SHAPE_DTYPE, - dimensions, - ) - doc_indices = ",".join(chr(ord("i") + i) for i in range(3)) - shape_var.documentation = ( - f"Shape information for {var_name}.\n" - f"{shape_name}[{doc_indices},:] describes the shape of filled " - f"data of {var_name}[{doc_indices},...]. Data outside this " - "shape is unset (i.e. filled with _Fillvalue)." - ) + var.setncatts(self.get_attributes(path, default_fillvals)) + + # :shape array for sparse data + if path in self.shapes and metadata.ndim: + shape_name = f"{var_name}:shape" + # Create variable to store data shape + dimensions = self.get_shape_dimensions(path) + shape_var = self.group.createVariable( + shape_name, SHAPE_DTYPE, dimensions + ) + shape_var.setncatts(self.get_shape_attributes(var_name)) def store_data(self) -> None: """Store data in the netCDF variables""" diff --git a/imas/backends/netcdf/ids_tensorizer.py b/imas/backends/netcdf/ids_tensorizer.py index daac49c2..08bc05fe 100644 --- a/imas/backends/netcdf/ids_tensorizer.py +++ b/imas/backends/netcdf/ids_tensorizer.py @@ -3,7 +3,7 @@ """Tensorization logic to convert IDSs to netCDF files and/or xarray Datasets.""" from collections import deque -from typing import List, Tuple +from typing import List, Tuple, Dict import numpy @@ -62,6 +62,11 @@ def get_dimensions(self, path: str) -> Tuple[str, ...]: """ return self.ncmeta.get_dimensions(path, self.homogeneous_time) + def get_shape_dimensions(self, path: str) -> Tuple[str, ...]: + """Get dimensions names for shape array of the tensorized variable""" + ndim 
= self.ids.metadata[path].ndim + return self.get_dimensions(self.ncmeta.aos.get(path, "")) + (f"{ndim}D",) + def include_coordinate_paths(self) -> None: """Append all paths that are coordinates of self.paths_to_tensorize""" # Use a queue so we can also take coordinates of coordinates into account @@ -173,6 +178,54 @@ def filter_coordinates(self, path: str) -> str: if coordinate in self.filled_variables ) + def get_attributes(self, path: str, fillvals: dict) -> Dict[str, str]: + """Get metadata attributes of the tensorized variable""" + metadata = self.ids.metadata[path] + var_name = path.replace("/", ".") + + assert metadata.documentation is not None + attrs = {"documentation": metadata.documentation} + if metadata.units: + attrs["units"] = metadata.units + + ancillary_variables = " ".join( + error_var + for error_var in [f"{var_name}_error_upper", f"{var_name}_error_lower"] + if error_var in self.filled_variables + ) + if ancillary_variables: + attrs["ancillary_variables"] = ancillary_variables + + if metadata.data_type is not IDSDataType.STRUCT_ARRAY: + coordinates = self.filter_coordinates(path) + if coordinates: + attrs["coordinates"] = coordinates + + # Sparsity + if path in self.shapes: + if not metadata.ndim: + # Doesn't need a :shape array + attrs["sparse"] = ( + "Sparse data, missing data is filled with _FillValue" + f" ({fillvals[metadata.data_type]})" + ) + else: + attrs["sparse"] = ( + f"Sparse data, data shapes are stored in {var_name}:shape" + ) + + return attrs + + def get_shape_attributes(self, var_name: str) -> Dict[str, str]: + doc_indices = ",".join(chr(ord("i") + i) for i in range(3)) + documentation = ( + f"Shape information for {var_name}.\n" + f"{var_name}:shape[{doc_indices},:] describes the shape of filled " + f"data of {var_name}[{doc_indices},...]. Data outside this " + "shape is unset (i.e. filled with _Fillvalue)." 
+ ) + return {"documentation": documentation} + def tensorize(self, path, fillvalue): """ Tensorizes the data at the given path with the specified fill value. diff --git a/imas/test/test_to_xarray.py b/imas/test/test_to_xarray.py index a1ff8bdc..1ef6f184 100644 --- a/imas/test/test_to_xarray.py +++ b/imas/test/test_to_xarray.py @@ -1,11 +1,13 @@ import numpy as np +import netCDF4 import pytest import imas import imas.training +from imas.test.test_helpers import compare_children from imas.util import to_xarray -pytest.importorskip("xarray") +xarray = pytest.importorskip("xarray") @pytest.fixture @@ -94,3 +96,22 @@ def test_to_xarray(): ds3 = to_xarray(ids, "profiles_1d/electrons/temperature") assert ds1.equals(ds2) assert ds2.equals(ds3) + + +@pytest.mark.parametrize("idsname", ["core_profiles", "equilibrium"]) +def test_roundtrip_xarray_netcdf(tmp_path, entry, idsname): + ids = entry.get(idsname) + xrds = to_xarray(ids) + fname = f"{tmp_path}/test-{idsname}-xarray.nc" + # First write mandatory file-level metadata + with netCDF4.Dataset(fname, "x") as ds: + ds.data_dictionary_version = imas.util.get_data_dictionary_version(ids) + # Then use xarray to write the IDS + xrds.to_netcdf(fname, "a", format="NETCDF4", group=f"{idsname}/0") + # And read it back with a DBEntry + with imas.DBEntry(fname, "r") as entry: + ids2 = entry.get(idsname) + compare_children(ids, ids2) + # Reading the netCDF file with xarray should produce an identical dataset + ncxrds = xarray.load_dataset(fname, group=f"{idsname}/0") + assert xrds.equals(ncxrds) From f33c2a6e9bdf4db07769b5fe8d621813354749c2 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 27 Mar 2026 14:17:01 +0100 Subject: [PATCH 4/5] Add missing docstring --- imas/backends/netcdf/ids_tensorizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/imas/backends/netcdf/ids_tensorizer.py b/imas/backends/netcdf/ids_tensorizer.py index 08bc05fe..a4019c9c 100644 --- a/imas/backends/netcdf/ids_tensorizer.py +++ 
b/imas/backends/netcdf/ids_tensorizer.py
@@ -217,6 +217,7 @@ def get_attributes(self, path: str, fillvals: dict) -> Dict[str, str]:
         return attrs
 
     def get_shape_attributes(self, var_name: str) -> Dict[str, str]:
+        """Get attributes of the :shape variable corresponding to var_name"""
         doc_indices = ",".join(chr(ord("i") + i) for i in range(3))
         documentation = (
             f"Shape information for {var_name}.\n"

From b5ddaaa54118affb672ae950e04aa5e99b5e6e2b Mon Sep 17 00:00:00 2001
From: Maarten Sebregts
Date: Thu, 2 Apr 2026 09:41:16 +0200
Subject: [PATCH 5/5] [docs] Store xarray dataset in IMAS netCDF file

---
 docs/source/netcdf.rst | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/docs/source/netcdf.rst b/docs/source/netcdf.rst
index 868ae429..f772144e 100644
--- a/docs/source/netcdf.rst
+++ b/docs/source/netcdf.rst
@@ -183,3 +183,39 @@ specific paths inside the IDS. The latter variant can also be combined with
 #  profiles_1d.grid.rho_tor
 #  profiles_1d.grid.rho_tor_norm
 #  profiles_1d.grid.psi
+
+
+Store Xarray Datasets in IMAS-compatible netCDF file
+''''''''''''''''''''''''''''''''''''''''''''''''''''
+
+.. versionadded:: 2.3.0 :py:meth:`~imas.util.to_xarray` now includes the required
+   metadata to load the IDS from a netCDF file.
+
+The following snippet shows how to store an IMAS Xarray dataset in an IMAS-compatible
+netCDF file. The group name in the netCDF file must correspond to ``<ids_name>/<occurrence>``
+(``core_profiles/0`` in the snippet).
+
+.. code-block:: python
+   :caption: Store IMAS Xarray dataset in an IMAS-compatible netCDF file
+
+   import imas.training
+   import netCDF4
+
+   with imas.training.get_training_db_entry() as training_entry:
+       core_profiles = training_entry.get("core_profiles")
+   xrds = imas.util.to_xarray(core_profiles)
+
+   # Store the xarray dataset in an IMAS-compatible netCDF dataset
+   filename = "data.nc"
+   xrds.to_netcdf(
+       filename,
+       group="core_profiles/0",  # Update to the correct IDS name and occurrence
+       # auto_complex=True,  # Uncomment if the dataset contains complex data
+   )
+   # Set global DD version metadata
+   with netCDF4.Dataset(filename, "a") as ds:
+       ds.data_dictionary_version = imas.util.get_data_dictionary_version(core_profiles)
+
+   # Test that we can get the IDS from the netCDF file
+   with imas.DBEntry(filename, "r") as entry:
+       ids2 = entry.get("core_profiles")