From d4daf41f9a3ea932a4144ad455d6185065b644e1 Mon Sep 17 00:00:00 2001 From: mkuehbach Date: Tue, 17 Mar 2026 14:01:03 +0100 Subject: [PATCH 1/5] carried over from NXapm run-through --- pyproject.toml | 2 ++ src/pynxtools/dataconverter/chunk.py | 17 +++++++++++++-- src/pynxtools/dataconverter/writer.py | 31 ++++++++++++++++++++++----- 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9bfe2ec05..0051592c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,8 @@ dependencies = [ "click>=7.1.2", "click_default_group", "h5py>=3.6.0", + "hdf5plugin", # making explicit what in the past came implicit through pandas + "blosc2", # making explicit what in the past came implicit through pandas "xarray>=0.20.2", "PyYAML>=6.0", 'numpy', diff --git a/src/pynxtools/dataconverter/chunk.py b/src/pynxtools/dataconverter/chunk.py index c9d150a76..a9b3ab1d8 100644 --- a/src/pynxtools/dataconverter/chunk.py +++ b/src/pynxtools/dataconverter/chunk.py @@ -22,11 +22,25 @@ import numpy as np +import importlib.util + # HDF5 data storage layout for HDF5 datasets is "contiguous" unless # one wraps the payload for a dataconverter template into a dictionary with # keyword "compress", causing chunked layout to be used -COMPRESSION_FILTERS: list[str] = ["gzip"] # deflate +PYNX_ENABLE_BLOSC: bool = False # deactivated by default +# use only when it is acceptable to work with blosc2-compressed content downstreams +# mind that doing so in C/C++, Matlab, and Fortran application requires specific linking of these apps +# consider that using blosc sets explicit a certain number of cores eligible for doing compression and decompression +# work that may drain resources when pynxtools is used in conjunction with other apps and services like NOMAD +# check the set_nthreads in writer.py to modify according to your best practice + +if PYNX_ENABLE_BLOSC and importlib.util.find_spec("hdf5plugin") is not None and importlib.util.find_spec("blosc2") is 
not None: + COMPRESSION_FILTERS: list[str] = ["gzip"] +else: + COMPRESSION_FILTERS: list[str] = ["gzip", "blosc"] +# order matters! 0th entry always taken as the default for backwards compatibility +# "gzip" -> deflate, "blosc" -> "zstd"] COMPRESSION_STRENGTH: int = 9 # integer values from 0 (effectively no), 1, ..., to at most 9 (strongest compression) # using strongest compression is space efficient but can take substantially longer than @@ -76,7 +90,6 @@ CHUNK_CONFIG_DEFAULT = CHUNK_CONFIG_HFIVEPY - logger = logging.getLogger("pynxtools") diff --git a/src/pynxtools/dataconverter/writer.py b/src/pynxtools/dataconverter/writer.py index e487972ff..ade19d1e9 100644 --- a/src/pynxtools/dataconverter/writer.py +++ b/src/pynxtools/dataconverter/writer.py @@ -20,7 +20,9 @@ # pylint: disable=R0912 import copy +import importlib.util import logging +import os import sys import xml.etree.ElementTree as ET @@ -36,9 +38,18 @@ ) logger = logging.getLogger("pynxtools") # pylint: disable=C0103 -from pynxtools.dataconverter.chunk import COMPRESSION_FILTERS, COMPRESSION_STRENGTH - - +from pynxtools.dataconverter.chunk import PYNX_ENABLE_BLOSC, COMPRESSION_FILTERS, COMPRESSION_STRENGTH + +if PYNX_ENABLE_BLOSC and importlib.util.find_spec("hdf5plugin") is not None and importlib.util.find_spec("blosc2") is not None: + import blosc2 + import hdf5plugin + + NTHREADS_BLOSC = blosc2.set_nthreads(max(int(os.cpu_count() / 2), 1)) + # do not oversubscribe to use hyperthreading cores + logger.info(f"blosc2 is configured to use {blosc2.nthreads} threads on host with {blosc2.ncores} cores") + logger.info(blosc2.print_versions()) +else: + NTHREADS_BLOSC = 0 def does_path_exist(path, h5py_obj) -> bool: """Returns true if the requested path exists in the given h5py object.""" try: @@ -153,7 +164,10 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path, append): elif "compress" in data.keys(): if not (isinstance(data["compress"], str) or np.isscalar(data["compress"])): if ("filter" 
in data.keys()) and (data["filter"] in COMPRESSION_FILTERS): - compression_filter = data["filter"] + if PYNX_ENABLE_BLOSC and data["filter"] == "blosc": + compression_filter = data["filter"] + else: + compression_filter = COMPRESSION_FILTERS[0] else: # fall-back to default compression_filter = COMPRESSION_FILTERS[0] @@ -168,12 +182,19 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path, append): if entry_name not in grp: try: + if compression_filter == "gzip": + compression_config = dict( + compression=compression_filter, + compression_opts=compression_strength, + ) + else: # by virtue of construction blosc + compression_config = hdf5plugin.Blosc2(cname="zstd", clevel=9) + # compression_config supplies both compression and compression_opts grp.create_dataset( entry_name, data=data["compress"], - compression=compression_filter, chunks=chunking_strategy(data), - compression_opts=compression_strength, + **compression_config, ) except ValueError: logger.warning(f"ValueError caught upon creating_dataset {path}") From 32439fa8c83b77e96f0f6406b71e067834edbac5 Mon Sep 17 00:00:00 2001 From: mkuehbach Date: Tue, 17 Mar 2026 14:27:39 +0100 Subject: [PATCH 2/5] linting --- .cspell/custom-dictionary.txt | 7 +++++++ src/pynxtools/dataconverter/chunk.py | 16 ++++++++++------ src/pynxtools/dataconverter/writer.py | 18 +++++++++++++++--- tests/dataconverter/test_nexus_tree.py | 10 +++++----- tests/nexus/test_nexus.py | 2 +- 5 files changed, 38 insertions(+), 15 deletions(-) diff --git a/.cspell/custom-dictionary.txt b/.cspell/custom-dictionary.txt index 7657931d7..f0a8cd20c 100644 --- a/.cspell/custom-dictionary.txt +++ b/.cspell/custom-dictionary.txt @@ -40,6 +40,7 @@ Márquez NFDI NIAC NINF +NTHREADS NXDL Namefits Namefitting @@ -47,6 +48,7 @@ OPCPA ORCID PATHCONV PDBX +PYNX PYNXTOOLS Pielsticker Pincelli @@ -88,6 +90,7 @@ blosc bulletpoint callspec caplog +clevel cnxvalidate complexfloating dataarrays @@ -95,6 +98,7 @@ dataconverter datamodel defaultdicts docstrings +downstreams edgeitems ekey electronanalyzer @@ -122,6 +126,7 @@ 
hdfgroup hdfobject hfive hixu +hyperthreading hypothes idname idobj @@ -159,6 +164,7 @@ mynxdl namefit namefitting nbytes +ncores ndarray ndataconverter ndims @@ -169,6 +175,7 @@ nodemixin nonvariadic nslots nsmap +nthreads nxcollection nxdata nxdl diff --git a/src/pynxtools/dataconverter/chunk.py b/src/pynxtools/dataconverter/chunk.py index a9b3ab1d8..1e05c5339 100644 --- a/src/pynxtools/dataconverter/chunk.py +++ b/src/pynxtools/dataconverter/chunk.py @@ -18,12 +18,11 @@ """Configuration and utilities for customized chunking and compression.""" +import importlib.util import logging import numpy as np -import importlib.util - # HDF5 data storage layout for HDF5 datasets is "contiguous" unless # one wraps the payload for a dataconverter template into a dictionary with # keyword "compress", causing chunked layout to be used @@ -35,10 +34,15 @@ # work that may drain resources when pynxtools is used in conjunction with other apps and services like NOMAD # check the set_nthreads in writer.py to modify according to your best practice -if PYNX_ENABLE_BLOSC and importlib.util.find_spec("hdf5plugin") is not None and importlib.util.find_spec("blosc2") is not None: - COMPRESSION_FILTERS: list[str] = ["gzip"] -else: - COMPRESSION_FILTERS: list[str] = ["gzip", "blosc"] +COMPRESSION_FILTERS: list[str] = ( + ["gzip"] + if ( + PYNX_ENABLE_BLOSC + and importlib.util.find_spec("hdf5plugin") is not None + and importlib.util.find_spec("blosc2") is not None + ) + else ["gzip", "blosc"] +) # order matters! 
0th entry always taken as the default for backwards compatibility # "gzip" -> deflate, "blosc" -> "zstd"] COMPRESSION_STRENGTH: int = 9 diff --git a/src/pynxtools/dataconverter/writer.py b/src/pynxtools/dataconverter/writer.py index ade19d1e9..d0c8c92cc 100644 --- a/src/pynxtools/dataconverter/writer.py +++ b/src/pynxtools/dataconverter/writer.py @@ -38,18 +38,30 @@ ) logger = logging.getLogger("pynxtools") # pylint: disable=C0103 -from pynxtools.dataconverter.chunk import PYNX_ENABLE_BLOSC, COMPRESSION_FILTERS, COMPRESSION_STRENGTH +from pynxtools.dataconverter.chunk import ( + COMPRESSION_FILTERS, + COMPRESSION_STRENGTH, + PYNX_ENABLE_BLOSC, +) -if PYNX_ENABLE_BLOSC and importlib.util.find_spec("hdf5plugin") is not None and importlib.util.find_spec("blosc2") is not None: +if ( + PYNX_ENABLE_BLOSC + and importlib.util.find_spec("hdf5plugin") is not None + and importlib.util.find_spec("blosc2") is not None +): import blosc2 import hdf5plugin NTHREADS_BLOSC = blosc2.set_nthreads(max(int(os.cpu_count() / 2), 1)) # do not oversubscribe to use hyperthreading cores - logger.info(f"blosc2 is configured to use {blosc2.nthreads} threads on host with {blosc2.ncores} cores") + logger.info( + f"blosc2 is configured to use {blosc2.nthreads} threads on host with {blosc2.ncores} cores" + ) logger.info(blosc2.print_versions()) else: NTHREADS_BLOSC = 0 + + def does_path_exist(path, h5py_obj) -> bool: """Returns true if the requested path exists in the given h5py object.""" try: diff --git a/tests/dataconverter/test_nexus_tree.py b/tests/dataconverter/test_nexus_tree.py index 1fc01a082..452cb7ea2 100644 --- a/tests/dataconverter/test_nexus_tree.py +++ b/tests/dataconverter/test_nexus_tree.py @@ -1,6 +1,11 @@ from typing import Any, get_args from anytree import Resolver +from pynxtools.definitions.dev_tools.utils.nxdl_utils import ( + get_app_defs_names, + get_nx_attribute_type, + get_nx_units, +) from pynxtools.dataconverter.nexus_tree import ( NexusNode, @@ -8,11 +13,6 @@ 
NexusUnitCategory, generate_tree_from, ) -from pynxtools.definitions.dev_tools.utils.nxdl_utils import ( - get_app_defs_names, - get_nx_attribute_type, - get_nx_units, -) def test_parsing_of_all_appdefs(): diff --git a/tests/nexus/test_nexus.py b/tests/nexus/test_nexus.py index 8d7dad59e..f833c3275 100644 --- a/tests/nexus/test_nexus.py +++ b/tests/nexus/test_nexus.py @@ -24,7 +24,6 @@ import lxml.etree as ET import numpy as np import pytest - from pynxtools.definitions.dev_tools.utils.nxdl_utils import ( get_inherited_nodes, get_node_at_nxdl_path, @@ -32,6 +31,7 @@ get_nx_classes, get_nx_units, ) + from pynxtools.nexus.nexus import HandleNexus, decode_if_string logger = logging.getLogger(__name__) From 9d14e6c73a29435cd775d5b0e50ddbd42b55e804 Mon Sep 17 00:00:00 2001 From: mkuehbach Date: Tue, 17 Mar 2026 14:31:06 +0100 Subject: [PATCH 3/5] invert logic --- src/pynxtools/dataconverter/chunk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pynxtools/dataconverter/chunk.py b/src/pynxtools/dataconverter/chunk.py index 1e05c5339..f9fec3237 100644 --- a/src/pynxtools/dataconverter/chunk.py +++ b/src/pynxtools/dataconverter/chunk.py @@ -35,13 +35,13 @@ # check the set_nthreads in writer.py to modify according to your best practice COMPRESSION_FILTERS: list[str] = ( - ["gzip"] + ["gzip", "blosc"] if ( PYNX_ENABLE_BLOSC and importlib.util.find_spec("hdf5plugin") is not None and importlib.util.find_spec("blosc2") is not None ) - else ["gzip", "blosc"] + else ["gzip"] ) # order matters! 
0th entry always taken as the default for backwards compatibility # "gzip" -> deflate, "blosc" -> "zstd"] From 4b4d629a31d48c7931f8e5f1374b0e36bf00463f Mon Sep 17 00:00:00 2001 From: mkuehbach Date: Tue, 17 Mar 2026 14:39:59 +0100 Subject: [PATCH 4/5] linting --- .pre-commit-config.yaml | 12 ++++++------ src/pynxtools/dataconverter/chunk.py | 13 +++++++------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f5547c122..88bfe2e54 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.15.0 + rev: v0.15.6 hooks: - id: ruff-check files: ^(src/pynxtools|tests)/.*\.py$ @@ -15,18 +15,18 @@ repos: - types-PyYAML - repo: https://github.com/asottile/pyupgrade - rev: v3.21.1 + rev: v3.21.2 hooks: - id: pyupgrade - args: [--py36-plus] # modernizes syntax for Python 3.6+ + args: [--py310-plus] # modernizes syntax for Python 3.10+ - repo: https://github.com/kynan/nbstripout - rev: 0.9.0 + rev: 0.9.1 hooks: - id: nbstripout # removes notebook outputs before committing - repo: https://github.com/streetsidesoftware/cspell-cli - rev: v9.6.0 + rev: v9.7.0 hooks: - id: cspell # spellchecking pass_filenames: false @@ -37,4 +37,4 @@ repos: - CITATION.cff - docs/**/* - src/pynxtools/**/*.py - - tests/**/*.py \ No newline at end of file + - tests/**/*.py diff --git a/src/pynxtools/dataconverter/chunk.py b/src/pynxtools/dataconverter/chunk.py index f9fec3237..40569e2d2 100644 --- a/src/pynxtools/dataconverter/chunk.py +++ b/src/pynxtools/dataconverter/chunk.py @@ -29,10 +29,12 @@ PYNX_ENABLE_BLOSC: bool = False # deactivated by default # use only when it is acceptable to work with blosc2-compressed content downstreams -# mind that doing so in C/C++, Matlab, and Fortran application requires specific linking of these apps -# consider that using blosc sets explicit a certain number of cores eligible for doing compression and 
decompression -# work that may drain resources when pynxtools is used in conjunction with other apps and services like NOMAD -# check the set_nthreads in writer.py to modify according to your best practice +# mind that doing so in C/C++, Matlab, and Fortran application requires specific +# linking of these apps with a customized HDF5 library that links to the blosc library +# consider that using blosc sets explicit a certain number of cores eligible for +# doing compression and decompression work that may drain resources when pynxtools +# is used in conjunction with other apps and services like NOMAD +# check the set_nthreads in writer.py to modify accordingly for your best practice COMPRESSION_FILTERS: list[str] = ( ["gzip", "blosc"] @@ -47,8 +49,7 @@ # "gzip" -> deflate, "blosc" -> "zstd"] COMPRESSION_STRENGTH: int = 9 # integer values from 0 (effectively no), 1, ..., to at most 9 (strongest compression) -# using strongest compression is space efficient but can take substantially longer than -# using 1 +# using strongest compression is space efficient but takes substantially longer than 1 # compressed payload is served as a dict with at least one keyword "compress", # "strength" is optional keyword for that dictionary to overwrite the default From b43a060c46d8aed924f40bc822e74528834b68db Mon Sep 17 00:00:00 2001 From: mkuehbach Date: Tue, 17 Mar 2026 14:59:14 +0100 Subject: [PATCH 5/5] lint testing --- pyproject.toml | 3 ++- tests/dataconverter/test_nexus_tree.py | 10 +++++----- tests/nexus/test_nexus.py | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0051592c3..a322832fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -165,6 +165,7 @@ ignore = [ ] fixable = ["ALL"] isort.split-on-trailing-comma = false +isort.known-first-party = ["pynxtools"] [tool.ruff.format] quote-style = "double" @@ -183,4 +184,4 @@ exclude = ["src/pynxtools/definitions/*"] [tool.uv] extra-index-url = [ 
"https://gitlab.mpcdf.mpg.de/api/v4/projects/2187/packages/pypi/simple", -] \ No newline at end of file +] diff --git a/tests/dataconverter/test_nexus_tree.py b/tests/dataconverter/test_nexus_tree.py index 452cb7ea2..1fc01a082 100644 --- a/tests/dataconverter/test_nexus_tree.py +++ b/tests/dataconverter/test_nexus_tree.py @@ -1,11 +1,6 @@ from typing import Any, get_args from anytree import Resolver -from pynxtools.definitions.dev_tools.utils.nxdl_utils import ( - get_app_defs_names, - get_nx_attribute_type, - get_nx_units, -) from pynxtools.dataconverter.nexus_tree import ( NexusNode, @@ -13,6 +8,11 @@ NexusUnitCategory, generate_tree_from, ) +from pynxtools.definitions.dev_tools.utils.nxdl_utils import ( + get_app_defs_names, + get_nx_attribute_type, + get_nx_units, +) def test_parsing_of_all_appdefs(): diff --git a/tests/nexus/test_nexus.py b/tests/nexus/test_nexus.py index f833c3275..8d7dad59e 100644 --- a/tests/nexus/test_nexus.py +++ b/tests/nexus/test_nexus.py @@ -24,6 +24,7 @@ import lxml.etree as ET import numpy as np import pytest + from pynxtools.definitions.dev_tools.utils.nxdl_utils import ( get_inherited_nodes, get_node_at_nxdl_path, @@ -31,7 +32,6 @@ get_nx_classes, get_nx_units, ) - from pynxtools.nexus.nexus import HandleNexus, decode_if_string logger = logging.getLogger(__name__)