From d4daf41f9a3ea932a4144ad455d6185065b644e1 Mon Sep 17 00:00:00 2001
From: mkuehbach <markus.kuehbach@physik.hu-berlin.de>
Date: Tue, 17 Mar 2026 14:01:03 +0100
Subject: [PATCH 1/5] carried over from NXapm run-through

---
 pyproject.toml                        |  2 ++
 src/pynxtools/dataconverter/chunk.py  | 17 +++++++++++++--
 src/pynxtools/dataconverter/writer.py | 31 ++++++++++++++++++++++-----
 3 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9bfe2ec05..0051592c3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,8 @@ dependencies = [
     "click>=7.1.2",
     "click_default_group",
     "h5py>=3.6.0",
+    "hdf5plugin",  # making explicit what in the past came implicit through pandas
+    "blosc2",  # making explicit what in the past came implicit through pandas
     "xarray>=0.20.2",
     "PyYAML>=6.0",
     'numpy',
diff --git a/src/pynxtools/dataconverter/chunk.py b/src/pynxtools/dataconverter/chunk.py
index c9d150a76..a9b3ab1d8 100644
--- a/src/pynxtools/dataconverter/chunk.py
+++ b/src/pynxtools/dataconverter/chunk.py
@@ -22,11 +22,25 @@
 
 import numpy as np
 
+import importlib.util
+
 # HDF5 data storage layout for HDF5 datasets is "contiguous" unless
 # one wraps the payload for a dataconverter template into a dictionary with
 # keyword "compress", causing chunked layout to be used
 
-COMPRESSION_FILTERS: list[str] = ["gzip"]  # deflate
+PYNX_ENABLE_BLOSC: bool = False  # deactivated by default
+# use only when it is acceptable to work with blosc2-compressed content downstreams
+# mind that doing so in C/C++, Matlab, and Fortran application requires specific linking of these apps
+# consider that using blosc sets explicit a certain number of cores eligible for doing compression and decompression
+# work that may drain resources when pynxtools is used in conjunction with other apps and services like NOMAD
+# check the set_nthreads in writer.py to modify according to your best practice
+
+if PYNX_ENABLE_BLOSC and importlib.util.find_spec("hdf5plugin") is not None and importlib.util.find_spec("blosc2") is not None:
+    COMPRESSION_FILTERS: list[str] = ["gzip"]
+else:
+    COMPRESSION_FILTERS: list[str] = ["gzip", "blosc"]
+# order matters! 0th entry always taken as the default for backwards compatibility
+# "gzip" -> deflate, "blosc" -> "zstd"]
 COMPRESSION_STRENGTH: int = 9
 # integer values from 0 (effectively no), 1, ..., to at most 9 (strongest compression)
 # using strongest compression is space efficient but can take substantially longer than
@@ -76,7 +90,6 @@
 
 CHUNK_CONFIG_DEFAULT = CHUNK_CONFIG_HFIVEPY
 
-
 logger = logging.getLogger("pynxtools")
 
 
diff --git a/src/pynxtools/dataconverter/writer.py b/src/pynxtools/dataconverter/writer.py
index e487972ff..ade19d1e9 100644
--- a/src/pynxtools/dataconverter/writer.py
+++ b/src/pynxtools/dataconverter/writer.py
@@ -20,7 +20,9 @@
 # pylint: disable=R0912
 
 import copy
+import importlib.util
 import logging
+import os
 import sys
 import xml.etree.ElementTree as ET
 
@@ -36,9 +38,18 @@
 )
 
 logger = logging.getLogger("pynxtools")  # pylint: disable=C0103
-from pynxtools.dataconverter.chunk import COMPRESSION_FILTERS, COMPRESSION_STRENGTH
-
-
+from pynxtools.dataconverter.chunk import PYNX_ENABLE_BLOSC, COMPRESSION_FILTERS, COMPRESSION_STRENGTH
+
+if PYNX_ENABLE_BLOSC and importlib.util.find_spec("hdf5plugin") is not None and importlib.util.find_spec("blosc2") is not None:
+    import blosc2
+    import hdf5plugin
+
+    NTHREADS_BLOSC = blosc2.set_nthreads(max(int(os.cpu_count() / 2), 1))
+    # do not oversubscribe to use hyperthreading cores
+    logger.info(f"blosc2 is configured to use {blosc2.nthreads} threads on host with {blosc2.ncores} cores")
+    logger.info(blosc2.print_versions())
+else:
+    NTHREADS_BLOSC = 0
 def does_path_exist(path, h5py_obj) -> bool:
     """Returns true if the requested path exists in the given h5py object."""
     try:
@@ -153,7 +164,10 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path, append):
     elif "compress" in data.keys():
         if not (isinstance(data["compress"], str) or np.isscalar(data["compress"])):
             if ("filter" in data.keys()) and (data["filter"] in COMPRESSION_FILTERS):
-                compression_filter = data["filter"]
+                if PYNX_ENABLE_BLOSC and data["filter"] == "blosc":
+                    compression_filter = data["filter"]
+                else:
+                    compression_filter = COMPRESSION_FILTERS[0]
             else:  # fall-back to default
                 compression_filter = COMPRESSION_FILTERS[0]
 
@@ -168,12 +182,20 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path, append):
 
             if entry_name not in grp:
                 try:
+                    # compression and compression_opts are passed exclusively via
+                    # compression_config, repeating the compression keyword raises
+                    if compression_filter == "gzip":
+                        compression_config = dict(
+                            compression=compression_filter,
+                            compression_opts=compression_strength,
+                        )
+                    else:  # by virtue of construction blosc
+                        compression_config = hdf5plugin.Blosc2(cname="zstd", clevel=9)
                     grp.create_dataset(
                         entry_name,
                         data=data["compress"],
-                        compression=compression_filter,
                         chunks=chunking_strategy(data),
-                        compression_opts=compression_strength,
+                        **compression_config,
                     )
                 except ValueError:
                     logger.warning(f"ValueError caught upon creating_dataset {path}")

From 32439fa8c83b77e96f0f6406b71e067834edbac5 Mon Sep 17 00:00:00 2001
From: mkuehbach <markus.kuehbach@physik.hu-berlin.de>
Date: Tue, 17 Mar 2026 14:27:39 +0100
Subject: [PATCH 2/5] linting

---
 .cspell/custom-dictionary.txt          |  7 +++++++
 src/pynxtools/dataconverter/chunk.py   | 16 ++++++++++------
 src/pynxtools/dataconverter/writer.py  | 18 +++++++++++++++---
 tests/dataconverter/test_nexus_tree.py | 10 +++++-----
 tests/nexus/test_nexus.py              |  2 +-
 5 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/.cspell/custom-dictionary.txt b/.cspell/custom-dictionary.txt
index 7657931d7..f0a8cd20c 100644
--- a/.cspell/custom-dictionary.txt
+++ b/.cspell/custom-dictionary.txt
@@ -40,6 +40,7 @@ Márquez
 NFDI
 NIAC
 NINF
+NTHREADS
 NXDL
 Namefits
 Namefitting
@@ -47,6 +48,7 @@ OPCPA
 ORCID
 PATHCONV
 PDBX
+PYNX
 PYNXTOOLS
 Pielsticker
 Pincelli
@@ -88,6 +90,7 @@ blosc
 bulletpoint
 callspec
 caplog
+clevel
 cnxvalidate
 complexfloating
 dataarrays
@@ -95,6 +98,7 @@ dataconverter
 datamodel
 defaultdicts
 docstrings
+downstreams
 edgeitems
 ekey
 electronanalyzer
@@ -122,6 +126,7 @@ hdfgroup
 hdfobject
 hfive
 hixu
+hyperthreading
 hypothes
 idname
 idobj
@@ -159,6 +164,7 @@ mynxdl
 namefit
 namefitting
 nbytes
+ncores
 ndarray
 ndataconverter
 ndims
@@ -169,6 +175,7 @@ nodemixin
 nonvariadic
 nslots
 nsmap
+nthreads
 nxcollection
 nxdata
 nxdl
diff --git a/src/pynxtools/dataconverter/chunk.py b/src/pynxtools/dataconverter/chunk.py
index a9b3ab1d8..1e05c5339 100644
--- a/src/pynxtools/dataconverter/chunk.py
+++ b/src/pynxtools/dataconverter/chunk.py
@@ -18,12 +18,11 @@
 
 """Configuration and utilities for customized chunking and compression."""
 
+import importlib.util
 import logging
 
 import numpy as np
 
-import importlib.util
-
 # HDF5 data storage layout for HDF5 datasets is "contiguous" unless
 # one wraps the payload for a dataconverter template into a dictionary with
 # keyword "compress", causing chunked layout to be used
@@ -35,10 +34,15 @@
 # work that may drain resources when pynxtools is used in conjunction with other apps and services like NOMAD
 # check the set_nthreads in writer.py to modify according to your best practice
 
-if PYNX_ENABLE_BLOSC and importlib.util.find_spec("hdf5plugin") is not None and importlib.util.find_spec("blosc2") is not None:
-    COMPRESSION_FILTERS: list[str] = ["gzip"]
-else:
-    COMPRESSION_FILTERS: list[str] = ["gzip", "blosc"]
+COMPRESSION_FILTERS: list[str] = (
+    ["gzip"]
+    if (
+        PYNX_ENABLE_BLOSC
+        and importlib.util.find_spec("hdf5plugin") is not None
+        and importlib.util.find_spec("blosc2") is not None
+    )
+    else ["gzip", "blosc"]
+)
 # order matters! 0th entry always taken as the default for backwards compatibility
 # "gzip" -> deflate, "blosc" -> "zstd"]
 COMPRESSION_STRENGTH: int = 9
diff --git a/src/pynxtools/dataconverter/writer.py b/src/pynxtools/dataconverter/writer.py
index ade19d1e9..d0c8c92cc 100644
--- a/src/pynxtools/dataconverter/writer.py
+++ b/src/pynxtools/dataconverter/writer.py
@@ -38,18 +38,32 @@
 )
 
 logger = logging.getLogger("pynxtools")  # pylint: disable=C0103
-from pynxtools.dataconverter.chunk import PYNX_ENABLE_BLOSC, COMPRESSION_FILTERS, COMPRESSION_STRENGTH
+from pynxtools.dataconverter.chunk import (
+    COMPRESSION_FILTERS,
+    COMPRESSION_STRENGTH,
+    PYNX_ENABLE_BLOSC,
+)
 
-if PYNX_ENABLE_BLOSC and importlib.util.find_spec("hdf5plugin") is not None and importlib.util.find_spec("blosc2") is not None:
+if (
+    PYNX_ENABLE_BLOSC
+    and importlib.util.find_spec("hdf5plugin") is not None
+    and importlib.util.find_spec("blosc2") is not None
+):
     import blosc2
     import hdf5plugin
 
-    NTHREADS_BLOSC = blosc2.set_nthreads(max(int(os.cpu_count() / 2), 1))
+    # os.cpu_count() returns None if the number of cores is undetermined,
+    # fall back to a single thread in that case instead of raising a TypeError
+    NTHREADS_BLOSC = blosc2.set_nthreads(max((os.cpu_count() or 1) // 2, 1))
     # do not oversubscribe to use hyperthreading cores
-    logger.info(f"blosc2 is configured to use {blosc2.nthreads} threads on host with {blosc2.ncores} cores")
+    logger.info(
+        f"blosc2 is configured to use {blosc2.nthreads} threads on host with {blosc2.ncores} cores"
+    )
     logger.info(blosc2.print_versions())
 else:
     NTHREADS_BLOSC = 0
+
+
 def does_path_exist(path, h5py_obj) -> bool:
     """Returns true if the requested path exists in the given h5py object."""
     try:
diff --git a/tests/dataconverter/test_nexus_tree.py b/tests/dataconverter/test_nexus_tree.py
index 1fc01a082..452cb7ea2 100644
--- a/tests/dataconverter/test_nexus_tree.py
+++ b/tests/dataconverter/test_nexus_tree.py
@@ -1,6 +1,11 @@
 from typing import Any, get_args
 
 from anytree import Resolver
+from pynxtools.definitions.dev_tools.utils.nxdl_utils import (
+    get_app_defs_names,
+    get_nx_attribute_type,
+    get_nx_units,
+)
 
 from pynxtools.dataconverter.nexus_tree import (
     NexusNode,
@@ -8,11 +13,6 @@
     NexusUnitCategory,
     generate_tree_from,
 )
-from pynxtools.definitions.dev_tools.utils.nxdl_utils import (
-    get_app_defs_names,
-    get_nx_attribute_type,
-    get_nx_units,
-)
 
 
 def test_parsing_of_all_appdefs():
diff --git a/tests/nexus/test_nexus.py b/tests/nexus/test_nexus.py
index 8d7dad59e..f833c3275 100644
--- a/tests/nexus/test_nexus.py
+++ b/tests/nexus/test_nexus.py
@@ -24,7 +24,6 @@
 import lxml.etree as ET
 import numpy as np
 import pytest
-
 from pynxtools.definitions.dev_tools.utils.nxdl_utils import (
     get_inherited_nodes,
     get_node_at_nxdl_path,
@@ -32,6 +31,7 @@
     get_nx_classes,
     get_nx_units,
 )
+
 from pynxtools.nexus.nexus import HandleNexus, decode_if_string
 
 logger = logging.getLogger(__name__)

From 9d14e6c73a29435cd775d5b0e50ddbd42b55e804 Mon Sep 17 00:00:00 2001
From: mkuehbach <markus.kuehbach@physik.hu-berlin.de>
Date: Tue, 17 Mar 2026 14:31:06 +0100
Subject: [PATCH 3/5] invert logic

---
 src/pynxtools/dataconverter/chunk.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pynxtools/dataconverter/chunk.py b/src/pynxtools/dataconverter/chunk.py
index 1e05c5339..f9fec3237 100644
--- a/src/pynxtools/dataconverter/chunk.py
+++ b/src/pynxtools/dataconverter/chunk.py
@@ -35,13 +35,13 @@
 # check the set_nthreads in writer.py to modify according to your best practice
 
 COMPRESSION_FILTERS: list[str] = (
-    ["gzip"]
+    ["gzip", "blosc"]
     if (
         PYNX_ENABLE_BLOSC
         and importlib.util.find_spec("hdf5plugin") is not None
         and importlib.util.find_spec("blosc2") is not None
     )
-    else ["gzip", "blosc"]
+    else ["gzip"]
 )
 # order matters! 0th entry always taken as the default for backwards compatibility
 # "gzip" -> deflate, "blosc" -> "zstd"]

From 4b4d629a31d48c7931f8e5f1374b0e36bf00463f Mon Sep 17 00:00:00 2001
From: mkuehbach <markus.kuehbach@physik.hu-berlin.de>
Date: Tue, 17 Mar 2026 14:39:59 +0100
Subject: [PATCH 4/5] linting

---
 .pre-commit-config.yaml              | 12 ++++++------
 src/pynxtools/dataconverter/chunk.py | 13 +++++++------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f5547c122..88bfe2e54 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.15.0
+    rev: v0.15.6
     hooks:
       - id: ruff-check
         files: ^(src/pynxtools|tests)/.*\.py$
@@ -15,18 +15,18 @@ repos:
           - types-PyYAML
 
   - repo: https://github.com/asottile/pyupgrade
-    rev: v3.21.1
+    rev: v3.21.2
     hooks:
       - id: pyupgrade
-        args: [--py36-plus]  # modernizes syntax for Python 3.6+
+        args: [--py310-plus]  # modernizes syntax for Python 3.10+
 
   - repo: https://github.com/kynan/nbstripout
-    rev: 0.9.0
+    rev: 0.9.1
     hooks:
       - id: nbstripout  # removes notebook outputs before committing
 
   - repo: https://github.com/streetsidesoftware/cspell-cli
-    rev: v9.6.0
+    rev: v9.7.0
     hooks:
       - id: cspell  # spellchecking
         pass_filenames: false
@@ -37,4 +37,4 @@ repos:
           - CITATION.cff
           - docs/**/*
           - src/pynxtools/**/*.py
-          - tests/**/*.py
\ No newline at end of file
+          - tests/**/*.py
diff --git a/src/pynxtools/dataconverter/chunk.py b/src/pynxtools/dataconverter/chunk.py
index f9fec3237..40569e2d2 100644
--- a/src/pynxtools/dataconverter/chunk.py
+++ b/src/pynxtools/dataconverter/chunk.py
@@ -29,10 +29,12 @@
 
 PYNX_ENABLE_BLOSC: bool = False  # deactivated by default
 # use only when it is acceptable to work with blosc2-compressed content downstreams
-# mind that doing so in C/C++, Matlab, and Fortran application requires specific linking of these apps
-# consider that using blosc sets explicit a certain number of cores eligible for doing compression and decompression
-# work that may drain resources when pynxtools is used in conjunction with other apps and services like NOMAD
-# check the set_nthreads in writer.py to modify according to your best practice
+# mind that reading such content from C/C++, Matlab, and Fortran applications requires
+# linking these apps with a customized HDF5 library that links to the blosc library
+# consider that using blosc explicitly sets a certain number of cores eligible for
+# doing compression and decompression work, which may drain resources when pynxtools
+# is used in conjunction with other apps and services like NOMAD
+# adjust the set_nthreads call in writer.py according to your best practice
 
 COMPRESSION_FILTERS: list[str] = (
     ["gzip", "blosc"]
@@ -47,8 +49,7 @@
 # "gzip" -> deflate, "blosc" -> "zstd"]
 COMPRESSION_STRENGTH: int = 9
 # integer values from 0 (effectively no), 1, ..., to at most 9 (strongest compression)
-# using strongest compression is space efficient but can take substantially longer than
-# using 1
+# using strongest compression is space efficient but takes substantially longer than 1
 
 # compressed payload is served as a dict with at least one keyword "compress",
 # "strength" is optional keyword for that dictionary to overwrite the default

From b43a060c46d8aed924f40bc822e74528834b68db Mon Sep 17 00:00:00 2001
From: mkuehbach <markus.kuehbach@physik.hu-berlin.de>
Date: Tue, 17 Mar 2026 14:59:14 +0100
Subject: [PATCH 5/5] lint testing

---
 pyproject.toml                         |  3 ++-
 tests/dataconverter/test_nexus_tree.py | 10 +++++-----
 tests/nexus/test_nexus.py              |  2 +-
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0051592c3..a322832fc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -165,6 +165,7 @@ ignore = [
 ]
 fixable = ["ALL"]
 isort.split-on-trailing-comma = false
+isort.known-first-party = ["pynxtools"]
 
 [tool.ruff.format]
 quote-style = "double"
@@ -183,4 +184,4 @@ exclude = ["src/pynxtools/definitions/*"]
 [tool.uv]
 extra-index-url = [
 "https://gitlab.mpcdf.mpg.de/api/v4/projects/2187/packages/pypi/simple",
-]
\ No newline at end of file
+]
diff --git a/tests/dataconverter/test_nexus_tree.py b/tests/dataconverter/test_nexus_tree.py
index 452cb7ea2..1fc01a082 100644
--- a/tests/dataconverter/test_nexus_tree.py
+++ b/tests/dataconverter/test_nexus_tree.py
@@ -1,11 +1,6 @@
 from typing import Any, get_args
 
 from anytree import Resolver
-from pynxtools.definitions.dev_tools.utils.nxdl_utils import (
-    get_app_defs_names,
-    get_nx_attribute_type,
-    get_nx_units,
-)
 
 from pynxtools.dataconverter.nexus_tree import (
     NexusNode,
@@ -13,6 +8,11 @@
     NexusUnitCategory,
     generate_tree_from,
 )
+from pynxtools.definitions.dev_tools.utils.nxdl_utils import (
+    get_app_defs_names,
+    get_nx_attribute_type,
+    get_nx_units,
+)
 
 
 def test_parsing_of_all_appdefs():
diff --git a/tests/nexus/test_nexus.py b/tests/nexus/test_nexus.py
index f833c3275..8d7dad59e 100644
--- a/tests/nexus/test_nexus.py
+++ b/tests/nexus/test_nexus.py
@@ -24,6 +24,7 @@
 import lxml.etree as ET
 import numpy as np
 import pytest
+
 from pynxtools.definitions.dev_tools.utils.nxdl_utils import (
     get_inherited_nodes,
     get_node_at_nxdl_path,
@@ -31,7 +32,6 @@
     get_nx_classes,
     get_nx_units,
 )
-
 from pynxtools.nexus.nexus import HandleNexus, decode_if_string
 
 logger = logging.getLogger(__name__)