Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .cspell/custom-dictionary.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,15 @@ Márquez
NFDI
NIAC
NINF
NTHREADS
NXDL
Namefits
Namefitting
OPCPA
ORCID
PATHCONV
PDBX
PYNX
PYNXTOOLS
Pielsticker
Pincelli
Expand Down Expand Up @@ -88,13 +90,15 @@ blosc
bulletpoint
callspec
caplog
clevel
cnxvalidate
complexfloating
dataarrays
dataconverter
datamodel
defaultdicts
docstrings
downstreams
edgeitems
ekey
electronanalyzer
Expand Down Expand Up @@ -122,6 +126,7 @@ hdfgroup
hdfobject
hfive
hixu
hyperthreading
hypothes
idname
idobj
Expand Down Expand Up @@ -159,6 +164,7 @@ mynxdl
namefit
namefitting
nbytes
ncores
ndarray
ndataconverter
ndims
Expand All @@ -169,6 +175,7 @@ nodemixin
nonvariadic
nslots
nsmap
nthreads
nxcollection
nxdata
nxdl
Expand Down
12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.15.0
rev: v0.15.6
hooks:
- id: ruff-check
files: ^(src/pynxtools|tests)/.*\.py$
Expand All @@ -15,18 +15,18 @@ repos:
- types-PyYAML

- repo: https://github.com/asottile/pyupgrade
rev: v3.21.1
rev: v3.21.2
hooks:
- id: pyupgrade
args: [--py36-plus] # modernizes syntax for Python 3.6+
args: [--py310-plus] # modernizes syntax for Python 3.10+

- repo: https://github.com/kynan/nbstripout
rev: 0.9.0
rev: 0.9.1
hooks:
- id: nbstripout # removes notebook outputs before committing

- repo: https://github.com/streetsidesoftware/cspell-cli
rev: v9.6.0
rev: v9.7.0
hooks:
- id: cspell # spellchecking
pass_filenames: false
Expand All @@ -37,4 +37,4 @@ repos:
- CITATION.cff
- docs/**/*
- src/pynxtools/**/*.py
- tests/**/*.py
- tests/**/*.py
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ dependencies = [
"click>=7.1.2",
"click_default_group",
"h5py>=3.6.0",
"hdf5plugin", # making explicit what in the past came implicit through pandas
"blosc2", # making explicit what in the past came implicit through pandas
"xarray>=0.20.2",
"PyYAML>=6.0",
'numpy',
Expand Down Expand Up @@ -163,6 +165,7 @@ ignore = [
]
fixable = ["ALL"]
isort.split-on-trailing-comma = false
isort.known-first-party = ["pynxtools"]

[tool.ruff.format]
quote-style = "double"
Expand All @@ -181,4 +184,4 @@ exclude = ["src/pynxtools/definitions/*"]
[tool.uv]
extra-index-url = [
"https://gitlab.mpcdf.mpg.de/api/v4/projects/2187/packages/pypi/simple",
]
]
26 changes: 22 additions & 4 deletions src/pynxtools/dataconverter/chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

"""Configuration and utilities for customized chunking and compression."""

import importlib.util
import logging

import numpy as np
Expand All @@ -26,11 +27,29 @@
# one wraps the payload for a dataconverter template into a dictionary with
# keyword "compress", causing chunked layout to be used

COMPRESSION_FILTERS: list[str] = ["gzip"] # deflate
PYNX_ENABLE_BLOSC: bool = False # deactivated by default
# use only when it is acceptable to work with blosc2-compressed content downstream
# mind that doing so in C/C++, Matlab, and Fortran applications requires these apps
# to be linked against a customized HDF5 library that itself links to the blosc library
# consider that using blosc explicitly sets a certain number of cores eligible for
# doing compression and decompression work, which may drain resources when pynxtools
# is used in conjunction with other apps and services like NOMAD
# check set_nthreads in writer.py and adjust it to match your best practice
Comment on lines +31 to +37
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Too much of an in code comment. Let's simplify and use some defaults or add this to a more accessible README.md section/link to another .md and/or to the docs.


COMPRESSION_FILTERS: list[str] = (
["gzip", "blosc"]
if (
PYNX_ENABLE_BLOSC
and importlib.util.find_spec("hdf5plugin") is not None
and importlib.util.find_spec("blosc2") is not None
)
else ["gzip"]
)
# order matters! 0th entry always taken as the default for backwards compatibility
# "gzip" -> deflate, "blosc" -> "zstd"]
COMPRESSION_STRENGTH: int = 9
# integer values from 0 (effectively no compression), 1, ..., to at most 9 (strongest)
# using strongest compression is space efficient but can take substantially longer than
# using 1
# using strongest compression is space efficient but takes substantially longer than 1

# compressed payload is served as a dict with at least one keyword "compress",
# "strength" is optional keyword for that dictionary to overwrite the default
Expand Down Expand Up @@ -76,7 +95,6 @@

CHUNK_CONFIG_DEFAULT = CHUNK_CONFIG_HFIVEPY


logger = logging.getLogger("pynxtools")


Expand Down
39 changes: 36 additions & 3 deletions src/pynxtools/dataconverter/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
# pylint: disable=R0912

import copy
import importlib.util
import logging
import os
import sys
import xml.etree.ElementTree as ET

Expand All @@ -36,7 +38,28 @@
)

logger = logging.getLogger("pynxtools") # pylint: disable=C0103
from pynxtools.dataconverter.chunk import COMPRESSION_FILTERS, COMPRESSION_STRENGTH
from pynxtools.dataconverter.chunk import (
COMPRESSION_FILTERS,
COMPRESSION_STRENGTH,
PYNX_ENABLE_BLOSC,
)

if (
PYNX_ENABLE_BLOSC
and importlib.util.find_spec("hdf5plugin") is not None
and importlib.util.find_spec("blosc2") is not None
):
Comment on lines +47 to +51
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either we add these two dependencies to optional in the pyproject.toml then we have a check for them here. Or we keep the pyproject with these as mandatory dependencies and skip the check.

import blosc2
import hdf5plugin

NTHREADS_BLOSC = blosc2.set_nthreads(max(int(os.cpu_count() / 2), 1))
# use at most half the logical CPUs so hyperthreaded sibling cores are not oversubscribed
Comment on lines +55 to +56
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this necessary? Aren't there any sane defaults in the hdf5lib already?

logger.info(
f"blosc2 is configured to use {blosc2.nthreads} threads on host with {blosc2.ncores} cores"
)
logger.info(blosc2.print_versions())
else:
NTHREADS_BLOSC = 0


def does_path_exist(path, h5py_obj) -> bool:
Expand Down Expand Up @@ -153,7 +176,10 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path, append):
elif "compress" in data.keys():
if not (isinstance(data["compress"], str) or np.isscalar(data["compress"])):
if ("filter" in data.keys()) and (data["filter"] in COMPRESSION_FILTERS):
compression_filter = data["filter"]
if PYNX_ENABLE_BLOSC and data["filter"] == "blosc":
compression_filter = data["filter"]
else:
compression_filter = COMPRESSION_FILTERS[0]
else: # fall-back to default
compression_filter = COMPRESSION_FILTERS[0]

Expand All @@ -168,12 +194,19 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path, append):

if entry_name not in grp:
try:
if compression_filter == "gzip":
compression_config = dict(
compression=compression_filter,
compression_opts=compression_strength,
)
else: # by virtue of construction blosc
compression_config = hdf5plugin.Blosc2(cname="zstd", clevel=9)
grp.create_dataset(
entry_name,
data=data["compress"],
compression=compression_filter,
chunks=chunking_strategy(data),
compression_opts=compression_strength,
**compression_config,
Comment on lines +197 to +209
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can have one function that deals with our compression needs instead of if statements here.

)
except ValueError:
logger.warning(f"ValueError caught upon creating_dataset {path}")
Expand Down
Loading