diff --git a/.cspell/custom-dictionary.txt b/.cspell/custom-dictionary.txt index 7657931d7..f0a8cd20c 100644 --- a/.cspell/custom-dictionary.txt +++ b/.cspell/custom-dictionary.txt @@ -40,6 +40,7 @@ Márquez NFDI NIAC NINF +NTHREADS NXDL Namefits Namefitting @@ -47,6 +48,7 @@ OPCPA ORCID PATHCONV PDBX +PYNX PYNXTOOLS Pielsticker Pincelli @@ -88,6 +90,7 @@ blosc bulletpoint callspec caplog +clevel cnxvalidate complexfloating dataarrays @@ -95,6 +98,7 @@ dataconverter datamodel defaultdicts docstrings +downstreams edgeitems ekey electronanalyzer @@ -122,6 +126,7 @@ hdfgroup hdfobject hfive hixu +hyperthreading hypothes idname idobj @@ -159,6 +164,7 @@ mynxdl namefit namefitting nbytes +ncores ndarray ndataconverter ndims @@ -169,6 +175,7 @@ nodemixin nonvariadic nslots nsmap +nthreads nxcollection nxdata nxdl diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f5547c122..88bfe2e54 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.15.0 + rev: v0.15.6 hooks: - id: ruff-check files: ^(src/pynxtools|tests)/.*\.py$ @@ -15,18 +15,18 @@ repos: - types-PyYAML - repo: https://github.com/asottile/pyupgrade - rev: v3.21.1 + rev: v3.21.2 hooks: - id: pyupgrade - args: [--py36-plus] # modernizes syntax for Python 3.6+ + args: [--py310-plus] # modernizes syntax for Python 3.10+ - repo: https://github.com/kynan/nbstripout - rev: 0.9.0 + rev: 0.9.1 hooks: - id: nbstripout # removes notebook outputs before committing - repo: https://github.com/streetsidesoftware/cspell-cli - rev: v9.6.0 + rev: v9.7.0 hooks: - id: cspell # spellchecking pass_filenames: false @@ -37,4 +37,4 @@ repos: - CITATION.cff - docs/**/* - src/pynxtools/**/*.py - - tests/**/*.py \ No newline at end of file + - tests/**/*.py diff --git a/pyproject.toml b/pyproject.toml index 9bfe2ec05..a322832fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,8 @@ dependencies = [ "click>=7.1.2", 
"click_default_group", "h5py>=3.6.0", + "hdf5plugin", # making explicit what in the past came implicit through pandas + "blosc2", # making explicit what in the past came implicit through pandas "xarray>=0.20.2", "PyYAML>=6.0", 'numpy', @@ -163,6 +165,7 @@ ignore = [ ] fixable = ["ALL"] isort.split-on-trailing-comma = false +isort.known-first-party = ["pynxtools"] [tool.ruff.format] quote-style = "double" @@ -181,4 +184,4 @@ exclude = ["src/pynxtools/definitions/*"] [tool.uv] extra-index-url = [ "https://gitlab.mpcdf.mpg.de/api/v4/projects/2187/packages/pypi/simple", -] \ No newline at end of file +] diff --git a/src/pynxtools/dataconverter/chunk.py b/src/pynxtools/dataconverter/chunk.py index c9d150a76..40569e2d2 100644 --- a/src/pynxtools/dataconverter/chunk.py +++ b/src/pynxtools/dataconverter/chunk.py @@ -18,6 +18,7 @@ """Configuration and utilities for customized chunking and compression.""" +import importlib.util import logging import numpy as np @@ -26,11 +27,29 @@ # one wraps the payload for a dataconverter template into a dictionary with # keyword "compress", causing chunked layout to be used -COMPRESSION_FILTERS: list[str] = ["gzip"] # deflate +PYNX_ENABLE_BLOSC: bool = False # deactivated by default +# enable only when it is acceptable to work with blosc2-compressed content downstream +# mind that doing so in C/C++, Matlab, and Fortran applications requires specific +# linking of these apps with a customized HDF5 library that links to the blosc library +# consider that using blosc explicitly sets the number of threads eligible for +# compression and decompression work, which may drain resources when pynxtools +# is used in conjunction with other apps and services like NOMAD +# see the set_nthreads call in writer.py and adjust it to your best practice + +COMPRESSION_FILTERS: list[str] = ( + ["gzip", "blosc"] + if ( + PYNX_ENABLE_BLOSC + and importlib.util.find_spec("hdf5plugin") is not None + and importlib.util.find_spec("blosc2") is not None + )
+ else ["gzip"] +) +# order matters! 0th entry always taken as the default for backwards compatibility +# "gzip" -> deflate, "blosc" -> "zstd" COMPRESSION_STRENGTH: int = 9 # integer values from 0 (effectively no), 1, ..., to at most 9 (strongest compression) -# using strongest compression is space efficient but can take substantially longer than -# using 1 +# using strongest compression is space efficient but takes substantially longer than 1 # compressed payload is served as a dict with at least one keyword "compress", # "strength" is optional keyword for that dictionary to overwrite the default @@ -76,7 +95,6 @@ CHUNK_CONFIG_DEFAULT = CHUNK_CONFIG_HFIVEPY - logger = logging.getLogger("pynxtools") diff --git a/src/pynxtools/dataconverter/writer.py b/src/pynxtools/dataconverter/writer.py index e487972ff..d0c8c92cc 100644 --- a/src/pynxtools/dataconverter/writer.py +++ b/src/pynxtools/dataconverter/writer.py @@ -20,7 +20,9 @@ # pylint: disable=R0912 import copy +import importlib.util import logging +import os import sys import xml.etree.ElementTree as ET @@ -36,7 +38,28 @@ ) logger = logging.getLogger("pynxtools") # pylint: disable=C0103 -from pynxtools.dataconverter.chunk import COMPRESSION_FILTERS, COMPRESSION_STRENGTH +from pynxtools.dataconverter.chunk import ( + COMPRESSION_FILTERS, + COMPRESSION_STRENGTH, + PYNX_ENABLE_BLOSC, +) + +if ( + PYNX_ENABLE_BLOSC + and importlib.util.find_spec("hdf5plugin") is not None + and importlib.util.find_spec("blosc2") is not None +): + import blosc2 + import hdf5plugin + + NTHREADS_BLOSC = blosc2.set_nthreads(max((os.cpu_count() or 1) // 2, 1)) + # half of the logical cores, so hyperthreading cores are not oversubscribed; os.cpu_count() may be None; NOTE(review): set_nthreads returns the previous thread count - confirm that is what NTHREADS_BLOSC should hold + logger.info( + f"blosc2 is configured to use {blosc2.nthreads} threads on host with {blosc2.ncores} cores" + ) + blosc2.print_versions() # prints by itself and returns None, so its result is not passed to the logger +else: + NTHREADS_BLOSC = 0 def does_path_exist(path, h5py_obj) -> bool: @@ -153,7 +176,10 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path, append): elif
"compress" in data.keys(): if not (isinstance(data["compress"], str) or np.isscalar(data["compress"])): if ("filter" in data.keys()) and (data["filter"] in COMPRESSION_FILTERS): - compression_filter = data["filter"] + if PYNX_ENABLE_BLOSC and data["filter"] == "blosc": + compression_filter = data["filter"] + else: + compression_filter = COMPRESSION_FILTERS[0] else: # fall-back to default compression_filter = COMPRESSION_FILTERS[0] @@ -168,12 +194,18 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path, append): if entry_name not in grp: try: + if compression_filter == "gzip": + compression_config = dict( + compression=compression_filter, + compression_opts=compression_strength, + ) + else: # by virtue of construction blosc + compression_config = hdf5plugin.Blosc2(cname="zstd", clevel=9) grp.create_dataset( entry_name, data=data["compress"], - compression=compression_filter, chunks=chunking_strategy(data), - compression_opts=compression_strength, + **compression_config, # provides both compression and compression_opts; also passing compression= explicitly raises TypeError ) except ValueError: logger.warning(f"ValueError caught upon creating_dataset {path}")