Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .cspell/custom-dictionary.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,15 @@ Márquez
NFDI
NIAC
NINF
NTHREADS
NXDL
Namefits
Namefitting
OPCPA
ORCID
PATHCONV
PDBX
PYNX
PYNXTOOLS
Pielsticker
Pincelli
Expand Down Expand Up @@ -88,13 +90,15 @@ blosc
bulletpoint
callspec
caplog
clevel
cnxvalidate
complexfloating
dataarrays
dataconverter
datamodel
defaultdicts
docstrings
downstreams
edgeitems
ekey
electronanalyzer
Expand Down Expand Up @@ -122,6 +126,7 @@ hdfgroup
hdfobject
hfive
hixu
hyperthreading
hypothes
idname
idobj
Expand Down Expand Up @@ -159,6 +164,7 @@ mynxdl
namefit
namefitting
nbytes
ncores
ndarray
ndataconverter
ndims
Expand All @@ -169,6 +175,7 @@ nodemixin
nonvariadic
nslots
nsmap
nthreads
nxcollection
nxdata
nxdl
Expand Down
12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.15.0
rev: v0.15.6
hooks:
- id: ruff-check
files: ^(src/pynxtools|tests)/.*\.py$
Expand All @@ -15,18 +15,18 @@ repos:
- types-PyYAML

- repo: https://github.com/asottile/pyupgrade
rev: v3.21.1
rev: v3.21.2
hooks:
- id: pyupgrade
args: [--py36-plus] # modernizes syntax for Python 3.6+
args: [--py310-plus] # modernizes syntax for Python 3.10+

- repo: https://github.com/kynan/nbstripout
rev: 0.9.0
rev: 0.9.1
hooks:
- id: nbstripout # removes notebook outputs before committing

- repo: https://github.com/streetsidesoftware/cspell-cli
rev: v9.6.0
rev: v9.7.0
hooks:
- id: cspell # spellchecking
pass_filenames: false
Expand All @@ -37,4 +37,4 @@ repos:
- CITATION.cff
- docs/**/*
- src/pynxtools/**/*.py
- tests/**/*.py
- tests/**/*.py
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ dependencies = [
"click>=7.1.2",
"click_default_group",
"h5py>=3.6.0",
"hdf5plugin", # making explicit what in the past came implicit through pandas
"blosc2", # making explicit what in the past came implicit through pandas
"xarray>=0.20.2",
"PyYAML>=6.0",
'numpy',
Expand Down Expand Up @@ -163,6 +165,7 @@ ignore = [
]
fixable = ["ALL"]
isort.split-on-trailing-comma = false
isort.known-first-party = ["pynxtools"]

[tool.ruff.format]
quote-style = "double"
Expand All @@ -181,4 +184,4 @@ exclude = ["src/pynxtools/definitions/*"]
[tool.uv]
extra-index-url = [
"https://gitlab.mpcdf.mpg.de/api/v4/projects/2187/packages/pypi/simple",
]
]
26 changes: 22 additions & 4 deletions src/pynxtools/dataconverter/chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

"""Configuration and utilities for customized chunking and compression."""

import importlib.util
import logging

import numpy as np
Expand All @@ -26,11 +27,29 @@
# one wraps the payload for a dataconverter template into a dictionary with
# keyword "compress", causing chunked layout to be used

COMPRESSION_FILTERS: list[str] = ["gzip"] # deflate
PYNX_ENABLE_BLOSC: bool = False # deactivated by default
# use only when it is acceptable to work with blosc2-compressed content downstream
# mind that doing so in C/C++, Matlab, and Fortran applications requires these apps
# to be linked against a customized HDF5 library that itself links to the blosc library
# consider that using blosc explicitly sets a certain number of cores eligible for
# doing compression and decompression work, which may drain resources when pynxtools
# is used in conjunction with other apps and services like NOMAD
# check set_nthreads in writer.py and adjust it to match your best practice
Comment on lines +31 to +37
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Too much of an in code comment. Let's simplify and use some defaults or add this to a more accessible README.md section/link to another .md and/or to the docs.


COMPRESSION_FILTERS: list[str] = (
["gzip", "blosc"]
if (
PYNX_ENABLE_BLOSC
and importlib.util.find_spec("hdf5plugin") is not None
and importlib.util.find_spec("blosc2") is not None
)
else ["gzip"]
)
# order matters! 0th entry always taken as the default for backwards compatibility
# "gzip" -> deflate, "blosc" -> "zstd"]
COMPRESSION_STRENGTH: int = 9
# integer values from 0 (effectively no compression), 1, ..., to at most 9 (strongest)
# using strongest compression is space efficient but can take substantially longer than
# using 1
# using strongest compression is space efficient but takes substantially longer than 1

# compressed payload is served as a dict with at least one keyword "compress",
# "strength" is optional keyword for that dictionary to overwrite the default
Expand Down Expand Up @@ -76,7 +95,6 @@

CHUNK_CONFIG_DEFAULT = CHUNK_CONFIG_HFIVEPY


logger = logging.getLogger("pynxtools")


Expand Down
39 changes: 36 additions & 3 deletions src/pynxtools/dataconverter/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
# pylint: disable=R0912

import copy
import importlib.util
import logging
import os
import sys
import xml.etree.ElementTree as ET

Expand All @@ -36,7 +38,28 @@
)

logger = logging.getLogger("pynxtools") # pylint: disable=C0103
from pynxtools.dataconverter.chunk import COMPRESSION_FILTERS, COMPRESSION_STRENGTH
from pynxtools.dataconverter.chunk import (
COMPRESSION_FILTERS,
COMPRESSION_STRENGTH,
PYNX_ENABLE_BLOSC,
)

if (
PYNX_ENABLE_BLOSC
and importlib.util.find_spec("hdf5plugin") is not None
and importlib.util.find_spec("blosc2") is not None
):
Comment on lines +47 to +51
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either we add these two dependencies to optional in the pyproject.toml then we have a check for them here. Or we keep the pyproject with these as mandatory dependencies and skip the check.

import blosc2
import hdf5plugin

NTHREADS_BLOSC = blosc2.set_nthreads(max(int(os.cpu_count() / 2), 1))
# use at most half the logical CPUs so hyperthreaded sibling cores are not oversubscribed
Comment on lines +55 to +56
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this necessary? Aren't there any sane defaults in the hdf5lib already?

logger.info(
f"blosc2 is configured to use {blosc2.nthreads} threads on host with {blosc2.ncores} cores"
)
logger.info(blosc2.print_versions())
else:
NTHREADS_BLOSC = 0


def does_path_exist(path, h5py_obj) -> bool:
Expand Down Expand Up @@ -153,7 +176,10 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path, append):
elif "compress" in data.keys():
if not (isinstance(data["compress"], str) or np.isscalar(data["compress"])):
if ("filter" in data.keys()) and (data["filter"] in COMPRESSION_FILTERS):
compression_filter = data["filter"]
if PYNX_ENABLE_BLOSC and data["filter"] == "blosc":
compression_filter = data["filter"]
else:
compression_filter = COMPRESSION_FILTERS[0]
else: # fall-back to default
compression_filter = COMPRESSION_FILTERS[0]

Expand All @@ -168,12 +194,19 @@ def handle_dicts_entries(data, grp, entry_name, output_path, path, append):

if entry_name not in grp:
try:
if compression_filter == "gzip":
compression_config = dict(
compression=compression_filter,
compression_opts=compression_strength,
)
else: # by virtue of construction blosc
compression_config = hdf5plugin.Blosc2(cname="zstd", clevel=9)
grp.create_dataset(
entry_name,
data=data["compress"],
compression=compression_filter,
chunks=chunking_strategy(data),
compression_opts=compression_strength,
**compression_config,
Comment on lines +197 to +209
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can have one function that deals with our compression needs instead of if statements here.

)
except ValueError:
logger.warning(f"ValueError caught upon creating_dataset {path}")
Expand Down
Loading