From b09077c8b35a6db6b797e5aeccb445a1717e6397 Mon Sep 17 00:00:00 2001 From: Arash Date: Mon, 19 Jan 2026 17:20:57 +0100 Subject: [PATCH 1/6] Normalize version fields to strings in import scripts Add normalize_version_fields function to convert version fields (which can be int, float, or str) to string type for consistency. Integrate version normalization into all import scripts: - bioconda: normalize package.version - bioconductor: normalize Version - biotools: normalize version and nested version fields - galaxytool: normalize Suite_version, conda package version, and workflow versions --- bioconda-import/bioconda_importer.py | 5 +++ bioconductor-import/import.py | 5 +++ biotools-import/import.py | 7 ++++ common/metadata.py | 51 ++++++++++++++++++++++++++ galaxytool-import/galaxytool-import.py | 12 ++++++ 5 files changed, 80 insertions(+) create mode 100644 common/metadata.py diff --git a/bioconda-import/bioconda_importer.py b/bioconda-import/bioconda_importer.py index d2aa19f..ba9fbc1 100644 --- a/bioconda-import/bioconda_importer.py +++ b/bioconda-import/bioconda_importer.py @@ -1,11 +1,15 @@ #!/usr/bin/env python import os +import sys import yaml import argparse from pathlib import Path import jinja2 +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from common.metadata import normalize_version_fields + def clean(content_path): import_directory = os.path.join(content_path, "imports", "bioconda") @@ -67,6 +71,7 @@ def merge(conda, content_path): biotools_data_path = os.path.join(content_path, "data") for name, data in conda.items(): try: + data = normalize_version_fields(data, ["package.version"]) package_name = data["package"]["name"] import_file_path = os.path.join( bioconda_import_path, f"bioconda_{package_name}.yaml" diff --git a/bioconductor-import/import.py b/bioconductor-import/import.py index 13b6568..b32f837 100644 --- a/bioconductor-import/import.py +++ b/bioconductor-import/import.py @@ -2,10 +2,14 @@ import glob import json import os +import sys import requests import logging import yaml +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from common.metadata import normalize_version_fields + # Set up logging logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" @@ -99,6 +103,7 @@ def retrieve(version, filters=None): ) try: + pack = normalize_version_fields(pack, ["Version"]) with open(path, "w") as write_file: json.dump( pack, write_file, sort_keys=True, indent=4, separators=(",", ": ") diff --git a/biotools-import/import.py b/biotools-import/import.py index 199d677..256bfb8 100644 --- a/biotools-import/import.py +++ b/biotools-import/import.py @@ -6,6 +6,9 @@ import requests from boltons.iterutils import remap +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from common.metadata import normalize_version_fields + BIOTOOLS_DOMAIN = "https://bio.tools" SSL_VERIFY = True @@ -62,6 +65,10 @@ def drop_false(path, key, value): return bool(value) tool_cleaned = remap(tool, visit=drop_false) + tool_cleaned = normalize_version_fields( + tool_cleaned, ["version", "version[].version"] + ) + json.dump( tool_cleaned, write_file, diff --git a/common/metadata.py b/common/metadata.py new file mode 100644 index 0000000..ea7b151 --- /dev/null +++ b/common/metadata.py @@ -0,0 +1,51 @@ +def normalize_version_to_string(value): + if value is None or isinstance(value, bool): + return value + if isinstance(value, (int, float)): + return str(value) + if isinstance(value, list): + return [normalize_version_to_string(v) for v in value] + if isinstance(value, dict): + return {k: normalize_version_to_string(v) for k, v in value.items()} + return value + + +def normalize_version_fields(data, field_paths): + if not isinstance(data, dict): + raise TypeError(f"Expected dict, got {type(data).__name__}") + + for field_path in field_paths: + try: + if "[" in field_path: + if "[]." not in field_path: + list_key = field_path.rstrip("[]") + if list_key in data and isinstance(data[list_key], list): + data[list_key] = normalize_version_to_string(data[list_key]) + else: + list_key, item_path = field_path.split("[].", 1) + if list_key in data and isinstance(data[list_key], list): + for item in data[list_key]: + if isinstance(item, dict) and item_path in item: + item[item_path] = normalize_version_to_string( + item[item_path] + ) + elif "." in field_path: + keys = field_path.split(".") + current = data + for key in keys[:-1]: + if not isinstance(current, dict) or key not in current: + break + current = current[key] + else: + final_key = keys[-1] + if isinstance(current, dict) and final_key in current: + current[final_key] = normalize_version_to_string( + current[final_key] + ) + else: + if field_path in data: + data[field_path] = normalize_version_to_string(data[field_path]) + except Exception: + continue + + return data diff --git a/galaxytool-import/galaxytool-import.py b/galaxytool-import/galaxytool-import.py index e5ee60d..a072841 100644 --- a/galaxytool-import/galaxytool-import.py +++ b/galaxytool-import/galaxytool-import.py @@ -1,9 +1,13 @@ import glob import json import os +import sys import requests +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from common.metadata import normalize_version_fields + GALAXY_ALL_TOOLS_METADATA = "https://raw.githubusercontent.com/galaxyproject/galaxy_codex/refs/heads/main/communities/all/resources/tools.json" GALAXY_ALL_WORKFLOWS_METADATA = "https://raw.githubusercontent.com/galaxyproject/galaxy_codex/refs/heads/main/communities/all/resources/workflows.json" @@ -72,6 +76,14 @@ def retrieve(): # store tool json in galaxy import folder galaxy_tool_id = galaxy_tool_id.lower() tool_cleaned = {k.replace(" ", "_"): v for k, v in tool.items()} + tool_cleaned = normalize_version_fields( + tool_cleaned, + [ + "Suite_version", + "Latest_suite_conda_package_version", + "Related_Workflows[].latest_version", + ], + ) save_path = os.path.join(galaxy_directory, f"{galaxy_tool_id}.galaxy.json") with open(save_path, "w") as write_file: json.dump( From db4a0b7d87aac4b42212b483851478552523a63f Mon Sep 17 00:00:00 2001 From: Arash Date: Mon, 2 Feb 2026 13:43:16 +0100 Subject: [PATCH 2/6] Refactor version normalization to target only the main version field --- biotools-import/import.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biotools-import/import.py b/biotools-import/import.py index 256bfb8..2e0e0bd 100644 --- a/biotools-import/import.py +++ b/biotools-import/import.py @@ -66,7 +66,7 @@ def drop_false(path, key, value): tool_cleaned = remap(tool, visit=drop_false) tool_cleaned = normalize_version_fields( - tool_cleaned, ["version", "version[].version"] + tool_cleaned, ["version"] ) json.dump( From 07afcef3c80fa8fef697addc7e7c26edaf7ea1a6 Mon Sep 17 00:00:00 2001 From: Arash Date: Mon, 2 Feb 2026 13:43:23 +0100 Subject: [PATCH 3/6] Enhance version normalization functions with detailed docstrings and improved error handling --- common/metadata.py | 58 ++++++++++++++++++++++++++++- research-software-ecosystem-content | 1 + 2 files changed, 57 insertions(+), 2 deletions(-) create mode 160000 research-software-ecosystem-content diff --git a/common/metadata.py b/common/metadata.py index ea7b151..6d92642 100644 --- a/common/metadata.py +++ b/common/metadata.py @@ -1,4 +1,32 @@ +import logging + + def normalize_version_to_string(value): + """ + Recursively convert version values to strings. + + This function processes version data by converting numeric types to strings + while preserving None and boolean values. It recursively processes nested + structures (lists and dicts). + + Args: + value: The value to normalize. Can be any type. + + Returns: + - None and bool values are returned unchanged + - int and float values are converted to strings + - Lists are processed recursively, returning a new list with normalized values + - Dicts are processed recursively, returning a new dict with normalized values + - Other types are returned unchanged + + Examples: + >>> normalize_version_to_string(1) + '1' + >>> normalize_version_to_string([1, 2, 3]) + ['1', '2', '3'] + >>> normalize_version_to_string({'version': 1.5}) + {'version': '1.5'} + """ if value is None or isinstance(value, bool): return value if isinstance(value, (int, float)): @@ -11,6 +39,31 @@ def normalize_version_to_string(value): def normalize_version_fields(data, field_paths): + """ + Normalize version fields to strings in a data dictionary. + + This function takes a dictionary and a collection of field paths, then normalizes + the version values at those paths to strings using normalize_version_to_string. + + Args: + data (dict): The dictionary to process. + field_paths (iterable): An iterable of field path strings. Supports: + - Simple fields: "version" + - Nested fields: "tool.version" + - List fields: "versions[]" + - List item nested fields: "versions[].version" + + Returns: + dict: The modified data dictionary with normalized version fields. + + Raises: + TypeError: If data is not a dictionary. + + Examples: + >>> data = {"version": 1, "versions": [{"version": 2}]} + >>> normalize_version_fields(data, ["version", "versions[].version"]) + {'version': '1', 'versions': [{'version': '2'}]} + """ if not isinstance(data, dict): raise TypeError(f"Expected dict, got {type(data).__name__}") @@ -18,7 +71,7 @@ def normalize_version_fields(data, field_paths): try: if "[" in field_path: if "[]." not in field_path: - list_key = field_path.rstrip("[]") + list_key = field_path[:-2] if field_path.endswith("[]") else field_path if list_key in data and isinstance(data[list_key], list): data[list_key] = normalize_version_to_string(data[list_key]) else: @@ -45,7 +98,8 @@ def normalize_version_fields(data, field_paths): else: if field_path in data: data[field_path] = normalize_version_to_string(data[field_path]) - except Exception: + except (KeyError, TypeError, IndexError, AttributeError) as e: + logging.debug(f"Skipping field path '{field_path}': {e}") continue return data diff --git a/research-software-ecosystem-content b/research-software-ecosystem-content new file mode 160000 index 0000000..f1355a7 --- /dev/null +++ b/research-software-ecosystem-content @@ -0,0 +1 @@ +Subproject commit f1355a7c14310cbf3eeed9661335fff8f6a57d91 From 2b033641cf716d0cee336892b5537ef2d523acf7 Mon Sep 17 00:00:00 2001 From: Arash Date: Mon, 2 Feb 2026 13:44:29 +0100 Subject: [PATCH 4/6] Remove obsolete subproject content directory --- research-software-ecosystem-content | 1 - 1 file changed, 1 deletion(-) delete mode 160000 research-software-ecosystem-content diff --git a/research-software-ecosystem-content b/research-software-ecosystem-content deleted file mode 160000 index f1355a7..0000000 --- a/research-software-ecosystem-content +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f1355a7c14310cbf3eeed9661335fff8f6a57d91 From d7f4634726b15bc1c1d95048c84deb46d8892615 Mon Sep 17 00:00:00 2001 From: Arash Date: Tue, 3 Mar 2026 11:56:03 +0100 Subject: [PATCH 5/6] Refactor normalize_version_fields function for improved readability and maintainability --- biotools-import/import.py | 7 +++---- common/metadata.py | 4 +++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/biotools-import/import.py b/biotools-import/import.py index 2e0e0bd..75557c6 100644 --- a/biotools-import/import.py +++ b/biotools-import/import.py @@ -1,5 +1,6 @@ import json import os +import sys import glob import argparse @@ -7,7 +8,7 @@ from boltons.iterutils import remap sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from common.metadata import normalize_version_fields +from common.metadata import normalize_version_fields # noqa: E402 BIOTOOLS_DOMAIN = "https://bio.tools" SSL_VERIFY = True @@ -65,9 +66,7 @@ def drop_false(path, key, value): return bool(value) tool_cleaned = remap(tool, visit=drop_false) - tool_cleaned = normalize_version_fields( - tool_cleaned, ["version"] - ) + tool_cleaned = normalize_version_fields(tool_cleaned, ["version"]) json.dump( tool_cleaned, diff --git a/common/metadata.py b/common/metadata.py index 6d92642..b33a010 100644 --- a/common/metadata.py +++ b/common/metadata.py @@ -71,7 +71,9 @@ def normalize_version_fields(data, field_paths): try: if "[" in field_path: if "[]." not in field_path: - list_key = field_path[:-2] if field_path.endswith("[]") else field_path + list_key = ( + field_path[:-2] if field_path.endswith("[]") else field_path + ) if list_key in data and isinstance(data[list_key], list): data[list_key] = normalize_version_to_string(data[list_key]) else: From 9f0c526841de8bf1fb83b36a2baee8d27d76eec1 Mon Sep 17 00:00:00 2001 From: Arash Date: Tue, 3 Mar 2026 11:58:10 +0100 Subject: [PATCH 6/6] Fix import order for normalize_version_fields to comply with style guidelines --- biotools-import/import.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biotools-import/import.py b/biotools-import/import.py index 75557c6..5cf8598 100644 --- a/biotools-import/import.py +++ b/biotools-import/import.py @@ -8,7 +8,7 @@ from boltons.iterutils import remap sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from common.metadata import normalize_version_fields # noqa: E402 +from common.metadata import normalize_version_fields BIOTOOLS_DOMAIN = "https://bio.tools" SSL_VERIFY = True