Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 18 additions & 26 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,86 +3,78 @@ requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/DD_tools"]
packages = ["src/TreeOfLife_toolbox"]

[project]
name = "DD_tools"
name = "TreeOfLife_toolbox"
dynamic = ["version"]
authors = [
{ name = "Andrey Kopanev", email = "kopanev.1@osu.edu" },
{ name = "Elizabeth G. Campolongo", email = "e.campolongo479@gmail.com" },
{ name = "Matthew J. Thompson", email = "thompson.m.j@outlook.com" },
]
description = "A tool for downloading files from a list of URLs in parallel."
description = "A tool for processing datasets that were downloaded using the distributed-downloader package."
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.10, <3.12"
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
dependencies = [
"attrs",
"brotli",
"certifi",
"charset-normalizer",
"cramjam",
"cython",
"exceptiongroup",
"fsspec",
"hatchling",
"idna",
"inflate64",
"iniconfig",
"mpi4py < 4",
"mpi4py",
"multivolumefile",
"numpy",
"opencv-python",
"packaging",
"pandas",
"pathspec",
"pillow",
"pip",
"pluggy",
"psutil",
"py4j",
"pyarrow",
"pybcj",
"pycryptodomex",
"pyppmd",
"pyspark",
"pytest",
"python-dateutil",
"python-dotenv",
"pytz",
"pyyaml",
"pyzstd",
"requests",
"setuptools",
"six",
"texttable",
"tomli",
"trove-classifiers",
"typing-extensions",
"tzdata",
"urllib3",
"wheel"
]

[project.optional-dependencies]
dev = ["pytest"]
dev = [
"pytest",
"ruff"
]

keywords = [
"parallel",
"distributed",
"download",
"url",
"mpi-applications",
"dataset-generation",
]

[project.urls]
Homepage = "https://github.com/Imageomics/distributed-downloader"
Repository = "https://github.com/Imageomics/distributed-downloader.git"
"Bug Tracker" = "https://github.com/Imageomics/distributed-downloader/issues"

[project.scripts]
tree_of_life_toolbox = "TreeOfLife_toolbox.main.main:main"

[tool.hatch.version]
path = "src/DD_tools/main/__about__.py"
path = "src/TreeOfLife_toolbox/main/__about__.py"
3 changes: 1 addition & 2 deletions scripts/tools_filter.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,10 @@ executor_memory="64G"
module load spark/3.4.1
module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

pbs-spark-submit \
--driver-memory $driver_memory \
--executor-memory $executor_memory \
"${REPO_ROOT}/src/distributed_downloader/tools/filter.py" \
"${TOOLBOX_PATH}/main/filter.py" \
"${tool_name}" \
> "${logs_dir}/tool_filter.log"
3 changes: 1 addition & 2 deletions scripts/tools_scheduler.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
Expand All @@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_scheduler.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/scheduler.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/scheduler.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_verifier.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
Expand All @@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_verifier.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/verification.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/verification.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_worker.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
Expand All @@ -28,4 +27,4 @@ srun \
--cpus-per-task="$TOOLS_CPU_PER_WORKER" \
--mem=0 \
--output="${logs_dir}/tool_worker-%2t.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/runner.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/runner.py" "${tool_name}"
Empty file removed src/DD_tools/__init__.py
Empty file.
1 change: 1 addition & 0 deletions src/TreeOfLife_toolbox/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from TreeOfLife_toolbox import eol_rename
48 changes: 48 additions & 0 deletions src/TreeOfLife_toolbox/eol_rename/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# EoL Rename Tool

This tool enhances the Encyclopedia of Life (EoL) dataset by adding source identifiers to downloaded images.

## Overview

The EoL Rename tool processes images downloaded using the `distributed-downloader` tool. It enriches the dataset by:

1. Reading image data from the downloaded images directory
2. Reading original batch data containing EoL content and page IDs
3. Merging these datasets on the "uuid" field
4. Creating a new "source_id" field by concatenating "EOL content ID" and "EOL page ID"
5. Saving the updated data back to the original parquet files

This process ensures that images can be traced back to their source EoL content and pages.

## Components

The tool consists of three main components:

- **EoLRenameFilter**: Registers the 'eol_rename' filter in the system
- **EoLRenameScheduleCreation**: Creates execution schedules for rename operations
- **EoLRenameRunner**: Executes the actual renaming process by adding source IDs to image data

## Configuration

No additional configuration fields are required beyond the standard TreeOfLife toolbox configuration:

- Standard path configurations for downloaded images and URL folders are used
- The system will automatically locate the required data based on server_name and partition_id

## Pre-conditions

The tool requires the following to be true before running:

- Images must be downloaded using the `distributed-downloader` tool
- No additional data processing should have been performed on the dataset
- The folder structure must follow the distributed-downloader's conventions:
- Downloaded images stored in paths with `server_name` and `partition_id` partitions
- Original batch data available in the URLs folder with similar partitioning

## Post-conditions

After running the tool:

- The source_id will be in the format `{EOL content ID}_{EOL page ID}`
- Original parquet files will be updated in-place with the new field
- No duplicate or additional files will be created
1 change: 1 addition & 0 deletions src/TreeOfLife_toolbox/eol_rename/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .classes import EoLRenameFilter, EoLRenameScheduleCreation, EoLRenameRunner
131 changes: 131 additions & 0 deletions src/TreeOfLife_toolbox/eol_rename/classes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
"""
Encyclopedia of Life (EoL) dataset renaming module.

This module provides components for renaming images in the EoL dataset by merging
source identifiers from the original batch data. It includes filter, scheduler,
and runner classes for the EoL rename operation within the TreeOfLife toolbox.
"""

import os
from typing import List

import pandas as pd

from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.filters import PythonFilterToolBase, FilterRegister
from TreeOfLife_toolbox.main.runners import MPIRunnerTool, RunnerRegister
from TreeOfLife_toolbox.main.schedulers import DefaultScheduler, SchedulerRegister


@FilterRegister("eol_rename")
class EoLRenameFilter(PythonFilterToolBase):
    """
    Registers the 'eol_rename' filter with the toolbox filter registry.

    All filtering behavior is inherited unchanged from
    ``PythonFilterToolBase``; this subclass exists solely to bind the
    ``eol_rename`` name under which the tool is looked up.

    Attributes:
        filter_name: Identifier used for registration and lookup.
    """

    def __init__(self, cfg: Config):
        super().__init__(cfg)
        self.filter_name: str = "eol_rename"


@SchedulerRegister("eol_rename")
class EoLRenameScheduleCreation(DefaultScheduler):
    """
    Registers the scheduler used for the 'eol_rename' tool.

    Scheduling logic is inherited unchanged from ``DefaultScheduler``;
    this subclass exists solely to bind the ``eol_rename`` name under
    which the scheduler is looked up.

    Attributes:
        filter_name: Identifier used for registration and lookup.
    """

    def __init__(self, cfg: Config):
        super().__init__(cfg)
        self.filter_name: str = "eol_rename"


@RunnerRegister("eol_rename")
class EoLRenameRunner(MPIRunnerTool):
    """
    Runner class for executing the EoL rename operations.

    This class handles the actual processing of the EoL dataset images,
    adding source identifiers by merging information from batch data
    with downloaded image data.

    Attributes:
        filter_name: Name of the filter used for registration and identification.
        data_scheme: List of fields used to partition the dataset.
        verification_scheme: List of fields used for verification.
        total_time: Maximum allowed execution time in seconds.
    """

    def __init__(self, cfg: Config):
        super().__init__(cfg)
        self.filter_name: str = "eol_rename"
        # The dataset on disk is hive-partitioned by these two fields.
        self.data_scheme: List[str] = ["server_name", "partition_id"]
        self.verification_scheme: List[str] = ["server_name", "partition_id"]
        # Time budget (seconds) checked by is_enough_time() before each
        # expensive step.
        self.total_time = 150

    def apply_filter(
        self, filtering_df: pd.DataFrame, server_name: str, partition_id: int
    ) -> int:
        """
        Apply the EoL rename filter to a specific partition of data.

        This method adds source identifiers to the downloaded images data by
        merging 'EOL content ID' and 'EOL page ID' from the original batch data.
        It concatenates these IDs to create a 'source_id' field and saves the
        updated data back to the original successes.parquet file.

        Args:
            filtering_df: DataFrame containing the filter data.
            server_name: Name of the server containing the data.
            partition_id: Partition ID within the server.

        Returns:
            int: Number of records in the updated partition (0 if the
            partition's parquet file does not exist).

        Notes:
            - Checks for time constraints before each expensive step.
            - Skips processing if the parquet path doesn't exist.
            - The partition's successes.parquet is rewritten in place.
        """
        self.is_enough_time()

        parquet_path = os.path.join(
            self.downloaded_images_path,
            f"server_name={server_name}",
            f"partition_id={partition_id}",
            "successes.parquet",
        )
        server_batch_path = os.path.join(
            self.config.get_folder("urls_folder"),
            f"server_name={server_name}",
            f"partition_id={partition_id}",
        )

        if not os.path.exists(parquet_path):
            # Fixed log-message grammar ("doesn't exists" -> "doesn't exist").
            self.logger.info(f"Path doesn't exist: {parquet_path}")
            return 0

        parquet = pd.read_parquet(parquet_path)
        server_batch = pd.read_parquet(
            server_batch_path, columns=["EOL content ID", "EOL page ID", "uuid"]
        )

        self.is_enough_time()

        # validate="1:1" raises if uuid is not unique on either side, so the
        # row count of `parquet` is preserved by the merge.
        parquet = parquet.merge(server_batch, on="uuid", how="left", validate="1:1")
        # NOTE(review): assumes both EOL ID columns are string dtype — "+"
        # would raise on numeric dtypes. Also, rows unmatched by the left
        # merge carry NaN here, which propagates into source_id — confirm
        # every downloaded uuid is present in the batch data.
        parquet["source_id"] = parquet["EOL content ID"] + "_" + parquet["EOL page ID"]

        parquet.to_parquet(
            parquet_path, index=False, compression="zstd", compression_level=3
        )

        return len(parquet)
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import argparse
import os

from DD_tools.main.checkpoint import Checkpoint
from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import init_logger
from TreeOfLife_toolbox.main.checkpoint import Checkpoint
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import init_logger

if __name__ == "__main__":
config_path = os.environ.get("CONFIG_PATH")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsBase
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import SuccessEntry
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsBase
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import SuccessEntry

FilterRegister = partial(ToolsRegistryBase.register, "filter")

Expand Down
Loading