Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 18 additions & 26 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,86 +3,78 @@ requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/DD_tools"]
packages = ["src/TreeOfLife_toolbox"]

[project]
name = "DD_tools"
name = "TreeOfLife_toolbox"
dynamic = ["version"]
authors = [
{ name = "Andrey Kopanev", email = "kopanev.1@osu.edu" },
{ name = "Elizabeth G. Campolongo", email = "e.campolongo479@gmail.com" },
{ name = "Matthew J. Thompson", email = "thompson.m.j@outlook.com" },
]
description = "A tool for downloading files from a list of URLs in parallel."
description = "A tool for processing datasets that were downloaded using the distributed-downloader package."
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.10, <3.12"
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
dependencies = [
"attrs",
"brotli",
"certifi",
"charset-normalizer",
"cramjam",
"cython",
"exceptiongroup",
"fsspec",
"hatchling",
"idna",
"inflate64",
"iniconfig",
"mpi4py < 4",
"mpi4py",
"multivolumefile",
"numpy",
"opencv-python",
"packaging",
"pandas",
"pathspec",
"pillow",
"pip",
"pluggy",
"psutil",
"py4j",
"pyarrow",
"pybcj",
"pycryptodomex",
"pyppmd",
"pyspark",
"pytest",
"python-dateutil",
"python-dotenv",
"pytz",
"pyyaml",
"pyzstd",
"requests",
"setuptools",
"six",
"texttable",
"tomli",
"trove-classifiers",
"typing-extensions",
"tzdata",
"urllib3",
"wheel"
]

[project.optional-dependencies]
dev = ["pytest"]
dev = [
"pytest",
"ruff"
]

keywords = [
"parallel",
"distributed",
"download",
"url",
"mpi-applications",
"dataset-generation",
]

[project.urls]
Homepage = "https://github.com/Imageomics/distributed-downloader"
Repository = "https://github.com/Imageomics/distributed-downloader.git"
"Bug Tracker" = "https://github.com/Imageomics/distributed-downloader/issues"

[project.scripts]
tree_of_life_toolbox = "TreeOfLife_toolbox.main.main:main"

[tool.hatch.version]
path = "src/DD_tools/main/__about__.py"
path = "src/TreeOfLife_toolbox/main/__about__.py"
3 changes: 1 addition & 2 deletions scripts/tools_filter.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,10 @@ executor_memory="64G"
module load spark/3.4.1
module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

pbs-spark-submit \
--driver-memory $driver_memory \
--executor-memory $executor_memory \
"${REPO_ROOT}/src/distributed_downloader/tools/filter.py" \
"${TOOLBOX_PATH}/main/filter.py" \
"${tool_name}" \
> "${logs_dir}/tool_filter.log"
3 changes: 1 addition & 2 deletions scripts/tools_scheduler.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
Expand All @@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_scheduler.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/scheduler.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/scheduler.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_verifier.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
Expand All @@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_verifier.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/verification.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/verification.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_worker.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
Expand All @@ -28,4 +27,4 @@ srun \
--cpus-per-task="$TOOLS_CPU_PER_WORKER" \
--mem=0 \
--output="${logs_dir}/tool_worker-%2t.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/runner.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/runner.py" "${tool_name}"
Empty file removed src/DD_tools/__init__.py
Empty file.
1 change: 1 addition & 0 deletions src/TreeOfLife_toolbox/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from TreeOfLife_toolbox import lila_extra_noaa_processing
53 changes: 53 additions & 0 deletions src/TreeOfLife_toolbox/lila_extra_noaa_processing/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# LILA Extra NOAA Processing

## Overview

This tool processes the LILA NOAA (National Oceanic and Atmospheric Administration) dataset and converts it into the
standardized `TreeOfLife-toolbox` format compatible with the distributed-downloader ecosystem. The tool performs the
following key operations:

1. **Filtering**: Loads the NOAA dataset, standardizes column names, generates UUIDs, and partitions the data
2. **Scheduling**: Creates a processing schedule to distribute work across compute resources
3. **Processing**: Loads and crops images according to bounding box coordinates, computes hash values, and saves
processed data in parquet format

The tool was **specifically** developed to convert the LILA NOAA dataset into the `distributed-downloader` format. It will
not work on anything else.

## Configuration Requirements

### Required Fields in Config

- `og_images_root`: Path to the root directory of the NOAA images (absolute path)

## Assumptions/Pre-conditions

- The NOAA images are available in the `og_images_root` directory.
- The input CSV file contains the following columns:
- `detection_id`: Unique identifier for each detection
- `detection_type`: Life stage of the detected organism
- `rgb_image_path`: Relative path to the image from the root directory
- `rgb_left`, `rgb_right`, `rgb_top`, `rgb_bottom`: Bounding box coordinates for cropping
- The paths in `rgb_image_path` are relative to the `og_images_root` directory.

## Post-conditions

After successful execution, the following is guaranteed:

1. The processed data is available in the configured output directory with the structure:
```
{images_folder}/server_name=noaa/partition_id={id}/successes.parquet
```

2. Each parquet file contains:
- `uuid`: Unique identifier for each entry
- `source_id`: Original detection ID
- `identifier`: Full path to the original image
- `is_license_full`: Set to False (NOAA data does not include license information)
- `original_size`: Dimensions of the original image
- `resized_size`: Dimensions of the cropped image
- `hashsum_original`: MD5 hash of the original image
- `hashsum_resized`: MD5 hash of the cropped image
- `image`: Binary data of the cropped image

3. The verification tables confirm the completion of processing for all partitions.
5 changes: 5 additions & 0 deletions src/TreeOfLife_toolbox/lila_extra_noaa_processing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .classes import (
LilaExtraNoaaScheduleCreation,
LilaExtraNoaaFilter,
LilaExtraNoaaRunner,
)
Loading