44 changes: 18 additions & 26 deletions pyproject.toml
@@ -3,86 +3,78 @@ requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/DD_tools"]
packages = ["src/TreeOfLife_toolbox"]

[project]
name = "DD_tools"
name = "TreeOfLife_toolbox"
dynamic = ["version"]
authors = [
{ name = "Andrey Kopanev", email = "kopanev.1@osu.edu" },
{ name = "Elizabeth G. Campolongo", email = "e.campolongo479@gmail.com" },
{ name = "Matthew J. Thompson", email = "thompson.m.j@outlook.com" },
]
description = "A tool for downloading files from a list of URLs in parallel."
description = "A tool for processing datasets that was downloaded using the distributed-downloader package."
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.10, <3.12"
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
dependencies = [
"attrs",
"brotli",
"certifi",
"charset-normalizer",
"cramjam",
"cython",
"exceptiongroup",
"fsspec",
"hatchling",
"idna",
"inflate64",
"iniconfig",
"mpi4py < 4",
"mpi4py",
"multivolumefile",
"numpy",
"opencv-python",
"packaging",
"pandas",
"pathspec",
"pillow",
"pip",
"pluggy",
"psutil",
"py4j",
"pyarrow",
"pybcj",
"pycryptodomex",
"pyppmd",
"pyspark",
"pytest",
"python-dateutil",
"python-dotenv",
"pytz",
"pyyaml",
"pyzstd",
"requests",
"setuptools",
"six",
"texttable",
"tomli",
"trove-classifiers",
"typing-extensions",
"tzdata",
"urllib3",
"wheel"
]

[project.optional-dependencies]
dev = ["pytest"]
dev = [
"pytest",
"ruff"
]

keywords = [
"parallel",
"distributed",
"download",
"url",
"mpi-applications",
"dataset-generation",
]

[project.urls]
Homepage = "https://github.com/Imageomics/distributed-downloader"
Repository = "https://github.com/Imageomics/distributed-downloader.git"
"Bug Tracker" = "https://github.com/Imageomics/distributed-downloader/issues"

[project.scripts]
tree_of_life_toolbox = "TreeOfLife_toolbox.main.main:main"

[tool.hatch.version]
path = "src/DD_tools/main/__about__.py"
path = "src/TreeOfLife_toolbox/main/__about__.py"
3 changes: 1 addition & 2 deletions scripts/tools_filter.slurm
@@ -19,11 +19,10 @@ executor_memory="64G"
module load spark/3.4.1
module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

pbs-spark-submit \
--driver-memory $driver_memory \
--executor-memory $executor_memory \
"${REPO_ROOT}/src/distributed_downloader/tools/filter.py" \
"${TOOLBOX_PATH}/main/filter.py" \
"${tool_name}" \
> "${logs_dir}/tool_filter.log"
3 changes: 1 addition & 2 deletions scripts/tools_scheduler.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_scheduler.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/scheduler.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/scheduler.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_verifier.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_verifier.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/verification.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/verification.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_worker.slurm
@@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
@@ -28,4 +27,4 @@ srun \
--cpus-per-task="$TOOLS_CPU_PER_WORKER" \
--mem=0 \
--output="${logs_dir}/tool_worker-%2t.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/runner.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/runner.py" "${tool_name}"
Empty file removed src/DD_tools/__init__.py
Empty file.
1 change: 1 addition & 0 deletions src/TreeOfLife_toolbox/__init__.py
@@ -0,0 +1 @@
from TreeOfLife_toolbox import fathom_net_crop_fix
81 changes: 81 additions & 0 deletions src/TreeOfLife_toolbox/fathom_net_crop_fix/README.md
@@ -0,0 +1,81 @@
# FathomNet Crop Fix Tool

## Overview

This tool corrects improperly cropped FathomNet images by reprocessing them from the original source images with an
improved cropping routine. It fixes boundary issues in the previous cropping implementation by enforcing proper bounds
checking, so crop coordinates can no longer fall outside the source image.

The tool follows a three-stage processing pipeline:

1. **Filter Stage**: Identifies affected images by joining UUID tables with lookup information
2. **Scheduler Stage**: Organizes processing by server to enable efficient parallel execution
3. **Runner Stage**: Performs the actual image recropping using correct boundary parameters
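
For orientation, the sketch below shows the kind of join the filter stage performs, in minimal pandas form. The file
names, the parquet format, and the column names are assumptions drawn from the configuration table below, not the
tool's actual code (which lives in `FathomnetCropFixFilter`).

```python
import pandas as pd

# Hypothetical sketch of the filter stage's join; paths are placeholders.
uuids = pd.read_csv("uuid_table.csv")              # UUIDs of images needing recropping
lookup = pd.read_parquet("look_up_table.parquet")  # maps `uuid` to `file_name`

# An inner join keeps only the lookup rows for affected images,
# yielding the list of files to reprocess.
affected = uuids.merge(lookup, on="uuid", how="inner")
print(affected[["uuid", "file_name"]].head())
```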

> ⚠️ **Note**: This is a specialized tool built for a specific dataset issue. It should not be used for other cases
> without code modifications.

## Configuration Requirements

The following fields must be defined in your configuration file:

| Field | Description |
|----------------------------|-----------------------------------------------------------------------------|
| `uuid_table_path` | Path to CSV/parquet with UUIDs of images needing recropping |
| `look_up_table_path` | Path to lookup table with `uuid` to `file_name` mapping information |
| `filtered_by_size` | Path to original CSV containing bounding box coordinates and UUID matches |
| `data_transfer_table` | Path to CSV mapping ToL dataset files to original image locations |
| `base_path` | Base directory where images were transferred using the `data_transfer` tool |
| `original_image_base_path` | Base directory where original uncropped images are stored |
| `image_crop_path` | Output directory where corrected cropped images will be saved |
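
For concreteness, a hypothetical configuration snippet using these fields might look like the following. All paths are
placeholders, and the exact file format expected by the toolbox's config loader is an assumption (YAML is suggested by
the project's `pyyaml` dependency):

```yaml
# Hypothetical example values -- adjust all paths for your environment
uuid_table_path: /data/tol/fathomnet/uuids_to_fix.csv
look_up_table_path: /data/tol/fathomnet/uuid_lookup.parquet
filtered_by_size: /data/tol/fathomnet/filtered_by_size.csv
data_transfer_table: /data/tol/fathomnet/data_transfer_log.csv
base_path: /data/tol/dataset
original_image_base_path: /data/fathomnet/originals
image_crop_path: /data/tol/fathomnet/recropped
```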

## Pre-Conditions

For the tool to work correctly, the following conditions must be met:

- Original uncropped images still exist and are accessible
- Original images have not been modified since initial cropping
- Initial cropping was performed using the `fathom_net_crop` tool
- Images were transferred and restructured using the `data_transfer` tool
- Transfer logs are available to provide the mapping between new filenames and original files
- The provided `filtered_by_size` CSV contains valid bounding box information (x, y, width, height)

## Processing Details

The tool applies the following corrections to each image:

- Ensures crop boundaries stay within the original image dimensions
- Applies proper bounds checking to prevent negative coordinates
- Ensures maximum bounds do not exceed image dimensions
- Recalculates image hashes for the properly cropped images
- Preserves all original metadata while updating size information

## Post-Conditions

After successful execution:

- Corrected cropped images are saved to the `image_crop_path` directory
- Images are organized in a server-based directory structure
- Each output file contains properly cropped images with corrected dimensions
- Each cropped image maintains its original UUID and source identification
- New hashsum values are calculated for the corrected images
- Verification data is created to track processing completion

## Usage Notes

The tool is designed to run in a distributed environment using MPI. It processes images in per-server batches to
maximize throughput and manages timeouts so that jobs complete within their allocation limits.
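
As a rough, hypothetical sketch of how per-server batches could be divided across MPI ranks (the actual scheduling is
handled by the toolbox's scheduler and runner classes, not shown here):

```python
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

# Placeholder batch ids; in the real tool these come from the scheduler stage.
server_batches = [f"server_{i:02d}" for i in range(16)]

# Round-robin assignment: rank r handles batches r, r + size, r + 2*size, ...
for batch in server_batches[rank::size]:
    print(f"rank {rank} would recrop images for {batch}")
```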

**Technical Implementation**: The core fix applies proper boundary checking to ensure crop coordinates are within valid
image dimensions:

```python
# Calculate corrected crop coordinates with proper bounds checking
min_y = min(image_size[0], max(row["y"], 0))
min_x = min(image_size[1], max(row["x"], 0))
max_y = min(image_size[0], max(row["y"] + row["height"], 0))
max_x = min(image_size[1], max(row["x"] + row["width"], 0))
```

This prevents both negative coordinates and exceeding image dimensions, which were the main causes of errors in the
original cropping implementation.
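
To round out the picture, here is a hedged sketch of the follow-on steps the README describes: applying the corrected
bounds and recomputing a hashsum. The variables `min_y`, `min_x`, `max_y`, and `max_x` continue from the snippet above;
OpenCV I/O and MD5 are assumptions here, not confirmed details of the runner.

```python
import hashlib

import cv2  # opencv-python is a declared project dependency

img = cv2.imread("original_image.png")  # image_size = img.shape[:2], i.e. (height, width)

# Cast to int in case the bounding-box CSV parsed coordinates as floats.
crop = img[int(min_y):int(max_y), int(min_x):int(max_x)]

# Recompute the hashsum from the raw pixel bytes (hash scheme assumed).
hashsum = hashlib.md5(crop.tobytes()).hexdigest()
cv2.imwrite("corrected_crop.png", crop)
```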
5 changes: 5 additions & 0 deletions src/TreeOfLife_toolbox/fathom_net_crop_fix/__init__.py
@@ -0,0 +1,5 @@
from .classes import (
FathomnetCropFixFilter,
FathomnetCropFixScheduleCreation,
FathomnetCropFixRunner,
)