Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 18 additions & 26 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,86 +3,78 @@ requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/DD_tools"]
packages = ["src/TreeOfLife_toolbox"]

[project]
name = "DD_tools"
name = "TreeOfLife_toolbox"
dynamic = ["version"]
authors = [
{ name = "Andrey Kopanev", email = "kopanev.1@osu.edu" },
{ name = "Elizabeth G. Campolongo", email = "e.campolongo479@gmail.com" },
{ name = "Matthew J. Thompson", email = "thompson.m.j@outlook.com" },
]
description = "A tool for downloading files from a list of URLs in parallel."
description = "A tool for processing datasets that were downloaded using the distributed-downloader package."
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.10, <3.12"
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
dependencies = [
"attrs",
"brotli",
"certifi",
"charset-normalizer",
"cramjam",
"cython",
"exceptiongroup",
"fsspec",
"hatchling",
"idna",
"inflate64",
"iniconfig",
"mpi4py < 4",
"mpi4py",
"multivolumefile",
"numpy",
"opencv-python",
"packaging",
"pandas",
"pathspec",
"pillow",
"pip",
"pluggy",
"psutil",
"py4j",
"pyarrow",
"pybcj",
"pycryptodomex",
"pyppmd",
"pyspark",
"pytest",
"python-dateutil",
"python-dotenv",
"pytz",
"pyyaml",
"pyzstd",
"requests",
"setuptools",
"six",
"texttable",
"tomli",
"trove-classifiers",
"typing-extensions",
"tzdata",
"urllib3",
"wheel"
]

[project.optional-dependencies]
dev = ["pytest"]
dev = [
"pytest",
"ruff"
]

keywords = [
"parallel",
"distributed",
"download",
"url",
"mpi-applications",
"dataset-generation",
]

[project.urls]
Homepage = "https://github.com/Imageomics/distributed-downloader"
Repository = "https://github.com/Imageomics/distributed-downloader.git"
"Bug Tracker" = "https://github.com/Imageomics/distributed-downloader/issues"

[project.scripts]
tree_of_life_toolbox = "TreeOfLife_toolbox.main.main:main"

[tool.hatch.version]
path = "src/DD_tools/main/__about__.py"
path = "src/TreeOfLife_toolbox/main/__about__.py"
3 changes: 1 addition & 2 deletions scripts/tools_filter.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,10 @@ executor_memory="64G"
module load spark/3.4.1
module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

pbs-spark-submit \
--driver-memory $driver_memory \
--executor-memory $executor_memory \
"${REPO_ROOT}/src/distributed_downloader/tools/filter.py" \
"${TOOLBOX_PATH}/main/filter.py" \
"${tool_name}" \
> "${logs_dir}/tool_filter.log"
3 changes: 1 addition & 2 deletions scripts/tools_scheduler.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
Expand All @@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_scheduler.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/scheduler.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/scheduler.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_verifier.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
Expand All @@ -28,4 +27,4 @@ srun \
--cpus-per-task=1 \
--mem=0 \
--output="${logs_dir}/tool_verifier.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/verification.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/verification.py" "${tool_name}"
3 changes: 1 addition & 2 deletions scripts/tools_worker.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ module load miniconda3/23.3.1-py310
source "${REPO_ROOT}/.venv/bin/activate"
export PYARROW_IGNORE_TIMEZONE=1
export I_MPI_JOB_RESPECT_PROCESS_PLACEMENT=0
export PYTHONPATH=${PYTHONPATH}:"${REPO_ROOT}/src":"${REPO_ROOT}/distributed-downloader"

srun \
--mpi=pmi2 \
Expand All @@ -28,4 +27,4 @@ srun \
--cpus-per-task="$TOOLS_CPU_PER_WORKER" \
--mem=0 \
--output="${logs_dir}/tool_worker-%2t.log" \
python "${REPO_ROOT}/src/distributed_downloader/tools/runner.py" "${tool_name}"
python "${TOOLBOX_PATH}/main/runner.py" "${tool_name}"
Empty file removed src/DD_tools/__init__.py
Empty file.
1 change: 1 addition & 0 deletions src/TreeOfLife_toolbox/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from TreeOfLife_toolbox import eol_rename
48 changes: 48 additions & 0 deletions src/TreeOfLife_toolbox/eol_rename/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# EoL Rename Tool

This tool enhances the Encyclopedia of Life (EoL) dataset by adding source identifiers to downloaded images.

## Overview

The EoL Rename tool processes images downloaded using the `distributed-downloader` tool. It enriches the dataset by:

1. Reading image data from the downloaded images directory
2. Reading original batch data containing EoL content and page IDs
3. Merging these datasets on the "uuid" field
4. Creating a new "source_id" field by concatenating "EOL content ID" and "EOL page ID"
5. Saving the updated data back to the original parquet files

This process ensures that images can be traced back to their source EoL content and pages.

## Components

The tool consists of three main components:

- **EoLRenameFilter**: Registers the 'eol_rename' filter in the system
- **EoLRenameScheduleCreation**: Creates execution schedules for rename operations
- **EoLRenameRunner**: Executes the actual renaming process by adding source IDs to image data

## Configuration

No additional configuration fields are required beyond the standard TreeOfLife toolbox configuration:

- Standard path configurations for downloaded images and URL folders are used
- The system will automatically locate the required data based on server_name and partition_id

## Pre-conditions

The tool requires the following to be true before running:

- Images must be downloaded using the `distributed-downloader` tool
- No additional data processing should have been performed on the dataset
- The folder structure must follow the distributed-downloader's conventions:
- Downloaded images stored in paths with `server_name` and `partition_id` partitions
- Original batch data available in the URLs folder with similar partitioning

## Post-conditions

After running the tool:

- The source_id will be in the format `{EOL content ID}_{EOL page ID}`
- Original parquet files will be updated in-place with the new field
- No duplicate or additional files will be created
1 change: 1 addition & 0 deletions src/TreeOfLife_toolbox/eol_rename/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .classes import EoLRenameFilter, EoLRenameScheduleCreation, EoLRenameRunner
131 changes: 131 additions & 0 deletions src/TreeOfLife_toolbox/eol_rename/classes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
"""
Encyclopedia of Life (EoL) dataset renaming module.

This module provides components for renaming images in the EoL dataset by merging
source identifiers from the original batch data. It includes filter, scheduler,
and runner classes for the EoL rename operation within the TreeOfLife toolbox.
"""

import os
from typing import List

import pandas as pd

from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.filters import PythonFilterToolBase, FilterRegister
from TreeOfLife_toolbox.main.runners import MPIRunnerTool, RunnerRegister
from TreeOfLife_toolbox.main.schedulers import DefaultScheduler, SchedulerRegister


@FilterRegister("eol_rename")
class EoLRenameFilter(PythonFilterToolBase):
    """
    Registers the 'eol_rename' filter with the toolbox filter registry.

    All filtering behavior is inherited unchanged from
    ``PythonFilterToolBase``; this subclass exists solely to bind the
    ``eol_rename`` name under which the tool is looked up.

    Attributes:
        filter_name: Identifier used for registration and lookup.
    """

    def __init__(self, cfg: Config):
        super().__init__(cfg)
        self.filter_name: str = "eol_rename"


@SchedulerRegister("eol_rename")
class EoLRenameScheduleCreation(DefaultScheduler):
    """
    Registers the scheduler used for the 'eol_rename' tool.

    Scheduling logic is inherited unchanged from ``DefaultScheduler``;
    this subclass exists solely to bind the ``eol_rename`` name under
    which the scheduler is looked up.

    Attributes:
        filter_name: Identifier used for registration and lookup.
    """

    def __init__(self, cfg: Config):
        super().__init__(cfg)
        self.filter_name: str = "eol_rename"


@RunnerRegister("eol_rename")
class EoLRenameRunner(MPIRunnerTool):
    """
    Runner class for executing the EoL rename operations.

    This class handles the actual processing of the EoL dataset images,
    adding source identifiers by merging information from batch data
    with downloaded image data.

    Attributes:
        filter_name: Name of the filter used for registration and identification.
        data_scheme: List of fields used to partition the dataset.
        verification_scheme: List of fields used for verification.
        total_time: Maximum allowed execution time in seconds.
    """

    def __init__(self, cfg: Config):
        super().__init__(cfg)
        self.filter_name: str = "eol_rename"
        # The dataset on disk is hive-partitioned by these two fields.
        self.data_scheme: List[str] = ["server_name", "partition_id"]
        self.verification_scheme: List[str] = ["server_name", "partition_id"]
        # Time budget (seconds) checked by is_enough_time() before each
        # expensive step.
        self.total_time = 150

    def apply_filter(
        self, filtering_df: pd.DataFrame, server_name: str, partition_id: int
    ) -> int:
        """
        Apply the EoL rename filter to a specific partition of data.

        This method adds source identifiers to the downloaded images data by
        merging 'EOL content ID' and 'EOL page ID' from the original batch data.
        It concatenates these IDs to create a 'source_id' field and saves the
        updated data back to the original successes.parquet file.

        Args:
            filtering_df: DataFrame containing the filter data.
            server_name: Name of the server containing the data.
            partition_id: Partition ID within the server.

        Returns:
            int: Number of records in the updated partition (0 if the
            partition's parquet file does not exist).

        Notes:
            - Checks for time constraints before each expensive step.
            - Skips processing if the parquet path doesn't exist.
            - The partition's successes.parquet is rewritten in place.
        """
        self.is_enough_time()

        parquet_path = os.path.join(
            self.downloaded_images_path,
            f"server_name={server_name}",
            f"partition_id={partition_id}",
            "successes.parquet",
        )
        server_batch_path = os.path.join(
            self.config.get_folder("urls_folder"),
            f"server_name={server_name}",
            f"partition_id={partition_id}",
        )

        if not os.path.exists(parquet_path):
            # Fixed log-message grammar ("doesn't exists" -> "doesn't exist").
            self.logger.info(f"Path doesn't exist: {parquet_path}")
            return 0

        parquet = pd.read_parquet(parquet_path)
        server_batch = pd.read_parquet(
            server_batch_path, columns=["EOL content ID", "EOL page ID", "uuid"]
        )

        self.is_enough_time()

        # validate="1:1" raises if uuid is not unique on either side, so the
        # row count of `parquet` is preserved by the merge.
        parquet = parquet.merge(server_batch, on="uuid", how="left", validate="1:1")
        # NOTE(review): assumes both EOL ID columns are string dtype — "+"
        # would raise on numeric dtypes. Also, rows unmatched by the left
        # merge carry NaN here, which propagates into source_id — confirm
        # every downloaded uuid is present in the batch data.
        parquet["source_id"] = parquet["EOL content ID"] + "_" + parquet["EOL page ID"]

        parquet.to_parquet(
            parquet_path, index=False, compression="zstd", compression_level=3
        )

        return len(parquet)
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import argparse
import os

from DD_tools.main.checkpoint import Checkpoint
from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import init_logger
from TreeOfLife_toolbox.main.checkpoint import Checkpoint
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import init_logger

if __name__ == "__main__":
config_path = os.environ.get("CONFIG_PATH")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

from DD_tools.main.config import Config
from DD_tools.main.registry import ToolsBase
from DD_tools.main.registry import ToolsRegistryBase
from DD_tools.main.utils import SuccessEntry
from TreeOfLife_toolbox.main.config import Config
from TreeOfLife_toolbox.main.registry import ToolsBase
from TreeOfLife_toolbox.main.registry import ToolsRegistryBase
from TreeOfLife_toolbox.main.utils import SuccessEntry

FilterRegister = partial(ToolsRegistryBase.register, "filter")

Expand Down
Loading