diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml new file mode 100644 index 0000000..1dd444e --- /dev/null +++ b/.github/workflows/draft-pdf.yml @@ -0,0 +1,28 @@ +name: Draft PDF +on: + push: + paths: + - paper/** + - .github/workflows/draft-pdf.yml + +jobs: + paper: + runs-on: ubuntu-latest + name: Paper Draft + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + # This should be the path to the paper within your repo. + paper-path: paper/paper.md + - name: Upload + uses: actions/upload-artifact@v4 + with: + name: paper + # This is the output path where Pandoc will write the compiled + # PDF. Note, this should be the same directory as the input + # paper.md + path: paper/paper.pdf diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 0000000..fe0d7fa --- /dev/null +++ b/paper/paper.bib @@ -0,0 +1,113 @@ +@software{histomicsui, + title = {HistomicsUI: Organize, visualize, annotate, and analyze histology images}, + author = {{Kitware, Inc}}, + year = {2025}, + note = {Package version 1.7.0}, + url = {https://github.com/DigitalSlideArchive/HistomicsUI}, + doi = {10.5281/zenodo.5474914} +} + +@software{histomicstk, + title = {HistomicsTK: a Python package for the analysis of digital pathology images}, + author = {{Kitware, Inc}}, + year = {2025}, + note = {Package version 1.4.0}, + url = {https://github.com/DigitalSlideArchive/HistomicsTK}, + doi = {10.5281/zenodo.14833780} +} + +@software{digitalslidearchive, + title = {Digital Slide Archive: a system for working with large microscopy images}, + author = {{Kitware, Inc}}, + year = {2025}, + note = {Commit 2da1bfc7365dd72011854b5aebf4a744cfcf98a1; Access: 2025-04-30}, + url = {https://github.com/DigitalSlideArchive/digital_slide_archive} +} + +@article{batchbald2019, + author = {Kirsch, Andreas and + van Amersfoort, Joost and + Gal, Yarin}, + title = {BatchBALD: 
Efficient and Diverse Batch Acquisition for Deep Bayesian + Active Learning}, + journal = {CoRR}, + volume = {abs/1906.08158}, + year = {2019}, + url = {http://arxiv.org/abs/1906.08158}, + eprinttype = {arXiv}, + eprint = {1906.08158}, + timestamp = {Thu, 14 Oct 2021 09:14:34 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1906-08158.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{Gutman2017, + title = {The Digital Slide Archive: A Software Platform for Management, Integration, and Analysis of Histology for Cancer Research}, + volume = {77}, + ISSN = {1538-7445}, + url = {http://dx.doi.org/10.1158/0008-5472.can-17-0629}, + DOI = {10.1158/0008-5472.can-17-0629}, + number = {21}, + journal = {Cancer Research}, + publisher = {American Association for Cancer Research (AACR)}, + author = {Gutman, David A and Khalilia, Mohammed and Lee, Sanghoon and Nalisnik, Michael and Mullen, Zach and Beezley, Jonathan and Chittajallu, Deepak R and Manthey, David and Cooper, Lee A D}, + year = {2017}, + month = {Oct}, + pages = {e75–e78} +} + +@misc{TCGAData, + author = {{National Cancer Institute} and {National Human Genome Research Institute}}, + title = {The Cancer Genome Atlas (TCGA) Program}, + year = {2022}, + url = {https://www.cancer.gov/tcga}, + note = {Accessed: 2022-11-10} +} + +@article{SLIC2012, + author = {Achanta, Radhakrishna and + Shaji, Appu and + Smith, Kevin and + Lucchi, Aurelien and + Fua, Pascal and + S\"usstrunk, Sabine}, + title = {SLIC superpixels compared to state-of-the-art superpixel methods}, + journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + year = {2012}, + volume = {34}, + number = {11}, + pages = {2274-2282}, + doi = {10.1109/TPAMI.2012.120} +} + +@article{huggingface2024uni, + author = {Chen, Richard J and + Ding, Tong and + Lu, Ming Y and + Williamson, Drew F K and + Jaume, Guillaume and + Song, Andrew H and + Chen, Bowen and + Zhang, Andrew and + Shao, Daniel and + 
Shaban, Muhammad and + Williams, Mane and + Oldenburg, Lukas and + Weishaupt, Luca L and + Wang, Judy J and + Vaidya, Anurag and + Le, Long Phi and + Gerber, Georg and + Sahai, Sharifa and + Williams, Walt and + Mahmood, Faisal}, + title = {Towards a general-purpose foundation model for computational pathology}, + journal = {Nature Medicine}, + year = {2024}, + volume = {30}, + number = {3}, + pages = {850-862}, + month = {Mar}, + url = {https://huggingface.co/MahmoodLab/UNI}, + doi = {10.1038/s41591-024-02857-3} +} diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 0000000..f4954e9 --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,90 @@ +--- +title: 'Histomics Label' +tags: + - Python + - histology + - bioimage informatics + - whole slide annotation + - whole slide images + - guided labeling +# (add orcid for anyone who has one) +authors: + - name: Brianna Major + affiliation: 1 + orcid: 0000-0003-4968-5701 + - name: Jeffery A. Goldstein + affiliation: 2 + orcid: 0000-0002-4086-057X + - name: Michael Nagler + affiliation: 1 + orcid: 0000-0003-3531-6630 + - name: Lee A. Newberg + affiliation: 1 + orcid: 0000-0003-4644-8874 + - name: Abhishek Sharma + affiliation: 2 + orcid: 0000-0001-6666-2179 + - name: Anders Sildnes + affiliation: 2 + orcid: 0009-0003-0141-6112 + - name: Faiza Ahmed + affiliation: 1 + orcid: 0000-0001-6687-9941 + - name: Jeff Baumes + affiliation: 1 + orcid: 0000-0002-4719-3490 + - name: Lee A.D. Cooper + affiliation: 2 + orcid: 0000-0002-3504-4965 + - name: David Manthey + affiliation: 1 + orcid: 0000-0002-4580-8770 +affiliations: + - index: 1 + name: Kitware, Inc., New York, United States + - index: 2 + name: Northwestern University Feinberg School of Medicine, Illinois, United States +date: 30 April 2025 +bibliography: paper.bib +--- + +# Summary + +`Histomics Label` is a software tool for the interactive development of machine-learning classifiers for whole slide pathology images. 
It is deployed as part of the Digital Slide Archive [@Gutman2017; @digitalslidearchive], a web-based data management system for whole slide image datasets, and was built on top of HistomicsUI [@histomicsui] and uses the HistomicsTK [@histomicstk] image analysis tool kit. + +Users label image regions or tissue structures to provide training data for classifiers and iteratively improve these classifiers by reviewing their output and providing additional labels. The interface uses heuristics to guide users to the most impactful examples to label, and supports bulk labeling of samples and review of labeled examples for collaboration. An example data generation pipeline is included that segments a whole slide image into superpixels, and generates feature embeddings for segmented regions using a foundation model. + +# Statement of need + +One of the limitations in developing classification models is the need for labeled data. In pathology and other medical fields, the expertise required for labeling and busy schedules of medical experts make labeling particularly challenging. For whole slide images, where each image can contain several billion pixels, navigating vast datasets in search of possibly rare tissue states can be very inefficient and frustrating. Software interfaces need to be optimized for the user experience and make the most of an expert's time and energy. + +Other issues in labeling include the volume and accessibility of data. Software that must run local to the data requires that all data be copied and correctly versioned for the project. Using a web-client and server model with appropriate permission models only requires that the data be on a centrally managed server. This allows there to be a single, coordinated source of data for a project, and reduces the burden on individual users to requiring only a web browser and an ordinary internet connection. 
This enables collaboration between multiple experts and allows experts to review the work of their trainees. + +`Histomics Label` uses a technique called active learning to identify the unlabeled examples that can provide the most benefit to classifier performance and provides an intuitive workflow +for presenting these examples to experts for efficient labeling. Data can be generated using a built-in pipeline that partitions whole-slide images into superpixels, or users can provide their own data from external cell or tissue segmentation algorithms. Users specify the categories that can be labeled and assign display properties like color, and can exclude categories from classifier training (for instance, for regions whose categories cannot be accurately determined). After labeling a few initial example regions, a classifier is trained and used both to predict the category of all regions and to identify the unlabeled regions that provide the most classifier benefit. The user can retrain the classifier at any time and review the classifier predictions and labels from other users. Labeling can also be performed by painting directly on the whole slide image with a brush tool. + +For development, the initial segmentation uses superpixels generated with the SLIC [@SLIC2012] algorithm. These are computed on whole slide images in a tiled manner so that they can work on arbitrarily large images, and the tile boundaries are properly handled to merge seamlessly. Once generated, segments are represented in one of two ways, either as two-dimensional patches, each centered in a fixed-sized square of masked pixels, or as one-dimensional feature embeddings, such as those generated from the Hugging Face UNI [@huggingface2024uni] foundation model. One of two basic models is trained based upon the segment representation. For two-dimensional patches, the model to be trained is a small-scale CNN implemented in TensorFlow/Keras or PyTorch. 
For one-dimensional vectors, the model to be trained is a single-layer linear classifier. The certainty criterion that determines which segments should be labeled next can also be selected; the options include confidence, margin, negative entropy, and the BatchBALD [@batchbald2019] algorithm. + +We had a placental pathologist provide feedback to validate the efficiency of the user interface and utility of the process. + +# Basic Workflow + +When starting a new labeling project, the user selects how superpixels are generated, which certainty metric is used for determining the optimal labeling order, and what features are used for model training. The labeling mode allows defining project labels and performing initial labeling. This mode can also be used to add new label categories or combine two categories if they should not have been distinct. Label categories can additionally be marked as excluded, which removes them from training and ensures that superpixels with those labels are no longer suggested for labeling. + +![The Bulk Labeling interface showing one of the project images divided into superpixels with some categories defined. A user can "paint" areas with known labels as an initial seed for the guided labeling process](../docs/screenshots/initial_labels.png) + +Once some segments have been labeled and an initial training process has been performed, additional segments are shown with their predictions. The user can use keyboard shortcuts or the mouse to confirm or correct labels. These are presented in an order that maximizes the utility of improving the model based on the originally selected certainty metric. + +![The Guided Labeling interface showing a row of superpixels to be labeled and part of a whole slide image](../docs/screenshots/active_learning_view.png) + +To check on overall behavior or correct mistakes, there is a review mode that allows seeing all labeled segments with various filtering and sorting options. 
This can be used to check agreement between pathologists or determine how well the model agrees with the manually labeled data. + +![The Review interface showing labeled superpixels in each category](../docs/screenshots/reviewmode.png) + +The whole slide image data in these figures are from data generated by the TCGA Research Network [@TCGAData]. + +# Acknowledgements + +This work has been funded in part by National Library of Medicine grant 5R01LM013523 entitled "Guiding humans to create better labeled datasets for machine learning in biomedical research". + +# References