From d0294237734ffe117ec2a04b9e5a227b610a5f29 Mon Sep 17 00:00:00 2001 From: David Manthey Date: Wed, 30 Apr 2025 13:51:10 -0400 Subject: [PATCH 01/10] Draft of JOSS paper --- .github/workflows/draft-pdf.yml | 28 +++++++++++++ paper/paper.bib | 57 ++++++++++++++++++++++++++ paper/paper.md | 71 +++++++++++++++++++++++++++++++++ 3 files changed, 156 insertions(+) create mode 100644 .github/workflows/draft-pdf.yml create mode 100644 paper/paper.bib create mode 100644 paper/paper.md diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml new file mode 100644 index 0000000..1dd444e --- /dev/null +++ b/.github/workflows/draft-pdf.yml @@ -0,0 +1,28 @@ +name: Draft PDF +on: + push: + paths: + - paper/** + - .github/workflows/draft-pdf.yml + +jobs: + paper: + runs-on: ubuntu-latest + name: Paper Draft + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + # This should be the path to the paper within your repo. + paper-path: paper/paper.md + - name: Upload + uses: actions/upload-artifact@v4 + with: + name: paper + # This is the output path where Pandoc will write the compiled + # PDF. Note, this should be the same directory as the input + # paper.md + path: paper/paper.pdf diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 0000000..72f1018 --- /dev/null +++ b/paper/paper.bib @@ -0,0 +1,57 @@ +@software{histomicsui, + title = {HistomicsUI: Organize, visualize, annotate, and analyze histology images}, + author = {{Kitware, Inc}}, + year = {2025}, + note = {Package version 1.7.0}, + url = {https://github.com/DigitalSlideArchive/HistomicsUI}, + doi = {10.5281/zenodo.5474914}, +} + +@software{histomicstk, + title = {HistomicsTK: a Python package for the analysis of digital pathology images}, + author = {{Kitware, Inc}}, + year = {2025}, + note = {Package version 1.4.0}, + url = {https://github.com/DigitalSlideArchive/HistomicsTK}, + doi = {10.5281/zenodo.14833780}, +} + +@software{digitalslidearchive, + title = {Digital Slide Archive: a system for working with large microscopy images}, + author = {{Kitware, Inc}}, + year = {2025}, + note = {Commit 2da1bfc7365dd72011854b5aebf4a744cfcf98a1; Access: 2025-04-30}, + url = {https://github.com/DigitalSlideArchive/digital_slide_archive}, +} + +@article{batchbald2019, + author = {Andreas Kirsch and + Joost van Amersfoort and + Yarin Gal}, + title = {BatchBALD: Efficient and Diverse Batch Acquisition for Deep Bayesian + Active Learning}, + journal = {CoRR}, + volume = {abs/1906.08158}, + year = {2019}, + url = {http://arxiv.org/abs/1906.08158}, + eprinttype = {arXiv}, + eprint = {1906.08158}, + timestamp = {Thu, 14 Oct 2021 09:14:34 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1906-08158.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{Gutman2017, + title = {The Digital Slide Archive: A Software Platform for Management, Integration, and Analysis of Histology for Cancer Research}, + volume = {77}, + ISSN = {1538-7445}, + url = {http://dx.doi.org/10.1158/0008-5472.can-17-0629}, + DOI = {10.1158/0008-5472.can-17-0629}, + number = {21}, + journal = {Cancer Research}, + publisher = {American Association for Cancer Research (AACR)}, + author = {Gutman, David A. and Khalilia, Mohammed and Lee, Sanghoon and Nalisnik, Michael and Mullen, Zach and Beezley, Jonathan and Chittajallu, Deepak R. 
and Manthey, David and Cooper, Lee A.D.}, + year = {2017}, + month = oct, + pages = {e75–e78} +} diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 0000000..859fee1 --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,71 @@ +--- +title: 'WSI Superpixel Guided Labeling' +tags: + - Python + - histology + - bioimage informatics + - whole slide annotation + - whole slide images + - guided labeling +# (add orcid for anyone who has one) +authors: + - name: Brianna Major + affiliation: 1 + orcid: 0000-0003-4968-5701 + - name: Jeffery A. Goldstein + affiliation: 2 + orcid: 0000-0002-4086-057X + - name: Michael Nagler + affiliation: 1 + orcid: 0000-0003-3531-6630 + - name: Lee A. Newberg + affiliation: 1 + orcid: 0000-0003-4644-8874 + - name: Abhishek Sharma + affiliation: 2 + - name: Anders Sildnes + affiliation: 2 + - name: Faiza Ahmed + affiliation: 1 + orcid: 0000-0001-6687-9941 + - name: Jeff Baumes + affiliation: 1 + orcid: 0000-0002-4719-3490 + - name: Lee A.D. Cooper + affiliation: 2 + orcid: 0000-0002-3504-4965 + - name: David Manthey + affiliation: 1 + orcid: 0000-0002-4580-8770 +affiliations: + - index: 1 + name: Kitware, Inc., New York, United States + - index: 2 + name: Northwestern University Feinberg School of Medicine, Illinois, United States +date: 30 April 2025 +bibliography: paper.bib +--- + +# Summary + +`WSI Superpixel Guided Labeling` facilitates active learning on whole slide images. It has a user interface built on top of the HistomicsUI [@histomicsui] base and deployed as part of the Digital Slide Archive [@Gutman2017, @digitalslidearchive], and uses the HistomicsTK [@histomicstk] tool kit as part of the process. + +Users label superpixel regions or other segmented areas of whole slide images to be used as classification input for machine learning algorithms. An example algorithm is included which generates superpixels, features, and machine learning models for active learning on a directory of images. The interface allows bulk labeling, labeling the most impactful superpixels to improve the model, and reviewing labeled and predicted categories. + +# Statement of need + +One of the limitations in generating accurate models is the need for labeled data. Given a model and a few labeled samples, there are a variety of algorithms that can be used to determine what samples should be additionally labeled to most efficiently improve the model. To actually get labeled data, this prediction of which samples to label needs to be combined with an efficient workflow so that the domain expert can use their labeling time in the most effective manner possible. + +`WSI Superpixel Guided Labeling` provides a user interface and workflow for this guided labeling process. Given a set of whole slide images, the images are segmented based on a some user choices. This segmentation is the basis for labeling. The user can specify any number of label categories, including labels that will be excluded from training (for instance, for segmented regions whose categories cannot be accurately determined). After labeling a few initial segments, a model is generated and used to both predict the category of all segments and the segments that would result in the best improvement in the model if they were also labeled. The user can retrain the model at any time and review the results of both the predictions and other users. + +For development, the initial segmentation uses superpixels generated with the SLIC algorithm. 
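As a rough illustration of the per-tile superpixel generation the paper describes, here is a minimal sketch using scikit-image's `slic`; the tile data and parameters are placeholders, not the project's actual pipeline or defaults:

```python
# Minimal sketch: SLIC superpixels for a single tile of a whole slide image.
# The real pipeline adds tile overlap and boundary handling, described below.
import numpy as np
from skimage.segmentation import slic

def superpixels_for_tile(tile, n_segments=200):
    """Return an integer label map assigning each pixel to a superpixel."""
    return slic(tile, n_segments=n_segments, compactness=10, start_label=0)

tile = np.random.rand(512, 512, 3)     # stand-in for one RGB tile
labels = superpixels_for_tile(tile)
print(labels.shape, labels.max() + 1)  # (512, 512) label map, ~200 superpixels
```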
These are computed on whole slide images in a tiled manner so that they can work on arbitrarily large images, and the tile boundaries are properly handled to avoid visible artifacts. Either of two basic models can be trained and used for predictions: small-scale CNN using image features implemented in tensorflow/keras or torch, or a huggingface foundation model that generates a one-dimensional feature vector. The certainty criteria for which segments should be labeled next can also be selected, and includes confidence, margin, negative entropy, and the BatchBALD [@batchbald2019] algorithm. We had a placental pathologist provide feedback to validate the efficiency of the user interface and utility of the process. ![The Guided Labeling interface showing a row of superpixels to be labeled and part of a whole slide image](../docs/screenshots/active_learning_view.png) # Acknowledgements This work has been funded in part by National Library of Medicine grant 5R01LM013523 entitled "Guiding humans to create better labeled datasets for machine learning in biomedical research". # References From 6a3a221a6472d40b7f7b590b1fed2c959bc1b5c2 Mon Sep 17 00:00:00 2001 From: David Manthey Date: Tue, 6 May 2025 09:23:09 -0400 Subject: [PATCH 02/10] Add more figures --- paper/paper.bib | 8 ++++++++ paper/paper.md | 14 ++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/paper/paper.bib b/paper/paper.bib index 72f1018..e323bdd 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -55,3 +55,11 @@ @article{Gutman2017 month = oct, pages = {e75–e78} } + +@misc{TCGAData, + author = {National Cancer Institute and National Human Genome Research Institute}, + title = {The Cancer Genome Atlas (TCGA) Program}, + year = {2022}, + url = {https://www.cancer.gov/tcga}, + note = {Accessed: 2022-11-10} +} diff --git a/paper/paper.md b/paper/paper.md index 859fee1..f6c8fa0 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -62,8 +62,22 @@ For development, the initial segmentation uses superpixels generated with the SL We had a placental pathologist provide feedback to validate the efficiency of the user interface and utility of the process. +# Basic Workflow + +When starting a new labeling project, the user selects how superpixels are generated, which certainty metric is used for determining the optimal labeling order, and what features are used for model training. The labeling mode allows defining project labels and performing initial labeling. This mode can also be used to add new label categories or combine two categories if they should not have been distinct. + +![The Bulk Labeling interface showing one of the project images divided into superpixels with some categories defined. A user can "paint" areas with known labels as an initial seed for the guided labeling process](../docs/screenshots/initial_labels.png) + +Once some segments have been labeled and an initial training process has been performed, additional segments are shown with their predictions. The user can use keyboard shortcuts or the mouse to confirm or correct labels. These are presented in an order that maximizes the utility of improving the model based on the originally selected certainty metric. + +![The Guided Labeling interface showing a row of superpixels to be labeled and part of a whole slide image](../docs/screenshots/active_learning_view.png) + +To check on overall behavior or correct mistakes, there is a review mode that allows seeing all labeled segments with various filtering and sorting options. This can be used to check agreement between pathologists or to determine how well the model agrees with the manually labeled data.
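As an illustration of the kind of agreement check this enables, labels exported for the same segments from two reviewers could be compared with standard metrics; this is only a sketch, with hypothetical label arrays and scikit-learn supplying the statistics:

```python
# Sketch: inter-rater agreement on the same set of labeled superpixels.
# labels_a / labels_b are hypothetical exports from two reviewers.
from sklearn.metrics import cohen_kappa_score, confusion_matrix

labels_a = ["tumor", "stroma", "tumor", "exclude", "stroma", "tumor"]
labels_b = ["tumor", "stroma", "stroma", "exclude", "stroma", "tumor"]

print("Cohen's kappa:", cohen_kappa_score(labels_a, labels_b))
print(confusion_matrix(labels_a, labels_b, labels=["tumor", "stroma", "exclude"]))
```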
+![The Review interface showing labeled superpixels in each category](../docs/screenshots/reviewmode.png) + +The whole slide image data in these figures are from data generated by the TCGA Research Network [@TCGAData]. + # Acknowledgements This work has been funded in part by National Library of Medicine grant 5R01LM013523 entitled "Guiding humans to create better labeled datasets for machine learning in biomedical research". # References From 1215873cb35955425cd2019ed519f09ed0ffc3d4 Mon Sep 17 00:00:00 2001 From: David Manthey Date: Wed, 7 May 2025 13:03:32 -0400 Subject: [PATCH 03/10] Add another orcid. --- paper/paper.md | 1 + 1 file changed, 1 insertion(+) diff --git a/paper/paper.md b/paper/paper.md index f6c8fa0..2c1426c 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -25,6 +25,7 @@ authors: affiliation: 2 - name: Anders Sildnes affiliation: 2 + orcid: 0009-0003-0141-6112 - name: Faiza Ahmed affiliation: 1 orcid: 0000-0001-6687-9941 From 11fb2a300c676846017cf9ace706cac4845e3828 Mon Sep 17 00:00:00 2001 From: David Manthey Date: Wed, 7 May 2025 13:04:28 -0400 Subject: [PATCH 04/10] Update paper/paper.md Co-authored-by: Brianna Major --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 2c1426c..a72045d 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -65,7 +65,7 @@ # Basic Workflow -When starting a new labeling project, the user selects how superpixels are generated, which certainty metric is used for determining the optimal labeling order, and what features are used for model training. The labeling mode allows defining project labels and performing initial labeling. This mode can also be used to add new label categories or combine two categories if they should not have been distinct. +When starting a new labeling project, the user selects how superpixels are generated, which certainty metric is used for determining the optimal labeling order, and what features are used for model training. The labeling mode allows defining project labels and performing initial labeling. This mode can also be used to add new label categories or combine two categories if they should not have been distinct. Label categories can additionally be marked as excluded, which removes them from training and ensures that superpixels with those labels are no longer suggested for labeling. ![The Bulk Labeling interface showing one of the project images divided into superpixels with some categories defined.
A user can "paint" areas with known labels as an initial seed for the guided labeling process](../docs/screenshots/initial_labels.png) From 235476889073f23bb6c57ff36e40d09787b06843 Mon Sep 17 00:00:00 2001 From: David Manthey Date: Thu, 8 May 2025 12:21:06 -0400 Subject: [PATCH 05/10] Add another orcid --- paper/paper.md | 1 + 1 file changed, 1 insertion(+) diff --git a/paper/paper.md b/paper/paper.md index a72045d..dda3ee4 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -23,6 +23,7 @@ authors: orcid: 0000-0003-4644-8874 - name: Abhishek Sharma affiliation: 2 + orcid: 0000-0001-6666-2179 - name: Anders Sildnes affiliation: 2 orcid: 0009-0003-0141-6112 From 3c6077d1f0c3973c06a6f236d607276a871739e6 Mon Sep 17 00:00:00 2001 From: Lee Newberg Date: Thu, 8 May 2025 16:17:03 -0400 Subject: [PATCH 06/10] Say more about feature shapes, SLIC, and huggingface UNI --- paper/paper.bib | 45 +++++++++++++++++++++++++++++++++++++++++++++ paper/paper.md | 12 ++++++------ 2 files changed, 51 insertions(+), 6 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index e323bdd..920fc12 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -63,3 +63,48 @@ @misc{TCGAData url = {https://www.cancer.gov/tcga}, note = {Accessed: 2022-11-10]} } + +@article{SLIC2012, + author = {Radhakrishna Achanta and + Appu Shaji and + Kevin Smith and + Aurelien Lucchi and + Pascal Fua and + Sabine S\"usstrunk}, + title = {SLIC superpixels compared to state-of-the-art superpixel methods}, + journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + year = {2012}, + volume = {34}, + number = {11}, + pages = {2274-2282} +} + +@article{huggingface2024uni, + author = {Chen, Richard J and + Ding, Tong and + Lu, Ming Y and + Williamson, Drew F K and + Jaume, Guillaume and + Song, Andrew H and + Chen, Bowen and + Zhang, Andrew and + Shao, Daniel and + Shaban, Muhammad and + Williams, Mane and + Oldenburg, Lukas and + Weishaupt, Luca L and + Wang, Judy J and + Vaidya, Anurag and + Le, Long Phi and + Gerber, Georg and + Sahai, Sharifa and + Williams, Walt and + Mahmood, Faisal}, + title = {Towards a general-purpose foundation model for computational pathology}, + journal = {Nature Medicine}, + year = {2024}, + volume = {30}, + number = {3}, + pages = {850-862}, + month = {Mar} +} diff --git a/paper/paper.md b/paper/paper.md index dda3ee4..21deecf 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -2,9 +2,9 @@ title: 'WSI Superpixel Guided Labeling' tags: - Python - - histology - - bioimage informatics - - whole slide annotation + - histology + - bioimage informatics + - whole slide annotation - whole slide images - guided labeling # (add orcid for anyone who has one) @@ -50,7 +50,7 @@ bibliography: paper.bib # Summary -`WSI Superpixel Guided Labeling` facilitates active learning on whole slide images. It has a user interface built on top of the HistomicsUI [@histomicsui] base and deployed as part of the Digital Slide Archive [@Gutman2017, @digitalslidearchive], and uses the HistomicsTK [@histomicstk] tool kit as part of the process. +`WSI Superpixel Guided Labeling` facilitates active learning on whole slide images. It has a user interface built on top of the HistomicsUI [@histomicsui] base and deployed as part of the Digital Slide Archive [@Gutman2017, @digitalslidearchive], and uses the HistomicsTK [@histomicstk] tool kit as part of the process. Users label superpixel regions or other segmented areas of whole slide images to be used as classification input for machine learning algorithms. 
An example algorithm is included which generates superpixels, features, and machine learning models for active learning on a directory of images. The interface allows bulk labeling, labeling the most impactful superpixels to improve the model, and reviewing labeled and predicted categories. @@ -60,13 +60,13 @@ One of the limitations in generating accurate models is the need for labeled dat `WSI Superpixel Guided Labeling` provides a user interface and workflow for this guided labeling process. Given a set of whole slide images, the images are segmented based on a some user choices. This segmentation is the basis for labeling. The user can specify any number of label categories, including labels that will be excluded from training (for instance, for segmented regions whose categories cannot be accurately determined). After labeling a few initial segments, a model is generated and used to both predict the category of all segments and the segments that would result in the best improvement in the model if they were also labeled. The user can retrain the model at any time and review the results of both the predictions and other users. -For development, the initial segmentation uses superpixels generated with the SLIC algorithm. These are computed on whole slide images in a tiled manner so that they can work on arbitrarily large images, and the tile boundaries are properly handled to avoid visible artifacts. Either of two basic models can be trained and used for predictions: small-scale CNN using image features implemented in tensorflow/keras or torch, or a huggingface foundation model that generates a one-dimensional feature vector. The certainty criteria for which segments should be labeled next can also be selected, and includes confidence, margin, negative entropy, and the BatchBALD [@batchbald2019] algorithm. +For development, the initial segmentation uses superpixels generated with the SLIC [@SLIC2012] algorithm. These are computed on whole slide images in a tiled manner so that they can work on arbitrarily large images, and the tile boundaries are properly handled to avoid visible artifacts. Once generated, segments are represented in one of two ways, either as two-dimensional patches, each centered in a fixed-sized square of pixels with non-segment pixels set to black, or as one-dimensional vectors, such as those generated from the huggingface UNI [@huggingface2024uni] foundation model. One of two basic models is trained based upon the segment representation. For two-dimensional patches, the model to be trained is a small-scale CNN implemented in tensorflow/keras or torch. For one-dimensional vectors, the model to be trained is a single-layer linear classifier. The certainty criteria for which segments should be labeled next can also be selected, and includes confidence, margin, negative entropy, and the BatchBALD [@batchbald2019] algorithm. We had a placental pathologist provide feedback to validate the efficiency of the user interface and utility of the process. # Basic Workflow -When starting a new labeling project, the user selects how superpixels are generated, which certainty metric is used for determining the optimal labeling order, and what features are used for model training. The labeling mode allows defining project labels and performing initial labeling. This mode can also be used to add new label categories or combine two categories if they should not have been distinct. 
Label categories can additionally be marked as excluded, which removes them from training and ensures that superpixels with those labels are no longer suggested for labeling. +When starting a new labeling project, the user selects how superpixels are generated, which certainty metric is used for determining the optimal labeling order, and what features are used for model training. The labeling mode allows defining project labels and performing initial labeling. This mode can also be used to add new label categories or combine two categories if they should not have been distinct. Label categories can additionally be marked as excluded, which removes them from training and ensures that superpixels with those labels are no longer suggested for labeling. ![The Bulk Labeling interface showing one of the project images divided into superpixels with some categories defined. A user can "paint" areas with known labels as an initial seed for the guided labeling process](../docs/screenshots/initial_labels.png) From 10ecf15d427faf241cab2ca1d327280b8701c2cc Mon Sep 17 00:00:00 2001 From: Lee Newberg Date: Fri, 9 May 2025 12:23:45 -0400 Subject: [PATCH 07/10] Add DOI to some citations. Fix two-word last name --- paper/paper.bib | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 920fc12..fe0d7fa 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -4,7 +4,7 @@ @software{histomicsui year = {2025}, note = {Package version 1.7.0}, url = {https://github.com/DigitalSlideArchive/HistomicsUI}, - doi = {10.5281/zenodo.5474914}, + doi = {10.5281/zenodo.5474914} } @software{histomicstk, @@ -13,7 +13,7 @@ @software{histomicstk year = {2025}, note = {Package version 1.4.0}, url = {https://github.com/DigitalSlideArchive/HistomicsTK}, - doi = {10.5281/zenodo.14833780}, + doi = {10.5281/zenodo.14833780} } @software{digitalslidearchive, @@ -21,13 +21,13 @@ @software{digitalslidearchive author = {{Kitware, Inc}}, year = {2025}, note = {Commit 2da1bfc7365dd72011854b5aebf4a744cfcf98a1; Access: 2025-04-30}, - url = {https://github.com/DigitalSlideArchive/digital_slide_archive}, + url = {https://github.com/DigitalSlideArchive/digital_slide_archive} } @article{batchbald2019, - author = {Andreas Kirsch and - Joost van Amersfoort and - Yarin Gal}, + author = {Kirsch, Andreas and + van Amersfoort, Joost and + Gal, Yarin}, title = {BatchBALD: Efficient and Diverse Batch Acquisition for Deep Bayesian Active Learning}, journal = {CoRR}, @@ -42,7 +42,7 @@ @article{batchbald2019 } @article{Gutman2017, - title = {The Digital Slide Archive: A Software Platform for Management, Integration, and Analysis of Histology for Cancer Research}, + title = {The Digital Slide Archive: A Software Platform for Management, Integration, and Analysis of Histology for Cancer Research}, volume = {77}, ISSN = {1538-7445}, url = {http://dx.doi.org/10.1158/0008-5472.can-17-0629}, @@ -50,9 +50,9 @@ @article{Gutman2017 number = {21}, journal = {Cancer Research}, publisher = {American Association for Cancer Research (AACR)}, - author = {Gutman, David A. and Khalilia, Mohammed and Lee, Sanghoon and Nalisnik, Michael and Mullen, Zach and Beezley, Jonathan and Chittajallu, Deepak R. 
and Manthey, David and Cooper, Lee A.D.}, + author = {Gutman, David A and Khalilia, Mohammed and Lee, Sanghoon and Nalisnik, Michael and Mullen, Zach and Beezley, Jonathan and Chittajallu, Deepak R and Manthey, David and Cooper, Lee A D}, year = {2017}, - month = oct, + month = {Oct}, pages = {e75–e78} } @@ -65,18 +65,19 @@ @misc{TCGAData } @article{SLIC2012, - author = {Radhakrishna Achanta and - Appu Shaji and - Kevin Smith and - Aurelien Lucchi and - Pascal Fua and - Sabine S\"usstrunk}, + author = {Achanta, Radhakrishna and + Shaji, Appu and + Smith, Kevin and + Lucchi, Aurelien and + Fua, Pascal and + S\"usstrunk, Sabine}, title = {SLIC superpixels compared to state-of-the-art superpixel methods}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, year = {2012}, volume = {34}, number = {11}, - pages = {2274-2282} + pages = {2274-2282}, + doi = {10.1109/TPAMI.2012.120} } @article{huggingface2024uni, @@ -106,5 +107,7 @@ @article{huggingface2024uni volume = {30}, number = {3}, pages = {850-862}, - month = {Mar} + month = {Mar}, + url = {https://huggingface.co/MahmoodLab/UNI}, + doi = {10.1038/s41591-024-02857-3} } From e0557e07e67784b536a26aa9abe7ea254cec0baf Mon Sep 17 00:00:00 2001 From: David Manthey Date: Wed, 21 May 2025 10:23:01 -0400 Subject: [PATCH 08/10] Update for repo name change --- paper/paper.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 21deecf..413ed6a 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -1,5 +1,5 @@ --- -title: 'WSI Superpixel Guided Labeling' +title: 'Histomics Label' tags: - Python - histology @@ -50,7 +50,7 @@ bibliography: paper.bib # Summary -`WSI Superpixel Guided Labeling` facilitates active learning on whole slide images. It has a user interface built on top of the HistomicsUI [@histomicsui] base and deployed as part of the Digital Slide Archive [@Gutman2017, @digitalslidearchive], and uses the HistomicsTK [@histomicstk] tool kit as part of the process. +`Histomics Label` facilitates active learning on whole slide images. It has a user interface built on top of the HistomicsUI [@histomicsui] base and deployed as part of the Digital Slide Archive [@Gutman2017, @digitalslidearchive], and uses the HistomicsTK [@histomicstk] tool kit as part of the process. Users label superpixel regions or other segmented areas of whole slide images to be used as classification input for machine learning algorithms. An example algorithm is included which generates superpixels, features, and machine learning models for active learning on a directory of images. The interface allows bulk labeling, labeling the most impactful superpixels to improve the model, and reviewing labeled and predicted categories. @@ -58,7 +58,7 @@ Users label superpixel regions or other segmented areas of whole slide images to One of the limitations in generating accurate models is the need for labeled data. Given a model and a few labeled samples, there are a variety of algorithms that can be used to determine what samples should be additionally labeled to most efficiently improve the model. To actually get labeled data, this prediction of which samples to label needs to be combined with an efficient workflow so that the domain expert can use their labeling time in the most effective manner possible. -`WSI Superpixel Guided Labeling` provides a user interface and workflow for this guided labeling process. Given a set of whole slide images, the images are segmented based on a some user choices. 
This segmentation is the basis for labeling. The user can specify any number of label categories, including labels that will be excluded from training (for instance, for segmented regions whose categories cannot be accurately determined). After labeling a few initial segments, a model is generated and used to both predict the category of all segments and the segments that would result in the best improvement in the model if they were also labeled. The user can retrain the model at any time and review the results of both the predictions and other users. +`Histomics Label` provides a user interface and workflow for this guided labeling process. Given a set of whole slide images, the images are segmented based on a some user choices. This segmentation is the basis for labeling. The user can specify any number of label categories, including labels that will be excluded from training (for instance, for segmented regions whose categories cannot be accurately determined). After labeling a few initial segments, a model is generated and used to both predict the category of all segments and the segments that would result in the best improvement in the model if they were also labeled. The user can retrain the model at any time and review the results of both the predictions and other users. For development, the initial segmentation uses superpixels generated with the SLIC [@SLIC2012] algorithm. These are computed on whole slide images in a tiled manner so that they can work on arbitrarily large images, and the tile boundaries are properly handled to avoid visible artifacts. Once generated, segments are represented in one of two ways, either as two-dimensional patches, each centered in a fixed-sized square of pixels with non-segment pixels set to black, or as one-dimensional vectors, such as those generated from the huggingface UNI [@huggingface2024uni] foundation model. One of two basic models is trained based upon the segment representation. For two-dimensional patches, the model to be trained is a small-scale CNN implemented in tensorflow/keras or torch. For one-dimensional vectors, the model to be trained is a single-layer linear classifier. The certainty criteria for which segments should be labeled next can also be selected, and includes confidence, margin, negative entropy, and the BatchBALD [@batchbald2019] algorithm. From 6a243b05dfbd65a0af450fab0aa0daf472c82fe9 Mon Sep 17 00:00:00 2001 From: Lee Cooper Date: Thu, 5 Jun 2025 09:57:07 -0500 Subject: [PATCH 09/10] Summary and background changes --- paper/paper.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 413ed6a..29db838 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -50,17 +50,18 @@ bibliography: paper.bib # Summary -`Histomics Label` facilitates active learning on whole slide images. It has a user interface built on top of the HistomicsUI [@histomicsui] base and deployed as part of the Digital Slide Archive [@Gutman2017, @digitalslidearchive], and uses the HistomicsTK [@histomicstk] tool kit as part of the process. +`Histomics Label` is a software tool for the interactive development of machine-learning classifiers for whole slide pathology images. It is deployed as part of the Digital Slide Archive [@Gutman2017, @digitalslidearchive], a web-based data management system for whole slide image datasets, and was built on top of HistomicsUI [@histomicsui] and uses the HistomicsTK [@histomicstk] image analysis tool kit. 
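To make the two-dimensional patch representation described above concrete, here is a minimal sketch of cropping a fixed-size square around one segment and setting non-segment pixels to black; the array names, sizes, and synthetic data are illustrative, not the project's code:

```python
# Sketch: fixed-size patch for one superpixel, non-segment pixels set to black.
import numpy as np

def masked_patch(image, seg_labels, segment_id, size=128):
    ys, xs = np.nonzero(seg_labels == segment_id)
    cy, cx = int(ys.mean()), int(xs.mean())       # segment center
    half = size // 2
    y0, x0 = max(cy - half, 0), max(cx - half, 0)
    patch = image[y0:y0 + size, x0:x0 + size].copy()
    mask = seg_labels[y0:y0 + size, x0:x0 + size] == segment_id
    patch[~mask] = 0                              # black out other segments
    return patch                                  # may be clipped at image edges

image = np.random.rand(1024, 1024, 3)             # stand-in RGB image
seg_labels = np.random.randint(0, 50, (1024, 1024))
print(masked_patch(image, seg_labels, segment_id=7).shape)
```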
-Users label superpixel regions or other segmented areas of whole slide images to be used as classification input for machine learning algorithms. An example algorithm is included which generates superpixels, features, and machine learning models for active learning on a directory of images. The interface allows bulk labeling, labeling the most impactful superpixels to improve the model, and reviewing labeled and predicted categories. +Users label image regions or tissue structures to provide training data for classifiers and iteratively improve these classifiers by reviewing their output and providing additional labels. The interface uses heuristics to guide users to the most impactful examples to label, and supports bulk labeling of samples and review of labeled examples for collaboration. An example data generation pipeline is included that segments a whole slide image into superpixels, and generates feature embeddings for segmented regions using a foundation model. # Statement of need -One of the limitations in generating accurate models is the need for labeled data. Given a model and a few labeled samples, there are a variety of algorithms that can be used to determine what samples should be additionally labeled to most efficiently improve the model. To actually get labeled data, this prediction of which samples to label needs to be combined with an efficient workflow so that the domain expert can use their labeling time in the most effective manner possible. +One of the limitations in developing classification models is the need for labeled data. In pathology and other medical fields, the expertise required for labeling and busy schedules of medical experts make labeling particularly challenging. For whole slide images, where each image can contain several billion pixels, navigating vast datasets in search of possibly rare can be very inefficient and frustrating. Software interfaces need to be optimized for the user experience and make the most of an expert's time and energy. Other issues in labeling include *some comment on the problem of thick clients and moving large datasets around*, and enabling collaboration between multiple experts, or to allow experts to review the work of their trainees. -`Histomics Label` provides a user interface and workflow for this guided labeling process. Given a set of whole slide images, the images are segmented based on a some user choices. This segmentation is the basis for labeling. The user can specify any number of label categories, including labels that will be excluded from training (for instance, for segmented regions whose categories cannot be accurately determined). After labeling a few initial segments, a model is generated and used to both predict the category of all segments and the segments that would result in the best improvement in the model if they were also labeled. The user can retrain the model at any time and review the results of both the predictions and other users. +`Histomics Label` uses a technique called active learning to identify the unlabeled examples that can provide the most benefit to classifier performance and provides an intuitive workflow +for presenting these examples to experts for efficient labeling. Data can be generated using a built-in pipeline that partitions whole-slide images into superpixels, or users can provide their own data from external cell or tissue segmentation algorithms. 
Users specify the categories that can be labeled and assign display properties like color, and can exclude categories from classifier training (for instance, for regions whose categories cannot be accurately determined). After labeling a few initial example regions, a classifier is trained and used to both predict the category of all regions and the unlabeled regions that provide the most classifier benefit. The user can retrain the classifier at any time and review the classifier predictions and labels from other users. Labeling can also be performed by painting directly on the whole slide image with a brush tool. -For development, the initial segmentation uses superpixels generated with the SLIC [@SLIC2012] algorithm. These are computed on whole slide images in a tiled manner so that they can work on arbitrarily large images, and the tile boundaries are properly handled to avoid visible artifacts. Once generated, segments are represented in one of two ways, either as two-dimensional patches, each centered in a fixed-sized square of pixels with non-segment pixels set to black, or as one-dimensional vectors, such as those generated from the huggingface UNI [@huggingface2024uni] foundation model. One of two basic models is trained based upon the segment representation. For two-dimensional patches, the model to be trained is a small-scale CNN implemented in tensorflow/keras or torch. For one-dimensional vectors, the model to be trained is a single-layer linear classifier. The certainty criteria for which segments should be labeled next can also be selected, and includes confidence, margin, negative entropy, and the BatchBALD [@batchbald2019] algorithm. +For development, the initial segmentation uses superpixels generated with the SLIC [@SLIC2012] algorithm. These are computed on whole slide images in a tiled manner so that they can work on arbitrarily large images, and the tile boundaries are properly handled to merge seamlessly. Once generated, segments are represented in one of two ways, either as two-dimensional patches, each centered in a fixed-sized square of masked pixels, or as one-dimensional feature embeddings, such as those generated from the huggingface UNI [@huggingface2024uni] foundation model. One of two basic models is trained based upon the segment representation. For two-dimensional patches, the model to be trained is a small-scale CNN implemented in tensorflow/keras or torch. For one-dimensional vectors, the model to be trained is a single-layer linear classifier. The certainty criteria for which segments should be labeled next can also be selected, and includes confidence, margin, negative entropy, and the BatchBALD [@batchbald2019] algorithm. We had a placental pathologist provide feedback to validate the efficiency of the user interface and utility of the process. From 2ac56788d9bd7f9a3c65c14b1b7c740892886a50 Mon Sep 17 00:00:00 2001 From: David Manthey Date: Thu, 5 Jun 2025 17:00:33 -0400 Subject: [PATCH 10/10] Update about client/server architecture --- paper/paper.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 29db838..f4954e9 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -50,13 +50,15 @@ bibliography: paper.bib # Summary -`Histomics Label` is a software tool for the interactive development of machine-learning classifiers for whole slide pathology images. 
It is deployed as part of the Digital Slide Archive [@Gutman2017, @digitalslidearchive], a web-based data management system for whole slide image datasets, and was built on top of HistomicsUI [@histomicsui] and uses the HistomicsTK [@histomicstk] image analysis tool kit. +`Histomics Label` is a software tool for the interactive development of machine-learning classifiers for whole slide pathology images. It is deployed as part of the Digital Slide Archive [@Gutman2017; @digitalslidearchive], a web-based data management system for whole slide image datasets, and was built on top of HistomicsUI [@histomicsui] and uses the HistomicsTK [@histomicstk] image analysis tool kit. Users label image regions or tissue structures to provide training data for classifiers and iteratively improve these classifiers by reviewing their output and providing additional labels. The interface uses heuristics to guide users to the most impactful examples to label, and supports bulk labeling of samples and review of labeled examples for collaboration. An example data generation pipeline is included that segments a whole slide image into superpixels, and generates feature embeddings for segmented regions using a foundation model. # Statement of need -One of the limitations in developing classification models is the need for labeled data. In pathology and other medical fields, the expertise required for labeling and busy schedules of medical experts make labeling particularly challenging. For whole slide images, where each image can contain several billion pixels, navigating vast datasets in search of possibly rare can be very inefficient and frustrating. Software interfaces need to be optimized for the user experience and make the most of an expert's time and energy. Other issues in labeling include *some comment on the problem of thick clients and moving large datasets around*, and enabling collaboration between multiple experts, or to allow experts to review the work of their trainees. +One of the limitations in developing classification models is the need for labeled data. In pathology and other medical fields, the expertise required for labeling and busy schedules of medical experts make labeling particularly challenging. For whole slide images, where each image can contain several billion pixels, navigating vast datasets in search of possibly rare tissue states can be very inefficient and frustrating. Software interfaces need to be optimized for the user experience and make the most of an expert's time and energy. + +Other issues in labeling include the volume and accessibility of data. Software that must run on the same machine as the data requires that all data be copied and correctly versioned for each project. Using a web-client and server model with appropriate permission models requires only that the data be on a centrally managed server. This provides a single, coordinated source of data for a project, so that individual users need only a web browser and an ordinary internet connection. This enables collaboration between multiple experts and allows experts to review the work of their trainees. `Histomics Label` uses a technique called active learning to identify the unlabeled examples that can provide the most benefit to classifier performance and provides an intuitive workflow for presenting these examples to experts for efficient labeling.
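The certainty measures named earlier (confidence, margin, and negative entropy) are all simple functions of the classifier's per-class probabilities; here is a sketch with made-up numbers (BatchBALD needs an ensemble or Bayesian model and is omitted):

```python
# Sketch: ranking unlabeled segments by certainty (rows = segments).
import numpy as np

probs = np.array([[0.90, 0.05, 0.05],    # confidently classified
                  [0.40, 0.35, 0.25],
                  [0.34, 0.33, 0.33]])   # nearly uniform, least certain

confidence = probs.max(axis=1)                       # top-1 probability
top2 = np.sort(probs, axis=1)[:, -2:]
margin = top2[:, 1] - top2[:, 0]                     # top-1 minus top-2
neg_entropy = (probs * np.log(probs)).sum(axis=1)    # higher means more certain

print(np.argsort(margin))   # [2 1 0]: least-certain segments offered first
```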
Data can be generated using a built-in pipeline that partitions whole-slide images into superpixels, or users can provide their own data from external cell or tissue segmentation algorithms. Users specify the categories that can be labeled and assign display properties like color, and can exclude categories from classifier training (for instance, for regions whose categories cannot be accurately determined). After labeling a few initial example regions, a classifier is trained and used both to predict the category of all regions and to identify the unlabeled regions whose labels would most benefit the classifier. The user can retrain the classifier at any time and review the classifier predictions and labels from other users. Labeling can also be performed by painting directly on the whole slide image with a brush tool.
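Putting the pieces together, here is a heavily simplified sketch of the retrain-and-requery loop for the one-dimensional embedding case, using a single-layer linear classifier in PyTorch; all data is synthetic, and the real application drives this loop from its UI rather than from a script:

```python
# Sketch: margin-based active-learning loop over precomputed embeddings.
import torch

n_seg, n_feat, n_cls = 1000, 1024, 3
feats = torch.randn(n_seg, n_feat)                    # e.g. foundation-model embeddings
labels = torch.full((n_seg,), -1, dtype=torch.long)   # -1 marks unlabeled segments
labels[:20] = torch.randint(0, n_cls, (20,))          # a few initial expert labels

for _round in range(3):
    labeled = labels >= 0
    model = torch.nn.Linear(n_feat, n_cls)            # single-layer linear classifier
    opt = torch.optim.Adam(model.parameters(), lr=1e-2)
    for _ in range(100):                              # retrain on the labeled pool
        opt.zero_grad()
        loss = torch.nn.functional.cross_entropy(
            model(feats[labeled]), labels[labeled])
        loss.backward()
        opt.step()
    with torch.no_grad():
        probs = model(feats).softmax(dim=1)
    top2 = probs.topk(2, dim=1).values                # margin acquisition criterion
    margin = top2[:, 0] - top2[:, 1]
    margin[labeled] = float("inf")                    # never re-queue labeled segments
    query = margin.argsort()[:8]                      # least certain first
    labels[query] = torch.randint(0, n_cls, (8,))     # the expert would label these
```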