From 96064710fa97cb25c9ea63323ee4feebfc119af7 Mon Sep 17 00:00:00 2001 From: Richard Lundeen Date: Thu, 9 Apr 2026 12:31:17 -0700 Subject: [PATCH 1/2] Add pre-commit hook to strip tqdm progress bars from notebooks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create build_scripts/strip_notebook_progress_bars.py that removes tqdm progress bar outputs from notebook stderr streams. Detects patterns like %|, ━, █, and Unicode block characters. Removes entire stderr outputs when all lines are tqdm; strips individual lines otherwise. Add strip-notebook-progress-bars hook to .pre-commit-config.yaml alongside existing sanitize-notebook-paths hook. Strip 117 tqdm outputs from 14 affected notebooks (929 lines removed). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pre-commit-config.yaml | 5 + build_scripts/strip_notebook_progress_bars.py | 96 +++++++++++++++++++ .../auxiliary_attacks/1_gcg_azure_ml.ipynb | 1 - doc/code/front_end/1_pyrit_scan.ipynb | 9 -- doc/code/memory/8_seed_database.ipynb | 32 ------- doc/code/registry/1_class_registry.ipynb | 7 -- doc/code/scenarios/1_red_team_agent.ipynb | 80 ---------------- doc/code/scenarios/2_content_harms.ipynb | 96 ------------------- doc/code/scenarios/3_psychosocial.ipynb | 96 ------------------- doc/code/scenarios/4_cyber.ipynb | 96 ------------------- doc/code/scenarios/5_jailbreak.ipynb | 96 ------------------- doc/code/scenarios/6_leakage.ipynb | 96 ------------------- doc/code/scenarios/7_scam.ipynb | 96 ------------------- doc/code/scenarios/8_garak_encoding.ipynb | 96 ------------------- doc/code/scenarios/9_baseline_only.ipynb | 88 ----------------- doc/code/setup/2_resiliency.ipynb | 40 -------- 16 files changed, 101 insertions(+), 929 deletions(-) create mode 100644 build_scripts/strip_notebook_progress_bars.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index eac582c9a4..c51c487510 100644 --- a/.pre-commit-config.yaml +++ 
b/.pre-commit-config.yaml
@@ -13,6 +13,11 @@ repos:
         entry: python ./build_scripts/sanitize_notebook_paths.py
         language: python
         files: ^doc.*\.(ipynb)$
+      - id: strip-notebook-progress-bars
+        name: Strip Notebook Progress Bars
+        entry: python ./build_scripts/strip_notebook_progress_bars.py
+        language: python
+        files: ^doc.*\.(ipynb)$
       - id: validate-docs
         name: Validate Documentation Structure
         entry: python ./build_scripts/validate_docs.py
diff --git a/build_scripts/strip_notebook_progress_bars.py b/build_scripts/strip_notebook_progress_bars.py
new file mode 100644
index 0000000000..2a270142fe
--- /dev/null
+++ b/build_scripts/strip_notebook_progress_bars.py
@@ -0,0 +1,108 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import json
+import re
+import sys
+
+# tqdm text-mode progress bar patterns:
+#   - "%|" separates percentage from the bar
+#   - Block characters (━, █, ▏-▉) are used for the bar itself
+#   - "\r" carriage returns are used for in-place updates
+# NOTE(review): both patterns are plain substring searches, so any stderr line that
+# happens to contain "%|" or a block character is treated as a progress bar too.
+_TQDM_PATTERNS = [
+    re.compile(r"%\|"),  # " 0%|" or " 50%|..."
+    re.compile(r"[━█▏▎▍▌▋▊▉]"),  # progress bar block characters
+]
+
+
+def _is_tqdm_line(line: str) -> bool:
+    """
+    Check if a line is part of a tqdm progress bar output.
+
+    Blank lines and bare carriage returns are never reported as tqdm lines; the
+    caller decides whether an output that only has such lines left is kept.
+
+    Args:
+        line (str): A single line of text from stderr output.
+
+    Returns:
+        bool: True if the line matches tqdm progress bar patterns.
+    """
+    # str.strip() removes "\r" as well, so this covers bare carriage returns too.
+    if not line.strip():
+        return False
+    return any(pattern.search(line) for pattern in _TQDM_PATTERNS)
+
+
+def strip_notebook_progress_bars(file_path: str) -> bool:
+    """
+    Remove tqdm progress bar outputs from notebook cell stderr streams.
+
+    Strips stderr stream outputs that contain tqdm progress bar patterns.
+    If all lines in a stderr output are tqdm lines, the entire output is removed.
+    If only some lines are tqdm, those lines are stripped and the output is kept.
+    The "text" field may be a list of lines or a single multi-line string; the
+    original representation is preserved when the output is kept.
+
+    Args:
+        file_path (str): Path to the .ipynb file.
+
+    Returns:
+        bool: True if the file was modified.
+    """
+    if not file_path.endswith(".ipynb"):
+        return False
+
+    with open(file_path, encoding="utf-8") as f:
+        content = json.load(f)
+
+    modified = False
+
+    for cell in content.get("cells", []):
+        outputs = cell.get("outputs", [])
+        new_outputs = []
+
+        for output in outputs:
+            if output.get("output_type") != "stream" or output.get("name") != "stderr":
+                new_outputs.append(output)
+                continue
+
+            text = output.get("text", [])
+            # nbformat allows "text" to be a single string instead of a list of lines;
+            # iterating a str directly would filter it character by character.
+            is_single_string = isinstance(text, str)
+            text_lines = text.splitlines(keepends=True) if is_single_string else text
+            non_tqdm_lines = [line for line in text_lines if not _is_tqdm_line(line)]
+
+            if len(non_tqdm_lines) == len(text_lines):
+                new_outputs.append(output)
+                continue
+
+            modified = True
+            # Keep output only if there are meaningful non-tqdm lines;
+            # otherwise drop it entirely (all tqdm or only whitespace left).
+            if any(line.strip() for line in non_tqdm_lines):
+                output["text"] = "".join(non_tqdm_lines) if is_single_string else non_tqdm_lines
+                new_outputs.append(output)
+
+        if len(new_outputs) != len(outputs):
+            cell["outputs"] = new_outputs
+
+    if not modified:
+        return False
+
+    with open(file_path, "w", encoding="utf-8") as f:
+        json.dump(content, f, indent=1, ensure_ascii=False)
+        f.write("\n")
+
+    return True
+
+
+if __name__ == "__main__":
+    modified_files = [file_path for file_path in sys.argv[1:] if strip_notebook_progress_bars(file_path)]
+    if modified_files:
+        print("Stripped tqdm progress bars from:", modified_files)
+        # Non-zero exit makes the pre-commit hook fail when notebooks were rewritten.
+        sys.exit(1)
diff --git a/doc/code/auxiliary_attacks/1_gcg_azure_ml.ipynb b/doc/code/auxiliary_attacks/1_gcg_azure_ml.ipynb
index fe368fcede..416b27353e 100644
--- a/doc/code/auxiliary_attacks/1_gcg_azure_ml.ipynb
+++ b/doc/code/auxiliary_attacks/1_gcg_azure_ml.ipynb
@@ -202,7 +202,6 @@
       "Example: azcopy copy './git/PyRIT' 'https://romanlutz0437468309.blob.core.windows.net/3f52e8b9-0bac-4c48-9e4a-a92e85a582c4-10s61nn9uso4b2p89xjypawyc7/PyRIT' \n",
       "\n",
       "See https://learn.microsoft.com/azure/storage/common/storage-use-azcopy-v10 for more 
information.\n", - "\u001b[32mUploading PyRIT (194.65 MBs): 100%|##########| 194652493/194652493 [01:19<00:00, 2447407.71it/s] \n", "\u001b[39m\n", "\n" ] diff --git a/doc/code/front_end/1_pyrit_scan.ipynb b/doc/code/front_end/1_pyrit_scan.ipynb index 2a4d6af9c4..88d35d1c88 100644 --- a/doc/code/front_end/1_pyrit_scan.ipynb +++ b/doc/code/front_end/1_pyrit_scan.ipynb @@ -545,16 +545,7 @@ "output_type": "stream", "text": [ "\n", - "Loading datasets - this can take a few minutes: 0%| | 0/58 [00:00\n" ] diff --git a/doc/code/memory/8_seed_database.ipynb b/doc/code/memory/8_seed_database.ipynb index 90f03cd6c2..e74799a272 100644 --- a/doc/code/memory/8_seed_database.ipynb +++ b/doc/code/memory/8_seed_database.ipynb @@ -39,38 +39,6 @@ "id": "2", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\r", - "Loading datasets - this can take a few minutes: 0%| | 0/41 [00:00 Date: Thu, 9 Apr 2026 13:48:23 -0700 Subject: [PATCH 2/2] adding tests --- .../test_strip_notebook_progress_bars.py | 147 ++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 tests/unit/build_scripts/test_strip_notebook_progress_bars.py diff --git a/tests/unit/build_scripts/test_strip_notebook_progress_bars.py b/tests/unit/build_scripts/test_strip_notebook_progress_bars.py new file mode 100644 index 0000000000..d5d7e55088 --- /dev/null +++ b/tests/unit/build_scripts/test_strip_notebook_progress_bars.py @@ -0,0 +1,147 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import json +import os +import tempfile + +import pytest + +from build_scripts.strip_notebook_progress_bars import _is_tqdm_line, strip_notebook_progress_bars + + +def _make_notebook(outputs: list) -> dict: + return {"cells": [{"cell_type": "code", "outputs": outputs}]} + + +def _write_notebook(nb: dict) -> str: + f = tempfile.NamedTemporaryFile(mode="w", suffix=".ipynb", delete=False, encoding="utf-8") + json.dump(nb, f) + f.close() + return f.name + + +class TestIsTqdmLine: + @pytest.mark.parametrize( + "line", + [ + " 45%|████████████████▍ | 45/100 [00:15<00:18, 2.98it/s]\n", + "100%|██████████| 100/100 [00:30<00:00, 3.33it/s]\n", + " 0%| | 0/50 [00:00 None: + assert _is_tqdm_line(line) is True + + @pytest.mark.parametrize( + "line", + [ + "INFO: Processing file X\n", + "WARNING: something happened\n", + "", + "\r", + " \n", + ], + ) + def test_rejects_non_tqdm_lines(self, line: str) -> None: + assert _is_tqdm_line(line) is False + + +class TestStripNotebookProgressBars: + def test_skips_non_ipynb(self) -> None: + assert strip_notebook_progress_bars("test.py") is False + + def test_no_modification_when_clean(self) -> None: + nb = _make_notebook([{"output_type": "stream", "name": "stdout", "text": ["Hello\n"]}]) + path = _write_notebook(nb) + try: + assert strip_notebook_progress_bars(path) is False + finally: + os.unlink(path) + + def test_strips_all_tqdm_stderr(self) -> None: + nb = _make_notebook( + [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + " 0%| | 0/10 [00:00 None: + nb = _make_notebook( + [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "WARNING: something\n", + " 50%|█████ | 5/10 [00:02<00:02]\n", + ], + } + ] + ) + path = _write_notebook(nb) + try: + assert strip_notebook_progress_bars(path) is True + with open(path, encoding="utf-8") as f: + result = json.load(f) + text = result["cells"][0]["outputs"][0]["text"] + assert text == ["WARNING: something\n"] + finally: + os.unlink(path) + + def 
test_preserves_stdout_and_other_outputs(self) -> None: + nb = _make_notebook( + [ + {"output_type": "stream", "name": "stdout", "text": ["hello\n"]}, + { + "output_type": "stream", + "name": "stderr", + "text": ["100%|██████████| 10/10\n"], + }, + {"output_type": "execute_result", "data": {"text/plain": "42"}}, + ] + ) + path = _write_notebook(nb) + try: + assert strip_notebook_progress_bars(path) is True + with open(path, encoding="utf-8") as f: + result = json.load(f) + outputs = result["cells"][0]["outputs"] + assert len(outputs) == 2 + assert outputs[0]["name"] == "stdout" + assert outputs[1]["output_type"] == "execute_result" + finally: + os.unlink(path) + + def test_idempotent(self) -> None: + nb = _make_notebook( + [ + { + "output_type": "stream", + "name": "stderr", + "text": ["100%|██████████| 10/10\n"], + } + ] + ) + path = _write_notebook(nb) + try: + assert strip_notebook_progress_bars(path) is True + assert strip_notebook_progress_bars(path) is False + finally: + os.unlink(path)