Feature/search replace api (#116)

JamesG-Speechmatics · web-flow · commit 2bcbf3db240f · 2025-03-05T13:39:13.000Z
* Support the upcoming API for search/replace in transcript
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.0.3] - 2025-03-03
+
+### Added
+
+- Support search/replace API (DEL-24399 DEL-24766)
+
 ## [3.0.2] - 2024-12-18
 
 ### Added
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-3.0.2
+3.0.3
diff --git a/speechmatics/cli.py b/speechmatics/cli.py
@@ -56,6 +56,61 @@ def print_symbol(symbol):
     print(symbol, end="", file=sys.stderr, flush=True)
 
 
+def parse_word_replacements(replacement_words_filepath) -> List[Dict]:
+    """
+    Parses a word replacements list from a file.
+
+    :param replacement_words_filepath: Path to the replacement words file.
+    :type replacement_words_filepath: str
+
+    :return: A list of objects which are the replacement from->to pairs
+    :rtype: List[Dict]
+
+    :raises SystemExit: If the file is not valid JSON, is not a JSON list, or
+        has an entry that is not an object with both 'from' and 'to' keys.
+
+    The file should be formatted as:
+    ```
+    [
+        {"from":"search_term", "to":"the_replacement"},
+        {"from":"/^[Dd]octor$/", "to":"Dr"}
+    ]
+    ```
+    """
+
+    with open(replacement_words_filepath, encoding="utf-8") as replacement_words_file:
+        try:
+            replacement_words = json.load(replacement_words_file)
+        except json.JSONDecodeError as exc:
+            raise SystemExit(
+                f"Word replacements at: {replacement_words_filepath} "
+                f"is not valid json."
+            ) from exc
+
+    if not isinstance(replacement_words, list):
+        raise SystemExit(
+            f"Word replacements file at: {replacement_words_filepath} "
+            "should be a list of objects."
+        )
+    if not replacement_words:
+        # An empty list is accepted; it is only reported as a warning.
+        LOGGER.warning(
+            "Provided word replacements at: %s is an empty list.",
+            replacement_words_filepath,
+        )
+    for item in replacement_words:
+        # Entries must be objects: on a string, `in` is a substring test that can
+        # pass wrongly, and on a number or null it raises TypeError instead of
+        # producing the intended error message.
+        if not isinstance(item, dict) or "from" not in item or "to" not in item:
+            raise SystemExit(
+                f"Word replacements file at: {replacement_words_filepath} "
+                "should have 'from' and 'to' keys."
+            )
+
+    return replacement_words
+
+
 def parse_additional_vocab(additional_vocab_filepath):
     """
     Parses an additional vocab list from a file.
@@ -240,6 +295,27 @@ def get_transcription_config(
             "remove_disfluencies"
         )
 
+    if args.get("replacement_words_file") is not None:
+        # Pairs read from the JSON file seed the replacements list.
+        replace_words = parse_word_replacements(args["replacement_words_file"])
+        if "transcript_filtering_config" not in config:
+            config["transcript_filtering_config"] = {}
+        config["transcript_filtering_config"]["replacements"] = replace_words
+        LOGGER.info(
+            "Using replacement words from file %s", args["replacement_words_file"]
+        )
+
+    if args.get("replacement_words") is not None:
+        if "transcript_filtering_config" not in config:
+            config["transcript_filtering_config"] = {}
+        filtering_config = config["transcript_filtering_config"]
+        # Pairs given on the command line are appended after any pairs that the
+        # file block above already stored, so both sources can be combined.
+        if "replacements" in filtering_config:
+            filtering_config["replacements"].extend(args["replacement_words"])
+        else:
+            filtering_config["replacements"] = args["replacement_words"]
+
     if args.get("ctrl"):
         LOGGER.warning(f"Using internal dev control command: {args['ctrl']}")
         config["ctrl"] = json.loads(args["ctrl"])
diff --git a/speechmatics/cli_parser.py b/speechmatics/cli_parser.py
@@ -5,6 +5,7 @@
 import argparse
 import logging
 from urllib.parse import urlparse
+from typing import Dict
 
 LOGGER = logging.getLogger(__name__)
 
@@ -28,6 +29,32 @@ def __call__(self, parser, args, values, option_string=None):
             setattr(args, self.dest, d)
 
 
+def replacement_words_item(to_parse: str) -> Dict[str, str]:
+    """
+    Converts one ``from:to`` command line token into a replacement mapping.
+
+    :param to_parse: The raw token; must contain exactly one colon.
+    :type to_parse: str
+
+    :return: a dictionary with 'from' and 'to' keys
+    :rtype: dict
+
+    :raises argparse.ArgumentTypeError: If there is not one colon or no 'from'.
+    """
+    text = str(to_parse)
+    if text.count(":") != 1:
+        raise argparse.ArgumentTypeError(
+            f"Must have exactly two colon-separated parts in replacement words: "
+            f"{text}."
+        )
+    search_term, _, replacement = text.partition(":")
+    if not search_term:
+        raise argparse.ArgumentTypeError(
+            f"Replacement words must have a 'from' value in: {text}"
+        )
+    return {"from": search_term, "to": replacement}
+
+
 def additional_vocab_item(to_parse):
     """
     Parses a single item of additional vocab. Used in conjunction with the
@@ -198,6 +225,24 @@ def get_arg_parser():
         required=False,
         help=("Removes words tagged as disfluency."),
     )
+    config_parser.add_argument(
+        "--replacement-words",
+        nargs="*",
+        default=None,
+        # Each value is converted to a {"from": ..., "to": ...} dict.
+        type=replacement_words_item,
+        help="List of replacements to make in the transcript, "
+        "in the form 'from:to' or '/regex/:to'.",
+    )
+    config_parser.add_argument(
+        "--replacement-words-file",
+        default=None,
+        type=str,
+        help=(
+            "Path to a file containing a list of words to replace in the transcript. "
+            "The file should be formatted as a JSON list of objects with 'from' and 'to' keys."
+        ),
+    )
     config_parser.add_argument(
         "--operating-point",
         choices=["standard", "enhanced"],
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -109,6 +109,36 @@
                 ]
             },
         ),
+        (
+            [
+                "batch",
+                "transcribe",
+                "--replacement-words",
+                "foo:bar",
+                "/regex*/:[redacted]",
+            ],
+            {
+                "replacement_words": [
+                    {"from": "foo", "to": "bar"},
+                    {"from": "/regex*/", "to": "[redacted]"},
+                ],
+            },
+        ),
+        (
+            [
+                "rt",
+                "transcribe",
+                "--replacement-words",
+                "foo:bar",
+                "/regex*/:[redacted]",
+            ],
+            {
+                "replacement_words": [
+                    {"from": "foo", "to": "bar"},
+                    {"from": "/regex*/", "to": "[redacted]"},
+                ],
+            },
+        ),
         (
             ["rt", "transcribe", "--punctuation-permitted-marks", ", ? ."],
             {"punctuation_permitted_marks": ", ? ."},
@@ -584,6 +614,8 @@ def test_rt_main_with_all_options(mock_server, tmp_path):
     vocab_file.write_text(
         '["jabberwock", {"content": "brillig", "sounds_like": ["brillick"]}]'
     )
+    replacement_words_file = tmp_path / "replacement_words.json"
+    replacement_words_file.write_text('[{"from": "baz", "to": "quux"}]')
 
     chunk_size = 1024 * 8
     audio_path = path_to_test_resource("ch.wav")
@@ -620,6 +652,11 @@ def test_rt_main_with_all_options(mock_server, tmp_path):
         "--auth-token=xyz",
         audio_path,
         "--remove-disfluencies",
+        "--replacement-words-file",
+        str(replacement_words_file),
+        "--replacement-words",
+        "foo:bar",
+        "/regex*/:[redacted]",
     ]
 
     cli.main(vars(cli.parse_args(args)))
@@ -662,6 +699,13 @@ def test_rt_main_with_all_options(mock_server, tmp_path):
         ]
         is True
     )
+    assert msg["transcription_config"]["transcript_filtering_config"][
+        "replacements"
+    ] == [
+        {"from": "baz", "to": "quux"},
+        {"from": "foo", "to": "bar"},
+        {"from": "/regex*/", "to": "[redacted]"},
+    ]
 
     # Check that the chunk size argument is respected
     add_audio_messages = mock_server.find_add_audio_messages()