Skip to content

Commit 2bcbf3d

Browse files
Feature/search replace api (#116)
* Support the upcoming API for search/replace in transcript
1 parent 0ecbbc7 commit 2bcbf3d

File tree

5 files changed

+172
-1
lines changed

5 files changed

+172
-1
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [3.0.3] - 2025-03-03
9+
10+
### Added
11+
12+
- Support search/replace API (DEL-24399 DEL-24766)
13+
814
## [3.0.2] - 2024-12-18
915

1016
### Added

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
3.0.2
1+
3.0.3

speechmatics/cli.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,61 @@ def print_symbol(symbol):
5656
print(symbol, end="", file=sys.stderr, flush=True)
5757

5858

59+
def parse_word_replacements(replacement_words_filepath) -> List[Dict]:
    """
    Parses a word replacements list from a JSON file.

    :param replacement_words_filepath: Path to the replacement words file.
    :type replacement_words_filepath: str

    :return: A list of objects which are the replacement from->to pairs
    :rtype: List[Dict]

    :raises SystemExit: If the file is not valid JSON, if its top-level
        value is not a list, or if any entry is not an object carrying
        both the 'from' and 'to' keys.

    The file should be formatted as:
    ```
    [
        {"from":"search_term", "to":"the_replacement"},
        {"from":"/^[Dd]octor$/", "to":"Dr"}
    ]
    ```
    """
    with open(replacement_words_filepath, encoding="utf-8") as replacement_words_file:
        try:
            replacement_words = json.load(replacement_words_file)
        except json.JSONDecodeError as exc:
            raise SystemExit(
                f"Word replacements at: {replacement_words_filepath} "
                f"is not valid json."
            ) from exc

    if not isinstance(replacement_words, list):
        raise SystemExit(
            (
                f"Word replacements file at: {replacement_words_filepath} "
                "should be a list of objects."
            )
        )
    if not replacement_words:
        LOGGER.warning(
            "Provided word replacements at: %s is an empty list.",
            replacement_words_filepath,
        )
    for item in replacement_words:
        # The membership test below is only meaningful on a mapping: on a
        # string it would be a substring match (so "from to" would pass), and
        # on a number or null it would raise TypeError instead of exiting
        # cleanly. Reject anything that is not a JSON object first.
        if not isinstance(item, dict):
            raise SystemExit(
                (
                    f"Word replacements file at: {replacement_words_filepath} "
                    "should be a list of objects."
                )
            )
        if "from" not in item or "to" not in item:
            raise SystemExit(
                (
                    f"Word replacements file at: {replacement_words_filepath} "
                    "should have 'from' and 'to' keys."
                )
            )

    return replacement_words
112+
113+
59114
def parse_additional_vocab(additional_vocab_filepath):
60115
"""
61116
Parses an additional vocab list from a file.
@@ -240,6 +295,27 @@ def get_transcription_config(
240295
"remove_disfluencies"
241296
)
242297

298+
if args.get("replacement_words_file") is not None:
299+
replace_words = parse_word_replacements(args["replacement_words_file"])
300+
if "transcript_filtering_config" not in config:
301+
config["transcript_filtering_config"] = {}
302+
config["transcript_filtering_config"]["replacements"] = replace_words
303+
LOGGER.info(
304+
"Using additional vocab from file %s", args["replacement_words_file"]
305+
)
306+
307+
if args.get("replacement_words") is not None:
308+
if "transcript_filtering_config" not in config:
309+
config["transcript_filtering_config"] = {}
310+
if "replacements" in config["transcript_filtering_config"]:
311+
config["transcript_filtering_config"]["replacements"].extend(
312+
args.get("replacement_words")
313+
)
314+
else:
315+
config["transcript_filtering_config"]["replacements"] = args.get(
316+
"replacement_words"
317+
)
318+
243319
if args.get("ctrl"):
244320
LOGGER.warning(f"Using internal dev control command: {args['ctrl']}")
245321
config["ctrl"] = json.loads(args["ctrl"])

speechmatics/cli_parser.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import argparse
66
import logging
77
from urllib.parse import urlparse
8+
from typing import Dict
89

910
LOGGER = logging.getLogger(__name__)
1011

@@ -28,6 +29,32 @@ def __call__(self, parser, args, values, option_string=None):
2829
setattr(args, self.dest, d)
2930

3031

32+
def replacement_words_item(to_parse: str) -> Dict[str, str]:
    """
    Converts one ``from:to`` command line token into a replacement entry.
    Intended as the ``type`` callable of the replacement words argument.

    :param to_parse: The raw token; it is coerced to ``str`` before parsing.
    :type to_parse: str

    :return: a dictionary with the 'from' and 'to' values of the replacement
    :rtype: dict

    :raises argparse.ArgumentTypeError: If the token does not contain exactly
        one colon, or if the part before the colon is empty.
    """
    text = str(to_parse)

    # Exactly one colon means exactly two colon-separated parts.
    if text.count(":") != 1:
        raise argparse.ArgumentTypeError(
            f"Must have exactly two colon-separated parts in replacement words: "
            f"{text}."
        )

    search_term, _, replacement = text.partition(":")
    if not search_term:
        raise argparse.ArgumentTypeError(
            f"Replacement words must have a 'from' value in: {text}"
        )

    # An empty replacement is deliberately allowed.
    return {"from": search_term, "to": replacement}
56+
57+
3158
def additional_vocab_item(to_parse):
3259
"""
3360
Parses a single item of additional vocab. Used in conjunction with the
@@ -198,6 +225,24 @@ def get_arg_parser():
198225
required=False,
199226
help=("Removes words tagged as disfluency."),
200227
)
228+
config_parser.add_argument(
229+
"--replacement-words",
230+
nargs="*",
231+
default=None,
232+
type=replacement_words_item,
233+
help=(
234+
"List of replacements to make in the transcript, in the form 'from:to' or '/regex/:to'."
235+
),
236+
)
237+
config_parser.add_argument(
238+
"--replacement-words-file",
239+
default=None,
240+
type=str,
241+
help=(
242+
"Path to a file containing a list of words to replace in the transcript."
243+
"The file should be formatted as as JSON list of objects with 'from' and 'to' keys."
244+
),
245+
)
201246
config_parser.add_argument(
202247
"--operating-point",
203248
choices=["standard", "enhanced"],

tests/test_cli.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,36 @@
109109
]
110110
},
111111
),
112+
(
113+
[
114+
"batch",
115+
"transcribe",
116+
"--replacement-words",
117+
"foo:bar",
118+
"/regex*/:[redacted]",
119+
],
120+
{
121+
"replacement_words": [
122+
{"from": "foo", "to": "bar"},
123+
{"from": "/regex*/", "to": "[redacted]"},
124+
],
125+
},
126+
),
127+
(
128+
[
129+
"rt",
130+
"transcribe",
131+
"--replacement-words",
132+
"foo:bar",
133+
"/regex*/:[redacted]",
134+
],
135+
{
136+
"replacement_words": [
137+
{"from": "foo", "to": "bar"},
138+
{"from": "/regex*/", "to": "[redacted]"},
139+
],
140+
},
141+
),
112142
(
113143
["rt", "transcribe", "--punctuation-permitted-marks", ", ? ."],
114144
{"punctuation_permitted_marks": ", ? ."},
@@ -584,6 +614,8 @@ def test_rt_main_with_all_options(mock_server, tmp_path):
584614
vocab_file.write_text(
585615
'["jabberwock", {"content": "brillig", "sounds_like": ["brillick"]}]'
586616
)
617+
replacement_words_file = tmp_path / "replacement_words.json"
618+
replacement_words_file.write_text('[{"from": "baz", "to": "quux"}]')
587619

588620
chunk_size = 1024 * 8
589621
audio_path = path_to_test_resource("ch.wav")
@@ -620,6 +652,11 @@ def test_rt_main_with_all_options(mock_server, tmp_path):
620652
"--auth-token=xyz",
621653
audio_path,
622654
"--remove-disfluencies",
655+
"--replacement-words-file",
656+
str(replacement_words_file),
657+
"--replacement-words",
658+
"foo:bar",
659+
"/regex*/:[redacted]",
623660
]
624661

625662
cli.main(vars(cli.parse_args(args)))
@@ -662,6 +699,13 @@ def test_rt_main_with_all_options(mock_server, tmp_path):
662699
]
663700
is True
664701
)
702+
assert msg["transcription_config"]["transcript_filtering_config"][
703+
"replacements"
704+
] == [
705+
{"from": "baz", "to": "quux"},
706+
{"from": "foo", "to": "bar"},
707+
{"from": "/regex*/", "to": "[redacted]"},
708+
]
665709

666710
# Check that the chunk size argument is respected
667711
add_audio_messages = mock_server.find_add_audio_messages()

0 commit comments

Comments
 (0)