Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions epitran/_epitran.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ class Epitran(object):
:param rev_preproc bool: if True, apply preprocessors when reverse transliterating
:param rev_postproc bool: if True, apply postprocessors when reverse transliterating
"""
@final
special = {'eng-Latn': FliteLexLookup,
'cmn-Hans': Epihan,
'cmn-Hant': EpihanTraditional,
Expand Down Expand Up @@ -122,7 +121,7 @@ def xsampa_list(self, word: str, normpunc: bool=False, ligaturize: bool=False) -
ligaturize))
return list(map(self.xsampa.ipa2xs, ipa_segs))

def word_to_tuples(self, word: str, normpunc: bool=False, _ligaturize: bool=False) -> "list[tuple[str, str, str, str, list[int]]]":
def word_to_tuples(self, word: str, normpunc: bool=False, _ligaturize: bool=False) -> "list[tuple[str, int, str, str, list[tuple[str, list[int]]]]]":
"""Given a word, returns a list of tuples corresponding to IPA segments. The "feature
vectors" form a list consisting of (segment, vector) pairs.
For IPA segments, segment is a substring of phonetic_form such that the
Expand Down
10 changes: 5 additions & 5 deletions epitran/bin/connl2engipaspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def norm(c: str) -> str:


def add_record(flite: epitran.flite.Flite, ft: panphon.FeatureTable, orth: str) -> Counter[str]:
space = Counter()
space: Counter[str] = Counter()
orth = normpunc(flite, orth)
trans = flite.transliterate(orth)
while trans:
Expand All @@ -34,7 +34,7 @@ def add_record(flite: epitran.flite.Flite, ft: panphon.FeatureTable, orth: str)
space[pref] += 1
trans = trans[len(pref):]
else:
if trans[0] in flite.puncnorm_vals:
if trans[0] in flite.puncnorm.puncnorm:
space[trans[0]] += 1
else:
space[trans[0]] += 1
Expand All @@ -43,7 +43,7 @@ def add_record(flite: epitran.flite.Flite, ft: panphon.FeatureTable, orth: str)


def add_file(flite: epitran.flite.Flite, ft: panphon.FeatureTable, fn: str) -> Counter[str]:
space = Counter()
space: Counter[str] = Counter()
with codecs.open(fn, 'r', 'utf-8') as f:
for line in f:
fields = line.split(u'\t')
Expand All @@ -56,7 +56,7 @@ def add_file(flite: epitran.flite.Flite, ft: panphon.FeatureTable, fn: str) -> C

def print_space(output: str, space: Counter[str]) -> None:
pairs = enumerate(sorted(filter(lambda x: x, space.keys())))
with open(output, 'wb') as f:
with open(output, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
for i, char in pairs:
writer.writerow((i, char))
Expand All @@ -65,7 +65,7 @@ def print_space(output: str, space: Counter[str]) -> None:
def main(infiles: List[str], output: str) -> None:
flite = epitran.flite.Flite()
ft = panphon.FeatureTable()
space = Counter()
space: Counter[str] = Counter()
for fn in infiles:
logger.debug(u'Scanning:\t{}'.format(fn).encode('utf-8'))
space.update(add_file(flite, ft, fn))
Expand Down
10 changes: 5 additions & 5 deletions epitran/bin/connl2ipaspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def norm(c: str) -> str:


def add_record_gen(epi: epitran.Epitran, ft: panphon.FeatureTable, orth: str) -> Counter[str]:
space = Counter()
space: Counter[str] = Counter()
orth = normpunc(epi, orth)
trans = epi.transliterate(orth)
while trans:
Expand All @@ -38,7 +38,7 @@ def add_record_gen(epi: epitran.Epitran, ft: panphon.FeatureTable, orth: str) ->


def add_file_gen(epi: epitran.Epitran, ft: panphon.FeatureTable, fn: str) -> Counter[str]:
space = Counter()
space: Counter[str] = Counter()
with codecs.open(fn, 'r', 'utf-8') as f:
for line in f:
fields = line.split(u'\t')
Expand All @@ -50,7 +50,7 @@ def add_file_gen(epi: epitran.Epitran, ft: panphon.FeatureTable, fn: str) -> Cou


def add_file_op(epi: epitran.Epitran, ft: panphon.FeatureTable, fn: str) -> Counter[str]:
space = Counter()
space: Counter[str] = Counter()
with codecs.open(fn, 'r', 'utf-8') as f:
for line in f:
fields = line.split(u'\t')
Expand All @@ -74,7 +74,7 @@ def add_file_op(epi: epitran.Epitran, ft: panphon.FeatureTable, fn: str) -> Coun

def print_space(output: str, space: Counter[str]) -> None:
pairs = enumerate(sorted(filter(lambda x: x, space.keys())))
with open(output, 'wb') as f:
with open(output, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
for i, char in pairs:
writer.writerow((i, char))
Expand All @@ -83,7 +83,7 @@ def print_space(output: str, space: Counter[str]) -> None:
def main(code: str, op: bool, infiles: List[str], output: str) -> None:
epi = epitran.Epitran(code)
ft = panphon.FeatureTable()
space = Counter()
space: Counter[str] = Counter()
for fn in infiles:
logger.debug(u'Scanning:\t{}'.format(fn).encode('utf-8'))
add_file = add_file_op if op else add_file_gen
Expand Down
4 changes: 1 addition & 3 deletions epitran/bin/detectcaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,14 @@

def main() -> None:
    """Print each input token prefixed by a capitalization flag.

    Reads lines from the files named on the command line (or stdin via
    fileinput) and writes ``<flag>`` and the stripped token separated by
    a tab.  flag is 1 only for a "capitalized word": first character is
    category 'Lu' (uppercase letter) and the second character is not.
    """
    for line in fileinput.input():
        token = line.strip()
        if len(token) > 1 and unicodedata.category(token[1]) == 'Lu':
            # Second char is also uppercase: all-caps token, not a capitalized word.
            is_cap = 0
        elif len(token) > 0 and unicodedata.category(token[0]) == 'Lu':
            is_cap = 1
        else:
            is_cap = 0
        print('{}\t{}'.format(is_cap, token))


Expand Down
2 changes: 0 additions & 2 deletions epitran/bin/epitranscribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,8 @@
def main(code: str) -> None:
    """Transliterate stdin to IPA for language *code*, writing to stdout.

    Each line is lowercased and NFD-normalized before being passed to the
    Epitran transliterator for the given language-script code.

    Args:
        code (str): ISO 639-3 language / ISO 15924 script code, e.g. 'uig-Arab'
    """
    epi = epitran.Epitran(code)
    for line in sys.stdin:
        line = unicodedata.normalize('NFD', line.lower())
        sys.stdout.write(epi.transliterate(line))


Expand Down
10 changes: 5 additions & 5 deletions epitran/bin/extract_test_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
import glob
import csv

# Patterns for scraping (input, expected-output) pairs out of unit-test source.
# All patterns are raw strings so regex escapes like \( and \s are not
# interpreted as (deprecated) string-literal escapes.
RE_DERIVATION = re.compile(r"""self\._?derivation\(u?['"]([^'"]+)['"], u?['"]([^'"]+)['"]\)""", re.M | re.S)
RE_TR = re.compile(r"""self\.epi\.transliterate\(['"]([^'"]+)['"]\).+?self\.assertEqual\(tr, ['"]([^'"]+)['"]\)""", re.M | re.S)
RE_RES = re.compile(r"""self\.epi\.transliterate\(['"]([^'"]+)['"]\).+?self\.assertEqual\(res, ['"]([^'"]+)['"]\)""", re.M | re.S)
RE_ASSERT_TRANS = re.compile(r"""self\._assert_trans\(['"]([^'"]+)['"],\s*['"]([^'"]+)['"]\)""", re.M | re.S)
RE_TUPLE = re.compile(r"""\(['"]([^'"]+)['"],\s*['"]([^'"]+)['"]\)""", re.M | re.S)
# ISO 639-3 language + ISO 15924 script code, e.g. 'eng-Latn'.
RE_CODE = re.compile(r"""["']([a-z]{3}-[A-Z][a-z]{3})["']""")

def extract_code(code: str) -> str:
Expand Down
2 changes: 1 addition & 1 deletion epitran/bin/ltf2ipaspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def read_input(input_: List[List[str]], langscript: str) -> Set[str]:


def write_output(output: str, space: Set[str]) -> None:
with open(output, 'wb') as f:
with open(output, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
for n, ch in enumerate(sorted(list(space))):
writer.writerow((n, ch))
Expand Down
20 changes: 13 additions & 7 deletions epitran/bin/migraterules.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# -*- coding: utf-8 -*-


import csv
import glob
import re
from typing import List, Optional
Expand All @@ -20,19 +21,24 @@ def build_rule(fields: List[str]) -> Optional[str]:


def main() -> None:
    """Convert each ``*.csv`` rule file in the CWD into a ``*.txt`` rule file.

    The output name is the leading ``[A-Za-z-]+`` prefix of the csv
    filename plus '.txt'; filenames with no such prefix are skipped.
    Comment rows (first field matching optional whitespace then '%') are
    copied through with empty fields dropped; other rows are compiled
    with build_rule() and whitespace-normalized.
    """
    for csv_file in glob.glob('*.csv'):
        # NOTE: the loop variable must not be named `csv` — that would
        # shadow the csv module used below.
        match = re.match('[A-Za-z-]+', csv_file)
        if match is None:
            # No usable prefix to derive an output filename from; skip.
            continue
        txt = match.group(0) + '.txt'
        with open(csv_file, 'r', encoding='utf-8') as f, open(txt, 'w', encoding='utf-8') as g:
            reader = csv.reader(f)
            next(reader)  # skip the header row
            for fields in reader:
                if re.match(r'\s*%', fields[0]):
                    # Comment line: emit the non-empty fields, comma-joined.
                    print(','.join([x for x in fields if x]), file=g)
                else:
                    rule = build_rule(fields)
                    if rule is not None:
                        # Collapse runs of spaces and drop a trailing space.
                        rule = re.sub('[ ]+', ' ', rule)
                        rule = re.sub('[ ]$', '', rule)
                        print(rule, file=g)


if __name__ == '__main__':
Expand Down
2 changes: 1 addition & 1 deletion epitran/bin/space2punc.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def main(fns: List[str], fnn: str) -> None:
for _, s in reader:
if len(s) == 1 and unicodedata.category(s)[0] == u'P':
punc.add(s)
with open(fnn, 'wb') as f:
with open(fnn, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
for mark in sorted(list(punc)):
writer.writerow([mark])
Expand Down
4 changes: 2 additions & 2 deletions epitran/bin/uigtransliterate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@

# Transliterate Uyghur (Arabic script) input lines to IPA, one per line.
epi = epitran.Epitran('uig-Arab')
for line in fileinput.input():
    # fileinput yields str in Python 3; no decode/encode round-trip needed.
    print(epi.transliterate(line.strip()))
4 changes: 2 additions & 2 deletions epitran/cedict.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ def _construct_trie(self, hanzi: Dict[str, Tuple[List[str], List[str]]]) -> Any:
pairs = []
for hz, df in self.hanzi.items():
py, en = df
py = ''.join(filter(lambda x: x in ASCII_CHARS, ' '.join(py)))
pairs.append((hz, (py.encode('utf-8'),)))
py_str = ''.join(filter(lambda x: x in ASCII_CHARS, ' '.join(py)))
pairs.append((hz, (py_str.encode('utf-8'),)))
trie = marisa_trie.RecordTrie('@s', pairs)
return trie

Expand Down
33 changes: 30 additions & 3 deletions epitran/epihan.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,14 +88,14 @@ def transliterate(self, text: str, normpunc: bool = False, ligatures: bool = Fal
for token in tokens:
if token in self.cedict.hanzi:
(pinyin, _) = self.cedict.hanzi[token]
pinyin = u''.join(pinyin).lower()
ipa = self.rules.apply(pinyin)
pinyin_str = u''.join(pinyin).lower()
ipa = self.rules.apply(pinyin_str)
ipa_tokens.append(ipa.replace(u',', u''))
else:
if normpunc:
token = self.normalize_punc(token)
ipa_tokens.append(token)
ipa_tokens = map(ligaturize, ipa_tokens)\
ipa_tokens = list(map(ligaturize, ipa_tokens))\
if ligatures else ipa_tokens
return u''.join(ipa_tokens)

Expand Down Expand Up @@ -164,6 +164,19 @@ def __init__(self, **kwargs) -> None:
self.regexp = re.compile(r'\p{Han}')

class EpiJpan(object):
punc = [(u'\uff0c', u','),
(u'\uff01', u'!'),
(u'\uff1f', u'?'),
(u'\uff1b', u';'),
(u'\uff1a', u':'),
(u'\uff08', u'('),
(u'\uff09', u')'),
(u'\uff3b', u'['),
(u'\uff3d', u']'),
(u'\u3010', u'['),
(u'\u3011', u']'),
]

def __init__(self, **kwargs) -> None:
"""Construct epitran object for Japanese

Expand All @@ -185,6 +198,20 @@ def __init__(self, **kwargs) -> None:
self.regexp = None
self.tones = tones

def normalize_punc(self, text: str) -> str:
    """Normalize punctuation in a string.

    Replaces each full-width/CJK punctuation mark listed in ``self.punc``
    with its Western (ASCII) equivalent, via sequential str.replace calls.

    Args:
        text (str): an orthographic string

    Return:
        str: an orthographic string with punctuation normalized to
        Western equivalents
    """
    for a, b in self.punc:
        text = text.replace(a, b)
    return text

def transliterate(self, text: str, normpunc: bool = False, ligatures: bool = False) -> str:
tokens = self.cedict.tokenize(text)
ipa_tokens = []
Expand Down
10 changes: 5 additions & 5 deletions epitran/flite.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def arpa_text_to_list(self, arpa_text: str) -> List[str]:
def arpa_to_ipa(self, arpa_text: str, ligatures: bool = False) -> str:
arpa_text = arpa_text.strip()
arpa_list = self.arpa_text_to_list(arpa_text)
arpa_list = map(lambda d: re.sub(r'\d', '', d), arpa_list)
arpa_list = list(map(lambda d: re.sub(r'\d', '', d), arpa_list))
ipa_list = map(lambda d: self.arpa_map[d], arpa_list)
text = ''.join(ipa_list)
return text
Expand Down Expand Up @@ -168,8 +168,8 @@ class FliteT2P(Flite):
def english_g2p(self, text: str) -> str:
text = self.normalize(text)
try:
arpa_text = subprocess.check_output(['t2p', '"{}"'.format(text)])
arpa_text = arpa_text.decode('utf-8')
arpa_bytes = subprocess.check_output(['t2p', '"{}"'.format(text)])
arpa_text = arpa_bytes.decode('utf-8')
except OSError:
logger.warning('t2p (from flite) is not installed.')
arpa_text = ''
Expand All @@ -188,8 +188,8 @@ def arpa_text_to_list(self, arpa_text: str) -> List[str]:
def english_g2p(self, text: str) -> str:
text = self.normalize(text).lower()
try:
arpa_text = subprocess.check_output(['lex_lookup', text])
arpa_text = arpa_text.decode('utf-8')
arpa_bytes = subprocess.check_output(['lex_lookup', text])
arpa_text = arpa_bytes.decode('utf-8')
except OSError:
logger.warning('lex_lookup (from flite) is not installed.')
arpa_text = ''
Expand Down
4 changes: 2 additions & 2 deletions epitran/ppprocessor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

import logging
import os.path

from pathlib import Path

from importlib import resources

Expand Down Expand Up @@ -33,7 +33,7 @@ def _read_rules(self, code: str, fix: str, rev: bool) -> Rules:
try:
resource_path = resources.files(__package__).joinpath(fn)
if resource_path.is_file():
return Rules([resource_path])
return Rules([Path(str(resource_path))])
else:
return Rules([])
except (KeyError, FileNotFoundError):
Expand Down
4 changes: 2 additions & 2 deletions epitran/reromanize.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ def __init__(self, code: str, table: str, decompose: bool = True, cedict_file: O
self.mapping = self._load_reromanizer(table, decompose)

def _load_reromanizer(self, table: str, decompose: bool) -> Dict[str, str]:
path = os.path.join('data', 'reromanize', table + '.csv')
path = resources.files(__package__).joinpath(path)
path_str = os.path.join('data', 'reromanize', table + '.csv')
path = resources.files(__package__).joinpath(path_str)
if path.is_file():
mapping = {}
with path.open('r', encoding='utf-8') as f:
Expand Down
1 change: 1 addition & 0 deletions epitran/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def _read_rule(self, i: int, line: str) -> Optional[Callable[[str], str]]:
return self._fields_to_function(a, b, X, Y)
except Exception as e:
raise DatafileError('Line {}: "{}" cannot be compiled as regex: ̪{}'.format(i + 1, line, e))
return None

def _fields_to_function_metathesis(self, a: str, X: str, Y: str) -> Callable[[str], str]:
left = r'(?P<X>{}){}(?P<Y>{})'.format(X, a, Y)
Expand Down
Loading
Loading