Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions epitran/_epitran.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ class Epitran(object):
:param rev_preproc bool: if True, apply preprocessors when reverse transliterating
:param rev_postproc bool: if True, apply postprocessors when reverse transliterating
"""
@final
special = {'eng-Latn': FliteLexLookup,
'cmn-Hans': Epihan,
'cmn-Hant': EpihanTraditional,
Expand Down Expand Up @@ -122,7 +121,7 @@ def xsampa_list(self, word: str, normpunc: bool=False, ligaturize: bool=False) -
ligaturize))
return list(map(self.xsampa.ipa2xs, ipa_segs))

def word_to_tuples(self, word: str, normpunc: bool=False, _ligaturize: bool=False) -> "list[tuple[str, str, str, str, list[int]]]":
def word_to_tuples(self, word: str, normpunc: bool=False, _ligaturize: bool=False) -> "list[tuple[str, int, str, str, list[tuple[str, list[int]]]]]":
"""Given a word, returns a list of tuples corresponding to IPA segments. The "feature
vectors" form a list consisting of (segment, vector) pairs.
For IPA segments, segment is a substring of phonetic_form such that the
Expand Down
10 changes: 5 additions & 5 deletions epitran/bin/connl2engipaspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def norm(c: str) -> str:


def add_record(flite: epitran.flite.Flite, ft: panphon.FeatureTable, orth: str) -> Counter[str]:
space = Counter()
space: Counter[str] = Counter()
orth = normpunc(flite, orth)
trans = flite.transliterate(orth)
while trans:
Expand All @@ -34,7 +34,7 @@ def add_record(flite: epitran.flite.Flite, ft: panphon.FeatureTable, orth: str)
space[pref] += 1
trans = trans[len(pref):]
else:
if trans[0] in flite.puncnorm_vals:
if trans[0] in flite.puncnorm.puncnorm:
space[trans[0]] += 1
else:
space[trans[0]] += 1
Expand All @@ -43,7 +43,7 @@ def add_record(flite: epitran.flite.Flite, ft: panphon.FeatureTable, orth: str)


def add_file(flite: epitran.flite.Flite, ft: panphon.FeatureTable, fn: str) -> Counter[str]:
space = Counter()
space: Counter[str] = Counter()
with codecs.open(fn, 'r', 'utf-8') as f:
for line in f:
fields = line.split(u'\t')
Expand All @@ -56,7 +56,7 @@ def add_file(flite: epitran.flite.Flite, ft: panphon.FeatureTable, fn: str) -> C

def print_space(output: str, space: Counter[str]) -> None:
pairs = enumerate(sorted(filter(lambda x: x, space.keys())))
with open(output, 'wb') as f:
with open(output, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
for i, char in pairs:
writer.writerow((i, char))
Expand All @@ -65,7 +65,7 @@ def print_space(output: str, space: Counter[str]) -> None:
def main(infiles: List[str], output: str) -> None:
flite = epitran.flite.Flite()
ft = panphon.FeatureTable()
space = Counter()
space: Counter[str] = Counter()
for fn in infiles:
logger.debug(u'Scanning:\t{}'.format(fn).encode('utf-8'))
space.update(add_file(flite, ft, fn))
Expand Down
10 changes: 5 additions & 5 deletions epitran/bin/connl2ipaspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def norm(c: str) -> str:


def add_record_gen(epi: epitran.Epitran, ft: panphon.FeatureTable, orth: str) -> Counter[str]:
space = Counter()
space: Counter[str] = Counter()
orth = normpunc(epi, orth)
trans = epi.transliterate(orth)
while trans:
Expand All @@ -38,7 +38,7 @@ def add_record_gen(epi: epitran.Epitran, ft: panphon.FeatureTable, orth: str) ->


def add_file_gen(epi: epitran.Epitran, ft: panphon.FeatureTable, fn: str) -> Counter[str]:
space = Counter()
space: Counter[str] = Counter()
with codecs.open(fn, 'r', 'utf-8') as f:
for line in f:
fields = line.split(u'\t')
Expand All @@ -50,7 +50,7 @@ def add_file_gen(epi: epitran.Epitran, ft: panphon.FeatureTable, fn: str) -> Cou


def add_file_op(epi: epitran.Epitran, ft: panphon.FeatureTable, fn: str) -> Counter[str]:
space = Counter()
space: Counter[str] = Counter()
with codecs.open(fn, 'r', 'utf-8') as f:
for line in f:
fields = line.split(u'\t')
Expand All @@ -74,7 +74,7 @@ def add_file_op(epi: epitran.Epitran, ft: panphon.FeatureTable, fn: str) -> Coun

def print_space(output: str, space: Counter[str]) -> None:
pairs = enumerate(sorted(filter(lambda x: x, space.keys())))
with open(output, 'wb') as f:
with open(output, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
for i, char in pairs:
writer.writerow((i, char))
Expand All @@ -83,7 +83,7 @@ def print_space(output: str, space: Counter[str]) -> None:
def main(code: str, op: bool, infiles: List[str], output: str) -> None:
epi = epitran.Epitran(code)
ft = panphon.FeatureTable()
space = Counter()
space: Counter[str] = Counter()
for fn in infiles:
logger.debug(u'Scanning:\t{}'.format(fn).encode('utf-8'))
add_file = add_file_op if op else add_file_gen
Expand Down
4 changes: 1 addition & 3 deletions epitran/bin/detectcaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,14 @@

def main() -> None:
    """Print each input token prefixed by a capitalization flag.

    Reads lines from the files named on the command line (or stdin via
    fileinput) and writes ``<flag>`` and the stripped token separated by
    a tab.  flag is 1 only for a "capitalized word": first character is
    category 'Lu' (uppercase letter) and the second character is not.
    """
    for line in fileinput.input():
        token = line.strip()
        if len(token) > 1 and unicodedata.category(token[1]) == 'Lu':
            # Second char is also uppercase: all-caps token, not a capitalized word.
            is_cap = 0
        elif len(token) > 0 and unicodedata.category(token[0]) == 'Lu':
            is_cap = 1
        else:
            is_cap = 0
        print('{}\t{}'.format(is_cap, token))


Expand Down
2 changes: 0 additions & 2 deletions epitran/bin/epitranscribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,8 @@
def main(code: str) -> None:
    """Transliterate stdin to IPA for language *code*, writing to stdout.

    Each line is lowercased and NFD-normalized before being passed to the
    Epitran transliterator for the given language-script code.

    Args:
        code (str): ISO 639-3 language / ISO 15924 script code, e.g. 'uig-Arab'
    """
    epi = epitran.Epitran(code)
    for line in sys.stdin:
        line = unicodedata.normalize('NFD', line.lower())
        sys.stdout.write(epi.transliterate(line))


Expand Down
10 changes: 5 additions & 5 deletions epitran/bin/extract_test_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
import glob
import csv

# Patterns for scraping (input, expected-output) pairs out of unit-test source.
# All patterns are raw strings so regex escapes like \( and \s are not
# interpreted as (deprecated) string-literal escapes.
RE_DERIVATION = re.compile(r"""self\._?derivation\(u?['"]([^'"]+)['"], u?['"]([^'"]+)['"]\)""", re.M | re.S)
RE_TR = re.compile(r"""self\.epi\.transliterate\(['"]([^'"]+)['"]\).+?self\.assertEqual\(tr, ['"]([^'"]+)['"]\)""", re.M | re.S)
RE_RES = re.compile(r"""self\.epi\.transliterate\(['"]([^'"]+)['"]\).+?self\.assertEqual\(res, ['"]([^'"]+)['"]\)""", re.M | re.S)
RE_ASSERT_TRANS = re.compile(r"""self\._assert_trans\(['"]([^'"]+)['"],\s*['"]([^'"]+)['"]\)""", re.M | re.S)
RE_TUPLE = re.compile(r"""\(['"]([^'"]+)['"],\s*['"]([^'"]+)['"]\)""", re.M | re.S)
# ISO 639-3 language + ISO 15924 script code, e.g. 'eng-Latn'.
RE_CODE = re.compile(r"""["']([a-z]{3}-[A-Z][a-z]{3})["']""")

def extract_code(code: str) -> str:
Expand Down
2 changes: 1 addition & 1 deletion epitran/bin/ltf2ipaspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def read_input(input_: List[List[str]], langscript: str) -> Set[str]:


def write_output(output: str, space: Set[str]) -> None:
with open(output, 'wb') as f:
with open(output, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
for n, ch in enumerate(sorted(list(space))):
writer.writerow((n, ch))
Expand Down
20 changes: 13 additions & 7 deletions epitran/bin/migraterules.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# -*- coding: utf-8 -*-


import csv
import glob
import re
from typing import List, Optional
Expand All @@ -20,19 +21,24 @@ def build_rule(fields: List[str]) -> Optional[str]:


def main() -> None:
    """Convert each ``*.csv`` rule file in the CWD into a ``*.txt`` rule file.

    The output name is the leading ``[A-Za-z-]+`` prefix of the csv
    filename plus '.txt'; filenames with no such prefix are skipped.
    Comment rows (first field matching optional whitespace then '%') are
    copied through with empty fields dropped; other rows are compiled
    with build_rule() and whitespace-normalized.
    """
    for csv_file in glob.glob('*.csv'):
        # NOTE: the loop variable must not be named `csv` — that would
        # shadow the csv module used below.
        match = re.match('[A-Za-z-]+', csv_file)
        if match is None:
            # No usable prefix to derive an output filename from; skip.
            continue
        txt = match.group(0) + '.txt'
        with open(csv_file, 'r', encoding='utf-8') as f, open(txt, 'w', encoding='utf-8') as g:
            reader = csv.reader(f)
            next(reader)  # skip the header row
            for fields in reader:
                if re.match(r'\s*%', fields[0]):
                    # Comment line: emit the non-empty fields, comma-joined.
                    print(','.join([x for x in fields if x]), file=g)
                else:
                    rule = build_rule(fields)
                    if rule is not None:
                        # Collapse runs of spaces and drop a trailing space.
                        rule = re.sub('[ ]+', ' ', rule)
                        rule = re.sub('[ ]$', '', rule)
                        print(rule, file=g)


if __name__ == '__main__':
Expand Down
2 changes: 1 addition & 1 deletion epitran/bin/space2punc.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def main(fns: List[str], fnn: str) -> None:
for _, s in reader:
if len(s) == 1 and unicodedata.category(s)[0] == u'P':
punc.add(s)
with open(fnn, 'wb') as f:
with open(fnn, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
for mark in sorted(list(punc)):
writer.writerow([mark])
Expand Down
4 changes: 2 additions & 2 deletions epitran/bin/uigtransliterate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@

# Transliterate Uyghur (Arabic script) input lines to IPA, one per line.
epi = epitran.Epitran('uig-Arab')
for line in fileinput.input():
    # fileinput yields str in Python 3; no decode/encode round-trip needed.
    print(epi.transliterate(line.strip()))
4 changes: 2 additions & 2 deletions epitran/cedict.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ def _construct_trie(self, hanzi: Dict[str, Tuple[List[str], List[str]]]) -> Any:
pairs = []
for hz, df in self.hanzi.items():
py, en = df
py = ''.join(filter(lambda x: x in ASCII_CHARS, ' '.join(py)))
pairs.append((hz, (py.encode('utf-8'),)))
py_str = ''.join(filter(lambda x: x in ASCII_CHARS, ' '.join(py)))
pairs.append((hz, (py_str.encode('utf-8'),)))
trie = marisa_trie.RecordTrie('@s', pairs)
return trie

Expand Down
33 changes: 30 additions & 3 deletions epitran/epihan.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,14 +88,14 @@ def transliterate(self, text: str, normpunc: bool = False, ligatures: bool = Fal
for token in tokens:
if token in self.cedict.hanzi:
(pinyin, _) = self.cedict.hanzi[token]
pinyin = u''.join(pinyin).lower()
ipa = self.rules.apply(pinyin)
pinyin_str = u''.join(pinyin).lower()
ipa = self.rules.apply(pinyin_str)
ipa_tokens.append(ipa.replace(u',', u''))
else:
if normpunc:
token = self.normalize_punc(token)
ipa_tokens.append(token)
ipa_tokens = map(ligaturize, ipa_tokens)\
ipa_tokens = list(map(ligaturize, ipa_tokens))\
if ligatures else ipa_tokens
return u''.join(ipa_tokens)

Expand Down Expand Up @@ -164,6 +164,19 @@ def __init__(self, **kwargs) -> None:
self.regexp = re.compile(r'\p{Han}')

class EpiJpan(object):
punc = [(u'\uff0c', u','),
(u'\uff01', u'!'),
(u'\uff1f', u'?'),
(u'\uff1b', u';'),
(u'\uff1a', u':'),
(u'\uff08', u'('),
(u'\uff09', u')'),
(u'\uff3b', u'['),
(u'\uff3d', u']'),
(u'\u3010', u'['),
(u'\u3011', u']'),
]

def __init__(self, **kwargs) -> None:
"""Construct epitran object for Japanese

Expand All @@ -185,6 +198,20 @@ def __init__(self, **kwargs) -> None:
self.regexp = None
self.tones = tones

def normalize_punc(self, text: str) -> str:
    """Normalize punctuation in a string.

    Replaces each full-width/CJK punctuation mark listed in ``self.punc``
    with its Western (ASCII) equivalent, via sequential str.replace calls.

    Args:
        text (str): an orthographic string

    Return:
        str: an orthographic string with punctuation normalized to
        Western equivalents
    """
    for a, b in self.punc:
        text = text.replace(a, b)
    return text

def transliterate(self, text: str, normpunc: bool = False, ligatures: bool = False) -> str:
tokens = self.cedict.tokenize(text)
ipa_tokens = []
Expand Down
10 changes: 5 additions & 5 deletions epitran/flite.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def arpa_text_to_list(self, arpa_text: str) -> List[str]:
def arpa_to_ipa(self, arpa_text: str, ligatures: bool = False) -> str:
arpa_text = arpa_text.strip()
arpa_list = self.arpa_text_to_list(arpa_text)
arpa_list = map(lambda d: re.sub(r'\d', '', d), arpa_list)
arpa_list = list(map(lambda d: re.sub(r'\d', '', d), arpa_list))
ipa_list = map(lambda d: self.arpa_map[d], arpa_list)
text = ''.join(ipa_list)
return text
Expand Down Expand Up @@ -168,8 +168,8 @@ class FliteT2P(Flite):
def english_g2p(self, text: str) -> str:
text = self.normalize(text)
try:
arpa_text = subprocess.check_output(['t2p', '"{}"'.format(text)])
arpa_text = arpa_text.decode('utf-8')
arpa_bytes = subprocess.check_output(['t2p', '"{}"'.format(text)])
arpa_text = arpa_bytes.decode('utf-8')
except OSError:
logger.warning('t2p (from flite) is not installed.')
arpa_text = ''
Expand All @@ -188,8 +188,8 @@ def arpa_text_to_list(self, arpa_text: str) -> List[str]:
def english_g2p(self, text: str) -> str:
text = self.normalize(text).lower()
try:
arpa_text = subprocess.check_output(['lex_lookup', text])
arpa_text = arpa_text.decode('utf-8')
arpa_bytes = subprocess.check_output(['lex_lookup', text])
arpa_text = arpa_bytes.decode('utf-8')
except OSError:
logger.warning('lex_lookup (from flite) is not installed.')
arpa_text = ''
Expand Down
4 changes: 2 additions & 2 deletions epitran/ppprocessor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

import logging
import os.path

from pathlib import Path

from importlib import resources

Expand Down Expand Up @@ -33,7 +33,7 @@ def _read_rules(self, code: str, fix: str, rev: bool) -> Rules:
try:
resource_path = resources.files(__package__).joinpath(fn)
if resource_path.is_file():
return Rules([resource_path])
return Rules([Path(str(resource_path))])
else:
return Rules([])
except (KeyError, FileNotFoundError):
Expand Down
4 changes: 2 additions & 2 deletions epitran/reromanize.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ def __init__(self, code: str, table: str, decompose: bool = True, cedict_file: O
self.mapping = self._load_reromanizer(table, decompose)

def _load_reromanizer(self, table: str, decompose: bool) -> Dict[str, str]:
path = os.path.join('data', 'reromanize', table + '.csv')
path = resources.files(__package__).joinpath(path)
path_str = os.path.join('data', 'reromanize', table + '.csv')
path = resources.files(__package__).joinpath(path_str)
if path.is_file():
mapping = {}
with path.open('r', encoding='utf-8') as f:
Expand Down
1 change: 1 addition & 0 deletions epitran/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def _read_rule(self, i: int, line: str) -> Optional[Callable[[str], str]]:
return self._fields_to_function(a, b, X, Y)
except Exception as e:
raise DatafileError('Line {}: "{}" cannot be compiled as regex: ̪{}'.format(i + 1, line, e))
return None

def _fields_to_function_metathesis(self, a: str, X: str, Y: str) -> Callable[[str], str]:
left = r'(?P<X>{}){}(?P<Y>{})'.format(X, a, Y)
Expand Down
Loading
Loading