Skip to content

Commit cd4b706

Browse files
committed
Fix reflexive pronouns (particles) in Czech.
1 parent 85a7130 commit cd4b706

File tree

1 file changed

+28
-2
lines changed

1 file changed

+28
-2
lines changed

udapi/block/ud/cs/fixmorpho.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,38 @@
66
created as part of the Hičkok project (while importing nineteenth-century Czech
77
data) but it should be applicable to any other Czech treebank.
88
"""
9-
import udapi.block
9+
from udapi.core.block import Block
10+
import logging
1011
import re
1112

12-
class FixMorpho(udapi.block):
13+
class FixMorpho(Block):
1314

1415
def process_node(self, node):
16+
# Do not touch words marked as Foreign or Typo. They may not behave the
17+
# way we expect in Czech data.
18+
if node.feats['Foreign'] == 'Yes' or node.feats['Typo'] == 'Yes':
19+
return
20+
# Nouns do not have polarity but the Prague-style tagsets may mark it.
21+
if node.upos in ['NOUN', 'PROPN']:
22+
if node.feats['Polarity'] == 'Pos':
23+
node.feats['Polarity'] = ''
24+
elif node.feats['Polarity'] == 'Neg':
25+
logging.warn(f'To remove Polarity=Neg from the NOUN {node.form}, we may have to change its lemma ({node.lemma}).')
26+
# In 19th century data, the grammaticalized usages of "se", "si" are
27+
# tagged as PART (rather than a reflexive PRON, which is the standard).
28+
# Even if it already was tagged PRON, some features may have to be added.
29+
if node.upos in ['PRON', 'PART'] and node.form.lower() in ['se', 'si']:
30+
node.lemma = 'se'
31+
node.upos = 'PRON'
32+
node.feats['PronType'] = 'Prs'
33+
node.feats['Reflex'] = 'Yes'
34+
if node.form.lower() == 'se':
35+
# Occasionally "se" can be genitive: "z prudkého do se dorážení".
36+
if not node.feats['Case'] == 'Gen':
37+
node.feats['Case'] = 'Acc'
38+
else:
39+
node.feats['Case'] = 'Dat'
40+
node.feats['Variant'] = 'Short'
1541
# In Czech UD, "být" is always tagged as AUX and never as VERB, regardless
1642
# of the fact that it can participate in purely existential constructions
1743
# where it no longer acts as a copula. Czech tagsets typically do not

0 commit comments

Comments
 (0)