Skip to content

Commit 28177c5

Browse files
committed
Various fixes.
1 parent cd4b706 commit 28177c5

File tree

1 file changed

+97
-0
lines changed

1 file changed

+97
-0
lines changed

udapi/block/ud/cs/fixmorpho.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,24 @@ def process_node(self, node):
2323
node.feats['Polarity'] = ''
2424
elif node.feats['Polarity'] == 'Neg':
2525
logging.warn(f'To remove Polarity=Neg from the NOUN {node.form}, we may have to change its lemma ({node.lemma}).')
26+
# Clitic forms of personal pronouns have Variant=Short if there is also a longer, full form.
27+
if node.upos == 'PRON' and node.feats['PronType'] == 'Prs' and re.fullmatch(r'(mi|mě|ti|tě|si|se|ho|mu)', node.form.lower()):
28+
node.feats['Variant'] = 'Short'
29+
# Forms of "my" should be lemmatized as "já".
30+
if node.upos == 'PRON' and node.lemma == 'my':
31+
node.lemma = 'já'
32+
# Forms of "vy" should be lemmatized as "ty".
33+
if node.upos == 'PRON' and node.lemma == 'vy':
34+
node.lemma = 'ty'
35+
# Forms of "oni" should be lemmatized as "on" and cases that allow
36+
# a preposition should have PrepCase.
37+
if node.upos == 'PRON' and node.lemma in ['on', 'oni']:
38+
node.lemma = 'on'
39+
if node.feats['Case'] not in ['Nom', 'Voc']:
40+
if node.form.lower().startswith('j'):
41+
node.feats['PrepCase'] = 'Npr'
42+
elif re.match(r'[nň]', node.form.lower()):
43+
node.feats['PrepCase'] = 'Pre'
2644
# In 19th century data, the grammaticalized usages of "se", "si" are
2745
# tagged as PART (rather than a reflexive PRON, which is the standard).
2846
# Even if it already was tagged PRON, some features may have to be added.
@@ -38,10 +56,89 @@ def process_node(self, node):
3856
else:
3957
node.feats['Case'] = 'Dat'
4058
node.feats['Variant'] = 'Short'
59+
# As the genitive/accusative form of "on", "jeho" should have PrepCase.
60+
if node.upos == 'PRON' and node.form.lower() == 'jeho':
61+
node.feats['PrepCase'] = 'Npr'
62+
# Possessive pronouns have Person, Gender[psor] and Number[psor].
63+
if node.upos == 'DET' and node.feats['Poss'] == 'Yes':
64+
if node.lemma == 'můj':
65+
node.feats['Person'] = '1'
66+
node.feats['Number[psor]'] = 'Sing'
67+
elif node.lemma == 'tvůj':
68+
node.feats['Person'] = '2'
69+
node.feats['Number[psor]'] = 'Sing'
70+
elif node.lemma == 'náš':
71+
node.feats['Person'] = '1'
72+
node.feats['Number[psor]'] = 'Plur'
73+
elif node.lemma == 'váš':
74+
node.feats['Person'] = '2'
75+
node.feats['Number[psor]'] = 'Plur'
76+
elif node.form.lower() == 'jeho':
77+
node.feats['Person'] = '3'
78+
node.feats['Number[psor]'] = 'Sing'
79+
node.feats['Gender[psor]'] = 'Masc,Neut'
80+
elif re.fullmatch(r'jich|jejich', node.form.lower()):
81+
node.lemma = 'jeho'
82+
node.feats['Person'] = '3'
83+
node.feats['Number[psor]'] = 'Plur'
84+
# The reflexive possessive pronoun "svůj" must always carry the Reflex=Yes feature.
85+
if node.upos == 'DET' and node.lemma == 'svůj':
86+
node.feats['Reflex'] = 'Yes'
87+
# Relative pronoun "jenž" should be PRON, not DET
88+
# (it inflects for Gender but it can never be used as congruent attribute).
89+
if node.upos in ['PRON', 'DET'] and node.lemma == 'jenž':
90+
node.upos = 'PRON'
91+
if node.form.lower().startswith('j'):
92+
node.feats['PrepCase'] = 'Npr'
93+
else:
94+
node.feats['PrepCase'] = 'Pre'
95+
# The pronoun "sám" is lemmatized to the long form "samý"; it is tagged DET with PronType=Emp.
96+
if node.upos in ['PRON', 'DET'] and node.lemma == 'sám':
97+
node.lemma = 'samý'
98+
node.upos = 'DET'
99+
node.feats['PronType'] = 'Emp'
100+
node.feats['Variant'] = 'Short' if re.fullmatch(r'(sám|sama|samo|sami|samy|samu)', node.form.lower()) else ''
101+
# Pronominal cardinal numerals should be DET, not NUM.
102+
if node.upos == 'NUM':
103+
if re.fullmatch(r'(mnoho|málo|několik)', node.lemma):
104+
node.upos = 'DET'
105+
node.feats['PronType'] = 'Ind'
106+
node.feats['NumForm'] = ''
107+
node.feats['Polarity'] = '' ###!!! so we are losing the distinction mnoho/nemnoho?
108+
elif re.fullmatch(r'(tolik)', node.lemma):
109+
node.upos = 'DET'
110+
node.feats['PronType'] = 'Dem'
111+
node.feats['NumForm'] = ''
112+
node.feats['Polarity'] = ''
113+
elif re.fullmatch(r'(kolik)', node.lemma):
114+
node.upos = 'DET'
115+
node.feats['PronType'] = 'Int,Rel'
116+
node.feats['NumForm'] = ''
117+
node.feats['Polarity'] = ''
118+
# Pronominal adverbs have PronType but most of them do not have Degree
119+
# and Polarity.
120+
if node.upos == 'ADV':
121+
if re.fullmatch(r'(tady|tak|tam|teď|tehdy|tu)', node.lemma):
122+
node.feats['PronType'] = 'Dem'
123+
node.feats['Degree'] = ''
124+
node.feats['Polarity'] = ''
125+
elif re.fullmatch(r'(dokdy|jak|kam|kde|kdy|kudy|odkdy|odkud|proč)', node.lemma):
126+
node.feats['PronType'] = 'Int,Rel'
127+
node.feats['Degree'] = ''
128+
node.feats['Polarity'] = ''
41129
# In Czech UD, "být" is always tagged as AUX and never as VERB, regardless
42130
# of the fact that it can participate in purely existential constructions
43131
# where it no longer acts as a copula. Czech tagsets typically do not
44132
# distinguish AUX from VERB, which means that converted data may have to
45133
# be fixed.
46134
if node.upos == 'VERB' and node.lemma == 'být':
47135
node.upos = 'AUX'
136+
# Present forms of perfective verbs normally have Tense=Pres despite
137+
# meaning future. However, a few imperfective verbs have a separate
138+
# future form (distinct from present form), which gets Tense=Fut
139+
# despite inflecting similarly to present forms.
140+
if node.upos in ['VERB', 'AUX'] and node.feats['Mood'] == 'Ind' and node.feats['Tense'] == 'Pres' and node.feats['Aspect'] != 'Perf' and re.match(r'((bud|půjd|pojed|polez)(u|eš|e|eme?|ete|ou)|polet(ím|íš|í|íme|íte))', node.form.lower()):
141+
node.feats['Tense'] = 'Fut'
142+
# "jako" should be SCONJ but 19th century data have it as PART.
143+
if node.upos == 'PART' and node.lemma == 'jako':
144+
node.upos = 'SCONJ'

0 commit comments

Comments
 (0)