Skip to content

Commit 2e61d9c

Browse files
committed
More Czech fixes.
1 parent 9f6847f commit 2e61d9c

File tree

1 file changed

+44
-16
lines changed

1 file changed

+44
-16
lines changed

udapi/block/ud/cs/fixmorpho.py

Lines changed: 44 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ def process_node(self, node):
3030
# them as proper nouns. We must be careful and not add too many to this
3131
# rule, as many of them could be used as surnames and then they should
3232
# be PROPN.
33-
if node.upos == 'PROPN' and re.fullmatch(r'(bůh|duch|hospodin|město|milost|pán|panna)', node.lemma.lower()):
33+
if node.upos == 'PROPN' and re.fullmatch(r'(bůh|duch|hospodin|město|milost|pan|pán|panna|stvořitel|trojice)', node.lemma.lower()):
3434
node.lemma = node.lemma.lower()
3535
node.upos = 'NOUN'
3636
# Lemmatization.
@@ -124,7 +124,7 @@ def process_node(self, node):
124124
node.feats['Number[psor]'] = 'Sing'
125125
if not re.search(r'(Masc|Neut)', node.feats['Gender[psor]']):
126126
node.feats['Gender[psor]'] = 'Masc,Neut'
127-
elif node.form.lower() == 'její':
127+
elif re.fullmatch(r'(její|jejího|jejímu|jejím|jejích|jejími|jejíma)', node.form.lower()):
128128
node.lemma = 'jeho'
129129
node.feats['Person'] = '3'
130130
node.feats['Number[psor]'] = 'Sing'
@@ -142,6 +142,14 @@ def process_node(self, node):
142142
node.lemma = 'jehož'
143143
node.feats['PronType'] = 'Rel'
144144
node.feats['Number[psor]'] = 'Plur'
145+
elif re.fullmatch(r'jichžto|jejichžto', node.form.lower()):
146+
node.lemma = 'jehožto'
147+
node.feats['PronType'] = 'Rel'
148+
node.feats['Number[psor]'] = 'Plur'
149+
elif node.lemma == 'čí':
150+
node.feats['Poss'] = 'Yes'
151+
if node.feats['PronType'] == '':
152+
node.feats['PronType'] = 'Int,Rel'
145153
# Reflexive possessive pronoun should not forget the Reflex=Yes feature.
146154
if node.upos == 'DET' and node.lemma == 'svůj':
147155
node.feats['Reflex'] = 'Yes'
@@ -150,15 +158,15 @@ def process_node(self, node):
150158
if node.upos in ['PRON', 'DET']:
151159
# Relative pronoun "jenž" should be PRON, not DET
152160
# (it inflects for Gender but it can never be used as congruent attribute).
153-
if node.lemma == 'jenž':
161+
if re.fullmatch(r'(jenž|jenžto)', node.lemma):
154162
node.upos = 'PRON'
155163
if node.form.lower().startswith('j'):
156164
node.feats['PrepCase'] = 'Npr'
157165
else:
158166
node.feats['PrepCase'] = 'Pre'
159167
# Relative pronoun "ješto" should be PRON, not DET (if it is not SCONJ, but that was excluded by a condition above)
160168
# (it inflects for Gender but it can never be used as congruent attribute).
161-
elif node.form.lower() == 'ješto':
169+
elif node.form.lower() in ['ješto', 'ježto']:
162170
node.lemma = 'jenžto'
163171
node.upos = 'PRON'
164172
node.feats['PrepCase'] = 'Npr'
@@ -177,6 +185,17 @@ def process_node(self, node):
177185
node.feats['Gender'] = 'Masc'
178186
node.feats['Animacy'] = 'Anim'
179187
node.feats['Number'] = ''
188+
# Pronoun "kdož" is PRON (not DET).
189+
elif node.lemma == 'kdož':
190+
node.lemma = 'kdož'
191+
node.upos = 'PRON'
192+
if node.feats['PronType'] == '':
193+
node.feats['PronType'] = 'Rel'
194+
# Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc.
195+
# However, we do not annotate Number ("kdo" can be the subject of a plural verb).
196+
node.feats['Gender'] = 'Masc'
197+
node.feats['Animacy'] = 'Anim'
198+
node.feats['Number'] = ''
180199
# Pronoun "někdo", "kdosi" is PRON (not DET).
181200
elif re.fullmatch(r'(kdosi|někdo)', node.lemma):
182201
node.upos = 'PRON'
@@ -208,8 +227,7 @@ def process_node(self, node):
208227
node.feats['Animacy'] = ''
209228
node.feats['Number'] = ''
210229
# Pronoun "což" is PRON (not DET).
211-
elif node.lemma == 'což':
212-
node.lemma = 'což'
230+
elif node.lemma in ['což', 'cože']:
213231
node.upos = 'PRON'
214232
if node.feats['PronType'] == '':
215233
node.feats['PronType'] = 'Rel'
@@ -219,7 +237,7 @@ def process_node(self, node):
219237
node.feats['Animacy'] = ''
220238
node.feats['Number'] = ''
221239
# Pronoun "něco" is PRON (not DET).
222-
elif re.fullmatch(r'(cosi|něco)', node.lemma):
240+
elif re.fullmatch(r'(cokoli|cosi|něco)', node.lemma):
223241
node.upos = 'PRON'
224242
node.feats['PronType'] = 'Ind'
225243
# We do not annotate Gender and Number, although it could be argued
@@ -253,6 +271,10 @@ def process_node(self, node):
253271
elif node.lemma == 'všechen':
254272
node.upos = 'DET'
255273
node.feats['PronType'] = 'Tot'
274+
elif re.fullmatch(r'(všecek|všecka|všecku|všecko|všickni)', node.form.lower()):
275+
node.lemma = 'všechen'
276+
node.upos = 'DET'
277+
node.feats['PronType'] = 'Tot'
256278
# Pronoun "sám" is lemmatized to the long form, it is DET and PronType=Emp.
257279
elif node.lemma in ['sám', 'samý']:
258280
node.lemma = 'samý'
@@ -275,7 +297,8 @@ def process_node(self, node):
275297
node.feats['PronType'] = 'Ind'
276298
node.feats['NumForm'] = ''
277299
node.feats['Polarity'] = '' ###!!! so we are losing the distinction mnoho/nemnoho?
278-
elif re.fullmatch(r'(tolik)', node.lemma):
300+
elif re.fullmatch(r'(toliko?)', node.lemma):
301+
node.lemma = 'tolik'
279302
node.upos = 'DET'
280303
node.feats['PronType'] = 'Dem'
281304
node.feats['NumForm'] = ''
@@ -303,7 +326,7 @@ def process_node(self, node):
303326
# Pronominal adverbs have PronType but most of them do not have Degree
304327
# and Polarity.
305328
if node.upos == 'ADV':
306-
if re.fullmatch(r'(dosud|nyní|proto|sem|tady|tak|takto|tam|teď|tehdy|tenkrát|tu|zde)', node.lemma):
329+
if re.fullmatch(r'(dosud|dotud|nyní|odsud|odtud|proto|sem|tady|tak|takož|takto|tam|tamto|teď|tehdy|tenkrát|tu|tudy|zde)', node.lemma):
307330
node.feats['PronType'] = 'Dem'
308331
node.feats['Degree'] = ''
309332
node.feats['Polarity'] = ''
@@ -316,7 +339,7 @@ def process_node(self, node):
316339
node.feats['PronType'] = 'Rel'
317340
node.feats['Degree'] = ''
318341
node.feats['Polarity'] = ''
319-
elif re.fullmatch(r'(jaksi|kamsi|kdesi|kdysi|kudysi|nějak|někam|někde|někdy|někudy)', node.lemma):
342+
elif re.fullmatch(r'(jakkoli|jaksi|kamkoli|kamsi|kdekoli|kdesi|kdykoli|kdysi|kudykoli|kudysi|nějak|někam|někde|někdy|někudy)', node.lemma):
320343
node.feats['PronType'] = 'Ind'
321344
node.feats['Degree'] = ''
322345
node.feats['Polarity'] = ''
@@ -325,7 +348,7 @@ def process_node(self, node):
325348
node.feats['Degree'] = ''
326349
node.feats['Polarity'] = ''
327350
# Total pronominals can be negated ("nevždy"). Then they get Degree, too.
328-
elif re.fullmatch(r'(všude|všudy|ve?ždy|ve?ždycky)', node.lemma):
351+
elif re.fullmatch(r'(odevšad|všude|všudy|ve?ždy|ve?ždycky)', node.lemma):
329352
node.feats['PronType'] = 'Tot'
330353
node.feats['Degree'] = 'Pos'
331354
node.feats['Polarity'] = 'Pos'
@@ -337,7 +360,7 @@ def process_node(self, node):
337360
# where it no longer acts as a copula. Czech tagsets typically do not
338361
# distinguish AUX from VERB, which means that converted data may have to
339362
# be fixed.
340-
if node.upos == 'VERB' and node.lemma == 'být':
363+
if node.upos == 'VERB' and node.lemma in ['být', 'bývat', 'bývávat']:
341364
node.upos = 'AUX'
342365
if node.upos in ['ADV', 'VERB'] and re.fullmatch(r'(ne)?lze', node.form.lower()):
343366
node.upos = 'ADV'
@@ -429,15 +452,20 @@ def process_node(self, node):
429452
node.lemma = 'u'
430453
node.feats['AdpType'] = 'Prep'
431454
#----------------------------------------------------------------------
432-
# CONJUNCTION
455+
# CONJUNCTIONS
433456
#----------------------------------------------------------------------
434457
# As a conjunction (and not particle/adverb), "ani" is coordinating and
435458
# not subordinating.
436459
if node.upos == 'SCONJ' and node.lemma == 'ani':
437460
node.upos = 'CCONJ'
461+
if node.upos == 'CCONJ' and node.lemma == 'nebť':
462+
node.lemma = 'neboť'
438463
#----------------------------------------------------------------------
439-
# PARTICLES THAT SHOULD BE ADVERBS
464+
# PARTICLES (other than those already grabbed above)
440465
#----------------------------------------------------------------------
441466
# "jako" should be SCONJ but 19th century data have it as PART.
442-
if node.upos == 'PART' and node.lemma == 'jako':
443-
node.upos = 'SCONJ'
467+
if node.upos == 'PART':
468+
if node.lemma == 'jako':
469+
node.upos = 'SCONJ'
470+
elif node.lemma == 'ti':
471+
node.lemma = 'ť'

0 commit comments

Comments
 (0)