@@ -30,7 +30,7 @@ def process_node(self, node):
3030 # them as proper nouns. We must be careful and not add too many to this
3131 # rule, as many of them could be used as surnames and then they should
3232 # be PROPN.
33- if node .upos == 'PROPN' and re .fullmatch (r'(bůh|duch|hospodin|město|milost|pán|panna)' , node .lemma .lower ()):
33+ if node .upos == 'PROPN' and re .fullmatch (r'(bůh|duch|hospodin|město|milost|pan| pán|panna|stvořitel|trojice )' , node .lemma .lower ()):
3434 node .lemma = node .lemma .lower ()
3535 node .upos = 'NOUN'
3636 # Lemmatization.
@@ -124,7 +124,7 @@ def process_node(self, node):
124124 node .feats ['Number[psor]' ] = 'Sing'
125125 if not re .search (r'(Masc|Neut)' , node .feats ['Gender[psor]' ]):
126126 node .feats ['Gender[psor]' ] = 'Masc,Neut'
127- elif node .form .lower () == 'její' :
127+ elif re . fullmatch ( r'(její|jejího|jejímu|jejím|jejích|jejími|jejíma)' , node .form .lower ()) :
128128 node .lemma = 'jeho'
129129 node .feats ['Person' ] = '3'
130130 node .feats ['Number[psor]' ] = 'Sing'
@@ -142,6 +142,14 @@ def process_node(self, node):
142142 node .lemma = 'jehož'
143143 node .feats ['PronType' ] = 'Rel'
144144 node .feats ['Number[psor]' ] = 'Plur'
145+ elif re .fullmatch (r'jichžto|jejichžto' , node .form .lower ()):
146+ node .lemma = 'jehožto'
147+ node .feats ['PronType' ] = 'Rel'
148+ node .feats ['Number[psor]' ] = 'Plur'
149+ elif node .lemma == 'čí' :
150+ node .feats ['Poss' ] = 'Yes'
151+ if node .feats ['PronType' ] == '' :
152+ node .feats ['PronType' ] = 'Int,Rel'
145153 # Reflexive possessive pronoun should not forget the Reflex=Yes feature.
146154 if node .upos == 'DET' and node .lemma == 'svůj' :
147155 node .feats ['Reflex' ] = 'Yes'
@@ -150,15 +158,15 @@ def process_node(self, node):
150158 if node .upos in ['PRON' , 'DET' ]:
151159 # Relative pronoun "jenž" should be PRON, not DET
152160 # (it inflects for Gender but it can never be used as congruent attribute).
153- if node . lemma == ' jenž' :
161+ if re . fullmatch ( r'( jenž|jenžto)' , node . lemma ) :
154162 node .upos = 'PRON'
155163 if node .form .lower ().startswith ('j' ):
156164 node .feats ['PrepCase' ] = 'Npr'
157165 else :
158166 node .feats ['PrepCase' ] = 'Pre'
159167 # Relative pronoun "ješto" should be PRON, not DET (if it is not SCONJ, but that was excluded by a condition above)
160168 # (it inflects for Gender but it can never be used as congruent attribute).
161- elif node .form .lower () == 'ješto' :
169+ elif node .form .lower () in [ 'ješto' , 'ježto' ] :
162170 node .lemma = 'jenžto'
163171 node .upos = 'PRON'
164172 node .feats ['PrepCase' ] = 'Npr'
@@ -177,6 +185,17 @@ def process_node(self, node):
177185 node .feats ['Gender' ] = 'Masc'
178186 node .feats ['Animacy' ] = 'Anim'
179187 node .feats ['Number' ] = ''
188+ # Pronoun "kdož" is PRON (not DET).
189+ elif node .lemma == 'kdož' :
190+ node .lemma = 'kdož'
191+ node .upos = 'PRON'
192+ if node .feats ['PronType' ] == '' :
193+ node .feats ['PronType' ] = 'Rel'
194+ # Unlike "co", we annotate "kdo" as Animacy=Anim|Gender=Masc.
195+ # However, we do not annotate Number ("kdo" can be the subject of a plural verb).
196+ node .feats ['Gender' ] = 'Masc'
197+ node .feats ['Animacy' ] = 'Anim'
198+ node .feats ['Number' ] = ''
180199 # Pronoun "někdo", "kdosi" is PRON (not DET).
181200 elif re .fullmatch (r'(kdosi|někdo)' , node .lemma ):
182201 node .upos = 'PRON'
@@ -208,8 +227,7 @@ def process_node(self, node):
208227 node .feats ['Animacy' ] = ''
209228 node .feats ['Number' ] = ''
210229 # Pronoun "což" is PRON (not DET).
211- elif node .lemma == 'což' :
212- node .lemma = 'což'
230+ elif node .lemma in ['což' , 'cože' ]:
213231 node .upos = 'PRON'
214232 if node .feats ['PronType' ] == '' :
215233 node .feats ['PronType' ] = 'Rel'
@@ -219,7 +237,7 @@ def process_node(self, node):
219237 node .feats ['Animacy' ] = ''
220238 node .feats ['Number' ] = ''
221239 # Pronoun "něco" is PRON (not DET).
222- elif re .fullmatch (r'(cosi|něco)' , node .lemma ):
240+ elif re .fullmatch (r'(cokoli| cosi|něco)' , node .lemma ):
223241 node .upos = 'PRON'
224242 node .feats ['PronType' ] = 'Ind'
225243 # We do not annotate Gender and Number, although it could be argued
@@ -253,6 +271,10 @@ def process_node(self, node):
253271 elif node .lemma == 'všechen' :
254272 node .upos = 'DET'
255273 node .feats ['PronType' ] = 'Tot'
274+ elif re .fullmatch (r'(všecek|všecka|všecku|všecko|všickni)' , node .form .lower ()):
275+ node .lemma = 'všechen'
276+ node .upos = 'DET'
277+ node .feats ['PronType' ] = 'Tot'
256278 # Pronoun "sám" is lemmatized to the long form, it is DET and PronType=Emp.
257279 elif node .lemma in ['sám' , 'samý' ]:
258280 node .lemma = 'samý'
@@ -275,7 +297,8 @@ def process_node(self, node):
275297 node .feats ['PronType' ] = 'Ind'
276298 node .feats ['NumForm' ] = ''
277299 node .feats ['Polarity' ] = '' ###!!! so we are losing the distinction mnoho/nemnoho?
278- elif re .fullmatch (r'(tolik)' , node .lemma ):
300+ elif re .fullmatch (r'(toliko?)' , node .lemma ):
301+ node .lemma = 'tolik'
279302 node .upos = 'DET'
280303 node .feats ['PronType' ] = 'Dem'
281304 node .feats ['NumForm' ] = ''
@@ -303,7 +326,7 @@ def process_node(self, node):
303326 # Pronominal adverbs have PronType but most of them do not have Degree
304327 # and Polarity.
305328 if node .upos == 'ADV' :
306- if re .fullmatch (r'(dosud|nyní|proto|sem|tady|tak|takto|tam|teď|tehdy|tenkrát|tu|zde)' , node .lemma ):
329+ if re .fullmatch (r'(dosud|dotud| nyní|odsud|odtud| proto|sem|tady|tak|takož| takto|tam|tamto| teď|tehdy|tenkrát|tu|tudy |zde)' , node .lemma ):
307330 node .feats ['PronType' ] = 'Dem'
308331 node .feats ['Degree' ] = ''
309332 node .feats ['Polarity' ] = ''
@@ -316,7 +339,7 @@ def process_node(self, node):
316339 node .feats ['PronType' ] = 'Rel'
317340 node .feats ['Degree' ] = ''
318341 node .feats ['Polarity' ] = ''
319- elif re .fullmatch (r'(jaksi|kamsi|kdesi|kdysi|kudysi|nějak|někam|někde|někdy|někudy)' , node .lemma ):
342+ elif re .fullmatch (r'(jakkoli| jaksi|kamkoli| kamsi|kdekoli| kdesi|kdykoli| kdysi|kudykoli |kudysi|nějak|někam|někde|někdy|někudy)' , node .lemma ):
320343 node .feats ['PronType' ] = 'Ind'
321344 node .feats ['Degree' ] = ''
322345 node .feats ['Polarity' ] = ''
@@ -325,7 +348,7 @@ def process_node(self, node):
325348 node .feats ['Degree' ] = ''
326349 node .feats ['Polarity' ] = ''
327350 # Total pronominals can be negated ("nevždy"). Then they get Degree, too.
328- elif re .fullmatch (r'(všude|všudy|ve?ždy|ve?ždycky)' , node .lemma ):
351+ elif re .fullmatch (r'(odevšad| všude|všudy|ve?ždy|ve?ždycky)' , node .lemma ):
329352 node .feats ['PronType' ] = 'Tot'
330353 node .feats ['Degree' ] = 'Pos'
331354 node .feats ['Polarity' ] = 'Pos'
@@ -337,7 +360,7 @@ def process_node(self, node):
337360 # where it no longer acts as a copula. Czech tagsets typically do not
338361 # distinguish AUX from VERB, which means that converted data may have to
339362 # be fixed.
340- if node .upos == 'VERB' and node .lemma == 'být' :
363+ if node .upos == 'VERB' and node .lemma in [ 'být' , 'bývat' , 'bývávat' ] :
341364 node .upos = 'AUX'
342365 if node .upos in ['ADV' , 'VERB' ] and re .fullmatch (r'(ne)?lze' , node .form .lower ()):
343366 node .upos = 'ADV'
@@ -429,15 +452,20 @@ def process_node(self, node):
429452 node .lemma = 'u'
430453 node .feats ['AdpType' ] = 'Prep'
431454 #----------------------------------------------------------------------
432- # CONJUNCTION
455+ # CONJUNCTIONS
433456 #----------------------------------------------------------------------
434457 # As a conjunction (and not particle/adverb), "ani" is coordinating and
435458 # not subordinating.
436459 if node .upos == 'SCONJ' and node .lemma == 'ani' :
437460 node .upos = 'CCONJ'
461+ if node .upos == 'CCONJ' and node .lemma == 'nebť' :
462+ node .lemma = 'neboť'
438463 #----------------------------------------------------------------------
439- # PARTICLES THAT SHOULD BE ADVERBS
464+ # PARTICLES (other than those already grabbed above)
440465 #----------------------------------------------------------------------
441466 # "jako" should be SCONJ but 19th century data have it as PART.
442- if node .upos == 'PART' and node .lemma == 'jako' :
443- node .upos = 'SCONJ'
467+ if node .upos == 'PART' :
468+ if node .lemma == 'jako' :
469+ node .upos = 'SCONJ'
470+ elif node .lemma == 'ti' :
471+ node .lemma = 'ť'
0 commit comments