@@ -23,6 +23,24 @@ def process_node(self, node):
2323 node .feats ['Polarity' ] = ''
2424 elif node .feats ['Polarity' ] == 'Neg' :
2525 logging .warn (f'To remove Polarity=Neg from the NOUN { node .form } , we may have to change its lemma ({ node .lemma } ).' )
26+ # Clitic forms of personal pronouns have Variant=Short if there is also a longer, full form.
27+ if node .upos == 'PRON' and node .feats ['PronType' ] == 'Prs' and re .fullmatch (r'(mi|mě|ti|tě|si|se|ho|mu)' , node .form .lower ()):
28+ node .feats ['Variant' ] = 'Short'
29+ # Forms of "my" should be lemmatized as "já".
30+ if node .upos == 'PRON' and node .lemma == 'my' :
31+ node .lemma = 'já'
32+ # Forms of "vy" should be lemmatized as "ty".
33+ if node .upos == 'PRON' and node .lemma == 'vy' :
34+ node .lemma = 'ty'
35+ # Forms of "oni" should be lemmatized as "on" and cases that allow
36+ # a preposition should have PrepCase.
37+ if node .upos == 'PRON' and node .lemma in ['on' , 'oni' ]:
38+ node .lemma = 'on'
39+ if node .feats ['Case' ] not in ['Nom' , 'Voc' ]:
40+ if node .form .lower ().startswith ('j' ):
41+ node .feats ['PrepCase' ] = 'Npr'
42+ elif re .match (r'[nň]' , node .form .lower ()):
43+ node .feats ['PrepCase' ] = 'Pre'
2644 # In 19th century data, the grammaticalized usages of "se", "si" are
2745 # tagged as PART (rather than a reflexive PRON, which is the standard).
2846 # Even if it already was tagged PRON, some features may have to be added.
@@ -38,10 +56,89 @@ def process_node(self, node):
3856 else :
3957 node .feats ['Case' ] = 'Dat'
4058 node .feats ['Variant' ] = 'Short'
59+ # As the genitive/accusative form of "on", "jeho" should have PrepCase.
60+ if node .upos == 'PRON' and node .form .lower () == 'jeho' :
61+ node .feats ['PrepCase' ] = 'Npr'
62+ # Possessive pronouns have Person, Gender[psor] and Number[psor].
63+ if node .upos == 'DET' and node .feats ['Poss' ] == 'Yes' :
64+ if node .lemma == 'můj' :
65+ node .feats ['Person' ] = '1'
66+ node .feats ['Number[psor]' ] = 'Sing'
67+ elif node .lemma == 'tvůj' :
68+ node .feats ['Person' ] = '2'
69+ node .feats ['Number[psor]' ] = 'Sing'
70+ elif node .lemma == 'náš' :
71+ node .feats ['Person' ] = '1'
72+ node .feats ['Number[psor]' ] = 'Plur'
73+ elif node .lemma == 'váš' :
74+ node .feats ['Person' ] = '2'
75+ node .feats ['Number[psor]' ] = 'Plur'
76+ elif node .form .lower () == 'jeho' :
77+ node .feats ['Person' ] = '3'
78+ node .feats ['Number[psor]' ] = 'Sing'
79+ node .feats ['Gender[psor]' ] = 'Masc,Neut'
80+ elif re .fullmatch (r'jich|jejich' , node .form .lower ()):
81+ node .lemma = 'jeho'
82+ node .feats ['Person' ] = '3'
83+ node .feats ['Number[psor]' ] = 'Plur'
84+ # The reflexive possessive pronoun must carry the Reflex=Yes feature.
85+ if node .upos == 'DET' and node .lemma == 'svůj' :
86+ node .feats ['Reflex' ] = 'Yes'
87+ # Relative pronoun "jenž" should be PRON, not DET
88+ # (it inflects for Gender but it can never be used as a congruent attribute).
89+ if node .upos in ['PRON' , 'DET' ] and node .lemma == 'jenž' :
90+ node .upos = 'PRON'
91+ if node .form .lower ().startswith ('j' ):
92+ node .feats ['PrepCase' ] = 'Npr'
93+ else :
94+ node .feats ['PrepCase' ] = 'Pre'
95+ # Pronoun "sám" is lemmatized to the long form, it is DET and PronType=Emp.
96+ if node .upos in ['PRON' , 'DET' ] and node .lemma == 'sám' :
97+ node .lemma = 'samý'
98+ node .upos = 'DET'
99+ node .feats ['PronType' ] = 'Emp'
100+ node .feats ['Variant' ] = 'Short' if re .fullmatch (r'(sám|sama|samo|sami|samy|samu)' , node .form .lower ()) else ''
101+ # Pronominal cardinal numerals should be DET, not NUM.
102+ if node .upos == 'NUM' :
103+ if re .fullmatch (r'(mnoho|málo|několik)' , node .lemma ):
104+ node .upos = 'DET'
105+ node .feats ['PronType' ] = 'Ind'
106+ node .feats ['NumForm' ] = ''
107+ node .feats ['Polarity' ] = '' ###!!! so we are losing the distinction mnoho/nemnoho?
108+ elif re .fullmatch (r'(tolik)' , node .lemma ):
109+ node .upos = 'DET'
110+ node .feats ['PronType' ] = 'Dem'
111+ node .feats ['NumForm' ] = ''
112+ node .feats ['Polarity' ] = ''
113+ elif re .fullmatch (r'(kolik)' , node .lemma ):
114+ node .upos = 'DET'
115+ node .feats ['PronType' ] = 'Int,Rel'
116+ node .feats ['NumForm' ] = ''
117+ node .feats ['Polarity' ] = ''
118+ # Pronominal adverbs have PronType but most of them do not have Degree
119+ # and Polarity.
120+ if node .upos == 'ADV' :
121+ if re .fullmatch (r'(tady|tak|tam|teď|tehdy|tu)' , node .lemma ):
122+ node .feats ['PronType' ] = 'Dem'
123+ node .feats ['Degree' ] = ''
124+ node .feats ['Polarity' ] = ''
125+ elif re .fullmatch (r'(dokdy|jak|kam|kde|kdy|kudy|odkdy|odkud|proč)' , node .lemma ):
126+ node .feats ['PronType' ] = 'Int,Rel'
127+ node .feats ['Degree' ] = ''
128+ node .feats ['Polarity' ] = ''
41129 # In Czech UD, "být" is always tagged as AUX and never as VERB, regardless
42130 # of the fact that it can participate in purely existential constructions
43131 # where it no longer acts as a copula. Czech tagsets typically do not
44132 # distinguish AUX from VERB, which means that converted data may have to
45133 # be fixed.
46134 if node .upos == 'VERB' and node .lemma == 'být' :
47135 node .upos = 'AUX'
136+ # Present forms of perfective verbs normally have Tense=Pres despite
137+ # meaning future. However, a few imperfective verbs have a separate
138+ # future form (distinct from present form), which gets Tense=Fut
139+ # despite inflecting similarly to present forms.
140+ if node .upos in ['VERB' , 'AUX' ] and node .feats ['Mood' ] == 'Ind' and node .feats ['Tense' ] == 'Pres' and node .feats ['Aspect' ] != 'Perf' and re .match (r'((bud|půjd|pojed|polez)(u|eš|e|eme?|ete|ou)|polet(ím|íš|í|íme|íte))' , node .form .lower ()):
141+ node .feats ['Tense' ] = 'Fut'
142+ # "jako" should be SCONJ but 19th century data have it as PART.
143+ if node .upos == 'PART' and node .lemma == 'jako' :
144+ node .upos = 'SCONJ'
0 commit comments