|
6 | 6 | created as part of the Hičkok project (while importing nineteenth-century Czech |
7 | 7 | data) but it should be applicable to any other Czech treebank. |
8 | 8 | """ |
9 | | -import udapi.block |
| 9 | +from udapi.core.block import Block |
| 10 | +import logging |
10 | 11 | import re |
11 | 12 |
|
12 | | -class FixMorpho(udapi.block): |
| 13 | +class FixMorpho(Block): |
13 | 14 |
|
14 | 15 | def process_node(self, node): |
| 16 | + # Do not touch words marked as Foreign or Typo. They may not behave the |
| 17 | + # way we expect in Czech data. |
| 18 | + if node.feats['Foreign'] == 'Yes' or node.feats['Typo'] == 'Yes': |
| 19 | + return |
| 20 | + # Nouns do not have polarity but the Prague-style tagsets may mark it. |
| 21 | + if node.upos in ['NOUN', 'PROPN']: |
| 22 | + if node.feats['Polarity'] == 'Pos': |
| 23 | + node.feats['Polarity'] = '' |
| 24 | + elif node.feats['Polarity'] == 'Neg': |
| 25 | + logging.warn(f'To remove Polarity=Neg from the NOUN {node.form}, we may have to change its lemma ({node.lemma}).') |
| 26 | + # In 19th century data, the grammaticalized usages of "se", "si" are |
| 27 | + # tagged as PART (rather than a reflexive PRON, which is the standard). |
| 28 | + # Even if it already was tagged PRON, some features may have to be added. |
| 29 | + if node.upos in ['PRON', 'PART'] and node.form.lower() in ['se', 'si']: |
| 30 | + node.lemma = 'se' |
| 31 | + node.upos = 'PRON' |
| 32 | + node.feats['PronType'] = 'Prs' |
| 33 | + node.feats['Reflex'] = 'Yes' |
| 34 | + if node.form.lower() == 'se': |
| 35 | + # Occasionally "se" can be genitive: "z prudkého do se dorážení". |
| 36 | + if not node.feats['Case'] == 'Gen': |
| 37 | + node.feats['Case'] = 'Acc' |
| 38 | + else: |
| 39 | + node.feats['Case'] = 'Dat' |
| 40 | + node.feats['Variant'] = 'Short' |
15 | 41 | # In Czech UD, "být" is always tagged as AUX and never as VERB, regardless |
16 | 42 | # of the fact that it can participate in purely existential constructions |
17 | 43 | # where it no longer acts as a copula. Czech tagsets typically do not |
|
0 commit comments