From 83114b3d86a7afbb8a97af63b0bf3c421bfeab92 Mon Sep 17 00:00:00 2001
From: "david.vilares"
Date: Wed, 24 May 2017 16:05:01 +0200
Subject: [PATCH 1/4] bist-covington: a non-projective transition-based BIST-parser

---
 bcovington/covington.py~ | 875 +++++++++++
 bcovington/parser.py~ | 0
 bcovington/src/covington.py | 875 +++++++++++
 bcovington/src/covington.pyc | Bin 0 -> 24143 bytes
 bcovington/src/parser.py | 175 +++
 bcovington/src/parser.pyc | Bin 0 -> 6247 bytes
 bcovington/src/utils.py | 265 ++++
 bcovington/src/utils.pyc | Bin 0 -> 8418 bytes
 bcovington/src/utils/conll17_ud_eval.py | 556 +++++++
 bcovington/src/utils/eval.pl | 1827 +++++++++++++++++++++++
 bcovington/src/utils/weights.clas | 11 +
 bcovington/utils.py | 269 ++++
 12 files changed, 4853 insertions(+)
 create mode 100644 bcovington/covington.py~
 create mode 100644 bcovington/parser.py~
 create mode 100644 bcovington/src/covington.py
 create mode 100644 bcovington/src/covington.pyc
 create mode 100644 bcovington/src/parser.py
 create mode 100644 bcovington/src/parser.pyc
 create mode 100644 bcovington/src/utils.py
 create mode 100644 bcovington/src/utils.pyc
 create mode 100644 bcovington/src/utils/conll17_ud_eval.py
 create mode 100644 bcovington/src/utils/eval.pl
 create mode 100644 bcovington/src/utils/weights.clas
 create mode 100644 bcovington/utils.py

diff --git a/bcovington/covington.py~ b/bcovington/covington.py~
new file mode 100644
index 0000000..ec982a9
--- /dev/null
+++ b/bcovington/covington.py~
@@ -0,0 +1,875 @@
+from dynet import *
+from utils_bcovington import read_conll, write_conll, CovingtonConfiguration
+from operator import itemgetter
+from itertools import chain
+from tarjan import tarjan
+import time, random
+import numpy as np
+import os
+import warnings
+
+
+"""
+This module extends the original transition-based BIST-Parser barchybrid:
+
+https://github.com/elikip/bist-parser/blob/master/barchybrid/
+Kiperwasser, E., & Goldberg, Y. (2016). Simple and accurate dependency parsing using bidirectional LSTM feature representations. arXiv preprint arXiv:1603.04351.
+
+
+It has been adapted to support non-projective transition-based dependency parsing,
+using an O(n^2) implementation of the traditional Covington (2001) algorithm,
+following the list-based transition system described in Nivre (2008).
+
+Covington, M. A. (2001). A fundamental algorithm for dependency parsing. In Proceedings of the 39th annual ACM southeast conference (pp. 95-102).
+Nivre, J. (2008). Algorithms for deterministic incremental dependency parsing. Computational Linguistics, 34(4), 513-553.
+
+We also include the O(n) dynamic oracle described in Gomez-Rodriguez and Fernandez-Gonzalez (2015).
+TODO: the current implementation of the oracle is O(n^2).
+
+Gomez-Rodriguez, C., & Fernandez-Gonzalez, D. (2015). An efficient dynamic oracle for unrestricted non-projective parsing. Volume 2: Short Papers, 256.
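+
+As a rough sketch of the transition system as implemented in Predict() and
+Train() below: a configuration keeps a pointer l1 into the already-processed
+left context, a pointer b into the buffer, and the arc set A. Arcs are stored
+as (head, dependent) pairs, and the four transitions update the configuration
+as follows:
+
+    LEFT_ARC  : add (b, l1) to A, then l1 -= 1
+    RIGHT_ARC : add (l1, b) to A, then l1 -= 1
+    NO_ARC    : l1 -= 1
+    SHIFT     : l1 = b, then b += 1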
+ +""" + + + +class CovingtonBILSTM: + + #ACTIVATION FUNCTIONS + TANH = 'tanh' + SIGMOID = 'sigmoid' + RELU = 'relu' + TANH3 = 'tanh3' + + #OPTIMIZERS + SGD="sgd" + MOMENTUM="momentum" + ADAGRAD="adagrad" + ADADELTA="adadelta" + ADAM = "adam" + + #SPECIAL INDEXES + INDEX_WORD_PAD = 1 + INDEX_WORD_INITIAL = 2 + INDEX_POS_PAD = 1 + INDEX_POS_INITIAL = 2 + INIT_WORD_INDEX = 3 + INIT_POS_INDEX = INIT_WORD_INDEX + + INDEX_FEATS_PAD = 1 + INDEX_FEATS_INITIAL= 2 + INIT_FEATS_INDEX = INIT_WORD_INDEX + + #TRANSITIONS + LEFT_ARC = 0 + RIGHT_ARC = 1 + SHIFT = 2 + NO_ARC = 3 + TRANSITIONS = [LEFT_ARC, RIGHT_ARC, SHIFT, NO_ARC] + + #OTHER HYPERPARAMETERS + SIZE_TRANSITIONS = len(TRANSITIONS) + + def __init__(self, words, lemmas, cpos, pos, feats, rels, w2i, l2i, options, path_oov_external_embedding=None, + pretrained=False): + + self.model = Model() + if options.optimizer == self.ADAM: + self.trainer = AdamTrainer(self.model) + elif options.optimizer == self.SGD: + self.trainer = SimpleSGDTrainer(self.model) + elif options.optimizer == self.MOMENTUM: + self.trainer = MomentumSGDTrainer(self.model) + elif options.optimizer == self.ADAGRAD: + self.trainer = AdagradTrainer(self.model) + elif options.optimizer == self.ADADELTA: + self.trainer = AdadeltaTrainer(self.model) + else: + raise NotImplementedError("Selected optimizer is not available") + + random.seed(1) + + self.activations = {self.TANH: tanh, + self.SIGMOID: logistic, + self.RELU: rectify, + self.TANH3: (lambda x: tanh(cwise_multiply(cwise_multiply(x, x), x)))} + + self.activation = self.activations[options.activation] + + self.oracle = options.oracle + + + self.ldims = options.lstm_dims * 2 #*2 because it is a bi-lstm + self.wdims = options.wembedding_dims + self.pdims = options.pembedding_dims + self.rdims = options.rembedding_dims + self.layers = options.lstm_layers + self.wordsCount = words + + self.vocab = {word: ind+self.INIT_WORD_INDEX for word, ind in w2i.iteritems()} + self.lemmas = {lemma: ind+self.INIT_WORD_INDEX for lemma,ind in l2i.iteritems()} + self.cpos = {cpos: ind+self.INIT_POS_INDEX for ind, cpos in enumerate(cpos)} + self.pos = {pos: ind+self.INIT_POS_INDEX for ind, pos in enumerate(pos)} + self.feats = {f: ind+self.INIT_FEATS_INDEX for ind, f in enumerate(feats)} + self.rels = {word: ind for ind, word in enumerate(rels)} + + #List of dependency types + self.irels = rels + + self.headFlag = options.headFlag + self.rlMostFlag = options.rlMostFlag + self.rlFlag = options.rlFlag + self.kb = options.window_b + self.kl1 = options.window_l1 + self.kl2_r = options.window_l2r + self.kl2_l = options.window_l2l + + self.nnvecs = (1 if self.headFlag else 0) + (2 if self.rlFlag or self.rlMostFlag else 0) + + #Reading external embedding files, if they exists + + #INFORMATION FOR EXTERNAL WORD EMBEDDINGS + self.external_embedding = None + self.edim = None + self.noextrn = None + self.extrnd = None + self.elookup = None + if options.external_embedding is not None and os.path.exists(options.external_embedding): + self.external_embedding, self.edim,self.noextrn,self.extrnd, self.elookup = self._assign_external_embeddings(options.external_embedding, + self.INDEX_WORD_PAD, self.INDEX_WORD_INITIAL) + else: + warnings.warn("Not using any external file for FORM embeddings") + + #INFORMATION FOR THE EXTERNAL CPOSTAG EMBEDDINGS + self.cpos_external_embedding = None + self.cpos_edim = None + self.cpos_noextrn = None + self.cpos_extrnd = None + self.cpos_elookup = None + if options.cpos_external_embedding is not None and 
os.path.exists(options.cpos_external_embedding): + self.cpos_external_embedding, self.cpos_edim,self.cpos_noextrn,self.cpos_extrnd, self.cpos_elookup = self._assign_external_embeddings(options.cpos_external_embedding, + self.INDEX_POS_PAD, self.INDEX_POS_INITIAL) + else: + warnings.warn("Not using any external file for CPOSTAG embeddings") + + #INFORMATION FOR THE EXTERNAL POSTAG EMBEDDINGS + self.pos_external_embedding = None + self.pos_edim = None + self.pos_noextrn = None + self.pos_extrnd = None + self.pos_elookup= None + if options.pos_external_embedding is not None and os.path.exists(options.pos_external_embedding): + self.pos_external_embedding, self.pos_edim,self.pos_noextrn,self.pos_extrnd, self.pos_elookup = self._assign_external_embeddings(options.pos_external_embedding, + self.INDEX_POS_PAD, self.INDEX_POS_INITIAL) + else: + warnings.warn("Not using any external file for POSTAG embeddings") + + #INFORMATION FOR THE EXTERNAL FEATS EMBEDDINGS + self.feats_external_embedding = None + self.feats_edim = None + self.feats_noextrn = None + self.feats_extrnd = None + self.feats_elookup= None + + if options.feats_external_embedding is not None and os.path.exists(options.feats_external_embedding): + self.feats_external_embedding, self.feats_edim,self.feats_noextrn,self.feats_extrnd, self.feats_elookup = self._assign_external_embeddings(options.feats_external_embedding, self.INDEX_FEATS_PAD, self.INDEX_FEATS_INITIAL) + else: + warnings.warn("Not using any external file for FEATS embeddings") + + + #INFORMATION FOR THE EXTERNAL FEATS EMBEDDINGS +# self.lemmas_external_embedding = None +# self.lemmas_edim = None +# self.lemmas_noextrn = None +# self.lemmas_extrnd = None +# self.lemmas_elookup= None + +# if options.lemmas_external_embedding is not None and os.path.exists(options.lemmas_external_embedding): +# self.lemmas_external_embedding, self.lemmas_edim,self.lemmas_noextrn,self.lemmas_extrnd, self.lemmas_elookup = self._assign_external_embeddings(options.lemmas_external_embedding, self.INDEX_FEATS_PAD, self.INDEX_FEATS_INITIAL) +# else: +# warnings.warn("Not using any external file for LEMMAS embeddings") + + + + + self.oov_external_embedding = None + self.oov_edim = None + self.oov_noextrn = None + self.oov_extrnd = None + self.oov_elookup = None + + + if path_oov_external_embedding is not None and os.path.exists(options.feats_external_embedding): + self.oov_external_embedding, self.oov_edim,self.oov_noextrn,self.oov_extrnd, self.oov_elookup = self._assign_external_embeddings(path_oov_external_embedding, + self.INDEX_WORD_PAD, self.INDEX_WORD_INITIAL) + + if self.oov_external_embedding is not None and self.oov_edim != self.edim: + raise ValueError("The dimensions of the embeddings for OOV words is not equal to the dimension of the rest of external word embeddings (self.oov_edim != self.edim)") + + #Obtaining the dimension of the input + dims = (self.wdims + self.pdims + (self.edim if self.external_embedding is not None else 0) + + (self.cpos_edim if self.cpos_external_embedding is not None else 0) + + (self.pos_edim if self.pos_external_embedding is not None else 0)+ + (self.feats_edim if self.feats_external_embedding is not None else 0) +# + +# (self.lemmas_edim if self.lemmas_external_embedding is not None else 0) + ) + + + #Initialization of the architecture + + self.blstmFlag = options.blstmFlag + self.bibiFlag = options.bibiFlag + + if self.bibiFlag: + self.surfaceBuilders = [VanillaLSTMBuilder(1, dims, self.ldims * 0.5, self.model), + VanillaLSTMBuilder(1, dims, self.ldims * 0.5, 
self.model)] + self.bsurfaceBuilders = [VanillaLSTMBuilder(1, self.ldims, self.ldims * 0.5, self.model), + VanillaLSTMBuilder(1, self.ldims, self.ldims * 0.5, self.model)] + elif self.blstmFlag: + if self.layers > 0: + self.surfaceBuilders = [VanillaLSTMBuilder(self.layers, dims, self.ldims * 0.5, self.model), LSTMBuilder(self.layers, dims, self.ldims * 0.5, self.model)] + else: + self.surfaceBuilders = [SimpleRNNBuilder(1, dims, self.ldims * 0.5, self.model), LSTMBuilder(1, dims, self.ldims * 0.5, self.model)] + + + self.hidden_units = options.hidden_units + self.hidden2_units = options.hidden2_units + self.vocab['*PAD*'] = self.INDEX_WORD_PAD + self.cpos['*PAD*'] = self.INDEX_POS_PAD + self.feats['*PAD*'] = self.INDEX_FEATS_PAD + + self.vocab['*INITIAL*'] = self.INDEX_WORD_INITIAL + self.cpos['*INITIAL*'] = self.INDEX_POS_INITIAL + self.feats['*INITIAL*'] = self.INDEX_FEATS_INITIAL + + self.wlookup = self.model.add_lookup_parameters((len(words) + self.INIT_WORD_INDEX, self.wdims)) + self.plookup = self.model.add_lookup_parameters((len(cpos) + self.INIT_POS_INDEX, self.pdims)) + self.rlookup = self.model.add_lookup_parameters((len(rels), self.rdims)) + + + self.word2lstm = self.model.add_parameters((self.ldims, dims)) + + self.word2lstmbias = self.model.add_parameters((self.ldims)) + self.lstm2lstm = self.model.add_parameters((self.ldims, self.ldims * self.nnvecs + self.rdims)) + self.lstm2lstmbias = self.model.add_parameters((self.ldims)) + + self.hidLayer = self.model.add_parameters((self.hidden_units, self.ldims * self.nnvecs * (self.kl1 + self.kl2_l + self.kl2_r + self.kb))) + self.hidBias = self.model.add_parameters((self.hidden_units)) + + self.hid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units)) + self.hid2Bias = self.model.add_parameters((self.hidden2_units)) + + self.outLayer = self.model.add_parameters((self.SIZE_TRANSITIONS, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units)) + self.outBias = self.model.add_parameters((self.SIZE_TRANSITIONS)) + + self.rhidLayer = self.model.add_parameters((self.hidden_units, self.ldims * self.nnvecs * (self.kl1 + self.kl2_l + self.kl2_r + self.kb))) + self.rhidBias = self.model.add_parameters((self.hidden_units)) + + self.rhid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units)) + self.rhid2Bias = self.model.add_parameters((self.hidden2_units)) + + self.routLayer = self.model.add_parameters((2 * (len(self.irels) + 0) + 1, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units)) + self.routBias = self.model.add_parameters((2 * (len(self.irels) + 0) + 1)) + + self.pretrained = pretrained + + + def _assign_external_embeddings(self,option_external_embedding, + index_pad,index_initial): + """ + Reads an external embedding file + Returns: + external_embedding: A dictionary of key:embedding + edim: Dimension of the embedding + noextrn: ?? 
+ extrnd: Index for each key + elookup: Parameter lookup + """ + + + if option_external_embedding is not None: + + external_embedding_fp = open(option_external_embedding,'r') + external_embedding_fp.readline() + + external_embedding = {line.split(' ')[0] : [float(f) for f in line.strip().split(' ')[1:]] + for line in external_embedding_fp} + + + external_embedding_fp.close() + + edim = len(external_embedding.values()[0]) + noextrn = [0.0 for _ in xrange(edim)] + extrnd = {element: i + self.INIT_POS_INDEX + for i, element in enumerate(external_embedding)} + elookup = self.model.add_lookup_parameters((len(external_embedding) + self.INIT_WORD_INDEX, edim)) + + for element, i in extrnd.iteritems(): + elookup.init_row(i, external_embedding[element]) + extrnd['*PAD*'] = index_pad + extrnd['*INITIAL*'] = index_initial + + return external_embedding, edim, noextrn, extrnd, elookup + + return None,None,None,None,None + + + + def __evaluate(self, c, train): + """ + @param c: A CovingtonConfiguration instance + @param train: True if used in the training phase, False otherwise + Returns the scores for all possible transitions (training) + or the top ones (testing) for a given configuration c + """ + + #Gets the embeddings for the terms to be used in the prediction + top_l1 = [c.sentence[c.l1-i].lstms if c.l1 - i > 0 else [self.empty] for i in xrange(self.kl1)] + top_l2l = [c.sentence[c.l1+1+i].lstms if c.l1+1+i < c.b else [self.empty] for i in xrange(self.kl2_l)] + top_l2r = [c.sentence[c.b-i].lstms if c.b-i > c.l1 else [self.empty] for i in xrange(self.kl2_r)] + topBuffer = [c.sentence[c.b+i-1].lstms if c.b+i-1 <= c.sentence[-1].id else [self.empty] for i in xrange(self.kb)] + + input = concatenate(list(chain(*(top_l1 + top_l2l + top_l2r + topBuffer)))) + + if self.hidden2_units > 0: + routput = (self.routLayer.expr() * self.activation(self.rhid2Bias.expr() + self.rhid2Layer.expr() * self.activation(self.rhidLayer.expr() * input + self.rhidBias.expr())) + self.routBias.expr()) + else: + routput = (self.routLayer.expr() * self.activation(self.rhidLayer.expr() * input + self.rhidBias.expr()) + self.routBias.expr()) + + if self.hidden2_units > 0: + output = (self.outLayer.expr() * self.activation(self.hid2Bias.expr() + self.hid2Layer.expr() * self.activation(self.hidLayer.expr() * input + self.hidBias.expr())) + self.outBias.expr()) + else: + output = (self.outLayer.expr() * self.activation(self.hidLayer.expr() * input + self.hidBias.expr()) + self.outBias.expr()) + + scrs, uscrs = routput.value(), output.value() + + if train: + left_arc_info = [(rel,self.LEFT_ARC, scrs[1+j*2] + uscrs[self.LEFT_ARC], routput[1+j*2]+ output[self.LEFT_ARC]) + for j, rel in enumerate(self.irels) if c.l1 > 0 and c.l1 < c.b and c.b <= c.sentence[-1].id] + + right_arc_info = [(rel,self.RIGHT_ARC, scrs[2+j*2] + uscrs[self.RIGHT_ARC], routput[2+j*2]+ output[self.RIGHT_ARC]) + for j, rel in enumerate(self.irels) if c.l1 >= 0 and c.l1 < c.b and c.b <= c.sentence[-1].id] + + shift_info = [ (None, self.SHIFT, scrs[0] + uscrs[self.SHIFT], routput[0] + output[self.SHIFT]) ] if c.b <= c.sentence[-1].id else [] + + no_arc_info = [(None, self.NO_ARC,scrs[3] + uscrs[self.NO_ARC], routput[3] + output[self.NO_ARC] )] if c.l1> 0 and c.b <= c.sentence[-1].id else [] + + ret = [left_arc_info,right_arc_info, shift_info, no_arc_info] + + else: + #It is done different from the 'train' phase, due to the dynamic oracle. 
+ #In the test phase we already pick the most likely transition/dependency instead of returning them all + #and then selecting one according to the prediction of the dynamic oracle + sLEFT,rLEFT = max(zip(scrs[1::2],self.irels)) + sRIGHT,rRIGHT = max(zip(scrs[2::2],self.irels)) + sLEFT += uscrs[self.LEFT_ARC] + sRIGHT += uscrs[self.RIGHT_ARC] + ret = [ [(rLEFT, self.LEFT_ARC, sLEFT) ] if (c.l1 > 0 and c.l1 < c.b and c.b <= c.sentence[-1].id and self._is_valid_left_arc(c)) else [], + [(rRIGHT, self.RIGHT_ARC, sRIGHT) ] if (c.l1 >= 0 and c.l1 < c.b and c.b <= c.sentence[-1].id and self._is_valid_right_arc(c)) else [], + [(None, self.SHIFT, scrs[0] + uscrs[self.SHIFT]) ] if (c.b <= c.sentence[-1].id) else [], + [(None, self.NO_ARC,scrs[3] + uscrs[self.NO_ARC]) ] if (c.l1 > 0 and c.b <= c.sentence[-1].id) else [] + ] + return ret + + + def Save(self, filename): + self.model.save(filename) + + + def Load(self, filename): + self.model.load(filename) + + def Init(self): + evec = self.elookup[1] if self.external_embedding is not None else None + cpos_evec = self.cpos_elookup[1] if self.cpos_external_embedding is not None else None + pos_evec = self.pos_elookup[1] if self.pos_external_embedding is not None else None + feats_evec = self.feats_elookup[1] if self.feats_external_embedding is not None else None + # lemmas_evec = self.lemmas_elookup[1] if self.lemmas_external_embedding is not None else None + paddingWordVec = self.wlookup[1] + paddingPosVec = self.plookup[1] if self.pdims > 0 else None + # paddingVec = tanh(self.word2lstm.expr() * concatenate(filter(None, [paddingWordVec, paddingPosVec, evec, cpos_evec, pos_evec, feats_evec, lemmas_evec])) + self.word2lstmbias.expr()) + paddingVec = tanh(self.word2lstm.expr() * concatenate(filter(None, [paddingWordVec, paddingPosVec, evec, cpos_evec, pos_evec, feats_evec])) + self.word2lstmbias.expr()) + self.empty = paddingVec if self.nnvecs == 1 else concatenate([paddingVec for _ in xrange(self.nnvecs)]) + + + def getWordEmbeddings(self, sentence, train): + """ + Gets the embeddings (also external) for every term in a sentence + Returns a vector of all embeddings concatenated + """ + + for root in sentence: + c = float(self.wordsCount.get(root.norm, 0)) + dropFlag = not train or (random.random() < (c/(0.25+c))) + sys.stdout.flush() + root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if dropFlag else 0] + root.cposvec = self.plookup[int(self.cpos.get(root.cpos,0))] if self.pdims > 0 else None + + #For word embeddings + if self.external_embedding is not None: + if root.form in self.external_embedding: + root.evec = self.elookup[self.extrnd[root.form]] + elif root.norm in self.external_embedding: + root.evec = self.elookup[self.extrnd[root.norm]] + else: + if (self.oov_external_embedding is not None and root.form.replace(" ","_") in self.oov_external_embedding): + root.evec = self.oov_elookup[self.oov_extrnd[root.form.replace(" ","_")]] + else: + root.evec = self.elookup[0] + else: + root.evec = None + + #For cpostag embeddings + if self.cpos_external_embedding is not None: + if root.cpos in self.cpos_external_embedding: + root.cposevec = self.cpos_elookup[self.cpos_extrnd[root.cpos]] + else: + root.cposevec = self.cpos_elookup[0] + else: + root.cposevec = None + + #For postag embeddings + if self.pos_external_embedding is not None: + if root.pos in self.pos_external_embedding: + root.posevec = self.pos_elookup[self.pos_extrnd[root.pos]] + else: + root.posevec = self.pos_elookup[0] + else: + root.posevec = None +# + #For feats embeddings + if 
self.feats_external_embedding is not None: + if root.feats in self.feats_external_embedding: + root.featsevec = self.feats_elookup[self.feats_extrnd[root.feats]] + else: + root.featsevec = self.feats_elookup[0] + else: + root.featsevec = None + + + #For lemmas embeddings +# if self.lemmas_external_embedding is not None: +# if root.lemma in self.lemmas_external_embedding: +# root.lemmasevec = self.lemmas_elookup[self.lemmas_extrnd[root.lemma]] +# else: +# root.lemmasevec = self.lemmas_elookup[0] +# else: +# root.lemmasevec = None + + + # root.ivec = concatenate(filter(None, [root.wordvec, root.cposvec, root.evec, root.cposevec, root.posevec, root.featsevec, root.lemmasevec])) + root.ivec = concatenate(filter(None, [root.wordvec, root.cposvec, root.evec, root.cposevec, root.posevec, root.featsevec])) + + if self.blstmFlag: + forward = self.surfaceBuilders[0].initial_state() + backward = self.surfaceBuilders[1].initial_state() + + for froot, rroot in zip(sentence, reversed(sentence)): + forward = forward.add_input( froot.ivec ) + backward = backward.add_input( rroot.ivec ) + froot.fvec = forward.output() + rroot.bvec = backward.output() + for root in sentence: + root.vec = concatenate( [root.fvec, root.bvec] ) + + if self.bibiFlag: + bforward = self.bsurfaceBuilders[0].initial_state() + bbackward = self.bsurfaceBuilders[1].initial_state() + + for froot, rroot in zip(sentence, reversed(sentence)): + bforward = bforward.add_input( froot.vec ) + bbackward = bbackward.add_input( rroot.vec ) + froot.bfvec = bforward.output() + rroot.bbvec = bbackward.output() + for root in sentence: + root.vec = concatenate( [root.bfvec, root.bbvec] ) + + else: + for root in sentence: + root.ivec = (self.word2lstm.expr() * root.ivec) + self.word2lstmbias.expr() + root.vec = tanh( root.ivec ) + + + def Predict(self, conll_path): + """ + Makes non-projective depending parsing prediction given a ConLL-X file + """ + + + with open(conll_path, 'r') as conllFP: + for iSentence, sentence in enumerate(read_conll(conllFP)): + self.Init() + + l1 = sentence[0].id + b = sentence[1].id + arcs = set([]) + + self.getWordEmbeddings(sentence, False) + + for root in sentence: + root.lstms = [root.vec for _ in xrange(self.nnvecs)] + + hoffset = 1 if self.headFlag else 0 + + c = CovingtonConfiguration(l1,b,sentence,arcs) + while not self._is_final_state(b,sentence): + + transition_scores = self.__evaluate(c, False) + + + best = max(chain(*transition_scores), key = itemgetter(2) ) + + if best[1] == self.LEFT_ARC: + + sentence[l1].pred_parent_id = sentence[b].id + sentence[l1].pred_relation = best[0] + best_op = self.LEFT_ARC + if self.rlMostFlag: + sentence[b].lstms[best_op+hoffset] = sentence[l1].lstms[best_op+hoffset] + if self.rlFlag: + sentence[b].lstms[best_op+hoffset] = sentence[l1].vec + + arcs.add((b,l1)) + l1 = l1 -1 + + elif best[1] == self.RIGHT_ARC: + + sentence[b].pred_parent_id = sentence[l1].id + sentence[b].pred_relation = best[0] + + best_op = self.RIGHT_ARC + if self.rlMostFlag: + sentence[l1].lstms[best_op+hoffset] = sentence[b].lstms[best_op+hoffset] + if self.rlFlag: + sentence[l1].lstms[best_op+hoffset] = sentence[b].vec + + arcs.add((l1,b)) + l1 = l1-1 + + elif best[1] == self.SHIFT: + l1 = b + b = b + 1 + + + elif best[1] == self.NO_ARC: + l1 = l1 - 1 + + c = CovingtonConfiguration(l1,b,sentence,arcs) + renew_cg() + yield sentence + + + def Train(self, conll_path): + """ + Trains a O(n^2) Covington's parser with a O(n^2) dynamic oracle + """ + mloss = 0.0 + errors = 0 + batch = 0 + eloss = 0.0 + eerrors = 0 + 
lerrors = 0 + etotal = 0 + ltotal = 0 + ninf = -float('inf') + + hoffset = 1 if self.headFlag else 0 + + start = time.time() + + with open(conll_path, 'r') as conllFP: + shuffledData = list(read_conll(conllFP)) + + random.shuffle(shuffledData) + + + errs = [] + eeloss = 0.0 + + self.Init() + + for iSentence, sentence in enumerate(shuffledData): + if iSentence % 100 == 0 and iSentence != 0: + print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Labeled Errors:', (float(lerrors) / etotal) , 'Time', time.time()-start + start = time.time() + eerrors = 0 + eloss = 0.0 + etotal = 0 + lerrors = 0 + ltotal = 0 + + self.getWordEmbeddings(sentence, True) + #We obtain the gold arcs to then compute the dynamic oracle for covington + gold_arcs = set([]) + for word in sentence: + + #TODO: Weird error if not, adds and arc (0,0) + if word.id != word.parent_id: + gold_arcs.add((word.parent_id,word.id)) + + + l1 = sentence[0].id + b = sentence[1].id + arcs = set([]) + c = CovingtonConfiguration(l1,b,sentence,arcs) + loss_c = self._loss(c,gold_arcs, iSentence) + + for word in sentence: + word.lstms = [word.vec for _ in xrange(self.nnvecs)] + + hoffset = 1 if self.headFlag else 0 + + while not self._is_final_state(b,sentence): + + costs = [None,None,None,None] + transition_scores = self.__evaluate(c, True) + + #We determine if the transitions are valid for a given configuration c + for t in self.TRANSITIONS: + + l1_aux = l1 + b_aux = b + arcs_aux = set(arcs) + valid_transition = False + + if t == self.LEFT_ARC and self._is_valid_left_arc(c): + arcs_aux.add((b_aux,l1_aux)) + l1_aux = l1_aux -1 + valid_transition = True + + if t == self.RIGHT_ARC and l1 >=0 and self._is_valid_right_arc(c): + arcs_aux.add((l1_aux,b_aux)) + l1_aux = l1_aux-1 + valid_transition = True + + if t == self.NO_ARC and l1 >0: + l1_aux = l1_aux-1 + valid_transition = True + + if t == self.SHIFT: + l1_aux = b_aux + b_aux = b_aux + 1 + valid_transition = True + + if valid_transition: + + new_c = CovingtonConfiguration(l1_aux,b_aux,sentence,arcs_aux) + loss_new_c = self._loss(new_c,gold_arcs,iSentence) + + cost = loss_new_c - loss_c + costs[t] = float(cost) + + #Valid transitions are those with cost 0 + #If it is a LEFT/RIGHT arc, also the relation must match with the one in gold standard + valid_transitions = [s for s in chain(*transition_scores) if costs[s[1]] == 0 and (s[1] in [self.SHIFT,self.NO_ARC] + or ((s[1] == self.LEFT_ARC and s[0] == sentence[l1].relation) + or (s[1] == self.RIGHT_ARC and s[0] == sentence[b].relation)))] + + best_valid = max(valid_transitions, key=itemgetter(2)) + + wrong_transitions = [s for s in chain(*transition_scores) if costs[s[1]] is not None and ( (costs[s[1]] != 0) or (s[1] in [self.LEFT_ARC,self.RIGHT_ARC] + and ((s[1] == self.LEFT_ARC and s[0] != sentence[l1].relation) + or (s[1] == self.RIGHT_ARC and s[0] != sentence[b].relation))) ) ] + + #Aggressive exploration as done by Kiperwasser and Golberg (2016) + if wrong_transitions != []: + best_wrong = max(wrong_transitions, key=itemgetter(2)) + + best = best_valid if ( (not self.oracle) or (best_valid[2] - best_wrong[2] > 1.0) + or (best_valid[2] > best_wrong[2] and random.random() > 0.1) ) else best_wrong + else: + best = best_valid + + + #Moving a new configuration based on the "best" choice + if best[1] == self.LEFT_ARC: + + sentence[l1].pred_parent_id = sentence[b].id + sentence[l1].pred_relation = best[0] + + best_op = self.LEFT_ARC + if self.rlMostFlag: + sentence[b].lstms[best_op+hoffset] = 
sentence[l1].lstms[best_op+hoffset] + if self.rlFlag: + sentence[b].lstms[best_op+hoffset] = sentence[l1].vec + + child = sentence[l1] + arcs.add((b,l1)) + l1 = l1 -1 + + elif best[1] == self.RIGHT_ARC: + + + sentence[b].pred_parent_id = sentence[l1].id + sentence[b].pred_relation = best[0] + + best_op = self.RIGHT_ARC + if self.rlMostFlag: + sentence[l1].lstms[best_op+hoffset] = sentence[b].lstms[best_op+hoffset] + if self.rlFlag: + sentence[l1].lstms[best_op+hoffset] = sentence[b].vec + + arcs.add((l1,b)) + child = sentence[b] + l1 = l1-1 + + + elif best[1] == self.SHIFT: + l1 = b + child = sentence[b] + b = b + 1 + + + elif best[1] == self.NO_ARC: + l1 = l1 - 1 + child = sentence[l1] + + + if best_valid[2] < best_wrong[2] + 1.0: + loss = best_wrong[3] - best_valid[3] + mloss += 1.0 + best_wrong[2] - best_valid[2] + eloss += 1.0 + best_wrong[2] - best_valid[2] + errs.append(loss) + + + if best[1] not in [self.SHIFT, self.NO_ARC] and (child.pred_parent_id != child.parent_id or child.pred_relation != child.relation): + lerrors += 1 + if child.pred_parent_id != child.parent_id: + errors += 1 + eerrors += 1 + + etotal += 1 + c = CovingtonConfiguration(l1,b,sentence,arcs) + loss_c = self._loss(c,gold_arcs, iSentence) + + + if len(errs) > 50: + eerrs = esum(errs) + scalar_loss = eerrs.scalar_value() + eerrs.backward() + self.trainer.update() + errs = [] + lerrs = [] + + renew_cg() + self.Init() + + if len(errs) > 0: + eerrs = (esum(errs)) # * (1.0/(float(len(errs)))) + eerrs.scalar_value() + eerrs.backward() + self.trainer.update() + + errs = [] + lerrs = [] + + renew_cg() + + self.trainer.update_epoch() + print "Loss: ", mloss/iSentence + + + def _is_final_state(self,b,sentence): + return b >= len(sentence) + + + def _is_valid_left_arc(self,c): + + aux = set(c.A) + aux.add((c.b,c.l1)) + l1_has_head = self._y_has_head(c.A, c.b, c.l1) + return (c.l1 > 0 and not l1_has_head + and self._count_cycles(aux) == 0) + + + def _is_valid_right_arc(self,c): + + b_has_head = self._y_has_head(c.A, c.l1, c.b) + aux = set(c.A) + aux.add((c.l1,c.b)) + return ((not b_has_head) and self._count_cycles(aux) == 0) + + + """ + Gomez-Rodriguez & Fernandez-Gonzalez: + An Efficiente Dynamic Oracle for Unrestricted Non-Projective Parsing (ACL,2015) + Algorithm 1 + """ + def _loss(self, c, gold_arcs, iSentence): + + U = set([]) #set of unreachable nodes + non_built_arcs = gold_arcs.difference(c.A) + + + i = c.l1 + j = c.b + + for x,y in non_built_arcs: + left = min(x,y) #O(n) + right = max(x,y) #O(n) + if (j > right or (j==right and i < left) or self._y_has_head(c.A,x,y) + or self._weakly_connected(c.A, x, y,c, gold_arcs)): + U.add((x,y)) + + I = gold_arcs.difference(U) + + return len(U) + self._count_cycles( c.A.union(I)) + + + #TODO: This can be done more efficient + #O(n^2) + def _weakly_connected(self,A,x,y,c, gold_arcs): + + weakly_connected = False + end_path = False + parent = x + + while parent != 0 and not weakly_connected and not end_path and A != set([]): + if (parent,y) in A: + weakly_connected = True + break + else: + + for (a,b) in A: + if b == parent: + parent = a + break + else: + end_path = True + + + return weakly_connected + + + """ + Tarjan (1972) implementation at https://github.com/bwesterb/py-tarjan/ + O(n) + """ + def _count_cycles(self, A): + + d = {} + for a,b in A: + if a not in d: + d[a] = [b] + else: + d[a].append(b) + + return sum([1 for e in tarjan(d) if len(e) > 1]) + + + """ + Determines if node y has already a head + """ + #O(n) + def _y_has_head(self,A,x,y): + + for z,y_prime in A: + if 
y_prime == y and z != x:
+                return True
+        return False
+
+    #O(n)
+#    def violates_single_root(self, A):
+#        print A,[1 for (h,d) in A if h==0], len([1 for (h,d) in A if h==0]) != 0
+#        return len([1 for (h,d) in A if h==0]) != 0
+
diff --git a/bcovington/parser.py~ b/bcovington/parser.py~
new file mode 100644
index 0000000..e69de29
diff --git a/bcovington/src/covington.py b/bcovington/src/covington.py
new file mode 100644
index 0000000..e29dae5
--- /dev/null
+++ b/bcovington/src/covington.py
@@ -0,0 +1,875 @@
+from dynet import *
+from utils import read_conll, write_conll, CovingtonConfiguration
+from operator import itemgetter
+from itertools import chain
+from tarjan import tarjan
+import time, random
+import numpy as np
+import os
+import warnings
+
+
+"""
+This module extends the original transition-based BIST-Parser barchybrid:
+
+https://github.com/elikip/bist-parser/blob/master/barchybrid/
+Kiperwasser, E., & Goldberg, Y. (2016). Simple and accurate dependency parsing using bidirectional LSTM feature representations. arXiv preprint arXiv:1603.04351.
+
+
+It has been adapted to support non-projective transition-based dependency parsing,
+using an O(n^2) implementation of the traditional Covington (2001) algorithm,
+following the list-based transition system described in Nivre (2008).
+
+Covington, M. A. (2001). A fundamental algorithm for dependency parsing. In Proceedings of the 39th annual ACM southeast conference (pp. 95-102).
+Nivre, J. (2008). Algorithms for deterministic incremental dependency parsing. Computational Linguistics, 34(4), 513-553.
+
+We also include the O(n) dynamic oracle described in Gomez-Rodriguez and Fernandez-Gonzalez (2015).
+TODO: the current implementation of the oracle is O(n^2).
+
+Gomez-Rodriguez, C., & Fernandez-Gonzalez, D. (2015). An efficient dynamic oracle for unrestricted non-projective parsing. Volume 2: Short Papers, 256.
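+
+A rough sketch of the oracle loss used below (see _loss() and Train(); it
+follows Algorithm 1 of Gomez-Rodriguez and Fernandez-Gonzalez, 2015):
+
+    loss(c) = |U| + cycles(A union I)
+
+where A is the set of arcs already built in configuration c, U is the set of
+gold arcs that can no longer be built from c, and I is the set of remaining
+gold arcs. During training each candidate transition t gets
+cost(t) = loss(c') - loss(c), where c' is the configuration reached by t;
+transitions with zero cost (and, for arc transitions, the gold label) are
+treated as correct by the oracle.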
+ +""" + + + +class CovingtonBILSTM: + + #ACTIVATION FUNCTIONS + TANH = 'tanh' + SIGMOID = 'sigmoid' + RELU = 'relu' + TANH3 = 'tanh3' + + #OPTIMIZERS + SGD="sgd" + MOMENTUM="momentum" + ADAGRAD="adagrad" + ADADELTA="adadelta" + ADAM = "adam" + + #SPECIAL INDEXES + INDEX_WORD_PAD = 1 + INDEX_WORD_INITIAL = 2 + INDEX_POS_PAD = 1 + INDEX_POS_INITIAL = 2 + INIT_WORD_INDEX = 3 + INIT_POS_INDEX = INIT_WORD_INDEX + + INDEX_FEATS_PAD = 1 + INDEX_FEATS_INITIAL= 2 + INIT_FEATS_INDEX = INIT_WORD_INDEX + + #TRANSITIONS + LEFT_ARC = 0 + RIGHT_ARC = 1 + SHIFT = 2 + NO_ARC = 3 + TRANSITIONS = [LEFT_ARC, RIGHT_ARC, SHIFT, NO_ARC] + + #OTHER HYPERPARAMETERS + SIZE_TRANSITIONS = len(TRANSITIONS) + + def __init__(self, words, lemmas, cpos, pos, feats, rels, w2i, l2i, options, path_oov_external_embedding=None, + pretrained=False): + + self.model = Model() + if options.optimizer == self.ADAM: + self.trainer = AdamTrainer(self.model) + elif options.optimizer == self.SGD: + self.trainer = SimpleSGDTrainer(self.model) + elif options.optimizer == self.MOMENTUM: + self.trainer = MomentumSGDTrainer(self.model) + elif options.optimizer == self.ADAGRAD: + self.trainer = AdagradTrainer(self.model) + elif options.optimizer == self.ADADELTA: + self.trainer = AdadeltaTrainer(self.model) + else: + raise NotImplementedError("Selected optimizer is not available") + + random.seed(1) + + self.activations = {self.TANH: tanh, + self.SIGMOID: logistic, + self.RELU: rectify, + self.TANH3: (lambda x: tanh(cwise_multiply(cwise_multiply(x, x), x)))} + + self.activation = self.activations[options.activation] + + self.oracle = options.oracle + + + self.ldims = options.lstm_dims * 2 #*2 because it is a bi-lstm + self.wdims = options.wembedding_dims + self.pdims = options.pembedding_dims + self.rdims = options.rembedding_dims + self.layers = options.lstm_layers + self.wordsCount = words + + self.vocab = {word: ind+self.INIT_WORD_INDEX for word, ind in w2i.iteritems()} + self.lemmas = {lemma: ind+self.INIT_WORD_INDEX for lemma,ind in l2i.iteritems()} + self.cpos = {cpos: ind+self.INIT_POS_INDEX for ind, cpos in enumerate(cpos)} + self.pos = {pos: ind+self.INIT_POS_INDEX for ind, pos in enumerate(pos)} + self.feats = {f: ind+self.INIT_FEATS_INDEX for ind, f in enumerate(feats)} + self.rels = {word: ind for ind, word in enumerate(rels)} + + #List of dependency types + self.irels = rels + + self.headFlag = options.headFlag + self.rlMostFlag = options.rlMostFlag + self.rlFlag = options.rlFlag + self.kb = options.window_b + self.kl1 = options.window_l1 + self.kl2_r = options.window_l2r + self.kl2_l = options.window_l2l + + self.nnvecs = (1 if self.headFlag else 0) + (2 if self.rlFlag or self.rlMostFlag else 0) + + #Reading external embedding files, if they exists + + #INFORMATION FOR EXTERNAL WORD EMBEDDINGS + self.external_embedding = None + self.edim = None + self.noextrn = None + self.extrnd = None + self.elookup = None + if options.external_embedding is not None and os.path.exists(options.external_embedding): + self.external_embedding, self.edim,self.noextrn,self.extrnd, self.elookup = self._assign_external_embeddings(options.external_embedding, + self.INDEX_WORD_PAD, self.INDEX_WORD_INITIAL) + else: + warnings.warn("Not using any external file for FORM embeddings") + + #INFORMATION FOR THE EXTERNAL CPOSTAG EMBEDDINGS + self.cpos_external_embedding = None + self.cpos_edim = None + self.cpos_noextrn = None + self.cpos_extrnd = None + self.cpos_elookup = None + if options.cpos_external_embedding is not None and 
os.path.exists(options.cpos_external_embedding): + self.cpos_external_embedding, self.cpos_edim,self.cpos_noextrn,self.cpos_extrnd, self.cpos_elookup = self._assign_external_embeddings(options.cpos_external_embedding, + self.INDEX_POS_PAD, self.INDEX_POS_INITIAL) + else: + warnings.warn("Not using any external file for CPOSTAG embeddings") + + #INFORMATION FOR THE EXTERNAL POSTAG EMBEDDINGS + self.pos_external_embedding = None + self.pos_edim = None + self.pos_noextrn = None + self.pos_extrnd = None + self.pos_elookup= None + if options.pos_external_embedding is not None and os.path.exists(options.pos_external_embedding): + self.pos_external_embedding, self.pos_edim,self.pos_noextrn,self.pos_extrnd, self.pos_elookup = self._assign_external_embeddings(options.pos_external_embedding, + self.INDEX_POS_PAD, self.INDEX_POS_INITIAL) + else: + warnings.warn("Not using any external file for POSTAG embeddings") + + #INFORMATION FOR THE EXTERNAL FEATS EMBEDDINGS + self.feats_external_embedding = None + self.feats_edim = None + self.feats_noextrn = None + self.feats_extrnd = None + self.feats_elookup= None + + if options.feats_external_embedding is not None and os.path.exists(options.feats_external_embedding): + self.feats_external_embedding, self.feats_edim,self.feats_noextrn,self.feats_extrnd, self.feats_elookup = self._assign_external_embeddings(options.feats_external_embedding, self.INDEX_FEATS_PAD, self.INDEX_FEATS_INITIAL) + else: + warnings.warn("Not using any external file for FEATS embeddings") + + + #INFORMATION FOR THE EXTERNAL FEATS EMBEDDINGS +# self.lemmas_external_embedding = None +# self.lemmas_edim = None +# self.lemmas_noextrn = None +# self.lemmas_extrnd = None +# self.lemmas_elookup= None + +# if options.lemmas_external_embedding is not None and os.path.exists(options.lemmas_external_embedding): +# self.lemmas_external_embedding, self.lemmas_edim,self.lemmas_noextrn,self.lemmas_extrnd, self.lemmas_elookup = self._assign_external_embeddings(options.lemmas_external_embedding, self.INDEX_FEATS_PAD, self.INDEX_FEATS_INITIAL) +# else: +# warnings.warn("Not using any external file for LEMMAS embeddings") + + + + + self.oov_external_embedding = None + self.oov_edim = None + self.oov_noextrn = None + self.oov_extrnd = None + self.oov_elookup = None + + + if path_oov_external_embedding is not None and os.path.exists(options.feats_external_embedding): + self.oov_external_embedding, self.oov_edim,self.oov_noextrn,self.oov_extrnd, self.oov_elookup = self._assign_external_embeddings(path_oov_external_embedding, + self.INDEX_WORD_PAD, self.INDEX_WORD_INITIAL) + + if self.oov_external_embedding is not None and self.oov_edim != self.edim: + raise ValueError("The dimensions of the embeddings for OOV words is not equal to the dimension of the rest of external word embeddings (self.oov_edim != self.edim)") + + #Obtaining the dimension of the input + dims = (self.wdims + self.pdims + (self.edim if self.external_embedding is not None else 0) + + (self.cpos_edim if self.cpos_external_embedding is not None else 0) + + (self.pos_edim if self.pos_external_embedding is not None else 0)+ + (self.feats_edim if self.feats_external_embedding is not None else 0) +# + +# (self.lemmas_edim if self.lemmas_external_embedding is not None else 0) + ) + + + #Initialization of the architecture + + self.blstmFlag = options.blstmFlag + self.bibiFlag = options.bibiFlag + + if self.bibiFlag: + self.surfaceBuilders = [VanillaLSTMBuilder(1, dims, self.ldims * 0.5, self.model), + VanillaLSTMBuilder(1, dims, self.ldims * 0.5, 
self.model)] + self.bsurfaceBuilders = [VanillaLSTMBuilder(1, self.ldims, self.ldims * 0.5, self.model), + VanillaLSTMBuilder(1, self.ldims, self.ldims * 0.5, self.model)] + elif self.blstmFlag: + if self.layers > 0: + self.surfaceBuilders = [VanillaLSTMBuilder(self.layers, dims, self.ldims * 0.5, self.model), LSTMBuilder(self.layers, dims, self.ldims * 0.5, self.model)] + else: + self.surfaceBuilders = [SimpleRNNBuilder(1, dims, self.ldims * 0.5, self.model), LSTMBuilder(1, dims, self.ldims * 0.5, self.model)] + + + self.hidden_units = options.hidden_units + self.hidden2_units = options.hidden2_units + self.vocab['*PAD*'] = self.INDEX_WORD_PAD + self.cpos['*PAD*'] = self.INDEX_POS_PAD + self.feats['*PAD*'] = self.INDEX_FEATS_PAD + + self.vocab['*INITIAL*'] = self.INDEX_WORD_INITIAL + self.cpos['*INITIAL*'] = self.INDEX_POS_INITIAL + self.feats['*INITIAL*'] = self.INDEX_FEATS_INITIAL + + self.wlookup = self.model.add_lookup_parameters((len(words) + self.INIT_WORD_INDEX, self.wdims)) + self.plookup = self.model.add_lookup_parameters((len(cpos) + self.INIT_POS_INDEX, self.pdims)) + self.rlookup = self.model.add_lookup_parameters((len(rels), self.rdims)) + + + self.word2lstm = self.model.add_parameters((self.ldims, dims)) + + self.word2lstmbias = self.model.add_parameters((self.ldims)) + self.lstm2lstm = self.model.add_parameters((self.ldims, self.ldims * self.nnvecs + self.rdims)) + self.lstm2lstmbias = self.model.add_parameters((self.ldims)) + + self.hidLayer = self.model.add_parameters((self.hidden_units, self.ldims * self.nnvecs * (self.kl1 + self.kl2_l + self.kl2_r + self.kb))) + self.hidBias = self.model.add_parameters((self.hidden_units)) + + self.hid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units)) + self.hid2Bias = self.model.add_parameters((self.hidden2_units)) + + self.outLayer = self.model.add_parameters((self.SIZE_TRANSITIONS, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units)) + self.outBias = self.model.add_parameters((self.SIZE_TRANSITIONS)) + + self.rhidLayer = self.model.add_parameters((self.hidden_units, self.ldims * self.nnvecs * (self.kl1 + self.kl2_l + self.kl2_r + self.kb))) + self.rhidBias = self.model.add_parameters((self.hidden_units)) + + self.rhid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units)) + self.rhid2Bias = self.model.add_parameters((self.hidden2_units)) + + self.routLayer = self.model.add_parameters((2 * (len(self.irels) + 0) + 1, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units)) + self.routBias = self.model.add_parameters((2 * (len(self.irels) + 0) + 1)) + + self.pretrained = pretrained + + + def _assign_external_embeddings(self,option_external_embedding, + index_pad,index_initial): + """ + Reads an external embedding file + Returns: + external_embedding: A dictionary of key:embedding + edim: Dimension of the embedding + noextrn: ?? 
+ extrnd: Index for each key + elookup: Parameter lookup + """ + + + if option_external_embedding is not None: + + external_embedding_fp = open(option_external_embedding,'r') + external_embedding_fp.readline() + + external_embedding = {line.split(' ')[0] : [float(f) for f in line.strip().split(' ')[1:]] + for line in external_embedding_fp} + + + external_embedding_fp.close() + + edim = len(external_embedding.values()[0]) + noextrn = [0.0 for _ in xrange(edim)] + extrnd = {element: i + self.INIT_POS_INDEX + for i, element in enumerate(external_embedding)} + elookup = self.model.add_lookup_parameters((len(external_embedding) + self.INIT_WORD_INDEX, edim)) + + for element, i in extrnd.iteritems(): + elookup.init_row(i, external_embedding[element]) + extrnd['*PAD*'] = index_pad + extrnd['*INITIAL*'] = index_initial + + return external_embedding, edim, noextrn, extrnd, elookup + + return None,None,None,None,None + + + + def __evaluate(self, c, train): + """ + @param c: A CovingtonConfiguration instance + @param train: True if used in the training phase, False otherwise + Returns the scores for all possible transitions (training) + or the top ones (testing) for a given configuration c + """ + + #Gets the embeddings for the terms to be used in the prediction + top_l1 = [c.sentence[c.l1-i].lstms if c.l1 - i > 0 else [self.empty] for i in xrange(self.kl1)] + top_l2l = [c.sentence[c.l1+1+i].lstms if c.l1+1+i < c.b else [self.empty] for i in xrange(self.kl2_l)] + top_l2r = [c.sentence[c.b-i].lstms if c.b-i > c.l1 else [self.empty] for i in xrange(self.kl2_r)] + topBuffer = [c.sentence[c.b+i-1].lstms if c.b+i-1 <= c.sentence[-1].id else [self.empty] for i in xrange(self.kb)] + + input = concatenate(list(chain(*(top_l1 + top_l2l + top_l2r + topBuffer)))) + + if self.hidden2_units > 0: + routput = (self.routLayer.expr() * self.activation(self.rhid2Bias.expr() + self.rhid2Layer.expr() * self.activation(self.rhidLayer.expr() * input + self.rhidBias.expr())) + self.routBias.expr()) + else: + routput = (self.routLayer.expr() * self.activation(self.rhidLayer.expr() * input + self.rhidBias.expr()) + self.routBias.expr()) + + if self.hidden2_units > 0: + output = (self.outLayer.expr() * self.activation(self.hid2Bias.expr() + self.hid2Layer.expr() * self.activation(self.hidLayer.expr() * input + self.hidBias.expr())) + self.outBias.expr()) + else: + output = (self.outLayer.expr() * self.activation(self.hidLayer.expr() * input + self.hidBias.expr()) + self.outBias.expr()) + + scrs, uscrs = routput.value(), output.value() + + if train: + left_arc_info = [(rel,self.LEFT_ARC, scrs[1+j*2] + uscrs[self.LEFT_ARC], routput[1+j*2]+ output[self.LEFT_ARC]) + for j, rel in enumerate(self.irels) if c.l1 > 0 and c.l1 < c.b and c.b <= c.sentence[-1].id] + + right_arc_info = [(rel,self.RIGHT_ARC, scrs[2+j*2] + uscrs[self.RIGHT_ARC], routput[2+j*2]+ output[self.RIGHT_ARC]) + for j, rel in enumerate(self.irels) if c.l1 >= 0 and c.l1 < c.b and c.b <= c.sentence[-1].id] + + shift_info = [ (None, self.SHIFT, scrs[0] + uscrs[self.SHIFT], routput[0] + output[self.SHIFT]) ] if c.b <= c.sentence[-1].id else [] + + no_arc_info = [(None, self.NO_ARC,scrs[3] + uscrs[self.NO_ARC], routput[3] + output[self.NO_ARC] )] if c.l1> 0 and c.b <= c.sentence[-1].id else [] + + ret = [left_arc_info,right_arc_info, shift_info, no_arc_info] + + else: + #It is done different from the 'train' phase, due to the dynamic oracle. 
+ #In the test phase we already pick the most likely transition/dependency instead of returning them all + #and then selecting one according to the prediction of the dynamic oracle + sLEFT,rLEFT = max(zip(scrs[1::2],self.irels)) + sRIGHT,rRIGHT = max(zip(scrs[2::2],self.irels)) + sLEFT += uscrs[self.LEFT_ARC] + sRIGHT += uscrs[self.RIGHT_ARC] + ret = [ [(rLEFT, self.LEFT_ARC, sLEFT) ] if (c.l1 > 0 and c.l1 < c.b and c.b <= c.sentence[-1].id and self._is_valid_left_arc(c)) else [], + [(rRIGHT, self.RIGHT_ARC, sRIGHT) ] if (c.l1 >= 0 and c.l1 < c.b and c.b <= c.sentence[-1].id and self._is_valid_right_arc(c)) else [], + [(None, self.SHIFT, scrs[0] + uscrs[self.SHIFT]) ] if (c.b <= c.sentence[-1].id) else [], + [(None, self.NO_ARC,scrs[3] + uscrs[self.NO_ARC]) ] if (c.l1 > 0 and c.b <= c.sentence[-1].id) else [] + ] + return ret + + + def Save(self, filename): + self.model.save(filename) + + + def Load(self, filename): + self.model.load(filename) + + def Init(self): + evec = self.elookup[1] if self.external_embedding is not None else None + cpos_evec = self.cpos_elookup[1] if self.cpos_external_embedding is not None else None + pos_evec = self.pos_elookup[1] if self.pos_external_embedding is not None else None + feats_evec = self.feats_elookup[1] if self.feats_external_embedding is not None else None + # lemmas_evec = self.lemmas_elookup[1] if self.lemmas_external_embedding is not None else None + paddingWordVec = self.wlookup[1] + paddingPosVec = self.plookup[1] if self.pdims > 0 else None + # paddingVec = tanh(self.word2lstm.expr() * concatenate(filter(None, [paddingWordVec, paddingPosVec, evec, cpos_evec, pos_evec, feats_evec, lemmas_evec])) + self.word2lstmbias.expr()) + paddingVec = tanh(self.word2lstm.expr() * concatenate(filter(None, [paddingWordVec, paddingPosVec, evec, cpos_evec, pos_evec, feats_evec])) + self.word2lstmbias.expr()) + self.empty = paddingVec if self.nnvecs == 1 else concatenate([paddingVec for _ in xrange(self.nnvecs)]) + + + def getWordEmbeddings(self, sentence, train): + """ + Gets the embeddings (also external) for every term in a sentence + Returns a vector of all embeddings concatenated + """ + + for root in sentence: + c = float(self.wordsCount.get(root.norm, 0)) + dropFlag = not train or (random.random() < (c/(0.25+c))) + sys.stdout.flush() + root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if dropFlag else 0] + root.cposvec = self.plookup[int(self.cpos.get(root.cpos,0))] if self.pdims > 0 else None + + #For word embeddings + if self.external_embedding is not None: + if root.form in self.external_embedding: + root.evec = self.elookup[self.extrnd[root.form]] + elif root.norm in self.external_embedding: + root.evec = self.elookup[self.extrnd[root.norm]] + else: + if (self.oov_external_embedding is not None and root.form.replace(" ","_") in self.oov_external_embedding): + root.evec = self.oov_elookup[self.oov_extrnd[root.form.replace(" ","_")]] + else: + root.evec = self.elookup[0] + else: + root.evec = None + + #For cpostag embeddings + if self.cpos_external_embedding is not None: + if root.cpos in self.cpos_external_embedding: + root.cposevec = self.cpos_elookup[self.cpos_extrnd[root.cpos]] + else: + root.cposevec = self.cpos_elookup[0] + else: + root.cposevec = None + + #For postag embeddings + if self.pos_external_embedding is not None: + if root.pos in self.pos_external_embedding: + root.posevec = self.pos_elookup[self.pos_extrnd[root.pos]] + else: + root.posevec = self.pos_elookup[0] + else: + root.posevec = None +# + #For feats embeddings + if 
self.feats_external_embedding is not None: + if root.feats in self.feats_external_embedding: + root.featsevec = self.feats_elookup[self.feats_extrnd[root.feats]] + else: + root.featsevec = self.feats_elookup[0] + else: + root.featsevec = None + + + #For lemmas embeddings +# if self.lemmas_external_embedding is not None: +# if root.lemma in self.lemmas_external_embedding: +# root.lemmasevec = self.lemmas_elookup[self.lemmas_extrnd[root.lemma]] +# else: +# root.lemmasevec = self.lemmas_elookup[0] +# else: +# root.lemmasevec = None + + + # root.ivec = concatenate(filter(None, [root.wordvec, root.cposvec, root.evec, root.cposevec, root.posevec, root.featsevec, root.lemmasevec])) + root.ivec = concatenate(filter(None, [root.wordvec, root.cposvec, root.evec, root.cposevec, root.posevec, root.featsevec])) + + if self.blstmFlag: + forward = self.surfaceBuilders[0].initial_state() + backward = self.surfaceBuilders[1].initial_state() + + for froot, rroot in zip(sentence, reversed(sentence)): + forward = forward.add_input( froot.ivec ) + backward = backward.add_input( rroot.ivec ) + froot.fvec = forward.output() + rroot.bvec = backward.output() + for root in sentence: + root.vec = concatenate( [root.fvec, root.bvec] ) + + if self.bibiFlag: + bforward = self.bsurfaceBuilders[0].initial_state() + bbackward = self.bsurfaceBuilders[1].initial_state() + + for froot, rroot in zip(sentence, reversed(sentence)): + bforward = bforward.add_input( froot.vec ) + bbackward = bbackward.add_input( rroot.vec ) + froot.bfvec = bforward.output() + rroot.bbvec = bbackward.output() + for root in sentence: + root.vec = concatenate( [root.bfvec, root.bbvec] ) + + else: + for root in sentence: + root.ivec = (self.word2lstm.expr() * root.ivec) + self.word2lstmbias.expr() + root.vec = tanh( root.ivec ) + + + def Predict(self, conll_path): + """ + Makes non-projective depending parsing prediction given a ConLL-X file + """ + + + with open(conll_path, 'r') as conllFP: + for iSentence, sentence in enumerate(read_conll(conllFP)): + self.Init() + + l1 = sentence[0].id + b = sentence[1].id + arcs = set([]) + + self.getWordEmbeddings(sentence, False) + + for root in sentence: + root.lstms = [root.vec for _ in xrange(self.nnvecs)] + + hoffset = 1 if self.headFlag else 0 + + c = CovingtonConfiguration(l1,b,sentence,arcs) + while not self._is_final_state(b,sentence): + + transition_scores = self.__evaluate(c, False) + + + best = max(chain(*transition_scores), key = itemgetter(2) ) + + if best[1] == self.LEFT_ARC: + + sentence[l1].pred_parent_id = sentence[b].id + sentence[l1].pred_relation = best[0] + best_op = self.LEFT_ARC + if self.rlMostFlag: + sentence[b].lstms[best_op+hoffset] = sentence[l1].lstms[best_op+hoffset] + if self.rlFlag: + sentence[b].lstms[best_op+hoffset] = sentence[l1].vec + + arcs.add((b,l1)) + l1 = l1 -1 + + elif best[1] == self.RIGHT_ARC: + + sentence[b].pred_parent_id = sentence[l1].id + sentence[b].pred_relation = best[0] + + best_op = self.RIGHT_ARC + if self.rlMostFlag: + sentence[l1].lstms[best_op+hoffset] = sentence[b].lstms[best_op+hoffset] + if self.rlFlag: + sentence[l1].lstms[best_op+hoffset] = sentence[b].vec + + arcs.add((l1,b)) + l1 = l1-1 + + elif best[1] == self.SHIFT: + l1 = b + b = b + 1 + + + elif best[1] == self.NO_ARC: + l1 = l1 - 1 + + c = CovingtonConfiguration(l1,b,sentence,arcs) + renew_cg() + yield sentence + + + def Train(self, conll_path): + """ + Trains a O(n^2) Covington's parser with a O(n^2) dynamic oracle + """ + mloss = 0.0 + errors = 0 + batch = 0 + eloss = 0.0 + eerrors = 0 + 
lerrors = 0 + etotal = 0 + ltotal = 0 + ninf = -float('inf') + + hoffset = 1 if self.headFlag else 0 + + start = time.time() + + with open(conll_path, 'r') as conllFP: + shuffledData = list(read_conll(conllFP)) + + random.shuffle(shuffledData) + + + errs = [] + eeloss = 0.0 + + self.Init() + + for iSentence, sentence in enumerate(shuffledData): + if iSentence % 100 == 0 and iSentence != 0: + print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Labeled Errors:', (float(lerrors) / etotal) , 'Time', time.time()-start + start = time.time() + eerrors = 0 + eloss = 0.0 + etotal = 0 + lerrors = 0 + ltotal = 0 + + self.getWordEmbeddings(sentence, True) + #We obtain the gold arcs to then compute the dynamic oracle for covington + gold_arcs = set([]) + for word in sentence: + + #TODO: Weird error if not, adds and arc (0,0) + if word.id != word.parent_id: + gold_arcs.add((word.parent_id,word.id)) + + + l1 = sentence[0].id + b = sentence[1].id + arcs = set([]) + c = CovingtonConfiguration(l1,b,sentence,arcs) + loss_c = self._loss(c,gold_arcs, iSentence) + + for word in sentence: + word.lstms = [word.vec for _ in xrange(self.nnvecs)] + + hoffset = 1 if self.headFlag else 0 + + while not self._is_final_state(b,sentence): + + costs = [None,None,None,None] + transition_scores = self.__evaluate(c, True) + + #We determine if the transitions are valid for a given configuration c + for t in self.TRANSITIONS: + + l1_aux = l1 + b_aux = b + arcs_aux = set(arcs) + valid_transition = False + + if t == self.LEFT_ARC and self._is_valid_left_arc(c): + arcs_aux.add((b_aux,l1_aux)) + l1_aux = l1_aux -1 + valid_transition = True + + if t == self.RIGHT_ARC and l1 >=0 and self._is_valid_right_arc(c): + arcs_aux.add((l1_aux,b_aux)) + l1_aux = l1_aux-1 + valid_transition = True + + if t == self.NO_ARC and l1 >0: + l1_aux = l1_aux-1 + valid_transition = True + + if t == self.SHIFT: + l1_aux = b_aux + b_aux = b_aux + 1 + valid_transition = True + + if valid_transition: + + new_c = CovingtonConfiguration(l1_aux,b_aux,sentence,arcs_aux) + loss_new_c = self._loss(new_c,gold_arcs,iSentence) + + cost = loss_new_c - loss_c + costs[t] = float(cost) + + #Valid transitions are those with cost 0 + #If it is a LEFT/RIGHT arc, also the relation must match with the one in gold standard + valid_transitions = [s for s in chain(*transition_scores) if costs[s[1]] == 0 and (s[1] in [self.SHIFT,self.NO_ARC] + or ((s[1] == self.LEFT_ARC and s[0] == sentence[l1].relation) + or (s[1] == self.RIGHT_ARC and s[0] == sentence[b].relation)))] + + best_valid = max(valid_transitions, key=itemgetter(2)) + + wrong_transitions = [s for s in chain(*transition_scores) if costs[s[1]] is not None and ( (costs[s[1]] != 0) or (s[1] in [self.LEFT_ARC,self.RIGHT_ARC] + and ((s[1] == self.LEFT_ARC and s[0] != sentence[l1].relation) + or (s[1] == self.RIGHT_ARC and s[0] != sentence[b].relation))) ) ] + + #Aggressive exploration as done by Kiperwasser and Golberg (2016) + if wrong_transitions != []: + best_wrong = max(wrong_transitions, key=itemgetter(2)) + + best = best_valid if ( (not self.oracle) or (best_valid[2] - best_wrong[2] > 1.0) + or (best_valid[2] > best_wrong[2] and random.random() > 0.1) ) else best_wrong + else: + best = best_valid + + + #Moving a new configuration based on the "best" choice + if best[1] == self.LEFT_ARC: + + sentence[l1].pred_parent_id = sentence[b].id + sentence[l1].pred_relation = best[0] + + best_op = self.LEFT_ARC + if self.rlMostFlag: + sentence[b].lstms[best_op+hoffset] = 
sentence[l1].lstms[best_op+hoffset] + if self.rlFlag: + sentence[b].lstms[best_op+hoffset] = sentence[l1].vec + + child = sentence[l1] + arcs.add((b,l1)) + l1 = l1 -1 + + elif best[1] == self.RIGHT_ARC: + + + sentence[b].pred_parent_id = sentence[l1].id + sentence[b].pred_relation = best[0] + + best_op = self.RIGHT_ARC + if self.rlMostFlag: + sentence[l1].lstms[best_op+hoffset] = sentence[b].lstms[best_op+hoffset] + if self.rlFlag: + sentence[l1].lstms[best_op+hoffset] = sentence[b].vec + + arcs.add((l1,b)) + child = sentence[b] + l1 = l1-1 + + + elif best[1] == self.SHIFT: + l1 = b + child = sentence[b] + b = b + 1 + + + elif best[1] == self.NO_ARC: + l1 = l1 - 1 + child = sentence[l1] + + + if best_valid[2] < best_wrong[2] + 1.0: + loss = best_wrong[3] - best_valid[3] + mloss += 1.0 + best_wrong[2] - best_valid[2] + eloss += 1.0 + best_wrong[2] - best_valid[2] + errs.append(loss) + + + if best[1] not in [self.SHIFT, self.NO_ARC] and (child.pred_parent_id != child.parent_id or child.pred_relation != child.relation): + lerrors += 1 + if child.pred_parent_id != child.parent_id: + errors += 1 + eerrors += 1 + + etotal += 1 + c = CovingtonConfiguration(l1,b,sentence,arcs) + loss_c = self._loss(c,gold_arcs, iSentence) + + + if len(errs) > 50: + eerrs = esum(errs) + scalar_loss = eerrs.scalar_value() + eerrs.backward() + self.trainer.update() + errs = [] + lerrs = [] + + renew_cg() + self.Init() + + if len(errs) > 0: + eerrs = (esum(errs)) # * (1.0/(float(len(errs)))) + eerrs.scalar_value() + eerrs.backward() + self.trainer.update() + + errs = [] + lerrs = [] + + renew_cg() + + self.trainer.update_epoch() + print "Loss: ", mloss/iSentence + + + def _is_final_state(self,b,sentence): + return b >= len(sentence) + + + def _is_valid_left_arc(self,c): + + aux = set(c.A) + aux.add((c.b,c.l1)) + l1_has_head = self._y_has_head(c.A, c.b, c.l1) + return (c.l1 > 0 and not l1_has_head + and self._count_cycles(aux) == 0) + + + def _is_valid_right_arc(self,c): + + b_has_head = self._y_has_head(c.A, c.l1, c.b) + aux = set(c.A) + aux.add((c.l1,c.b)) + return ((not b_has_head) and self._count_cycles(aux) == 0) + + + """ + Gomez-Rodriguez & Fernandez-Gonzalez: + An Efficiente Dynamic Oracle for Unrestricted Non-Projective Parsing (ACL,2015) + Algorithm 1 + """ + def _loss(self, c, gold_arcs, iSentence): + + U = set([]) #set of unreachable nodes + non_built_arcs = gold_arcs.difference(c.A) + + + i = c.l1 + j = c.b + + for x,y in non_built_arcs: + left = min(x,y) #O(n) + right = max(x,y) #O(n) + if (j > right or (j==right and i < left) or self._y_has_head(c.A,x,y) + or self._weakly_connected(c.A, x, y,c, gold_arcs)): + U.add((x,y)) + + I = gold_arcs.difference(U) + + return len(U) + self._count_cycles( c.A.union(I)) + + + #TODO: This can be done more efficient + #O(n^2) + def _weakly_connected(self,A,x,y,c, gold_arcs): + + weakly_connected = False + end_path = False + parent = x + + while parent != 0 and not weakly_connected and not end_path and A != set([]): + if (parent,y) in A: + weakly_connected = True + break + else: + + for (a,b) in A: + if b == parent: + parent = a + break + else: + end_path = True + + + return weakly_connected + + + """ + Tarjan (1972) implementation at https://github.com/bwesterb/py-tarjan/ + O(n) + """ + def _count_cycles(self, A): + + d = {} + for a,b in A: + if a not in d: + d[a] = [b] + else: + d[a].append(b) + + return sum([1 for e in tarjan(d) if len(e) > 1]) + + + """ + Determines if node y has already a head + """ + #O(n) + def _y_has_head(self,A,x,y): + + for z,y_prime in A: + if 
y_prime == y and z != x: + return True + return False + + #O(n) +# def violates_single_root(self, A): +# print A,[1 for (h,d) in A if h==0], len([1 for (h,d) in A if h==0]) != 0 +# return len([1 for (h,d) in A if h==0]) != 0 + diff --git a/bcovington/src/covington.pyc b/bcovington/src/covington.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c52ffdf5bb5b35491481800b0996b6ade7e7316 GIT binary patch literal 24143 zcmcJ1eQ;dYb>Drv_$I&)d=LOXmtP`CNu)?hqQy|6D1w9_ks$Q|k`gRvwb*?C7F_HC z?>&&94cM_MJC2@NrXOi%(n+Vz zq%)q*xWC`;zW4Sm04hpF6WqJ^+a4(Hs9iOsI`R!xs7fw8@+?^FFw!S@@l*LZ!#+iARhGmCi#j5lb! z0rLP@?J}jk#@k~`UFIF2x7U=mlJk@MOlg}k`%P)PGC5P)q09kO>Q?5UDfK9G$dr1O zIc!RO${aDJoyt6BO8v?lHKhS%hD>QtnPaB3OPS-Qv|E|SO=*uZPZ)Q6Xdk&wmWGAu zT-Zv!mS-1g<&r9Wud*C+o!ELNq$C8hrLYa(3$+EW8ZIxWPoY$p^$WHE`I1)&3mO20 zg)+scoMFNN%J`UKCz7t{qhX(cNQIKdFBp?|%xb%t&lu$Av*u35_-BYFiw;>zm}XP` z^%Pi(z$Oh4S`YS_6j+gA@0gXG!aT3Im#X&@<|p{Lfh!!gCEf5O z%cS~*C_1V9N#&nX{wd{8DSt}&)5_b0d|LTuRR65?9O^P(mb5GsXjWKv7&b%6D zuEv>bno^KN?k^jW8|84QtRpwdDd)$OUQZ#X`ur>su{e`EY$b zhqVH7&Pv!p(_hDVc|CYE%-zO%@x5`FX9V$_rR}W!I%E2K>to+rKSnh6N@KlU!E%bW z`|AOsj-P0(|D=lZvAq68J4u~4Rc+i5l{zM?>pMmRt~J*Ge^L3V^@mr)aT1y;82W7?u~xpYZ%#ImlIOVXknuP_7iFE1rz;A}w-$XxfkC6YN0(>l+>) z{;%Qzh9wS-rxeqU%VQfGq7m=55q2w*;=OX<LH%cT={frjq|Cuhsy$?0+sKDk)%1J6G>U5vfQlS%Hx;(fUq=PQMU=~Cf^w=h<~ zq2)}72)YhU`1IftJjN#BKDEcZnK5gQSpyN)i!i1i8sWW+S)!v0NMjMEt2sJ0I+4G9 z-5t)4jtyUa-Fm#F^~6*1V(!)alAND%wPbs7&cLh8JQ5-UddKHWlsqli zjqBs7^%7|!NJ~~+tOY+_%COBnTFTsA7_q)gmKF;5kDZj-qb4QXjuEv;S1w)!%60cz&RdxFN+kdV!RZgt=MtWwiHjrYZl^v( zw~f7?`VhS&3hAl-COS>bdAU*34i4h&%v)Lp>0wR4 zN%y!n_Lq?Pgh%X~9&?DQ`$VmFH}9cy?!faoRj_bu7D(aWf4UHGb3A_I;_&gHojHp0 z=*6qYhdzgrbm+Aj16>4LP>x{9&MPf+arojj#Y!(C4mRN<*6H~Qn1xZEkN5b@egaN~qzjdE%R^t{q#->>;0qp^OWTBAc_AV8xgX z6_HS_-t~$W!LeG^lk~-pQ$Asz2;ftrHLunH;aAmPxss~AO09Nhc~Mw`63WkFA)JfA zAU;7z-sB4bBG1+Q##sUhwk$Jiq~VzqLaD0g;gnF`EBIAvKuyI$NHd29!6t)?YqpJa zFf6T7ObjI=?&?=db%2;G2C*$7v`2{Aj)s~TGj13UL4k{@CSt$|S%?wf`bGUXf7~2T zPrFURv>9#e8#QuhqH2rr;3PNc&Ee}H@dVaBhE<%mTRADD(y%ng+q+a_NE)S8qs=KL zDWox0xFUju7!BeX^^0j#F_IiM-zrp=J*%3i7N+@>CQrh2dAh6umE^5LwOpwbcxLp{ za=B81#w88GvOiNOdQnv%jXS-*lH5&qq#0w?vO6{wk4C1=l}jbBnqRJ#A--gqtvDUk z(9ABBN_k5_9zITC!GpF4^jKEBs!Vb{9``Jaj6YQ@MpRR!`ca-XFULy_L@rPtohB!Q zLQ2=_(bS<^=^W#T}79ZCfI+(`phsMkg=lC)|r;<5=_SW8-?{ zYRh4LAQX~;e2i2dh$S^~MOla@AuTG_`z0e10Y4s+{T6-C?w+L~GG3mG)S9%jdxdq0 zvUOi(aTJL-fA4hJt^mHtU+&g0JEVZ9l=6`j%CxFWulu(-05umLqhJW%DX;0H+J0hS zK3|3?0_`jSW+v#*Wed|A&&P-RP%jv*hPo^i+ncePmMt|*@{Z9Nhm~GF*_4>{{ z-Q4cPe}~cIOr|Rvm)o5WQIk3BT*!2FCw;O9Fy5KW0cTrt!P(!2Su*X;9_I*fcpNR; zoL4f#prO;*&cBRJA4l8a?7qx^bJ7{e4C8YEpXXBfSDoXTB00^q)H~ZCrx#lwN{>?y@e>jv{`G552SP$27$o<=*Hu$wRVMTZ~)d$!LSWqK=7vn zUTaGf&RXj*EA~ukiw@DIa&j}a7{;7C%sf*32a;RNAP8}m{-wDM-vhzZQj0l{voq#e zkpl>jY-1VyW}3}7&ZX^XeZV+QM=tv3!g2`UAJk3XWUrQ^kOB;s{c3P7so1#fp37Z? zWe!8FRxSAVX;0tr?w?D>tM|k+aV|F;+widwz7a6e_UCdJF4Q~IVkn)CatVkoD)>pTgm#iVJQta=4G6u=Xb@H#Vl$7Z7FG)KgN&qT z|L{~dLq)Jrz<<NC#XnB;Wo^IK>BzoB6{+XCD}eSr@MmN32>! 
z7?QrkkJq<9S*Pw{uG-iTHOLtM*}dX6iei3k5%{&C#5*lq2`y8xQVTp6k1>^kF?VTm zSSMfwzR;}aj-b}fG5IMbjH|iBNHml*xL>=M+Li{v(G02f>~m{R$fMfY{IxNjg4)AQ~m%O7X*CX;{6LcF| zB4fHbGuxbwOpDWz?a1te;-|7dk{NUcxItliD)LF8g-QeL5?(I~(tgX~T$UpmUtpWf(d%F%gP9F?1*RdkLu?`3m-=y|qyJ~5 z_P}{;aO9OH4*B&qjDqad*WK`=Mm}~C;Gsl zh-!Ki?Bnx$#Zg!R%4F88Re+05Pc~EHaBByh_FLijY1s{3W^HTqm_(1rsciJrB(iO5 zARH2ulPs8dQA0?o$!bWVTowTFgTX%b5ID8Y{*u56DDx4S?e=(&?Xv?4*}h2*80tA zuS`$7>}Sz^b!S`wcAi~BDNS&;#TG5u)Spur-@ZfCfx36iYwYD@bAJ^}S zjtw0HuiURQc@2qzhzM%Zatj!(218`IF}7jDsNTY2c;CHAVA?&?E)qIz#-(Ra_X>Q|V2g~=~7`7)DVW%5f*eu2q*On9i_ z{ydYvg+$27j$Z2#pO^i4A3x7t2xTjwFruccoA}0InF#U0ubs}$SOt~ z(T|y$!n_H_wu%9tV6F_}1i_`N)<~!=lPL%XX^wy_RnAwAxCG*?(DIeF1~FPbvahB1 zc|PxPaUn%7fVkis2X1$rDd#345r<(Nmot~5bP#rO*7<6NpPRT#+SkHh$PWA!@b{u~ zGgG8|#-B?>k^TjB8~!Wc0gcMrn+>-ya)6Pj28Fwxy`aIhmXPsmu!_jzBcz(kI*ys{ zkzOKeoR5e%T;oz5Z}&fA$VZItRm||o6uy*0`d*YK@%bgxSU!Whj47kI7>e~=e@ zVFO?RP^CN}UQiJmEzE`p0+tgX@YaDH<82kO57!}Jx&s0sCJXUd74cF+9RlVJAYg`} zis361I%t#8?D$O;VF}d1TWH}EsEfAn11$SCgUB|QWI{#h{w$N4F?F880@X`&{=p){>~O_-etsLF^;@W77&5B5Q43hZK2Z&d z?mtEG5zFf+{Qpl;UKwvU&O!T!u>zZ|q4O;iQ#3jh3@kWov15Cczo0>{gbFM9{Ff$Cw6ZLot8cEVUw94kR9jXf`o}E5KmP{F9Vhg!ZE< zxLi?Ht69NK>=X1HLcAU#2L($!JT)fwx{X17#7OW)2SV-h$LYUt7`y|68hGrs2D%b3 zr}X5snfz%1*kZfMpD|14v$t1{aroh>0}SebM&b`Zq-PASNmFZrgs08YFJ?u;Gsgd} zNO(3)f^`&8RVQJ~hamyarzPQOk$_>g#3a}nFd8rk33%NF00{{nGfO`L2~TV!0e~Rk zV~r%d-gJBEyG_+mvvS4uctMdYtjBX^{+!r^PRDI*=qI8oM&z)B;dzZH#zq|ZV$*Gi zL*yfI6G)td5ibnnWF#jGa5Lc1)RMyv>uf;h6P&gdh5DM06_^2Uy z-8O-o(O$WnT>kqKt%nLHY3XwIIR~6B=V|Bp>^2-H@8u`+q;m}M`F30Lu-w)cairUW zgXV+i(~TqD9*nbple#0>F6S}p2Qbo43#o`t(1dlNF%lou{@kBuaLTswUo&vcle0Ex z!{Mp#XBfuH%07YWut&DmGGTn%p5&IWce4PiS7CW7XluGcP2>7KEEb%it~|@a@YD%f zsXHw)t zp0IQqBx32`ncLwW^MDp}ley(A9Wl4Lk-heMeK(_mPBxQkg*yn7RcqBJ7ya5i)D%=> zuH@lZlHNxS@jzYm#buZ13_c2>6%Ze+UcLI{>+8<*e+`3cgVJs!g@{9RMbgHG_M*}K z8j}$wP!7iZb0+@+iC_aSbhn~j+Dmal>r=_Q%A&#j9m2oLgvwS|su*XQDdQ5dRo>w) zG~|=;rG0I`9wBkR#m>}|E)}@@`$(+E1sZv?3Hx*&;W>F+tU{1TS47>b1iwabPL0jL z{R7tgA(KC1H5Dzod-w9i*&%8m>j6p+@#hLKIxZYicVlC(RIc36u2LS4ECZ|hzrlY0 z9EtQlLQH|%Lc;0bsqb8EW`=yyuCRl(NlY|v!=`paLgZ8M)5zp&i;qy(lY$#oBK{{( z3noxDE!{YI?SOaKg)2DQ@%==mFVpArWx8?ErYF;j2oo>u9K~mU1{&PfL5nxkIW10C zmaT(MtJUPEFgB^Ygir9>*uG$0Lwi##TIHzV2c90*-~mF4$OcjbG?3z647y}#f^LBx zrSidfz*Q>3NV!5&X6Ow*fVmE33AYXV1QcY({M^59gMtJ%A&z()F$7ipRyzMYcCjzx z&Y~NB$pQ^<88A9dXYur{e7TLDra*?LzSPETWrtaV3n`~^Cs5;)?>a0akM~hs2bXL{ zP#(ekq7gpZZKiJoH*7|@cj-jg)^yk!sCO9fg4u$am($5`==QB9&0fYLryIBtx+ZHl#vv=@aek#z4fMwLNBKMh@~I=KQ#`2LG_ntl=yYhg$m;3Ih$`Ud})lt-;H+ zUa?iS5)2V@Dy09=OwxunkcOxNCO=|2gYPu2 z9wp6#tB9W9YXCnegAnnCdWMa^-%Pa$H3oIOO@ZX^vp1A5I5-%sG7 z=%NW5;q2UN2ua}L1+*TNMql!X;eq5Mi!j?>Cq;tg{7h>sYnBAy&`1I+ED3v4B=o9V z1Hy^WHll_F0SV_5cuPV9@gmhCj=nggRs_h2NF`Nu67Ul=^;NJ>{V%mry`iUN4dD^8 z!>XhGYt&bNGk3Q674I%@H>AwTB=|mf4Xgk&&&G;Hvdy*kyG)J=4AN3nJ zi`}1=@MbO8G;y5=*dY3bLn_Xerh_tNHAAcM-)~!e!puXEkZL;%s;a> zPd1seYuN7_q1mSJ>{-|$wA=~RVwT=-`I|_}TWMqv6Ji7L3%-T~XR4h#!l{HG1zHaaz4na2@g8EcY zl$h$P?VC>JeE5k-Hy8|DP74?O62b_U5PNsu$jg}1z9JsH2173KKPLDB8m)>c0=L4^f-z$f6b%4@+oFtZuIs0x_G7jjr|KxngK*vd~oDf$&NeRsXBUU&3 zTG7L80(duZNG^xBqozIoobGR4#Z`lIK`S8jng{Z<P&SLPA6kM$YCqT zxb^ocT;6$?ljMFIEv{Uif6W|S9_#$jnTnjB$W{6l+lZn2F($nngBLbaJ`_1pSfrzA zZ@LoN3-Ci;oyY{m0^ZN(r2-Jx&W?vsiwy*4_=_b*==xD9P+QJ>i?!n15S;}ZMjK!| z5nCYP>gn-~LVFQlx)2uU)QtCw&`dsiQ9JXMD9;JKuomJLEVC7xC8~I}YsTFZPlBkS zAG(7vPSKK;h6~Z}k72k72hHd10;JqMd4RLEN{N>V0=>@bCLW->n_%C;Y<6~Fnztu? 
zLcNIB*aCMiC&kCuYY&xE`NHyw+NYJHQ_8Vb;7OjnsZn-~IIrTKaSYWCkGB(^ zYqyg_3=$qNUYY5OePH&?ZpXcg7WT;OL;SMCY0n&wzIUs%4XvzgkVdqyYRO?`y$HS_-xN~ z0MkL>%~xaqLy9u3PCsyF88yTk6nEJZU+NQnF5#ndVK~clPi0HM9f|LRkdJz9}TK63rl3ZohVB<#Gl$b);yRmx%npK?if{(c_!;_`gk zRzWBHlqKHm%@^``KFe}qaJsBgdC-oZzD$;wlcUiG>FLkP8*A9 z;OyIICiO9M$uTfjY$eC=9t*;{VkVtoFc1fDmc21pY=Yt{KrJ7$Qn8wcv2q;i?Q(-m zX-RE>+U+LLu%2TiV>)4!dy^ojdNju#E_cEmGkn1I}A#b2xzAsHts-cR9|h3*e}RKi2(+)@tyui-t0t2E;REEJN2|79J&-pQ`9=6U zKaF>Ebwx4IO+@$K5}2181!sRjfY$^yvTg^eVe}Eo(ovwckFrE;4fd06z8Z;G={EeG z$5Xf~vn^Ak9YVR{w`A*=k%4SMbeLjsAscOG<yEE%t9Ofe{mA#W`8nH3!4GrBe;L2Rew;dBSm0A=TYp5lo4ZY-75rPUyF8@+C0 z$_7;k%8a!p^}rqhwL>JtYBxR8Arn;D|oMX9gG2QAFkWC4>l+8>DL0)8wG;Hi*NDVOh|MhiQ0fAN2;~)jG|y z8)PyuPVt^afc< zW1lP~8@I#ElIkQf$p(HFg+>X4c7z_pfBTEPka#In3Nqd{!VN%P$pHzms^Pl06gNrV zP$Q{&#wBwi`V(>m?&lYMxMYt|hN!vTCc$k$#^9MAaN3-)p(jz0Fz55UAe+z21kdO3 zLoUk|6y2wYJ6%ZkX(rDwIl|;9lVeOCXL689fyp!zNRDyuFzIDN!FBI4xyNKLliz3Z z2TcBu$saM{o05_T_ZL}|Qz-xM@3Z(e6Z(kme_`&wG12iI{ZTi^>g5Bs74sazCvcIp&_?H9{GHChmThe7&UWH1bmw>5kF`J7-qfCL z?`ofJzuf*O$hRZk-@dzjyG56WP}2SgK}2`Gm&0-;-~(e^2^kjHE+^jW#ILQ?eEGog z@%SSxVXX#mst5bz9U@as#KR&8fghUVIMqdWiae9$`~(V!Z literal 0 HcmV?d00001 diff --git a/bcovington/src/parser.py b/bcovington/src/parser.py new file mode 100644 index 0000000..45a5071 --- /dev/null +++ b/bcovington/src/parser.py @@ -0,0 +1,175 @@ +from argparse import ArgumentParser +import utils +import covington +import os +import pickle +import time +import tempfile +import yaml +import codecs +import sys +import warnings +""" +Main file +""" + + + +if __name__ == '__main__': + + parser = ArgumentParser() + parser.add_argument("--input", dest="input", help="Path to the input file",default=None) + parser.add_argument("--input_type", dest="input_type",help="Style of the input file [raw|conllu] (only use with --predict)") + parser.add_argument("--pipe", dest="pipe",default="UDpipe",help="Framework used to do the pipeline. 
Only \"UDpipe\" supported (only use with --predict)") + parser.add_argument("--train", dest="conll_train", help="Annotated CONLL train file", metavar="FILE", default="../data/PTB_SD_3_3_0/train.conll") + parser.add_argument("--dev", dest="conll_dev", help="Annotated CONLL dev file", metavar="FILE", default="../data/PTB_SD_3_3_0/dev.conll") + parser.add_argument("--test", dest="conll_test", help="Annotated CONLL test file", metavar="FILE", default="../data/PTB_SD_3_3_0/test.conll") + parser.add_argument("--params", dest="params", help="Parameters file", metavar="FILE", default="params.pickle") + parser.add_argument("--extrn", dest="external_embedding", help="External embeddings", metavar="FILE") + parser.add_argument("--extrn_cpos", dest="cpos_external_embedding",help="CPoStag external embeddings", metavar="FILE") + parser.add_argument("--extrn_pos", dest="pos_external_embedding", help= "PoStag external embeddings", metavar="FILE") + parser.add_argument("--extrn_feats", dest="feats_external_embedding", help="Feats external embeddings", metavar="FILE") + parser.add_argument("--model", dest="model", help="Load/Save model file", metavar="FILE", default="bcovington.model") + parser.add_argument("--wembedding", type=int, dest="wembedding_dims", default=100) + parser.add_argument("--pembedding", type=int, dest="pembedding_dims", default=25) + parser.add_argument("--rembedding", type=int, dest="rembedding_dims", default=25) + parser.add_argument("--epochs", type=int, dest="epochs", default=30) + parser.add_argument("--hidden", type=int, dest="hidden_units", default=100) + parser.add_argument("--hidden2", type=int, dest="hidden2_units", default=0) + parser.add_argument("--kb", type=int, dest="window_b", default=1) + parser.add_argument("--k1", type=int, dest="window_l1", default=3) + parser.add_argument("--k2r", type=int, dest="window_l2r", default = 1) + parser.add_argument("--k2l", type=int, dest="window_l2l", default = 1) + parser.add_argument("--lr", type=float, dest="learning_rate", default=0.1) + parser.add_argument("--outdir", type=str, dest="output", default="results") + parser.add_argument("--activation", type=str, dest="activation", default="tanh") + parser.add_argument("--optimizer",type=str, dest="optimizer", default="adam") + parser.add_argument("--lstmlayers", type=int, dest="lstm_layers", default=2) + parser.add_argument("--lstmdims", type=int, dest="lstm_dims", default=125) + parser.add_argument("--dynet-seed", type=int, dest="seed", default=7) + parser.add_argument("--disableoracle", action="store_false", dest="oracle", default=True) + parser.add_argument("--disableblstm", action="store_false", dest="blstmFlag", default=True) + parser.add_argument("--bibi-lstm", action="store_true", dest="bibiFlag", default=False) + parser.add_argument("--usehead", action="store_true", dest="headFlag", default=False) + parser.add_argument("--userlmost", action="store_true", dest="rlFlag", default=False) + parser.add_argument("--userl", action="store_true", dest="rlMostFlag", default=False) + parser.add_argument("--dynet-mem", type=int, dest="cnn_mem", default=512) + parser.add_argument("--conll2017", action="store_true",dest="conll2017", default=False) + parser.add_argument("--predict", action="store_true", dest="predictFlag", default=False) + + + # parser.add_argument("--conf", metavar="FILE", dest="conf",required=True) + + args = parser.parse_args() + + if not args.predictFlag: + + if not os.path.exists(args.output): + os.mkdir(args.output) + + # config = yaml.safe_load(open(args.conf)) + + print 
"Training..." + if not (args.rlFlag or args.rlMostFlag or args.headFlag): + print 'You must use either --userlmost or --userl or --usehead (you can use multiple)' + sys.exit() + + path_tmp_file_oov = None + + print 'Preparing vocab' + words, w2i, lemmas, l2i, cpos, pos, feats, rels = utils.vocab(args.conll_train) + + + with open(os.path.join(args.output, args.params), 'w') as paramsfp: + pickle.dump((words, w2i, lemmas, l2i, cpos, pos, feats, rels, args), paramsfp) + print 'Finished collecting vocab' + + print 'Initializing blstm covington:' + parser = covington.CovingtonBILSTM(words, lemmas, cpos, pos, feats, rels, w2i, l2i, args, + path_tmp_file_oov) + + + if path_tmp_file_oov is not None: + os.unlink(path_tmp_file_oov) + + if args.conll2017: + with codecs.open(args.conll_dev) as f_conll_dev: + lookup_conll_data = utils.lookup_conll_extra_data(f_conll_dev) + + + + for epoch in xrange(args.epochs): + print 'Starting epoch', epoch + parser.Train(args.conll_train) + devpath = os.path.join(args.output, 'dev_epoch_' + str(epoch+1) + '.conll') + utils.write_conll(devpath, parser.Predict(args.conll_dev)) + + if args.conll2017: + utils.dump_lookup_extra_into_conll(devpath, lookup_conll_data) + utils.transform_to_single_root(devpath) + + + print 'Executing conll17_eval' + + + if not args.conll2017: + os.system('perl src/utils/eval.pl -g ' + args.conll_dev + ' -s ' + devpath + ' > ' + devpath + '.txt') + else: + os.system('python src/utils/conll17_ud_eval.py -v -w src/utils/weights.clas ' + args.conll_dev + ' ' + devpath + ' > ' + devpath + '.txt') + + + + + parser.Save(os.path.join(args.output, args.model)) + + else: + + #TEST PHASE + with codecs.open(args.params, 'r') as paramsfp: + aux = pickle.load(paramsfp) + words, w2i, lemmas, l2i, cpos , pos, feats, rels, stored_opt = aux + + + stored_opt.external_embedding = args.external_embedding + stored_opt.pos_external_embedding = args.pos_external_embedding + stored_opt.feats_external_embedding = args.feats_external_embedding + + print "Running model with this configuration", stored_opt + + parser = covington.CovingtonBILSTM(words, lemmas, cpos, pos, feats, rels, w2i, l2i, stored_opt, + None) + + parser.Load(args.model) + + conllu = (os.path.splitext(args.conll_test.lower())[1] == '.conllu') + tespath = os.path.join(args.output, 'test_pred.conll' if not conllu else 'test_pred.conllu') + + + if args.conll2017: + with codecs.open(args.conll_test) as f_conll_test: + lookup_conll_data = utils.lookup_conll_extra_data(f_conll_test) + + + + ts = time.time() + pred = list(parser.Predict(args.conll_test)) + te = time.time() + utils.write_conll(tespath, pred) + + + if args.conll2017: + utils.dump_lookup_extra_into_conll(tespath, lookup_conll_data) + utils.transform_to_single_root(tespath) + + + if not args.conll2017: + os.system('perl src/utils/eval.pl -g ' + args.conll_test + ' -s ' + tespath + ' > ' + tespath + '.txt') + else: + os.system('python src/utils/conll17_ud_eval.py -v -w src/utils/weights.clas ' + args.conll_test + ' ' + tespath + ' > ' + tespath + '.txt') + + + + + + + diff --git a/bcovington/src/parser.pyc b/bcovington/src/parser.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e309ab49c6418551993256ceed56deaca7be950c GIT binary patch literal 6247 zcma)AU3(M96`qwXgTW+D2sQ))n*;|#A_b2il*|oBn{_^}aX#1HI|fi(a+wIkS=mWZK#wotgKXGv|8-#{O~W z!0CH)A2&qtGmY zJt4e_KD|$P`}*{L;qC9!lfs+q(+7lipidtZ-a#R!L^vk4OCta0q;$l6A-qFE9ulp` zM2Hh{5cESWv_VNp+;_&D0)`^M2N|9MQ0fDR7=QvuJkId6E%yY&Pcm>=gind^2(!+B 
zX}qsvmYI$+(=m(bX`AUdGo4_jXDp_ZHq$9)a+&E_i|IL=>3L>4%}jFzljZ6Q44+|m zeiS~-@HvKG9ED$ExWaIC6s|FRp5cW8rqC_g23`>1%R){w>nj|tR{;-lanXWbWB7F& ze#3&_9EC4g@LLRDwx!>;;CDvPzGA^w8NOyqU$@{ThL>&lh6TUN@J$*~E z$*^z3-?HF<;SC#Zh%gkPWXm@RmWOS4%YvJuaKvznrMDS=%9f5TxIGFd7Mza4nFZe) zg>%a>MR2I?vpbe_cT~D(!8;GZM|gV82stBKvzW5` zgghhUypU&wgpwJtiELMi-}w*&`TdW=FJVUb9kOLiw9d0AST;DGff1Jbbl*Ib{ukZwn{*9k`2b6vcA6N3NPFp5= zEy$J3SM`4E`F=Zy65m(*u&7j`BoSPpR#V2An!(mupqs8vUEP!}!|rAj zOHC?5xfyg~t)@U1>U_PMNqwNNikIp#mg{;qmTtQFkRJDwJm`JVNRv43eCE#Md)MtK z>GmR=TB&5Y45NlVt0wq}EP`?}mQSnRrO-|Fg?t4_g3@%5A=Tp`QG-a0mbo(e z4h`UH7B#kGSrnw)(Rreu1PV&#Nf7(8y&=OeO19KtY%dkt?r^&(J{I*GS*r9b7>V%r zP;wmDwY79z2U~6*wv|zeD;1-RbWsC5LKS8Izi1*K`M6*qq(N9)54zIjJriI0SK&q@?ILVCO{#|5G-+=TIj%|WR!Y%u-w&g< zibCuaq0NRO#5r6VPO`k<%pc?&E}2uA5|LTjXqpl+WJHx)fXyfjF|^25L;9U0(n?Vu zR4Qg`QB!OS+Tam3grdPvscdiXtm{QdnD+dQh$dnmTHuB%pm-q~Bg}N$_F~RW(FYgv z0T4e3#D%svXG3Bc~Re&ZIb2x*Z3*=r*2MwDiq7Dv`g``jrZ8 zQ>IxL^Kjn3Bo3n|+)bpeC@DiO6(SO}A6!bv( zUAjh4D%EOLts)d3ryaN5!IXtLuEdQebGK3%nC+&6<$xp?-T7SzHiCrZ+9;(ci{)7} zYIBLfi!#=w)#p*qfafOKEDGZ2bK2&7fUg)JZ*bl) zGjkT=uI1R_X7PFD^Zi`h1T-_3c4VW&+#H{Ui@xll7-(i@NQ$fSMh$qZYP4U?Vz;v8 za$mU>K(1Fkl?OvrimrmSKSF&do|m2D_ytJvsijE+G=W5ZN!0cIT+?M zS98!C*ElwS~VlZ6Dg}%+rH}Ycj~U zkPwAeBkj*qn$7wDr#0J#N5t)BovqP}it zK6&h?X_w`=oa@x{yu>>%G_Lm+BR(jW?RH?cVr-F0eh!ddbd}TL<{~C2mzX|8q#5oT zB3fyb)M-F*AmJPa>So4J*lA~)?!IgNmbiLzW&Q4LKDU$LZMj{)0Ju(rlRJOY=i>Cy zA^aYR({#I&6`SW8u%H0s7qMZnrU0&)!mK4YB!1l+`zT1NY^$XPmfmI z6t&@tsYc_;>|&^}&6n_q>csLgzb5_)jnmH5=1: + yield tokens + read += 1 + tokens = [root] + id = 0 + else: + try: + if "." in tok[0] or "-" in tok[0]: continue + tokens.append(ConllEntry(int(tok[0]), tok[1], tok[2] ,tok[3], + tok[4], tok[5], int(tok[6]) if tok[6] != '_' else -1 , tok[7])) + tokens_read+=1 + + except IndexError: + pass + + #Last sentence + if len(tokens) > 1: + yield tokens + print read, 'sentences read.' + print tokens_read ,'tokens read' + + +def write_conll(fn, conll_gen): + """ + Writes a CoNLL file + """ + with open(fn, 'w') as fh: + for sentence in conll_gen: + for entry in sentence[1:]: + fh.write('\t'.join([str(entry.id), entry.form, entry.lemma, entry.cpos, entry.pos, entry.feats, str(entry.pred_parent_id), entry.pred_relation, '_', '_'])) + fh.write('\n') + fh.write('\n') + + + +numberRegex = re.compile("[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+"); +def normalize(word): + return 'NUM' if numberRegex.match(word) else word.lower() + + + + +""" +Looks for multiword expressions in the CoNLL file and creates a lookup table that +allows to reconstruct then the output +""" +def lookup_conll_extra_data(fh): + + lookup = {} + sentence_id = 0 + lookup[sentence_id] = {} + id_insert_before = 1 + + for line in fh: + + if line.startswith('#'): continue + tok = line.strip().split('\t') + + if not tok or tok == ['']: #If it is empty line + sentence_id+=1 + id_insert_before = 1 + lookup[sentence_id] = {} + else: + if "." 
in tok[0] or "-" in tok[0]: + lookup[sentence_id][id_insert_before] = line + else: + id_insert_before+=1 + + return lookup + +""" +dumps the content of the lookup table extracted by lookup_conll_extra_data +into a output conll_path +""" +def dump_lookup_extra_into_conll(conll_path,lookup): + + sentence_id = 0 + word_id = 1 + + with codecs.open(conll_path) as f_conll: + lines = f_conll.readlines() + + #DUMPING the content of the file + f_conll = codecs.open(conll_path,"w") + + for line in lines: + + tok = line.strip().split('\t') + if tok == ['']: #If it is empty line + sentence_id+=1 + word_id = 1 + else: + if sentence_id in lookup: + if word_id in lookup[sentence_id]: + f_conll.write(lookup[sentence_id][word_id]) + word_id+=1 + f_conll.write(line) + + f_conll.close() + + +def get_rooted(conll_str): + """ + Returns a list of [id,ctag,head] of the nodes rooted to 0 + """ + rooted_elements = [] + + lines = conll_str.split('\n') + for l in lines: + ls = l.split('\t') + try: + identifier,tag,head = int(ls[UD_ID_COLUMN]),ls[UD_CTAG_COLUMN],int(ls[UD_HEAD_COLUMN]) + if head == DUMMY_ROOT: + rooted_elements.append((identifier,tag,head)) + except ValueError: + pass + return rooted_elements + + +def get_new_single_root(lmultiple_rooted): + """ + Returns the ID of the first VERB rooted to 0 or the leftmost rooted + element otherwise + """ + for e in lmultiple_rooted: + if e[2] == DUMMY_ROOT and e[1] == UD_CTAG_VERB: + return e[0] + return lmultiple_rooted[0][0] + +""" +""" +def transform_to_single_root(conll_path): + + with codecs.open(conll_path) as f_conll: + sentences = f_conll.read().split('\n\n') + + with codecs.open(conll_path,"w") as f_conll: + + i=0 + for s in sentences: + if s == "": continue + rooted = get_rooted(s) + if len(rooted) > 1: + frv = get_new_single_root(rooted) + for l in s.split('\n'): + ls = l.strip().split('\t') + + if ls != [''] and not l.startswith("#"): #If it is empty line + if ls[UD_HEAD_COLUMN] != "_" and int(ls[UD_HEAD_COLUMN]) == DUMMY_ROOT and int(ls[UD_ID_COLUMN]) != frv: + ls[UD_HEAD_COLUMN] = str(frv) + + f_conll.write('\t'.join(ls)+"\n") + else: + f_conll.write(s+"\n") + f_conll.write('\n') + i+=1 diff --git a/bcovington/src/utils.pyc b/bcovington/src/utils.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e5636000d5d171cb67f4c1b9192a20c8d3f67ff GIT binary patch literal 8418 zcmcIpU2Ggz6}~gOUhi(~ICf&ECayYFXuC=Mlai)C!K8KVGo1n=;D=gxZF5D-$Fwdc;c zckbMu^Yfi^?iBvfU;Tdc%uQG2zXI;B<4XUC!o$CzLZw=US~iryx~W1_Et~QTbU`f_ zl!vmYyrNp}QC^Q)E-A01mV2eWth`?3S5(tbURiyh)IFtcE?1RT>9kh&S`R7j5G%`l z%Ig!NUwQpP99F)m)(4b#Slz>7jwo+H+6R?)M2I2f4GM8oc|$@RQ{GV_hLv|rh!HM- zu{MmY1^>dYW`N3iQOw3$QRXKL0ce#KQM`N6ej_M>2*{Fb1jTH8JBU`ZIGT;4X0W=I zG_oL$_E^Wk-oTYwD17V!J4Ojs88SQgWC~GK_F|2rrq}RJ)xvKfxSjap*4vX-?Mr9Q zymD&Nnh96qB*@lUsnv`V%k#0kRuBbg7PwXrxryKMqpT6)N!yP+KXUI_?M9Mfg_G87 z+-h%SvOM&-jJ7R#F&($gzF2$l)VOu-g|idq&Yhi<*`)|yIBzY*iD%7wR@}6*HQ(B1 zwUG&(#qCaW*l4YI4J(d(E5xw!_R9bJ^2%1T=_m3|iVa}h{&}o0%{V$38-^F7EV&~c zX3krwFAuQYM&@HNiJMxrY^xpmxUcyQZ+>pP!+~sm?$jxH(d_|ud6~Fo$>|!3yN0@} zlmo&P0@Z>Ls1}8Qh!hlrl3b`!3o;H6zL2qHB`c%sELxgoYMey$CH2FmO`+O+&t|(# z_2I<2NWTnfYAtU0Q(ohC;7#5R!bak!Q>#HXwE_-IP=0UZZ7NONsjVys)5-Q7 znci`r1DWGIg|SkuN0}9ae=e8Ei))ZAPl3p=S*6wu8V%I&cwRSIKfbS_APtK(Zg)Yp zX6gNNv@}ch?MybsiEJ9q0d9}OISwXsoM$;J*P;x~VeZs%qDISioUAI9R_twstm^l? z*mWG6{p>?1y6Y0^QF(83F~p6O+b@+%W~Eow(bZTN59CLdy@4y`F8uvCAV^6x0D=N9 zH5&8~4X88eYbZp0F=*BZqErkoh+zLMv1OV_12zZ43nEan@xtZH6F0185c=AZ-3rFw z#r4kON>dd7srb-q=suKy22r4z!zS`7OhH>ikGhM0bSSC2_y^Ie?&2RrS>44yhziw; z1Q1XJ){hg*xryA!LHMR#y4Y4@Jf^csiXYmA! 
zQ5Ii7(N(KQ)+nx)CYo|CRiF(I;7?mKO0>8go94ms#v z5!t%PYh8r3E|OZWP)zGpDG#aDin^DhKtUbX3aAbLJu$pn(jOOv*8TS|vLkxWev*u8-P@{3$)H0hKv#A53J`3<~R2qC_!6a*#R%cK>f`w^!|aCW7y!#m?3L zq;?@;gIcf(TQ8$&K}bHkAJ(Nux_qd+6du~8Bwc=?yA&VVrB}NA zUU%tvXqU2d`KRtudT5u5bUAKtxtrJ3(h{KLt2eMWjg>%L?(-12$gShTrVxF80N&ym zHu$+`*p1YIXE}kD=@!f{M6(R6qIzrPPh9G00To4E+`F5E;-)W4z*V(IotjAwt5n;;{? z$F}a}?z#nHY}B=_#t3J{wIby69G4McM!Vi>dk6t-L78KyKL|C1XK@IAwAJzvoBLw= z8~&ZNMxM!-G8|k&wkA~uxT(bc8G{CKJb9glORuA!60G+s98LUB_Zd8jt}qypN#1-% zVnA6#6np-`aZ>!>?bvOsyot$D3ab+Yit{jsjHBjBqw_amju~Ud3GGa{E25%{EBzTn z3|N6evy?n#SCB}17}%AECLIl`cqk-%cm(u92+@~*n>VkjFw%yplHXAFh&IpzDd0v?B|m5NU4j-G zUdHM;T#@}TGuK<@9CtourpTcTHM&t!>7SV}cI(Ms)wW&=hGC=@!@ubAWeNVQ)!??z zh>($>6|b!OZe}&tq&Kh&FVlifGYC7K(mx70A^Lt=(g^$N)oT;)TwP45AyS!TjjI0U z#hGi16KoUBOE`+4!^xBwQGx3MLXwo$Nhv1+Mxc^A2ZYjqL7U{$^4O95JD|0sbf2B9 zDz#|!lPRq+i#L44z-(Bk(Ex4s6uV8cMv|r5NRMSn$QgpRrrKeUNt6--MY!oko9VGM zA%2m!=aDY&T*N*S2>=DIn`;@PzbKBJb`q~UdkYc+<{AVc^qL&j?ITa? z7&6LecPSJ~f_t6CmHrY11lSEQeoQ3T`5qnIT>U!=fUvX8Tmc^Vsk!=w008_f>=fP= z0SrPP5+ya8!a50onvoA~6GPD$k&Y=%&_Hn>o0U)dlT$o+AUul5GxQ z)7w8Zx_uxcIKh5nD0F^GzBVIqKib8t`jJfrYbU_*p^wR(gHs1lM_IG>ksn|RF>O^G zf({HCH*lpV(fZz*iC62--_^HUlb@@yGG2e)rS5fu&S zouh&R`Yl|)BI%wi3C?%LPi%kH-x0PVlXJC|hViza=p#2G0)6knvBN%!A-{_&&jn;h zG)`U=MEYA4z`{Lm2U-Vl(@<@y`6t}!T>azR$F0LTN{XCZP@5-n_XB?16Ba;s!4A|~`VJPS?yXPM36MEKeJ^2;wMXziWb~lX|wE4Y^MWV8V zXN!V|I=vR{wqNEJ1&-}kK-#aeIFCXkV^6dC6&4rRMyA;0ir`lm#~WL1kzNP(m`)Sl z0uSd+IFrhp6(8BUZ@%LA`o>`TQIMJPV{4RS!q`{?6=V_ zmg|Ob>TAb4jyC%mi|c$!?4|iqOGr2KFrB>D!#xf)8N=w_R;%sgx}voNPCeGs{~FJ` zx`HTps4xU=8OEVX6&|tAID#m+3T<&|C)AZaZ1^cEtAY)H02;m=n9>o7DbLlvL4<&K z_)LyXpc;3sOKOgk9nyqkAQX7RQN%Z-iqcjvp_WJj!0x6r3>- z|1E8f?S+g#R16u%5HgPCMql6#soZ%Om~ABpkQ#Cd&$Gez0&m>S8mr?x7_WDZ>zREZ zj6&3g1N1Dm&dA%^32H6TLMAfUGbl1fh}Y+w`8j9y>gDTK7UVDu4YSu~Et&rz9eO+u>)ee5SvkS%3NH={GEh zLd=~AW4R={$!N#GAH;naH%{4JGZ-cmBrKR>Tsl{O&d{kZ96trO zsy43@q@jYacFX}ghk3)KLBk&~g*vtfya{u|jdOhyM&8<|v8mMQi|knD*KbU3 zP7rp8M~68-zOdo}w@^~YaWgi5qL!k;yaRUHY0ux#E{X9;5278!Cs#X%cH)?5whI!G zRjW3=lBkzNDBYmEn!757FcP2F7~&K7Ok!ivc|bEws=$84wWVcWi5b8ubY4^k{6+fwsUtp%0+!iFVn;Zn!0pSe4DkobxVB4BGN5G1{s!oGZ7Dt9h#ww$gXDa>pic+nN LRZdifDt-S3NY$g~ literal 0 HcmV?d00001 diff --git a/bcovington/src/utils/conll17_ud_eval.py b/bcovington/src/utils/conll17_ud_eval.py new file mode 100644 index 0000000..c1ec200 --- /dev/null +++ b/bcovington/src/utils/conll17_ud_eval.py @@ -0,0 +1,556 @@ +#!/usr/bin/env python + +# CoNLL 2017 UD Parsing evaluation script. +# +# Compatible with Python 2.7 and 3.2+, can be used either as a module +# or a standalone executable. +# +# Copyright 2017 Institute of Formal and Applied Linguistics (UFAL), +# Faculty of Mathematics and Physics, Charles University, Czech Republic. 
+# +# Changelog: +# - [02 Jan 2017] Version 0.9: Initial release +# - [25 Jan 2017] Version 0.9.1: Fix bug in LCS alignment computation +# - [10 Mar 2017] Version 1.0: Add documentation and test +# Compare HEADs correctly using aligned words +# Allow evaluation with errorneous spaces in forms +# Compare forms in LCS case insensitively +# Detect cycles and multiple root nodes +# Compute AlignedAccuracy + +# Command line usage +# ------------------ +# conll17_ud_eval.py [-v] [-w weights_file] gold_conllu_file system_conllu_file +# +# - if no -v is given, only the CoNLL17 UD Shared Task evaluation LAS metrics +# is printed +# - if -v is given, several metrics are printed (as precision, recall, F1 score, +# and in case the metric is computed on aligned words also accuracy on these): +# - Tokens: how well do the gold tokens match system tokens +# - Sentences: how well do the gold sentences match system sentences +# - Words: how well can the gold words be aligned to system words +# - UPOS: using aligned words, how well does UPOS match +# - XPOS: using aligned words, how well does XPOS match +# - Feats: using aligned words, how well does FEATS match +# - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match +# - Lemmas: using aligned words, how well does LEMMA match +# - UAS: using aligned words, how well does HEAD match +# - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match +# - if weights_file is given (with lines containing deprel-weight pairs), +# one more metric is shown: +# - WeightedLAS: as LAS, but each deprel (ignoring subtypes) has different weight + +# API usage +# --------- +# - load_conllu(file) +# - loads CoNLL-U file from given file object to an internal representation +# - the file object should return str on both Python 2 and Python 3 +# - raises UDError exception if the given file cannot be loaded +# - evaluate(gold_ud, system_ud) +# - evaluate the given gold and system CoNLL-U files (loaded with load_conllu) +# - raises UDError if the concatenated tokens of gold and system file do not match +# - returns a dictionary with the metrics described above, each metrics having +# three fields: precision, recall and f1 + +# Description of token matching +# ----------------------------- +# In order to match tokens of gold file and system file, we consider the text +# resulting from concatenation of gold tokens and text resulting from +# concatenation of system tokens. These texts should match -- if they do not, +# the evaluation fails. +# +# If the texts do match, every token is represented as a range in this original +# text, and tokens are equal only if their range is the same. + +# Description of word matching +# ---------------------------- +# When matching words of gold file and system file, we first match the tokens. +# The words which are also tokens are matched as tokens, but words in multi-word +# tokens have to be handled differently. +# +# To handle multi-word tokens, we start by finding "multi-word spans". +# Multi-word span is a span in the original text such that +# - it contains at least one multi-word token +# - all multi-word tokens in the span (considering both gold and system ones) +# are completely inside the span (i.e., they do not "stick out") +# - the multi-word span is as small as possible +# +# For every multi-word span, we align the gold and system words completely +# inside this span using LCS on their FORMs. The words not intersecting +# (even partially) any multi-word span are then aligned as tokens. 
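+#
+# Illustrative API sketch (not part of the original documentation): it assumes
+# this file is importable as `conll17_ud_eval` and uses hypothetical file paths.
+#
+#   import conll17_ud_eval as ud_eval
+#   gold_ud = ud_eval.load_conllu_file("gold.conllu")
+#   system_ud = ud_eval.load_conllu_file("system.conllu")
+#   scores = ud_eval.evaluate(gold_ud, system_ud)   # dict of Score objects
+#   print(scores["LAS"].f1)                         # e.g. the LAS F1 score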
+ + +from __future__ import division +from __future__ import print_function + +import argparse +import io +import sys +import unittest + +# CoNLL-U column names +ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10) + +# UD Error is used when raising exceptions in this module +class UDError(Exception): + pass + +# Load given CoNLL-U file into internal representation +def load_conllu(file): + # Internal representation classes + class UDRepresentation: + def __init__(self): + # Characters of all the tokens in the whole file. + # Whitespace between tokens is not included. + self.characters = [] + # List of UDSpan instances with start&end indices into `characters`. + self.tokens = [] + # List of UDWord instances. + self.words = [] + # List of UDSpan instances with start&end indices into `characters`. + self.sentences = [] + class UDSpan: + def __init__(self, start, end): + self.start = start + # Note that self.end marks the first position **after the end** of span, + # so we can use characters[start:end] or range(start, end). + self.end = end + class UDWord: + def __init__(self, span, columns, is_multiword): + # Span of this word (or MWT, see below) within ud_representation.characters. + self.span = span + # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,... + self.columns = columns + # is_multiword==True means that this word is part of a multi-word token. + # In that case, self.span marks the span of the whole multi-word token. + self.is_multiword = is_multiword + # Reference to the UDWord instance representing the HEAD (or None if root). + self.parent = None + # Let's ignore language-specific deprel subtypes. + self.columns[DEPREL] = columns[DEPREL].split(':')[0] + + ud = UDRepresentation() + + # Load the CoNLL-U file + index, sentence_start = 0, None + while True: + line = file.readline() + if not line: + break + line = line.rstrip("\r\n") + + # Handle sentence start boundaries + if sentence_start is None: + # Skip comments + if line.startswith("#"): + continue + # Start a new sentence + ud.sentences.append(UDSpan(index, 0)) + sentence_start = len(ud.words) + if not line: + # Add parent UDWord links and check there are no cycles + def process_word(word): + if word.parent == "remapping": + raise UDError("There is a cycle in a sentence") + if word.parent is None: + head = int(word.columns[HEAD]) + if head > len(ud.words) - sentence_start: + raise UDError("HEAD '{}' points outside of the sentence".format(word.columns[HEAD])) + if head: + parent = ud.words[sentence_start + head - 1] + word.parent = "remapping" + process_word(parent) + word.parent = parent + + for word in ud.words[sentence_start:]: + process_word(word) + + # Check there is a single root node + if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1: + raise UDError("There are multiple roots in a sentence") + + # End the sentence + ud.sentences[-1].end = index + sentence_start = None + continue + + # Read next token/word + columns = line.split("\t") + if len(columns) != 10: + raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(line)) + + # Skip empty nodes + if "." in columns[ID]: + continue + + # Delete spaces from FORM so gold.characters == system.characters + # even if one of them tokenizes the space. 
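+        # (for illustration: a gold FORM such as "New York" and the system FORMs
+        # "New" + "York" both contribute the same character sequence "NewYork")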
+ columns[FORM] = columns[FORM].replace(" ", "") + if not columns[FORM]: + raise UDError("There is an empty FORM in the CoNLL-U file") + + # Save token + ud.characters.extend(columns[FORM]) + ud.tokens.append(UDSpan(index, index + len(columns[FORM]))) + index += len(columns[FORM]) + + # Handle multi-word tokens to save word(s) + if "-" in columns[ID]: + try: + start, end = map(int, columns[ID].split("-")) + except: + raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID])) + + for _ in range(start, end + 1): + word_line = file.readline().rstrip("\r\n") + word_columns = word_line.split("\t") + if len(word_columns) != 10: + raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(word_line)) + ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True)) + # Basic tokens/words + else: + try: + word_id = int(columns[ID]) + except: + raise UDError("Cannot parse word ID '{}'".format(columns[ID])) + if word_id != len(ud.words) - sentence_start + 1: + raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(columns[ID], columns[FORM], len(ud.words) - sentence_start + 1)) + + try: + head_id = int(columns[HEAD]) + except: + raise UDError("Cannot parse HEAD '{}'".format(columns[HEAD])) + if head_id < 0: + raise UDError("HEAD cannot be negative") + + ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False)) + + if sentence_start is not None: + raise UDError("The CoNLL-U file does not end with empty line") + + return ud + +# Evaluate the gold and system treebanks (loaded using load_conllu). +def evaluate(gold_ud, system_ud, deprel_weights=None): + class Score: + def __init__(self, gold_total, system_total, correct, aligned_total=None): + self.precision = correct / system_total if system_total else 0.0 + self.recall = correct / gold_total if gold_total else 0.0 + self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0 + self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total + class AlignmentWord: + def __init__(self, gold_word, system_word): + self.gold_word = gold_word + self.system_word = system_word + self.gold_parent = None + self.system_parent_gold_aligned = None + class Alignment: + def __init__(self, gold_words, system_words): + self.gold_words = gold_words + self.system_words = system_words + self.matched_words = [] + self.matched_words_map = {} + def append_aligned_words(self, gold_word, system_word): + self.matched_words.append(AlignmentWord(gold_word, system_word)) + self.matched_words_map[system_word] = gold_word + def fill_parents(self): + # We represent root parents in both gold and system data by '0'. + # For gold data, we represent non-root parent by corresponding gold word. + # For system data, we represent non-root parent by either gold word aligned + # to parent system nodes, or by None if no gold words is aligned to the parent. 
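+            # (illustration: if the parent of a system word was never aligned to
+            # any gold word, system_parent_gold_aligned stays None, so any
+            # HEAD-based metric scores that arc as incorrect)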
+ for words in self.matched_words: + words.gold_parent = words.gold_word.parent if words.gold_word.parent is not None else 0 + words.system_parent_gold_aligned = self.matched_words_map.get(words.system_word.parent, None) \ + if words.system_word.parent is not None else 0 + + def lower(text): + if sys.version_info < (3, 0) and isinstance(text, str): + return text.decode("utf-8").lower() + return text.lower() + + def spans_score(gold_spans, system_spans): + correct, gi, si = 0, 0, 0 + while gi < len(gold_spans) and si < len(system_spans): + if system_spans[si].start < gold_spans[gi].start: + si += 1 + elif gold_spans[gi].start < system_spans[si].start: + gi += 1 + else: + correct += gold_spans[gi].end == system_spans[si].end + si += 1 + gi += 1 + + return Score(len(gold_spans), len(system_spans), correct) + + def alignment_score(alignment, key_fn, weight_fn=lambda w: 1): + gold, system, aligned, correct = 0, 0, 0, 0 + + for word in alignment.gold_words: + gold += weight_fn(word) + + for word in alignment.system_words: + system += weight_fn(word) + + for words in alignment.matched_words: + aligned += weight_fn(words.gold_word) + + if key_fn is None: + # Return score for whole aligned words + return Score(gold, system, aligned) + + for words in alignment.matched_words: + if key_fn(words.gold_word, words.gold_parent) == key_fn(words.system_word, words.system_parent_gold_aligned): + correct += weight_fn(words.gold_word) + + return Score(gold, system, correct, aligned) + + def beyond_end(words, i, multiword_span_end): + if i >= len(words): + return True + if words[i].is_multiword: + return words[i].span.start >= multiword_span_end + return words[i].span.end > multiword_span_end + + def extend_end(word, multiword_span_end): + if word.is_multiword and word.span.end > multiword_span_end: + return word.span.end + return multiword_span_end + + def find_multiword_span(gold_words, system_words, gi, si): + # We know gold_words[gi].is_multiword or system_words[si].is_multiword. + # Find the start of the multiword span (gs, ss), so the multiword span is minimal. + # Initialize multiword_span_end characters index. + if gold_words[gi].is_multiword: + multiword_span_end = gold_words[gi].span.end + if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start: + si += 1 + else: # if system_words[si].is_multiword + multiword_span_end = system_words[si].span.end + if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start: + gi += 1 + gs, ss = gi, si + + # Find the end of the multiword span + # (so both gi and si are pointing to the word following the multiword span end). 
+ while not beyond_end(gold_words, gi, multiword_span_end) or \ + not beyond_end(system_words, si, multiword_span_end): + if gi < len(gold_words) and (si >= len(system_words) or + gold_words[gi].span.start <= system_words[si].span.start): + multiword_span_end = extend_end(gold_words[gi], multiword_span_end) + gi += 1 + else: + multiword_span_end = extend_end(system_words[si], multiword_span_end) + si += 1 + return gs, ss, gi, si + + def compute_lcs(gold_words, system_words, gi, si, gs, ss): + lcs = [[0] * (si - ss) for i in range(gi - gs)] + for g in reversed(range(gi - gs)): + for s in reversed(range(si - ss)): + if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]): + lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0) + lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0) + lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0) + return lcs + + def align_words(gold_words, system_words): + alignment = Alignment(gold_words, system_words) + + gi, si = 0, 0 + while gi < len(gold_words) and si < len(system_words): + if gold_words[gi].is_multiword or system_words[si].is_multiword: + # A: Multi-word tokens => align via LCS within the whole "multiword span". + gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si) + + if si > ss and gi > gs: + lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss) + + # Store aligned words + s, g = 0, 0 + while g < gi - gs and s < si - ss: + if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]): + alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s]) + g += 1 + s += 1 + elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0): + g += 1 + else: + s += 1 + else: + # B: No multi-word token => align according to spans. 
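+                # (for illustration: a gold word covering characters 3..7 aligns
+                # only to a system word covering exactly 3..7; otherwise the word
+                # that starts earlier, the gold word on ties, is skipped)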
+ if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end): + alignment.append_aligned_words(gold_words[gi], system_words[si]) + gi += 1 + si += 1 + elif gold_words[gi].span.start <= system_words[si].span.start: + gi += 1 + else: + si += 1 + + alignment.fill_parents() + + return alignment + + # Check that underlying character sequences do match + if gold_ud.characters != system_ud.characters: + index = 0 + while gold_ud.characters[index] == system_ud.characters[index]: + index += 1 + + raise UDError( + "The concatenation of tokens in gold file and in system file differ!\n" + + "First 20 differing characters in gold file: '{}' and system file: '{}'".format( + "".join(gold_ud.characters[index:index + 20]), + "".join(system_ud.characters[index:index + 20]) + ) + ) + + # Align words + alignment = align_words(gold_ud.words, system_ud.words) + + # Compute the F1-scores + result = { + "Tokens": spans_score(gold_ud.tokens, system_ud.tokens), + "Sentences": spans_score(gold_ud.sentences, system_ud.sentences), + "Words": alignment_score(alignment, None), + "UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]), + "XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]), + "Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]), + "AllTags": alignment_score(alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])), + "Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]), + "UAS": alignment_score(alignment, lambda w, parent: parent), + "LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])), + } + + # Add WeightedLAS if weights are given + if deprel_weights is not None: + def weighted_las(word): + return deprel_weights.get(word.columns[DEPREL], 1.0) + result["WeightedLAS"] = alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL]), weighted_las) + + return result + +def load_deprel_weights(weights_file): + if weights_file is None: + return None + + deprel_weights = {} + for line in weights_file: + # Ignore comments and empty lines + if line.startswith("#") or not line.strip(): + continue + + columns = line.rstrip("\r\n").split() + if len(columns) != 2: + raise ValueError("Expected two columns in the UD Relations weights file on line '{}'".format(line)) + + deprel_weights[columns[0]] = float(columns[1]) + + return deprel_weights + +def load_conllu_file(path): + _file = open(path, mode="r", **({"encoding": "utf-8"} if sys.version_info >= (3, 0) else {})) + return load_conllu(_file) + +def evaluate_wrapper(args): + # Load CoNLL-U files + gold_ud = load_conllu_file(args.gold_file) + system_ud = load_conllu_file(args.system_file) + + # Load weights if requested + deprel_weights = load_deprel_weights(args.weights) + + return evaluate(gold_ud, system_ud, deprel_weights) + +def main(): + # Parse arguments + parser = argparse.ArgumentParser() + parser.add_argument("gold_file", type=str, + help="Name of the CoNLL-U file with the gold data.") + parser.add_argument("system_file", type=str, + help="Name of the CoNLL-U file with the predicted data.") + parser.add_argument("--weights", "-w", type=argparse.FileType("r"), default=None, + metavar="deprel_weights_file", + help="Compute WeightedLAS using given weights for Universal Dependency Relations.") + parser.add_argument("--verbose", "-v", default=0, action="count", + help="Print all metrics.") + args = parser.parse_args() + + # Use verbose if weights are supplied + if 
args.weights is not None and not args.verbose: + args.verbose = 1 + + # Evaluate + evaluation = evaluate_wrapper(args) + + # Print the evaluation + if not args.verbose: + print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1)) + else: + metrics = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "Feats", "AllTags", "Lemmas", "UAS", "LAS"] + if args.weights is not None: + metrics.append("WeightedLAS") + + print("Metrics | Precision | Recall | F1 Score | AligndAcc") + print("-----------+-----------+-----------+-----------+-----------") + for metric in metrics: + print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format( + metric, + 100 * evaluation[metric].precision, + 100 * evaluation[metric].recall, + 100 * evaluation[metric].f1, + "{:10.2f}".format(100 * evaluation[metric].aligned_accuracy) if evaluation[metric].aligned_accuracy is not None else "" + )) + +if __name__ == "__main__": + main() + +# Tests, which can be executed with `python -m unittest conll17_ud_eval`. +class TestAlignment(unittest.TestCase): + @staticmethod + def _load_words(words): + """Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors.""" + lines, num_words = [], 0 + for w in words: + parts = w.split(" ") + if len(parts) == 1: + num_words += 1 + lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, parts[0], int(num_words>1))) + else: + lines.append("{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_".format(num_words + 1, num_words + len(parts) - 1, parts[0])) + for part in parts[1:]: + num_words += 1 + lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, part, int(num_words>1))) + return load_conllu((io.StringIO if sys.version_info >= (3, 0) else io.BytesIO)("\n".join(lines+["\n"]))) + + def _test_exception(self, gold, system): + self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system)) + + def _test_ok(self, gold, system, correct): + metrics = evaluate(self._load_words(gold), self._load_words(system)) + gold_words = sum((max(1, len(word.split(" ")) - 1) for word in gold)) + system_words = sum((max(1, len(word.split(" ")) - 1) for word in system)) + self.assertEqual((metrics["Words"].precision, metrics["Words"].recall, metrics["Words"].f1), + (correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words))) + + def test_exception(self): + self._test_exception(["a"], ["b"]) + + def test_equal(self): + self._test_ok(["a"], ["a"], 1) + self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3) + + def test_equal_with_multiword(self): + self._test_ok(["abc a b c"], ["a", "b", "c"], 3) + self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4) + self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4) + self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5) + + def test_alignment(self): + self._test_ok(["abcd"], ["a", "b", "c", "d"], 0) + self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1) + self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2) + self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2) + self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4) + self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2) + self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1) diff --git a/bcovington/src/utils/eval.pl b/bcovington/src/utils/eval.pl new file mode 100644 index 0000000..f8dbbfd --- /dev/null +++ b/bcovington/src/utils/eval.pl @@ -0,0 +1,1827 @@ +#!/usr/bin/env perl + +# Author: Yuval Krymolowski +# Addition of precision and recall +# and of frame confusion list: Sabine Buchholz +# 
Addition of DEPREL + ATTACHMENT: +# Prokopis Prokopidis (prokopis at ilsp dot gr) +# Acknowledgements: +# to Markus Kuhn for suggesting the use of +# the Unicode category property + +if ($] < 5.008001) +{ + printf STDERR < -s + + This script evaluates a system output with respect to a gold standard. + Both files should be in UTF-8 encoded CoNLL-X tabular format. + + Punctuation tokens (those where all characters have the Unicode + category property "Punctuation") are ignored for scoring (unless the + -p flag is used). + + The output breaks down the errors according to their type and context. + + Optional parameters: + -o FILE : output: print output to FILE (default is standard output) + -q : quiet: only print overall performance, without the details + -b : evalb: produce output in a format similar to evalb + (http://nlp.cs.nyu.edu/evalb/); use together with -q + -p : punctuation: also score on punctuation (default is not to score on it) + -v : version: show the version number + -h : help: print this help text and exit + +EOT +; + +my ($line_num) ; +my ($sep) = '0x01' ; + +my ($START) = '.S' ; +my ($END) = '.E' ; + +my ($con_err_num) = 3 ; +my ($freq_err_num) = 10 ; +my ($spec_err_loc_con) = 8 ; + +################################################################################ +### subfunctions ### +################################################################################ + +# Whether a string consists entirely of characters with the Unicode +# category property "Punctuation" (see "man perlunicode") +sub is_uni_punct +{ + my ($word) = @_ ; + + return scalar(Encode::decode_utf8($word)=~ /^\p{Punctuation}+$/) ; +} + +# The length of a unicode string, excluding non-spacing marks +# (for example vowel marks in Arabic) + +sub uni_len +{ + my ($word) = @_ ; + my ($ch, $l) ; + + $l = 0 ; + foreach $ch (split(//, Encode::decode_utf8($word))) + { + if ($ch !~ /^\p{NonspacingMark}/) + { + $l++ ; + } + } + + return $l ; +} + +sub filter_context_counts +{ # filter_context_counts + + my ($vec, $num, $max_len) = @_ ; + my ($con, $l, $thresh) ; + + $thresh = (sort {$b <=> $a} values %{$vec})[$num-1] ; + + foreach $con (keys %{$vec}) + { + if (${$vec}{$con} < $thresh) + { + delete ${$vec}{$con} ; + next ; + } + + $l = uni_len($con) ; + + if ($l > ${$max_len}) + { + ${$max_len} = $l ; + } + } + +} # filter_context_counts + +sub print_context +{ # print_context + + my ($counts, $counts_pos, $max_con_len, $max_con_pos_len) = @_ ; + my (@v_con, @v_con_pos, $con, $con_pos, $i, $n) ; + + printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s", $max_con_pos_len, 'CPOS', 'any', 'head', 'dep', 'both' ; + printf OUT " ||" ; + printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s", $max_con_len, 'word', 'any', 'head', 'dep', 'both' ; + printf OUT "\n" ; + printf OUT " %s-+------+------+------+-----", '-' x $max_con_pos_len; + printf OUT "--++" ; + printf OUT "--%s-+------+------+------+-----", '-' x $max_con_len; + printf OUT "\n" ; + + @v_con = sort {${$counts}{tot}{$b} <=> ${$counts}{tot}{$a}} keys %{${$counts}{tot}} ; + @v_con_pos = sort {${$counts_pos}{tot}{$b} <=> ${$counts_pos}{tot}{$a}} keys %{${$counts_pos}{tot}} ; + + $n = scalar @v_con ; + if (scalar @v_con_pos > $n) + { + $n = scalar @v_con_pos ; + } + + foreach $i (0 .. 
$n-1) + { + if (defined $v_con_pos[$i]) + { + $con_pos = $v_con_pos[$i] ; + printf OUT " %-*s | %4d | %4d | %4d | %4d", + $max_con_pos_len, $con_pos, ${$counts_pos}{tot}{$con_pos}, + ${$counts_pos}{err_head}{$con_pos}, ${$counts_pos}{err_dep}{$con_pos}, + ${$counts_pos}{err_dep}{$con_pos}+${$counts_pos}{err_head}{$con_pos}-${$counts_pos}{tot}{$con_pos} ; + } + else + { + printf OUT " %-*s | %4s | %4s | %4s | %4s", + $max_con_pos_len, ' ', ' ', ' ', ' ', ' ' ; + } + + printf OUT " ||" ; + + if (defined $v_con[$i]) + { + $con = $v_con[$i] ; + printf OUT " %-*s | %4d | %4d | %4d | %4d", + $max_con_len+length($con)-uni_len($con), $con, ${$counts}{tot}{$con}, + ${$counts}{err_head}{$con}, ${$counts}{err_dep}{$con}, + ${$counts}{err_dep}{$con}+${$counts}{err_head}{$con}-${$counts}{tot}{$con} ; + } + else + { + printf OUT " %-*s | %4s | %4s | %4s | %4s", + $max_con_len, ' ', ' ', ' ', ' ', ' ' ; + } + + printf OUT "\n" ; + } + + printf OUT " %s-+------+------+------+-----", '-' x $max_con_pos_len; + printf OUT "--++" ; + printf OUT "--%s-+------+------+------+-----", '-' x $max_con_len; + printf OUT "\n" ; + + printf OUT "\n\n" ; + +} # print_context + +sub num_as_word +{ + my ($num) = @_ ; + + $num = abs($num) ; + + if ($num == 1) + { + return ('one word') ; + } + elsif ($num == 2) + { + return ('two words') ; + } + elsif ($num == 3) + { + return ('three words') ; + } + elsif ($num == 4) + { + return ('four words') ; + } + else + { + return ($num.' words') ; + } +} + +sub describe_err +{ # describe_err + + my ($head_err, $head_aft_bef, $dep_err) = @_ ; + my ($dep_g, $dep_s, $desc) ; + my ($head_aft_bef_g, $head_aft_bef_s) = split(//, $head_aft_bef) ; + + if ($head_err eq '-') + { + $desc = 'correct head' ; + + if ($head_aft_bef_s eq '0') + { + $desc .= ' (0)' ; + } + elsif ($head_aft_bef_s eq 'e') + { + $desc .= ' (the focus word)' ; + } + elsif ($head_aft_bef_s eq 'a') + { + $desc .= ' (after the focus word)' ; + } + elsif ($head_aft_bef_s eq 'b') + { + $desc .= ' (before the focus word)' ; + } + } + elsif ($head_aft_bef_s eq '0') + { + $desc = 'head = 0 instead of ' ; + if ($head_aft_bef_g eq 'a') + { + $desc.= 'after ' ; + } + if ($head_aft_bef_g eq 'b') + { + $desc.= 'before ' ; + } + $desc .= 'the focus word' ; + } + elsif ($head_aft_bef_g eq '0') + { + $desc = 'head is ' ; + if ($head_aft_bef_g eq 'a') + { + $desc.= 'after ' ; + } + if ($head_aft_bef_g eq 'b') + { + $desc.= 'before ' ; + } + $desc .= 'the focus word instead of 0' ; + } + else + { + $desc = num_as_word($head_err) ; + if ($head_err < 0) + { + $desc .= ' before' ; + } + else + { + $desc .= ' after' ; + } + + $desc = 'head '.$desc.' 
the correct head ' ; + + if ($head_aft_bef_s eq '0') + { + $desc .= '(0' ; + } + elsif ($head_aft_bef_s eq 'e') + { + $desc .= '(the focus word' ; + } + elsif ($head_aft_bef_s eq 'a') + { + $desc .= '(after the focus word' ; + } + elsif ($head_aft_bef_s eq 'b') + { + $desc .= '(before the focus word' ; + } + + if ($head_aft_bef_g ne $head_aft_bef_s) + { + $desc .= ' instead of' ; + if ($head_aft_bef_s eq '0') + { + $desc .= '0' ; + } + elsif ($head_aft_bef_s eq 'e') + { + $desc .= 'the focus word' ; + } + elsif ($head_aft_bef_s eq 'a') + { + $desc .= 'after the focus word' ; + } + elsif ($head_aft_bef_s eq 'b') + { + $desc .= 'before the focus word' ; + } + } + + $desc .= ')' ; + } + + $desc .= ', ' ; + + if ($dep_err eq '-') + { + $desc .= 'correct dependency' ; + } + else + { + ($dep_g, $dep_s) = ($dep_err =~ /^(.*)->(.*)$/) ; + $desc .= sprintf('dependency "%s" instead of "%s"', $dep_s, $dep_g) ; + } + + return($desc) ; + +} # describe_err + +sub get_context +{ # get_context + + my ($sent, $i_w) = @_ ; + my ($w_2, $w_1, $w1, $w2) ; + my ($p_2, $p_1, $p1, $p2) ; + + if ($i_w >= 2) + { + $w_2 = ${${$sent}[$i_w-2]}{word} ; + $p_2 = ${${$sent}[$i_w-2]}{pos} ; + } + else + { + $w_2 = $START ; + $p_2 = $START ; + } + + if ($i_w >= 1) + { + $w_1 = ${${$sent}[$i_w-1]}{word} ; + $p_1 = ${${$sent}[$i_w-1]}{pos} ; + } + else + { + $w_1 = $START ; + $p_1 = $START ; + } + + if ($i_w <= scalar @{$sent}-2) + { + $w1 = ${${$sent}[$i_w+1]}{word} ; + $p1 = ${${$sent}[$i_w+1]}{pos} ; + } + else + { + $w1 = $END ; + $p1 = $END ; + } + + if ($i_w <= scalar @{$sent}-3) + { + $w2 = ${${$sent}[$i_w+2]}{word} ; + $p2 = ${${$sent}[$i_w+2]}{pos} ; + } + else + { + $w2 = $END ; + $p2 = $END ; + } + + return ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) ; + +} # get_context + +sub read_sent +{ # read_sent + + my ($sent_gold, $sent_sys) = @_ ; + my ($line_g, $line_s, $new_sent) ; + my (%fields_g, %fields_s) ; + + $new_sent = 1 ; + + @{$sent_gold} = () ; + @{$sent_sys} = () ; + + while (1) + { # main reading loop + + $line_g = ; + $line_s = ; + + $line_num++ ; + + # system output has fewer lines than gold standard + if ((defined $line_g) && (! defined $line_s)) + { + printf STDERR "line mismatch, line %d:\n", $line_num ; + printf STDERR " gold: %s", $line_g ; + printf STDERR " sys : past end of file\n" ; + exit(1) ; + } + + # system output has more lines than gold standard + if ((! defined $line_g) && (defined $line_s)) + { + printf STDERR "line mismatch, line %d:\n", $line_num ; + printf STDERR " gold: past end of file\n" ; + printf STDERR " sys : %s", $line_s ; + exit(1) ; + } + + # end of file reached for both + if ((! defined $line_g) && (! 
defined $line_s)) + { + return (1) ; + } + + # one contains end of sentence but other one does not + if (($line_g =~ /^\s+$/) != ($line_s =~ /^\s+$/)) + { + printf STDERR "line mismatch, line %d:\n", $line_num ; + printf STDERR " gold: %s", $line_g ; + printf STDERR " sys : %s", $line_s ; + exit(1) ; + } + + # end of sentence reached + if ($line_g =~ /^\s+$/) + { + return(0) ; + } + + # now both lines contain information + + if ($new_sent) + { + $new_sent = 0 ; + } + + # 'official' column names + # options.output = ['id','form','lemma','cpostag','postag', + # 'feats','head','deprel','phead','pdeprel'] + + @fields_g{'word', 'pos', 'head', 'dep'} = (split (/\s+/, $line_g))[1, 3, 6, 7] ; + + push @{$sent_gold}, { %fields_g } ; + + @fields_s{'word', 'pos', 'head', 'dep'} = (split (/\s+/, $line_s))[1, 3, 6, 7] ; + + if (($fields_g{word} ne $fields_s{word}) + || + ($fields_g{pos} ne $fields_s{pos})) + { + printf STDERR "Word/pos mismatch, line %d:\n", $line_num ; + printf STDERR " gold: %s", $line_g ; + printf STDERR " sys : %s", $line_s ; + exit(1) ; + } + + push @{$sent_sys}, { %fields_s } ; + + } # main reading loop + +} # read_sent + +################################################################################ +### main ### +################################################################################ + +our ($opt_g, $opt_s, $opt_o, $opt_h, $opt_v, $opt_q, $opt_p, $opt_b) ; + +my ($sent_num, $eof, $word_num, @err_sent) ; +my (@sent_gold, @sent_sys, @starts) ; +my ($word, $pos, $wp, $head_g, $dep_g, $head_s, $dep_s) ; +my (%counts, $err_head, $err_dep, $con, $con1, $con_pos, $con_pos1, $thresh) ; +my ($head_err, $dep_err, @cur_err, %err_counts, $err_counter, $err_desc) ; +my ($loc_con, %loc_con_err_counts, %err_desc) ; +my ($head_aft_bef_g, $head_aft_bef_s, $head_aft_bef) ; +my ($con_bef, $con_aft, $con_bef_2, $con_aft_2, @bits, @e_bits, @v_con, @v_con_pos) ; +my ($con_pos_bef, $con_pos_aft, $con_pos_bef_2, $con_pos_aft_2) ; +my ($max_word_len, $max_pos_len, $max_con_len, $max_con_pos_len) ; +my ($max_word_spec_len, $max_con_bef_len, $max_con_aft_len) ; +my (%freq_err, $err) ; + +my ($i, $j, $i_w, $l, $n_args) ; +my ($w_2, $w_1, $w1, $w2) ; +my ($wp_2, $wp_1, $wp1, $wp2) ; +my ($p_2, $p_1, $p1, $p2) ; + +my ($short_output) ; +my ($score_on_punct) ; +$counts{punct} = 0; # initialize + +getopts("g:o:s:qvhpb") ; + +if (defined $opt_v) +{ + my $id = '$Id: eval.pl,v 1.9 2006/05/09 20:30:01 yuval Exp $'; + my @parts = split ' ',$id; + print "Version $parts[2]\n"; + exit(0); +} + +if ((defined $opt_h) || ((! defined $opt_g) && (! defined $opt_s))) +{ + die $usage ; +} + +if (! defined $opt_g) +{ + die "Gold standard file (-g) missing\n" ; +} + +if (! defined $opt_s) +{ + die "System output file (-s) missing\n" ; +} + +if (! 
defined $opt_o) +{ + $opt_o = '-' ; +} + +if (defined $opt_q) +{ + $short_output = 1 ; +} else { + $short_output = 0 ; +} + +if (defined $opt_p) +{ + $score_on_punct = 1 ; +} else { + $score_on_punct = 0 ; +} + +$line_num = 0 ; +$sent_num = 0 ; +$eof = 0 ; + +@err_sent = () ; +@starts = () ; + +%{$err_sent[0]} = () ; + +$max_pos_len = length('CPOS') ; + +################################################################################ +### reading input ### +################################################################################ + +open (GOLD, "<$opt_g") || die "Could not open gold standard file $opt_g\n" ; +open (SYS, "<$opt_s") || die "Could not open system output file $opt_s\n" ; +open (OUT, ">$opt_o") || die "Could not open output file $opt_o\n" ; + + +if (defined $opt_b) { # produce output similar to evalb + print OUT " Sent. Attachment Correct Scoring \n"; + print OUT " ID Tokens - Unlab. Lab. HEAD HEAD+DEPREL tokens - - - -\n"; + print OUT " ============================================================================\n"; +} + + +while (! $eof) +{ # main reading loop + + $starts[$sent_num] = $line_num+1 ; + $eof = read_sent(\@sent_gold, \@sent_sys) ; + + $sent_num++ ; + + %{$err_sent[$sent_num]} = () ; + $word_num = scalar @sent_gold ; + + # for accuracy per sentence + my %sent_counts = ( tot => 0, + err_any => 0, + err_head => 0 + ); + + # printf "$sent_num $word_num\n" ; + + my @frames_g = ('** '); # the initial frame for the virtual root + my @frames_s = ('** '); # the initial frame for the virtual root + foreach $i_w (0 .. $word_num-1) + { # loop on words + push @frames_g, ''; # initialize + push @frames_s, ''; # initialize + } + + foreach $i_w (0 .. $word_num-1) + { # loop on words + + ($word, $pos, $head_g, $dep_g) + = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ; + $wp = $word.' / '.$pos ; + + # printf "%d: %s %s %s %s\n", $i_w, $word, $pos, $head_g, $dep_g ; + + if ((! $score_on_punct) && is_uni_punct($word)) + { + $counts{punct}++ ; + # ignore punctuations + next ; + } + + if (length($pos) > $max_pos_len) + { + $max_pos_len = length($pos) ; + } + + ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ; + + $counts{tot}++ ; + $counts{word}{$wp}{tot}++ ; + $counts{pos}{$pos}{tot}++ ; + $counts{head}{$head_g-$i_w-1}{tot}++ ; + + # for frame confusions + # add child to frame of parent + $frames_g[$head_g] .= "$dep_g "; + $frames_s[$head_s] .= "$dep_s "; + # add to frame of token itself + $frames_g[$i_w+1] .= "*$dep_g* "; # $i_w+1 because $i_w starts counting at zero + $frames_s[$i_w+1] .= "*$dep_g* "; + + # for precision and recall of DEPREL + $counts{dep}{$dep_g}{tot}++ ; # counts for gold standard deprels + $counts{dep2}{$dep_g}{$dep_s}++ ; # counts for confusions + $counts{dep_s}{$dep_s}{tot}++ ; # counts for system deprels + $counts{all_dep}{$dep_g} = 1 ; # list of all deprels that occur ... + $counts{all_dep}{$dep_s} = 1 ; # ... 
in either gold or system output + + # for precision and recall of HEAD direction + my $dir_g; + if ($head_g == 0) { + $dir_g = 'to_root'; + } elsif ($head_g < $i_w+1) { # $i_w+1 because $i_w starts counting at zero + # also below + $dir_g = 'left'; + } elsif ($head_g > $i_w+1) { + $dir_g = 'right'; + } else { + # token links to itself; should never happen in correct gold standard + $dir_g = 'self'; + } + my $dir_s; + if ($head_s == 0) { + $dir_s = 'to_root'; + } elsif ($head_s < $i_w+1) { + $dir_s = 'left'; + } elsif ($head_s > $i_w+1) { + $dir_s = 'right'; + } else { + # token links to itself; should not happen in good system + # (but not forbidden in shared task) + $dir_s = 'self'; + } + $counts{dir_g}{$dir_g}{tot}++ ; # counts for gold standard head direction + $counts{dir2}{$dir_g}{$dir_s}++ ; # counts for confusions + $counts{dir_s}{$dir_s}{tot}++ ; # counts for system head direction + + # for precision and recall of HEAD distance + my $dist_g; + if ($head_g == 0) { + $dist_g = 'to_root'; + } elsif ( abs($head_g - ($i_w+1)) <= 1 ) { + $dist_g = '1'; # includes the 'self' cases + } elsif ( abs($head_g - ($i_w+1)) <= 2 ) { + $dist_g = '2'; + } elsif ( abs($head_g - ($i_w+1)) <= 6 ) { + $dist_g = '3-6'; + } else { + $dist_g = '7-...'; + } + my $dist_s; + if ($head_s == 0) { + $dist_s = 'to_root'; + } elsif ( abs($head_s - ($i_w+1)) <= 1 ) { + $dist_s = '1'; # includes the 'self' cases + } elsif ( abs($head_s - ($i_w+1)) <= 2 ) { + $dist_s = '2'; + } elsif ( abs($head_s - ($i_w+1)) <= 6 ) { + $dist_s = '3-6'; + } else { + $dist_s = '7-...'; + } + $counts{dist_g}{$dist_g}{tot}++ ; # counts for gold standard head distance + $counts{dist2}{$dist_g}{$dist_s}++ ; # counts for confusions + $counts{dist_s}{$dist_s}{tot}++ ; # counts for system head distance + + + $err_head = ($head_g ne $head_s) ; # error in head + $err_dep = ($dep_g ne $dep_s) ; # error in deprel + + $head_err = '-' ; + $dep_err = '-' ; + + # for accuracy per sentence + $sent_counts{tot}++ ; + if ($err_dep || $err_head) { + $sent_counts{err_any}++ ; + } + if ($err_head) { + $sent_counts{err_head}++ ; + } + + # total counts and counts for CPOS involved in errors + + if ($head_g eq '0') + { + $head_aft_bef_g = '0' ; + } + elsif ($head_g eq $i_w+1) + { + $head_aft_bef_g = 'e' ; + } + else + { + $head_aft_bef_g = ($head_g <= $i_w+1 ? 'b' : 'a') ; + } + + if ($head_s eq '0') + { + $head_aft_bef_s = '0' ; + } + elsif ($head_s eq $i_w+1) + { + $head_aft_bef_s = 'e' ; + } + else + { + $head_aft_bef_s = ($head_s <= $i_w+1 ? 'b' : 'a') ; + } + + $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ; + + if ($err_head) + { + if ($head_aft_bef_s eq '0') + { + $head_err = 0 ; + } + else + { + $head_err = $head_s-$head_g ; + } + + $err_sent[$sent_num]{head}++ ; + $counts{err_head}{tot}++ ; + $counts{err_head}{$head_err}++ ; + + $counts{word}{err_head}{$wp}++ ; + $counts{pos}{$pos}{err_head}{tot}++ ; + $counts{pos}{$pos}{err_head}{$head_err}++ ; + } + + if ($err_dep) + { + $dep_err = $dep_g.'->'.$dep_s ; + $err_sent[$sent_num]{dep}++ ; + $counts{err_dep}{tot}++ ; + $counts{err_dep}{$dep_err}++ ; + + $counts{word}{err_dep}{$wp}++ ; + $counts{pos}{$pos}{err_dep}{tot}++ ; + $counts{pos}{$pos}{err_dep}{$dep_err}++ ; + + if ($err_head) + { + $counts{err_both}++ ; + $counts{pos}{$pos}{err_both}++ ; + } + } + + ### DEPREL + ATTACHMENT + if ((!$err_dep) && ($err_head)) { + $counts{err_head_corr_dep}{tot}++ ; + $counts{err_head_corr_dep}{$dep_s}++ ; + } + ### DEPREL + ATTACHMENT + + # counts for words involved in errors + + if (! 
($err_head || $err_dep)) + { + next ; + } + + $err_sent[$sent_num]{word}++ ; + $counts{err_any}++ ; + $counts{word}{err_any}{$wp}++ ; + $counts{pos}{$pos}{err_any}++ ; + + ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\@sent_gold, $i_w) ; + + if ($w_2 ne $START) + { + $wp_2 = $w_2.' / '.$p_2 ; + } + else + { + $wp_2 = $w_2 ; + } + + if ($w_1 ne $START) + { + $wp_1 = $w_1.' / '.$p_1 ; + } + else + { + $wp_1 = $w_1 ; + } + + if ($w1 ne $END) + { + $wp1 = $w1.' / '.$p1 ; + } + else + { + $wp1 = $w1 ; + } + + if ($w2 ne $END) + { + $wp2 = $w2.' / '.$p2 ; + } + else + { + $wp2 = $w2 ; + } + + $con_bef = $wp_1 ; + $con_bef_2 = $wp_2.' + '.$wp_1 ; + $con_aft = $wp1 ; + $con_aft_2 = $wp1.' + '.$wp2 ; + + $con_pos_bef = $p_1 ; + $con_pos_bef_2 = $p_2.'+'.$p_1 ; + $con_pos_aft = $p1 ; + $con_pos_aft_2 = $p1.'+'.$p2 ; + + if ($w_1 ne $START) + { + # do not count '.S' as a word context + $counts{con_bef_2}{tot}{$con_bef_2}++ ; + $counts{con_bef_2}{err_head}{$con_bef_2} += $err_head ; + $counts{con_bef_2}{err_dep}{$con_bef_2} += $err_dep ; + $counts{con_bef}{tot}{$con_bef}++ ; + $counts{con_bef}{err_head}{$con_bef} += $err_head ; + $counts{con_bef}{err_dep}{$con_bef} += $err_dep ; + } + + if ($w1 ne $END) + { + # do not count '.E' as a word context + $counts{con_aft_2}{tot}{$con_aft_2}++ ; + $counts{con_aft_2}{err_head}{$con_aft_2} += $err_head ; + $counts{con_aft_2}{err_dep}{$con_aft_2} += $err_dep ; + $counts{con_aft}{tot}{$con_aft}++ ; + $counts{con_aft}{err_head}{$con_aft} += $err_head ; + $counts{con_aft}{err_dep}{$con_aft} += $err_dep ; + } + + $counts{con_pos_bef_2}{tot}{$con_pos_bef_2}++ ; + $counts{con_pos_bef_2}{err_head}{$con_pos_bef_2} += $err_head ; + $counts{con_pos_bef_2}{err_dep}{$con_pos_bef_2} += $err_dep ; + $counts{con_pos_bef}{tot}{$con_pos_bef}++ ; + $counts{con_pos_bef}{err_head}{$con_pos_bef} += $err_head ; + $counts{con_pos_bef}{err_dep}{$con_pos_bef} += $err_dep ; + + $counts{con_pos_aft_2}{tot}{$con_pos_aft_2}++ ; + $counts{con_pos_aft_2}{err_head}{$con_pos_aft_2} += $err_head ; + $counts{con_pos_aft_2}{err_dep}{$con_pos_aft_2} += $err_dep ; + $counts{con_pos_aft}{tot}{$con_pos_aft}++ ; + $counts{con_pos_aft}{err_head}{$con_pos_aft} += $err_head ; + $counts{con_pos_aft}{err_dep}{$con_pos_aft} += $err_dep ; + + $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ; + $freq_err{$err}++ ; + + } # loop on words + + foreach $i_w (0 .. 
$word_num) # including one for the virtual root + { # loop on words + if ($frames_g[$i_w] ne $frames_s[$i_w]) { + $counts{frame2}{"$frames_g[$i_w]/ $frames_s[$i_w]"}++ ; + } + } + + if (defined $opt_b) { # produce output similar to evalb + if ($word_num > 0) { + my ($unlabeled,$labeled) = ('NaN', 'NaN'); + if ($sent_counts{tot} > 0) { # there are scoring tokens + $unlabeled = 100-$sent_counts{err_head}*100.0/$sent_counts{tot}; + $labeled = 100-$sent_counts{err_any} *100.0/$sent_counts{tot}; + } + printf OUT " %4d %4d 0 %6.2f %6.2f %4d %4d %4d 0 0 0 0\n", + $sent_num, $word_num, + $unlabeled, $labeled, + $sent_counts{tot}-$sent_counts{err_head}, + $sent_counts{tot}-$sent_counts{err_any}, + $sent_counts{tot},; + } + } + +} # main reading loop + +################################################################################ +### printing output ### +################################################################################ + +if (defined $opt_b) { # produce output similar to evalb + print OUT "\n\n"; +} +printf OUT " Labeled attachment score: %d / %d * 100 = %.2f %%\n", + $counts{tot}-$counts{err_any}, $counts{tot}, 100-$counts{err_any}*100.0/$counts{tot} ; +printf OUT " Unlabeled attachment score: %d / %d * 100 = %.2f %%\n", + $counts{tot}-$counts{err_head}{tot}, $counts{tot}, 100-$counts{err_head}{tot}*100.0/$counts{tot} ; +printf OUT " Label accuracy score: %d / %d * 100 = %.2f %%\n", + $counts{tot}-$counts{err_dep}{tot}, $counts{tot}, 100-$counts{err_dep}{tot}*100.0/$counts{tot} ; + +if ($short_output) +{ + exit(0) ; +} +printf OUT "\n %s\n\n", '=' x 80 ; +printf OUT " Evaluation of the results in %s\n vs. gold standard %s:\n\n", $opt_s, $opt_g ; + +printf OUT " Legend: '%s' - the beginning of a sentence, '%s' - the end of a sentence\n\n", $START, $END ; + +printf OUT " Number of non-scoring tokens: $counts{punct}\n\n"; + +printf OUT " The overall accuracy and its distribution over CPOSTAGs\n\n" ; +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT " %-10s | %-5s | %-5s | %% | %-5s | %% | %-5s | %%\n", + 'Accuracy', 'words', 'right', 'right', 'both' ; +printf OUT " %-10s | %-5s | %-5s | | %-5s | | %-5s |\n", + ' ', ' ', 'head', ' dep', 'right' ; + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n", + 'total', $counts{tot}, + $counts{tot}-$counts{err_head}{tot}, 100-$counts{err_head}{tot}*100.0/$counts{tot}, + $counts{tot}-$counts{err_dep}{tot}, 100-$counts{err_dep}{tot}*100.0/$counts{tot}, + $counts{tot}-$counts{err_any}, 100-$counts{err_any}*100.0/$counts{tot} ; + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}}) +{ + if (! defined($counts{pos}{$pos}{err_head}{tot})) + { + $counts{pos}{$pos}{err_head}{tot} = 0 ; + } + if (! defined($counts{pos}{$pos}{err_dep}{tot})) + { + $counts{pos}{$pos}{err_dep}{tot} = 0 ; + } + if (! 
defined($counts{pos}{$pos}{err_any})) + { + $counts{pos}{$pos}{err_any} = 0 ; + } + + printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n", + $pos, $counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_head}{tot}, 100-$counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_dep}{tot}, 100-$counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_any}, 100-$counts{pos}{$pos}{err_any}*100.0/$counts{pos}{$pos}{tot} ; +} + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT "\n\n" ; + +printf OUT " The overall error rate and its distribution over CPOSTAGs\n\n" ; +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT " %-10s | %-5s | %-5s | %% | %-5s | %% | %-5s | %%\n", + 'Error', 'words', 'head', ' dep', 'both' ; +printf OUT " %-10s | %-5s | %-5s | | %-5s | | %-5s |\n", + + 'Rate', ' ', 'err', ' err', 'wrong' ; + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n", + 'total', $counts{tot}, + $counts{err_head}{tot}, $counts{err_head}{tot}*100.0/$counts{tot}, + $counts{err_dep}{tot}, $counts{err_dep}{tot}*100.0/$counts{tot}, + $counts{err_both}, $counts{err_both}*100.0/$counts{tot} ; + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}}) +{ + if (! defined($counts{pos}{$pos}{err_both})) + { + $counts{pos}{$pos}{err_both} = 0 ; + } + + printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n", + $pos, $counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{err_head}{tot}, $counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{err_dep}{tot}, $counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{err_both}, $counts{pos}{$pos}{err_both}*100.0/$counts{pos}{$pos}{tot} ; + +} + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +### added by Sabine Buchholz +printf OUT "\n\n"; +printf OUT " Precision and recall of DEPREL\n\n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +printf OUT " deprel | gold | correct | system | recall (%%) | precision (%%) \n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +foreach my $dep (sort keys %{$counts{all_dep}}) { + # initialize + my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN'); + + if (defined($counts{dep2}{$dep}{$dep})) { + $tot_corr = $counts{dep2}{$dep}{$dep}; + } + if (defined($counts{dep}{$dep}{tot})) { + $tot_g = $counts{dep}{$dep}{tot}; + $rec = sprintf("%.2f",$tot_corr / $tot_g * 100); + } + if (defined($counts{dep_s}{$dep}{tot})) { + $tot_s = $counts{dep_s}{$dep}{tot}; + $prec = sprintf("%.2f",$tot_corr / $tot_s * 100); + } + printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n", + $dep, $tot_g, $tot_corr, $tot_s, $rec, $prec; +} + +### DEPREL + ATTACHMENT: +### Same as Sabine's DEPREL apart from $tot_corr calculation +printf OUT "\n\n"; +printf OUT " Precision and recall of DEPREL + ATTACHMENT\n\n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +printf OUT " deprel | gold | correct | system | 
recall (%%) | precision (%%) \n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +foreach my $dep (sort keys %{$counts{all_dep}}) { + # initialize + my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN'); + + if (defined($counts{dep2}{$dep}{$dep})) { + if (defined($counts{err_head_corr_dep}{$dep})) { + $tot_corr = $counts{dep2}{$dep}{$dep} - $counts{err_head_corr_dep}{$dep}; + } else { + $tot_corr = $counts{dep2}{$dep}{$dep}; + } + } + if (defined($counts{dep}{$dep}{tot})) { + $tot_g = $counts{dep}{$dep}{tot}; + $rec = sprintf("%.2f",$tot_corr / $tot_g * 100); + } + if (defined($counts{dep_s}{$dep}{tot})) { + $tot_s = $counts{dep_s}{$dep}{tot}; + $prec = sprintf("%.2f",$tot_corr / $tot_s * 100); + } + printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n", + $dep, $tot_g, $tot_corr, $tot_s, $rec, $prec; +} +### DEPREL + ATTACHMENT + +printf OUT "\n\n"; +printf OUT " Precision and recall of binned HEAD direction\n\n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +printf OUT " direction | gold | correct | system | recall (%%) | precision (%%) \n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +foreach my $dir ('to_root', 'left', 'right', 'self') { + # initialize + my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN'); + + if (defined($counts{dir2}{$dir}{$dir})) { + $tot_corr = $counts{dir2}{$dir}{$dir}; + } + if (defined($counts{dir_g}{$dir}{tot})) { + $tot_g = $counts{dir_g}{$dir}{tot}; + $rec = sprintf("%.2f",$tot_corr / $tot_g * 100); + } + if (defined($counts{dir_s}{$dir}{tot})) { + $tot_s = $counts{dir_s}{$dir}{tot}; + $prec = sprintf("%.2f",$tot_corr / $tot_s * 100); + } + printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n", + $dir, $tot_g, $tot_corr, $tot_s, $rec, $prec; +} + +printf OUT "\n\n"; +printf OUT " Precision and recall of binned HEAD distance\n\n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +printf OUT " distance | gold | correct | system | recall (%%) | precision (%%) \n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +foreach my $dist ('to_root', '1', '2', '3-6', '7-...') { + # initialize + my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN'); + + if (defined($counts{dist2}{$dist}{$dist})) { + $tot_corr = $counts{dist2}{$dist}{$dist}; + } + if (defined($counts{dist_g}{$dist}{tot})) { + $tot_g = $counts{dist_g}{$dist}{tot}; + $rec = sprintf("%.2f",$tot_corr / $tot_g * 100); + } + if (defined($counts{dist_s}{$dist}{tot})) { + $tot_s = $counts{dist_s}{$dist}{tot}; + $prec = sprintf("%.2f",$tot_corr / $tot_s * 100); + } + printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n", + $dist, $tot_g, $tot_corr, $tot_s, $rec, $prec; +} + +printf OUT "\n\n"; +printf OUT " Frame confusions (gold versus system; *...* marks the head token)\n\n"; +foreach my $frame (sort {$counts{frame2}{$b} <=> $counts{frame2}{$a}} keys %{$counts{frame2}}) +{ + if ($counts{frame2}{$frame} >= 5) # (make 5 a changeable threshold later) + { + printf OUT " %3d %s\n", $counts{frame2}{$frame}, $frame; + } +} +### end of: added by Sabine Buchholz + + +# +# Leave only the 5 words mostly involved in errors +# + + +$thresh = (sort {$b <=> $a} values %{$counts{word}{err_any}})[4] ; + +# ensure enough space for title +$max_word_len = length('word') ; + +foreach $word (keys %{$counts{word}{err_any}}) +{ + if ($counts{word}{err_any}{$word} < $thresh) + 
{ + delete $counts{word}{err_any}{$word} ; + next ; + } + + $l = uni_len($word) ; + if ($l > $max_word_len) + { + $max_word_len = $l ; + } +} + +# filter a case when the difference between the error counts +# for 2-word and 1-word contexts is small +# (leave the 2-word context) + +foreach $con (keys %{$counts{con_aft_2}{tot}}) +{ + ($w1) = split(/\+/, $con) ; + + if (defined $counts{con_aft}{tot}{$w1} && + $counts{con_aft}{tot}{$w1}-$counts{con_aft_2}{tot}{$con} <= 1) + { + delete $counts{con_aft}{tot}{$w1} ; + } +} + +foreach $con (keys %{$counts{con_bef_2}{tot}}) +{ + ($w_2, $w_1) = split(/\+/, $con) ; + + if (defined $counts{con_bef}{tot}{$w_1} && + $counts{con_bef}{tot}{$w_1}-$counts{con_bef_2}{tot}{$con} <= 1) + { + delete $counts{con_bef}{tot}{$w_1} ; + } +} + +foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}}) +{ + ($p1) = split(/\+/, $con_pos) ; + + if (defined($counts{con_pos_aft}{tot}{$p1}) && + $counts{con_pos_aft}{tot}{$p1}-$counts{con_pos_aft_2}{tot}{$con_pos} <= 1) + { + delete $counts{con_pos_aft}{tot}{$p1} ; + } +} + +foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}}) +{ + ($p_2, $p_1) = split(/\+/, $con_pos) ; + + if (defined($counts{con_pos_bef}{tot}{$p_1}) && + $counts{con_pos_bef}{tot}{$p_1}-$counts{con_pos_bef_2}{tot}{$con_pos} <= 1) + { + delete $counts{con_pos_bef}{tot}{$p_1} ; + } +} + +# for each context type, take the three contexts most involved in errors + +$max_con_len = 0 ; + +filter_context_counts($counts{con_bef_2}{tot}, $con_err_num, \$max_con_len) ; + +filter_context_counts($counts{con_bef}{tot}, $con_err_num, \$max_con_len) ; + +filter_context_counts($counts{con_aft}{tot}, $con_err_num, \$max_con_len) ; + +filter_context_counts($counts{con_aft_2}{tot}, $con_err_num, \$max_con_len) ; + +# for each CPOS context type, take the three CPOS contexts most involved in errors + +$max_con_pos_len = 0 ; + +$thresh = (sort {$b <=> $a} values %{$counts{con_pos_bef_2}{tot}})[$con_err_num-1] ; + +foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}}) +{ + if ($counts{con_pos_bef_2}{tot}{$con_pos} < $thresh) + { + delete $counts{con_pos_bef_2}{tot}{$con_pos} ; + next ; + } + if (length($con_pos) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos) ; + } +} + +$thresh = (sort {$b <=> $a} values %{$counts{con_pos_bef}{tot}})[$con_err_num-1] ; + +foreach $con_pos (keys %{$counts{con_pos_bef}{tot}}) +{ + if ($counts{con_pos_bef}{tot}{$con_pos} < $thresh) + { + delete $counts{con_pos_bef}{tot}{$con_pos} ; + next ; + } + if (length($con_pos) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos) ; + } +} + +$thresh = (sort {$b <=> $a} values %{$counts{con_pos_aft}{tot}})[$con_err_num-1] ; + +foreach $con_pos (keys %{$counts{con_pos_aft}{tot}}) +{ + if ($counts{con_pos_aft}{tot}{$con_pos} < $thresh) + { + delete $counts{con_pos_aft}{tot}{$con_pos} ; + next ; + } + if (length($con_pos) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos) ; + } +} + +$thresh = (sort {$b <=> $a} values %{$counts{con_pos_aft_2}{tot}})[$con_err_num-1] ; + +foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}}) +{ + if ($counts{con_pos_aft_2}{tot}{$con_pos} < $thresh) + { + delete $counts{con_pos_aft_2}{tot}{$con_pos} ; + next ; + } + if (length($con_pos) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos) ; + } +} + +# printing + +# ------------- focus words + +printf OUT "\n\n" ; +printf OUT " %d focus words where most of the errors occur:\n\n", scalar keys %{$counts{word}{err_any}} ; + +printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s\n", $max_word_len, ' ', 
'any', 'head', 'dep', 'both' ; +printf OUT " %s-+------+------+------+------\n", '-' x $max_word_len; + +foreach $word (sort {$counts{word}{err_any}{$b} <=> $counts{word}{err_any}{$a}} keys %{$counts{word}{err_any}}) +{ + if (!defined($counts{word}{err_head}{$word})) + { + $counts{word}{err_head}{$word} = 0 ; + } + if (! defined($counts{word}{err_dep}{$word})) + { + $counts{word}{err_dep}{$word} = 0 ; + } + if (! defined($counts{word}{err_any}{$word})) + { + $counts{word}{err_any}{$word} = 0; + } + printf OUT " %-*s | %4d | %4d | %4d | %4d\n", + $max_word_len+length($word)-uni_len($word), $word, $counts{word}{err_any}{$word}, + $counts{word}{err_head}{$word}, + $counts{word}{err_dep}{$word}, + $counts{word}{err_dep}{$word}+$counts{word}{err_head}{$word}-$counts{word}{err_any}{$word} ; +} + +printf OUT " %s-+------+------+------+------\n", '-' x $max_word_len; + +# ------------- contexts + +printf OUT "\n\n" ; + +printf OUT " one-token preceeding contexts where most of the errors occur:\n\n" ; + +print_context($counts{con_bef}, $counts{con_pos_bef}, $max_con_len, $max_con_pos_len) ; + +printf OUT " two-token preceeding contexts where most of the errors occur:\n\n" ; + +print_context($counts{con_bef_2}, $counts{con_pos_bef_2}, $max_con_len, $max_con_pos_len) ; + +printf OUT " one-token following contexts where most of the errors occur:\n\n" ; + +print_context($counts{con_aft}, $counts{con_pos_aft}, $max_con_len, $max_con_pos_len) ; + +printf OUT " two-token following contexts where most of the errors occur:\n\n" ; + +print_context($counts{con_aft_2}, $counts{con_pos_aft_2}, $max_con_len, $max_con_pos_len) ; + +# ------------- Sentences + +printf OUT " Sentence with the highest number of word errors:\n" ; +$i = (sort { (defined($err_sent[$b]{word}) && $err_sent[$b]{word}) + <=> (defined($err_sent[$a]{word}) && $err_sent[$a]{word}) } 1 .. $sent_num)[0] ; +printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ; +printf OUT "%d head errors, %d dependency errors, %d word errors\n", + $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ; + +printf OUT "\n\n" ; + +printf OUT " Sentence with the highest number of head errors:\n" ; +$i = (sort { (defined($err_sent[$b]{head}) && $err_sent[$b]{head}) + <=> (defined($err_sent[$a]{head}) && $err_sent[$a]{head}) } 1 .. $sent_num)[0] ; +printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ; +printf OUT "%d head errors, %d dependency errors, %d word errors\n", + $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ; + +printf OUT "\n\n" ; + +printf OUT " Sentence with the highest number of dependency errors:\n" ; +$i = (sort { (defined($err_sent[$b]{dep}) && $err_sent[$b]{dep}) + <=> (defined($err_sent[$a]{dep}) && $err_sent[$a]{dep}) } 1 .. $sent_num)[0] ; +printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ; +printf OUT "%d head errors, %d dependency errors, %d word errors\n", + $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ; + +# +# Second pass, collect statistics of the frequent errors +# + +# filter the errors, leave the most frequent $freq_err_num errors + +$i = 0 ; + +$thresh = (sort {$b <=> $a} values %freq_err)[$freq_err_num-1] ; + +foreach $err (keys %freq_err) +{ + if ($freq_err{$err} < $thresh) + { + delete $freq_err{$err} ; + } +} + +# in case there are several errors with the threshold count + +$freq_err_num = scalar keys %freq_err ; + +%err_counts = () ; + +$eof = 0 ; + +seek (GOLD, 0, 0) ; +seek (SYS, 0, 0) ; + +while (! 
$eof) +{ # second reading loop + + $eof = read_sent(\@sent_gold, \@sent_sys) ; + $sent_num++ ; + + $word_num = scalar @sent_gold ; + + # printf "$sent_num $word_num\n" ; + + foreach $i_w (0 .. $word_num-1) + { # loop on words + ($word, $pos, $head_g, $dep_g) + = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ; + + # printf "%d: %s %s %s %s\n", $i_w, $word, $pos, $head_g, $dep_g ; + + if ((! $score_on_punct) && is_uni_punct($word)) + { + # ignore punctuations + next ; + } + + ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ; + + $err_head = ($head_g ne $head_s) ; + $err_dep = ($dep_g ne $dep_s) ; + + $head_err = '-' ; + $dep_err = '-' ; + + if ($head_g eq '0') + { + $head_aft_bef_g = '0' ; + } + elsif ($head_g eq $i_w+1) + { + $head_aft_bef_g = 'e' ; + } + else + { + $head_aft_bef_g = ($head_g <= $i_w+1 ? 'b' : 'a') ; + } + + if ($head_s eq '0') + { + $head_aft_bef_s = '0' ; + } + elsif ($head_s eq $i_w+1) + { + $head_aft_bef_s = 'e' ; + } + else + { + $head_aft_bef_s = ($head_s <= $i_w+1 ? 'b' : 'a') ; + } + + $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ; + + if ($err_head) + { + if ($head_aft_bef_s eq '0') + { + $head_err = 0 ; + } + else + { + $head_err = $head_s-$head_g ; + } + } + + if ($err_dep) + { + $dep_err = $dep_g.'->'.$dep_s ; + } + + if (! ($err_head || $err_dep)) + { + next ; + } + + # handle only the most frequent errors + + $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ; + + if (! exists $freq_err{$err}) + { + next ; + } + + ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\@sent_gold, $i_w) ; + + $con_bef = $w_1 ; + $con_bef_2 = $w_2.' + '.$w_1 ; + $con_aft = $w1 ; + $con_aft_2 = $w1.' + '.$w2 ; + + $con_pos_bef = $p_1 ; + $con_pos_bef_2 = $p_2.'+'.$p_1 ; + $con_pos_aft = $p1 ; + $con_pos_aft_2 = $p1.'+'.$p2 ; + + @cur_err = ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) ; + + # printf "# %-25s %-15s %-10s %-25s %-3s %-30s\n", + # $con_bef, $word, $pos, $con_aft, $head_err, $dep_err ; + + @bits = (0, 0, 0, 0, 0, 0) ; + $j = 0 ; + + while ($j == 0) + { + for ($i = 0; $i <= $#bits; $i++) + { + if ($bits[$i] == 0) + { + $bits[$i] = 1 ; + $j = 0 ; + last ; + } + else + { + $bits[$i] = 0 ; + $j = 1 ; + } + } + + @e_bits = @cur_err ; + + for ($i = 0; $i <= $#bits; $i++) + { + if (! $bits[$i]) + { + $e_bits[$i] = '*' ; + } + } + + # include also the last case which is the most general + # (wildcards for everything) + $err_counts{$err}{join($sep, @e_bits)}++ ; + + } + + } # loop on words +} # second reading loop + +printf OUT "\n\n" ; +printf OUT " Specific errors, %d most frequent errors:", $freq_err_num ; +printf OUT "\n %s\n", '=' x 41 ; + + +# deleting local contexts which are too general + +foreach $err (keys %err_counts) +{ + foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}} + keys %{$err_counts{$err}}) + { + @cur_err = split(/\Q$sep\E/, $loc_con) ; + + # In this loop, one or two elements of the local context are + # replaced with '*' to make it more general. If the entry for + # the general context has the same count it is removed. + + foreach $i (0 .. 
$#cur_err) + { + $w1 = $cur_err[$i] ; + if ($cur_err[$i] eq '*') + { + next ; + } + $cur_err[$i] = '*' ; + $con1 = join($sep, @cur_err) ; + if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con}) + && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con})) + { + delete $err_counts{$err}{$con1} ; + } + for ($j = $i+1; $j <=$#cur_err; $j++) + { + if ($cur_err[$j] eq '*') + { + next ; + } + $w2 = $cur_err[$j] ; + $cur_err[$j] = '*' ; + $con1 = join($sep, @cur_err) ; + if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con}) + && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con})) + { + delete $err_counts{$err}{$con1} ; + } + $cur_err[$j] = $w2 ; + } + $cur_err[$i] = $w1 ; + } + } +} + +# Leaving only the topmost local contexts for each error + +foreach $err (keys %err_counts) +{ + $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[$spec_err_loc_con-1] || 0 ; + + # of the threshold is too low, take the 2nd highest count + # (the highest may be the total which is the generic case + # and not relevant for printing) + + if ($thresh < 5) + { + $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[1] ; + } + + foreach $loc_con (keys %{$err_counts{$err}}) + { + if ($err_counts{$err}{$loc_con} < $thresh) + { + delete $err_counts{$err}{$loc_con} ; + } + else + { + if ($loc_con ne join($sep, ('*', '*', '*', '*', '*', '*'))) + { + $loc_con_err_counts{$loc_con}{$err} = $err_counts{$err}{$loc_con} ; + } + } + } +} + +# printing an error summary + +# calculating the context field length + +$max_word_spec_len= length('word') ; +$max_con_aft_len = length('word') ; +$max_con_bef_len = length('word') ; +$max_con_pos_len = length('CPOS') ; + +foreach $err (keys %err_counts) +{ + foreach $loc_con (sort keys %{$err_counts{$err}}) + { + ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) = + split(/\Q$sep\E/, $loc_con) ; + + $l = uni_len($word) ; + if ($l > $max_word_spec_len) + { + $max_word_spec_len = $l ; + } + + $l = uni_len($con_bef) ; + if ($l > $max_con_bef_len) + { + $max_con_bef_len = $l ; + } + + $l = uni_len($con_aft) ; + if ($l > $max_con_aft_len) + { + $max_con_aft_len = $l ; + } + + if (length($con_pos_aft) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos_aft) ; + } + + if (length($con_pos_bef) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos_bef) ; + } + } +} + +$err_counter = 0 ; + +foreach $err (sort {$freq_err{$b} <=> $freq_err{$a}} keys %freq_err) +{ + + ($head_err, $head_aft_bef, $dep_err) = split(/\Q$sep\E/, $err) ; + + $err_counter++ ; + $err_desc{$err} = sprintf("%2d. ", $err_counter). 
+ describe_err($head_err, $head_aft_bef, $dep_err) ; + + # printf OUT " %-3s %-30s %d\n", $head_err, $dep_err, $freq_err{$err} ; + printf OUT "\n" ; + printf OUT " %s : %d times\n", $err_desc{$err}, $freq_err{$err} ; + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + printf OUT " %-*s | %-*s | %-*s | %s\n", + $max_con_pos_len+$max_con_bef_len+3, ' Before', + $max_word_spec_len+$max_pos_len+3, ' Focus', + $max_con_pos_len+$max_con_aft_len+3, ' After', + 'Count' ; + + printf OUT " %-*s %-*s | %-*s %-*s | %-*s %-*s |\n", + $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word', + $max_pos_len, 'CPOS', $max_word_spec_len, 'word', + $max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ; + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}} + keys %{$err_counts{$err}}) + { + if ($loc_con eq join($sep, ('*', '*', '*', '*', '*', '*'))) + { + next ; + } + + $con1 = $loc_con ; + $con1 =~ s/\*/ /g ; + + ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) = + split(/\Q$sep\E/, $con1) ; + + printf OUT " %-*s | %-*s | %-*s | %-*s | %-*s | %-*s | %3d\n", + $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef, + $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word, + $max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft, + $err_counts{$err}{$loc_con} ; + } + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + +} + +printf OUT "\n\n" ; +printf OUT " Local contexts involved in several frequent errors:" ; +printf OUT "\n %s\n", '=' x 51 ; +printf OUT "\n\n" ; + +foreach $loc_con (sort {scalar keys %{$loc_con_err_counts{$b}} <=> + scalar keys %{$loc_con_err_counts{$a}}} + keys %loc_con_err_counts) +{ + + if (scalar keys %{$loc_con_err_counts{$loc_con}} == 1) + { + next ; + } + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + printf OUT " %-*s | %-*s | %-*s \n", + $max_con_pos_len+$max_con_bef_len+3, ' Before', + $max_word_spec_len+$max_pos_len+3, ' Focus', + $max_con_pos_len+$max_con_aft_len+3, ' After' ; + + printf OUT " %-*s %-*s | %-*s %-*s | %-*s %-*s \n", + $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word', + $max_pos_len, 'CPOS', $max_word_spec_len, 'word', + $max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ; + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + $con1 = $loc_con ; + $con1 =~ s/\*/ /g ; + + ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) = + split(/\Q$sep\E/, $con1) ; + + printf OUT " %-*s | %-*s | %-*s | %-*s | %-*s | %-*s \n", + $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef, + $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word, + $max_con_pos_len, $con_pos_aft, 
$max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft ;
+
+    printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n",
+        '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+        '-' x $max_pos_len, '-' x $max_word_spec_len,
+        '-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+    foreach $err (sort {$loc_con_err_counts{$loc_con}{$b} <=>
+                        $loc_con_err_counts{$loc_con}{$a}}
+                  keys %{$loc_con_err_counts{$loc_con}})
+    {
+        printf OUT " %s : %d times\n", $err_desc{$err},
+            $loc_con_err_counts{$loc_con}{$err} ;
+    }
+
+    printf OUT "\n" ;
+}
+
+close GOLD ;
+close SYS ;
+
+close OUT ;
+
diff --git a/bcovington/src/utils/weights.clas b/bcovington/src/utils/weights.clas
new file mode 100644
index 0000000..eee7ac6
--- /dev/null
+++ b/bcovington/src/utils/weights.clas
@@ -0,0 +1,11 @@
+# Relations used to attach function words to content words
+aux 0.1
+case 0.1
+cc 0.1
+clf 0.1
+cop 0.1
+det 0.1
+mark 0.1
+
+# Punctuation
+punct 0
diff --git a/bcovington/utils.py b/bcovington/utils.py
new file mode 100644
index 0000000..40d3c27
--- /dev/null
+++ b/bcovington/utils.py
@@ -0,0 +1,269 @@
+from collections import Counter
+import codecs  # used by dump_lookup_extra_into_conll and transform_to_single_root below
+import re
+
+"""
+This is a module slightly extended from the original utils module of the BIST-Parser:
+https://github.com/elikip/bist-parser/blob/master/barchybrid/src/utils.py
+
+It has been adapted to support non-projective transition-based dependency parsing
+and CoNLL-U dependencies.
+"""
+
+# Column indexes and tag values used by get_rooted and transform_to_single_root below.
+# They are assumed to follow the standard CoNLL-U column layout (they are not
+# defined elsewhere in this module).
+UD_ID_COLUMN = 0
+UD_CTAG_COLUMN = 3
+UD_HEAD_COLUMN = 6
+UD_CTAG_VERB = "VERB"
+DUMMY_ROOT = 0
+
+
+class CovingtonConfiguration(object):
+    """
+    Nivre, J. (2008). Algorithms for deterministic incremental dependency parsing. Computational Linguistics, 34(4), 513-553.
+
+    l1: Word Id of the word at the top of the lambda one list
+    b: Word Id of the word at the top of the buffer
+    sentence: List of ConllEntry
+    A: set of created arcs (tuples (headID, dependentID))
+    """
+
+    def __init__(self, l1, b, sentence, A):
+
+        self.l1 = l1
+        self.b = b
+        self.sentence = sentence
+        self.A = A
+
+    def __str__(self):
+        return str(self.l1)+" "+str(self.b)+" "+str(self.A)
+
+
+class ConllEntry(object):
+    """
+    Contains the information of a line in a CoNLL-X file.
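+
+    For illustration only (the token values below are hypothetical), a CoNLL-X line such as
+
+        1   El   el   d   da0000   _   2   det   _   _
+
+    (columns: id, form, lemma, cpostag, postag, feats, head, deprel, phead, pdeprel)
+    is turned by read_conll below into ConllEntry(1, 'El', 'el', 'd', 'da0000', '_', 2, 'det').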
+ """ + + def __init__(self, id, form, lemma, cpos, pos, feats, + parent_id=None, relation=None): + + self.id = id + self.form = form + self.lemma = normalize(lemma) + self.norm = normalize(form) + self.cpos = cpos + self.pos = pos + self.feats = feats + self.parent_id = parent_id + self.relation = relation + + #By default everything is assigned to a dummy root + self.pred_parent_id = 0 + self.pred_relation = 'root' + + #For debugging + def __str__(self): + return "["+'\,'.join(map(str,[self.id,self.form,self.lemma,self.norm,self.cpos,self.pos,self.feats,self.parent_id,self.relation]))+"]" + + + +def vocab(conll_path): + + wordsCount = Counter() + lemmasCount = Counter() + cposCount = Counter() + posCount = Counter() + featsCount = Counter() + relCount = Counter() + + with open(conll_path, 'r') as conllFP: + for sentence in read_conll(conllFP): + + wordsCount.update([node.norm for node in sentence]) + lemmasCount.update([node.lemma for node in sentence]) + cposCount.update([node.cpos for node in sentence]) + posCount.update([node.pos for node in sentence]) + featsCount.update([node.feats for node in sentence]) + relCount.update([node.relation for node in sentence]) + + return (wordsCount, {w: i for i, w in enumerate(wordsCount.keys())}, + lemmasCount, {l: i for i, l in enumerate(lemmasCount.keys())}, + cposCount.keys(), posCount.keys(), featsCount.keys(), + relCount.keys()) + + +def read_conll(fh): + """ + Reads a ConLL file given a file object fh + """ + + non_proj_sentences = 0 + read = 0 + tokens_read = 0 + root = ConllEntry(0, '*root*', '*root-lemma*', 'ROOT-POS', 'ROOT-CPOS','FEATS-ROOT', 0, 'rroot') + tokens = [root] + for line in fh: + + if line.startswith('#'): continue + tok = line.strip().split('\t') + if not tok or tok == ['']: #If it is empty line + if len(tokens)>1: + yield tokens + read += 1 + tokens = [root] + id = 0 + else: + try: + if "." in tok[0] or "-" in tok[0]: continue + tokens.append(ConllEntry(int(tok[0]), tok[1], tok[2] ,tok[3], + tok[4], tok[5], int(tok[6]) if tok[6] != '_' else -1 , tok[7])) + tokens_read+=1 + + except IndexError: + pass + + #Last sentence + if len(tokens) > 1: + yield tokens + print read, 'sentences read.' + print tokens_read ,'tokens read' + + +def write_conll(fn, conll_gen): + """ + Writes a CoNLL file + """ + with open(fn, 'w') as fh: + for sentence in conll_gen: + for entry in sentence[1:]: + fh.write('\t'.join([str(entry.id), entry.form, entry.lemma, entry.cpos, entry.pos, entry.feats, str(entry.pred_parent_id), entry.pred_relation, '_', '_'])) + fh.write('\n') + fh.write('\n') + + + +numberRegex = re.compile("[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+"); +def normalize(word): + return 'NUM' if numberRegex.match(word) else word.lower() + + + + +""" +Looks for multiword expressions in the CoNLL file and creates a lookup table that +allows to reconstruct then the output +""" +def lookup_conll_extra_data(fh): + + lookup = {} + sentence_id = 0 + lookup[sentence_id] = {} + id_insert_before = 1 + + for line in fh: + + if line.startswith('#'): continue + tok = line.strip().split('\t') + + if not tok or tok == ['']: #If it is empty line + sentence_id+=1 + id_insert_before = 1 + lookup[sentence_id] = {} + else: + if "." 
in tok[0] or "-" in tok[0]: + lookup[sentence_id][id_insert_before] = line + else: + id_insert_before+=1 + + return lookup + +""" +dumps the content of the lookup table extracted by lookup_conll_extra_data +into a output conll_path +""" +def dump_lookup_extra_into_conll(conll_path,lookup): + + sentence_id = 0 + word_id = 1 + + with codecs.open(conll_path) as f_conll: + lines = f_conll.readlines() + + #DUMPING the content of the file + f_conll = codecs.open(conll_path,"w") + + for line in lines: + + tok = line.strip().split('\t') + if tok == ['']: #If it is empty line + sentence_id+=1 + word_id = 1 + else: + if sentence_id in lookup: + if word_id in lookup[sentence_id]: + f_conll.write(lookup[sentence_id][word_id]) + word_id+=1 + f_conll.write(line) + + f_conll.close() + + +def get_rooted(conll_str): + """ + Returns a list of [id,ctag,head] of the nodes rooted to 0 + """ + rooted_elements = [] + + lines = conll_str.split('\n') + for l in lines: + ls = l.split('\t') + try: + identifier,tag,head = int(ls[UD_ID_COLUMN]),ls[UD_CTAG_COLUMN],int(ls[UD_HEAD_COLUMN]) + if head == DUMMY_ROOT: + rooted_elements.append((identifier,tag,head)) + except ValueError: + pass + return rooted_elements + + +def get_new_single_root(lmultiple_rooted): + """ + Returns the ID of the first VERB rooted to 0 or the leftmost rooted + element otherwise + """ + for e in lmultiple_rooted: + if e[2] == DUMMY_ROOT and e[1] == UD_CTAG_VERB: + return e[0] + return lmultiple_rooted[0][0] + +""" +""" +def transform_to_single_root(conll_path): + + with codecs.open(conll_path) as f_conll: + sentences = f_conll.read().split('\n\n') + + with codecs.open(conll_path,"w") as f_conll: + + i=0 + for s in sentences: + if s == "": continue + rooted = get_rooted(s) + if len(rooted) > 1: + frv = get_new_single_root(rooted) + for l in s.split('\n'): + ls = l.strip().split('\t') + + if ls != [''] and not l.startswith("#"): #If it is empty line + if ls[UD_HEAD_COLUMN] != "_" and int(ls[UD_HEAD_COLUMN]) == DUMMY_ROOT and int(ls[UD_ID_COLUMN]) != frv: + ls[UD_HEAD_COLUMN] = str(frv) + + f_conll.write('\t'.join(ls)+"\n") + else: + f_conll.write(s+"\n") + f_conll.write('\n') + i+=1 + + + + + + + + + + + + + From 9a4ccd95be46b42e4c96a369e8ed70ff3a7474cb Mon Sep 17 00:00:00 2001 From: "david.vilares" Date: Wed, 24 May 2017 16:16:19 +0200 Subject: [PATCH 2/4] bist-covington: a non-projective transition-based BIST-parser --- bcovington/covington.py~ | 875 --------------------------------------- 1 file changed, 875 deletions(-) delete mode 100644 bcovington/covington.py~ diff --git a/bcovington/covington.py~ b/bcovington/covington.py~ deleted file mode 100644 index ec982a9..0000000 --- a/bcovington/covington.py~ +++ /dev/null @@ -1,875 +0,0 @@ -from dynet import * -from utils_bcovington import read_conll, write_conll, CovingtonConfiguration -from operator import itemgetter -from itertools import chain -from tarjan import tarjan -import time, random -import numpy as np -import os -import warnings - - -""" -This is a module extended from original the transition-based BIST-Parser barchybrid: - -https://github.com/elikip/bist-parser/blob/master/barchybrid/ -Kiperwasser, E., & Goldberg, Y. (2016). Simple and accurate dependency parsing using bidirectional LSTM feature representations. arXiv preprint arXiv:1603.04351. 
- - -that has been adapted to include to support non-projective transition-based dependency parsing -using an implementation (O(n^2)) of the traditional Covington's (2001) algorithm, according -to the list-based transition-based described in Nivre (2008). - -Covington, M. A. (2001). A fundamental algorithm for dependency parsing. In Proceedings of the 39th annual ACM southeast conference (pp. 95-102). -Nivre, J. (2008). Algorithms for deterministic incremental dependency parsing. Computational Linguistics, 34(4), 513-553. - -We also include the O(n) dynamic oracle described in Gomez-Rodriguez and Fernandez-Gonzalez (2015). -TODO: Current implementation is O(n^2) - -Gomez-Rodriguez, C., & Fernandez-Gonzalez, D. (2015). An efficient dynamic oracle for unrestricted non-projective parsing. Volume 2: Short Papers, 256. - -""" - - - -class CovingtonBILSTM: - - #ACTIVATION FUNCTIONS - TANH = 'tanh' - SIGMOID = 'sigmoid' - RELU = 'relu' - TANH3 = 'tanh3' - - #OPTIMIZERS - SGD="sgd" - MOMENTUM="momentum" - ADAGRAD="adagrad" - ADADELTA="adadelta" - ADAM = "adam" - - #SPECIAL INDEXES - INDEX_WORD_PAD = 1 - INDEX_WORD_INITIAL = 2 - INDEX_POS_PAD = 1 - INDEX_POS_INITIAL = 2 - INIT_WORD_INDEX = 3 - INIT_POS_INDEX = INIT_WORD_INDEX - - INDEX_FEATS_PAD = 1 - INDEX_FEATS_INITIAL= 2 - INIT_FEATS_INDEX = INIT_WORD_INDEX - - #TRANSITIONS - LEFT_ARC = 0 - RIGHT_ARC = 1 - SHIFT = 2 - NO_ARC = 3 - TRANSITIONS = [LEFT_ARC, RIGHT_ARC, SHIFT, NO_ARC] - - #OTHER HYPERPARAMETERS - SIZE_TRANSITIONS = len(TRANSITIONS) - - def __init__(self, words, lemmas, cpos, pos, feats, rels, w2i, l2i, options, path_oov_external_embedding=None, - pretrained=False): - - self.model = Model() - if options.optimizer == self.ADAM: - self.trainer = AdamTrainer(self.model) - elif options.optimizer == self.SGD: - self.trainer = SimpleSGDTrainer(self.model) - elif options.optimizer == self.MOMENTUM: - self.trainer = MomentumSGDTrainer(self.model) - elif options.optimizer == self.ADAGRAD: - self.trainer = AdagradTrainer(self.model) - elif options.optimizer == self.ADADELTA: - self.trainer = AdadeltaTrainer(self.model) - else: - raise NotImplementedError("Selected optimizer is not available") - - random.seed(1) - - self.activations = {self.TANH: tanh, - self.SIGMOID: logistic, - self.RELU: rectify, - self.TANH3: (lambda x: tanh(cwise_multiply(cwise_multiply(x, x), x)))} - - self.activation = self.activations[options.activation] - - self.oracle = options.oracle - - - self.ldims = options.lstm_dims * 2 #*2 because it is a bi-lstm - self.wdims = options.wembedding_dims - self.pdims = options.pembedding_dims - self.rdims = options.rembedding_dims - self.layers = options.lstm_layers - self.wordsCount = words - - self.vocab = {word: ind+self.INIT_WORD_INDEX for word, ind in w2i.iteritems()} - self.lemmas = {lemma: ind+self.INIT_WORD_INDEX for lemma,ind in l2i.iteritems()} - self.cpos = {cpos: ind+self.INIT_POS_INDEX for ind, cpos in enumerate(cpos)} - self.pos = {pos: ind+self.INIT_POS_INDEX for ind, pos in enumerate(pos)} - self.feats = {f: ind+self.INIT_FEATS_INDEX for ind, f in enumerate(feats)} - self.rels = {word: ind for ind, word in enumerate(rels)} - - #List of dependency types - self.irels = rels - - self.headFlag = options.headFlag - self.rlMostFlag = options.rlMostFlag - self.rlFlag = options.rlFlag - self.kb = options.window_b - self.kl1 = options.window_l1 - self.kl2_r = options.window_l2r - self.kl2_l = options.window_l2l - - self.nnvecs = (1 if self.headFlag else 0) + (2 if self.rlFlag or self.rlMostFlag else 0) - - #Reading external 
embedding files, if they exists - - #INFORMATION FOR EXTERNAL WORD EMBEDDINGS - self.external_embedding = None - self.edim = None - self.noextrn = None - self.extrnd = None - self.elookup = None - if options.external_embedding is not None and os.path.exists(options.external_embedding): - self.external_embedding, self.edim,self.noextrn,self.extrnd, self.elookup = self._assign_external_embeddings(options.external_embedding, - self.INDEX_WORD_PAD, self.INDEX_WORD_INITIAL) - else: - warnings.warn("Not using any external file for FORM embeddings") - - #INFORMATION FOR THE EXTERNAL CPOSTAG EMBEDDINGS - self.cpos_external_embedding = None - self.cpos_edim = None - self.cpos_noextrn = None - self.cpos_extrnd = None - self.cpos_elookup = None - if options.cpos_external_embedding is not None and os.path.exists(options.cpos_external_embedding): - self.cpos_external_embedding, self.cpos_edim,self.cpos_noextrn,self.cpos_extrnd, self.cpos_elookup = self._assign_external_embeddings(options.cpos_external_embedding, - self.INDEX_POS_PAD, self.INDEX_POS_INITIAL) - else: - warnings.warn("Not using any external file for CPOSTAG embeddings") - - #INFORMATION FOR THE EXTERNAL POSTAG EMBEDDINGS - self.pos_external_embedding = None - self.pos_edim = None - self.pos_noextrn = None - self.pos_extrnd = None - self.pos_elookup= None - if options.pos_external_embedding is not None and os.path.exists(options.pos_external_embedding): - self.pos_external_embedding, self.pos_edim,self.pos_noextrn,self.pos_extrnd, self.pos_elookup = self._assign_external_embeddings(options.pos_external_embedding, - self.INDEX_POS_PAD, self.INDEX_POS_INITIAL) - else: - warnings.warn("Not using any external file for POSTAG embeddings") - - #INFORMATION FOR THE EXTERNAL FEATS EMBEDDINGS - self.feats_external_embedding = None - self.feats_edim = None - self.feats_noextrn = None - self.feats_extrnd = None - self.feats_elookup= None - - if options.feats_external_embedding is not None and os.path.exists(options.feats_external_embedding): - self.feats_external_embedding, self.feats_edim,self.feats_noextrn,self.feats_extrnd, self.feats_elookup = self._assign_external_embeddings(options.feats_external_embedding, self.INDEX_FEATS_PAD, self.INDEX_FEATS_INITIAL) - else: - warnings.warn("Not using any external file for FEATS embeddings") - - - #INFORMATION FOR THE EXTERNAL FEATS EMBEDDINGS -# self.lemmas_external_embedding = None -# self.lemmas_edim = None -# self.lemmas_noextrn = None -# self.lemmas_extrnd = None -# self.lemmas_elookup= None - -# if options.lemmas_external_embedding is not None and os.path.exists(options.lemmas_external_embedding): -# self.lemmas_external_embedding, self.lemmas_edim,self.lemmas_noextrn,self.lemmas_extrnd, self.lemmas_elookup = self._assign_external_embeddings(options.lemmas_external_embedding, self.INDEX_FEATS_PAD, self.INDEX_FEATS_INITIAL) -# else: -# warnings.warn("Not using any external file for LEMMAS embeddings") - - - - - self.oov_external_embedding = None - self.oov_edim = None - self.oov_noextrn = None - self.oov_extrnd = None - self.oov_elookup = None - - - if path_oov_external_embedding is not None and os.path.exists(options.feats_external_embedding): - self.oov_external_embedding, self.oov_edim,self.oov_noextrn,self.oov_extrnd, self.oov_elookup = self._assign_external_embeddings(path_oov_external_embedding, - self.INDEX_WORD_PAD, self.INDEX_WORD_INITIAL) - - if self.oov_external_embedding is not None and self.oov_edim != self.edim: - raise ValueError("The dimensions of the embeddings for OOV words is not 
equal to the dimension of the rest of external word embeddings (self.oov_edim != self.edim)") - - #Obtaining the dimension of the input - dims = (self.wdims + self.pdims + (self.edim if self.external_embedding is not None else 0) + - (self.cpos_edim if self.cpos_external_embedding is not None else 0) + - (self.pos_edim if self.pos_external_embedding is not None else 0)+ - (self.feats_edim if self.feats_external_embedding is not None else 0) -# + -# (self.lemmas_edim if self.lemmas_external_embedding is not None else 0) - ) - - - #Initialization of the architecture - - self.blstmFlag = options.blstmFlag - self.bibiFlag = options.bibiFlag - - if self.bibiFlag: - self.surfaceBuilders = [VanillaLSTMBuilder(1, dims, self.ldims * 0.5, self.model), - VanillaLSTMBuilder(1, dims, self.ldims * 0.5, self.model)] - self.bsurfaceBuilders = [VanillaLSTMBuilder(1, self.ldims, self.ldims * 0.5, self.model), - VanillaLSTMBuilder(1, self.ldims, self.ldims * 0.5, self.model)] - elif self.blstmFlag: - if self.layers > 0: - self.surfaceBuilders = [VanillaLSTMBuilder(self.layers, dims, self.ldims * 0.5, self.model), LSTMBuilder(self.layers, dims, self.ldims * 0.5, self.model)] - else: - self.surfaceBuilders = [SimpleRNNBuilder(1, dims, self.ldims * 0.5, self.model), LSTMBuilder(1, dims, self.ldims * 0.5, self.model)] - - - self.hidden_units = options.hidden_units - self.hidden2_units = options.hidden2_units - self.vocab['*PAD*'] = self.INDEX_WORD_PAD - self.cpos['*PAD*'] = self.INDEX_POS_PAD - self.feats['*PAD*'] = self.INDEX_FEATS_PAD - - self.vocab['*INITIAL*'] = self.INDEX_WORD_INITIAL - self.cpos['*INITIAL*'] = self.INDEX_POS_INITIAL - self.feats['*INITIAL*'] = self.INDEX_FEATS_INITIAL - - self.wlookup = self.model.add_lookup_parameters((len(words) + self.INIT_WORD_INDEX, self.wdims)) - self.plookup = self.model.add_lookup_parameters((len(cpos) + self.INIT_POS_INDEX, self.pdims)) - self.rlookup = self.model.add_lookup_parameters((len(rels), self.rdims)) - - - self.word2lstm = self.model.add_parameters((self.ldims, dims)) - - self.word2lstmbias = self.model.add_parameters((self.ldims)) - self.lstm2lstm = self.model.add_parameters((self.ldims, self.ldims * self.nnvecs + self.rdims)) - self.lstm2lstmbias = self.model.add_parameters((self.ldims)) - - self.hidLayer = self.model.add_parameters((self.hidden_units, self.ldims * self.nnvecs * (self.kl1 + self.kl2_l + self.kl2_r + self.kb))) - self.hidBias = self.model.add_parameters((self.hidden_units)) - - self.hid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units)) - self.hid2Bias = self.model.add_parameters((self.hidden2_units)) - - self.outLayer = self.model.add_parameters((self.SIZE_TRANSITIONS, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units)) - self.outBias = self.model.add_parameters((self.SIZE_TRANSITIONS)) - - self.rhidLayer = self.model.add_parameters((self.hidden_units, self.ldims * self.nnvecs * (self.kl1 + self.kl2_l + self.kl2_r + self.kb))) - self.rhidBias = self.model.add_parameters((self.hidden_units)) - - self.rhid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units)) - self.rhid2Bias = self.model.add_parameters((self.hidden2_units)) - - self.routLayer = self.model.add_parameters((2 * (len(self.irels) + 0) + 1, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units)) - self.routBias = self.model.add_parameters((2 * (len(self.irels) + 0) + 1)) - - self.pretrained = pretrained - - - def _assign_external_embeddings(self,option_external_embedding, - index_pad,index_initial): 
- """ - Reads an external embedding file - Returns: - external_embedding: A dictionary of key:embedding - edim: Dimension of the embedding - noextrn: ?? - extrnd: Index for each key - elookup: Parameter lookup - """ - - - if option_external_embedding is not None: - - external_embedding_fp = open(option_external_embedding,'r') - external_embedding_fp.readline() - - external_embedding = {line.split(' ')[0] : [float(f) for f in line.strip().split(' ')[1:]] - for line in external_embedding_fp} - - - external_embedding_fp.close() - - edim = len(external_embedding.values()[0]) - noextrn = [0.0 for _ in xrange(edim)] - extrnd = {element: i + self.INIT_POS_INDEX - for i, element in enumerate(external_embedding)} - elookup = self.model.add_lookup_parameters((len(external_embedding) + self.INIT_WORD_INDEX, edim)) - - for element, i in extrnd.iteritems(): - elookup.init_row(i, external_embedding[element]) - extrnd['*PAD*'] = index_pad - extrnd['*INITIAL*'] = index_initial - - return external_embedding, edim, noextrn, extrnd, elookup - - return None,None,None,None,None - - - - def __evaluate(self, c, train): - """ - @param c: A CovingtonConfiguration instance - @param train: True if used in the training phase, False otherwise - Returns the scores for all possible transitions (training) - or the top ones (testing) for a given configuration c - """ - - #Gets the embeddings for the terms to be used in the prediction - top_l1 = [c.sentence[c.l1-i].lstms if c.l1 - i > 0 else [self.empty] for i in xrange(self.kl1)] - top_l2l = [c.sentence[c.l1+1+i].lstms if c.l1+1+i < c.b else [self.empty] for i in xrange(self.kl2_l)] - top_l2r = [c.sentence[c.b-i].lstms if c.b-i > c.l1 else [self.empty] for i in xrange(self.kl2_r)] - topBuffer = [c.sentence[c.b+i-1].lstms if c.b+i-1 <= c.sentence[-1].id else [self.empty] for i in xrange(self.kb)] - - input = concatenate(list(chain(*(top_l1 + top_l2l + top_l2r + topBuffer)))) - - if self.hidden2_units > 0: - routput = (self.routLayer.expr() * self.activation(self.rhid2Bias.expr() + self.rhid2Layer.expr() * self.activation(self.rhidLayer.expr() * input + self.rhidBias.expr())) + self.routBias.expr()) - else: - routput = (self.routLayer.expr() * self.activation(self.rhidLayer.expr() * input + self.rhidBias.expr()) + self.routBias.expr()) - - if self.hidden2_units > 0: - output = (self.outLayer.expr() * self.activation(self.hid2Bias.expr() + self.hid2Layer.expr() * self.activation(self.hidLayer.expr() * input + self.hidBias.expr())) + self.outBias.expr()) - else: - output = (self.outLayer.expr() * self.activation(self.hidLayer.expr() * input + self.hidBias.expr()) + self.outBias.expr()) - - scrs, uscrs = routput.value(), output.value() - - if train: - left_arc_info = [(rel,self.LEFT_ARC, scrs[1+j*2] + uscrs[self.LEFT_ARC], routput[1+j*2]+ output[self.LEFT_ARC]) - for j, rel in enumerate(self.irels) if c.l1 > 0 and c.l1 < c.b and c.b <= c.sentence[-1].id] - - right_arc_info = [(rel,self.RIGHT_ARC, scrs[2+j*2] + uscrs[self.RIGHT_ARC], routput[2+j*2]+ output[self.RIGHT_ARC]) - for j, rel in enumerate(self.irels) if c.l1 >= 0 and c.l1 < c.b and c.b <= c.sentence[-1].id] - - shift_info = [ (None, self.SHIFT, scrs[0] + uscrs[self.SHIFT], routput[0] + output[self.SHIFT]) ] if c.b <= c.sentence[-1].id else [] - - no_arc_info = [(None, self.NO_ARC,scrs[3] + uscrs[self.NO_ARC], routput[3] + output[self.NO_ARC] )] if c.l1> 0 and c.b <= c.sentence[-1].id else [] - - ret = [left_arc_info,right_arc_info, shift_info, no_arc_info] - - else: - #It is done different from the 'train' phase, due 
to the dynamic oracle. - #In the test phase we already pick the most likely transition/dependency instead of returning them all - #and then selecting one according to the prediction of the dynamic oracle - sLEFT,rLEFT = max(zip(scrs[1::2],self.irels)) - sRIGHT,rRIGHT = max(zip(scrs[2::2],self.irels)) - sLEFT += uscrs[self.LEFT_ARC] - sRIGHT += uscrs[self.RIGHT_ARC] - ret = [ [(rLEFT, self.LEFT_ARC, sLEFT) ] if (c.l1 > 0 and c.l1 < c.b and c.b <= c.sentence[-1].id and self._is_valid_left_arc(c)) else [], - [(rRIGHT, self.RIGHT_ARC, sRIGHT) ] if (c.l1 >= 0 and c.l1 < c.b and c.b <= c.sentence[-1].id and self._is_valid_right_arc(c)) else [], - [(None, self.SHIFT, scrs[0] + uscrs[self.SHIFT]) ] if (c.b <= c.sentence[-1].id) else [], - [(None, self.NO_ARC,scrs[3] + uscrs[self.NO_ARC]) ] if (c.l1 > 0 and c.b <= c.sentence[-1].id) else [] - ] - return ret - - - def Save(self, filename): - self.model.save(filename) - - - def Load(self, filename): - self.model.load(filename) - - def Init(self): - evec = self.elookup[1] if self.external_embedding is not None else None - cpos_evec = self.cpos_elookup[1] if self.cpos_external_embedding is not None else None - pos_evec = self.pos_elookup[1] if self.pos_external_embedding is not None else None - feats_evec = self.feats_elookup[1] if self.feats_external_embedding is not None else None - # lemmas_evec = self.lemmas_elookup[1] if self.lemmas_external_embedding is not None else None - paddingWordVec = self.wlookup[1] - paddingPosVec = self.plookup[1] if self.pdims > 0 else None - # paddingVec = tanh(self.word2lstm.expr() * concatenate(filter(None, [paddingWordVec, paddingPosVec, evec, cpos_evec, pos_evec, feats_evec, lemmas_evec])) + self.word2lstmbias.expr()) - paddingVec = tanh(self.word2lstm.expr() * concatenate(filter(None, [paddingWordVec, paddingPosVec, evec, cpos_evec, pos_evec, feats_evec])) + self.word2lstmbias.expr()) - self.empty = paddingVec if self.nnvecs == 1 else concatenate([paddingVec for _ in xrange(self.nnvecs)]) - - - def getWordEmbeddings(self, sentence, train): - """ - Gets the embeddings (also external) for every term in a sentence - Returns a vector of all embeddings concatenated - """ - - for root in sentence: - c = float(self.wordsCount.get(root.norm, 0)) - dropFlag = not train or (random.random() < (c/(0.25+c))) - sys.stdout.flush() - root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if dropFlag else 0] - root.cposvec = self.plookup[int(self.cpos.get(root.cpos,0))] if self.pdims > 0 else None - - #For word embeddings - if self.external_embedding is not None: - if root.form in self.external_embedding: - root.evec = self.elookup[self.extrnd[root.form]] - elif root.norm in self.external_embedding: - root.evec = self.elookup[self.extrnd[root.norm]] - else: - if (self.oov_external_embedding is not None and root.form.replace(" ","_") in self.oov_external_embedding): - root.evec = self.oov_elookup[self.oov_extrnd[root.form.replace(" ","_")]] - else: - root.evec = self.elookup[0] - else: - root.evec = None - - #For cpostag embeddings - if self.cpos_external_embedding is not None: - if root.cpos in self.cpos_external_embedding: - root.cposevec = self.cpos_elookup[self.cpos_extrnd[root.cpos]] - else: - root.cposevec = self.cpos_elookup[0] - else: - root.cposevec = None - - #For postag embeddings - if self.pos_external_embedding is not None: - if root.pos in self.pos_external_embedding: - root.posevec = self.pos_elookup[self.pos_extrnd[root.pos]] - else: - root.posevec = self.pos_elookup[0] - else: - root.posevec = None -# - #For 
feats embeddings - if self.feats_external_embedding is not None: - if root.feats in self.feats_external_embedding: - root.featsevec = self.feats_elookup[self.feats_extrnd[root.feats]] - else: - root.featsevec = self.feats_elookup[0] - else: - root.featsevec = None - - - #For lemmas embeddings -# if self.lemmas_external_embedding is not None: -# if root.lemma in self.lemmas_external_embedding: -# root.lemmasevec = self.lemmas_elookup[self.lemmas_extrnd[root.lemma]] -# else: -# root.lemmasevec = self.lemmas_elookup[0] -# else: -# root.lemmasevec = None - - - # root.ivec = concatenate(filter(None, [root.wordvec, root.cposvec, root.evec, root.cposevec, root.posevec, root.featsevec, root.lemmasevec])) - root.ivec = concatenate(filter(None, [root.wordvec, root.cposvec, root.evec, root.cposevec, root.posevec, root.featsevec])) - - if self.blstmFlag: - forward = self.surfaceBuilders[0].initial_state() - backward = self.surfaceBuilders[1].initial_state() - - for froot, rroot in zip(sentence, reversed(sentence)): - forward = forward.add_input( froot.ivec ) - backward = backward.add_input( rroot.ivec ) - froot.fvec = forward.output() - rroot.bvec = backward.output() - for root in sentence: - root.vec = concatenate( [root.fvec, root.bvec] ) - - if self.bibiFlag: - bforward = self.bsurfaceBuilders[0].initial_state() - bbackward = self.bsurfaceBuilders[1].initial_state() - - for froot, rroot in zip(sentence, reversed(sentence)): - bforward = bforward.add_input( froot.vec ) - bbackward = bbackward.add_input( rroot.vec ) - froot.bfvec = bforward.output() - rroot.bbvec = bbackward.output() - for root in sentence: - root.vec = concatenate( [root.bfvec, root.bbvec] ) - - else: - for root in sentence: - root.ivec = (self.word2lstm.expr() * root.ivec) + self.word2lstmbias.expr() - root.vec = tanh( root.ivec ) - - - def Predict(self, conll_path): - """ - Makes non-projective depending parsing prediction given a ConLL-X file - """ - - - with open(conll_path, 'r') as conllFP: - for iSentence, sentence in enumerate(read_conll(conllFP)): - self.Init() - - l1 = sentence[0].id - b = sentence[1].id - arcs = set([]) - - self.getWordEmbeddings(sentence, False) - - for root in sentence: - root.lstms = [root.vec for _ in xrange(self.nnvecs)] - - hoffset = 1 if self.headFlag else 0 - - c = CovingtonConfiguration(l1,b,sentence,arcs) - while not self._is_final_state(b,sentence): - - transition_scores = self.__evaluate(c, False) - - - best = max(chain(*transition_scores), key = itemgetter(2) ) - - if best[1] == self.LEFT_ARC: - - sentence[l1].pred_parent_id = sentence[b].id - sentence[l1].pred_relation = best[0] - best_op = self.LEFT_ARC - if self.rlMostFlag: - sentence[b].lstms[best_op+hoffset] = sentence[l1].lstms[best_op+hoffset] - if self.rlFlag: - sentence[b].lstms[best_op+hoffset] = sentence[l1].vec - - arcs.add((b,l1)) - l1 = l1 -1 - - elif best[1] == self.RIGHT_ARC: - - sentence[b].pred_parent_id = sentence[l1].id - sentence[b].pred_relation = best[0] - - best_op = self.RIGHT_ARC - if self.rlMostFlag: - sentence[l1].lstms[best_op+hoffset] = sentence[b].lstms[best_op+hoffset] - if self.rlFlag: - sentence[l1].lstms[best_op+hoffset] = sentence[b].vec - - arcs.add((l1,b)) - l1 = l1-1 - - elif best[1] == self.SHIFT: - l1 = b - b = b + 1 - - - elif best[1] == self.NO_ARC: - l1 = l1 - 1 - - c = CovingtonConfiguration(l1,b,sentence,arcs) - renew_cg() - yield sentence - - - def Train(self, conll_path): - """ - Trains a O(n^2) Covington's parser with a O(n^2) dynamic oracle - """ - mloss = 0.0 - errors = 0 - batch = 0 - eloss = 
0.0 - eerrors = 0 - lerrors = 0 - etotal = 0 - ltotal = 0 - ninf = -float('inf') - - hoffset = 1 if self.headFlag else 0 - - start = time.time() - - with open(conll_path, 'r') as conllFP: - shuffledData = list(read_conll(conllFP)) - - random.shuffle(shuffledData) - - - errs = [] - eeloss = 0.0 - - self.Init() - - for iSentence, sentence in enumerate(shuffledData): - if iSentence % 100 == 0 and iSentence != 0: - print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Labeled Errors:', (float(lerrors) / etotal) , 'Time', time.time()-start - start = time.time() - eerrors = 0 - eloss = 0.0 - etotal = 0 - lerrors = 0 - ltotal = 0 - - self.getWordEmbeddings(sentence, True) - #We obtain the gold arcs to then compute the dynamic oracle for covington - gold_arcs = set([]) - for word in sentence: - - #TODO: Weird error if not, adds and arc (0,0) - if word.id != word.parent_id: - gold_arcs.add((word.parent_id,word.id)) - - - l1 = sentence[0].id - b = sentence[1].id - arcs = set([]) - c = CovingtonConfiguration(l1,b,sentence,arcs) - loss_c = self._loss(c,gold_arcs, iSentence) - - for word in sentence: - word.lstms = [word.vec for _ in xrange(self.nnvecs)] - - hoffset = 1 if self.headFlag else 0 - - while not self._is_final_state(b,sentence): - - costs = [None,None,None,None] - transition_scores = self.__evaluate(c, True) - - #We determine if the transitions are valid for a given configuration c - for t in self.TRANSITIONS: - - l1_aux = l1 - b_aux = b - arcs_aux = set(arcs) - valid_transition = False - - if t == self.LEFT_ARC and self._is_valid_left_arc(c): - arcs_aux.add((b_aux,l1_aux)) - l1_aux = l1_aux -1 - valid_transition = True - - if t == self.RIGHT_ARC and l1 >=0 and self._is_valid_right_arc(c): - arcs_aux.add((l1_aux,b_aux)) - l1_aux = l1_aux-1 - valid_transition = True - - if t == self.NO_ARC and l1 >0: - l1_aux = l1_aux-1 - valid_transition = True - - if t == self.SHIFT: - l1_aux = b_aux - b_aux = b_aux + 1 - valid_transition = True - - if valid_transition: - - new_c = CovingtonConfiguration(l1_aux,b_aux,sentence,arcs_aux) - loss_new_c = self._loss(new_c,gold_arcs,iSentence) - - cost = loss_new_c - loss_c - costs[t] = float(cost) - - #Valid transitions are those with cost 0 - #If it is a LEFT/RIGHT arc, also the relation must match with the one in gold standard - valid_transitions = [s for s in chain(*transition_scores) if costs[s[1]] == 0 and (s[1] in [self.SHIFT,self.NO_ARC] - or ((s[1] == self.LEFT_ARC and s[0] == sentence[l1].relation) - or (s[1] == self.RIGHT_ARC and s[0] == sentence[b].relation)))] - - best_valid = max(valid_transitions, key=itemgetter(2)) - - wrong_transitions = [s for s in chain(*transition_scores) if costs[s[1]] is not None and ( (costs[s[1]] != 0) or (s[1] in [self.LEFT_ARC,self.RIGHT_ARC] - and ((s[1] == self.LEFT_ARC and s[0] != sentence[l1].relation) - or (s[1] == self.RIGHT_ARC and s[0] != sentence[b].relation))) ) ] - - #Aggressive exploration as done by Kiperwasser and Golberg (2016) - if wrong_transitions != []: - best_wrong = max(wrong_transitions, key=itemgetter(2)) - - best = best_valid if ( (not self.oracle) or (best_valid[2] - best_wrong[2] > 1.0) - or (best_valid[2] > best_wrong[2] and random.random() > 0.1) ) else best_wrong - else: - best = best_valid - - - #Moving a new configuration based on the "best" choice - if best[1] == self.LEFT_ARC: - - sentence[l1].pred_parent_id = sentence[b].id - sentence[l1].pred_relation = best[0] - - best_op = self.LEFT_ARC - if self.rlMostFlag: - 
sentence[b].lstms[best_op+hoffset] = sentence[l1].lstms[best_op+hoffset] - if self.rlFlag: - sentence[b].lstms[best_op+hoffset] = sentence[l1].vec - - child = sentence[l1] - arcs.add((b,l1)) - l1 = l1 -1 - - elif best[1] == self.RIGHT_ARC: - - - sentence[b].pred_parent_id = sentence[l1].id - sentence[b].pred_relation = best[0] - - best_op = self.RIGHT_ARC - if self.rlMostFlag: - sentence[l1].lstms[best_op+hoffset] = sentence[b].lstms[best_op+hoffset] - if self.rlFlag: - sentence[l1].lstms[best_op+hoffset] = sentence[b].vec - - arcs.add((l1,b)) - child = sentence[b] - l1 = l1-1 - - - elif best[1] == self.SHIFT: - l1 = b - child = sentence[b] - b = b + 1 - - - elif best[1] == self.NO_ARC: - l1 = l1 - 1 - child = sentence[l1] - - - if best_valid[2] < best_wrong[2] + 1.0: - loss = best_wrong[3] - best_valid[3] - mloss += 1.0 + best_wrong[2] - best_valid[2] - eloss += 1.0 + best_wrong[2] - best_valid[2] - errs.append(loss) - - - if best[1] not in [self.SHIFT, self.NO_ARC] and (child.pred_parent_id != child.parent_id or child.pred_relation != child.relation): - lerrors += 1 - if child.pred_parent_id != child.parent_id: - errors += 1 - eerrors += 1 - - etotal += 1 - c = CovingtonConfiguration(l1,b,sentence,arcs) - loss_c = self._loss(c,gold_arcs, iSentence) - - - if len(errs) > 50: - eerrs = esum(errs) - scalar_loss = eerrs.scalar_value() - eerrs.backward() - self.trainer.update() - errs = [] - lerrs = [] - - renew_cg() - self.Init() - - if len(errs) > 0: - eerrs = (esum(errs)) # * (1.0/(float(len(errs)))) - eerrs.scalar_value() - eerrs.backward() - self.trainer.update() - - errs = [] - lerrs = [] - - renew_cg() - - self.trainer.update_epoch() - print "Loss: ", mloss/iSentence - - - def _is_final_state(self,b,sentence): - return b >= len(sentence) - - - def _is_valid_left_arc(self,c): - - aux = set(c.A) - aux.add((c.b,c.l1)) - l1_has_head = self._y_has_head(c.A, c.b, c.l1) - return (c.l1 > 0 and not l1_has_head - and self._count_cycles(aux) == 0) - - - def _is_valid_right_arc(self,c): - - b_has_head = self._y_has_head(c.A, c.l1, c.b) - aux = set(c.A) - aux.add((c.l1,c.b)) - return ((not b_has_head) and self._count_cycles(aux) == 0) - - - """ - Gomez-Rodriguez & Fernandez-Gonzalez: - An Efficiente Dynamic Oracle for Unrestricted Non-Projective Parsing (ACL,2015) - Algorithm 1 - """ - def _loss(self, c, gold_arcs, iSentence): - - U = set([]) #set of unreachable nodes - non_built_arcs = gold_arcs.difference(c.A) - - - i = c.l1 - j = c.b - - for x,y in non_built_arcs: - left = min(x,y) #O(n) - right = max(x,y) #O(n) - if (j > right or (j==right and i < left) or self._y_has_head(c.A,x,y) - or self._weakly_connected(c.A, x, y,c, gold_arcs)): - U.add((x,y)) - - I = gold_arcs.difference(U) - - return len(U) + self._count_cycles( c.A.union(I)) - - - #TODO: This can be done more efficient - #O(n^2) - def _weakly_connected(self,A,x,y,c, gold_arcs): - - weakly_connected = False - end_path = False - parent = x - - while parent != 0 and not weakly_connected and not end_path and A != set([]): - if (parent,y) in A: - weakly_connected = True - break - else: - - for (a,b) in A: - if b == parent: - parent = a - break - else: - end_path = True - - - return weakly_connected - - - """ - Tarjan (1972) implementation at https://github.com/bwesterb/py-tarjan/ - O(n) - """ - def _count_cycles(self, A): - - d = {} - for a,b in A: - if a not in d: - d[a] = [b] - else: - d[a].append(b) - - return sum([1 for e in tarjan(d) if len(e) > 1]) - - - """ - Determines if node y has already a head - """ - #O(n) - def 
_y_has_head(self,A,x,y): - - for z,y_prime in A: - if y_prime == y and z != x: - return True - return False - - #O(n) -# def violates_single_root(self, A): -# print A,[1 for (h,d) in A if h==0], len([1 for (h,d) in A if h==0]) != 0 -# return len([1 for (h,d) in A if h==0]) != 0 - From 9edba125f7f72d97ec74a894f01e15e51e52d48b Mon Sep 17 00:00:00 2001 From: "david.vilares" Date: Wed, 24 May 2017 16:17:55 +0200 Subject: [PATCH 3/4] bist-covington: a non-projective transition-based BIST-parser --- bcovington/parser.py~ | 0 bcovington/src/parser.pyc | Bin 6247 -> 0 bytes bcovington/src/utils.pyc | Bin 8418 -> 0 bytes bcovington/utils.py | 269 -------------------------------------- 4 files changed, 269 deletions(-) delete mode 100644 bcovington/parser.py~ delete mode 100644 bcovington/src/parser.pyc delete mode 100644 bcovington/src/utils.pyc delete mode 100644 bcovington/utils.py diff --git a/bcovington/parser.py~ b/bcovington/parser.py~ deleted file mode 100644 index e69de29..0000000 diff --git a/bcovington/src/parser.pyc b/bcovington/src/parser.pyc deleted file mode 100644 index e309ab49c6418551993256ceed56deaca7be950c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6247 zcma)AU3(M96`qwXgTW+D2sQ))n*;|#A_b2il*|oBn{_^}aX#1HI|fi(a+wIkS=mWZK#wotgKXGv|8-#{O~W z!0CH)A2&qtGmY zJt4e_KD|$P`}*{L;qC9!lfs+q(+7lipidtZ-a#R!L^vk4OCta0q;$l6A-qFE9ulp` zM2Hh{5cESWv_VNp+;_&D0)`^M2N|9MQ0fDR7=QvuJkId6E%yY&Pcm>=gind^2(!+B zX}qsvmYI$+(=m(bX`AUdGo4_jXDp_ZHq$9)a+&E_i|IL=>3L>4%}jFzljZ6Q44+|m zeiS~-@HvKG9ED$ExWaIC6s|FRp5cW8rqC_g23`>1%R){w>nj|tR{;-lanXWbWB7F& ze#3&_9EC4g@LLRDwx!>;;CDvPzGA^w8NOyqU$@{ThL>&lh6TUN@J$*~E z$*^z3-?HF<;SC#Zh%gkPWXm@RmWOS4%YvJuaKvznrMDS=%9f5TxIGFd7Mza4nFZe) zg>%a>MR2I?vpbe_cT~D(!8;GZM|gV82stBKvzW5` zgghhUypU&wgpwJtiELMi-}w*&`TdW=FJVUb9kOLiw9d0AST;DGff1Jbbl*Ib{ukZwn{*9k`2b6vcA6N3NPFp5= zEy$J3SM`4E`F=Zy65m(*u&7j`BoSPpR#V2An!(mupqs8vUEP!}!|rAj zOHC?5xfyg~t)@U1>U_PMNqwNNikIp#mg{;qmTtQFkRJDwJm`JVNRv43eCE#Md)MtK z>GmR=TB&5Y45NlVt0wq}EP`?}mQSnRrO-|Fg?t4_g3@%5A=Tp`QG-a0mbo(e z4h`UH7B#kGSrnw)(Rreu1PV&#Nf7(8y&=OeO19KtY%dkt?r^&(J{I*GS*r9b7>V%r zP;wmDwY79z2U~6*wv|zeD;1-RbWsC5LKS8Izi1*K`M6*qq(N9)54zIjJriI0SK&q@?ILVCO{#|5G-+=TIj%|WR!Y%u-w&g< zibCuaq0NRO#5r6VPO`k<%pc?&E}2uA5|LTjXqpl+WJHx)fXyfjF|^25L;9U0(n?Vu zR4Qg`QB!OS+Tam3grdPvscdiXtm{QdnD+dQh$dnmTHuB%pm-q~Bg}N$_F~RW(FYgv z0T4e3#D%svXG3Bc~Re&ZIb2x*Z3*=r*2MwDiq7Dv`g``jrZ8 zQ>IxL^Kjn3Bo3n|+)bpeC@DiO6(SO}A6!bv( zUAjh4D%EOLts)d3ryaN5!IXtLuEdQebGK3%nC+&6<$xp?-T7SzHiCrZ+9;(ci{)7} zYIBLfi!#=w)#p*qfafOKEDGZ2bK2&7fUg)JZ*bl) zGjkT=uI1R_X7PFD^Zi`h1T-_3c4VW&+#H{Ui@xll7-(i@NQ$fSMh$qZYP4U?Vz;v8 za$mU>K(1Fkl?OvrimrmSKSF&do|m2D_ytJvsijE+G=W5ZN!0cIT+?M zS98!C*ElwS~VlZ6Dg}%+rH}Ycj~U zkPwAeBkj*qn$7wDr#0J#N5t)BovqP}it zK6&h?X_w`=oa@x{yu>>%G_Lm+BR(jW?RH?cVr-F0eh!ddbd}TL<{~C2mzX|8q#5oT zB3fyb)M-F*AmJPa>So4J*lA~)?!IgNmbiLzW&Q4LKDU$LZMj{)0Ju(rlRJOY=i>Cy zA^aYR({#I&6`SW8u%H0s7qMZnrU0&)!mK4YB!1l+`zT1NY^$XPmfmI z6t&@tsYc_;>|&^}&6n_q>csLgzb5_)jnmH5=o1n=;D=gxZF5D-$Fwdc;c zckbMu^Yfi^?iBvfU;Tdc%uQG2zXI;B<4XUC!o$CzLZw=US~iryx~W1_Et~QTbU`f_ zl!vmYyrNp}QC^Q)E-A01mV2eWth`?3S5(tbURiyh)IFtcE?1RT>9kh&S`R7j5G%`l z%Ig!NUwQpP99F)m)(4b#Slz>7jwo+H+6R?)M2I2f4GM8oc|$@RQ{GV_hLv|rh!HM- zu{MmY1^>dYW`N3iQOw3$QRXKL0ce#KQM`N6ej_M>2*{Fb1jTH8JBU`ZIGT;4X0W=I zG_oL$_E^Wk-oTYwD17V!J4Ojs88SQgWC~GK_F|2rrq}RJ)xvKfxSjap*4vX-?Mr9Q zymD&Nnh96qB*@lUsnv`V%k#0kRuBbg7PwXrxryKMqpT6)N!yP+KXUI_?M9Mfg_G87 z+-h%SvOM&-jJ7R#F&($gzF2$l)VOu-g|idq&Yhi<*`)|yIBzY*iD%7wR@}6*HQ(B1 zwUG&(#qCaW*l4YI4J(d(E5xw!_R9bJ^2%1T=_m3|iVa}h{&}o0%{V$38-^F7EV&~c 
zX3krwFAuQYM&@HNiJMxrY^xpmxUcyQZ+>pP!+~sm?$jxH(d_|ud6~Fo$>|!3yN0@} zlmo&P0@Z>Ls1}8Qh!hlrl3b`!3o;H6zL2qHB`c%sELxgoYMey$CH2FmO`+O+&t|(# z_2I<2NWTnfYAtU0Q(ohC;7#5R!bak!Q>#HXwE_-IP=0UZZ7NONsjVys)5-Q7 znci`r1DWGIg|SkuN0}9ae=e8Ei))ZAPl3p=S*6wu8V%I&cwRSIKfbS_APtK(Zg)Yp zX6gNNv@}ch?MybsiEJ9q0d9}OISwXsoM$;J*P;x~VeZs%qDISioUAI9R_twstm^l? z*mWG6{p>?1y6Y0^QF(83F~p6O+b@+%W~Eow(bZTN59CLdy@4y`F8uvCAV^6x0D=N9 zH5&8~4X88eYbZp0F=*BZqErkoh+zLMv1OV_12zZ43nEan@xtZH6F0185c=AZ-3rFw z#r4kON>dd7srb-q=suKy22r4z!zS`7OhH>ikGhM0bSSC2_y^Ie?&2RrS>44yhziw; z1Q1XJ){hg*xryA!LHMR#y4Y4@Jf^csiXYmA! zQ5Ii7(N(KQ)+nx)CYo|CRiF(I;7?mKO0>8go94ms#v z5!t%PYh8r3E|OZWP)zGpDG#aDin^DhKtUbX3aAbLJu$pn(jOOv*8TS|vLkxWev*u8-P@{3$)H0hKv#A53J`3<~R2qC_!6a*#R%cK>f`w^!|aCW7y!#m?3L zq;?@;gIcf(TQ8$&K}bHkAJ(Nux_qd+6du~8Bwc=?yA&VVrB}NA zUU%tvXqU2d`KRtudT5u5bUAKtxtrJ3(h{KLt2eMWjg>%L?(-12$gShTrVxF80N&ym zHu$+`*p1YIXE}kD=@!f{M6(R6qIzrPPh9G00To4E+`F5E;-)W4z*V(IotjAwt5n;;{? z$F}a}?z#nHY}B=_#t3J{wIby69G4McM!Vi>dk6t-L78KyKL|C1XK@IAwAJzvoBLw= z8~&ZNMxM!-G8|k&wkA~uxT(bc8G{CKJb9glORuA!60G+s98LUB_Zd8jt}qypN#1-% zVnA6#6np-`aZ>!>?bvOsyot$D3ab+Yit{jsjHBjBqw_amju~Ud3GGa{E25%{EBzTn z3|N6evy?n#SCB}17}%AECLIl`cqk-%cm(u92+@~*n>VkjFw%yplHXAFh&IpzDd0v?B|m5NU4j-G zUdHM;T#@}TGuK<@9CtourpTcTHM&t!>7SV}cI(Ms)wW&=hGC=@!@ubAWeNVQ)!??z zh>($>6|b!OZe}&tq&Kh&FVlifGYC7K(mx70A^Lt=(g^$N)oT;)TwP45AyS!TjjI0U z#hGi16KoUBOE`+4!^xBwQGx3MLXwo$Nhv1+Mxc^A2ZYjqL7U{$^4O95JD|0sbf2B9 zDz#|!lPRq+i#L44z-(Bk(Ex4s6uV8cMv|r5NRMSn$QgpRrrKeUNt6--MY!oko9VGM zA%2m!=aDY&T*N*S2>=DIn`;@PzbKBJb`q~UdkYc+<{AVc^qL&j?ITa? z7&6LecPSJ~f_t6CmHrY11lSEQeoQ3T`5qnIT>U!=fUvX8Tmc^Vsk!=w008_f>=fP= z0SrPP5+ya8!a50onvoA~6GPD$k&Y=%&_Hn>o0U)dlT$o+AUul5GxQ z)7w8Zx_uxcIKh5nD0F^GzBVIqKib8t`jJfrYbU_*p^wR(gHs1lM_IG>ksn|RF>O^G zf({HCH*lpV(fZz*iC62--_^HUlb@@yGG2e)rS5fu&S zouh&R`Yl|)BI%wi3C?%LPi%kH-x0PVlXJC|hViza=p#2G0)6knvBN%!A-{_&&jn;h zG)`U=MEYA4z`{Lm2U-Vl(@<@y`6t}!T>azR$F0LTN{XCZP@5-n_XB?16Ba;s!4A|~`VJPS?yXPM36MEKeJ^2;wMXziWb~lX|wE4Y^MWV8V zXN!V|I=vR{wqNEJ1&-}kK-#aeIFCXkV^6dC6&4rRMyA;0ir`lm#~WL1kzNP(m`)Sl z0uSd+IFrhp6(8BUZ@%LA`o>`TQIMJPV{4RS!q`{?6=V_ zmg|Ob>TAb4jyC%mi|c$!?4|iqOGr2KFrB>D!#xf)8N=w_R;%sgx}voNPCeGs{~FJ` zx`HTps4xU=8OEVX6&|tAID#m+3T<&|C)AZaZ1^cEtAY)H02;m=n9>o7DbLlvL4<&K z_)LyXpc;3sOKOgk9nyqkAQX7RQN%Z-iqcjvp_WJj!0x6r3>- z|1E8f?S+g#R16u%5HgPCMql6#soZ%Om~ABpkQ#Cd&$Gez0&m>S8mr?x7_WDZ>zREZ zj6&3g1N1Dm&dA%^32H6TLMAfUGbl1fh}Y+w`8j9y>gDTK7UVDu4YSu~Et&rz9eO+u>)ee5SvkS%3NH={GEh zLd=~AW4R={$!N#GAH;naH%{4JGZ-cmBrKR>Tsl{O&d{kZ96trO zsy43@q@jYacFX}ghk3)KLBk&~g*vtfya{u|jdOhyM&8<|v8mMQi|knD*KbU3 zP7rp8M~68-zOdo}w@^~YaWgi5qL!k;yaRUHY0ux#E{X9;5278!Cs#X%cH)?5whI!G zRjW3=lBkzNDBYmEn!757FcP2F7~&K7Ok!ivc|bEws=$84wWVcWi5b8ubY4^k{6+fwsUtp%0+!iFVn;Zn!0pSe4DkobxVB4BGN5G1{s!oGZ7Dt9h#ww$gXDa>pic+nN LRZdifDt-S3NY$g~ diff --git a/bcovington/utils.py b/bcovington/utils.py deleted file mode 100644 index 40d3c27..0000000 --- a/bcovington/utils.py +++ /dev/null @@ -1,269 +0,0 @@ -from collections import Counter -import re - -""" -This is a module slightly extended from original utils in BIST-Parser: -https://github.com/elikip/bist-parser/blob/master/barchybrid/src/utils.py - -that has been adapted to include to support non-projective transition-based dependency parsing -and CoNLLU dependencies. -""" - -class CovingtonConfiguration(object): - """ - Nivre, J. (2008). Algorithms for deterministic incremental dependency parsing. Computational Linguistics, 34(4), 513-553. 
- - l1: Word Id of the word at the top of the lambda one list - b: Word Id of the word at the top of the buffer - sentence: List of ConllEntry - A: set of created arcs (tuples (headID,dependentID)) - """ - - def __init__(self,l1,b,sentence, A): - - self.l1 = l1 - self.b = b - self.sentence = sentence - self.A = A - - def __str__(self): - return str(self.l1)+" "+str(self.b)+" "+str(self.A) - - -class ConllEntry(object): - """ - Contains the information of a line in a CoNLL-X file. - """ - - def __init__(self, id, form, lemma, cpos, pos, feats, - parent_id=None, relation=None): - - self.id = id - self.form = form - self.lemma = normalize(lemma) - self.norm = normalize(form) - self.cpos = cpos - self.pos = pos - self.feats = feats - self.parent_id = parent_id - self.relation = relation - - #By default everything is assigned to a dummy root - self.pred_parent_id = 0 - self.pred_relation = 'root' - - #For debugging - def __str__(self): - return "["+'\,'.join(map(str,[self.id,self.form,self.lemma,self.norm,self.cpos,self.pos,self.feats,self.parent_id,self.relation]))+"]" - - - -def vocab(conll_path): - - wordsCount = Counter() - lemmasCount = Counter() - cposCount = Counter() - posCount = Counter() - featsCount = Counter() - relCount = Counter() - - with open(conll_path, 'r') as conllFP: - for sentence in read_conll(conllFP): - - wordsCount.update([node.norm for node in sentence]) - lemmasCount.update([node.lemma for node in sentence]) - cposCount.update([node.cpos for node in sentence]) - posCount.update([node.pos for node in sentence]) - featsCount.update([node.feats for node in sentence]) - relCount.update([node.relation for node in sentence]) - - return (wordsCount, {w: i for i, w in enumerate(wordsCount.keys())}, - lemmasCount, {l: i for i, l in enumerate(lemmasCount.keys())}, - cposCount.keys(), posCount.keys(), featsCount.keys(), - relCount.keys()) - - -def read_conll(fh): - """ - Reads a ConLL file given a file object fh - """ - - non_proj_sentences = 0 - read = 0 - tokens_read = 0 - root = ConllEntry(0, '*root*', '*root-lemma*', 'ROOT-POS', 'ROOT-CPOS','FEATS-ROOT', 0, 'rroot') - tokens = [root] - for line in fh: - - if line.startswith('#'): continue - tok = line.strip().split('\t') - if not tok or tok == ['']: #If it is empty line - if len(tokens)>1: - yield tokens - read += 1 - tokens = [root] - id = 0 - else: - try: - if "." in tok[0] or "-" in tok[0]: continue - tokens.append(ConllEntry(int(tok[0]), tok[1], tok[2] ,tok[3], - tok[4], tok[5], int(tok[6]) if tok[6] != '_' else -1 , tok[7])) - tokens_read+=1 - - except IndexError: - pass - - #Last sentence - if len(tokens) > 1: - yield tokens - print read, 'sentences read.' 
- print tokens_read ,'tokens read' - - -def write_conll(fn, conll_gen): - """ - Writes a CoNLL file - """ - with open(fn, 'w') as fh: - for sentence in conll_gen: - for entry in sentence[1:]: - fh.write('\t'.join([str(entry.id), entry.form, entry.lemma, entry.cpos, entry.pos, entry.feats, str(entry.pred_parent_id), entry.pred_relation, '_', '_'])) - fh.write('\n') - fh.write('\n') - - - -numberRegex = re.compile("[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+"); -def normalize(word): - return 'NUM' if numberRegex.match(word) else word.lower() - - - - -""" -Looks for multiword expressions in the CoNLL file and creates a lookup table that -allows to reconstruct then the output -""" -def lookup_conll_extra_data(fh): - - lookup = {} - sentence_id = 0 - lookup[sentence_id] = {} - id_insert_before = 1 - - for line in fh: - - if line.startswith('#'): continue - tok = line.strip().split('\t') - - if not tok or tok == ['']: #If it is empty line - sentence_id+=1 - id_insert_before = 1 - lookup[sentence_id] = {} - else: - if "." in tok[0] or "-" in tok[0]: - lookup[sentence_id][id_insert_before] = line - else: - id_insert_before+=1 - - return lookup - -""" -dumps the content of the lookup table extracted by lookup_conll_extra_data -into a output conll_path -""" -def dump_lookup_extra_into_conll(conll_path,lookup): - - sentence_id = 0 - word_id = 1 - - with codecs.open(conll_path) as f_conll: - lines = f_conll.readlines() - - #DUMPING the content of the file - f_conll = codecs.open(conll_path,"w") - - for line in lines: - - tok = line.strip().split('\t') - if tok == ['']: #If it is empty line - sentence_id+=1 - word_id = 1 - else: - if sentence_id in lookup: - if word_id in lookup[sentence_id]: - f_conll.write(lookup[sentence_id][word_id]) - word_id+=1 - f_conll.write(line) - - f_conll.close() - - -def get_rooted(conll_str): - """ - Returns a list of [id,ctag,head] of the nodes rooted to 0 - """ - rooted_elements = [] - - lines = conll_str.split('\n') - for l in lines: - ls = l.split('\t') - try: - identifier,tag,head = int(ls[UD_ID_COLUMN]),ls[UD_CTAG_COLUMN],int(ls[UD_HEAD_COLUMN]) - if head == DUMMY_ROOT: - rooted_elements.append((identifier,tag,head)) - except ValueError: - pass - return rooted_elements - - -def get_new_single_root(lmultiple_rooted): - """ - Returns the ID of the first VERB rooted to 0 or the leftmost rooted - element otherwise - """ - for e in lmultiple_rooted: - if e[2] == DUMMY_ROOT and e[1] == UD_CTAG_VERB: - return e[0] - return lmultiple_rooted[0][0] - -""" -""" -def transform_to_single_root(conll_path): - - with codecs.open(conll_path) as f_conll: - sentences = f_conll.read().split('\n\n') - - with codecs.open(conll_path,"w") as f_conll: - - i=0 - for s in sentences: - if s == "": continue - rooted = get_rooted(s) - if len(rooted) > 1: - frv = get_new_single_root(rooted) - for l in s.split('\n'): - ls = l.strip().split('\t') - - if ls != [''] and not l.startswith("#"): #If it is empty line - if ls[UD_HEAD_COLUMN] != "_" and int(ls[UD_HEAD_COLUMN]) == DUMMY_ROOT and int(ls[UD_ID_COLUMN]) != frv: - ls[UD_HEAD_COLUMN] = str(frv) - - f_conll.write('\t'.join(ls)+"\n") - else: - f_conll.write(s+"\n") - f_conll.write('\n') - i+=1 - - - - - - - - - - - - - From 70baf9eea6bb26e02ace485e61582901a6171fa8 Mon Sep 17 00:00:00 2001 From: "david.vilares" Date: Wed, 24 May 2017 16:19:09 +0200 Subject: [PATCH 4/4] bist-covington: a non-projective transition-based BIST-parser --- bcovington/src/covington.pyc | Bin 24143 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 
bcovington/src/covington.pyc

diff --git a/bcovington/src/covington.pyc b/bcovington/src/covington.pyc
deleted file mode 100644
index 7c52ffdf5bb5b35491481800b0996b6ade7e7316..0000000000000000000000000000000000000000
GIT binary patch
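The Predict() and Train() loops above drive the four list-based Covington transitions (LEFT_ARC, RIGHT_ARC, SHIFT, NO_ARC) described in Nivre (2008). The following is a minimal standalone sketch of that transition system for reference; the function name apply_transition and the toy derivation are illustrative only and do not appear in the parser code.

# Minimal sketch of the list-based Covington transition system (Nivre, 2008)
# used by the parser above.  A configuration is (l1, b, arcs): l1 is the word
# id in focus on the left, b is the front of the buffer, and arcs is a set of
# (head, dependent) pairs.  Word id 0 is the dummy root.

LEFT_ARC, RIGHT_ARC, SHIFT, NO_ARC = 0, 1, 2, 3

def apply_transition(t, l1, b, arcs):
    """Return the successor configuration after applying transition t."""
    arcs = set(arcs)
    if t == LEFT_ARC:      # add arc b -> l1 and move the focus one word left
        arcs.add((b, l1))
        l1 -= 1
    elif t == RIGHT_ARC:   # add arc l1 -> b and move the focus one word left
        arcs.add((l1, b))
        l1 -= 1
    elif t == NO_ARC:      # move the focus left without adding an arc
        l1 -= 1
    elif t == SHIFT:       # advance the buffer; b becomes the new left focus
        l1, b = b, b + 1
    return l1, b, arcs

# Toy derivation for a 3-word sentence whose gold tree is 0->2, 2->1, 2->3.
l1, b, arcs = 0, 1, set()
for t in (SHIFT, LEFT_ARC, RIGHT_ARC, SHIFT, RIGHT_ARC, SHIFT):
    l1, b, arcs = apply_transition(t, l1, b, arcs)
print(sorted(arcs))   # [(0, 2), (2, 1), (2, 3)]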
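The _loss() function above follows Algorithm 1 of Gomez-Rodriguez and Fernandez-Gonzalez (2015): the loss of a configuration is the number of pending gold arcs that have become unreachable, plus the number of cycles that the remaining gold arcs would form together with the arcs already built. Below is a standalone sketch of that bound which recomputes everything from scratch, so it stays O(n^2) like the implementation in this patch rather than the incremental O(n) oracle of the paper; the helper names (oracle_loss, count_cycles, _is_descendant) and the exact reachability test are this sketch's own reconstruction, not the patch's code.

# Sketch of the dynamic-oracle loss bound for non-projective Covington parsing.
# Arcs are (head, dependent) pairs; the configuration is (l1, b, built).

def _head_of(arcs):
    return {d: h for (h, d) in arcs}           # each dependent has at most one head

def _is_descendant(arcs, node, ancestor):
    """True if `ancestor` is reached by following head links upward from `node`."""
    head, seen = _head_of(arcs), set()
    while node in head and node not in seen:
        seen.add(node)
        node = head[node]
        if node == ancestor:
            return True
    return False

def count_cycles(arcs):
    """Count cycles in a set of arcs in which every dependent has one head."""
    head, done, cycles = _head_of(arcs), set(), 0
    for start in head:
        if start in done:
            continue
        path, node = [], start
        while node in head and node not in done and node not in path:
            path.append(node)
            node = head[node]
        if node in path:                        # the walk re-entered the current path
            cycles += 1
        done.update(path)
    return cycles

def oracle_loss(l1, b, built, gold):
    """Lower bound on the gold arcs lost from configuration (l1, b, built)."""
    unreachable = set()
    for (x, y) in gold - built:
        left, right = min(x, y), max(x, y)
        passed = b > right or (b == right and l1 < left)   # focus pair already passed
        other_head = any(d == y and h != x for (h, d) in built)
        would_cycle = _is_descendant(built, x, y)          # adding x -> y would close a cycle
        if passed or other_head or would_cycle:
            unreachable.add((x, y))
    reachable = (gold - built) - unreachable
    return len(unreachable) + count_cycles(built | reachable)

# Example: gold tree 0->2, 2->1, 2->3.  From the configuration reached by SHIFT
# followed by a (wrong) NO_ARC, the arc 2->1 can no longer be built, so loss = 1.
gold = {(0, 2), (2, 1), (2, 3)}
print(oracle_loss(0, 2, set(), gold))   # 1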
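The lookup_conll_extra_data() and dump_lookup_extra_into_conll() helpers above keep the parser input well-formed by setting aside CoNLL-U lines whose ID column is a multiword range (e.g. 3-4) or an empty node (e.g. 5.1) and re-inserting them when the parsed file is written out. A small sketch of the splitting side of that bookkeeping, assuming at most one such extra line per insertion point, is given below; split_extra_lines is an illustrative name, not a function from the patch.

# Sketch: separate multiword-range / empty-node lines from parsable token lines,
# remembering where each set-aside line should be re-inserted on output.

def split_extra_lines(conllu_lines):
    """Return (parsable_lines, extra) where extra maps (sent_id, word_id) -> line."""
    extra, parsable = {}, []
    sent_id, next_word = 0, 1
    for line in conllu_lines:
        if line.startswith('#'):
            parsable.append(line)
            continue
        if not line.strip():                    # sentence boundary
            parsable.append(line)
            sent_id, next_word = sent_id + 1, 1
            continue
        token_id = line.split('\t')[0]
        if '-' in token_id or '.' in token_id:  # range or empty node: set it aside
            extra[(sent_id, next_word)] = line
        else:
            parsable.append(line)
            next_word += 1
    return parsable, extra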