From d9191265e0d63b16679aab785d3b329fbc20c09f Mon Sep 17 00:00:00 2001
From: martian2049
Date: Thu, 13 Apr 2017 01:42:55 +0800
Subject: [PATCH 1/3] updated to tensorflow 1.0

---
 .DS_Store                                  | Bin 0 -> 6148 bytes
 code/.DS_Store                             | Bin 0 -> 6148 bytes
 code/bilstm_crf/.DS_Store                  | Bin 0 -> 6148 bytes
 code/bilstm_crf/BILSTM_CRF.py              |  72 +++++++++---------
 .../__pycache__/BILSTM_CRF.cpython-36.pyc  | Bin 0 -> 10303 bytes
 .../__pycache__/helper.cpython-36.pyc      | Bin 0 -> 7078 bytes
 code/bilstm_crf/char2id                    |   7 ++
 code/bilstm_crf/helper.py                  |  14 ++--
 code/bilstm_crf/label2id                   |   5 ++
 code/bilstm_crf/test.py                    |  10 +--
 code/bilstm_crf/train.py                   |  30 ++++----
 11 files changed, 77 insertions(+), 61 deletions(-)
 create mode 100644 .DS_Store
 create mode 100644 code/.DS_Store
 create mode 100644 code/bilstm_crf/.DS_Store
 create mode 100644 code/bilstm_crf/__pycache__/BILSTM_CRF.cpython-36.pyc
 create mode 100644 code/bilstm_crf/__pycache__/helper.cpython-36.pyc
 create mode 100644 code/bilstm_crf/char2id
 create mode 100644 code/bilstm_crf/label2id

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..0647067d3a4b3d9ddba218a9021dd0b97e5ff93a
GIT binary patch
literal 6148
[base85 payload omitted]

literal 0
HcmV?d00001

diff --git a/code/.DS_Store b/code/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..061b4e3a1d6a3ee93e5b6e364296c89958d0cb5e
GIT binary patch
literal 6148
[base85 payload omitted]

literal 0
HcmV?d00001

diff --git a/code/bilstm_crf/.DS_Store b/code/bilstm_crf/.DS_Store
new file mode 100644
GIT binary patch
literal 6148
[base85 payload omitted; blob hash garbled in source]

literal 0
HcmV?d00001

diff --git a/code/bilstm_crf/BILSTM_CRF.py b/code/bilstm_crf/BILSTM_CRF.py
index 55d18a2..d1e45c2 100755
--- a/code/bilstm_crf/BILSTM_CRF.py
+++ b/code/bilstm_crf/BILSTM_CRF.py
@@ -1,8 +1,10 @@
 import math
 import helper
 import numpy as np
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 import tensorflow as tf
-from tensorflow.models.rnn import rnn, rnn_cell
+# from tensorflow.contrib import rnn
 
 class BILSTM_CRF(object):
 
@@ -19,13 +21,13 @@ def __init__(self, num_chars, num_classes, num_steps=200, num_epochs=100, embedd
         self.num_steps = num_steps
         self.num_chars = num_chars
         self.num_classes = num_classes
-        
+
         # placeholder of x, y and weight
         self.inputs = tf.placeholder(tf.int32, [None, self.num_steps])
         self.targets = tf.placeholder(tf.int32, [None, self.num_steps])
         self.targets_weight = tf.placeholder(tf.float32, [None, self.num_steps])
         self.targets_transition = tf.placeholder(tf.int32, [None])
-        
+
         # char embedding
         if embedding_matrix != None:
             self.embedding = tf.Variable(embedding_matrix, trainable=False, name="emb", dtype=tf.float32)
@@ -34,35 +36,37 @@ def __init__(self, num_chars, num_classes, num_steps=200, num_epochs=100, embedd
         self.inputs_emb = tf.nn.embedding_lookup(self.embedding, self.inputs)
         self.inputs_emb = tf.transpose(self.inputs_emb, [1, 0, 2])
         self.inputs_emb = tf.reshape(self.inputs_emb, [-1, self.emb_dim])
-        self.inputs_emb = tf.split(0, self.num_steps, self.inputs_emb)
+        self.inputs_emb = tf.split(self.inputs_emb,axis=0,num_or_size_splits=self.num_steps)
 
-        # lstm cell
-        lstm_cell_fw = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_dim)
-        lstm_cell_bw = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_dim)
+        # # lstm cell
+        lstm_cell_fw = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim)
+        lstm_cell_bw = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim)
 
         # dropout
         if is_training:
-            lstm_cell_fw = tf.nn.rnn_cell.DropoutWrapper(lstm_cell_fw, output_keep_prob=(1 - self.dropout_rate))
-            lstm_cell_bw = tf.nn.rnn_cell.DropoutWrapper(lstm_cell_bw, output_keep_prob=(1 - self.dropout_rate))
+            lstm_cell_fw = tf.contrib.rnn.DropoutWrapper(lstm_cell_fw, output_keep_prob=(1 - self.dropout_rate))
+            lstm_cell_bw = tf.contrib.rnn.DropoutWrapper(lstm_cell_bw, output_keep_prob=(1 - self.dropout_rate))
 
-        lstm_cell_fw = tf.nn.rnn_cell.MultiRNNCell([lstm_cell_fw] * self.num_layers)
-        lstm_cell_bw = tf.nn.rnn_cell.MultiRNNCell([lstm_cell_bw] * self.num_layers)
+        lstm_cell_fw = tf.contrib.rnn.MultiRNNCell([lstm_cell_fw] * self.num_layers)
+        lstm_cell_bw = tf.contrib.rnn.MultiRNNCell([lstm_cell_bw] * self.num_layers)
 
-        # get the length of each sample
+        # # get the length of each sample
         self.length = tf.reduce_sum(tf.sign(self.inputs), reduction_indices=1)
         self.length = tf.cast(self.length, tf.int32)
 
-        # forward and backward
-        self.outputs, _, _ = rnn.bidirectional_rnn(
-            lstm_cell_fw, 
+        # # forward and backward
+        self.outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
+            lstm_cell_fw,
             lstm_cell_bw,
-            self.inputs_emb, 
+            self.inputs_emb,
             dtype=tf.float32,
             sequence_length=self.length
         )
 
         # softmax
-        self.outputs = tf.reshape(tf.concat(1, self.outputs), [-1, self.hidden_dim * 2])
+        # print(self.outputs)
+        # print(self.hidden_dim)
+        self.outputs = tf.reshape(tf.concat(self.outputs,1),[-1,self.hidden_dim*2])
         self.softmax_w = tf.get_variable("softmax_w", [self.hidden_dim * 2, self.num_classes])
         self.softmax_b = tf.get_variable("softmax_b", [self.num_classes])
         self.logits = tf.matmul(self.outputs, self.softmax_w) + self.softmax_b
@@ -75,14 +79,14 @@ def __init__(self, num_chars, num_classes, num_steps=200, num_epochs=100, embedd
 
         dummy_val = -1000
         class_pad = tf.Variable(dummy_val * np.ones((self.batch_size, self.num_steps, 1)), dtype=tf.float32)
-        self.observations = tf.concat(2, [self.tags_scores, class_pad])
+        self.observations = tf.concat([self.tags_scores, class_pad],2)
 
         begin_vec = tf.Variable(np.array([[dummy_val] * self.num_classes + [0] for _ in range(self.batch_size)]), trainable=False, dtype=tf.float32)
         end_vec = tf.Variable(np.array([[0] + [dummy_val] * self.num_classes for _ in range(self.batch_size)]), trainable=False, dtype=tf.float32)
         begin_vec = tf.reshape(begin_vec, [self.batch_size, 1, self.num_classes + 1])
         end_vec = tf.reshape(end_vec, [self.batch_size, 1, self.num_classes + 1])
-        self.observations = tf.concat(1, [begin_vec, self.observations, end_vec])
+        self.observations = tf.concat([begin_vec, self.observations, end_vec],1)
 
         self.mask = tf.cast(tf.reshape(tf.sign(self.targets),[self.batch_size * self.num_steps]), tf.float32)
 
@@ -103,8 +107,8 @@ def __init__(self, num_chars, num_classes, num_steps=200, num_epochs=100, embedd
         self.loss = - (self.target_path_score - self.total_path_score)
 
         # summary
-        self.train_summary = tf.scalar_summary("loss", self.loss)
-        self.val_summary = tf.scalar_summary("loss", self.loss)
+        self.train_summary = tf.summary.scalar("loss", self.loss)
+        self.val_summary = tf.summary.scalar("loss", self.loss)
 
         self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
 
@@ -115,7 +119,7 @@ def logsumexp(self, x, axis=None):
 
     def forward(self, observations, transitions, length, is_viterbi=True, return_best_seq=True):
         length = tf.reshape(length, [self.batch_size])
-        transitions = tf.reshape(tf.concat(0, [transitions] * self.batch_size), [self.batch_size, 6, 6])
+        transitions = tf.reshape(tf.concat([transitions] * self.batch_size, 0), [self.batch_size, 6, 6])
         observations = tf.reshape(observations, [self.batch_size, self.num_steps + 2, 6, 1])
         observations = tf.transpose(observations, [1, 0, 2, 3])
         previous = observations[0, :, :, :]
@@ -133,15 +137,15 @@ def forward(self, observations, transitions, length, is_viterbi=True, return_bes
             alphas.append(alpha_t)
             previous = alpha_t
 
-        alphas = tf.reshape(tf.concat(0, alphas), [self.num_steps + 2, self.batch_size, 6, 1])
+        alphas = tf.reshape(tf.concat(alphas,0), [self.num_steps + 2, self.batch_size, 6, 1])
         alphas = tf.transpose(alphas, [1, 0, 2, 3])
         alphas = tf.reshape(alphas, [self.batch_size * (self.num_steps + 2), 6, 1])
 
         last_alphas = tf.gather(alphas, tf.range(0, self.batch_size) * (self.num_steps + 2) + length)
         last_alphas = tf.reshape(last_alphas, [self.batch_size, 6, 1])
 
-        max_scores = tf.reshape(tf.concat(0, max_scores), (self.num_steps + 1, self.batch_size, 6))
-        max_scores_pre = tf.reshape(tf.concat(0, max_scores_pre), (self.num_steps + 1, self.batch_size, 6))
+        max_scores = tf.reshape(tf.concat(max_scores,0), (self.num_steps + 1, self.batch_size, 6))
+        max_scores_pre = tf.reshape(tf.concat(max_scores_pre,0), (self.num_steps + 1, self.batch_size, 6))
         max_scores = tf.transpose(max_scores, [1, 0, 2])
         max_scores_pre = tf.transpose(max_scores_pre, [1, 0, 2])
 
@@ -153,9 +157,9 @@ def train(self, sess, save_file, X_train, y_train, X_val, y_val):
         char2id, id2char = helper.loadMap("char2id")
         label2id, id2label = helper.loadMap("label2id")
 
-        merged = tf.merge_all_summaries()
-        summary_writer_train = tf.train.SummaryWriter('loss_log/train_loss', sess.graph)
-        summary_writer_val = tf.train.SummaryWriter('loss_log/val_loss', sess.graph)
+        merged = tf.summary.merge_all()
+        summary_writer_train = tf.summary.FileWriter('loss_log/train_loss', sess.graph)
+        summary_writer_val = tf.summary.FileWriter('loss_log/val_loss', sess.graph)
 
         num_iterations = int(math.ceil(1.0 * len(X_train) / self.batch_size))
 
@@ -166,7 +170,7 @@ def train(self, sess, save_file, X_train, y_train, X_val, y_val):
             np.random.shuffle(sh_index)
             X_train = X_train[sh_index]
             y_train = y_train[sh_index]
-            print "current epoch: %d" % (epoch)
+            print( "current epoch: %d" % (epoch))
             for iteration in range(num_iterations):
                 # train
                 X_train_batch, y_train_batch = helper.nextBatch(X_train, y_train, start_index=iteration * self.batch_size, batch_size=self.batch_size)
@@ -194,7 +198,7 @@ def train(self, sess, save_file, X_train, y_train, X_val, y_val):
                     cnt += 1
                     precision_train, recall_train, f1_train = self.evaluate(X_train_batch, y_train_batch, predicts_train, id2char, id2label)
                     summary_writer_train.add_summary(train_summary, cnt)
-                    print "iteration: %5d, train loss: %5d, train precision: %.5f, train recall: %.5f, train f1: %.5f" % (iteration, loss_train, precision_train, recall_train, f1_train)
+                    print( "iteration: %5d, train loss: %5d, train precision: %.5f, train recall: %.5f, train f1: %.5f" % (iteration, loss_train, precision_train, recall_train, f1_train) )
 
                 # validation
                 if iteration % 100 == 0:
@@ -220,21 +224,21 @@ def train(self, sess, save_file, X_train, y_train, X_val, y_val):
                     predicts_val = self.viterbi(max_scores, max_scores_pre, length, predict_size=self.batch_size)
                     precision_val, recall_val, f1_val = self.evaluate(X_val_batch, y_val_batch, predicts_val, id2char, id2label)
                     summary_writer_val.add_summary(val_summary, cnt)
-                    print "iteration: %5d, valid loss: %5d, valid precision: %.5f, valid recall: %.5f, valid f1: %.5f" % (iteration, loss_val, precision_val, recall_val, f1_val)
+                    print( "iteration: %5d, valid loss: %5d, valid precision: %.5f, valid recall: %.5f, valid f1: %.5f" % (iteration, loss_val, precision_val, recall_val, f1_val))
 
                     if f1_val > self.max_f1:
                         self.max_f1 = f1_val
                         save_path = saver.save(sess, save_file)
-                        print "saved the best model with f1: %.5f" % (self.max_f1)
+                        print( "saved the best model with f1: %.5f" % (self.max_f1))
 
     def test(self, sess, X_test, X_test_str, output_path):
         char2id, id2char = helper.loadMap("char2id")
         label2id, id2label = helper.loadMap("label2id")
         num_iterations = int(math.ceil(1.0 * len(X_test) / self.batch_size))
-        print "number of iteration: " + str(num_iterations)
+        print( "number of iteration: " + str(num_iterations))
         with open(output_path, "wb") as outfile:
             for i in range(num_iterations):
-                print "iteration: " + str(i + 1)
+                print( "iteration: " + str(i + 1))
                 results = []
                 X_test_batch = X_test[i * self.batch_size : (i + 1) * self.batch_size]
                 X_test_str_batch = X_test_str[i * self.batch_size : (i + 1) * self.batch_size]

diff --git a/code/bilstm_crf/__pycache__/BILSTM_CRF.cpython-36.pyc b/code/bilstm_crf/__pycache__/BILSTM_CRF.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..943a181fda88efc29ea480352ef582efb90cafd8
GIT binary patch
literal 10303
[base85 payload omitted]

literal 0
HcmV?d00001

diff --git a/code/bilstm_crf/__pycache__/helper.cpython-36.pyc b/code/bilstm_crf/__pycache__/helper.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fff7c1f36177503b36c4cb5fc3fe8e07156eaded
GIT binary patch
literal 7078
[base85 payload omitted]

literal 0
HcmV?d00001

diff --git a/code/bilstm_crf/char2id b/code/bilstm_crf/char2id
new file mode 100644
@@ -0,0 +1,7 @@
[seven tab-separated char-to-id mapping lines; the characters are garbled in the source and are not reconstructed here, only the fragments "0" and "+ 6" survive]

diff --git a/code/bilstm_crf/helper.py b/code/bilstm_crf/helper.py
index 0a8391d..a3c2288 100755
--- a/code/bilstm_crf/helper.py
+++ b/code/bilstm_crf/helper.py
@@ -103,12 +103,12 @@ def extractEntity(sentence, labels):
 
 def loadMap(token2id_filepath):
     if not os.path.isfile(token2id_filepath):
-        print "file not exist, building map"
+        print( "file not exist, building map")
         buildMap()
 
     token2id = {}
     id2token = {}
-    with open(token2id_filepath) as infile:
+    with open(token2id_filepath,'rb') as infile:
         for row in infile:
             row = row.rstrip().decode("utf-8")
             token = row.split('\t')[0]
@@ -118,13 +118,13 @@ def loadMap(token2id_filepath):
     return token2id, id2token
 
 def saveMap(id2char, id2label):
-    with open("char2id", "wb") as outfile:
+    with open("char2id", "w") as outfile:
         for idx in id2char:
             outfile.write(id2char[idx] + "\t" + str(idx) + "\r\n")
-    with open("label2id", "wb") as outfile:
+    with open("label2id", "w") as outfile:
         for idx in id2label:
             outfile.write(id2label[idx] + "\t" + str(idx) + "\r\n")
-    print "saved map between token and id"
+    print( "saved map between token and id")
 
 def buildMap(train_path="train.in"):
     df_train = pd.read_csv(train_path, delimiter='\t', quoting=csv.QUOTE_NONE, skip_blank_lines=False, header=None, names=["char", "label"])
@@ -174,7 +174,7 @@ def getTrain(train_path, val_path, train_val_ratio=0.99, use_custom_val=False, s
         X_val = X[int(num_samples * train_val_ratio):]
         y_val = y[int(num_samples * train_val_ratio):]
 
-    print "train size: %d, validation size: %d" %(len(X_train), len(y_val))
+    print( "train size: %d, validation size: %d" %(len(X_train), len(y_val)))
 
     return X_train, y_train, X_val, y_val
 
@@ -202,7 +202,7 @@ def mapFunc(x, char2id):
     df_test["char"] = df_test.char.map(lambda x : -1 if str(x) == str(np.nan) else x)
     X_test, _ = prepare(df_test["char_id"], df_test["char_id"], seq_max_len)
     X_test_str, _ = prepare(df_test["char"], df_test["char_id"], seq_max_len, is_padding=False)
-    print "test size: %d" %(len(X_test))
+    print( "test size: %d" %(len(X_test)))
     return X_test, X_test_str
 
 def getTransition(y_train_batch):

diff --git a/code/bilstm_crf/label2id b/code/bilstm_crf/label2id
new file mode 100644
index 0000000..4e5d484
--- /dev/null
+++ b/code/bilstm_crf/label2id
@@ -0,0 +1,5 @@
+O	1
+B	2
+M	3
+E	4
+	0

diff --git a/code/bilstm_crf/test.py b/code/bilstm_crf/test.py
index 92e27ae..cb31526 100755
--- a/code/bilstm_crf/test.py
+++ b/code/bilstm_crf/test.py
@@ -25,7 +25,7 @@
 
 start_time = time.time()
 
-print "preparing test data"
+print( "preparing test data")
 X_test, X_test_str = helper.getTest(test_path=test_path, seq_max_len=num_steps)
 char2id, id2char = helper.loadMap("char2id")
 label2id, id2label = helper.loadMap("label2id")
@@ -36,7 +36,7 @@
 else:
     embedding_matrix = None
 
-print "building model"
+print( "building model")
 config = tf.ConfigProto(allow_soft_placement=True)
 with tf.Session(config=config) as sess:
     with tf.device(gpu_config):
@@ -44,12 +44,12 @@
         with tf.variable_scope("model", reuse=None, initializer=initializer):
             model = BILSTM_CRF(num_chars=num_chars, num_classes=num_classes, num_steps=num_steps, embedding_matrix=embedding_matrix, is_training=False)
 
-        print "loading model parameter"
+        print( "loading model parameter")
         saver = tf.train.Saver()
         saver.restore(sess, model_path)
 
-        print "testing"
+        print( "testing")
         model.test(sess, X_test, X_test_str, output_path)
 
         end_time = time.time()
-        print "time used %f(hour)" % ((end_time - start_time) / 3600)
\ No newline at end of file
+        print( "time used %f(hour)" % ((end_time - start_time) / 3600))
\ No newline at end of file

diff --git a/code/bilstm_crf/train.py b/code/bilstm_crf/train.py
index 030ca7c..aee4a5a 100755
--- a/code/bilstm_crf/train.py
+++ b/code/bilstm_crf/train.py
@@ -3,6 +3,8 @@
 import argparse
 import numpy as np
 import pandas as pd
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 import tensorflow as tf
 from BILSTM_CRF import BILSTM_CRF
 
@@ -23,34 +25,32 @@
 val_path = args.val_path
 num_epochs = args.epoch
 emb_path = args.char_emb
-gpu_config = "/gpu:"+str(args.gpu)
+# gpu_config = "/gpu:"+str(args.gpu)
 num_steps = 200 # it must consist with the test
 
 start_time = time.time()
-print "preparing train and validation data"
 X_train, y_train, X_val, y_val = helper.getTrain(train_path=train_path, val_path=val_path, seq_max_len=num_steps)
 char2id, id2char = helper.loadMap("char2id")
 label2id, id2label = helper.loadMap("label2id")
num_chars = len(id2char.keys())
 num_classes = len(id2label.keys())
 if emb_path != None:
-    embedding_matrix = helper.getEmbedding(emb_path)
+    embedding_matrix = helper.getEmbedding(emb_path)
 else:
-    embedding_matrix = None
+    embedding_matrix = None
 
-print "building model"
 config = tf.ConfigProto(allow_soft_placement=True)
 with tf.Session(config=config) as sess:
-    with tf.device(gpu_config):
-        initializer = tf.random_uniform_initializer(-0.1, 0.1)
-        with tf.variable_scope("model", reuse=None, initializer=initializer):
-            model = BILSTM_CRF(num_chars=num_chars, num_classes=num_classes, num_steps=num_steps, num_epochs=num_epochs, embedding_matrix=embedding_matrix, is_training=True)
+# # with tf.device(gpu_config):
+    initializer = tf.random_uniform_initializer(-0.1, 0.1)
+    with tf.variable_scope("model", reuse=None, initializer=initializer):
+        model = BILSTM_CRF(num_chars=num_chars, num_classes=num_classes, num_steps=num_steps, num_epochs=num_epochs, embedding_matrix=embedding_matrix, is_training=True)
 
-    print "training model"
-    tf.initialize_all_variables().run()
-    model.train(sess, save_path, X_train, y_train, X_val, y_val)
+    print ("training model")
+    tf.global_variables_initializer().run()
+    model.train(sess, save_path, X_train, y_train, X_val, y_val)
 
-    print "final best f1 is: %f" % (model.max_f1)
+    print ("final best f1 is: %f" % (model.max_f1))
 
-    end_time = time.time()
-    print "time used %f(hour)" % ((end_time - start_time) / 3600)
+    end_time = time.time()
+    print ("time used %f(hour)" % ((end_time - start_time) / 3600))

From 25b40f321335a10515e268e4de5ae9a6ef529901 Mon Sep 17 00:00:00 2001
From: martian2049
Date: Thu, 13 Apr 2017 01:44:30 +0800
Subject: [PATCH 2/3] .gitignore

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c34d28b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+.DS_Store
\ No newline at end of file

From 0f1d59edb738875a8b2cb7ded7d132d2fdcb415e Mon Sep 17 00:00:00 2001
From: martian2049
Date: Thu, 13 Apr 2017 01:45:43 +0800
Subject: [PATCH 3/3] readme, py3.6, tf1.0

---
 README.md | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 701caac..f7aacc4 100755
--- a/README.md
+++ b/README.md
@@ -1,3 +1,6 @@
+# Sequential Labeling
+Updated to py3.6, tf 1.0
+
 # Sequential Labeling
 
 - HMM
@@ -52,8 +55,8 @@ python test.py model test.in test.out -c char_emb -g 2
 The first line of the embedding file is the number of char and embedding dimension, seperating by space, e.g 5 10. The remaining line is the char and embedding vector, seperating by space, e.g N dim1 ... dim 10
 # Installation Dependencies
-- python 2.7
-- tensorflow 0.8
+- python 3.6
+- tensorflow 1.0
 - numpy
 - pandas
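
Every TensorFlow change in PATCH 1/3 is an instance of the same 0.x -> 1.0 API rename. For reference, a minimal self-contained TF 1.x sketch of the renamed calls follows; the tensor names and shapes are illustrative only and are not taken from this repository:

    import tensorflow as tf  # TensorFlow 1.x

    x = tf.placeholder(tf.float32, [None, 4])

    # 0.x: tf.split(split_dim, num_split, value) -> 1.0: value first, axis as keyword
    pieces = tf.split(x, num_or_size_splits=2, axis=1)
    # 0.x: tf.concat(concat_dim, values) -> 1.0: tf.concat(values, axis)
    joined = tf.concat(pieces, 1)
    # RNN cells moved from tf.nn.rnn_cell to tf.contrib.rnn
    cell = tf.contrib.rnn.BasicLSTMCell(8)
    # tf.scalar_summary / tf.train.SummaryWriter -> tf.summary.scalar / tf.summary.FileWriter
    loss_summary = tf.summary.scalar("loss", tf.reduce_mean(joined))
    # tf.initialize_all_variables -> tf.global_variables_initializer
    init_op = tf.global_variables_initializer()

The same swap of the axis argument accounts for every tf.concat and tf.split edit in BILSTM_CRF.py above.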