1212 "length:mean" , "length:min" , "length:max" , "length:variance" ,"length:cv" , "length:unique/len(data_list)" ,
1313 "whitespace_ratios:mean" ,"punctuation_ratios:mean" ,"special_character_ratios:mean" ,"numeric_ratios:mean" ,
1414 "whitespace_ratios:cv" ,"punctuation_ratios:cv" ,"special_character_ratios:cv" ,"numeric_ratios:cv" ,
15- "colname:bleu_score" , "colname:edit_distance" ,"colname:lcs" ,"colname:tsm_cosine" , "colname:one_in_one"
15+ "colname:bleu_score" , "colname:edit_distance" ,"colname:lcs" ,"colname:tsm_cosine" , "colname:one_in_one" , "instance_similarity:cosine" ,
1616 ]
1717
1818params = {
19- 'max_depth' : 3 ,
20- 'eta' : 0.03 ,
19+ 'max_depth' : 4 ,
20+ 'eta' : 0.1 ,
2121 'objective' : 'binary:logistic' ,
22- 'eval_metric' : 'auc ' ,
22+ 'eval_metric' : 'logloss ' ,
2323 }
2424
25- def train (train_features ,train_labels ,num_round = 900 ):
25+ def train (train_features ,train_labels ,num_round ):
2626 dtrain = xgb .DMatrix (train_features , label = train_labels )
2727 bst = xgb .train (params , dtrain , num_round )
2828 # get best_threshold
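With `eval_metric` switched from `auc` to `logloss` and `train` no longer defaulting `num_round`, callers must now choose the round count explicitly. A minimal sketch of how the new metric could guide that choice via early stopping; the held-out split (`val_features`, `val_labels`) and the `train_with_eval` helper are assumptions for illustration, not part of this change:

```python
# Sketch only: monitor the new 'logloss' eval_metric on a held-out split
# to pick num_round via early stopping. val_features/val_labels are
# assumed names, not part of this diff.
import xgboost as xgb

def train_with_eval(train_features, train_labels, val_features, val_labels,
                    num_round=300):
    dtrain = xgb.DMatrix(train_features, label=train_labels)
    dval = xgb.DMatrix(val_features, label=val_labels)
    bst = xgb.train(
        params, dtrain, num_round,
        evals=[(dtrain, "train"), (dval, "val")],
        early_stopping_rounds=50,  # stop once val logloss stops improving
        verbose_eval=False,
    )
    return bst
```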
@@ -89,7 +89,7 @@ def get_feature_importances(bst):
     importance = sorted(importance, key=lambda x: x[0][1], reverse=True)
     return importance
 
-def train_loop(num_round=900):
+def train_loop(num_round=300):
     precision_list = []
     recall_list = []
     f1_list = []
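Note how this default pairs with the `eta` change above: the boosting budget drops from 900 rounds at eta 0.03 to 300 rounds at eta 0.1, so the total step size stays roughly constant (0.03 × 900 = 27 versus 0.1 × 300 = 30). The new defaults trade many small steps for fewer, larger ones at about a third of the training cost.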
@@ -100,8 +100,6 @@ def train_loop(num_round=900):
         bst, best_threshold = train(train_features, train_labels, num_round)
         precision, recall, f1, c_matrix = test(bst, best_threshold, test_features, test_labels)
         feature_importance = get_feature_importances(bst)
-        #print(f"Positive rate in Training: {sum(train_labels)/len(train_labels)*100:.2f}%")
-        #print(f"Positive rate in Testing: {sum(test_labels)/len(test_labels)*100:.2f}%")
         c_matrix_norm = c_matrix.astype('float') / c_matrix.sum(axis=1)[:, np.newaxis]
         precision_list.append(precision)
         recall_list.append(recall)
@@ -111,8 +109,6 @@ def train_loop(num_round=900):
         bst.save_model(model_save_pth + f"/{i}.model")
         with open(model_save_pth + f"/{i}.threshold", 'w') as f:
             f.write(str(best_threshold))
-    #print(f1_list)
-    #print(np.mean(c_matrix_list,axis=0))
     # evaluate feature importance
     feature_name_importance = {}
     for feature_importance in feature_importance_list:
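Each fold persists its model and decision threshold side by side, so inference needs both files back. A minimal sketch of the matching load path, assuming only the save format visible above (the `load_fold` helper is hypothetical):

```python
# Sketch only: reload a fold's booster and its tuned threshold.
# The file layout matches the save_model/.threshold writes above.
import xgboost as xgb

def load_fold(model_save_pth, i):
    bst = xgb.Booster()
    bst.load_model(model_save_pth + f"/{i}.model")
    with open(model_save_pth + f"/{i}.threshold") as f:
        best_threshold = float(f.read())
    return bst, best_threshold

# Usage: probs = bst.predict(xgb.DMatrix(features))
#        labels = probs >= best_threshold
```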
@@ -133,11 +129,15 @@ def optimize_hyperparameter(eta_candid,max_depth_candid,num_round_candid):
                 params["eta"] = eta
                 params["max_depth"] = max_depth
                 precision_list, recall_list, f1_list, c_matrix_list, feature_name_importance = train_loop(num_round)
+                print("Average Precision: %.3f" % np.mean(precision_list))
+                print("Average Recall: %.3f" % np.mean(recall_list))
+                print("Average F1: %.3f" % np.mean(f1_list))
                 if np.mean(f1_list) > best_f1:
                     best_f1 = np.mean(f1_list)
                     best_params = params
                     best_precision = np.mean(precision_list)
                     best_recall = np.mean(recall_list)
+                    best_params["num_round"] = num_round
     return best_params, best_precision, best_recall, best_f1
 
 
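One caveat in this hunk: `params` is the module-level dict mutated on every loop iteration, so `best_params = params` stores a reference rather than a snapshot. After the search, `best_params` (including the new `"num_round"` key) therefore reflects the last configuration tried, not the best one. A copy at the assignment site would pin the winner:

```python
# Sketch of a fix: snapshot the dict instead of aliasing it, so later
# mutations of `params` cannot overwrite the recorded best configuration.
best_params = dict(params)
best_params["num_round"] = num_round
```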
@@ -146,17 +146,6 @@ def optimize_hyperparameter(eta_candid,max_depth_candid,num_round_candid):
 if not os.path.exists(model_save_pth):
     os.makedirs(model_save_pth)
 
-# tune parameters
-if False:
-    eta_candidate = [0.3, 0.2, 0.15, 0.1, 0.08, 0.05, 0.03]
-    max_depth_candidate = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
-    num_round_candidate = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
-    best_params, best_precision, best_recall, best_f1 = optimize_hyperparameter(eta_candidate, max_depth_candidate, num_round_candidate)
-    print(best_params)
-    print(best_precision)
-    print(best_recall)
-    print(best_f1)
-
 precision_list, recall_list, f1_list, c_matrix_list, feature_name_importance = train_loop()
 # give evaluation results
 print("Average Precision: %.3f" % np.mean(precision_list))
@@ -166,4 +155,15 @@ def optimize_hyperparameter(eta_candid,max_depth_candid,num_round_candid):
 print("Average Confusion Matrix: \n", np.mean(c_matrix_list, axis=0))
 print("Feature Importance:")
 for importance in feature_name_importance:
-    print(f"{importance[0]}: {importance[1]}")
+    print(f"{importance[0]}: {importance[1]}")
+
+# tune parameters
+if False:
+    eta_candidate = [0.08, 0.05, 0.03, 0.01]
+    max_depth_candidate = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20]
+    num_round_candidate = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
+    best_params, best_precision, best_recall, best_f1 = optimize_hyperparameter(eta_candidate, max_depth_candidate, num_round_candidate)
+    print(best_params)
+    print(best_precision)
+    print(best_recall)
+    print(best_f1)
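For scale: the revised grid is 4 etas × 11 depths × 10 round counts, i.e. 440 full `train_loop` runs when the block is enabled. If the nested loops inside `optimize_hyperparameter` ever need flattening, `itertools.product` expresses the same sweep; a sketch, assuming the loop structure implied by the indentation above:

```python
# Sketch only: the same 4 * 11 * 10 = 440-combination sweep, flattened.
from itertools import product

for eta, max_depth, num_round in product(eta_candidate,
                                         max_depth_candidate,
                                         num_round_candidate):
    params["eta"], params["max_depth"] = eta, max_depth
    # ... run train_loop(num_round) and keep the best F1, as above
```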