Commit bf449b9

Add files via upload
1 parent b18d4b6 commit bf449b9

4 files changed: 69 additions & 37 deletions

init.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
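The new init.py holds a single shared SentenceTransformer instance so that relation_features.py and self_features.py reuse one loaded model instead of each loading their own copy. A minimal sketch of how a consumer module uses it (the column name passed to encode is only an illustration):

import init

model = init.model                       # reuse the transformer loaded once in init.py
embedding = model.encode("customer_id")  # 768-dim vector for paraphrase-multilingual-mpnet-base-v2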

relation_features.py

Lines changed: 25 additions & 11 deletions
@@ -1,6 +1,7 @@
+import init
 import pandas as pd
 import numpy as np
-from torch import cosine_similarity
+from numpy.linalg import norm
 from self_features import make_self_features_from
 import random
 import os
@@ -9,24 +10,27 @@
 from strsimpy.damerau import Damerau
 from nltk.translate import bleu
 from nltk.translate.bleu_score import SmoothingFunction
-from sentence_transformers import SentenceTransformer, util
+from sentence_transformers import util
 import re
 
-model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
+model = init.model
+
 smoothie = SmoothingFunction().method4
 metriclcs = MetricLCS()
 damerau = Damerau()
 seed = 200
 random.seed(seed)
 
+def preprocess_text(text):
+    text = text.lower()
+    text = re.split(r'[\s\-\_\.]', text)
+    text = " ".join(text).strip()
+    return text
+
 def transformer_similarity(text1, text2):
     """
     Use sentence transformer to calculate similarity between two sentences.
     """
-    text1,text2 = text1.lower(), text2.lower()
-    text1 = re.split(r'[\s\-\_\.]', text1)
-    text1 = " ".join(text1).strip()
-    text2 = " ".join(text2).strip()
     embeddings1 = model.encode(text1)
     embeddings2 = model.encode(text2)
     cosine_similarity = util.cos_sim(embeddings1, embeddings2)
@@ -83,6 +87,13 @@ def get_colnames_features(text1,text2):
     colnames_features = np.array([bleu_score, edit_distance, lcs, transformer_score, one_in_one])
     return colnames_features
 
+def get_instance_similarity(embeddings1, embeddings2):
+    """
+    Use cosine similarity between two column embeddings.
+    """
+    cosine_similarity = np.inner(embeddings1, embeddings2) / (norm(embeddings1) * norm(embeddings2))
+    return np.array([cosine_similarity])
+
 def make_data_from(folder_path,type="train"):
     """
     Read data from folder and make relational features and labels as a matrix.
@@ -101,21 +112,24 @@ def make_data_from(folder_path,type="train"):
     table1_features = make_self_features_from(table1)
     table2_features = make_self_features_from(table2)
 
-    additional_feature_num = 5
-    output_feature_table = np.zeros((len(combinations_labels), table1_features.shape[1]*1 + additional_feature_num), dtype=np.float32)
+    additional_feature_num = 6
+    output_feature_table = np.zeros((len(combinations_labels), table1_features.shape[1] - 768 + additional_feature_num), dtype=np.float32)
     output_labels = np.zeros(len(combinations_labels), dtype=np.int32)
     for i, (combination, label) in enumerate(combinations_labels.items()):
         c1_name, c2_name = combination
         c1 = columns1.index(c1_name)
         c2 = columns2.index(c2_name)
         difference_features_percent = np.abs(table1_features[c1] - table2_features[c2]) / (table1_features[c1] + table2_features[c2] + 1e-8)
+        c1_name = preprocess_text(c1_name)
+        c2_name = preprocess_text(c2_name)
         colnames_features = get_colnames_features(c1_name, c2_name)
-        output_feature_table[i,:] = np.concatenate((difference_features_percent, colnames_features))
+        instance_similarity = get_instance_similarity(table1_features[c1][-768:], table2_features[c2][-768:])
+        output_feature_table[i,:] = np.concatenate((difference_features_percent[:-768], colnames_features, instance_similarity))
         output_labels[i] = label
         # add column names mask for training data
         if type == "train" and i % 5 == 0:
             colnames_features = np.array([0, 12, 0, 0.2, 0])
-            added_features = np.concatenate((difference_features_percent, colnames_features))
+            added_features = np.concatenate((difference_features_percent[:-768], colnames_features, instance_similarity))
             added_features = added_features.reshape((1, added_features.shape[0]))
             output_feature_table = np.concatenate((output_feature_table, added_features), axis=0)
             output_labels = np.concatenate((output_labels, np.array([label])))
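With this change, the last 768 entries of each column's self-feature vector (the deep embedding added in self_features.py) are no longer pushed through the element-wise percentage difference; the two embeddings are compared directly with cosine similarity and contribute one extra scalar feature, which is why additional_feature_num goes from 5 to 6 while the feature-table width drops by 768. A rough standalone sketch of that comparison, with randomly generated vectors standing in for table1_features[c1][-768:] and table2_features[c2][-768:]:

import numpy as np
from numpy.linalg import norm

emb1 = np.random.rand(768)   # placeholder for one column's 768-dim deep embedding
emb2 = np.random.rand(768)   # placeholder for the candidate matching column

# same formula as get_instance_similarity: one cosine-similarity scalar per column pair
instance_similarity = np.inner(emb1, emb2) / (norm(emb1) * norm(emb2))
print(instance_similarity)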

self_features.py

Lines changed: 19 additions & 4 deletions
@@ -1,9 +1,10 @@
+import init
 import pandas as pd
 import numpy as np
 import re
-from dateutil.parser import parse as parse_date
 import random
-
+from dateutil.parser import parse as parse_date
+model = init.model
 unit_dict = {"万": 10000, "亿": 100000000, "萬": 10000, "億": 100000000, "K+": 1000, "M+": 1000000, "B+": 1000000000}
 
 def load_table(filepath):
@@ -153,6 +154,18 @@ def character_features(data_list):
         np.var(whitespace_ratios)/np.mean(whitespace_ratios), np.var(punctuation_ratios)/np.mean(punctuation_ratios),
         np.var(special_character_ratios)/np.mean(special_character_ratios), np.var(numeric_ratios)/np.mean(numeric_ratios)])
 
+def deep_embedding(data_list):
+    """
+    Extracts deep embedding features from the given data using sentence-transformers.
+    """
+    if len(data_list) < 20:
+        selected_data = data_list
+    else:
+        selected_data = random.sample(data_list, 20)
+    embeddings = [model.encode(str(data)) for data in selected_data]
+    embeddings = np.array(embeddings)
+    return np.mean(embeddings, axis=0)
+
 def extract_features(data_list):
     """
     Extract some features from the given data(column) or list
@@ -181,12 +194,14 @@ def extract_features(data_list):
         num_fts = np.array([-1]*6)
     # If data is not numeric, give length features
     length_fts = numeric_features([len(str(d)) for d in data_list])
-    # Give character features if the data is string
+    # Give character features and deep embeddings if the data is string
    if data_type == "string" or (not strict_numeric(data_list) and mainly_numeric(data_list)):
         char_fts = character_features(data_list)
+        deep_fts = deep_embedding(data_list)
     else:
         char_fts = np.array([-1]*8)
-    output_features = np.concatenate((data_type_feature, num_fts, length_fts, char_fts))
+        deep_fts = np.array([-999]*768)
+    output_features = np.concatenate((data_type_feature, num_fts, length_fts, char_fts, deep_fts))
     return output_features
 
 def make_self_features_from(filepath):
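deep_embedding samples at most 20 cell values from a column, encodes each with the shared sentence-transformer, and mean-pools the results into one 768-dimensional column embedding; non-string columns instead get a constant -999 fill so the feature vector keeps a fixed width. A small sketch under those assumptions, with an invented string column:

import random
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

values = ["Berlin", "Paris", "Tokyo", "Osaka"]                 # hypothetical column values
sample = values if len(values) < 20 else random.sample(values, 20)
embeddings = np.array([model.encode(str(v)) for v in sample])  # shape (n, 768)
column_embedding = np.mean(embeddings, axis=0)                 # one 768-dim vector per column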

train.py

Lines changed: 22 additions & 22 deletions
@@ -12,17 +12,17 @@
     "length:mean", "length:min", "length:max", "length:variance", "length:cv", "length:unique/len(data_list)",
     "whitespace_ratios:mean", "punctuation_ratios:mean", "special_character_ratios:mean", "numeric_ratios:mean",
     "whitespace_ratios:cv", "punctuation_ratios:cv", "special_character_ratios:cv", "numeric_ratios:cv",
-    "colname:bleu_score", "colname:edit_distance", "colname:lcs", "colname:tsm_cosine", "colname:one_in_one"
+    "colname:bleu_score", "colname:edit_distance", "colname:lcs", "colname:tsm_cosine", "colname:one_in_one", "instance_similarity:cosine",
 ]
 
 params = {
-    'max_depth': 3,
-    'eta': 0.03,
+    'max_depth': 4,
+    'eta': 0.1,
     'objective': 'binary:logistic',
-    'eval_metric': 'auc',
+    'eval_metric': 'logloss',
 }
 
-def train(train_features,train_labels,num_round=900):
+def train(train_features,train_labels,num_round):
     dtrain = xgb.DMatrix(train_features, label=train_labels)
     bst = xgb.train(params, dtrain, num_round)
     # get best_threshold
@@ -89,7 +89,7 @@ def get_feature_importances(bst):
     importance = sorted(importance, key=lambda x: x[0][1], reverse=True)
     return importance
 
-def train_loop(num_round=900):
+def train_loop(num_round=300):
     precision_list = []
     recall_list = []
     f1_list = []
@@ -100,8 +100,6 @@ def train_loop(num_round=900):
         bst, best_threshold = train(train_features, train_labels, num_round)
         precision, recall, f1, c_matrix = test(bst, best_threshold, test_features, test_labels)
         feature_importance = get_feature_importances(bst)
-        #print(f"Positive rate in Training: {sum(train_labels)/len(train_labels)*100:.2f}%")
-        #print(f"Positive rate in Testing: {sum(test_labels)/len(test_labels)*100:.2f}%")
         c_matrix_norm = c_matrix.astype('float') / c_matrix.sum(axis=1)[:, np.newaxis]
         precision_list.append(precision)
         recall_list.append(recall)
@@ -111,8 +109,6 @@ def train_loop(num_round=900):
         bst.save_model(model_save_pth + f"/{i}.model")
         with open(model_save_pth + f"/{i}.threshold", 'w') as f:
             f.write(str(best_threshold))
-    #print(f1_list)
-    #print(np.mean(c_matrix_list,axis=0))
     # evaluate feature importance
     feature_name_importance = {}
     for feature_importance in feature_importance_list:
@@ -133,11 +129,15 @@ def optimize_hyperparameter(eta_candid,max_depth_candid,num_round_candid):
            params["eta"] = eta
            params["max_depth"] = max_depth
            precision_list, recall_list, f1_list, c_matrix_list, feature_name_importance = train_loop(num_round)
+           print("Average Precision: %.3f" % np.mean(precision_list))
+           print("Average Recall: %.3f" % np.mean(recall_list))
+           print("Average F1: %.3f" % np.mean(f1_list))
            if np.mean(f1_list) > best_f1:
                best_f1 = np.mean(f1_list)
                best_params = params
                best_precision = np.mean(precision_list)
                best_recall = np.mean(recall_list)
+               best_params["num_round"] = num_round
    return best_params, best_precision, best_recall, best_f1


@@ -146,17 +146,6 @@ def optimize_hyperparameter(eta_candid,max_depth_candid,num_round_candid):
 if not os.path.exists(model_save_pth):
     os.makedirs(model_save_pth)
 
-# tune parameters
-if False:
-    eta_candidate = [0.3, 0.2, 0.15, 0.1, 0.08, 0.05, 0.03]
-    max_depth_candidate = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
-    num_round_candidate = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
-    best_params, best_precision, best_recall, best_f1 = optimize_hyperparameter(eta_candidate, max_depth_candidate, num_round_candidate)
-    print(best_params)
-    print(best_precision)
-    print(best_recall)
-    print(best_f1)
-
 precision_list, recall_list, f1_list, c_matrix_list, feature_name_importance = train_loop()
 # give evaluation results
 print("Average Precision: %.3f" % np.mean(precision_list))
@@ -166,4 +155,15 @@ def optimize_hyperparameter(eta_candid,max_depth_candid,num_round_candid):
 print("Average Confusion Matrix: \n", np.mean(c_matrix_list, axis=0))
 print("Feature Importance:")
 for importance in feature_name_importance:
-    print(f"{importance[0]}: {importance[1]}")
+    print(f"{importance[0]}: {importance[1]}")
+
+# tune parameters
+if False:
+    eta_candidate = [0.08, 0.05, 0.03, 0.01]
+    max_depth_candidate = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20]
+    num_round_candidate = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
+    best_params, best_precision, best_recall, best_f1 = optimize_hyperparameter(eta_candidate, max_depth_candidate, num_round_candidate)
+    print(best_params)
+    print(best_precision)
+    print(best_recall)
+    print(best_f1)
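On the training side the commit only adjusts hyperparameters (max_depth 3 to 4, eta 0.03 to 0.1, eval_metric auc to logloss, default boosting rounds 900 to 300) and records num_round in the best parameter set during tuning; the XGBoost flow itself is unchanged. A minimal sketch of how train() consumes these settings, using toy random data in place of the features produced by make_data_from:

import numpy as np
import xgboost as xgb

params = {'max_depth': 4, 'eta': 0.1, 'objective': 'binary:logistic', 'eval_metric': 'logloss'}

train_features = np.random.rand(200, 30).astype(np.float32)   # toy feature matrix
train_labels = np.random.randint(0, 2, 200)                   # toy binary labels

dtrain = xgb.DMatrix(train_features, label=train_labels)
bst = xgb.train(params, dtrain, num_boost_round=300)          # train_loop now defaults to 300 rounds
probs = bst.predict(dtrain)                                   # probabilities, thresholded downstream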
