Commit bf449b9

Add files via upload
1 parent b18d4b6 commit bf449b9

4 files changed: 69 additions & 37 deletions

init.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
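The new init.py holds a single shared SentenceTransformer instance so that relation_features.py and self_features.py reuse one loaded model instead of each loading their own copy. A minimal sketch of how a consumer module uses it (the column name passed to encode is only an illustration):

import init

model = init.model                       # reuse the transformer loaded once in init.py
embedding = model.encode("customer_id")  # 768-dim vector for paraphrase-multilingual-mpnet-base-v2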

relation_features.py

Lines changed: 25 additions & 11 deletions
@@ -1,6 +1,7 @@
+import init
 import pandas as pd
 import numpy as np
-from torch import cosine_similarity
+from numpy.linalg import norm
 from self_features import make_self_features_from
 import random
 import os
@@ -9,24 +10,27 @@
 from strsimpy.damerau import Damerau
 from nltk.translate import bleu
 from nltk.translate.bleu_score import SmoothingFunction
-from sentence_transformers import SentenceTransformer, util
+from sentence_transformers import util
 import re
 
-model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
+model = init.model
+
 smoothie = SmoothingFunction().method4
 metriclcs = MetricLCS()
 damerau = Damerau()
 seed = 200
 random.seed(seed)
 
+def preprocess_text(text):
+    text = text.lower()
+    text = re.split(r'[\s\-\_\.]', text)
+    text = " ".join(text).strip()
+    return text
+
 def transformer_similarity(text1, text2):
     """
     Use sentence transformer to calculate similarity between two sentences.
     """
-    text1,text2 = text1.lower(), text2.lower()
-    text1 = re.split(r'[\s\-\_\.]', text1)
-    text1 = " ".join(text1).strip()
-    text2 = " ".join(text2).strip()
     embeddings1 = model.encode(text1)
     embeddings2 = model.encode(text2)
     cosine_similarity = util.cos_sim(embeddings1, embeddings2)
@@ -83,6 +87,13 @@ def get_colnames_features(text1,text2):
     colnames_features = np.array([bleu_score, edit_distance, lcs, transformer_score, one_in_one])
     return colnames_features
 
+def get_instance_similarity(embeddings1, embeddings2):
+    """
+    Use cosine similarity between two column embeddings.
+    """
+    cosine_similarity = np.inner(embeddings1, embeddings2) / (norm(embeddings1) * norm(embeddings2))
+    return np.array([cosine_similarity])
+
 def make_data_from(folder_path,type="train"):
     """
     Read data from folder and make relational features and labels as a matrix.
@@ -101,21 +112,24 @@ def make_data_from(folder_path,type="train"):
     table1_features = make_self_features_from(table1)
     table2_features = make_self_features_from(table2)
 
-    additional_feature_num = 5
-    output_feature_table = np.zeros((len(combinations_labels), table1_features.shape[1]*1 + additional_feature_num), dtype=np.float32)
+    additional_feature_num = 6
+    output_feature_table = np.zeros((len(combinations_labels), table1_features.shape[1] - 768 + additional_feature_num), dtype=np.float32)
     output_labels = np.zeros(len(combinations_labels), dtype=np.int32)
     for i, (combination, label) in enumerate(combinations_labels.items()):
         c1_name, c2_name = combination
         c1 = columns1.index(c1_name)
         c2 = columns2.index(c2_name)
         difference_features_percent = np.abs(table1_features[c1] - table2_features[c2]) / (table1_features[c1] + table2_features[c2] + 1e-8)
+        c1_name = preprocess_text(c1_name)
+        c2_name = preprocess_text(c2_name)
         colnames_features = get_colnames_features(c1_name, c2_name)
-        output_feature_table[i,:] = np.concatenate((difference_features_percent, colnames_features))
+        instance_similarity = get_instance_similarity(table1_features[c1][-768:], table2_features[c2][-768:])
+        output_feature_table[i,:] = np.concatenate((difference_features_percent[:-768], colnames_features, instance_similarity))
         output_labels[i] = label
         # add column names mask for training data
         if type == "train" and i % 5 == 0:
             colnames_features = np.array([0, 12, 0, 0.2, 0])
-            added_features = np.concatenate((difference_features_percent, colnames_features))
+            added_features = np.concatenate((difference_features_percent[:-768], colnames_features, instance_similarity))
             added_features = added_features.reshape((1, added_features.shape[0]))
             output_feature_table = np.concatenate((output_feature_table, added_features), axis=0)
             output_labels = np.concatenate((output_labels, np.array([label])))
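With this change, the last 768 entries of each column's self-feature vector (the deep embedding added in self_features.py) are no longer pushed through the element-wise percentage difference; the two embeddings are compared directly with cosine similarity and contribute one extra scalar feature, which is why additional_feature_num goes from 5 to 6 while the feature-table width drops by 768. A rough standalone sketch of that comparison, with randomly generated vectors standing in for table1_features[c1][-768:] and table2_features[c2][-768:]:

import numpy as np
from numpy.linalg import norm

emb1 = np.random.rand(768)   # placeholder for one column's 768-dim deep embedding
emb2 = np.random.rand(768)   # placeholder for the candidate matching column

# same formula as get_instance_similarity: one cosine-similarity scalar per column pair
instance_similarity = np.inner(emb1, emb2) / (norm(emb1) * norm(emb2))
print(instance_similarity)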

self_features.py

Lines changed: 19 additions & 4 deletions
@@ -1,9 +1,10 @@
+import init
 import pandas as pd
 import numpy as np
 import re
-from dateutil.parser import parse as parse_date
 import random
-
+from dateutil.parser import parse as parse_date
+model = init.model
 unit_dict = {"万": 10000, "亿": 100000000, "萬": 10000, "億": 100000000, "K+": 1000, "M+": 1000000, "B+": 1000000000}
 
 def load_table(filepath):
@@ -153,6 +154,18 @@ def character_features(data_list):
         np.var(whitespace_ratios)/np.mean(whitespace_ratios), np.var(punctuation_ratios)/np.mean(punctuation_ratios),
         np.var(special_character_ratios)/np.mean(special_character_ratios), np.var(numeric_ratios)/np.mean(numeric_ratios)])
 
+def deep_embedding(data_list):
+    """
+    Extracts deep embedding features from the given data using sentence-transformers.
+    """
+    if len(data_list) < 20:
+        selected_data = data_list
+    else:
+        selected_data = random.sample(data_list, 20)
+    embeddings = [model.encode(str(data)) for data in selected_data]
+    embeddings = np.array(embeddings)
+    return np.mean(embeddings, axis=0)
+
 def extract_features(data_list):
     """
     Extract some features from the given data(column) or list
@@ -181,12 +194,14 @@ def extract_features(data_list):
         num_fts = np.array([-1]*6)
     # If data is not numeric, give length features
     length_fts = numeric_features([len(str(d)) for d in data_list])
-    # Give character features if the data is string
+    # Give character features and deep embeddings if the data is string
    if data_type == "string" or (not strict_numeric(data_list) and mainly_numeric(data_list)):
         char_fts = character_features(data_list)
+        deep_fts = deep_embedding(data_list)
     else:
         char_fts = np.array([-1]*8)
-    output_features = np.concatenate((data_type_feature, num_fts, length_fts, char_fts))
+        deep_fts = np.array([-999]*768)
+    output_features = np.concatenate((data_type_feature, num_fts, length_fts, char_fts, deep_fts))
     return output_features
 
 def make_self_features_from(filepath):
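deep_embedding samples at most 20 cell values from a column, encodes each with the shared sentence-transformer, and mean-pools the results into one 768-dimensional column embedding; non-string columns instead get a constant -999 fill so the feature vector keeps a fixed width. A small sketch under those assumptions, with an invented string column:

import random
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

values = ["Berlin", "Paris", "Tokyo", "Osaka"]                 # hypothetical column values
sample = values if len(values) < 20 else random.sample(values, 20)
embeddings = np.array([model.encode(str(v)) for v in sample])  # shape (n, 768)
column_embedding = np.mean(embeddings, axis=0)                 # one 768-dim vector per column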

train.py

Lines changed: 22 additions & 22 deletions
@@ -12,17 +12,17 @@
     "length:mean", "length:min", "length:max", "length:variance", "length:cv", "length:unique/len(data_list)",
     "whitespace_ratios:mean", "punctuation_ratios:mean", "special_character_ratios:mean", "numeric_ratios:mean",
     "whitespace_ratios:cv", "punctuation_ratios:cv", "special_character_ratios:cv", "numeric_ratios:cv",
-    "colname:bleu_score", "colname:edit_distance", "colname:lcs", "colname:tsm_cosine", "colname:one_in_one"
+    "colname:bleu_score", "colname:edit_distance", "colname:lcs", "colname:tsm_cosine", "colname:one_in_one", "instance_similarity:cosine",
 ]
 
 params = {
-    'max_depth': 3,
-    'eta': 0.03,
+    'max_depth': 4,
+    'eta': 0.1,
     'objective': 'binary:logistic',
-    'eval_metric': 'auc',
+    'eval_metric': 'logloss',
 }
 
-def train(train_features,train_labels,num_round=900):
+def train(train_features,train_labels,num_round):
     dtrain = xgb.DMatrix(train_features, label=train_labels)
     bst = xgb.train(params, dtrain, num_round)
     # get best_threshold
@@ -89,7 +89,7 @@ def get_feature_importances(bst):
     importance = sorted(importance, key=lambda x: x[0][1], reverse=True)
     return importance
 
-def train_loop(num_round=900):
+def train_loop(num_round=300):
     precision_list = []
     recall_list = []
     f1_list = []
@@ -100,8 +100,6 @@ def train_loop(num_round=900):
         bst, best_threshold = train(train_features, train_labels, num_round)
         precision, recall, f1, c_matrix = test(bst, best_threshold, test_features, test_labels)
         feature_importance = get_feature_importances(bst)
-        #print(f"Positive rate in Training: {sum(train_labels)/len(train_labels)*100:.2f}%")
-        #print(f"Positive rate in Testing: {sum(test_labels)/len(test_labels)*100:.2f}%")
         c_matrix_norm = c_matrix.astype('float') / c_matrix.sum(axis=1)[:, np.newaxis]
         precision_list.append(precision)
         recall_list.append(recall)
@@ -111,8 +109,6 @@ def train_loop(num_round=900):
         bst.save_model(model_save_pth + f"/{i}.model")
         with open(model_save_pth + f"/{i}.threshold", 'w') as f:
             f.write(str(best_threshold))
-    #print(f1_list)
-    #print(np.mean(c_matrix_list,axis=0))
     # evaluate feature importance
     feature_name_importance = {}
     for feature_importance in feature_importance_list:
@@ -133,11 +129,15 @@ def optimize_hyperparameter(eta_candid,max_depth_candid,num_round_candid):
            params["eta"] = eta
            params["max_depth"] = max_depth
            precision_list, recall_list, f1_list, c_matrix_list, feature_name_importance = train_loop(num_round)
+           print("Average Precision: %.3f" % np.mean(precision_list))
+           print("Average Recall: %.3f" % np.mean(recall_list))
+           print("Average F1: %.3f" % np.mean(f1_list))
            if np.mean(f1_list) > best_f1:
                best_f1 = np.mean(f1_list)
                best_params = params
                best_precision = np.mean(precision_list)
                best_recall = np.mean(recall_list)
+               best_params["num_round"] = num_round
    return best_params, best_precision, best_recall, best_f1


@@ -146,17 +146,6 @@ def optimize_hyperparameter(eta_candid,max_depth_candid,num_round_candid):
 if not os.path.exists(model_save_pth):
     os.makedirs(model_save_pth)
 
-# tune parameters
-if False:
-    eta_candidate = [0.3, 0.2, 0.15, 0.1, 0.08, 0.05, 0.03]
-    max_depth_candidate = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
-    num_round_candidate = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
-    best_params, best_precision, best_recall, best_f1 = optimize_hyperparameter(eta_candidate, max_depth_candidate, num_round_candidate)
-    print(best_params)
-    print(best_precision)
-    print(best_recall)
-    print(best_f1)
-
 precision_list, recall_list, f1_list, c_matrix_list, feature_name_importance = train_loop()
 # give evaluation results
 print("Average Precision: %.3f" % np.mean(precision_list))
@@ -166,4 +155,15 @@ def optimize_hyperparameter(eta_candid,max_depth_candid,num_round_candid):
 print("Average Confusion Matrix: \n", np.mean(c_matrix_list, axis=0))
 print("Feature Importance:")
 for importance in feature_name_importance:
-    print(f"{importance[0]}: {importance[1]}")
+    print(f"{importance[0]}: {importance[1]}")
+
+# tune parameters
+if False:
+    eta_candidate = [0.08, 0.05, 0.03, 0.01]
+    max_depth_candidate = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20]
+    num_round_candidate = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
+    best_params, best_precision, best_recall, best_f1 = optimize_hyperparameter(eta_candidate, max_depth_candidate, num_round_candidate)
+    print(best_params)
+    print(best_precision)
+    print(best_recall)
+    print(best_f1)
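On the training side the commit only adjusts hyperparameters (max_depth 3 to 4, eta 0.03 to 0.1, eval_metric auc to logloss, default boosting rounds 900 to 300) and records num_round in the best parameter set during tuning; the XGBoost flow itself is unchanged. A minimal sketch of how train() consumes these settings, using toy random data in place of the features produced by make_data_from:

import numpy as np
import xgboost as xgb

params = {'max_depth': 4, 'eta': 0.1, 'objective': 'binary:logistic', 'eval_metric': 'logloss'}

train_features = np.random.rand(200, 30).astype(np.float32)   # toy feature matrix
train_labels = np.random.randint(0, 2, 200)                   # toy binary labels

dtrain = xgb.DMatrix(train_features, label=train_labels)
bst = xgb.train(params, dtrain, num_boost_round=300)          # train_loop now defaults to 300 rounds
probs = bst.predict(dtrain)                                   # probabilities, thresholded downstream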
