-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreateTreeModel.py
More file actions
100 lines (81 loc) · 3.69 KB
/
createTreeModel.py
File metadata and controls
100 lines (81 loc) · 3.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
import pickle
import os
# Directories, relative to the current working directory, that are passed to
# fetch_openml as its data_home and used to store pickled models.
_data_home = 'data'
_model_home = 'treeModels'

# OpenML dataset ids accepted by createTreeModel.
_dataset_ids = [4538, 44, 43174, 1475, 41150, 41145, 41168, 44975, 4549, 1219]

# Default max_depth per dataset, used when createTreeModel is called without
# an explicit depth. Dataset 1219 has no entry here.
_id2depth = {
    41168: 40,
    41150: 20,
    41145: 15,
    44: 15,
    1475: 20,
    4538: 10,
    43174: 10,
    44975: 30,
    4549: 50,
}

# Datasets handled with classifiers and a stratified split; every other id is
# handled with regressors.
_classification_ids = [41168, 41150, 41145, 44, 1475, 4538, 1219]

# Datasets whose targets are standardised in data_from_openml.
_datasets_using_scaler = [44975, 4549]

# When the working directory has a path component named 'test', point both
# directories one level up.
if 'test' in os.getcwd().split(os.sep):
    _data_home, _model_home = (
        os.path.join('..', _data_home),
        os.path.join('..', _model_home),
    )
def _resolve_max_depth(dataset_id, depth):
    """Return ``depth`` if truthy, otherwise the dataset's default from ``_id2depth``."""
    max_depth = depth or _id2depth.get(dataset_id)
    if max_depth is None:
        raise ValueError(
            f'dataset {dataset_id} has no default depth in _id2depth; '
            'pass depth explicitly')
    return max_depth


def _new_tree_model(dataset_id, n_estimators, random_seed, max_depth):
    """Build the unfitted estimator matching the dataset's task and ensemble size."""
    is_classification = dataset_id in _classification_ids
    if n_estimators:
        booster_cls = (GradientBoostingClassifier if is_classification
                       else GradientBoostingRegressor)
        return booster_cls(n_estimators=n_estimators,
                           max_depth=max_depth,
                           random_state=random_seed)
    tree_cls = DecisionTreeClassifier if is_classification else DecisionTreeRegressor
    return tree_cls(random_state=random_seed, max_depth=max_depth)


def createTreeModel(dataset_id, n_estimators, random_seed, depth=None):
    """Return a fitted tree model for an OpenML dataset, using an on-disk cache.

    A falsy ``n_estimators`` selects a single decision tree; otherwise a
    gradient-boosting ensemble with that many estimators is used. Datasets in
    ``_classification_ids`` get a classifier, all others a regressor.

    The fitted model is pickled under ``_model_home``. If a pickle with the
    same dataset id, depth and (for ensembles) ``n_estimators`` already exists,
    it is loaded instead of fitting again.

    Args:
        dataset_id: OpenML dataset id; must be one of ``_dataset_ids``.
        n_estimators: Number of boosting stages, or 0/None for a single tree.
        random_seed: Seed for the train/test split and the estimator.
        depth: Maximum tree depth. When falsy, the default from ``_id2depth``
            is used.

    Returns:
        Tuple ``(model, X_test, y_test)``.

    Raises:
        ValueError: If ``dataset_id`` is not in ``_dataset_ids``, or if no
            depth is given and the dataset has no default depth.
    """
    # Explicit checks instead of ``assert`` (which is stripped under ``-O``),
    # done before any data is fetched so bad arguments fail fast.
    if dataset_id not in _dataset_ids:
        raise ValueError(
            f'unknown dataset_id {dataset_id}; expected one of {_dataset_ids}')
    max_depth = _resolve_max_depth(dataset_id, depth)

    X_train, X_test, y_train, y_test = data_from_openml(dataset_id, random_seed)

    os.makedirs(_model_home, exist_ok=True)
    if n_estimators:
        file_name = (f'dataset_id={dataset_id}-depth={max_depth}'
                     f'-n_estimators={n_estimators}.pkl')
    else:
        file_name = f'dataset_id={dataset_id}-depth={max_depth}.pkl'
    model_path = os.path.join(_model_home, file_name)

    # NOTE(review): the cache key does not include random_seed, so a cached
    # model may have been fit on a different train/test split than the
    # X_test/y_test returned here. Kept as-is to stay compatible with existing
    # cache files; callers should use one seed per cache directory.
    if os.path.exists(model_path):
        # NOTE: pickle.load executes arbitrary code from the file; only use a
        # model directory whose contents are trusted.
        with open(model_path, 'rb') as f:
            model = pickle.load(f)
    else:
        model = _new_tree_model(dataset_id, n_estimators, random_seed, max_depth)
        model.fit(X_train, y_train)
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
    return model, X_test, y_test
def data_from_openml(dataset_id, random_seed, test_size=0.2):
    """Fetch an OpenML dataset and split it into train and test parts.

    The data is fetched as arrays (not data frames) with ``_data_home`` as the
    fetch_openml data_home. Datasets in ``_classification_ids`` are split with
    stratification on the target; all others get a plain random split. For
    datasets in ``_datasets_using_scaler`` the targets are standardised with a
    scaler fitted on the training targets only.

    Args:
        dataset_id: OpenML dataset id.
        random_seed: Seed passed to ``train_test_split``.
        test_size: Fraction (or count) of samples held out for testing.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features, targets = fetch_openml(
        data_id=dataset_id,
        return_X_y=True,
        as_frame=False,
        data_home=_data_home,
    )

    # stratify=None is train_test_split's default, i.e. a non-stratified split.
    strata = targets if dataset_id in _classification_ids else None
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        targets,
        test_size=test_size,
        random_state=random_seed,
        stratify=strata,
    )

    if dataset_id not in _datasets_using_scaler:
        return X_train, X_test, y_train, y_test

    # StandardScaler works on 2-D input: add a column axis, then drop it again.
    target_scaler = StandardScaler()
    y_train = target_scaler.fit_transform(y_train[:, None])[:, 0]
    y_test = target_scaler.transform(y_test[:, None])[:, 0]
    return X_train, X_test, y_train, y_test
if __name__ == '__main__':
    seed = 2025

    # Fit (or load from cache) a single tree and a 5-stage boosted ensemble
    # for every dataset except the last entry of _dataset_ids, using each
    # dataset's default depth.
    for ds_id in _dataset_ids[:-1]:
        print(ds_id)
        for ensemble_size in (0, 5):
            createTreeModel(ds_id, ensemble_size, seed)

    # Dataset 1219 has no default depth, so sweep explicit depths 35..65 with
    # a single decision tree.
    # NOTE(review): the original indentation was lost; this loop is assumed to
    # sit directly under the __main__ guard, not inside the dataset loop.
    for tree_depth in range(35, 66):
        print(tree_depth)
        createTreeModel(1219, 0, seed, tree_depth)