-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathtest.py
More file actions
123 lines (105 loc) · 4.1 KB
/
test.py
File metadata and controls
123 lines (105 loc) · 4.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from __future__ import division
import pandas as pd
import numpy as np
import csv
from sklearn.model_selection import KFold
def update(df1,df2,on_column,columns_to_omit):
    """Fill the NaN rows of df1 in-place with matching rows from df2.

    Both dataframes have to have same column names (original author's
    precondition).  df2 is left-joined onto df1 on `on_column`; the
    columns contributed by df2 are then written back into df1, but only
    for rows where df1 is currently NaN.

    NOTE(review): `merge` resets the row index, so the final `.loc`
    assignment aligns correctly only if df1 has a default RangeIndex —
    presumably true for freshly-read CSVs; verify against callers.

    Returns df1 (also mutated in place).
    """
    # Both dataframes have to have same column names
    header = list(df1)
    # Column names after the first `columns_to_omit` key/id columns —
    # these are the value columns that may need filling.
    header = header[columns_to_omit:]
    # Columns at positions >= start in the merge result come from df2.
    start = df1.shape[1]
    # Left-join keeps df1's row order; slice off df2's contribution and
    # drop rows where df2 had no match (those stay NaN in df1).
    to_update = df1.merge(df2,on=on_column,how='left').iloc[:,start:].dropna()
    # Rename df2's (suffixed) columns to df1's names so .loc aligns by label.
    to_update.columns = header
    #UPDATE just on NaN values
    # for elem in header:
    #     df1.loc[df1[elem].isnull(),elem] = to_update[elem]
    # print(df1)
    #UPDATE whole row when NaN appears
    # A row is treated as all-NaN when its first value column is NaN.
    df1.loc[df1[header[0]].isnull(),header] = to_update
    return df1
def dataHandler(wEmb, cogData, feature, dim, config,
                chunk_file="/home/delatvan/Dropbox/university/ETH/4fs/projektArbeit/datasets/embeddings/glove-6B/test",
                chunk_number=4):
    """Join a cognitive dataset with word-embedding vectors read in chunks.

    The embedding table is read from `chunk_number` CSV chunk files
    (`chunk_file + str(i) + ".csv"`): the first chunk is left-joined on
    'word', and each later chunk patches the rows still missing vectors
    via update(), keeping peak memory low.

    Parameters
    ----------
    wEmb : str
        Path to the full word-embedding file (space-separated).
        NOTE(review): currently read but not used by the join below —
        the chunk files are used instead; kept to preserve behavior.
    cogData : str
        Path to the cognitive dataset (space-separated, with a 'word'
        column).
    feature : str
        Cognitive feature column kept when dim == 'single'.
    dim : str
        'single' keeps only ['word', feature]; any other value keeps all
        columns (the `config` feature list is not used in this function).
    config : list[str]
        Feature names for the multi-dimensional case (unused here).
    chunk_file : str, optional
        Path prefix of the chunked embedding CSVs. Defaults to the
        original hard-coded location, so existing callers are unaffected.
    chunk_number : int, optional
        Number of chunk files to read (default 4, as before).

    Returns
    -------
    int
        0 on success; the joined frame is printed, not returned.
    """
    # READ datasets into dataframes.
    df_cD = pd.read_csv(cogData, sep=" ")
    # QUOTE_NONE: embedding files can contain quote characters as tokens.
    df_wE = pd.read_csv(wEmb, sep=" ",
                        encoding="utf-8", quoting=csv.QUOTE_NONE)
    print(df_cD.shape)
    if dim == "single":
        # Explicit copy: dropna(inplace=True) on a column-slice view would
        # trigger SettingWithCopyWarning and may silently not drop rows.
        df_cD = df_cD[['word', feature]].copy()
    df_cD.dropna(inplace=True)
    print(df_cD.shape)

    # Memory-safe join from chunked files: merge chunk 0, then fill the
    # still-NaN rows from each subsequent chunk.
    df_join = df_cD
    for i in range(0, chunk_number):
        df = pd.read_csv(chunk_file + str(i) + ".csv", sep=" ",
                         encoding="utf-8", quoting=csv.QUOTE_NONE)
        print(df.shape)
        # First column of each chunk file is a leftover index column.
        df.drop(df.columns[0], axis=1, inplace=True)
        if i == 0:
            df_join = pd.merge(df_join, df, how='left', on=['word'])
        else:
            # columns_to_omit=2: skip 'word' + the single feature column.
            update(df_join, df, on_column=['word'], columns_to_omit=2)

    # Drop words for which no chunk supplied an embedding vector.
    df_join.dropna(inplace=True)
    print(df_join)
    print(df_join.shape)
    print('SUCCESS')
    return 0
def main():
    """Run the chunked embedding join on the GloVe 6B 50d vectors.

    Earlier experiments swapped in other embedding files here
    (fasttext crawl/wiki-news, word2vec, wordnet2vec); GloVe is the
    current choice.
    """
    embedding_path = ("/home/delatvan/Dropbox/university/ETH/4fs/"
                      "projektArbeit/datasets/embeddings/glove-6B/"
                      "glove.6B.50d.txt")
    cognitive_path = ("/home/delatvan/Dropbox/university/ETH/4fs/"
                      "projektArbeit/datasets/cognitive-data/gaze/all/"
                      "all_scaled.txt")
    gaze_features = ["ffd", "fpd", "tfd", "nfix", "mfd", "gpt"]
    dataHandler(embedding_path, cognitive_path, "ffd",
                dim='single', config=gaze_features)


if __name__ == "__main__":
    main()