# api_matching.py
# forked from TMcMac/MentorMatchingApp
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from string import punctuation
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
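# NOTE: word_tokenize and stopwords.words() rely on the NLTK 'punkt' and 'stopwords'
# data packages; download them once with nltk.download('punkt') and nltk.download('stopwords').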
# import or make api call for mentee and mentor list
# mentees = api_call() <------ fill with api call function for mentees
# mentors = api_call() <------ fill with api call function for mentors
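# Illustrative sketch only (kept commented out): the endpoint URLs below are assumptions,
# not the project's real API; the api_call() placeholders above are what should be filled in.
# import requests
# mentees = requests.get("https://example.org/api/mentees").json()
# mentors = requests.get("https://example.org/api/mentors").json()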
# Convert JSON to pd.DataFrame
mentee_raw = pd.DataFrame(mentees)
mentor_raw = pd.DataFrame(mentors)
# extract only the necessary columns
col_list = ["fullName", "countryOrigin", "languages", "mentorshipArea", "prefferedOutcomes", "skillMentorship", "researchAreas", "careerAdvice", "conferencePreference"]
clean_mentee = mentee_raw[col_list].copy()
clean_mentor = mentor_raw[col_list].copy()
# drop all duplicates
clean_mentee.drop_duplicates(subset=['fullName'], keep='first', inplace=True)
clean_mentor.drop_duplicates(subset=['fullName'], keep='first', inplace=True)
# fill missing values with empty strings first, so NaN does not survive as the string 'nan'
clean_mentee.fillna('', inplace=True)
clean_mentor.fillna('', inplace=True)
# convert all column dtypes to string
for col in col_list:
    clean_mentee[col] = clean_mentee[col].astype(str)
    clean_mentor[col] = clean_mentor[col].astype(str)
# create new text column for creating tokens
mentee_df = clean_mentee.copy()
mentee_df["text"] = mentee_df[col_list[1:]].apply(lambda x: " | ".join(x), axis=1)
mentor_df = clean_mentor.copy()
mentor_df["text"] = mentor_df[col_list[1:]].apply(lambda x: " | ".join(x), axis=1)
# remove single quotes
mentee_df = mentee_df.applymap(lambda x: x.replace("'", ""))
mentor_df = mentor_df.applymap(lambda x: x.replace("'", ""))
def clean_text(dataframe):
    """Pre-process text and generate tokens"""
    sw = stopwords.words('english')
    # tokenize, drop punctuation-only tokens, lowercase, remove stopwords, strip whitespace
    dataframe['tokens'] = dataframe['text'].apply(word_tokenize)
    dataframe['tokens'] = dataframe['tokens'].apply(lambda x: [i for i in x if i not in punctuation])
    dataframe['tokens'] = dataframe['tokens'].apply(lambda x: [i.lower() for i in x])
    dataframe['tokens'] = dataframe['tokens'].apply(lambda x: [i.strip() for i in x if i not in sw])
clean_text(mentee_df)
clean_text(mentor_df)
# remove empty tokens
mentee_tt = mentee_df.loc[mentee_df.tokens.map(lambda x: len(x) > 0), ["fullName", "text", "tokens"]]
mentor_tt = mentor_df.loc[mentor_df.tokens.map(lambda x: len(x) > 0), ["fullName", "text", "tokens"]]
# concat both dataframes to train word2vec model on vocab
both_tt = pd.concat([mentee_tt, mentor_tt])
docs = both_tt["text"].values
tokenized_docs = both_tt["tokens"].values
names = both_tt["fullName"].values
model = Word2Vec(sentences=tokenized_docs, vector_size=80, min_count=2, workers=4, seed=42)
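# Optional sanity check (a sketch, not part of the original pipeline): look at the nearest
# neighbours of a token likely to be in the vocabulary; "research" is only an assumed example.
if "research" in model.wv:
    print(model.wv.most_similar("research", topn=5))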
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Args:
        list_of_docs: List of tokenized documents
        model: Gensim's Word Embedding

    Returns:
        List of document vectors
    """
    features = []
    for tokens in list_of_docs:
        zero_vector = np.zeros(model.vector_size)
        vectors = []
        for token in tokens:
            # skip tokens dropped by the min_count threshold during training
            if token in model.wv:
                vectors.append(model.wv[token])
        if vectors:
            # average the word vectors into a single document vector of length model.vector_size
            vectors = np.asarray(vectors)
            avg_vec = vectors.mean(axis=0)
            features.append(avg_vec)
        else:
            # no in-vocabulary tokens: fall back to a zero vector
            features.append(zero_vector)
    return features
vectorized_docs = vectorize(tokenized_docs, model=model)
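# Quick shape check (a sketch, not part of the original script): every document vector
# should have length model.vector_size (80 here).
assert all(len(vec) == model.vector_size for vec in vectorized_docs)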
def kmeans_clusters(X, k):
    """Generate clusters.

    Args:
        X: Matrix of features.
        k: Number of clusters.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = KMeans(n_clusters=k, n_init=10, max_iter=300, init='k-means++').fit(X)
    return km, km.labels_
clustering, cluster_labels = kmeans_clusters(X=vectorized_docs, k=9)
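# Optional diagnostic (a sketch, not part of the original pipeline): k=9 is hard-coded above;
# the silhouette score gives a rough check of how well that choice separates the documents.
from sklearn.metrics import silhouette_score
print("silhouette score:", silhouette_score(vectorized_docs, cluster_labels))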
# Create new dataframe with cluster labels
df_clusters = pd.DataFrame({
    "Name": names,
    "cluster": cluster_labels
})
# Separate mentees from mentors
cluster_mentee = df_clusters.loc[df_clusters["Name"].isin(mentee_df["fullName"]), ["Name", "cluster"]].copy()
cluster_mentor = df_clusters.loc[df_clusters["Name"].isin(mentor_df["fullName"]), ["Name", "cluster"]].copy()
final_cluster_df = pd.merge(cluster_mentee, cluster_mentor, on="cluster", how="inner", suffixes=("_mentee", "_mentor"))
print(final_cluster_df.head())
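# A possible next step (sketch, assuming a CSV hand-off is acceptable): persist the
# within-cluster mentee/mentor pairings so the matching app can consume them.
# final_cluster_df.to_csv("mentor_mentee_matches.csv", index=False)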