From 01dd137f5261d6cf472e0498f65d10d966832aaf Mon Sep 17 00:00:00 2001 From: Tyler Robertson Date: Tue, 4 Dec 2012 14:17:13 -0500 Subject: [PATCH 01/14] updated .gitignore to ignore temp files, began setting up new ranking algorithm --- .gitignore | 4 +++- request.py | 1 + server.py | 21 ++++++++++++++++++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 47771b9..42f4ce0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ html/ .DS_Store venv -*.pyc \ No newline at end of file +*.pyc +*~* +*#* \ No newline at end of file diff --git a/request.py b/request.py index d20308a..a3eb66d 100644 --- a/request.py +++ b/request.py @@ -45,3 +45,4 @@ def download_reflections_pages(): f.close() if __name__ == '__main__': download_reflections_pages() + diff --git a/server.py b/server.py index fb204c2..a0dd029 100644 --- a/server.py +++ b/server.py @@ -27,7 +27,7 @@ def get_icon(): #described above. def get_JSON(name): - + # debug print "got request for " + name match_data = {} @@ -36,6 +36,8 @@ def get_JSON(name): match_data['children'] = [] doc = collection.find_one({'name':name}) + + # debug print "name: " + name my_keywords = doc['keywords'].keys() @@ -46,6 +48,9 @@ def get_JSON(name): word_data['name'] = keyword word_data['children'] = [] + # how many times 'name' has used the keyword. used to determine weight ratio + my_times = doc['keywords'][keyword] + match_names = [] match_docs = collection.find({'keywords.'+keyword:{'$exists':True}}).sort('keywords.'+keyword, -1) @@ -57,6 +62,20 @@ def get_JSON(name): match_names.append(match_name) kw_matches_by_name[match_name] = kw_matches_by_name.get(match_name, 0) + 1 + for person in match_names: + # INCLUDE TO CONCEAL LAST NAMES + # space1 = person.find(" ") + # space2 = person.find(" ", space1+1) + + person_data = {} + num_kw_matches = collection.find_one({'name':person})['keywords'][keyword] + person_data['name'] = (person + # INCLUDE TO CONCEAL LAST NAMES + # [:space1] + + " (matches: " + str(num_kw_matches) + ", weight: " + str(ratio) + ")" ) + word_data['children'].append(person_data) + + for person in match_names: # INCLUDE TO CONCEAL LAST NAMES # space1 = person.find(" ") From 41ce1eff41d35639aaeccb545685430e2295036e Mon Sep 17 00:00:00 2001 From: Tyler Robertson Date: Tue, 4 Dec 2012 14:19:14 -0500 Subject: [PATCH 02/14] formatted output string --- server.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/server.py b/server.py index a0dd029..e7566f3 100644 --- a/server.py +++ b/server.py @@ -69,10 +69,14 @@ def get_JSON(name): person_data = {} num_kw_matches = collection.find_one({'name':person})['keywords'][keyword] - person_data['name'] = (person # INCLUDE TO CONCEAL LAST NAMES # [:space1] - + " (matches: " + str(num_kw_matches) + ", weight: " + str(ratio) + ")" ) + person_data['name'] = (person + + " (matches: " + + str(num_kw_matches) + + ", weight: " + + str(ratio) + + ")" ) word_data['children'].append(person_data) From 73963fda597d52ab1386a5c2614c6213e1dd40e9 Mon Sep 17 00:00:00 2001 From: Tyler Robertson Date: Tue, 4 Dec 2012 14:36:49 -0500 Subject: [PATCH 03/14] wrote code to display weights --- server.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/server.py b/server.py index e7566f3..1e43823 100644 --- a/server.py +++ b/server.py @@ -69,6 +69,18 @@ def get_JSON(name): person_data = {} num_kw_matches = collection.find_one({'name':person})['keywords'][keyword] + + # figure out the weight to assign the matches. We want the lowest ratio. + r1 = my_times / num_kw_matches + r2 = num_kw_matches / my_times + + # final ratio + ratio = 0 + if r1 > r2: + ratio = r2 + else: + ratio = r1 + # INCLUDE TO CONCEAL LAST NAMES # [:space1] person_data['name'] = (person + From 609a14207fc18e373e00d91580b38a8eebafb2c3 Mon Sep 17 00:00:00 2001 From: Tyler Robertson Date: Tue, 4 Dec 2012 15:20:12 -0500 Subject: [PATCH 04/14] more work on new ranking algorithm. and comments --- request.py | 23 +++++++++++------------ server.py | 33 ++++++++++++--------------------- 2 files changed, 23 insertions(+), 33 deletions(-) diff --git a/request.py b/request.py index a3eb66d..23813bd 100644 --- a/request.py +++ b/request.py @@ -8,30 +8,30 @@ import requests def get_session(email, password, host='https://www.hackerschool.com'): - s = requests.session() - #host = 'http://localhost:5000' - # This request is to get the CSRF token (the point of which is to make sure other websites - # can't make requests on your behalf I think, something to with cross-site scripting - # http://en.wikipedia.org/wiki/Cross-site_request_forgery - r = s.get(host+'/login', verify=False) - m = re.search(r' r2: - ratio = r2 + ratio += r2 else: - ratio = r1 + ratio += r1 + # sum of ratios, per person + weight_sum[person] += ratio + # INCLUDE TO CONCEAL LAST NAMES # [:space1] person_data['name'] = (person + - " (matches: " + - str(num_kw_matches) + - ", weight: " + - str(ratio) + + " (" + + str(ratio)[:5] + ")" ) word_data['children'].append(person_data) - - for person in match_names: - # INCLUDE TO CONCEAL LAST NAMES - # space1 = person.find(" ") - # space2 = person.find(" ", space1+1) - - person_data = {} - num_kw_matches = collection.find_one({'name':person})['keywords'][keyword] - person_data['name'] = (person - # INCLUDE TO CONCEAL LAST NAMES - # [:space1] - + " (" + str(num_kw_matches) + ")" ) - word_data['children'].append(person_data) - match_data['children'].append(word_data) match_data['top'] = max(kw_matches_by_name, key=kw_matches_by_name.get) From 099d30be0862ca8546084bd288876e00b66efe6e Mon Sep 17 00:00:00 2001 From: Tyler Robertson Date: Tue, 4 Dec 2012 15:21:27 -0500 Subject: [PATCH 05/14] tweaked main.py --- request.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/request.py b/request.py index 23813bd..2fa08c2 100644 --- a/request.py +++ b/request.py @@ -31,7 +31,7 @@ def download_reflections_pages(): host = 'https://hackerschool.com' email = os.environ.get('REFLECTION_ID') password = os.environ.get('REFLECTION_SECRET') - s = get_session('tyler.evan.robertson@gmail.com', 'robertson1') + s = get_session(email, password) if not os.path.exists ("html"): os.mkdir ("html") From 55bf63587fe1dd456a794976a4add1d54ed8cf3f Mon Sep 17 00:00:00 2001 From: Tyler Robertson Date: Tue, 4 Dec 2012 15:24:56 -0500 Subject: [PATCH 06/14] changed port --- server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server.py b/server.py index 4743809..6adb4b0 100644 --- a/server.py +++ b/server.py @@ -104,6 +104,6 @@ def get_JSON(name): if __name__ == '__main__': port = int(os.environ.get('PORT', 80)) - app.run(host='0.0.0.0', port=port) + app.run(host='127.0.0.1', port=port) From 0f01c3914fc61fa92e83ab5bae8a6f5491b98aad Mon Sep 17 00:00:00 2001 From: Tyler Robertson Date: Tue, 4 Dec 2012 15:39:20 -0500 Subject: [PATCH 07/14] cleaning up the code --- server.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/server.py b/server.py index 6adb4b0..f855169 100644 --- a/server.py +++ b/server.py @@ -47,21 +47,21 @@ def get_JSON(name): weight_sum = defaultdict(float) for keyword in my_keywords: - word_data = {} - word_data['name'] = keyword - word_data['children'] = [] + kw_data = {} + kw_data['name'] = keyword + kw_data['children'] = [] # how many times 'name' has used the keyword. used to determine weight ratio my_times = doc['keywords'][keyword] match_names = [] # all the other people who have used the keyword, sorted by how many times they've used it - match_docs = collection.find({'keywords.'+keyword:{'$exists':True}}).sort('keywords.'+keyword, -1) + matches = collection.find({'keywords.'+keyword:{'$exists':True}}).sort('keywords.'+keyword, -1) - #INCLUDE TO LIMIT MATCHES BY WORD - #for match_doc in match_docs[:10]: - for match_doc in match_docs[:10]: - match_name = match_doc['name'] + # the [:10] slice limits how many people to check against, in order to reduce + # visual clutter on the final page + for match in matches[:10]: + match_name = match['name'] if match_name != name: match_names.append(match_name) kw_matches_by_name[match_name] = kw_matches_by_name.get(match_name, 0) + 1 @@ -94,9 +94,9 @@ def get_JSON(name): " (" + str(ratio)[:5] + ")" ) - word_data['children'].append(person_data) + kw_data['children'].append(person_data) - match_data['children'].append(word_data) + match_data['children'].append(kw_data) match_data['top'] = max(kw_matches_by_name, key=kw_matches_by_name.get) From 60deda77b2c1984dddcd91bb75bcb9275947b21c Mon Sep 17 00:00:00 2001 From: Tyler Robertson Date: Tue, 4 Dec 2012 15:44:18 -0500 Subject: [PATCH 08/14] still cleaning code --- server.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/server.py b/server.py index f855169..3ad0867 100644 --- a/server.py +++ b/server.py @@ -41,22 +41,22 @@ def get_JSON(name): print "name: " + name my_keywords = doc['keywords'].keys() - kw_matches_by_name = {} + kw_matches = {} # contains each person, and their project similarity, through sum of keyword ratios - weight_sum = defaultdict(float) + weight_sums = defaultdict(float) - for keyword in my_keywords: + for kw in my_kws: kw_data = {} - kw_data['name'] = keyword + kw_data['name'] = kw kw_data['children'] = [] - # how many times 'name' has used the keyword. used to determine weight ratio - my_times = doc['keywords'][keyword] + # how many times 'name' has used the kw. used to determine weight ratio + my_times = doc['kws'][kw] match_names = [] - # all the other people who have used the keyword, sorted by how many times they've used it - matches = collection.find({'keywords.'+keyword:{'$exists':True}}).sort('keywords.'+keyword, -1) + # all the other people who have used the kw, sorted by how many times they've used it + matches = collection.find({'kws.'+kw:{'$exists':True}}).sort('kws.'+kw, -1) # the [:10] slice limits how many people to check against, in order to reduce # visual clutter on the final page @@ -64,7 +64,7 @@ def get_JSON(name): match_name = match['name'] if match_name != name: match_names.append(match_name) - kw_matches_by_name[match_name] = kw_matches_by_name.get(match_name, 0) + 1 + kw_matches[match_name] = kw_matches.get(match_name, 0) + 1 for person in match_names: # INCLUDE TO CONCEAL LAST NAMES @@ -72,7 +72,7 @@ def get_JSON(name): # space2 = person.find(" ", space1+1) person_data = {} - num_kw_matches = collection.find_one({'name':person})['keywords'][keyword] + num_kw_matches = collection.find_one({'name':person})['kws'][kw] # figure out the weight to assign the matches. We want the lowest ratio. r1 = my_times / num_kw_matches @@ -86,7 +86,7 @@ def get_JSON(name): ratio += r1 # sum of ratios, per person - weight_sum[person] += ratio + weight_sums[person] += ratio # INCLUDE TO CONCEAL LAST NAMES # [:space1] @@ -98,7 +98,7 @@ def get_JSON(name): match_data['children'].append(kw_data) - match_data['top'] = max(kw_matches_by_name, key=kw_matches_by_name.get) + match_data['top'] = max(kw_matches, key=kw_matches.get) return json.dumps(match_data) From 891143d240db1a1d6d49a58869e2b541eff732c9 Mon Sep 17 00:00:00 2001 From: Tyler Robertson Date: Tue, 4 Dec 2012 15:49:08 -0500 Subject: [PATCH 09/14] worked on comments --- server.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/server.py b/server.py index 3ad0867..a84b392 100644 --- a/server.py +++ b/server.py @@ -56,7 +56,7 @@ def get_JSON(name): match_names = [] # all the other people who have used the kw, sorted by how many times they've used it - matches = collection.find({'kws.'+kw:{'$exists':True}}).sort('kws.'+kw, -1) + matches = collection.find({'keywords.'+kw:{'$exists':True}}).sort('keywords.'+kw, -1) # the [:10] slice limits how many people to check against, in order to reduce # visual clutter on the final page @@ -66,13 +66,10 @@ def get_JSON(name): match_names.append(match_name) kw_matches[match_name] = kw_matches.get(match_name, 0) + 1 + # make a child node for each person who used the kw for person in match_names: - # INCLUDE TO CONCEAL LAST NAMES - # space1 = person.find(" ") - # space2 = person.find(" ", space1+1) - person_data = {} - num_kw_matches = collection.find_one({'name':person})['kws'][kw] + num_kw_matches = collection.find_one({'name':person})['keywords'][kw] # figure out the weight to assign the matches. We want the lowest ratio. r1 = my_times / num_kw_matches @@ -87,19 +84,22 @@ def get_JSON(name): # sum of ratios, per person weight_sums[person] += ratio - - # INCLUDE TO CONCEAL LAST NAMES - # [:space1] - person_data['name'] = (person + - " (" + + + # how the person's name will appear on the page + person_data['name'] = (person + + " (" + str(ratio)[:5] + ")" ) + # add this entry to the people per keyword kw_data['children'].append(person_data) + # add this keyword's data to the tree match_data['children'].append(kw_data) + # select the person with the highest score match_data['top'] = max(kw_matches, key=kw_matches.get) + # dump it return json.dumps(match_data) if __name__ == '__main__': From 292b375cb88e2e15bf920e2d0b70db493191675e Mon Sep 17 00:00:00 2001 From: Tyler Robertson Date: Tue, 4 Dec 2012 15:52:50 -0500 Subject: [PATCH 10/14] fixed some replace-string errors --- server.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/server.py b/server.py index a84b392..ad3222e 100644 --- a/server.py +++ b/server.py @@ -29,7 +29,8 @@ def get_icon(): def get_JSON(name): # debug print "got request for " + name - + + # the tree that will be converted to json, and passed to the frontend match_data = {} match_data['name'] = name @@ -37,9 +38,7 @@ def get_JSON(name): doc = collection.find_one({'name':name}) - # debug - print "name: " + name - my_keywords = doc['keywords'].keys() + my_kws = doc['keywords'].keys() kw_matches = {} @@ -52,7 +51,7 @@ def get_JSON(name): kw_data['children'] = [] # how many times 'name' has used the kw. used to determine weight ratio - my_times = doc['kws'][kw] + my_times = doc['keywords'][kw] match_names = [] # all the other people who have used the kw, sorted by how many times they've used it From 0d0af3572c2ddc7c24357f4419b282c53ef3d6dd Mon Sep 17 00:00:00 2001 From: Tyler Robertson Date: Tue, 4 Dec 2012 16:44:17 -0500 Subject: [PATCH 11/14] got it to work again. tweaked what it displays --- server.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/server.py b/server.py index ad3222e..5621c23 100644 --- a/server.py +++ b/server.py @@ -2,6 +2,7 @@ from pymongo import Connection import json import os +from collections import defaultdict app = Flask(__name__) connection = Connection() @@ -51,7 +52,7 @@ def get_JSON(name): kw_data['children'] = [] # how many times 'name' has used the kw. used to determine weight ratio - my_times = doc['keywords'][kw] + name_times = doc['keywords'][kw] match_names = [] # all the other people who have used the kw, sorted by how many times they've used it @@ -60,22 +61,24 @@ def get_JSON(name): # the [:10] slice limits how many people to check against, in order to reduce # visual clutter on the final page for match in matches[:10]: - match_name = match['name'] - if match_name != name: - match_names.append(match_name) - kw_matches[match_name] = kw_matches.get(match_name, 0) + 1 + mn = match['name'] + if mn != name: + match_names.append(mn) + kw_matches[mn] = kw_matches.get(mn, 0) + 1 - # make a child node for each person who used the kw + # make a child node for each person who used the keyword for person in match_names: person_data = {} num_kw_matches = collection.find_one({'name':person})['keywords'][kw] # figure out the weight to assign the matches. We want the lowest ratio. - r1 = my_times / num_kw_matches - r2 = num_kw_matches / my_times + # need a better way to make these floats + r1 = 1.0 * name_times / num_kw_matches + # how much they've been doing, in comparison to name + r2 = 1.0 * num_kw_matches / name_times - # final ratio - ratio = 0.0 + # final ratio (for weighting) + ratio = 0 if r1 > r2: ratio += r2 else: @@ -84,10 +87,12 @@ def get_JSON(name): # sum of ratios, per person weight_sums[person] += ratio - # how the person's name will appear on the page - person_data['name'] = (person + - " (" + - str(ratio)[:5] + + # how the person's name will appear on the page. we use r2 because we want + # to show how much of a topic each person has done. so where ratio is + # 0 <= ratio <= 1 , r2 is 0 <= r2 <= FLOAT_MAX + person_data['name'] = (person + + " (" + + str(r2)[:5] + ")" ) # add this entry to the people per keyword kw_data['children'].append(person_data) From da60d31ce20b8a12fb3bf08e610fde950cade5f1 Mon Sep 17 00:00:00 2001 From: Tyler Robertson Date: Tue, 4 Dec 2012 16:46:16 -0500 Subject: [PATCH 12/14] changed display back --- server.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/server.py b/server.py index 5621c23..2d5c023 100644 --- a/server.py +++ b/server.py @@ -74,7 +74,6 @@ def get_JSON(name): # figure out the weight to assign the matches. We want the lowest ratio. # need a better way to make these floats r1 = 1.0 * name_times / num_kw_matches - # how much they've been doing, in comparison to name r2 = 1.0 * num_kw_matches / name_times # final ratio (for weighting) @@ -87,12 +86,10 @@ def get_JSON(name): # sum of ratios, per person weight_sums[person] += ratio - # how the person's name will appear on the page. we use r2 because we want - # to show how much of a topic each person has done. so where ratio is - # 0 <= ratio <= 1 , r2 is 0 <= r2 <= FLOAT_MAX + # how the person's name will appear on the page person_data['name'] = (person + " (" + - str(r2)[:5] + + str(ratio)[:5] + ")" ) # add this entry to the people per keyword kw_data['children'].append(person_data) From 67de949088f058931bb8cb8c7f962bab55361f44 Mon Sep 17 00:00:00 2001 From: Tyler Robertson Date: Tue, 4 Dec 2012 16:50:39 -0500 Subject: [PATCH 13/14] switched to the new ranking algo --- server.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/server.py b/server.py index 2d5c023..9cc1951 100644 --- a/server.py +++ b/server.py @@ -45,7 +45,8 @@ def get_JSON(name): # contains each person, and their project similarity, through sum of keyword ratios weight_sums = defaultdict(float) - + + # for each keyword 'name' used for kw in my_kws: kw_data = {} kw_data['name'] = kw @@ -98,7 +99,7 @@ def get_JSON(name): match_data['children'].append(kw_data) # select the person with the highest score - match_data['top'] = max(kw_matches, key=kw_matches.get) + match_data['top'] = max(weight_sums, key=weight_sums.get) # dump it return json.dumps(match_data) From 93badacf6f8157495bc0b541346a456725fd377c Mon Sep 17 00:00:00 2001 From: Tyler Robertson Date: Tue, 4 Dec 2012 16:55:37 -0500 Subject: [PATCH 14/14] removed old algo --- server.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/server.py b/server.py index 9cc1951..a34b81c 100644 --- a/server.py +++ b/server.py @@ -41,8 +41,6 @@ def get_JSON(name): my_kws = doc['keywords'].keys() - kw_matches = {} - # contains each person, and their project similarity, through sum of keyword ratios weight_sums = defaultdict(float) @@ -65,7 +63,6 @@ def get_JSON(name): mn = match['name'] if mn != name: match_names.append(mn) - kw_matches[mn] = kw_matches.get(mn, 0) + 1 # make a child node for each person who used the keyword for person in match_names: