From 30e96225cc2f6ef4dbb5a186ee21bd3afc31fc34 Mon Sep 17 00:00:00 2001
From: JoyceYuen <yuanyunying@yeah.net>
Date: Sat, 3 Jan 2015 21:07:17 +0800
Subject: [PATCH] Update classes.py in knn

Hi,
I like your code. It's concise and efficient.
But when I read the recommenders part, specifically "class UserBasedRecommender(UserRecommender)", I found that the code in the method named estimate_preference cannot guarantee that each neighbor's preference is multiplied by that neighbor's own similarity rather than by another neighbor's.

This is the previous code:
        prefs = prefs[~np.isnan(prefs)]
        similarities = similarities[~np.isnan(prefs)]

        prefs_sim = np.sum(prefs[~np.isnan(similarities)] *
                             similarities[~np.isnan(similarities)])
        total_similarity = np.sum(similarities)

Here is a simple example:
>>> import numpy as np
>>> p = np.array([np.nan, 3,4,5,np.nan,5,6,np.nan,9,10])
>>> p
array([ nan,   3.,   4.,   5.,  nan,   5.,   6.,  nan,   9.,  10.])
>>> s = np.array([1,np.nan,4,6,np.nan,6,7,8,9,10])
>>> s
array([  1.,  nan,   4.,   6.,  nan,   6.,   7.,   8.,   9.,  10.])
>>> p = p[~np.isnan(p)]
>>> p
array([  3.,   4.,   5.,   5.,   6.,   9.,  10.])
>>> s = s[~np.isnan(p)]
>>> s
array([  1.,  nan,   4.,   6.,  nan,   6.,   7.])
>>> p[~np.isnan(s)]
array([  3.,   5.,   5.,   9.,  10.])
>>> s[~np.isnan(s)]
array([ 1.,  4.,  6.,  6.,  7.])
>>> p[~np.isnan(s)]*s[~np.isnan(s)]
array([  3.,  20.,  30.,  54.,  70.])

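This follows the same steps as the code. As you can see, it gives a wrong result: once the NaN entries have been removed from p, the mask ~np.isnan(p) is computed from the shortened array, so it no longer marks the original NaN positions and the remaining preferences are paired with the wrong similarities (for example 5 * 4 instead of 4 * 4).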

My code is like this:
        temp_prefs = [~np.isnan(prefs)]
        temp_similarities = [~np.isnan(similarities)]
        noNaN_indices = np.logical_and(temp_prefs, temp_similarities)

        prefs_sim = np.sum(prefs[noNaN_indices[0] == True] *
                             similarities[noNaN_indices[0] == True])

        similarities = similarities[~np.isnan(similarities)]
        total_similarity = np.sum(similarities)

With the same example:
>>> pp = np.array([np.nan,3,4,5,np.nan,5,6,np.nan,9,10])
>>> pp
array([ nan,   3.,   4.,   5.,  nan,   5.,   6.,  nan,   9.,  10.])
>>> ss = np.array([1,np.nan,4,6,np.nan,6,7,8,9,10])
>>> ss
array([  1.,  nan,   4.,   6.,  nan,   6.,   7.,   8.,   9.,  10.])
>>> tss = [~np.isnan(ss)]
>>> tss
[array([ True, False,  True,  True, False,  True,  True,  True,  True,  True], dtype=bool)]
>>> tpp = [~np.isnan(pp)]
>>> tpp
[array([False,  True,  True,  True, False,  True,  True, False,  True,  True], dtype=bool)]
>>> nonNaN = np.logical_and(tss,tpp)
>>> nonNaN
array([[False, False,  True,  True, False,  True,  True, False,  True,
         True]], dtype=bool)
>>> ss[nonNaN[0] == True] * pp[nonNaN[0] == True]
array([  16.,   30.,   30.,   42.,   81.,  100.])

As you can see, it gives the right answer: each preference is multiplied by the similarity at the same position.

If I have misunderstood, please let me know. Thank you in advance.

Best Wishes
---
 scikits/crab/recommenders/knn/classes.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/scikits/crab/recommenders/knn/classes.py b/scikits/crab/recommenders/knn/classes.py
index bf96af3..7e4e86d 100644
--- a/scikits/crab/recommenders/knn/classes.py
+++ b/scikits/crab/recommenders/knn/classes.py
@@ -532,12 +532,15 @@ def estimate_preference(self, user_id, item_id, **params):
 
         prefs = np.array([self.model.preference_value(to_user_id, item_id)
                  for to_user_id in nearest_neighbors])
-
-        prefs = prefs[~np.isnan(prefs)]
-        similarities = similarities[~np.isnan(prefs)]
-
-        prefs_sim = np.sum(prefs[~np.isnan(similarities)] *
-                             similarities[~np.isnan(similarities)])
+        
+        temp_prefs = [~np.isnan(prefs)]
+        temp_similarities = [~np.isnan(similarities)]
+        noNaN_indices = np.logical_and(temp_prefs, temp_similarities)
+        
+        prefs_sim = np.sum(prefs[noNaN_indices[0] == True] *
+                             similarities[noNaN_indices[0] == True])
+                             
+        similarities = similarities[~np.isnan(similarities)]
         total_similarity = np.sum(similarities)
 
         #Throw out the estimate if it was based on no data points,