From 2430cca7f2e55fb8921099b0ae627e5f19373b36 Mon Sep 17 00:00:00 2001
From: Joseph Albert
Date: Thu, 1 Sep 2016 05:12:58 -0400
Subject: [PATCH 1/4] Attempting to fix slow NaiveBayes

Three changes:
1) basic_extractor can accept a list of strings as well as a list of
   ('word','label') tuples.
2) BaseClassifier now has an instance variable _word_set which is a set
   of tokens seen by the classifier.
1+2) BaseClassifier.extract_features passes _word_set to the extractor
   rather than the training set.
3) NLTKClassifier.update adds new words to _word_set.
---
 textblob/classifiers.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/textblob/classifiers.py b/textblob/classifiers.py
index 782bbebc..c3b81ce1 100644
--- a/textblob/classifiers.py
+++ b/textblob/classifiers.py
@@ -76,9 +76,15 @@ def basic_extractor(document, train_set):
 
     :param document: The text to extract features from. Can be a string or an iterable.
     :param list train_set: Training data set, a list of tuples of the form
-        ``(words, label)``.
+        ``(words, label)`` OR an iterable of strings.
     """
-    word_features = _get_words_from_dataset(train_set)
+    el_zero = iter(train_set).next() #Infer input from first element.
+    if isinstance(el_zero, tuple):
+        word_features = _get_words_from_dataset(train_set)
+    elif isinstance(el_zero, str):
+        word_features = train_set
+    else:
+        raise ValueError('train_set is probably malformed.')
     tokens = _get_document_tokens(document)
     features = dict(((u'contains({0})'.format(word), (word in tokens))
                      for word in word_features))
@@ -123,6 +129,7 @@ def __init__(self, train_set, feature_extractor=basic_extractor, format=None, **
             self.train_set = self._read_data(train_set, format)
         else: # train_set is a list of tuples
             self.train_set = train_set
+        self._word_set = _get_words_from_dataset(train_set) #Keep a hidden set of unique words.
         self.train_features = None
 
     def _read_data(self, dataset, format=None):
@@ -166,7 +173,7 @@ def extract_features(self, text):
         '''
         # Feature extractor may take one or two arguments
         try:
-            return self.feature_extractor(text, self.train_set)
+            return self.feature_extractor(text, self._word_set)
         except (TypeError, AttributeError):
             return self.feature_extractor(text)
 
@@ -260,6 +267,7 @@ def update(self, new_data, *args, **kwargs):
             ``(text, label)``.
         """
         self.train_set += new_data
+        self._word_set.update(_get_words_from_dataset(new_data))
         self.train_features = [(self.extract_features(d), c)
                                for d, c in self.train_set]
         try:

From 7505da49800d907ac211f08e4477e35284a2332c Mon Sep 17 00:00:00 2001
From: Joseph Albert
Date: Thu, 1 Sep 2016 06:36:22 -0400
Subject: [PATCH 2/4] Special-cased when train_set is the null set

Now returns an empty dict if passed an empty training set.
Also, covers some bases if train_set is consumed by .next()
---
 textblob/classifiers.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/textblob/classifiers.py b/textblob/classifiers.py
index c3b81ce1..faf7c193 100644
--- a/textblob/classifiers.py
+++ b/textblob/classifiers.py
@@ -78,13 +78,20 @@ def basic_extractor(document, train_set):
     :param list train_set: Training data set, a list of tuples of the form
         ``(words, label)`` OR an iterable of strings.
     """
-    el_zero = iter(train_set).next() #Infer input from first element.
-    if isinstance(el_zero, tuple):
-        word_features = _get_words_from_dataset(train_set)
-    elif isinstance(el_zero, str):
-        word_features = train_set
+
+    try:
+        el_zero = iter(train_set).next() #Infer input from first element.
+    except StopIteration:
+        return {}
+    if isinstance(el_zero, str):
+        word_features = [w for w in chain([el_zero],train_set)]
     else:
-        raise ValueError('train_set is probably malformed.')
+        try:
+            assert(isinstance(el_zero[0], str))
+            word_features = _get_words_from_dataset(chain([el_zero],train_set))
+        except:
+            raise ValueError('train_set is probably malformed.')
+
     tokens = _get_document_tokens(document)
     features = dict(((u'contains({0})'.format(word), (word in tokens))
                      for word in word_features))

From 61c7e4768114ef05c93e0d1a69bd60fcf4256d06 Mon Sep 17 00:00:00 2001
From: Joseph Albert
Date: Sat, 6 May 2017 19:04:49 -0400
Subject: [PATCH 3/4] BaseClassifier wasn't unicode-ready.

Fixed a bug where _word_set was built from the raw train_set argument,
even if train_set is file-like rather than an in-memory list of tuples.
---
 textblob/classifiers.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/textblob/classifiers.py b/textblob/classifiers.py
index faf7c193..0f1afe18 100644
--- a/textblob/classifiers.py
+++ b/textblob/classifiers.py
@@ -83,11 +83,11 @@ def basic_extractor(document, train_set):
         el_zero = iter(train_set).next() #Infer input from first element.
     except StopIteration:
         return {}
-    if isinstance(el_zero, str):
+    if isinstance(el_zero, basestring):
         word_features = [w for w in chain([el_zero],train_set)]
     else:
         try:
-            assert(isinstance(el_zero[0], str))
+            assert(isinstance(el_zero[0], basestring))
             word_features = _get_words_from_dataset(chain([el_zero],train_set))
         except:
             raise ValueError('train_set is probably malformed.')
@@ -136,7 +136,7 @@ def __init__(self, train_set, feature_extractor=basic_extractor, format=None, **
             self.train_set = self._read_data(train_set, format)
         else: # train_set is a list of tuples
             self.train_set = train_set
-        self._word_set = _get_words_from_dataset(train_set) #Keep a hidden set of unique words.
+        self._word_set = _get_words_from_dataset(self.train_set) #Keep a hidden set of unique words.
         self.train_features = None
 
     def _read_data(self, dataset, format=None):

From 57b8969a9d71eb9ad28c652aa73e22f5be8000ca Mon Sep 17 00:00:00 2001
From: jcalbert
Date: Thu, 11 May 2017 02:28:36 -0400
Subject: [PATCH 4/4] Fixed a .next() call that broke py3 compatibility.

---
 textblob/classifiers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/textblob/classifiers.py b/textblob/classifiers.py
index 0f1afe18..742e837c 100644
--- a/textblob/classifiers.py
+++ b/textblob/classifiers.py
@@ -80,7 +80,7 @@ def basic_extractor(document, train_set):
     """
 
     try:
-        el_zero = iter(train_set).next() #Infer input from first element.
+        el_zero = next(iter(train_set)) #Infer input from first element.
     except StopIteration:
         return {}
     if isinstance(el_zero, basestring):
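
For readers skimming the series, the sketch below shows the shape of the optimization the four patches build up to: the vocabulary is derived from the training data once, cached as _word_set, updated incrementally in update(), and handed to the extractor instead of the raw training set. This is a minimal, hypothetical illustration, not textblob's actual code; the names (CachingClassifier, get_words_from_dataset) and the naive whitespace tokenizer are stand-ins.

# Minimal, self-contained sketch of the caching idea behind this series.
# Not textblob's real implementation; names and tokenization are illustrative.

def get_words_from_dataset(dataset):
    """Collect the unique tokens from an iterable of (text, label) pairs."""
    return set(word for text, _label in dataset for word in text.split())

def basic_extractor(document, word_features):
    """Mark which known words occur in ``document`` (same shape of output
    as textblob's basic_extractor: a dict of contains(...) booleans)."""
    tokens = set(document.split())
    return {u'contains({0})'.format(w): (w in tokens) for w in word_features}

class CachingClassifier(object):
    """Toy stand-in for BaseClassifier/NLTKClassifier showing the cache."""

    def __init__(self, train_set):
        self.train_set = list(train_set)
        # Built once here, rather than once per extract_features() call.
        self._word_set = get_words_from_dataset(self.train_set)

    def extract_features(self, text):
        # Pass the cached vocabulary, not the full training set.
        return basic_extractor(text, self._word_set)

    def update(self, new_data):
        self.train_set += new_data
        # Keep the cache in sync, mirroring the change to update() in patch 1.
        self._word_set.update(get_words_from_dataset(new_data))

if __name__ == '__main__':
    train = [('I love this library', 'pos'), ('this is terrible', 'neg')]
    clf = CachingClassifier(train)
    print(clf.extract_features('I love it'))
    clf.update([('what a great tool', 'pos')])
    print('great' in clf._word_set)  # True after the incremental update

With the vocabulary cached, classifying or extracting features for N documents walks the training corpus once rather than N times, which is the repeated work that patch 1's commit message is targeting.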