From 2430cca7f2e55fb8921099b0ae627e5f19373b36 Mon Sep 17 00:00:00 2001
From: Joseph Albert
Date: Thu, 1 Sep 2016 05:12:58 -0400
Subject: [PATCH 1/4] Attempting to fix slow NaiveBayes

Three changes:
1) basic_extractor can accept a list of strings as well as a list of
   ('word','label') tuples.
2) BaseClassifier now has an instance variable _word_set which is a set
   of tokens seen by the classifier.
1+2) BaseClassifier.extract_features passes _word_set to the extractor
   rather than the training set.
3) NLTKClassifier.update adds new words to _word_set.
---
 textblob/classifiers.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/textblob/classifiers.py b/textblob/classifiers.py
index 782bbebc..c3b81ce1 100644
--- a/textblob/classifiers.py
+++ b/textblob/classifiers.py
@@ -76,9 +76,15 @@ def basic_extractor(document, train_set):
 
     :param document: The text to extract features from. Can be a string or an iterable.
     :param list train_set: Training data set, a list of tuples of the form
-        ``(words, label)``.
+        ``(words, label)`` OR an iterable of strings.
     """
-    word_features = _get_words_from_dataset(train_set)
+    el_zero = iter(train_set).next() #Infer input from first element.
+    if isinstance(el_zero, tuple):
+        word_features = _get_words_from_dataset(train_set)
+    elif isinstance(el_zero, str):
+        word_features = train_set
+    else:
+        raise ValueError('train_set is probably malformed.')
     tokens = _get_document_tokens(document)
     features = dict(((u'contains({0})'.format(word), (word in tokens))
                      for word in word_features))
@@ -123,6 +129,7 @@ def __init__(self, train_set, feature_extractor=basic_extractor, format=None, **
             self.train_set = self._read_data(train_set, format)
         else: # train_set is a list of tuples
             self.train_set = train_set
+        self._word_set = _get_words_from_dataset(train_set) #Keep a hidden set of unique words.
         self.train_features = None
 
     def _read_data(self, dataset, format=None):
@@ -166,7 +173,7 @@ def extract_features(self, text):
         '''
         # Feature extractor may take one or two arguments
         try:
-            return self.feature_extractor(text, self.train_set)
+            return self.feature_extractor(text, self._word_set)
         except (TypeError, AttributeError):
             return self.feature_extractor(text)
 
@@ -260,6 +267,7 @@ def update(self, new_data, *args, **kwargs):
             ``(text, label)``.
         """
         self.train_set += new_data
+        self._word_set.update(_get_words_from_dataset(new_data))
         self.train_features = [(self.extract_features(d), c)
                                for d, c in self.train_set]
         try:

From 7505da49800d907ac211f08e4477e35284a2332c Mon Sep 17 00:00:00 2001
From: Joseph Albert
Date: Thu, 1 Sep 2016 06:36:22 -0400
Subject: [PATCH 2/4] Special-cased when train_set is the null set

Now returns an empty dict if passed an empty training set.
Also, covers some bases if train_set is consumed by .next()
---
 textblob/classifiers.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/textblob/classifiers.py b/textblob/classifiers.py
index c3b81ce1..faf7c193 100644
--- a/textblob/classifiers.py
+++ b/textblob/classifiers.py
@@ -78,13 +78,20 @@ def basic_extractor(document, train_set):
     :param list train_set: Training data set, a list of tuples of the form
         ``(words, label)`` OR an iterable of strings.
     """
-    el_zero = iter(train_set).next() #Infer input from first element.
-    if isinstance(el_zero, tuple):
-        word_features = _get_words_from_dataset(train_set)
-    elif isinstance(el_zero, str):
-        word_features = train_set
+
+    try:
+        el_zero = iter(train_set).next() #Infer input from first element.
+    except StopIteration:
+        return {}
+    if isinstance(el_zero, str):
+        word_features = [w for w in chain([el_zero],train_set)]
     else:
-        raise ValueError('train_set is probably malformed.')
+        try:
+            assert(isinstance(el_zero[0], str))
+            word_features = _get_words_from_dataset(chain([el_zero],train_set))
+        except:
+            raise ValueError('train_set is probably malformed.')
+
     tokens = _get_document_tokens(document)
     features = dict(((u'contains({0})'.format(word), (word in tokens))
                      for word in word_features))

From 61c7e4768114ef05c93e0d1a69bd60fcf4256d06 Mon Sep 17 00:00:00 2001
From: Joseph Albert
Date: Sat, 6 May 2017 19:04:49 -0400
Subject: [PATCH 3/4] BaseClassifier wasn't unicode-ready.

Fixed a bug where _word_set was built from the raw train_set argument,
even if train_set is file-like rather than an in-memory list of tuples.
---
 textblob/classifiers.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/textblob/classifiers.py b/textblob/classifiers.py
index faf7c193..0f1afe18 100644
--- a/textblob/classifiers.py
+++ b/textblob/classifiers.py
@@ -83,11 +83,11 @@ def basic_extractor(document, train_set):
         el_zero = iter(train_set).next() #Infer input from first element.
     except StopIteration:
         return {}
-    if isinstance(el_zero, str):
+    if isinstance(el_zero, basestring):
         word_features = [w for w in chain([el_zero],train_set)]
     else:
         try:
-            assert(isinstance(el_zero[0], str))
+            assert(isinstance(el_zero[0], basestring))
             word_features = _get_words_from_dataset(chain([el_zero],train_set))
         except:
             raise ValueError('train_set is probably malformed.')
@@ -136,7 +136,7 @@ def __init__(self, train_set, feature_extractor=basic_extractor, format=None, **
             self.train_set = self._read_data(train_set, format)
         else: # train_set is a list of tuples
             self.train_set = train_set
-        self._word_set = _get_words_from_dataset(train_set) #Keep a hidden set of unique words.
+        self._word_set = _get_words_from_dataset(self.train_set) #Keep a hidden set of unique words.
         self.train_features = None
 
     def _read_data(self, dataset, format=None):

From 57b8969a9d71eb9ad28c652aa73e22f5be8000ca Mon Sep 17 00:00:00 2001
From: jcalbert
Date: Thu, 11 May 2017 02:28:36 -0400
Subject: [PATCH 4/4] Fixed a .next() call that broke py3 compatibility.

---
 textblob/classifiers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/textblob/classifiers.py b/textblob/classifiers.py
index 0f1afe18..742e837c 100644
--- a/textblob/classifiers.py
+++ b/textblob/classifiers.py
@@ -80,7 +80,7 @@ def basic_extractor(document, train_set):
     """
 
     try:
-        el_zero = iter(train_set).next() #Infer input from first element.
+        el_zero = next(iter(train_set)) #Infer input from first element.
     except StopIteration:
         return {}
     if isinstance(el_zero, basestring):
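
For readers skimming the series, the sketch below shows the shape of the optimization the four patches build up to: the vocabulary is derived from the training data once, cached as _word_set, updated incrementally in update(), and handed to the extractor instead of the raw training set. This is a minimal, hypothetical illustration, not textblob's actual code; the names (CachingClassifier, get_words_from_dataset) and the naive whitespace tokenizer are stand-ins.

# Minimal, self-contained sketch of the caching idea behind this series.
# Not textblob's real implementation; names and tokenization are illustrative.

def get_words_from_dataset(dataset):
    """Collect the unique tokens from an iterable of (text, label) pairs."""
    return set(word for text, _label in dataset for word in text.split())

def basic_extractor(document, word_features):
    """Mark which known words occur in ``document`` (same shape of output
    as textblob's basic_extractor: a dict of contains(...) booleans)."""
    tokens = set(document.split())
    return {u'contains({0})'.format(w): (w in tokens) for w in word_features}

class CachingClassifier(object):
    """Toy stand-in for BaseClassifier/NLTKClassifier showing the cache."""

    def __init__(self, train_set):
        self.train_set = list(train_set)
        # Built once here, rather than once per extract_features() call.
        self._word_set = get_words_from_dataset(self.train_set)

    def extract_features(self, text):
        # Pass the cached vocabulary, not the full training set.
        return basic_extractor(text, self._word_set)

    def update(self, new_data):
        self.train_set += new_data
        # Keep the cache in sync, mirroring the change to update() in patch 1.
        self._word_set.update(get_words_from_dataset(new_data))

if __name__ == '__main__':
    train = [('I love this library', 'pos'), ('this is terrible', 'neg')]
    clf = CachingClassifier(train)
    print(clf.extract_features('I love it'))
    clf.update([('what a great tool', 'pos')])
    print('great' in clf._word_set)  # True after the incremental update

With the vocabulary cached, classifying or extracting features for N documents walks the training corpus once rather than N times, which is the repeated work that patch 1's commit message is targeting.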