-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclassifiers.py
More file actions
96 lines (77 loc) · 2.51 KB
/
classifiers.py
File metadata and controls
96 lines (77 loc) · 2.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from documents import StringDocumentFrequencies
from extractors import NGramExtractor
class Classifier:
    """Base class for classifiers that score a document against corpus categories.

    Subclasses must implement ``scoreDocument``; the remaining methods are
    built on top of it.
    """

    def __init__(self, corpus):
        """Store *corpus* and cache its category names and extractor.

        *corpus* must expose ``documents`` (a mapping keyed by category) and
        ``extractor``.
        """
        self.corpus = corpus
        # list() keeps this a plain list on Python 3 too, where keys() is a view.
        self.categories = list(corpus.documents.keys())
        self.extractor = corpus.extractor

    def scoreString(self, string, category):
        """Score raw text *string* against *category*.

        The text is wrapped in a ``StringDocumentFrequencies`` built with this
        classifier's extractor and passed to ``scoreDocument``.
        """
        return self.scoreDocument(
            StringDocumentFrequencies(string, self.extractor), category
        )

    def scoreDocument(self, doc, category):
        """Return the score of *doc* for *category*; subclasses override this.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def getStringScores(self, string):
        """Return ``getDocumentScores`` for raw text *string*."""
        return self.getDocumentScores(
            StringDocumentFrequencies(string, self.extractor)
        )

    def getDocumentScores(self, document):
        """Return ``(category, score)`` pairs for every category, best first.

        Pairs with equal scores keep the order of ``self.categories``.
        """
        scores = [
            (category, self.scoreDocument(document, category))
            for category in self.categories
        ]
        # key/reverse replaces the cmp-function sort, which Python 3 removed.
        # The leading boolean makes None scores sort last without comparing
        # None to a number (Python 2 ordered None below every number, so the
        # resulting order is unchanged there).
        scores.sort(key=lambda pair: (pair[1] is not None, pair[1]), reverse=True)
        return scores
def bm25formula(corpus, doc, word, b=0.75, k1=1.5):
    """Return the BM25 weight of *word* in *doc*, clipped below at zero.

    Args:
        corpus: object exposing ``idfs`` (indexable by word) and
            ``averageDocumentLength``.
        doc: object indexable by word to get that word's frequency;
            ``len(doc)`` is used as the document length.
        word: the term to weight.
        b: length-normalisation strength.
        k1: term-frequency saturation parameter.

    Returns:
        ``idf * freq * (k1 + 1) / (freq + k1 * (1 + b * (len/avg - 1)))``,
        or 0 when that value is negative.
    """
    freq = doc[word]
    # float() forces true division: on Python 2 an integer average length
    # would otherwise truncate the ratio to a whole number.
    relative_length = float(len(doc)) / corpus.averageDocumentLength
    length_norm = k1 * (1 + b * (relative_length - 1))
    return max(0, corpus.idfs[word] * freq * (k1 + 1) / (freq + length_norm))
class BM25Classifier(Classifier):
    """Classifier that scores a query against a category using ``bm25formula``."""

    def __init__(self, corpus, b=0.75, k1=1.5):
        """Initialise the base classifier and keep the ``b`` and ``k1`` settings."""
        Classifier.__init__(self, corpus)
        self.k1 = k1
        self.b = b

    def scoreDocument(self, query, category):
        """Return the frequency-weighted sum of BM25 weights for *query*.

        Each ``(term, freq)`` pair from ``query.iteritems()`` contributes
        ``freq`` times the weight of ``term`` in the corpus document stored
        under *category*.
        """
        target = self.corpus.documents[category]
        total = 0
        for term, count in query.iteritems():
            total += count * bm25formula(self.corpus, target, term, self.b, self.k1)
        return total
class GramMatchClassifier(Classifier):
    """Classifier that scores by how many query entries also occur in a category document."""

    def __init__(self, corpus):
        """Initialise the base classifier with *corpus*."""
        Classifier.__init__(self, corpus)

    def scoreDocument(self, query, category):
        """Return the harmonic mean of the two overlap ratios, or None.

        The overlap is the number of entries of *query* that are also in the
        corpus document for *category*. It is divided once by ``len(doc)`` and
        once by ``len(query)``, and the two ratios are combined as
        ``2 * p * q / (p + q)``. None is returned when any of these divisions
        raises ZeroDivisionError.
        """
        target = self.corpus.documents[category]
        shared = len([term for term in query if term in target])
        try:
            doc_share = shared / float(len(target))
            query_share = shared / float(len(query))
            harmonic = 2 * (doc_share * query_share) / (doc_share + query_share)
        except ZeroDivisionError:
            return None
        return harmonic
"""
class TermMatchingClassifier(Classifier):
def __init__(self,corpus,n):
Classifier.__init__(self,corpus,n)
def scoreDocument(self,query,document):
idfs = self.idfs
def scoreWords(queryTerm,passageTerm):
if queryTerm is passageTerm:
return idfs[queryTerm]
elif queryTerm is None:
return -idfs[passageTerm]
elif passageTerm is None:
return -idfs[queryTerm]
else:
return -idfs[passageTerm]
def nonePadder(gen):
gram = next(gen)
for i in xrange(len(gram)-1,0,-1):
yield i*(None,)+gram[i:]
yield gram
try:
while True:
gram = next(gen)
yield gram
except StopIteration:
for i in xrange(`
scores = [[0 for i in xrange(len(query))] for j in xrange(len(document))]
ex = NGramExtractor(len(query))
query = [next(ex.termsFromString(query.string))]
for passage in ex.termsFromFile(document.filename):
"""