1- import itertools , collections , logging
1+ import itertools
2+ import collections
3+ import logging
24
35import nltk
46import nltk .metrics
911
1012logger = logging .getLogger (__name__ )
1113
14+
def get_features(paragraph):
    """Build the feature dictionary used to classify a single paragraph.

    The paragraph's cleaned text is passed to the extractors in
    ``algo.features``; ``is_good_section`` is the only extractor that is
    given the paragraph object itself rather than its text.

    Args:
        paragraph: object exposing ``clean_text()``.

    Returns:
        dict mapping each feature name to the value its extractor returned.

    Raises:
        AssertionError: if the cleaned text is empty.
    """
    p_text = paragraph.clean_text()
    # Validate before the text is used anywhere (including the debug log),
    # and raise explicitly so the check is not stripped under ``python -O``.
    # AssertionError is kept so existing callers see the same exception type.
    if not p_text:
        raise AssertionError("paragraph has no text after cleaning")
    logger.debug("length of p_text: %d", len(p_text))

    extract = algo.features
    features = {
        "starts_with_hyphen": extract.starts_with_hyphen(p_text),
        "is_indented": extract.is_indented(p_text),
        "par_length": extract.par_length(p_text),
    }
    # One boolean-style feature per marker looked for on the first line.
    for marker in ("=", "--", "[", "|", ","):
        features[f"first_line_contains_{marker}"] = extract.first_line_contains(
            p_text, marker
        )
    features["first_line_length"] = extract.first_line_length(p_text)
    features["first_line_word_count"] = extract.first_line_word_count(p_text)
    features["is_good_section"] = extract.is_good_section(paragraph)
    features["word_count"] = extract.word_count(p_text)

    return features
2834
29- class classifier (object ):
30- '''classify the paragraphs of a man page as having command line options
31- or not'''
35+
36+ class Classifier :
37+ """classify the paragraphs of a man page as having command line options
38+ or not"""
39+
3240 def __init__ (self , store , algo , ** classifier_args ):
3341 self .store = store
3442 self .algo = algo
@@ -39,59 +47,60 @@ def train(self):
3947 if self .classifier :
4048 return
4149
42- manpages = self .store .trainingset ()
50+ man_pages = self .store .training_set ()
4351
# collect the paragraphs of one man page into a fresh list
def flatten_manpages(manpage):
    """Return a new list holding the paragraphs of ``manpage``, in order."""
    return list(manpage.paragraphs)
58+
59+ paragraphs = itertools .chain (* [flatten_manpages (m ) for m in man_pages ])
5160 training = list (paragraphs )
5261
53- negids = [p for p in training if not p .is_option ]
54- posids = [p for p in training if p .is_option ]
62+ neg_ids = [p for p in training if not p .is_option ]
63+ pos_ids = [p for p in training if p .is_option ]
5564
56- negfeats = [(get_features (p ), False ) for p in negids ]
57- posfeats = [(get_features (p ), True ) for p in posids ]
65+ neg_feats = [(get_features (p ), False ) for p in neg_ids ]
66+ pos_feats = [(get_features (p ), True ) for p in pos_ids ]
5867
59- negcutoff = len (negfeats ) * 3 / 4
60- poscutoff = len (posfeats ) * 3 / 4
68+ neg_cutoff = int ( len (neg_feats ) * 3 / 4 )
69+ pos_cutoff = int ( len (pos_feats ) * 3 / 4 )
6170
62- trainfeats = negfeats [: negcutoff ] + posfeats [: poscutoff ]
63- self .testfeats = negfeats [ negcutoff :] + posfeats [ poscutoff :]
71+ train_feats = neg_feats [: neg_cutoff ] + pos_feats [: pos_cutoff ]
72+ self .test_feats = neg_feats [ neg_cutoff :] + pos_feats [ pos_cutoff :]
6473
65- logger .info (' train on %d instances' , len (trainfeats ))
74+ logger .info (" train on %d instances" , len (train_feats ))
6675
67- if self .algo == ' maxent' :
76+ if self .algo == " maxent" :
6877 c = nltk .classify .maxent .MaxentClassifier
69- elif self .algo == ' bayes' :
78+ elif self .algo == " bayes" :
7079 c = nltk .classify .NaiveBayesClassifier
7180 else :
72- raise ValueError (' unknown classifier' )
81+ raise ValueError (" unknown classifier" )
7382
74- self .classifier = c .train (trainfeats , ** self .classifier_args )
83+ self .classifier = c .train (train_feats , ** self .classifier_args )
7584
def evaluate(self):
    """Train if necessary, then print precision/recall on the held-out set.

    Every entry of ``self.test_feats`` is classified; the indices of the
    entries are grouped once by their reference label and once by the
    label the classifier picked, and those index sets are handed to
    ``nltk.metrics.precision`` / ``nltk.metrics.recall`` for both the
    ``True`` and the ``False`` class. The results are written to stdout,
    followed by the classifier's ten most informative features.
    """
    self.train()
    ref_sets = collections.defaultdict(set)
    test_sets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(self.test_feats):
        ref_sets[label].add(i)
        observed = self.classifier.prob_classify(feats).max()
        test_sets[observed].add(i)

    print("pos precision:", nltk.metrics.precision(ref_sets[True], test_sets[True]))
    print("pos recall:", nltk.metrics.recall(ref_sets[True], test_sets[True]))
    print("neg precision:", nltk.metrics.precision(ref_sets[False], test_sets[False]))
    print("neg recall:", nltk.metrics.recall(ref_sets[False], test_sets[False]))

    # show_most_informative_features() prints its own table and returns
    # None, so wrapping it in print() only appended a stray "None" line.
    self.classifier.show_most_informative_features(10)
95104
96105 def classify (self , manpage ):
97106 self .train ()
@@ -102,10 +111,9 @@ def classify(self, manpage):
102111 option = guess .max ()
103112 certainty = guess .prob (option )
104113
105- if option :
106- if certainty < config .CLASSIFIER_CUTOFF :
107- pass
108- else :
109- logger .info ('classified %s (%f) as an option paragraph' , item , certainty )
110- item .is_option = True
111- yield certainty , item
114+ if option and certainty >= config .CLASSIFIER_CUTOFF :
115+ logger .info (
116+ "classified %s (%f) as an option paragraph" , item , certainty
117+ )
118+ item .is_option = True
119+ yield certainty , item
0 commit comments