Skip to content

Commit 551d1d3

Browse files
Merge PR idank#330: Python 3 migration
2 parents 6807d53 + 8b98cc6 commit 551d1d3

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+3539
-2199
lines changed

.dockerignore

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
.gitignore
2+
.git/
3+
.github/
4+
misc/
5+
tests/
6+
tools
7+
venv/
8+
dump/
9+
.mypy_cache/
10+
*.pyc
11+
*.log
12+
README.md
13+
docker-compose.yml
14+
Dockerfile
15+
Makefile

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,7 @@
22
*.swp
33
.coverage
44
.vagrant
5-
application.log
5+
*.log
6+
venv/
7+
__pycache__
8+
.mypy_cache/

Dockerfile

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,15 @@
1-
FROM python:2.7
1+
FROM python:3.12
22

3-
RUN apt-get update \
4-
&& apt-get install man-db -y \
5-
&& apt-get clean
3+
RUN apt update \
4+
&& apt install man-db -y \
5+
&& apt clean
66

7-
ADD ./requirements.txt /tmp/requirements.txt
7+
WORKDIR /opt/webapp
8+
COPY . .
89

9-
RUN pip install --upgrade pip \
10-
&& python --version \
11-
&& pip install -r /tmp/requirements.txt \
12-
&& rm -rf ~/.cache/pip/*
10+
RUN pip3 install --no-cache-dir --no-warn-script-location --upgrade pip setuptools wheel virtualenv \
11+
&& pip3 install --no-cache-dir --no-warn-script-location -r requirements.txt
1312

14-
ADD ./ /opt/webapp/
15-
WORKDIR /opt/webapp
1613
EXPOSE 5000
1714

18-
CMD ["make", "serve"]
15+
CMD ["python3", "runserver.py"]

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@ tests:
22
nosetests --exe --with-doctest tests/ explainshell/
33

44
serve:
5-
python runserver.py
5+
docker-compose up --build
66

77
.PHONY: tests

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ When querying explainshell, it:
3737
>
3838
> If you're relying on manpages, be aware that they may not reflect the latest behavior. Contributions in this area are welcome but would require rethinking the documentation pipeline.
3939
40-
Right now explainshell.com contains the entire [archive of Ubuntu](http://manpages.ubuntu.com/). It's not
40+
Right now explainshell.com contains the entire [archive of Ubuntu](https://manpages.ubuntu.com/). It's not
4141
possible to directly add a missing man page to the live site (it might be in the future).
4242

4343
## Running explainshell locally

docker-compose.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
version: '2'
21
services:
32
db:
43
image: mongo
54
web:
6-
build: .
7-
command: make serve
5+
build:
6+
context: .
7+
dockerfile: Dockerfile
88
environment:
99
- MONGO_URI=mongodb://db
1010
- HOST_IP=0.0.0.0

explainshell/algo/classifier.py

Lines changed: 61 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
import itertools, collections, logging
1+
import itertools
2+
import collections
3+
import logging
24

35
import nltk
46
import nltk.metrics
@@ -9,26 +11,32 @@
911

1012
logger = logging.getLogger(__name__)
1113

14+
1215
def get_features(paragraph):
1316
features = {}
14-
ptext = paragraph.cleantext()
15-
assert ptext
16-
17-
features['starts_with_hyphen'] = algo.features.starts_with_hyphen(ptext)
18-
features['is_indented'] = algo.features.is_indented(ptext)
19-
features['par_length'] = algo.features.par_length(ptext)
20-
for w in ('=', '--', '[', '|', ','):
21-
features['first_line_contains_%s' % w] = algo.features.first_line_contains(ptext, w)
22-
features['first_line_length'] = algo.features.first_line_length(ptext)
23-
features['first_line_word_count'] = algo.features.first_line_word_count(ptext)
24-
features['is_good_section'] = algo.features.is_good_section(paragraph)
25-
features['word_count'] = algo.features.word_count(ptext)
17+
p_text = paragraph.clean_text()
18+
logger.debug(f"length of p_text: {len(p_text)}")
19+
assert p_text
20+
21+
features["starts_with_hyphen"] = algo.features.starts_with_hyphen(p_text)
22+
features["is_indented"] = algo.features.is_indented(p_text)
23+
features["par_length"] = algo.features.par_length(p_text)
24+
for w in ("=", "--", "[", "|", ","):
25+
features[f"first_line_contains_{w}"] = algo.features.first_line_contains(
26+
p_text, w
27+
)
28+
features["first_line_length"] = algo.features.first_line_length(p_text)
29+
features["first_line_word_count"] = algo.features.first_line_word_count(p_text)
30+
features["is_good_section"] = algo.features.is_good_section(paragraph)
31+
features["word_count"] = algo.features.word_count(p_text)
2632

2733
return features
2834

29-
class classifier(object):
30-
'''classify the paragraphs of a man page as having command line options
31-
or not'''
35+
36+
class Classifier:
37+
"""classify the paragraphs of a man page as having command line options
38+
or not"""
39+
3240
def __init__(self, store, algo, **classifier_args):
3341
self.store = store
3442
self.algo = algo
@@ -39,59 +47,60 @@ def train(self):
3947
if self.classifier:
4048
return
4149

42-
manpages = self.store.trainingset()
50+
man_pages = self.store.training_set()
4351

4452
# flatten the manpages so we get a single list of paragraphs
4553
def flatten_manpages(manpage):
46-
l = []
54+
p_list = []
4755
for para in manpage.paragraphs:
48-
l.append(para)
49-
return l
50-
paragraphs = itertools.chain(*[flatten_manpages(m) for m in manpages])
56+
p_list.append(para)
57+
return p_list
58+
59+
paragraphs = itertools.chain(*[flatten_manpages(m) for m in man_pages])
5160
training = list(paragraphs)
5261

53-
negids = [p for p in training if not p.is_option]
54-
posids = [p for p in training if p.is_option]
62+
neg_ids = [p for p in training if not p.is_option]
63+
pos_ids = [p for p in training if p.is_option]
5564

56-
negfeats = [(get_features(p), False) for p in negids]
57-
posfeats = [(get_features(p), True) for p in posids]
65+
neg_feats = [(get_features(p), False) for p in neg_ids]
66+
pos_feats = [(get_features(p), True) for p in pos_ids]
5867

59-
negcutoff = len(negfeats)*3/4
60-
poscutoff = len(posfeats)*3/4
68+
neg_cutoff = int(len(neg_feats) * 3 / 4)
69+
pos_cutoff = int(len(pos_feats) * 3 / 4)
6170

62-
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
63-
self.testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
71+
train_feats = neg_feats[:neg_cutoff] + pos_feats[:pos_cutoff]
72+
self.test_feats = neg_feats[neg_cutoff:] + pos_feats[pos_cutoff:]
6473

65-
logger.info('train on %d instances', len(trainfeats))
74+
logger.info("train on %d instances", len(train_feats))
6675

67-
if self.algo == 'maxent':
76+
if self.algo == "maxent":
6877
c = nltk.classify.maxent.MaxentClassifier
69-
elif self.algo == 'bayes':
78+
elif self.algo == "bayes":
7079
c = nltk.classify.NaiveBayesClassifier
7180
else:
72-
raise ValueError('unknown classifier')
81+
raise ValueError("unknown classifier")
7382

74-
self.classifier = c.train(trainfeats, **self.classifier_args)
83+
self.classifier = c.train(train_feats, **self.classifier_args)
7584

7685
def evaluate(self):
7786
self.train()
78-
refsets = collections.defaultdict(set)
79-
testsets = collections.defaultdict(set)
87+
ref_sets = collections.defaultdict(set)
88+
test_sets = collections.defaultdict(set)
8089

81-
for i, (feats, label) in enumerate(self.testfeats):
82-
refsets[label].add(i)
90+
for i, (feats, label) in enumerate(self.test_feats):
91+
ref_sets[label].add(i)
8392
guess = self.classifier.prob_classify(feats)
8493
observed = guess.max()
85-
testsets[observed].add(i)
86-
#if label != observed:
87-
# print 'label:', label, 'observed:', observed, feats
94+
test_sets[observed].add(i)
95+
# if label != observed:
96+
# print('label:', label, 'observed:', observed, feats)
8897

89-
print 'pos precision:', nltk.metrics.precision(refsets[True], testsets[True])
90-
print 'pos recall:', nltk.metrics.recall(refsets[True], testsets[True])
91-
print 'neg precision:', nltk.metrics.precision(refsets[False], testsets[False])
92-
print 'neg recall:', nltk.metrics.recall(refsets[False], testsets[False])
98+
print("pos precision:", nltk.metrics.precision(ref_sets[True], test_sets[True]))
99+
print("pos recall:", nltk.metrics.recall(ref_sets[True], test_sets[True]))
100+
print("neg precision:", nltk.metrics.precision(ref_sets[False], test_sets[False]))
101+
print("neg recall:", nltk.metrics.recall(ref_sets[False], test_sets[False]))
93102

94-
print self.classifier.show_most_informative_features(10)
103+
print(self.classifier.show_most_informative_features(10))
95104

96105
def classify(self, manpage):
97106
self.train()
@@ -102,10 +111,9 @@ def classify(self, manpage):
102111
option = guess.max()
103112
certainty = guess.prob(option)
104113

105-
if option:
106-
if certainty < config.CLASSIFIER_CUTOFF:
107-
pass
108-
else:
109-
logger.info('classified %s (%f) as an option paragraph', item, certainty)
110-
item.is_option = True
111-
yield certainty, item
114+
if option and certainty >= config.CLASSIFIER_CUTOFF:
115+
logger.info(
116+
"classified %s (%f) as an option paragraph", item, certainty
117+
)
118+
item.is_option = True
119+
yield certainty, item

explainshell/algo/features.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import re
22

3+
34
def extract_first_line(paragraph):
4-
'''
5+
"""
56
>>> extract_first_line('a b cd')
67
'a b'
78
>>> extract_first_line('a b cd')
@@ -10,54 +11,63 @@ def extract_first_line(paragraph):
1011
'a b cd'
1112
>>> extract_first_line(' a b cd')
1213
'a b'
13-
'''
14+
"""
1415
lines = paragraph.splitlines()
1516
first = lines[0].strip()
16-
spaces = list(re.finditer(r'(\s+)', first))
17+
spaces = list(re.finditer(r"(\s+)", first))
1718
# handle options that have their description in the first line by trying
1819
# to treat it as two lines (looking at spaces between option and the rest
1920
# of the text)
2021
if spaces:
2122
longest = max(spaces, key=lambda m: m.span()[1] - m.span()[0])
2223
if longest and longest.start() > 1 and longest.end() - longest.start() > 1:
23-
first = first[:longest.start()]
24+
first = first[: longest.start()]
2425
return first
2526

27+
2628
def starts_with_hyphen(paragraph):
27-
return paragraph.lstrip()[0] == '-'
29+
return paragraph.lstrip()[0] == "-"
30+
2831

2932
def is_indented(paragraph):
3033
return paragraph != paragraph.lstrip()
3134

35+
3236
def par_length(paragraph):
3337
return round(len(paragraph.strip()), -1) / 2
3438

39+
3540
def first_line_contains(paragraph, what):
36-
l = paragraph.splitlines()[0]
37-
return what in l
41+
ln = paragraph.splitlines()[0]
42+
return what in ln
43+
3844

3945
def first_line_length(paragraph):
4046
first = extract_first_line(paragraph)
4147
return round(len(first), -1) / 2
4248

49+
4350
def first_line_word_count(paragraph):
4451
first = extract_first_line(paragraph)
4552
splitted = [s for s in first.split() if len(s) > 1]
4653

4754
return round(len(splitted), -1)
4855

56+
4957
def is_good_section(paragraph):
5058
if not paragraph.section:
5159
return False
5260
s = paragraph.section.lower()
53-
if 'options' in s:
61+
if "options" in s:
5462
return True
55-
if s in ('description', 'function letters'):
63+
if s in ("description", "function letters"):
5664
return True
5765
return False
5866

67+
5968
def word_count(text):
60-
return round(len(re.findall(r'\w+', text)), -1)
69+
return round(len(re.findall(r"\w+", text)), -1)
70+
6171

6272
def has_bold(html):
63-
return '<b>' in html
73+
return "<b>" in html

explainshell/config.py

Lines changed: 6 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,14 @@
11
import os
22

3-
_currdir = os.path.dirname(os.path.dirname(__file__))
3+
_curr_dir = os.path.dirname(os.path.dirname(__file__))
44

5-
MANPAGEDIR = os.path.join(_currdir, 'manpages')
5+
MAN_PAGE_DIR = os.path.join(_curr_dir, "manpages")
66
CLASSIFIER_CUTOFF = 0.7
7-
TOOLSDIR = os.path.join(_currdir, 'tools')
7+
TOOLS_DIR = os.path.join(_curr_dir, "tools")
88

9-
MAN2HTML = os.path.join(TOOLSDIR, 'w3mman2html.cgi')
9+
MAN2HTML = os.path.join(TOOLS_DIR, "w3mman2html.cgi")
1010

1111
# host to pass into Flask's app.run.
12-
HOST_IP = os.getenv('HOST_IP', False)
13-
MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost')
12+
HOST_IP = os.getenv("HOST_IP", "")
13+
MONGO_URI = os.getenv("MONGO_URI", "mongodb://localhost")
1414
DEBUG = True
15-
16-
LOGGING_DICT = {
17-
'version': 1,
18-
'disable_existing_loggers': False,
19-
'formatters': {
20-
'standard': {
21-
'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
22-
},
23-
},
24-
'handlers': {
25-
'console': {
26-
'level' : 'INFO',
27-
'class' : 'logging.StreamHandler',
28-
'formatter': 'standard',
29-
},
30-
'file': {
31-
'class': 'logging.FileHandler',
32-
'level': 'INFO',
33-
'formatter': 'standard',
34-
'filename': 'application.log',
35-
'mode': 'a',
36-
},
37-
},
38-
'loggers': {
39-
'explainshell': {
40-
'handlers': ['console'],
41-
'level': 'INFO',
42-
'propagate': False
43-
}
44-
}
45-
}

explainshell/errors.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
class ProgramDoesNotExist(Exception):
22
pass
33

4+
45
class EmptyManpage(Exception):
56
pass

0 commit comments

Comments
 (0)