"""Word-count mapper: turn text lines into ``word<TAB>1`` records.

Each input line is lowercased and split on whitespace; every token that is
not a stopword is written to standard output as ``<word>\t1``, one record
per line.

Provenance: rebuilt from the patch "added stopwords" (Reed Johnson,
Sun, 19 May 2024 21:48:04 +0000) against mapper.py (pre-image blob 9fa3def).

NOTE(review): the patch hunk started at line 5 of mapper.py, so whatever
preceded the loop (presumably a shebang and ``import sys``) was not visible
-- confirm against the original file and restore the shebang if one existed.
"""

import sys

# Tokens that are dropped instead of being counted.
# Entries must be lowercase: lookups happen after the line is lowercased, so
# the 'I' listed in the original patch could never match and is now 'i'.
# The punctuation entries only match stand-alone tokens, because splitting on
# whitespace does not detach punctuation from a word ("end." is kept).
STOPWORDS = frozenset([
    'the', 'and', 'a', 'i', '.', ',', '!', 'or', 'for', 'not',
    'in', 'on', 'to', 'an', 'be', 'but', 'of', 'is', 'it',
])


def map_line(line):
    """Return the ``word\\t1`` records produced by one input line.

    Args:
        line: a single line of text; surrounding whitespace and a trailing
            newline are ignored.

    Returns:
        A list of strings, one ``'<word>\\t1'`` per non-stopword token, in
        input order and without a trailing newline. Empty or whitespace-only
        lines yield an empty list.
    """
    # split() with no argument already discards leading/trailing whitespace,
    # so no separate strip() is needed.
    return ['%s\t%s' % (word, "1")
            for word in line.lower().split()
            if word not in STOPWORDS]


def main(lines=None, write=None):
    """Map every line of *lines* and emit the records through *write*.

    Args:
        lines: iterable of text lines; defaults to ``sys.stdin``.
        write: callable taking one string; defaults to ``sys.stdout.write``.
    """
    if lines is None:
        lines = sys.stdin
    if write is None:
        # write() instead of the print statement: same bytes on stdout, and
        # the script runs on both Python 2 and Python 3.
        write = sys.stdout.write
    for line in lines:
        for record in map_line(line):
            write(record + '\n')


if __name__ == '__main__':
    main()