-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path mypreprocessing.py
More file actions
55 lines (49 loc) · 2.03 KB
/
mypreprocessing.py
File metadata and controls
55 lines (49 loc) · 2.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from collections import defaultdict
import nltk
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Module-level configuration flags. write_corpus() overwrites isVerbose from
# its `verbose` argument, and vprint() reads it to gate progress output.
fixContractions = False  # not read anywhere in this file — TODO(review): confirm external use
isVerbose = False
# Seeds NumPy's global RNG from OS entropy (no argument = non-reproducible).
# NOTE(review): no visible call in this file uses np.random — confirm needed.
np.random.seed()
def write_corpus(path, fix_contractions=False, verbose=False):
    """Read ``dataset-ib.csv`` from *path*, clean, tokenize and lemmatize the
    text, write the result to ``dataset_final_ib.csv`` and return it.

    Parameters
    ----------
    path : str
        Directory prefix (including trailing separator) where
        ``dataset-ib.csv`` lives and the output CSV is written.
    fix_contractions : bool
        Reserved flag; currently unused (kept for interface compatibility).
    verbose : bool
        When True, progress messages are printed via ``vprint``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``category`` and ``text_final`` (the lemmatized token
        list stored as its ``str`` representation).
    """
    global isVerbose
    isVerbose = verbose
    # Resources required by word_tokenize / pos_tag / lemmatizer / stopwords.
    nltk.download('punkt')
    nltk.download('wordnet')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('stopwords')
    corpus = pd.read_csv(path + "dataset-ib.csv")
    corpus['text'] = corpus['headline'] + " " + corpus["content"]
    # BUG FIX: the original `corpus['text'].dropna(inplace=True)` drops from a
    # Series view and does NOT remove rows from the DataFrame, so NaN entries
    # would still reach `.lower()` below and raise. Drop the rows properly and
    # reset the index so the loc-based writes align with enumerate().
    corpus = corpus.dropna(subset=['text']).reset_index(drop=True)
    corpus['text'] = [entry.lower() for entry in corpus['text']]
    vprint('Tokenize words...')
    corpus['text'] = [word_tokenize(entry) for entry in corpus['text']]
    # Map Penn Treebank tag prefixes to WordNet POS constants; anything not
    # listed (the defaultdict fallback) is treated as a noun.
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    vprint('Lemmatize words...')
    # Hoisted out of the row loop: one lemmatizer instance, and the stopword
    # list converted to a set (the original re-fetched the list and did an
    # O(n) membership scan for every single word).
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    for index, entry in enumerate(corpus['text']):
        final_words = []
        for word, tag in pos_tag(entry):
            # Keep only alphabetic, non-stopword tokens.
            if word not in stop_words and word.isalpha():
                final_words.append(lemmatizer.lemmatize(word, tag_map[tag[0]]))
        # Store the processed tokens as their string repr (original behavior).
        corpus.loc[index, 'text_final'] = str(final_words)
    corpus = corpus.loc[:, ['category', 'text_final']]
    corpus.to_csv(path + 'dataset_final_ib.csv')
    return corpus
def vprint(*messages):
    """Forward *messages* to print() only when module verbosity is enabled."""
    if not isVerbose:
        return
    print(*messages)