-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpreprocessing.py
More file actions
180 lines (145 loc) · 6.49 KB
/
preprocessing.py
File metadata and controls
180 lines (145 loc) · 6.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# -*- coding: utf-8 -*-
__author__ = "Cristian Cardellino"
import argparse
import numpy as np
import os
import pandas as pd
import re
import spacy
import sys
from collections import defaultdict
def build_cooccurrence_matrix(corpus, window_size=5, scale_factor="scaled",
                              vocab_size=5000, unkown_vector=True, verbose=False,
                              unknown_vector=None):
    """
    Builds a co-occurrence matrix with `window_size` and `scale_factor` over `corpus`.

    Parameters
    ----------
    corpus : iterator
        Should be a generator where each document is returned as a list of words. This
        corpus should already be normalized.
    window_size : int
        The size of the symmetrical window. The words to be considered when doing the count
        will be the ones being `window_size` before and after the center word.
    scale_factor : str, one of {"flat", "scaled"}
        The factor to use in order to weight the importance of a word regarding the center
        word. Can be either "flat" (equal weights to every word), or "scaled" where the
        word is weighted by 1/d where d is the distance to the center word.
    vocab_size : int
        The maximum size of the vocabulary.
    unkown_vector : bool
        Whether to use a vector "UNK" for the words outside of the vocabulary.
        Kept under its historical misspelling for backward compatibility; prefer
        the `unknown_vector` keyword.
    verbose : bool
        Activate to print a simple progress indicator.
    unknown_vector : bool, optional
        Correctly-spelled alias for `unkown_vector`. When not None it takes
        precedence over `unkown_vector`.

    Returns
    -------
    pd.DataFrame
        The (scaled) matrix of cooccurrences for the top `vocab_size` words in the vocabulary.
    """
    assert scale_factor in {"flat", "scaled"}
    if unknown_vector is not None:
        # The properly-spelled alias wins when explicitly given.
        unkown_vector = unknown_vector
    word_count = defaultdict(int)
    word_word = defaultdict(float)
    for didx, document in enumerate(corpus, start=1):
        if verbose:
            print(f"\rDocument No. {didx}", end="", file=sys.stderr)
        for idx, word in enumerate(document):
            word_count[word] += 1
            # `reversed` so that window position 0 is the word closest to the
            # center word on the left side (distance 1).
            lwindow = reversed(document[max(idx-window_size, 0):idx])
            rwindow = document[idx+1:idx+1+window_size]
            for lidx, lword in enumerate(lwindow):
                word_word[(word, lword)] += 1/(lidx+1) if scale_factor == "scaled" else 1
            for ridx, rword in enumerate(rwindow):
                word_word[(word, rword)] += 1/(ridx+1) if scale_factor == "scaled" else 1
    if verbose:
        print(file=sys.stderr)
    # Top `vocab_size` words by frequency; the final index is alphabetical.
    vocab = sorted(word_count, key=lambda w: word_count[w], reverse=True)[:vocab_size]
    vocab = {word: idx for idx, word in enumerate(sorted(vocab))}
    if unkown_vector:
        vocab["UNK"] = len(vocab)
    word_matrix = np.zeros((len(vocab), len(vocab)))
    for (w1, w2), count in word_word.items():
        w1_idx = vocab.get(w1, vocab.get("UNK", None))
        w2_idx = vocab.get(w2, vocab.get("UNK", None))
        if w1_idx is not None and w2_idx is not None:
            # Accumulate with `+=` (a plain `=` would overwrite): several
            # out-of-vocabulary pairs collapse onto the same "UNK" cell and
            # their counts must be summed, not clobbered.
            word_matrix[w1_idx, w2_idx] += count
    vocab_ = sorted(vocab, key=lambda x: vocab[x])
    return pd.DataFrame(word_matrix, index=vocab_, columns=vocab_)
def corpus_processor(corpus_directory, language_model="es_core_news_sm",
                     remove_stopwords=True, lowercase=True):
    """
    Generator to retrieve and process the files of a corpus.

    Parameters
    ----------
    corpus_directory : str
        Path to where the corpus is stored as text documents.
    language_model : str
        The SpaCy language model to use for the processing.
    remove_stopwords : bool
        Whether to remove stopwords from the processed corpus.
    lowercase : bool
        Whether to lowercase the words.

    Yields
    ------
    list of str
        The tokens of one processed document per corpus file.
    """
    nlp = spacy.load(language_model)
    for fname in os.listdir(corpus_directory):
        fpath = os.path.join(corpus_directory, fname)
        if not os.path.isfile(fpath):
            # os.listdir also yields subdirectories and other non-file
            # entries; only regular files are corpus documents.
            continue
        # Explicit encoding: the default is platform-dependent and would make
        # the tokenization differ across machines.
        with open(fpath, "r", encoding="utf-8") as fh:
            # Careful with this for very large docs
            document = re.sub(r"\s+", " ", fh.read())
        # Tokenize after the `with` block so the file handle closes promptly.
        nlp.max_length = max(len(document), nlp.max_length)
        tokens = [
            token.text for token in nlp(document, disable=["tagger", "parser", "ner"])
            if not (remove_stopwords and token.is_stop)
        ]
        if lowercase:
            tokens = [token.lower() for token in tokens]
        yield tokens
if __name__ == "__main__":
    # Command-line entry point: tokenizes a directory of text files and writes
    # the resulting co-occurrence matrix as a CSV file.
    parser = argparse.ArgumentParser(
        "preprocessing",
        description="Builds a co-occurrence matrix from a directory of text files.")
    parser.add_argument("corpus_directory",
                        help="Path to the directory holding the corpus files.")
    parser.add_argument("output_file",
                        help="Path to store the matrix (as csv file).")
    parser.add_argument("--language-model", "-l",
                        default="es_core_news_sm",
                        help="Name of the SpaCy language model to use for tokenization.")
    parser.add_argument("--ignore-unknown", "-u",
                        action="store_true",
                        help="Activate to avoid reserving a `UNK` vector for unknown words.")
    parser.add_argument("--scale-factor", "-f",
                        default="flat",
                        # Validate up front instead of failing later in the
                        # builder's assert with a less helpful message.
                        choices=("flat", "scaled"),
                        help="The scale factor. Can be either `flat` or `scaled`.")
    parser.add_argument("--use-stopwords", "-s",
                        action="store_true",
                        help="Activate to avoid ignoring the stopwords in the model.")
    parser.add_argument("--use-case", "-c",
                        action="store_true",
                        help="Activate to avoid word normalization via lowercase.")
    parser.add_argument("--vocab-size", "-v",
                        default=5000,
                        help="The maximum amount of words to consider.",
                        type=int)
    parser.add_argument("--verbose",
                        action="store_true",
                        help="Activate to print a simple progress value.")
    parser.add_argument("--window-size", "-w",
                        default=5,
                        help="The size of the co-occurrence window.",
                        type=int)
    args = parser.parse_args()

    # The flags are "avoid" switches, so they are negated when mapped onto the
    # builder's positive-sense parameters.
    cooccurrence_matrix = build_cooccurrence_matrix(
        corpus=corpus_processor(
            corpus_directory=args.corpus_directory,
            language_model=args.language_model,
            remove_stopwords=not args.use_stopwords,
            lowercase=not args.use_case
        ),
        window_size=args.window_size,
        scale_factor=args.scale_factor,
        vocab_size=args.vocab_size,
        unkown_vector=not args.ignore_unknown,
        verbose=args.verbose
    )
    cooccurrence_matrix.to_csv(args.output_file)