-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpreprocessing.py
More file actions
180 lines (145 loc) · 6.49 KB
/
preprocessing.py
File metadata and controls
180 lines (145 loc) · 6.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# -*- coding: utf-8 -*-
__author__ = "Cristian Cardellino"
import argparse
import numpy as np
import os
import pandas as pd
import re
import spacy
import sys
from collections import defaultdict
def build_cooccurrence_matrix(corpus, window_size=5, scale_factor="scaled",
                              vocab_size=5000, unkown_vector=True, verbose=False,
                              unknown_vector=None):
    """
    Builds a co-occurrence matrix with `window_size` and `scale_factor` over `corpus`.

    Parameters
    ----------
    corpus : iterator
        Should be a generator where each document is returned as a list of words. This
        corpus should already be normalized.
    window_size : int
        The size of the symmetrical window. The words to be considered when doing the count
        will be the ones being `window_size` before and after the center word.
    scale_factor : str, one of {"flat", "scaled"}
        The factor to use in order to weight the importance of a word regarding the center
        word. Can be either "flat" (equal weights to every word), or "scaled" where the
        word is weighted by 1/d where d is the distance to the center word.
    vocab_size : int
        The maximum size of the vocabulary.
    unkown_vector : bool
        Whether to use a vector "UNK" for the words outside of the vocabulary.
        Kept under its historical misspelling for backward compatibility; prefer
        the `unknown_vector` keyword.
    verbose : bool
        Activate to print a simple progress indicator.
    unknown_vector : bool, optional
        Correctly-spelled alias for `unkown_vector`. When not None it takes
        precedence over `unkown_vector`.

    Returns
    -------
    pd.DataFrame
        The (scaled) matrix of cooccurrences for the top `vocab_size` words in the vocabulary.
    """
    assert scale_factor in {"flat", "scaled"}
    if unknown_vector is not None:
        # The properly-spelled alias wins when explicitly given.
        unkown_vector = unknown_vector
    word_count = defaultdict(int)
    word_word = defaultdict(float)
    for didx, document in enumerate(corpus, start=1):
        if verbose:
            print(f"\rDocument No. {didx}", end="", file=sys.stderr)
        for idx, word in enumerate(document):
            word_count[word] += 1
            # `reversed` so that window position 0 is the word closest to the
            # center word on the left side (distance 1).
            lwindow = reversed(document[max(idx-window_size, 0):idx])
            rwindow = document[idx+1:idx+1+window_size]
            for lidx, lword in enumerate(lwindow):
                word_word[(word, lword)] += 1/(lidx+1) if scale_factor == "scaled" else 1
            for ridx, rword in enumerate(rwindow):
                word_word[(word, rword)] += 1/(ridx+1) if scale_factor == "scaled" else 1
    if verbose:
        print(file=sys.stderr)
    # Top `vocab_size` words by frequency; the final index is alphabetical.
    vocab = sorted(word_count, key=lambda w: word_count[w], reverse=True)[:vocab_size]
    vocab = {word: idx for idx, word in enumerate(sorted(vocab))}
    if unkown_vector:
        vocab["UNK"] = len(vocab)
    word_matrix = np.zeros((len(vocab), len(vocab)))
    for (w1, w2), count in word_word.items():
        w1_idx = vocab.get(w1, vocab.get("UNK", None))
        w2_idx = vocab.get(w2, vocab.get("UNK", None))
        if w1_idx is not None and w2_idx is not None:
            # Accumulate with `+=` (a plain `=` would overwrite): several
            # out-of-vocabulary pairs collapse onto the same "UNK" cell and
            # their counts must be summed, not clobbered.
            word_matrix[w1_idx, w2_idx] += count
    vocab_ = sorted(vocab, key=lambda x: vocab[x])
    return pd.DataFrame(word_matrix, index=vocab_, columns=vocab_)
def corpus_processor(corpus_directory, language_model="es_core_news_sm",
                     remove_stopwords=True, lowercase=True):
    """
    Generator to retrieve and process the files of a corpus.

    Parameters
    ----------
    corpus_directory : str
        Path to where the corpus is stored as text documents.
    language_model : str
        The SpaCy language model to use for the processing.
    remove_stopwords : bool
        Whether to remove stopwords from the processed corpus.
    lowercase : bool
        Whether to lowercase the words.

    Yields
    ------
    list of str
        The tokens of one processed document per corpus file.
    """
    nlp = spacy.load(language_model)
    for fname in os.listdir(corpus_directory):
        fpath = os.path.join(corpus_directory, fname)
        if not os.path.isfile(fpath):
            # os.listdir also yields subdirectories and other non-file
            # entries; only regular files are corpus documents.
            continue
        # Explicit encoding: the default is platform-dependent and would make
        # the tokenization differ across machines.
        with open(fpath, "r", encoding="utf-8") as fh:
            # Careful with this for very large docs
            document = re.sub(r"\s+", " ", fh.read())
        # Tokenize after the `with` block so the file handle closes promptly.
        nlp.max_length = max(len(document), nlp.max_length)
        tokens = [
            token.text for token in nlp(document, disable=["tagger", "parser", "ner"])
            if not (remove_stopwords and token.is_stop)
        ]
        if lowercase:
            tokens = [token.lower() for token in tokens]
        yield tokens
if __name__ == "__main__":
    # Command-line entry point: tokenizes a directory of text files and writes
    # the resulting co-occurrence matrix as a CSV file.
    parser = argparse.ArgumentParser(
        "preprocessing",
        description="Builds a co-occurrence matrix from a directory of text files.")
    parser.add_argument("corpus_directory",
                        help="Path to the directory holding the corpus files.")
    parser.add_argument("output_file",
                        help="Path to store the matrix (as csv file).")
    parser.add_argument("--language-model", "-l",
                        default="es_core_news_sm",
                        help="Name of the SpaCy language model to use for tokenization.")
    parser.add_argument("--ignore-unknown", "-u",
                        action="store_true",
                        help="Activate to avoid reserving a `UNK` vector for unknown words.")
    parser.add_argument("--scale-factor", "-f",
                        default="flat",
                        # Validate up front instead of failing later in the
                        # builder's assert with a less helpful message.
                        choices=("flat", "scaled"),
                        help="The scale factor. Can be either `flat` or `scaled`.")
    parser.add_argument("--use-stopwords", "-s",
                        action="store_true",
                        help="Activate to avoid ignoring the stopwords in the model.")
    parser.add_argument("--use-case", "-c",
                        action="store_true",
                        help="Activate to avoid word normalization via lowercase.")
    parser.add_argument("--vocab-size", "-v",
                        default=5000,
                        help="The maximum amount of words to consider.",
                        type=int)
    parser.add_argument("--verbose",
                        action="store_true",
                        help="Activate to print a simple progress value.")
    parser.add_argument("--window-size", "-w",
                        default=5,
                        help="The size of the co-occurrence window.",
                        type=int)
    args = parser.parse_args()

    # The flags are "avoid" switches, so they are negated when mapped onto the
    # builder's positive-sense parameters.
    cooccurrence_matrix = build_cooccurrence_matrix(
        corpus=corpus_processor(
            corpus_directory=args.corpus_directory,
            language_model=args.language_model,
            remove_stopwords=not args.use_stopwords,
            lowercase=not args.use_case
        ),
        window_size=args.window_size,
        scale_factor=args.scale_factor,
        vocab_size=args.vocab_size,
        unkown_vector=not args.ignore_unknown,
        verbose=args.verbose
    )
    cooccurrence_matrix.to_csv(args.output_file)