diff --git a/tests/data/corpora/sentencise/text.txt b/tests/data/corpora/sentencise/text.txt
new file mode 100644
index 0000000..c20792d
--- /dev/null
+++ b/tests/data/corpora/sentencise/text.txt
@@ -0,0 +1,6 @@
+3.14 is a number, not some B.S. You know M.D. Bob is my friend.
+I've got my Ph.D. in 2014. I have Ph.D. I got it in 2014.
+
+А по-русски слабо? Что делать с гос. служащими?
+富士山が見える。こんにちは
+ 
\ No newline at end of file
diff --git a/vecto/_version.py b/vecto/_version.py
index 8cc0063..246cf2a 100644
--- a/vecto/_version.py
+++ b/vecto/_version.py
@@ -1,3 +1,3 @@
 """Version of vecto package."""
 
-VERSION = "0.2.16"
+VERSION = "0.2.21"
diff --git a/vecto/corpus/corpus.py b/vecto/corpus/corpus.py
index eaf2cbc..7a78156 100644
--- a/vecto/corpus/corpus.py
+++ b/vecto/corpus/corpus.py
@@ -6,8 +6,8 @@
 from vecto.utils.data import get_uncompressed_size
 from vecto.utils.metadata import WithMetaData
 
-from .iterators import (DirIterator, FileIterator, FileLineIterator,
-                        LoopedLineIterator, SequenceIterator,
+from .iterators import (CharIterator, DirIterator, FileIterator,
+                        FileLineIterator, LoopedLineIterator, SequenceIterator,
                         SlidingWindowIterator, TokenIterator,
                         TokenizedSequenceIterator, ViewLineIterator)
 from .tokenization import (DEFAULT_JAP_TOKENIZER, DEFAULT_SENT_TOKENIZER,
@@ -51,7 +51,7 @@ def get_token_iterator(self, tokenizer=None, verbose=False):
         return TokenIterator(self.get_sentence_iterator(tokenizer, verbose))
 
     def get_character_iterator(self, verbose=False):
-        return TokenIterator(self.get_line_iterator(verbose))
+        return CharIterator(self.get_line_iterator(verbose))
 
     def get_sentence_iterator(self, tokenizer=None, verbose=False):
         if tokenizer is None:
@@ -68,10 +68,12 @@ def get_sequence_iterator(self, sequence_length, tokenizer):
                                 sequence_length=sequence_length,
                                 tokenizer=tokenizer)
 
-    def get_looped_sequence_iterator(self, sequence_length, tokenizer, rank, size):
+    def get_looped_sequence_iterator(self, sequence_length, tokenizer, rank, size, min_length=0, reset_on_new_line=False):
         return SequenceIterator(self.get_looped_line_iterator(rank, size),
                                 sequence_length=sequence_length,
-                                tokenizer=tokenizer)
+                                tokenizer=tokenizer,
+                                minimal_length=min_length,
+                                reset_on_new_line=reset_on_new_line)
 
 
 class Corpus(BaseCorpus):
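For reference, a minimal sketch of how the reworked Corpus API above could be driven once this patch is applied. The corpus path and the str.split tokenizer are placeholders for illustration only, and rank=0, size=1 stands for a single worker:

from vecto.corpus import Corpus

corpus = Corpus("/path/to/corpus")   # placeholder path (file or directory)
corpus.load_dir_strucute()           # build the file tree (method name as in vecto)

# Characters now come from the new CharIterator instead of TokenIterator.
for i, ch in enumerate(corpus.get_character_iterator()):
    if i == 20:
        break
    print(repr(ch))

# Looped fixed-length token sequences; min_length and reset_on_new_line
# are the knobs introduced by this patch.
sequences = corpus.get_looped_sequence_iterator(sequence_length=128,
                                                tokenizer=str.split,
                                                rank=0,
                                                size=1,
                                                min_length=5,
                                                reset_on_new_line=False)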
diff --git a/vecto/corpus/iterators.py b/vecto/corpus/iterators.py
index 057d751..b55c4a9 100644
--- a/vecto/corpus/iterators.py
+++ b/vecto/corpus/iterators.py
@@ -35,11 +35,14 @@ def __init__(self, dirname, verbose=0):
         self.dirname = dirname
 
     def _generate_samples(self):
-        for root, _, files in os.walk(self.dirname, followlinks=True):
-            for good_fname in sorted(fnmatch.filter(files, "*")):
-                full_file_path = os.path.join(root, good_fname)
-                logger.info("processing " + full_file_path)
-                yield full_file_path
+        if os.path.isfile(self.dirname):
+            yield self.dirname
+        else:
+            for root, _, files in os.walk(self.dirname, followlinks=True):
+                for good_fname in sorted(fnmatch.filter(files, "*")):
+                    full_file_path = os.path.join(root, good_fname)
+                    logger.info("processing " + full_file_path)
+                    yield full_file_path
 
 
 class FileLineIterator(BaseIterator):
@@ -110,6 +113,7 @@ def _generate_samples(self):
         filename = self.tree[self.id_file][0]
         file_in = detect_archive_format_and_open(filename)
         seek_unicode(file_in, self.start_offset)
+        file_in.readline()
         while True:
             for line in file_in:
                 line = line.strip()
@@ -150,20 +154,26 @@ def _generate_samples(self):
 
 
 class SequenceIterator(BaseIterator):
-    def __init__(self, line_terator, sequence_length, tokenizer):
+    def __init__(self, line_terator, sequence_length, tokenizer, minimal_length=0, reset_on_new_line=False):
         super().__init__()
         self.line_iterator = line_terator
         self.sequence_length = sequence_length
         self.tokenizer = tokenizer
         self.buffer = []
+        self.minimal_length = minimal_length
+        self.reset_on_new_line = reset_on_new_line
 
     def _generate_samples(self):
         # TODO: consider removing too small chunks of sentences at the end
         # TODO: consider leveraging sentence iterator is corpus has mark-up
         for line in self.line_iterator:
             tokens = self.tokenizer(line)
+            if self.reset_on_new_line:
+                self.buffer = []
+            elif len(self.buffer) < self.minimal_length:
+                self.buffer = []
             self.buffer += tokens
-            while len(self.buffer) > self.sequence_length:
+            while len(self.buffer) > self.sequence_length - self.minimal_length:
                 s = self.buffer[: self.sequence_length]
                 self.buffer = self.buffer[self.sequence_length:]
                 yield s
@@ -184,13 +194,20 @@ def __init__(self, parent_iterator, verbose=0):
 
 
 class TokenIterator(BaseNestedIterator):
-
     def _generate_samples(self):
         for tokenized_str in self.parent_iterator:
             for token in tokenized_str:
                 yield token
 
 
+class CharIterator(BaseNestedIterator):
+    def _generate_samples(self):
+        for line in self.parent_iterator:
+            for c in line:
+                yield c
+            yield " "
+
+
 def iter_sliding_window(seq, left_ctx_size, right_ctx_size):
     for i, current in enumerate(seq):
         ctx = []
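To illustrate what the new minimal_length and reset_on_new_line flags change in SequenceIterator, here is a small standalone generator that mirrors the buffering logic of the patched _generate_samples (an illustration only, not the class itself); str.split stands in for a tokenizer:

def sequences(lines, sequence_length, tokenizer, minimal_length=0, reset_on_new_line=False):
    buffer = []
    for line in lines:
        tokens = tokenizer(line)
        if reset_on_new_line:
            buffer = []                    # never carry tokens across lines
        elif len(buffer) < minimal_length:
            buffer = []                    # drop a carry-over that is too short
        buffer += tokens
        # emit whenever enough tokens are buffered; the last chunk of a line
        # may be shorter than sequence_length, but stays longer than
        # sequence_length - minimal_length
        while len(buffer) > sequence_length - minimal_length:
            yield buffer[:sequence_length]
            buffer = buffer[sequence_length:]


print(list(sequences(["a b c d e f g", "h i j"],
                     sequence_length=4, tokenizer=str.split, minimal_length=2)))
# -> [['a', 'b', 'c', 'd'], ['e', 'f', 'g'], ['h', 'i', 'j']]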
diff --git a/vecto/corpus/preprocess.py b/vecto/corpus/preprocess.py
new file mode 100644
index 0000000..530a02f
--- /dev/null
+++ b/vecto/corpus/preprocess.py
@@ -0,0 +1,142 @@
+# import spacy
+# import numpy as np
+# from nltk.tokenize import sent_tokenize
+# import nltk
+import json
+import random
+import sys
+
+from transformers import AutoTokenizer
+from vecto.corpus import Corpus
+
+
+def simple_char_iter(text):
+    for c in text:
+        yield c
+
+
+# def sentencize(text):
+    # nlp = spacy.load("en_core_web_sm")
+    # doc = nlp(text)
+    # sents = [sent.text for sent in doc.sents]
+    # return [s for s in sentence_iter(char_iter(text))]
+    # sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
+    # return sent_detector.tokenize(text)
+
+
+# TODO: ok let's do streaming sentence splitter
+# ingest character by character
+# append to sentence, unless
+other_delimiters = {"?", "!", "。"}
+
+known_abbreviations = {"md", "bs", "mr", "ms"}
+
+
+def is_abbreviation(token):
+    if "." in token:
+        return True
+    if len(token) == 1:
+        return True
+    if token.lower() in known_abbreviations:
+        return True
+    return False
+
+
+def sentence_iter(char_iter):
+    size_buffer = 10000
+    buffer = [" "] * size_buffer
+    pos = 0
+    prev_char = ""
+    prev_token = ""
+    for c in char_iter:
+        is_sentence_end = False
+        if c == " " and prev_char == ".":
+            # print(prev_token)
+            if not is_abbreviation(prev_token[:-1]):
+                is_sentence_end = True
+        if prev_char in other_delimiters and c != "\"":
+            is_sentence_end = True
+        #buffer[pos] = c
+        #pos += 1
+        if is_sentence_end:
+            if pos > 0:
+                yield "".join(buffer[: pos]).strip()
+            buffer = [" "] * size_buffer
+            pos = 0
+            continue
+        prev_char = c
+        if pos >= len(buffer):
+            print("buffer overflow:")
+            # print("".join(buffer[:100]))
+            print("".join(buffer[-100:]))
+            pos = 0
+        buffer[pos] = c
+        prev_token += c
+        if c == " ":
+            prev_token = ""
+        pos += 1
+    if pos > 0:
+        yield "".join(buffer[: pos])
+
+
+def main():
+    # samples = []
+    # samples.append("Hey how do you do? M.D. Bob is my friend. Mr. John too.")
+    # samples.append("А по-русски слабо? Что делать с гос. служащими?")
+    # samples.append("富士山が見える。こんにちは")
+    # for s in samples:
+    #     tokenized = sentencize(s)
+    #     print(tokenized)
+    # path = "./tests/data/corpora/sentencise"
+    path = sys.argv[1]
+    # path = "/mnt/storage/Data/NLP/corpora/wiki_clean.txt"
+    # path = "/mnt/storage/Data/NLP/corpora/toronto_clean.txt"
+    # path = "./quotes/13th_Reality-1.txt"
+    name_tokenizer = "roberta-base"
+    tokenizer = AutoTokenizer.from_pretrained(name_tokenizer)
+    corpus = Corpus(path)
+    corpus.load_dir_strucute()
+    char_iter = corpus.get_character_iterator()
+    sent_iter = sentence_iter(char_iter)
+    # cnt = 0
+    sample = [tokenizer.cls_token_id]
+    max_length = 128
+    cnt = 0
+    proba_shortening = 0.1
+    with open("lines.jsonl", "w") as f_out:
+        for line in sent_iter:
+            tokens = tokenizer(line,
+                               add_special_tokens=False,
+                               return_attention_mask=False,)["input_ids"]
+            sample += tokens
+            if len(sample) > max_length - 10:
+                sample = sample[:max_length - 1]
+                min_length = 5
+                if random.random() < proba_shortening:
+                    sample = sample[: random.randint(min_length, len(sample))]
+                sample += [tokenizer.sep_token_id]
+                sample += [tokenizer.pad_token_id] * (max_length - len(sample))
+                # print(len(sample))
+                serialized = json.dumps(sample)
+                if ":" in serialized:
+                    print(sample)
+                    print(serialized)
+                f_out.write(serialized)
+                f_out.write("\n")
+                #print(tokenizer.decode(sample))
+                #print(len(sample))
+                #print()
+                sample = [tokenizer.cls_token_id]
+                cnt += 1
+                if cnt % 10000 == 0:
+                    print(cnt, "last line", len(tokens))
+            # print(tokenizer.convert_ids_to_tokens(tokens))
+            # print(line)
+            # print()
+            # if cnt > 100:
+            #     break
+            # cnt += 1
+
+
+if __name__ == "__main__":
+    main()
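A quick, assumed way to exercise the new streaming splitter on the sample sentences from the test file above, without going through a Corpus; note that importing vecto.corpus.preprocess also pulls in transformers because of its module-level AutoTokenizer import:

from vecto.corpus.preprocess import sentence_iter, simple_char_iter

text = ("3.14 is a number, not some B.S. You know M.D. Bob is my friend. "
        "I've got my Ph.D. in 2014. А по-русски слабо? 富士山が見える。こんにちは")
for sentence in sentence_iter(simple_char_iter(text)):
    print(sentence)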