-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtraining_data_getter.py
More file actions
104 lines (86 loc) · 2.67 KB
/
training_data_getter.py
File metadata and controls
104 lines (86 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import json
import os

from datasets import load_dataset
# Which corpora to export on this run.
english = True
french = True
code = False

# Hard cap on the number of output files written per corpus.
file_limit = 10000

# Rows accumulate in current_batch and are flushed to one file per batch
# once the batch's UTF-8 size reaches min_size_bytes.
current_batch = []
current_size = 0
min_size_bytes = 10240  # 10 KiB minimum per output file
# English corpus: stream rows from mini-en, batch them until a batch reaches
# min_size_bytes, then write each batch to its own numbered file.
if english:
    file_count = 0
    ds = load_dataset("nampdn-ai/mini-en", split="train")
    for row in ds:
        # Stop before consuming another row once enough files exist.
        # (The original checked this after appending, silently discarding
        # the appended row along with any partial batch — same files
        # written, but this placement wastes no work.)
        if file_count >= file_limit:
            break
        row_json = json.dumps(row["text"])
        current_batch.append(row_json)
        # Size accounting uses the encoded byte length, not len(str).
        current_size += len(row_json.encode("utf-8"))
        if current_size >= min_size_bytes:
            filename = f"training_corpus/english/file{file_count:03d}.txt"
            # Create the target directory if missing instead of crashing.
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            # Explicit UTF-8 so output doesn't depend on the platform's
            # default encoding.
            # NOTE(review): json.dump of strings that were already passed
            # through json.dumps double-encodes the text (quoted, escaped
            # twice) — confirm the downstream consumer expects this.
            with open(filename, "w", encoding="utf-8") as f:
                json.dump("\n\n".join(current_batch), f)
            file_count += 1
            current_batch = []
            current_size = 0
# French corpus: same batching scheme as the English branch.
if french:
    file_count = 0
    ds = load_dataset("MaxLSB/LeCarnet", split="train")
    for row in ds:
        # Stop before consuming another row once enough files exist.
        if file_count >= file_limit:
            break
        # ensure_ascii=False keeps accented characters (é, à, è, ç, ...)
        # literal instead of \uXXXX escapes. This replaces the original
        # hand-rolled replacement table, which was doubly broken: its
        # r'\\u00e9'-style patterns contain TWO backslashes and so never
        # matched json.dumps output (one backslash), and even a successful
        # replacement would have been re-escaped by the final json.dump
        # (default ensure_ascii=True). This fix also covers every
        # non-ASCII character, not just the twelve listed.
        row_json = json.dumps(row["text"], ensure_ascii=False)
        current_batch.append(row_json)
        current_size += len(row_json.encode("utf-8"))
        if current_size >= min_size_bytes:
            filename = f"training_corpus/french/file{file_count:03d}.txt"
            # Create the target directory if missing instead of crashing.
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            # UTF-8 + ensure_ascii=False so the accents written above
            # survive the outer dump as well.
            with open(filename, "w", encoding="utf-8") as f:
                json.dump("\n\n".join(current_batch), f, ensure_ascii=False)
            file_count += 1
            current_batch = []
            current_size = 0
# NOTE(review): dead branch — `code` is False above, so this never runs.
if (code): #{
    # ds = load_dataset("MaxLSB/LeCarnet", split="train");
    # NOTE(review): no dataset is loaded here (the line above is commented
    # out), so `ds` is whatever the previous branch left behind — the French
    # dataset if that branch ran, otherwise a NameError. Presumably a code
    # dataset was intended; fix before enabling.
    # NOTE(review): starts at 3 rather than 0 — looks deliberate (avoid
    # clobbering existing files?) but is unexplained; confirm. Unlike the
    # other branches there is also no file_limit check here.
    file_count = 3;
    for row in ds: #{
        # Serialize the row text and track the batch's UTF-8 byte size.
        row_json = json.dumps(row["text"]);
        row_size = len(row_json.encode('utf-8'));
        current_batch.append(row_json);
        current_size += row_size;
        # Flush the batch to its own numbered file once it reaches the
        # minimum size, then reset the accumulators.
        if (current_size >= min_size_bytes): #{
            filename = f"training_corpus/code/file{file_count:03d}.txt";
            with open(filename, "w") as f: #{
                json.dump("\n\n".join(current_batch), f);
            #}
            file_count += 1;
            current_batch = [];
            current_size = 0;
        #}
    #}
#}