fasttext_classifications/bin_classifier.py at master · isultane/fasttext_classifications · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
'''
# Author: Sultan S. Alqahtani
# Date: 13/04/2022
# It is modified version from original code https://github.com/ChristianBirchler/ticket-tagger-analysis/blob/main/code-pipeline/classifiers/ml_bin_classifier.py
'''
import json
import fasttext
import os
import os.path
import numpy as np
from numpy import array
from sklearn.model_selection import KFold
import sys


# returns the label and chance of that label
def get_guess(r):
    labels = r[0]
    j = 0
    if labels[0] == '__label__sec-report':
        j = 1
    return r[0][j], r[1][j]


# example command to run classifier on the balanced pandas dataset:
# python ml_bin_classifier.py ../../datasets/data_set-pandas-balanced.txt ./out.txt
if __name__ == '__main__':

    print("start here")
    # get sys args
    data_set = "./data/temp/chromium.txt"
    fn_in = os.path.basename(data_set)
    f_out = "./data/temp/out.txt"

    # define paths for temporary files
    b_path_train = "./data/temp/tmp/b_tmp_train.txt"
   # e_path_train = "./data/temp/tmp/e_tmp_train.txt"
   # q_path_train = "./data/temp/tmp/q_tmp_train.txt"
   # b_path = './data/temp/BUG-' + fn_in
   # e_path = './data/temp/ENHANCEMENT-' + fn_in
   # q_path = './data/temp/QUESTION-' + fn_in

    try:
        print("Converting dataset to array")
        f = open(data_set, 'r+', encoding="UTF-8")
        data = array(f.readlines())
        f.close()

        # array for details
        fold_outputs = []

        # fold count
        fold = 1

        # ten fold loop
        kfold = KFold(10, shuffle=True, random_state=1)
        for train, test in kfold.split(data):

            # init stats
            TP_b = 0
            TP_FN_b = 0
            TP_FP_b = 0

            print("New tenfold iteration:", str(fold), "-----------------------------------------")
            print("Creating bug train file")
            b_tmp_train = open(b_path_train, "w", encoding="UTF-8")
            for line in data[train]:
                b_tmp_train.write("".join(line))
            b_tmp_train.close()


            # get test data
            test_data = data[test]

            print("start training...")
            # train the models
            b_model = fasttext.train_supervised(input=b_path_train,epoch=40, lr=1.0)


            # testing loop
            print("start testing for tenfold iteration...")
            for i, line in enumerate(test_data):

                # get correct answer and input text from test data
                t = line.partition(' ')
                issue_text = t[2].replace('\n', '').replace('\r', '')
                correct_answer = t[0]

                # predict with the models
                b_res = b_model.predict(issue_text, k=-1)

                # parse guesses
                b_guess = get_guess(b_res)


                # get most likely label from the results
                res = {
                    b_guess[0]: b_guess[1]
                }
                guess = max(res, key=res.get)

                # save results for recall, precision and f1 calculations

                if guess == '__label__nonsec-report':
                    TP_FP_b += 1
                    if guess == correct_answer:
                        TP_b += 1

                if correct_answer == '__label__sec-report':
                    TP_FN_b += 1

                # log
                print("Issue nr." + str(i) + " has predicted: ")
                print("Bug res: ", str(b_guess))
                print("Final guess: ", str(guess))
                print("Correct answer: ", correct_answer)
                print("TP_FP_b:",TP_FP_b)
                print("TP_b" ,TP_b)
                print("TP_FN_b",TP_FN_b)
                print("-------------------------------------------------")

            # calculate benchmarks for ten fold iteration
            b_recall = TP_b / TP_FN_b
            b_precision = TP_b / TP_FP_b
            b_f1 = 2 * ((b_precision * b_recall) / (b_precision + b_recall))

            ges_recall = (b_recall) / 1
            ges_precision = (b_precision) / 1
            ges_f1 = (b_f1 ) / 1

            micro = (TP_b) / len(test_data)

            result = {
                '10-Fold iteration:': fold,
                'Mean f1': ges_f1,
                'Mean recall': ges_recall,
                'Mean precision': ges_precision,
                'Bug recall': b_recall,
                'Bug precision': b_precision,
                'Bug f1': b_f1,
                'Micro': micro
            }
            # log
            print("Fold over, here are results: ")
            print(json.dumps(result, indent=4))
            fold_outputs.append(result)

            fold += 1

        print("Done with 10 fold validation")
        # calculate over-all results
        mean_recall = 0
        mean_precision = 0
        mean_f1 = 0
        mean_micro = 0
        for f in fold_outputs:
            mean_f1 += (f['Mean f1'] / 10)
            mean_recall += (f['Mean recall'] / 10)
            mean_precision += (f['Mean precision'] / 10)
            mean_micro += (f['Micro'] / 10)

        # compile results as json
        output = {
            'Results': {
                'F1': mean_f1,
                'Recall': mean_recall,
                'Precision': mean_precision,
                'Micro': mean_micro
            },
            'Details': fold_outputs
        }
        dump = json.dumps(output, indent=4)
        print(dump)
        # write to output
        print("Writing output to file")
        o = open(f_out, 'w', encoding="UTF-8")
        o.write(dump)
        o.close()
    # catch and print exceptions
    except Exception as e:
        print(e)
    # in any case delete existing temporary files
    finally:
        print("Deleting tmp files")
        if os.path.exists(b_path_train):
            os.remove(b_path_train)
        print("Exit.")

# formulas used for calculations:

# recall = TP/(TP+FN)
# recall_bug = #final guess == bug && correct answer == bug / #correct answers == bug

# precision = TP / (TP+FP)
# precision_bug = #final guess == bug && correct answer == bug / #final guess == bug

# f1 = 2*((precision*recall)/(precision+recall))