-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path04_outcome.py
More file actions
214 lines (157 loc) · 7.07 KB
/
04_outcome.py
File metadata and controls
214 lines (157 loc) · 7.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import matplotlib.pyplot as plt
import numpy as np
import os, json, re
def get_file_names(directory):
    """Return the names of all regular files directly inside *directory*.

    Subdirectories are excluded; only the entry names (not full paths)
    are returned.
    """
    return [
        entry
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    ]
# Computes the accuracy statistics for the baseline (GPT) run.
def cal_gpt_indicator():
    """Print mean and variance of baseline GPT accuracy scores.

    Reads per-sample results from ``gpt_scores.json`` in the working
    directory; each record is expected to carry an ``answer_correctness``
    field.
    """
    print("=============== Here are GPT scores ============")
    output_filename = "gpt_scores.json"
    with open(output_filename, "r", encoding='utf-8') as f:
        eval_results = json.load(f)
    accuracy = [result["answer_correctness"] for result in eval_results]
    accuracy_mean = np.mean(accuracy)
    accuracy_var = np.var(accuracy)
    print(f"accuracy mean is {accuracy_mean}, variance is {accuracy_var}")
# Plots, for one RAG version, how each metric's scores distribute over 0-1.
def plt_data(arrays, filename):
    """Plot the 0-1 score distribution of each metric array as grouped bars.

    Each array in *arrays* is bucketed into five equal-width intervals and
    drawn as one bar series; the figure is saved to
    ``./graphs/number_distribution_<test_name>.png``.

    Args:
        arrays: sequence of numeric sequences (one per metric), scores
            assumed to lie in [0, 1].
        filename: score-file path; the part of its basename before
            ``_ragas_scores`` (e.g. ``test_5``) names the title and output.
    """
    # Derive the `test_n` identifier from the file name (path stripped).
    base_name = os.path.basename(filename)
    test_name = base_name.split("_ragas_scores")[0]
    # Histogram bucket edges and their display labels.
    bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
    bin_labels = ['0-0.2', '0.2-0.4', '0.4-0.6', '0.6-0.8', '0.8-1']
    # Count of each array's values falling in each interval.
    hist_data = [np.histogram(array, bins=bins)[0] for array in arrays]
    bar_width = 0.15
    x = np.arange(len(bin_labels))
    plt.figure(figsize=(12, 6))
    for i in range(len(arrays)):
        bars = plt.bar(x + i * bar_width, hist_data[i], width=bar_width, label=f'Array {i+1}')
        # Annotate each bar with its exact count.
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width() / 2, height, f'{int(height)}',
                     ha='center', va='bottom', fontsize=9)
    # Center each tick under its group of bars (was hard-coded `bar_width * 2`,
    # which is only correct for exactly five series).
    plt.xticks(x + bar_width * (len(arrays) - 1) / 2, bin_labels)
    plt.xlabel('Intervals')
    plt.ylabel('Count')
    plt.title(f'Distribution of Numbers - {test_name}')
    # Only label as many series as were actually plotted: the old code always
    # passed five names even though callers supply four arrays.
    metric_names = ["faithfulness", "answer_relevancy", "context_precision",
                    "context_recall", "accuracy"]
    plt.legend(metric_names[:len(arrays)])
    # Ensure the output directory exists, then save with a per-test file name.
    output_dir = "./graphs"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"number_distribution_{test_name}.png")
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    # Close the figure so repeated calls (one per score file) do not leak memory.
    plt.close()
    #plt.show()
# Computes the aggregate metric scores for one RAG version's result file.
def cal_rag_score(filename):
    """Compute mean RAGAS metric scores for one RAG version.

    Loads up to the first 100 result records from *filename*, collects four
    metrics per record, plots their distribution via ``plt_data``, and
    returns the per-metric means with ``None``/NaN entries ignored.

    Args:
        filename: path to a JSON score file containing a list of dicts,
            each with the four metric keys (records missing a key are
            skipped whole).

    Returns:
        list: [faithfulness_mean, answer_relevancy_mean,
        context_precision_mean, context_recall_mean].
    """
    print("filename is :", filename)
    # One list per metric.
    faithfulness = []
    answer_relevancy = []
    context_precision = []
    context_recall = []
    with open(filename, "r", encoding='utf-8') as f:
        eval_results = json.load(f)
    # Cap at 100 samples so versions with different sizes stay comparable.
    eval_results = eval_results[:100]
    for result in eval_results:
        # Read all four keys before appending any, so a missing key can no
        # longer leave the lists with unequal lengths (the old bare
        # `except: pass` also hid unrelated errors and appended partially).
        try:
            values = (result["faithfulness"], result["answer_relevancy"],
                      result["context_precision"], result["context_recall"])
        except KeyError:
            continue
        faithfulness.append(values[0])
        answer_relevancy.append(values[1])
        context_precision.append(values[2])
        context_recall.append(values[3])

    def _to_nan_array(scores):
        # Map missing (None) scores to NaN so nan-aware reductions skip them;
        # the old code forgot this sanitization for `faithfulness`.
        return np.array([np.nan if s is None else s for s in scores], dtype=float)

    faithfulness = _to_nan_array(faithfulness)
    answer_relevancy = _to_nan_array(answer_relevancy)
    context_precision = _to_nan_array(context_precision)
    context_recall = _to_nan_array(context_recall)
    faithfulness_mean = np.nanmean(faithfulness)
    answer_relevancy_mean = np.nanmean(answer_relevancy)
    context_precision_mean = np.nanmean(context_precision)
    context_recall_mean = np.nanmean(context_recall)
    plt_data([faithfulness, answer_relevancy, context_precision, context_recall], filename)
    return [faithfulness_mean, answer_relevancy_mean, context_precision_mean, context_recall_mean]
def plt_compare_scores():
    """Compare metric means across all RAG versions found in ./score_data.

    Every ``test_<n>_ragas_scores`` file is scored via ``cal_rag_score`` and
    the four metric means are drawn as grouped bars, one series per version.
    The chart is saved to ``./graphs/rag_comparison_by_metrics_<range>.png``
    and displayed.
    """
    directory_path = './score_data'
    file_names = get_file_names(directory_path)
    # Build scores and version numbers in lock-step, skipping files that do
    # not match the expected naming. The old code scored *every* file but
    # only collected version numbers for matching ones, so the legend labels
    # could drift out of line with the bars.
    scores = []
    test_numbers = []
    for file in file_names:
        match = re.search(r'test_(\d+)_ragas_scores', file)  # extract `test_X`
        if not match:
            continue
        test_numbers.append(int(match.group(1)))
        scores.append(cal_rag_score(directory_path + "/" + file))
    # Bail out if no valid test version was found.
    if not test_numbers:
        print("No valid test versions found in filenames!")
        return
    # "1to5"-style range tag for the output file name.
    min_test = min(test_numbers)
    max_test = max(test_numbers)
    test_range = f"{min_test}to{max_test}"
    # Dump raw numbers for reuse elsewhere.
    print(scores)
    scores = np.array(scores)
    # Metric names shown on the x axis.
    metrics = ['faithfulness_mean', 'answer_relevancy_mean', 'context_precision_mean', 'context_recall_mean']
    # Version labels derived from the `test_<n>` numbers.
    versions = [f'Test {num}' for num in test_numbers]
    bar_width = 0.15
    x = np.arange(len(metrics))
    plt.figure(figsize=(12, 6))
    for i in range(len(versions)):
        bars = plt.bar(x + i * bar_width, scores[i], width=bar_width, label=versions[i])
        # Annotate each bar with its value.
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width() / 2, height, f'{height:.2f}',
                     ha='center', va='bottom', fontsize=9, color='black')
    # Center each tick under its group of bars (was hard-coded `bar_width * 2`,
    # correct only for exactly five versions).
    plt.xticks(x + bar_width * (len(versions) - 1) / 2, metrics)
    plt.xlabel('Metrics')
    plt.ylabel('Scores')
    plt.title(f'Comparison of RAG System Versions')
    # plt.legend(["Baseline", "New Embedding Model", "Structured Reference", "Refine Mechanism", "New faithfulness"])
    plt.legend()
    # Ensure ./graphs exists, then save under a range-tagged file name.
    output_dir = "./graphs"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"rag_comparison_by_metrics_{test_range}.png")
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.show()
# Guard the entry point so importing this module for its helpers does not
# trigger the full comparison run as a side effect.
if __name__ == "__main__":
    plt_compare_scores()
    # cal_gpt_indicator()