Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3,935 changes: 3,935 additions & 0 deletions data/eq_bench_v2_questions_171_pl.json

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions eq-bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import signal
import sys
import io
import gc
import torch


ooba_instance = None
Expand Down Expand Up @@ -63,7 +65,7 @@ def main():
parser.add_argument('-v', action='store_true',
help="Display more verbose output.")
parser.add_argument('-l', default='en',
help="Set the language of the question dataset. Currently supported: en, de")
help="Set the language of the question dataset. Currently supported: en, de, pl")
parser.add_argument('-r', type=int, default=5,
help="Set the number of retries to attempt if a benchmark run fails. Default 5.")
args = parser.parse_args()
Expand Down Expand Up @@ -96,7 +98,7 @@ def main():
if args.l: # If language is provided via command line argument
language = args.l.strip()

if language not in ['en', 'de']:
if language not in ['en', 'de', 'pl']:
raise Exception('Invalid language value specified.')

questions_fn = './data/eq_bench_v2_questions_171.json'
Expand Down Expand Up @@ -254,7 +256,7 @@ def main():

if ooba_instance:
ooba_instance.stop()
gpu_cleanup()
gpu_cleanup()

models_remaining = models_remaining[1:]

Expand Down
4 changes: 3 additions & 1 deletion lib/eq_bench_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import json
from lib.scoring import calculate_score, calculate_score_fullscale, parse_answers, parse_answers_de
from lib.scoring import calculate_score, calculate_score_fullscale, parse_answers, parse_answers_de, parse_answers_pl
from lib.run_bench_helper_functions import remove_revision_instructions
from lib.run_query import run_query
from lib.util import safe_dump
Expand Down Expand Up @@ -63,6 +63,8 @@ def process_question(question_id, q, model_path, prompt_type, model, tokenizer,

if language == "de":
first_pass_answers, revised_answers = parse_answers_de(inference, REVISE)
elif language == "pl":
first_pass_answers, revised_answers = parse_answers_pl(inference, REVISE)
else:
first_pass_answers, revised_answers = parse_answers(inference, REVISE)

Expand Down
12 changes: 7 additions & 5 deletions lib/run_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import time
import json
import datetime
import traceback
from tqdm import tqdm
from lib.load_model import load_model
from lib.eq_bench_utils import process_question
Expand Down Expand Up @@ -274,9 +275,9 @@ def save_and_upload_results(run_id, formatted_datetime, bench_success, prompt_ty
model_path if benchmark_type != 'judgemark' else 'N/A',
lora_path if benchmark_type != 'judgemark' else 'N/A',
quantization if benchmark_type != 'judgemark' else 'N/A',
'FAILED',
round(this_score, 2),
f"{benchmark_type}{lang_suffix}",
'FAILED',
parseable,
n_iterations,
inference_engine,
ooba_params,
Expand Down Expand Up @@ -380,7 +381,9 @@ def run_generic_benchmark(run_id, model_path, lora_path, prompt_type, quantizati

except Exception as e:
print(e)
last_error = ' '.join(str(e).split('\n'))
last_error = ' '.join(str(e).split('\n'))
print(e)
print(traceback.format_exc())
print(f"{benchmark_type} benchmark run failed.")
bench_tries += 1
if bench_tries <= max_bench_retries:
Expand All @@ -399,7 +402,6 @@ def run_generic_benchmark(run_id, model_path, lora_path, prompt_type, quantizati
print('Model:', model_path)
if lora_path:
print('Lora:', lora_path)
delete_model_files = delete_cache

if benchmark_type == 'eq-bench':
if language != 'en':
Expand Down Expand Up @@ -442,7 +444,7 @@ def run_generic_benchmark(run_id, model_path, lora_path, prompt_type, quantizati

save_and_upload_results(run_id, formatted_datetime, bench_success, prompt_type, model_path, lora_path, quantization, benchmark_type, lang_suffix, this_score, parseable, n_iterations, inference_engine, ooba_params, include_patterns, exclude_patterns, judge_params, results, run_index, last_error, bench_tries, max_bench_retries, google_spreadsheet_url, save_result_to_db_fn, eqbench_version)

cleanup(model, tokenizer, inference_engine, launch_ooba, ooba_instance, delete_model_files, model_path, include_patterns, exclude_patterns, models_to_delete, models_remaining, verbose)
cleanup(model, tokenizer, inference_engine, launch_ooba, ooba_instance, delete_cache, model_path, include_patterns, exclude_patterns, models_to_delete, models_remaining, verbose)



Expand Down
7 changes: 6 additions & 1 deletion lib/run_bench_helper_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,5 +142,10 @@ def remove_revision_instructions(prompt, language):
prompt = prompt[:start_index] + replacement_string + prompt[end_index:]
prompt = prompt.replace('Erste Bewertung:\n', '')
prompt = prompt[:prompt.find('Kritik: <Ihre Kritik hier>')] + '\n' + prompt[prompt.find('[Ende der Antwort]'):]
prompt += '\nIhre Antwort:\n'
prompt += '\nIhre Antwort:\n'
elif language == 'pl':
prompt = prompt.replace(' Następnie skrytykuj swoją odpowiedź, przemyśl ją krok po kroku. Wprowadź zmiany i na koniec podaj ostateczne oceny.', '')
prompt = prompt.replace('Pierwsze oceny:\n', '')
prompt = prompt[:prompt.find('Weryfikacja: <twoja opinia tutaj>')] + '\n' + prompt[prompt.find('[Koniec odpowiedzi]'):]
prompt += '\nTwoja odpowiedź:\n'
return prompt
36 changes: 32 additions & 4 deletions lib/scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def parse_answers_de(text, REVISE):

first_pass_heading_pattern = r'(Erste.*?):\s*(.*?)(?=Überarbeitete|$)'
revised_heading_pattern = r'(Überarbeitete.*?):\s*(.*)'

if REVISE:
first_pass_match = re.search(first_pass_heading_pattern, text, re.IGNORECASE | re.DOTALL)
if first_pass_match:
Expand All @@ -59,6 +59,34 @@ def parse_answers_de(text, REVISE):

return first_pass_answers, revised_answers


# we parse answers in Polish language ("pl")
def parse_answers_pl(text, REVISE):
    """Extract emotion ratings from a Polish-language model response.

    Parameters:
        text:   Raw inference output containing "label: <number>" lines.
        REVISE: When truthy, the output is expected to contain two sections
                headed "Pierwsze oceny:" (first pass) and "Zmienione oceny:"
                (revised); otherwise the whole text is treated as one pass.

    Returns:
        (first_pass_answers, revised_answers) — two dicts mapping label
        strings to numeric-string scores. A section that cannot be located
        yields an empty dict; revised_answers is always empty when REVISE
        is falsy.
    """
    # Strip markdown emphasis/heading characters so they don't break matching.
    cleaned = text.replace('*', '').replace('#', '')

    # One "Label: 7" or "Label: <7>" pair per match.
    score_pattern = re.compile(r'([\w ]+): *<?(\d+)>?')

    if not REVISE:
        # Single-pass output: collect every score pair in the whole text.
        return dict(score_pattern.findall(cleaned)), {}

    first_pass_answers = {}
    revised_answers = {}

    # First-pass section sits between the two Polish headings.
    first_section = re.search(r'Pierwsze oceny:(.*?)Zmienione oceny:', cleaned, re.DOTALL)
    if first_section:
        first_pass_answers = dict(score_pattern.findall(first_section.group(1)))

    # Revised section runs from its heading to the end of the text.
    revised_section = re.search(r'Zmienione oceny:(.*?)$', cleaned, re.DOTALL)
    if revised_section:
        revised_answers = dict(score_pattern.findall(revised_section.group(1)))

    return first_pass_answers, revised_answers

# Calculate the score for an individual question using v2 scoring system
def calculate_score_fullscale(reference, user):
# First check that the emotions specified in the answer match those in the reference
Expand Down Expand Up @@ -161,7 +189,7 @@ def calculate_eq_bench_score(run_index, results, results_path, fullscale=False):
n_iterations = results[run_index]['run_metadata']['total_iterations']
n_iterations_tallied = 0

for run_iter in results[run_index]['iterations']:
for run_iter in results[run_index]['iterations']:
if n_iterations_tallied >= n_iterations:
break
score_sum_first_pass = 0
Expand Down Expand Up @@ -225,9 +253,9 @@ def calculate_eq_bench_score(run_index, results, results_path, fullscale=False):
averaged_score = round(averaged_score, 2)
else:
averaged_score = round(averaged_score, 2)

safe_dump(results, results_path, max_retries=3)


return (averaged_score, round(parseable_tally / n_iterations, 2))

Expand Down