Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3,935 changes: 3,935 additions & 0 deletions data/eq_bench_v2_questions_171_pl.json

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions eq-bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import signal
import sys
import io
import gc
import torch


ooba_instance = None
Expand Down Expand Up @@ -63,7 +65,7 @@ def main():
parser.add_argument('-v', action='store_true',
help="Display more verbose output.")
parser.add_argument('-l', default='en',
help="Set the language of the question dataset. Currently supported: en, de")
help="Set the language of the question dataset. Currently supported: en, de, pl")
parser.add_argument('-r', type=int, default=5,
help="Set the number of retries to attempt if a benchmark run fails. Default 5.")
args = parser.parse_args()
Expand Down Expand Up @@ -96,7 +98,7 @@ def main():
if args.l: # If language is provided via command line argument
language = args.l.strip()

if language not in ['en', 'de']:
if language not in ['en', 'de', 'pl']:
raise Exception('Invalid language value specified.')

questions_fn = './data/eq_bench_v2_questions_171.json'
Expand Down Expand Up @@ -254,7 +256,7 @@ def main():

if ooba_instance:
ooba_instance.stop()
gpu_cleanup()
gpu_cleanup()

models_remaining = models_remaining[1:]

Expand Down
4 changes: 3 additions & 1 deletion lib/eq_bench_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import json
from lib.scoring import calculate_score, calculate_score_fullscale, parse_answers, parse_answers_de
from lib.scoring import calculate_score, calculate_score_fullscale, parse_answers, parse_answers_de, parse_answers_pl
from lib.run_bench_helper_functions import remove_revision_instructions
from lib.run_query import run_query
from lib.util import safe_dump
Expand Down Expand Up @@ -63,6 +63,8 @@ def process_question(question_id, q, model_path, prompt_type, model, tokenizer,

if language == "de":
first_pass_answers, revised_answers = parse_answers_de(inference, REVISE)
elif language == "pl":
first_pass_answers, revised_answers = parse_answers_pl(inference, REVISE)
else:
first_pass_answers, revised_answers = parse_answers(inference, REVISE)

Expand Down
12 changes: 7 additions & 5 deletions lib/run_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import time
import json
import datetime
import traceback
from tqdm import tqdm
from lib.load_model import load_model
from lib.eq_bench_utils import process_question
Expand Down Expand Up @@ -274,9 +275,9 @@ def save_and_upload_results(run_id, formatted_datetime, bench_success, prompt_ty
model_path if benchmark_type != 'judgemark' else 'N/A',
lora_path if benchmark_type != 'judgemark' else 'N/A',
quantization if benchmark_type != 'judgemark' else 'N/A',
'FAILED',
round(this_score, 2),
f"{benchmark_type}{lang_suffix}",
'FAILED',
parseable,
n_iterations,
inference_engine,
ooba_params,
Expand Down Expand Up @@ -380,7 +381,9 @@ def run_generic_benchmark(run_id, model_path, lora_path, prompt_type, quantizati

except Exception as e:
print(e)
last_error = ' '.join(str(e).split('\n'))
last_error = ' '.join(str(e).split('\n'))
print(e)
print(traceback.format_exc())
print(f"{benchmark_type} benchmark run failed.")
bench_tries += 1
if bench_tries <= max_bench_retries:
Expand All @@ -399,7 +402,6 @@ def run_generic_benchmark(run_id, model_path, lora_path, prompt_type, quantizati
print('Model:', model_path)
if lora_path:
print('Lora:', lora_path)
delete_model_files = delete_cache

if benchmark_type == 'eq-bench':
if language != 'en':
Expand Down Expand Up @@ -442,7 +444,7 @@ def run_generic_benchmark(run_id, model_path, lora_path, prompt_type, quantizati

save_and_upload_results(run_id, formatted_datetime, bench_success, prompt_type, model_path, lora_path, quantization, benchmark_type, lang_suffix, this_score, parseable, n_iterations, inference_engine, ooba_params, include_patterns, exclude_patterns, judge_params, results, run_index, last_error, bench_tries, max_bench_retries, google_spreadsheet_url, save_result_to_db_fn, eqbench_version)

cleanup(model, tokenizer, inference_engine, launch_ooba, ooba_instance, delete_model_files, model_path, include_patterns, exclude_patterns, models_to_delete, models_remaining, verbose)
cleanup(model, tokenizer, inference_engine, launch_ooba, ooba_instance, delete_cache, model_path, include_patterns, exclude_patterns, models_to_delete, models_remaining, verbose)



Expand Down
7 changes: 6 additions & 1 deletion lib/run_bench_helper_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,5 +142,10 @@ def remove_revision_instructions(prompt, language):
prompt = prompt[:start_index] + replacement_string + prompt[end_index:]
prompt = prompt.replace('Erste Bewertung:\n', '')
prompt = prompt[:prompt.find('Kritik: <Ihre Kritik hier>')] + '\n' + prompt[prompt.find('[Ende der Antwort]'):]
prompt += '\nIhre Antwort:\n'
prompt += '\nIhre Antwort:\n'
elif language == 'pl':
prompt = prompt.replace(' Następnie skrytykuj swoją odpowiedź, przemyśl ją krok po kroku. Wprowadź zmiany i na koniec podaj ostateczne oceny.', '')
prompt = prompt.replace('Pierwsze oceny:\n', '')
prompt = prompt[:prompt.find('Weryfikacja: <twoja opinia tutaj>')] + '\n' + prompt[prompt.find('[Koniec odpowiedzi]'):]
prompt += '\nTwoja odpowiedź:\n'
return prompt
36 changes: 32 additions & 4 deletions lib/scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def parse_answers_de(text, REVISE):

first_pass_heading_pattern = r'(Erste.*?):\s*(.*?)(?=Überarbeitete|$)'
revised_heading_pattern = r'(Überarbeitete.*?):\s*(.*)'

if REVISE:
first_pass_match = re.search(first_pass_heading_pattern, text, re.IGNORECASE | re.DOTALL)
if first_pass_match:
Expand All @@ -59,6 +59,34 @@ def parse_answers_de(text, REVISE):

return first_pass_answers, revised_answers


# we parse answers in Polish language ("pl")
def parse_answers_pl(text, REVISE):
    """Extract emotion ratings from a Polish-language model response.

    Parameters:
        text:   Raw inference output containing "label: <number>" lines.
        REVISE: When truthy, the output is expected to contain two sections
                headed "Pierwsze oceny:" (first pass) and "Zmienione oceny:"
                (revised); otherwise the whole text is treated as one pass.

    Returns:
        (first_pass_answers, revised_answers) — two dicts mapping label
        strings to numeric-string scores. A section that cannot be located
        yields an empty dict; revised_answers is always empty when REVISE
        is falsy.
    """
    # Strip markdown emphasis/heading characters so they don't break matching.
    cleaned = text.replace('*', '').replace('#', '')

    # One "Label: 7" or "Label: <7>" pair per match.
    score_pattern = re.compile(r'([\w ]+): *<?(\d+)>?')

    if not REVISE:
        # Single-pass output: collect every score pair in the whole text.
        return dict(score_pattern.findall(cleaned)), {}

    first_pass_answers = {}
    revised_answers = {}

    # First-pass section sits between the two Polish headings.
    first_section = re.search(r'Pierwsze oceny:(.*?)Zmienione oceny:', cleaned, re.DOTALL)
    if first_section:
        first_pass_answers = dict(score_pattern.findall(first_section.group(1)))

    # Revised section runs from its heading to the end of the text.
    revised_section = re.search(r'Zmienione oceny:(.*?)$', cleaned, re.DOTALL)
    if revised_section:
        revised_answers = dict(score_pattern.findall(revised_section.group(1)))

    return first_pass_answers, revised_answers

# Calculate the score for an individual question using v2 scoring system
def calculate_score_fullscale(reference, user):
# First check that the emotions specified in the answer match those in the reference
Expand Down Expand Up @@ -161,7 +189,7 @@ def calculate_eq_bench_score(run_index, results, results_path, fullscale=False):
n_iterations = results[run_index]['run_metadata']['total_iterations']
n_iterations_tallied = 0

for run_iter in results[run_index]['iterations']:
for run_iter in results[run_index]['iterations']:
if n_iterations_tallied >= n_iterations:
break
score_sum_first_pass = 0
Expand Down Expand Up @@ -225,9 +253,9 @@ def calculate_eq_bench_score(run_index, results, results_path, fullscale=False):
averaged_score = round(averaged_score, 2)
else:
averaged_score = round(averaged_score, 2)

safe_dump(results, results_path, max_retries=3)


return (averaged_score, round(parseable_tally / n_iterations, 2))

Expand Down