From 4756d1a0dd6e1e019c01ce16d409dfc68bbd4a79 Mon Sep 17 00:00:00 2001 From: CrispStrobe Date: Sat, 25 Jan 2025 21:47:11 +0100 Subject: [PATCH] longoutput --- eq-bench.py | 10 +++++++++- lib/eq_bench_utils.py | 12 +++--------- lib/run_bench.py | 24 ++++++++++++++---------- lib/scoring.py | 10 +++++++++- 4 files changed, 35 insertions(+), 21 deletions(-) diff --git a/eq-bench.py b/eq-bench.py index 007bfed..89a1df7 100644 --- a/eq-bench.py +++ b/eq-bench.py @@ -66,6 +66,9 @@ def main(): help="Set the language of the question dataset. Currently supported: en, de") parser.add_argument('-r', type=int, default=5, help="Set the number of retries to attempt if a benchmark run fails. Default 5.") + parser.add_argument('--longoutput', action='store_true', + help="Remove token limit restrictions on outputs to handle very long responses.") + args = parser.parse_args() resume = not args.w @@ -110,6 +113,11 @@ def main(): base_filename, extension = questions_fn.rsplit('.', 1) # Appending language denotifier questions_fn = f"{base_filename}_{language}.{extension}" + + if args.longoutput: + COMPLETION_TOKENS = 4096 # Allow for very long outputs + else: + COMPLETION_TOKENS = 600 if REVISE else 60 # Creative writing Judge params judge_params = { @@ -242,7 +250,7 @@ def main(): ooba_params_global=ooba_params_global, fast_download=args.f, hf_access_token=hf_access_token, ooba_request_timeout=ooba_request_timeout, questions_fn=questions_fn, openai_client=openai_client, language=language, - REVISE=REVISE, benchmark_types=args.benchmarks, judge_params = judge_params) + REVISE=REVISE, benchmark_types=args.benchmarks, judge_params = judge_params, completion_tokens=COMPLETION_TOKENS) except KeyboardInterrupt: if ooba_instance: ooba_instance.stop() diff --git a/lib/eq_bench_utils.py b/lib/eq_bench_utils.py index f50c675..ecb22c6 100644 --- a/lib/eq_bench_utils.py +++ b/lib/eq_bench_utils.py @@ -9,7 +9,7 @@ def process_question(question_id, q, model_path, prompt_type, model, tokenizer, 
results, run_index, run_iter, verbose, n_question_attempts, inference_engine, ooba_instance, launch_ooba, ooba_request_timeout, openai_client, eqbench_version, language, - REVISE): + REVISE, completion_tokens): """ Process a single question and update the results. :param question_id: ID of the question. @@ -24,6 +24,7 @@ def process_question(question_id, q, model_path, prompt_type, model, tokenizer, :param verbose: Verbose output flag. :param n_question_attempts: Number of attempts per question. :param language: language of the test questions ("en" default, "de" also supported) + :param completion_tokens: Maximum number of tokens for model output. :return: Updated results. """ @@ -34,16 +35,9 @@ def process_question(question_id, q, model_path, prompt_type, model, tokenizer, else: ref_fullscale = None - COMPLETION_TOKENS = 60 - if REVISE: - COMPLETION_TOKENS = 600 - if eqbench_version == 'v2' and not REVISE: prompt = remove_revision_instructions(prompt, language) - - - tries = 0 success = False temp = 0.01 # Low temp is important for consistency of results @@ -52,7 +46,7 @@ def process_question(question_id, q, model_path, prompt_type, model, tokenizer, prev_result_inference = None prev_result_parsed_answers = None while tries < n_question_attempts and not success: - inference = run_query(model_path, prompt_type, prompt, [], COMPLETION_TOKENS, model, tokenizer, temp, inference_engine, ooba_instance, launch_ooba, ooba_request_timeout, openai_client) + inference = run_query(model_path, prompt_type, prompt, [], completion_tokens, model, tokenizer, temp, inference_engine, ooba_instance, launch_ooba, ooba_request_timeout, openai_client) try: if verbose: diff --git a/lib/run_bench.py b/lib/run_bench.py index 7718d0a..00e70f0 100644 --- a/lib/run_bench.py +++ b/lib/run_bench.py @@ -153,7 +153,7 @@ def load_model_and_launch_ooba(model_path, lora_path, quantization, inference_en raise Exception("Ooba failed to launch.") return model, tokenizer, ooba_instance -def 
process_questions(benchmark_type, model, ooba_instance, inference_engine, results, model_path, prompt_type, tokenizer, launch_ooba, ooba_request_timeout, run_index, run_iter, verbose, n_attempts, openai_client, questions, eqbench_version, language, REVISE, judge_params, test_model_outputs, process_fn): +def process_questions(benchmark_type, model, ooba_instance, inference_engine, results, model_path, prompt_type, tokenizer, launch_ooba, ooba_request_timeout, run_index, run_iter, verbose, n_attempts, openai_client, questions, eqbench_version, language, REVISE, judge_params, test_model_outputs, process_fn, completion_tokens): if benchmark_type == 'judgemark': for model_name, model_outputs in test_model_outputs.items(): print('########################') @@ -174,7 +174,7 @@ def process_questions(benchmark_type, model, ooba_instance, inference_engine, re scores = process_fn(prompt_id, prompt_data, None, None, None, None, results, run_index, run_iter, verbose, 0, inference_engine, ooba_instance, launch_ooba, ooba_request_timeout, openai_client, judge_params, - test_model_response, model_name) + test_model_response, model_name, completion_tokens) model_scores.append(scores) safe_dump(results, RAW_RESULTS_PATH) @@ -187,11 +187,11 @@ def process_questions(benchmark_type, model, ooba_instance, inference_engine, re if benchmark_type == 'eq-bench': process_fn(question_id, q, model_path, prompt_type, model, tokenizer, results, run_index, run_iter, verbose, n_attempts, inference_engine, ooba_instance, launch_ooba, ooba_request_timeout, openai_client, eqbench_version, - language, REVISE) + language, REVISE, completion_tokens) elif benchmark_type == 'creative-writing': scores = process_fn(question_id, q, model_path, prompt_type, model, tokenizer, results, run_index, run_iter, verbose, n_attempts, inference_engine, ooba_instance, launch_ooba, - ooba_request_timeout, openai_client, judge_params) + ooba_request_timeout, openai_client, judge_params, completion_tokens) if scores: if 
verbose: print(scores) @@ -330,10 +330,14 @@ def run_generic_benchmark(run_id, model_path, lora_path, prompt_type, quantizati ooba_params_global, fast_download, hf_access_token, ooba_request_timeout, questions_fn=None, openai_client=None, language='en', - REVISE=False, benchmark_type='eq-bench', judge_params={}): + REVISE=False, benchmark_type='eq-bench', judge_params={}, completion_tokens=None): questions, process_fn, scoring_fn, save_result_to_db_fn, run_index, eqbench_version, test_model_outputs = setup_benchmark(benchmark_type, run_id, model_path, lora_path, prompt_type, quantization, inference_engine, ooba_params, include_patterns, exclude_patterns, language, judge_params, questions_fn) + if completion_tokens is None: + if benchmark_type == 'eq-bench': + completion_tokens = 600 if (REVISE or eqbench_version == 'v1') else 60 + results = initialize_results(run_index, benchmark_type, resume, n_iterations, run_id, model_path, lora_path, prompt_type, quantization, inference_engine, ooba_params, include_patterns, exclude_patterns, judge_params, language, eqbench_version) initialize_iterations(results, run_index, n_iterations, benchmark_type, resume) @@ -364,7 +368,7 @@ def run_generic_benchmark(run_id, model_path, lora_path, prompt_type, quantizati run_index, run_iter, verbose) - process_questions(benchmark_type, model, ooba_instance, inference_engine, results, model_path, prompt_type, tokenizer, launch_ooba, ooba_request_timeout, run_index, run_iter, verbose, n_attempts, openai_client, questions, eqbench_version, language, REVISE, judge_params, test_model_outputs, process_fn) + process_questions(benchmark_type, model, ooba_instance, inference_engine, results, model_path, prompt_type, tokenizer, launch_ooba, ooba_request_timeout, run_index, run_iter, verbose, n_attempts, openai_client, questions, eqbench_version, language, REVISE, judge_params, test_model_outputs, process_fn, completion_tokens) if benchmark_type == 'judgemark': compute_judgemark_results(results, 
run_index, test_model_outputs, verbose) @@ -460,7 +464,7 @@ def run_benchmark(run_id, model_path, lora_path, prompt_type, quantization, ooba_params_global='', fast_download=False, hf_access_token=None, ooba_request_timeout=300, questions_fn=None, openai_client=None, language='en', - REVISE=False, benchmark_types=[], judge_params={}): + REVISE=False, benchmark_types=[], judge_params={}, completion_tokens=None): for benchmark_type in benchmark_types: if benchmark_type == 'eq-bench': @@ -476,7 +480,7 @@ def run_benchmark(run_id, model_path, lora_path, prompt_type, quantization, ooba_params_global, fast_download, hf_access_token, ooba_request_timeout, questions_fn, openai_client, language, - REVISE, benchmark_type) + REVISE, benchmark_type, completion_tokens=completion_tokens) elif benchmark_type == 'creative-writing': run_generic_benchmark(run_id, model_path, lora_path, prompt_type, quantization, @@ -491,7 +495,7 @@ def run_benchmark(run_id, model_path, lora_path, prompt_type, quantization, ooba_params_global, fast_download, hf_access_token, ooba_request_timeout, openai_client=openai_client, judge_params=judge_params, - benchmark_type=benchmark_type) + benchmark_type=benchmark_type, completion_tokens=completion_tokens) elif benchmark_type == 'judgemark': run_generic_benchmark(run_id, None, None, None, None, @@ -506,7 +510,7 @@ def run_benchmark(run_id, model_path, lora_path, prompt_type, quantization, ooba_params_global, fast_download, hf_access_token, ooba_request_timeout, openai_client=openai_client, judge_params=judge_params, - benchmark_type=benchmark_type) + benchmark_type=benchmark_type, completion_tokens=completion_tokens) diff --git a/lib/scoring.py b/lib/scoring.py index 884880f..39af55b 100644 --- a/lib/scoring.py +++ b/lib/scoring.py @@ -2,8 +2,15 @@ import math from lib.util import safe_dump +def remove_think_blocks(text): + """Remove all content between <think> and </think> tags.""" + return re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL) + # Parse the emotion intensity ratings
from the raw inference text def parse_answers(text, REVISE): + # First remove any think blocks + text = remove_think_blocks(text) + first_pass_answers = {} revised_answers = {} @@ -30,7 +37,8 @@ def parse_answers(text, REVISE): # we parse answers in German language ("de") def parse_answers_de(text, REVISE): - #print("Using german parsing.") + # First remove any think blocks + text = remove_think_blocks(text) first_pass_answers = {} revised_answers = {}