From 086faf404a10dd6aa9e2541507d767126f5b0fba Mon Sep 17 00:00:00 2001
From: Dacheng Li
Date: Sun, 18 May 2025 03:52:25 +0000
Subject: [PATCH] Clean S* TODOs

---
 .../codecontest_evaluate_multiprocess.py          |  6 +++---
 .../test-time-scaling/evaluate_multiprocess.py    |  4 ++--
 .../test-time-scaling/live_code_bench_execute.py  |  4 ++--
 .../test-time-scaling/live_code_bench_program.py  | 12 ++++++------
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/skythought/test-time-scaling/codecontest_evaluate_multiprocess.py b/skythought/test-time-scaling/codecontest_evaluate_multiprocess.py
index a72301b3..020c986f 100644
--- a/skythought/test-time-scaling/codecontest_evaluate_multiprocess.py
+++ b/skythought/test-time-scaling/codecontest_evaluate_multiprocess.py
@@ -309,7 +309,7 @@ def get_accuracy_all_rounds(dataset, num_process_evaluate, method="selfdebug", t
     return final_accuracy
 
 
-def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to support all methods
+def generate_and_evaluate(arguments):
     """
     Takes in a single dspy example, generate code and evaluate it.
 
@@ -342,7 +342,7 @@ def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to supp
         # assert False
     ## Initialize the code generator
     if method == "selfdebug":
-        ## initialize debug lm to be 40mini : TODO(Alex): delete this if not work, or add a new argument for this if this works
+        ## initialize debug lm to be 4o-mini. TODO: delete this if it doesn't work, or add a new argument for it if it does
         debug_lm = dspy.LM('openai/gpt-4o-mini', cache=use_dspy_cache)
         test_program = CodeGeneratorWithSelfDebug(extracted_tests, num_round=args.num_round, n=args.n, temperature=args.temperature,
                                                   lm=lm, selection=args.selection, context=args.context, judge_lm=judge_lm, pre_computed_tests=tests,
@@ -357,7 +357,7 @@ def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to supp
 
         test_program = NaiveCodeGeneratorNoDSPy(args)
 
-    # TODO: @DL support oracle
+    # TODO: support oracle
     # if args.selection == "debug_all":
     #     eval_metric=live_code_bench_evaluate_batch
     # else:
diff --git a/skythought/test-time-scaling/evaluate_multiprocess.py b/skythought/test-time-scaling/evaluate_multiprocess.py
index aa2aa427..f5811d87 100644
--- a/skythought/test-time-scaling/evaluate_multiprocess.py
+++ b/skythought/test-time-scaling/evaluate_multiprocess.py
@@ -313,7 +313,7 @@ def get_accuracy_all_rounds(dataset, num_process_evaluate, method="selfdebug", t
     return final_accuracy
 
 
-def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to support all methods
+def generate_and_evaluate(arguments): ## TODO: take in a method here to support all methods
     """
     Takes in a single dspy example, generate code and evaluate it.
 
@@ -346,7 +346,7 @@ def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to supp
         # assert False
     ## Initialize the code generator
     if method == "selfdebug":
-        ## initialize debug lm to be 40mini : TODO(Alex): delete this if not work, or add a new argument for this if this works
+        ## initialize debug lm to be 4o-mini. TODO: delete this if it doesn't work, or add a new argument for it if it does
         debug_lm = dspy.LM('openai/gpt-4o-mini', cache=use_dspy_cache)
         test_program = CodeGeneratorWithSelfDebug(extracted_tests, num_round=args.num_round, n=args.n, temperature=args.temperature,
                                                   lm=lm, selection=args.selection, context=args.context, judge_lm=judge_lm, pre_computed_tests=tests,
diff --git a/skythought/test-time-scaling/live_code_bench_execute.py b/skythought/test-time-scaling/live_code_bench_execute.py
index 0e667586..7da37596 100644
--- a/skythought/test-time-scaling/live_code_bench_execute.py
+++ b/skythought/test-time-scaling/live_code_bench_execute.py
@@ -200,7 +200,7 @@ def unsafe_lcb_run_timeout_tests(timeout_input_list, completion, timeout, is_std
     result = manager.list()
     p = multiprocessing.Process(target=run_single_timeout_test_list, args=(timeout_input_list, completion, timeout, is_stdin, result))
     p.start()
-    p.join(timeout = timeout * len(timeout_input_list) + 3) # TODO Alex: what should be the timeout here?
+    p.join(timeout = timeout * len(timeout_input_list) + 3) # TODO: what should the timeout be here?
 
     if p.is_alive():
         p.kill()
@@ -247,7 +247,7 @@ def unsafe_lcb_runTests(problem, completion, timeout, runtime_debug, is_extracte
     p.start()
     # print(f"There are {len(test_cases)} test cases.")
     if fast_check:
-        p.join(fast_check_global_time_out) # TODO Alex: Check whether number of task cases is correct
+        p.join(fast_check_global_time_out) # TODO: check whether the number of test cases is correct
     else:
         p.join(timeout = (timeout+1) * len(test_cases) + 5)
     if p.is_alive():
diff --git a/skythought/test-time-scaling/live_code_bench_program.py b/skythought/test-time-scaling/live_code_bench_program.py
index ad8363c5..10cdc65e 100644
--- a/skythought/test-time-scaling/live_code_bench_program.py
+++ b/skythought/test-time-scaling/live_code_bench_program.py
@@ -882,7 +882,7 @@ def forward(
     ):
         prompt = example["prompt"]
 
-        ## TODO: (Alex) Here make sure this is the right place to read cache
+        ## TODO: make sure this is the right place to read the cache
        if self.args.load_cached_preds: ## load the cached prediction completions and get the public accuracy to replicate zipped_history
            codes = self.cached_preds_dict[task_id]['codes']
 
@@ -961,7 +961,7 @@ def forward(
             api_key=os.environ.get("OPENAI_API_KEY"),  # This is the default and can be omitted
         )
         # print(f"Using {self.args.generator} to generate code")
-        # TODO: DL, please clean up these massy naming
+        # TODO: clean up this messy naming
         generator = self.args.generator
         if generator == "4o-mini":
             generator = "gpt-4o-mini"
@@ -1090,7 +1090,7 @@ def forward(
                     zipped_history[n].append(("", 0, "", "", 0)) ## if any exception occur (like context window limit exceeded, fallback to simply empty completion)
             # print(zipped_history[-1][-1])
         # print(f"=" * 10 + "Finished generating selfdebug prediction" + "=" * 10)
-        return self.selection_function(zipped_history, task_id, prompt, is_stdin, example), None ## (Alex) example is newly added, could get rid of the redundancy for prompt and is_stdin
+        return self.selection_function(zipped_history, task_id, prompt, is_stdin, example), None ## example is newly added; could get rid of the redundancy for prompt and is_stdin
 
     def get_anchor_break_and_feedback(self, prompt, pred, extracted_tests, public_test_acc, public_test_feedback_string, generated_test_anchors):
         anchor_break = False
@@ -1312,7 +1312,7 @@ def selection_function(self, zipped_history, task_id, prompt, is_stdin, example)
         ]
         # print(private_tests)
         if self.selection == "generated_tests":
-            with dspy.context(lm=self.debug_lm): ## TODO (Alex): here I simply used debug_lm because debug_lm is fixed to be 4omini, we can make this an argument
+            with dspy.context(lm=self.debug_lm): ## TODO: here I simply used debug_lm because it is fixed to gpt-4o-mini; we could make this an argument
                 timeout_input_list = generate_tests_for_one_example(example, generation_fun=generate_timeout_tests_repeat,num_timeout_tests=3)
                 best_rate = -1
                 public_correct_samples_pass_rate = []
@@ -1338,7 +1338,7 @@ def selection_function(self, zipped_history, task_id, prompt, is_stdin, example)
                     preds_pass = [
                         list(
                             map(
-                                lambda test: 1# test["count"] # @DL: This is weird, should just reduce the same tests.
+                                lambda test: 1 # test["count"] # This is weird, should just reduce the same tests.
                                 if check_test(
                                     [test["test"]], post_process_code(public_correct_sample), 0, prompt, "dummy", runtime_debug=True, raw=True, is_extracted=False
                                 )[0]
@@ -2042,7 +2042,7 @@ def generate_tests_for_whole_dataset(
         zero_correct_tests_count,
     )
 
-def generate_tests_for_one_example(example, ##TODO :Alex, make sure what ever takes the output of this function to be able to handle the new output format
+def generate_tests_for_one_example(example, ## TODO: make sure whatever consumes the output of this function can handle the new output format
                                    generation_fun,
                                    completions = None,
                                    judge_lm=None,
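
The two "what should the timeout be here?" TODOs in live_code_bench_execute.py both concern the same pattern: all tests run inside one child process, and the parent bounds the join by a per-test budget times the number of tests plus a small slack, killing the child if it is still alive afterward. A minimal sketch of that pattern, under the assumption that the names run_with_global_timeout and _run_tests are hypothetical stand-ins for the repo's run_single_timeout_test_list machinery:

    import multiprocessing
    import time

    def _run_tests(tests, results):
        # Hypothetical stand-in for run_single_timeout_test_list:
        # execute each test and record a pass/fail flag.
        for t in tests:
            time.sleep(0.01)  # pretend to execute the test
            results.append(True)

    def run_with_global_timeout(tests, per_test_timeout, slack=3):
        # Run all tests in one child process, as the patched code does.
        manager = multiprocessing.Manager()
        results = manager.list()
        p = multiprocessing.Process(target=_run_tests, args=(tests, results))
        p.start()
        # Global budget mirrors the hunk: per-test timeout * number of tests + slack.
        p.join(timeout=per_test_timeout * len(tests) + slack)
        if p.is_alive():
            p.kill()  # child exceeded the budget; unfinished tests count as failures
        return list(results)

    if __name__ == "__main__":
        print(run_with_global_timeout(["t1", "t2", "t3"], per_test_timeout=1))

The constant slack (+3 and +5 in the hunks) covers process startup and teardown; any test that never finishes simply leaves no entry in the shared results list.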
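The "clean up this messy naming" TODO refers to the chain of if-statements that rewrites generator aliases ("4o-mini" becomes "gpt-4o-mini", and so on). One conventional cleanup is a lookup table; this sketch uses a hypothetical GENERATOR_ALIASES dict, since only the 4o-mini mapping is visible in the hunk above:

    # Hypothetical alias table; only the "4o-mini" entry appears in the hunk above.
    GENERATOR_ALIASES = {
        "4o-mini": "gpt-4o-mini",
    }

    def canonical_generator_name(name: str) -> str:
        # Fall back to the given name when no alias is registered.
        return GENERATOR_ALIASES.get(name, name)

    assert canonical_generator_name("4o-mini") == "gpt-4o-mini"
    assert canonical_generator_name("gpt-4o") == "gpt-4o"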