@@ -309,7 +309,7 @@ def get_accuracy_all_rounds(dataset, num_process_evaluate, method="selfdebug", t
return final_accuracy


-def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to support all methods
+def generate_and_evaluate(arguments):
"""
Takes in a single dspy example, generate code and evaluate it.

@@ -342,7 +342,7 @@ def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to supp
# assert False
## Initialize the code generator
if method == "selfdebug":
-## initialize debug lm to be 40mini : TODO(Alex): delete this if not work, or add a new argument for this if this works
+## initialize debug lm to be 40mini : TODO: delete this if not work, or add a new argument for this if this works
debug_lm = dspy.LM('openai/gpt-4o-mini', cache=use_dspy_cache)
test_program = CodeGeneratorWithSelfDebug(extracted_tests, num_round=args.num_round, n=args.n, temperature=args.temperature,
lm=lm, selection=args.selection, context=args.context, judge_lm=judge_lm, pre_computed_tests=tests,
@@ -357,7 +357,7 @@ def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to supp
test_program = NaiveCodeGeneratorNoDSPy(args)


-# TODO: @DL support oracle
+# TODO: support oracle
# if args.selection == "debug_all":
# eval_metric=live_code_bench_evaluate_batch
# else:
skythought/test-time-scaling/evaluate_multiprocess.py (4 changes: 2 additions & 2 deletions)
@@ -313,7 +313,7 @@ def get_accuracy_all_rounds(dataset, num_process_evaluate, method="selfdebug", t
return final_accuracy


-def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to support all methods
+def generate_and_evaluate(arguments): ##TODO take in a method here to support all methods
"""
Takes in a single dspy example, generate code and evaluate it.

@@ -346,7 +346,7 @@ def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to supp
# assert False
## Initialize the code generator
if method == "selfdebug":
-## initialize debug lm to be 40mini : TODO(Alex): delete this if not work, or add a new argument for this if this works
+## initialize debug lm to be 40mini : TODO delete this if not work, or add a new argument for this if this works
debug_lm = dspy.LM('openai/gpt-4o-mini', cache=use_dspy_cache)
test_program = CodeGeneratorWithSelfDebug(extracted_tests, num_round=args.num_round, n=args.n, temperature=args.temperature,
lm=lm, selection=args.selection, context=args.context, judge_lm=judge_lm, pre_computed_tests=tests,
skythought/test-time-scaling/live_code_bench_execute.py (4 changes: 2 additions & 2 deletions)
@@ -200,7 +200,7 @@ def unsafe_lcb_run_timeout_tests(timeout_input_list, completion, timeout, is_std
result = manager.list()
p = multiprocessing.Process(target=run_single_timeout_test_list, args=(timeout_input_list, completion, timeout, is_stdin, result))
p.start()
-p.join(timeout = timeout * len(timeout_input_list) + 3) # TODO Alex: what should be the timeout here?
+p.join(timeout = timeout * len(timeout_input_list) + 3) # TODO what should be the timeout here?
if p.is_alive():
p.kill()

@@ -247,7 +247,7 @@ def unsafe_lcb_runTests(problem, completion, timeout, runtime_debug, is_extracte
p.start()
# print(f"There are {len(test_cases)} test cases.")
if fast_check:
-p.join(fast_check_global_time_out) # TODO Alex: Check whether number of task cases is correct
+p.join(fast_check_global_time_out) # TODO Check whether number of task cases is correct
else:
p.join(timeout = (timeout+1) * len(test_cases) + 5)
if p.is_alive():
skythought/test-time-scaling/live_code_bench_program.py (12 changes: 6 additions & 6 deletions)
@@ -882,7 +882,7 @@ def forward(
):
prompt = example["prompt"]

-## TODO: (Alex) Here make sure this is the right place to read cache
+## TODO: Here make sure this is the right place to read cache
if self.args.load_cached_preds:
## load the cached prediction completions and get the public accuracy to replicate zipped_history
codes = self.cached_preds_dict[task_id]['codes']
@@ -961,7 +961,7 @@ def forward(
api_key=os.environ.get("OPENAI_API_KEY"), # This is the default and can be omitted
)
# print(f"Using {self.args.generator} to generate code")
-# TODO: DL, please clean up these massy naming
+# TODO: please clean up these massy naming
generator = self.args.generator
if generator == "4o-mini":
generator = "gpt-4o-mini"
@@ -1090,7 +1090,7 @@ def forward(
zipped_history[n].append(("", 0, "", "", 0)) ## if any exception occur (like context window limit exceeded, fallback to simply empty completion)
# print(zipped_history[-1][-1])
# print(f"=" * 10 + "Finished generating selfdebug prediction" + "=" * 10)
-return self.selection_function(zipped_history, task_id, prompt, is_stdin, example), None ## (Alex) example is newly added, could get rid of the redundancy for prompt and is_stdin
+return self.selection_function(zipped_history, task_id, prompt, is_stdin, example), None ## example is newly added, could get rid of the redundancy for prompt and is_stdin

def get_anchor_break_and_feedback(self, prompt, pred, extracted_tests, public_test_acc, public_test_feedback_string, generated_test_anchors):
anchor_break = False
@@ -1312,7 +1312,7 @@ def selection_function(self, zipped_history, task_id, prompt, is_stdin, example)
]
# print(private_tests)
if self.selection == "generated_tests":
-with dspy.context(lm=self.debug_lm): ## TODO (Alex): here I simply used debug_lm because debug_lm is fixed to be 4omini, we can make this an argument
+with dspy.context(lm=self.debug_lm): ## TODO here I simply used debug_lm because debug_lm is fixed to be 4omini, we can make this an argument
timeout_input_list = generate_tests_for_one_example(example, generation_fun=generate_timeout_tests_repeat,num_timeout_tests=3)
best_rate = -1
public_correct_samples_pass_rate = []
@@ -1338,7 +1338,7 @@ def selection_function(self, zipped_history, task_id, prompt, is_stdin, example)
preds_pass = [
list(
map(
-lambda test: 1# test["count"] # @DL: This is weird, should just reduce the same tests.
+lambda test: 1# test["count"] # This is weird, should just reduce the same tests.
if check_test(
[test["test"]], post_process_code(public_correct_sample), 0, prompt, "dummy", runtime_debug=True, raw=True, is_extracted=False
)[0]
@@ -2042,7 +2042,7 @@ def generate_tests_for_whole_dataset(
zero_correct_tests_count,
)

-def generate_tests_for_one_example(example, ##TODO :Alex, make sure what ever takes the output of this function to be able to handle the new output format
+def generate_tests_for_one_example(example, ##TODO Make sure what ever takes the output of this function to be able to handle the new output format
generation_fun,
completions = None,
judge_lm=None,