@@ -309,7 +309,7 @@ def get_accuracy_all_rounds(dataset, num_process_evaluate, method="selfdebug", t
return final_accuracy


-def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to support all methods
+def generate_and_evaluate(arguments):
"""
Takes in a single dspy example, generate code and evaluate it.

@@ -342,7 +342,7 @@ def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to supp
# assert False
## Initialize the code generator
if method == "selfdebug":
-## initialize debug lm to be 40mini : TODO(Alex): delete this if not work, or add a new argument for this if this works
+## initialize debug lm to be 40mini : TODO: delete this if not work, or add a new argument for this if this works
debug_lm = dspy.LM('openai/gpt-4o-mini', cache=use_dspy_cache)
test_program = CodeGeneratorWithSelfDebug(extracted_tests, num_round=args.num_round, n=args.n, temperature=args.temperature,
lm=lm, selection=args.selection, context=args.context, judge_lm=judge_lm, pre_computed_tests=tests,
@@ -357,7 +357,7 @@ def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to supp
test_program = NaiveCodeGeneratorNoDSPy(args)


-# TODO: @DL support oracle
+# TODO: support oracle
# if args.selection == "debug_all":
# eval_metric=live_code_bench_evaluate_batch
# else:
skythought/test-time-scaling/evaluate_multiprocess.py (4 changes: 2 additions & 2 deletions)
@@ -313,7 +313,7 @@ def get_accuracy_all_rounds(dataset, num_process_evaluate, method="selfdebug", t
return final_accuracy


-def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to support all methods
+def generate_and_evaluate(arguments): ##TODO take in a method here to support all methods
"""
Takes in a single dspy example, generate code and evaluate it.

@@ -346,7 +346,7 @@ def generate_and_evaluate(arguments): ##TODO Alex, take in a method here to supp
# assert False
## Initialize the code generator
if method == "selfdebug":
-## initialize debug lm to be 40mini : TODO(Alex): delete this if not work, or add a new argument for this if this works
+## initialize debug lm to be 40mini : TODO delete this if not work, or add a new argument for this if this works
debug_lm = dspy.LM('openai/gpt-4o-mini', cache=use_dspy_cache)
test_program = CodeGeneratorWithSelfDebug(extracted_tests, num_round=args.num_round, n=args.n, temperature=args.temperature,
lm=lm, selection=args.selection, context=args.context, judge_lm=judge_lm, pre_computed_tests=tests,
skythought/test-time-scaling/live_code_bench_execute.py (4 changes: 2 additions & 2 deletions)
@@ -200,7 +200,7 @@ def unsafe_lcb_run_timeout_tests(timeout_input_list, completion, timeout, is_std
result = manager.list()
p = multiprocessing.Process(target=run_single_timeout_test_list, args=(timeout_input_list, completion, timeout, is_stdin, result))
p.start()
-p.join(timeout = timeout * len(timeout_input_list) + 3) # TODO Alex: what should be the timeout here?
+p.join(timeout = timeout * len(timeout_input_list) + 3) # TODO what should be the timeout here?
if p.is_alive():
p.kill()

@@ -247,7 +247,7 @@ def unsafe_lcb_runTests(problem, completion, timeout, runtime_debug, is_extracte
p.start()
# print(f"There are {len(test_cases)} test cases.")
if fast_check:
-p.join(fast_check_global_time_out) # TODO Alex: Check whether number of task cases is correct
+p.join(fast_check_global_time_out) # TODO Check whether number of task cases is correct
else:
p.join(timeout = (timeout+1) * len(test_cases) + 5)
if p.is_alive():
skythought/test-time-scaling/live_code_bench_program.py (12 changes: 6 additions & 6 deletions)
@@ -882,7 +882,7 @@ def forward(
):
prompt = example["prompt"]

-## TODO: (Alex) Here make sure this is the right place to read cache
+## TODO: Here make sure this is the right place to read cache
if self.args.load_cached_preds:
## load the cached prediction completions and get the public accuracy to replicate zipped_history
codes = self.cached_preds_dict[task_id]['codes']
@@ -961,7 +961,7 @@ def forward(
api_key=os.environ.get("OPENAI_API_KEY"), # This is the default and can be omitted
)
# print(f"Using {self.args.generator} to generate code")
-# TODO: DL, please clean up these massy naming
+# TODO: please clean up these massy naming
generator = self.args.generator
if generator == "4o-mini":
generator = "gpt-4o-mini"
@@ -1090,7 +1090,7 @@ def forward(
zipped_history[n].append(("", 0, "", "", 0)) ## if any exception occur (like context window limit exceeded, fallback to simply empty completion)
# print(zipped_history[-1][-1])
# print(f"=" * 10 + "Finished generating selfdebug prediction" + "=" * 10)
-return self.selection_function(zipped_history, task_id, prompt, is_stdin, example), None ## (Alex) example is newly added, could get rid of the redundancy for prompt and is_stdin
+return self.selection_function(zipped_history, task_id, prompt, is_stdin, example), None ## example is newly added, could get rid of the redundancy for prompt and is_stdin

def get_anchor_break_and_feedback(self, prompt, pred, extracted_tests, public_test_acc, public_test_feedback_string, generated_test_anchors):
anchor_break = False
@@ -1312,7 +1312,7 @@ def selection_function(self, zipped_history, task_id, prompt, is_stdin, example)
]
# print(private_tests)
if self.selection == "generated_tests":
-with dspy.context(lm=self.debug_lm): ## TODO (Alex): here I simply used debug_lm because debug_lm is fixed to be 4omini, we can make this an argument
+with dspy.context(lm=self.debug_lm): ## TODO here I simply used debug_lm because debug_lm is fixed to be 4omini, we can make this an argument
timeout_input_list = generate_tests_for_one_example(example, generation_fun=generate_timeout_tests_repeat,num_timeout_tests=3)
best_rate = -1
public_correct_samples_pass_rate = []
@@ -1338,7 +1338,7 @@ def selection_function(self, zipped_history, task_id, prompt, is_stdin, example)
preds_pass = [
list(
map(
-lambda test: 1# test["count"] # @DL: This is weird, should just reduce the same tests.
+lambda test: 1# test["count"] # This is weird, should just reduce the same tests.
if check_test(
[test["test"]], post_process_code(public_correct_sample), 0, prompt, "dummy", runtime_debug=True, raw=True, is_extracted=False
)[0]
@@ -2042,7 +2042,7 @@ def generate_tests_for_whole_dataset(
zero_correct_tests_count,
)

-def generate_tests_for_one_example(example, ##TODO :Alex, make sure what ever takes the output of this function to be able to handle the new output format
+def generate_tests_for_one_example(example, ##TODO Make sure what ever takes the output of this function to be able to handle the new output format
generation_fun,
completions = None,
judge_lm=None,