From 733fba966f8ba1ac606512eeba5c2f75cb63b282 Mon Sep 17 00:00:00 2001 From: Weiwei Yang Date: Sun, 19 Nov 2023 04:25:01 +0000 Subject: [PATCH 1/6] generate full run configs --- build_open_spec.py | 233 ++++++++++++++++++ build_secret_run_spec.py | 32 +-- configs/run_specs_closed_5000_budget.conf | 34 +++ configs/run_specs_open_2000_budget.conf | 155 ++++++++++++ neurIPS_eval_scripts/eval_metrics.py | 4 +- ...t.conf => run_specs_closed_100_budget.conf | 26 +- ...s_full_closed_eval_coarse_3000_budget.conf | 48 ---- run_specs_open_100_budget.conf | 155 ++++++++++++ 8 files changed, 601 insertions(+), 86 deletions(-) create mode 100644 build_open_spec.py create mode 100644 configs/run_specs_closed_5000_budget.conf create mode 100644 configs/run_specs_open_2000_budget.conf rename run_specs_full_closed_eval_coarse_100_budget.conf => run_specs_closed_100_budget.conf (51%) delete mode 100644 run_specs_full_closed_eval_coarse_3000_budget.conf create mode 100644 run_specs_open_100_budget.conf diff --git a/build_open_spec.py b/build_open_spec.py new file mode 100644 index 00000000..982c2384 --- /dev/null +++ b/build_open_spec.py @@ -0,0 +1,233 @@ +entries = [ + #bigbench + # 1. auto_debugging: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/auto_debugging + {'scenario':'auto_debugging','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=auto_debugging,subtask=", 'priority': 1}, + + # 3. code_line_description: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/code_line_description + {'scenario':'code_line_description','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=code_line_description,subtask=", 'priority': 1}, + + # 4. conceptual_combinations: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/conceptual_combinations + {'scenario':'conceptual_combinations','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=contradictions", 'priority': 1}, + {'scenario':'conceptual_combinations','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=emergent_properties", 'priority': 1}, + {'scenario':'conceptual_combinations','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=fanciful_fictional_combinations", 'priority': 1}, + {'scenario':'conceptual_combinations','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=homonyms", 'priority': 1}, + {'scenario':'conceptual_combinations','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=invented_words", 'priority': 1}, + + # 6. emoji_movie: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/emoji_movie + {'scenario':'emoji_movie','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=emoji_movie,subtask=", 'priority': 1}, + + # 7. 
formal_fallacies_syllogisms_negation: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/formal_fallacies_syllogisms_negation + {'scenario':'formal_fallacies_syllogisms_negation','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=formal_fallacies_syllogisms_negation,subtask=", 'priority': 1}, + + # 8. hindu_knowledge: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/hindu_knowledge + # {'scenario':'hindu_knowledge','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=hindu_knowledge,subtask=", 'priority': 1}, + + # 9. known_unknowns: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/known_unknowns + {'scenario':'known_unknowns','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=known_unknowns,subtask=", 'priority': 1}, + + # 11. linguistics_puzzles: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/linguistics_puzzles + {'scenario':'linguistics_puzzles','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=linguistics_puzzles,subtask=", 'priority': 1}, + + # 12. logic_grid_puzzle: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/logic_grid_puzzle + {'scenario':'logic_grid_puzzle','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=logic_grid_puzzle,subtask=", 'priority': 1}, + + # 13. logical_deduction: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/logical_deduction + {'scenario':'logical_deduction','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=logical_deduction,subtask=three_objects", 'priority': 1}, + {'scenario':'logical_deduction','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=logical_deduction,subtask=five_objects", 'priority': 1}, + {'scenario':'logical_deduction','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=logical_deduction,subtask=seven_objects", 'priority': 1}, + + # 14. misconceptions_russian: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/misconceptions_russian + # {'scenario':'misconceptions_russian','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=misconceptions_russian,subtask=", 'priority': 1}, + + # 15. novel_concepts: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/novel_concepts + {'scenario':'novel_concepts','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=novel_concepts,subtask=", 'priority': 1}, + + # 16. operators: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/operators + {'scenario':'operator','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=operators,subtask=", 'priority': 1}, + + # 17. parsinlu_reading_comprehension: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/parsinlu_reading_comprehension + # {'scenario':'parsinlu_reading_comprehension','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=parsinlu_reading_comprehension,subtask=", 'priority': 1}, + + # 18. 
play_dialog_same_or_different: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/play_dialog_same_or_different + {'scenario':'play_dialog_same_or_different','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=play_dialog_same_or_different,subtask=", 'priority': 1}, + + # 19. repeat_copy_logic: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/repeat_copy_logic + {'scenario':'repeat_copy_logic','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=repeat_copy_logic,subtask=", 'priority': 1}, + + # 20. strange_stories: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/strange_stories + {'scenario':'strange_stories','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=strange_stories,subtask=boolean", 'priority': 1}, + {'scenario':'strange_stories','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=strange_stories,subtask=multiple_choice", 'priority': 1}, + + # 21. strategyqa: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/strategyqa + {'scenario':'strategyqa','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=strategyqa,subtask=", 'priority': 1}, + + # 22. symbol_interpretation: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/symbol_interpretation + {'scenario':'symbol_interpretation','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=adversarial", 'priority': 1}, + {'scenario':'symbol_interpretation','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=emoji_agnostic", 'priority': 1}, + {'scenario':'symbol_interpretation','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=name_agnostic", 'priority': 1}, + {'scenario':'symbol_interpretation','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=plain", 'priority': 1}, + {'scenario':'symbol_interpretation','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=tricky", 'priority': 1}, + + # 23. vitaminc_fact_verification: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/vitaminc_fact_verification + {'scenario':'vitaminc_fact_verification','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=vitaminc_fact_verification,subtask=", 'priority': 1}, + + # 24. 
winowhy: https://github.com/google/big-bench/tree/main/bigbench/benchmark_tasks/winowhy + {'scenario':'winowhy','description': "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=winowhy,subtask=", 'priority': 1}, + + # MMLU STEM: Medicine/Biology + {'scenario':'medicine_biology','description': "mmlu:model=neurips/local,subject=anatomy,data_augmentation=canonical", 'priority': 2}, + {'scenario':'medicine_biology','description': "mmlu:model=neurips/local,subject=college_medicine,data_augmentation=canonical", 'priority': 2}, + {'scenario':'medicine_biology','description': "mmlu:model=neurips/local,subject=college_biology,data_augmentation=canonical", 'priority': 2}, + {'scenario':'medicine_biology','description': "mmlu:model=neurips/local,subject=high_school_biology,data_augmentation=canonical", 'priority': 2}, + + # MMLU STEM: CS + {'scenario':'computer_science','description': "mmlu:model=neurips/local,subject=college_computer_science,data_augmentation=canonical", 'priority': 2}, + {'scenario':'computer_science','description': "mmlu:model=neurips/local,subject=high_school_computer_science,data_augmentation=canonical", 'priority': 2}, + {'scenario':'computer_science','description': "mmlu:model=neurips/local,subject=computer_security,data_augmentation=canonical", 'priority': 2}, + {'scenario':'computer_science','description': "mmlu:model=neurips/local,subject=electrical_engineering,data_augmentation=canonical", 'priority': 2}, + {'scenario':'computer_science','description': "mmlu:model=neurips/local,subject=machine_learning,data_augmentation=canonical", 'priority': 2}, + + # MMLU STEM: Math + {'scenario':'math','description': "mmlu:model=neurips/local,subject=high_school_mathematics,data_augmentation=canonical", 'priority': 2}, + {'scenario':'math','description': "mmlu:model=neurips/local,subject=college_mathematics,data_augmentation=canonical", 'priority': 2}, + {'scenario':'math','description': "mmlu:model=neurips/local,subject=abstract_algebra,data_augmentation=canonical", 'priority': 2}, + {'scenario':'math','description': "mmlu:model=neurips/local,subject=high_school_statistics,data_augmentation=canonical", 'priority': 2}, + + # MMLU STEM: Chemistry/Physics + {'scenario':'physics_chemistry','description': "mmlu:model=neurips/local,subject=college_chemistry,data_augmentation=canonical", 'priority': 2}, + {'scenario':'physics_chemistry','description': "mmlu:model=neurips/local,subject=high_school_chemistry,data_augmentation=canonical", 'priority': 2}, + {'scenario':'physics_chemistry','description': "mmlu:model=neurips/local,subject=high_school_physics,data_augmentation=canonical", 'priority': 2}, + {'scenario':'physics_chemistry','description': "mmlu:model=neurips/local,subject=college_physics,data_augmentation=canonical", 'priority': 2}, + {'scenario':'physics_chemistry','description': "mmlu:model=neurips/local,subject=astronomy,data_augmentation=canonical", 'priority': 2}, + + # MMLU Humanities: Formal reasoning + {'scenario':'formal_reasoning','description': "mmlu:model=neurips/local,subject=formal_logic,data_augmentation=canonical", 'priority': 2}, + {'scenario':'formal_reasoning','description': "mmlu:model=neurips/local,subject=logical_fallacies,data_augmentation=canonical", 'priority': 2}, + {'scenario':'formal_reasoning','description': "mmlu:model=neurips/local,subject=philosophy,data_augmentation=canonical", 'priority': 2}, + {'scenario':'formal_reasoning','description': "mmlu:model=neurips/local,subject=moral_disputes,data_augmentation=canonical", 
'priority': 2},
+    {'scenario':'formal_reasoning','description': "mmlu:model=neurips/local,subject=moral_scenarios,data_augmentation=canonical", 'priority': 2},
+
+    # MMLU Humanities: Law
+    {'scenario':'law','description': "mmlu:model=neurips/local,subject=professional_law,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'law','description': "mmlu:model=neurips/local,subject=international_law,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'law','description': "mmlu:model=neurips/local,subject=jurisprudence,data_augmentation=canonical", 'priority': 2},
+
+    # MMLU Humanities: History
+    {'scenario':'history','description': "mmlu:model=neurips/local,subject=high_school_european_history,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'history','description': "mmlu:model=neurips/local,subject=high_school_us_history,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'history','description': "mmlu:model=neurips/local,subject=high_school_world_history,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'history','description': "mmlu:model=neurips/local,subject=prehistory,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'history','description': "mmlu:model=neurips/local,subject=world_religions,data_augmentation=canonical", 'priority': 2},
+
+    # MMLU Other: Business
+    {'scenario':'business','description': "mmlu:model=neurips/local,subject=business_ethics,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'business','description': "mmlu:model=neurips/local,subject=global_facts,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'business','description': "mmlu:model=neurips/local,subject=management,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'business','description': "mmlu:model=neurips/local,subject=marketing,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'business','description': "mmlu:model=neurips/local,subject=miscellaneous,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'business','description': "mmlu:model=neurips/local,subject=professional_accounting,data_augmentation=canonical", 'priority': 2},
+
+    # MMLU Other: Health
+    {'scenario':'health','description': "mmlu:model=neurips/local,subject=nutrition,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'health','description': "mmlu:model=neurips/local,subject=human_aging,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'health','description': "mmlu:model=neurips/local,subject=clinical_knowledge,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'health','description': "mmlu:model=neurips/local,subject=medical_genetics,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'health','description': "mmlu:model=neurips/local,subject=professional_medicine,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'health','description': "mmlu:model=neurips/local,subject=virology,data_augmentation=canonical", 'priority': 2},
+
+    # MMLU Social Sciences: Social studies
+    {'scenario':'social_studies','description': "mmlu:model=neurips/local,subject=high_school_government_and_politics,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'social_studies','description': "mmlu:model=neurips/local,subject=high_school_geography,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'social_studies','description': "mmlu:model=neurips/local,subject=us_foreign_policy,data_augmentation=canonical", 'priority': 2},
+    {'scenario':'social_studies','description': 
"mmlu:model=neurips/local,subject=public_relations,data_augmentation=canonical", 'priority': 2}, + {'scenario':'social_studies','description': "mmlu:model=neurips/local,subject=security_studies,data_augmentation=canonical", 'priority': 2}, + + # MMLU Social Sciences: Human behavior + {'scenario':'human_behavior','description': "mmlu:model=neurips/local,subject=high_school_psychology,data_augmentation=canonical", 'priority': 2}, + {'scenario':'human_behavior','description': "mmlu:model=neurips/local,subject=human_sexuality,data_augmentation=canonical", 'priority': 2}, + {'scenario':'human_behavior','description': "mmlu:model=neurips/local,subject=professional_psychology,data_augmentation=canonical", 'priority': 2}, + {'scenario':'human_behavior','description': "mmlu:model=neurips/local,subject=sociology,data_augmentation=canonical", 'priority': 2}, + + # MMLU Social Sciences: Economics + {'scenario':'economics','description': "mmlu:model=neurips/local,subject=high_school_microeconomics,data_augmentation=canonical", 'priority': 2}, + {'scenario':'economics','description': "mmlu:model=neurips/local,subject=econometrics,data_augmentation=canonical", 'priority': 2}, + {'scenario':'economics','description': "mmlu:model=neurips/local,subject=high_school_macroeconomics,data_augmentation=canonical", 'priority': 2}, + + # Truthful QA + {'scenario':'truthful_qa','description': "truthful_qa:task=mc_single,model=neurips/local,data_augmentation=canonical", 'priority': 1}, + + # CNN/daily mail + #{'scenario':'truthful_qa','description': "summarization_cnndm:model=neurips/local", 'priority': 1}, + # GSM + {'scenario':'gsm','description': "gsm:model=neurips/local", 'priority': 1}, + # BBQ + {'scenario':'bbq','description': "bbq:subject=all,model=neurips/local", 'priority': 1}, + +] + +def generate_equal_sum_list(V, N): + # Calculate the base value that will be repeated. + base_value = V // N + # Calculate the remainder for distribution. + remainder = V % N + + # Create the list with base_value repeated N times. + result = [base_value] * N + + # Distribute the remainder evenly among the elements. + for i in range(remainder): + result[i] += 1 + + return result + +import pandas as pd +import argparse + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser( + description=''' + This method automatically generates a configuration file for the neurips_llm_efficiency_challenge + + Calling it with: `python build_run_specs_full.py --example_budget=600` will produce a conf file + with a total of 600 examples distributed evenly across scenarios as also defined here. 
+        ''',
+    )
+    parser.add_argument("--example_budget", required=True, type=int, help='total number of examples to distribute across scenarios')
+    args = parser.parse_args()
+
+    # count subscenarios per scenario and split the example budget evenly across scenarios
+    df = pd.DataFrame(entries)
+    scenario_count_dict = df.value_counts('scenario').to_dict()
+    n_scenarios = len(df.scenario.unique())
+    max_eval_instances_per_scenario = generate_equal_sum_list(args.example_budget, n_scenarios)
+
+    # get a dict mapping each scenario to a list of example counts, one per subscenario
+    scenario_n_examples_dict = {}
+    for scenario, n_subscenarios in scenario_count_dict.items():
+        cur_max_eval_instances_per_scenario = max_eval_instances_per_scenario.pop()
+        scenario_n_examples_dict[scenario] = generate_equal_sum_list(cur_max_eval_instances_per_scenario,n_subscenarios)
+
+    for i in range(len(entries)):
+        cur_scenario = entries[i]['scenario']
+        v = scenario_n_examples_dict[cur_scenario].pop()
+        entries[i]['max_eval_instances'] = v
+        # print(f"added {v} to {entries[i]['max_eval_instances']}")
+
+    with open(f'./run_specs_open_{args.example_budget}_budget.conf','w') as f:
+        f.write('entries: [\n')
+        last_scenario = ''
+        for entry in entries:
+            cur_scenario = entry['scenario']
+            if cur_scenario != last_scenario:
+                f.write(f'\n# {cur_scenario}\n')
+                print(entry)
+                last_scenario = cur_scenario
+            f.write('{')
+            f.write(f'description: """{entry["description"]}'.replace('"""','"'))  # the """ placeholder is collapsed to a single " quote
+            f.write(f',max_eval_instances={entry["max_eval_instances"]}""",priority: 1'.replace('"""','"'))
+            f.write('}\n')
+        f.write(']')
+
+    print(f'Saved ./run_specs_open_{args.example_budget}_budget.conf')
diff --git a/build_secret_run_spec.py b/build_secret_run_spec.py
index 3b379513..1873ff72 100644
--- a/build_secret_run_spec.py
+++ b/build_secret_run_spec.py
@@ -12,13 +12,13 @@
     {'scenario': 'ethics', 'description': "ethics_utilitarianism:model=neurips/local,data_augmentation=canonical", 'priority': 1},
 
     ## Math datasets
-    {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=1,use_official_examples=True", 'priority': 2},
-    {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=1,use_official_examples=True", 'priority': 2},
-    {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=1,use_official_examples=True", 'priority': 2},
-    {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=1,use_official_examples=True", 'priority': 2},
-    {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=1,use_official_examples=True", 'priority': 2},
-    {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=1,use_official_examples=True", 'priority': 2},
-    {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=1,use_official_examples=True", 'priority': 2},
+#    {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=1,use_official_examples=True", 'priority': 2},
+#    {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=1,use_official_examples=True", 'priority': 2},
+#    {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=1,use_official_examples=True", 'priority': 2},
+#    {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=1,use_official_examples=True", 'priority': 2},
+#    {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=1,use_official_examples=True", 'priority': 2},
+# 
{'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=1,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=1,use_official_examples=True", 'priority': 2}, # # {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=2,use_official_examples=True", 'priority': 4}, # {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=2,use_official_examples=True", 'priority': 4}, @@ -44,13 +44,13 @@ # {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=4,use_official_examples=True", 'priority': 4}, # {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=4,use_official_examples=True", 'priority': 4}, - {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=5,use_official_examples=True", 'priority': 2}, - {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=5,use_official_examples=True", 'priority': 2}, - {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=5,use_official_examples=True", 'priority': 2}, - {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=5,use_official_examples=True", 'priority': 2}, - {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=5,use_official_examples=True", 'priority': 2}, - {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=5,use_official_examples=True", 'priority': 2}, - {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=5,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=5,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=intermediate_algebra,level=5,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=algebra,level=5,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=prealgebra,level=5,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=geometry,level=5,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=counting_and_probability,level=5,use_official_examples=True", 'priority': 2}, +# {'scenario': 'math', 'description': "math:model=neurips/local,subject=precalculus,level=5,use_official_examples=True", 'priority': 2}, # With chain-of-thought prompting: {'scenario': 'math', 'description': "math:model=neurips/local,subject=number_theory,level=1,use_chain_of_thought=True", 'priority': 2}, @@ -133,7 +133,7 @@ def generate_equal_sum_list(V, N): v = scenario_n_examples_dict[cur_scenario].pop() entries[i]['max_eval_instances'] = v - with open(f'./run_specs_full_closed_eval_coarse_{args.example_budget}_budget.conf','w') as f: + with open(f'./run_specs_closed_{args.example_budget}_budget.conf','w') as f: f.write('entries: [\n') last_scenario = '' for entry in entries: @@ -148,4 +148,4 @@ def generate_equal_sum_list(V, N): f.write('}\n') f.write(']') - print(f'Saved ./run_secret_specs_full_coarse_{args.example_budget}_budget.conf') 
+    print(f'Saved ./run_specs_closed_{args.example_budget}_budget.conf')
diff --git a/configs/run_specs_closed_5000_budget.conf b/configs/run_specs_closed_5000_budget.conf
new file mode 100644
index 00000000..f3a386aa
--- /dev/null
+++ b/configs/run_specs_closed_5000_budget.conf
@@ -0,0 +1,34 @@
+entries: [
+
+# summarization
+{description: "sam_sum:model=neurips/local,max_eval_instances=1000",priority: 1}
+
+# causation
+{description: "corr2cause:model=neurips/local,max_train_instances=1,max_eval_instances=1000",priority: 1}
+
+# ethics
+{description: "ethics_justice:model=neurips/local,data_augmentation=canonical,max_eval_instances=200",priority: 1}
+{description: "ethics_commonsense:model=neurips/local,data_augmentation=canonical,max_eval_instances=200",priority: 1}
+{description: "ethics_virtue:model=neurips/local,data_augmentation=canonical,max_eval_instances=200",priority: 1}
+{description: "ethics_deontology:model=neurips/local,data_augmentation=canonical,max_eval_instances=200",priority: 1}
+{description: "ethics_utilitarianism:model=neurips/local,data_augmentation=canonical,max_eval_instances=200",priority: 1}
+
+# math
+{description: "math:model=neurips/local,subject=number_theory,level=1,use_chain_of_thought=True,max_eval_instances=71",priority: 1}
+{description: "math:model=neurips/local,subject=intermediate_algebra,level=1,use_chain_of_thought=True,max_eval_instances=71",priority: 1}
+{description: "math:model=neurips/local,subject=algebra,level=1,use_chain_of_thought=True,max_eval_instances=71",priority: 1}
+{description: "math:model=neurips/local,subject=prealgebra,level=1,use_chain_of_thought=True,max_eval_instances=71",priority: 1}
+{description: "math:model=neurips/local,subject=geometry,level=1,use_chain_of_thought=True,max_eval_instances=71",priority: 1}
+{description: "math:model=neurips/local,subject=counting_and_probability,level=1,use_chain_of_thought=True,max_eval_instances=71",priority: 1}
+{description: "math:model=neurips/local,subject=precalculus,level=1,use_chain_of_thought=True,max_eval_instances=71",priority: 1}
+{description: "math:model=neurips/local,subject=number_theory,level=5,use_chain_of_thought=True,max_eval_instances=71",priority: 1}
+{description: "math:model=neurips/local,subject=intermediate_algebra,level=5,use_chain_of_thought=True,max_eval_instances=72",priority: 1}
+{description: "math:model=neurips/local,subject=algebra,level=5,use_chain_of_thought=True,max_eval_instances=72",priority: 1}
+{description: "math:model=neurips/local,subject=prealgebra,level=5,use_chain_of_thought=True,max_eval_instances=72",priority: 1}
+{description: "math:model=neurips/local,subject=geometry,level=5,use_chain_of_thought=True,max_eval_instances=72",priority: 1}
+{description: "math:model=neurips/local,subject=counting_and_probability,level=5,use_chain_of_thought=True,max_eval_instances=72",priority: 1}
+{description: "math:model=neurips/local,subject=precalculus,level=5,use_chain_of_thought=True,max_eval_instances=72",priority: 1}
+
+# cnn
+{description: "summarization_cnndm:model=neurips/local,max_eval_instances=1000",priority: 1}
+]
\ No newline at end of file
diff --git a/configs/run_specs_open_2000_budget.conf b/configs/run_specs_open_2000_budget.conf
new file mode 100644
index 00000000..2fe7b900
--- /dev/null
+++ b/configs/run_specs_open_2000_budget.conf
@@ -0,0 +1,155 @@
+entries: [
+
+# auto_debugging
+{description: 
"big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=auto_debugging,subtask=,max_eval_instances=61",priority: 1} + +# code_line_description +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=code_line_description,subtask=,max_eval_instances=61",priority: 1} + +# conceptual_combinations +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=contradictions,max_eval_instances=12",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=emergent_properties,max_eval_instances=12",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=fanciful_fictional_combinations,max_eval_instances=12",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=homonyms,max_eval_instances=12",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=invented_words,max_eval_instances=12",priority: 1} + +# emoji_movie +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=emoji_movie,subtask=,max_eval_instances=61",priority: 1} + +# formal_fallacies_syllogisms_negation +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=formal_fallacies_syllogisms_negation,subtask=,max_eval_instances=61",priority: 1} + +# known_unknowns +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=known_unknowns,subtask=,max_eval_instances=61",priority: 1} + +# linguistics_puzzles +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=linguistics_puzzles,subtask=,max_eval_instances=61",priority: 1} + +# logic_grid_puzzle +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=logic_grid_puzzle,subtask=,max_eval_instances=61",priority: 1} + +# logical_deduction +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=logical_deduction,subtask=three_objects,max_eval_instances=20",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=logical_deduction,subtask=five_objects,max_eval_instances=20",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=logical_deduction,subtask=seven_objects,max_eval_instances=21",priority: 1} + +# novel_concepts +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=novel_concepts,subtask=,max_eval_instances=61",priority: 1} + +# operator +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=operators,subtask=,max_eval_instances=61",priority: 1} + +# play_dialog_same_or_different +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=play_dialog_same_or_different,subtask=,max_eval_instances=61",priority: 1} + +# repeat_copy_logic +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=repeat_copy_logic,subtask=,max_eval_instances=61",priority: 1} + +# strange_stories +{description: 
"big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=strange_stories,subtask=boolean,max_eval_instances=30",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=strange_stories,subtask=multiple_choice,max_eval_instances=31",priority: 1} + +# strategyqa +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=strategyqa,subtask=,max_eval_instances=61",priority: 1} + +# symbol_interpretation +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=adversarial,max_eval_instances=12",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=emoji_agnostic,max_eval_instances=12",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=name_agnostic,max_eval_instances=12",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=plain,max_eval_instances=12",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=tricky,max_eval_instances=12",priority: 1} + +# vitaminc_fact_verification +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=vitaminc_fact_verification,subtask=,max_eval_instances=61",priority: 1} + +# winowhy +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=winowhy,subtask=,max_eval_instances=61",priority: 1} + +# medicine_biology +{description: "mmlu:model=neurips/local,subject=anatomy,data_augmentation=canonical,max_eval_instances=15",priority: 1} +{description: "mmlu:model=neurips/local,subject=college_medicine,data_augmentation=canonical,max_eval_instances=15",priority: 1} +{description: "mmlu:model=neurips/local,subject=college_biology,data_augmentation=canonical,max_eval_instances=15",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_biology,data_augmentation=canonical,max_eval_instances=15",priority: 1} + +# computer_science +{description: "mmlu:model=neurips/local,subject=college_computer_science,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_computer_science,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=computer_security,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=electrical_engineering,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=machine_learning,data_augmentation=canonical,max_eval_instances=12",priority: 1} + +# math +{description: "mmlu:model=neurips/local,subject=high_school_mathematics,data_augmentation=canonical,max_eval_instances=15",priority: 1} +{description: "mmlu:model=neurips/local,subject=college_mathematics,data_augmentation=canonical,max_eval_instances=15",priority: 1} +{description: "mmlu:model=neurips/local,subject=abstract_algebra,data_augmentation=canonical,max_eval_instances=15",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_statistics,data_augmentation=canonical,max_eval_instances=15",priority: 1} + +# 
physics_chemistry +{description: "mmlu:model=neurips/local,subject=college_chemistry,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_chemistry,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_physics,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=college_physics,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=astronomy,data_augmentation=canonical,max_eval_instances=12",priority: 1} + +# formal_reasoning +{description: "mmlu:model=neurips/local,subject=formal_logic,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=logical_fallacies,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=philosophy,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=moral_disputes,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=moral_scenarios,data_augmentation=canonical,max_eval_instances=12",priority: 1} + +# law +{description: "mmlu:model=neurips/local,subject=professional_law,data_augmentation=canonical,max_eval_instances=20",priority: 1} +{description: "mmlu:model=neurips/local,subject=international_law,data_augmentation=canonical,max_eval_instances=20",priority: 1} +{description: "mmlu:model=neurips/local,subject=jurisprudence,data_augmentation=canonical,max_eval_instances=21",priority: 1} + +# history +{description: "mmlu:model=neurips/local,subject=high_school_european_history,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_us_history,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_world_history,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=prehistory,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=world_religions,data_augmentation=canonical,max_eval_instances=12",priority: 1} + +# business +{description: "mmlu:model=neurips/local,subject=business_ethics,data_augmentation=canonical,max_eval_instances=10",priority: 1} +{description: "mmlu:model=neurips/local,subject=global_facts,data_augmentation=canonical,max_eval_instances=10",priority: 1} +{description: "mmlu:model=neurips/local,subject=management,data_augmentation=canonical,max_eval_instances=10",priority: 1} +{description: "mmlu:model=neurips/local,subject=marketing,data_augmentation=canonical,max_eval_instances=10",priority: 1} +{description: "mmlu:model=neurips/local,subject=miscellaneous,data_augmentation=canonical,max_eval_instances=10",priority: 1} +{description: "mmlu:model=neurips/local,subject=professional_accounting,data_augmentation=canonical,max_eval_instances=10",priority: 1} + +# health +{description: "mmlu:model=neurips/local,subject=nutrition,data_augmentation=canonical,max_eval_instances=10",priority: 1} +{description: "mmlu:model=neurips/local,subject=human_aging,data_augmentation=canonical,max_eval_instances=10",priority: 1} +{description: 
"mmlu:model=neurips/local,subject=clinical_knowledge,data_augmentation=canonical,max_eval_instances=10",priority: 1} +{description: "mmlu:model=neurips/local,subject=medical_genetics,data_augmentation=canonical,max_eval_instances=10",priority: 1} +{description: "mmlu:model=neurips/local,subject=professional_medicine,data_augmentation=canonical,max_eval_instances=10",priority: 1} +{description: "mmlu:model=neurips/local,subject=virology,data_augmentation=canonical,max_eval_instances=10",priority: 1} + +# social_studies +{description: "mmlu:model=neurips/local,subject=high_school_government_and_politics,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_geography,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=us_foreign_policy,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=public_relations,data_augmentation=canonical,max_eval_instances=12",priority: 1} +{description: "mmlu:model=neurips/local,subject=security_studies,data_augmentation=canonical,max_eval_instances=12",priority: 1} + +# human_behavior +{description: "mmlu:model=neurips/local,subject=high_school_psychology,data_augmentation=canonical,max_eval_instances=15",priority: 1} +{description: "mmlu:model=neurips/local,subject=human_sexuality,data_augmentation=canonical,max_eval_instances=15",priority: 1} +{description: "mmlu:model=neurips/local,subject=professional_psychology,data_augmentation=canonical,max_eval_instances=15",priority: 1} +{description: "mmlu:model=neurips/local,subject=sociology,data_augmentation=canonical,max_eval_instances=15",priority: 1} + +# economics +{description: "mmlu:model=neurips/local,subject=high_school_microeconomics,data_augmentation=canonical,max_eval_instances=20",priority: 1} +{description: "mmlu:model=neurips/local,subject=econometrics,data_augmentation=canonical,max_eval_instances=20",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_macroeconomics,data_augmentation=canonical,max_eval_instances=20",priority: 1} + +# truthful_qa +{description: "truthful_qa:task=mc_single,model=neurips/local,data_augmentation=canonical,max_eval_instances=61",priority: 1} + +# gsm +{description: "gsm:model=neurips/local,max_eval_instances=61",priority: 1} + +# bbq +{description: "bbq:subject=all,model=neurips/local,max_eval_instances=61",priority: 1} +] \ No newline at end of file diff --git a/neurIPS_eval_scripts/eval_metrics.py b/neurIPS_eval_scripts/eval_metrics.py index 84b03337..53d796ea 100755 --- a/neurIPS_eval_scripts/eval_metrics.py +++ b/neurIPS_eval_scripts/eval_metrics.py @@ -41,7 +41,7 @@ ("core_scenarios.json", "ethics_virtue - EM", False), ("core_scenarios.json", "ethics_deontology - EM", False), ("core_scenarios.json", "ethics_utilitarianism - EM", False), - ("core_scenarios.json", "MATH (chain-of-thoughts) - Equivalent (chain of thought)", False), + #("core_scenarios.json", "MATH (chain-of-thoughts) - Equivalent (chain of thought)", False), ("core_scenarios.json", "MATH - Equivalent", False), ], @@ -73,4 +73,4 @@ ], -} \ No newline at end of file +} diff --git a/run_specs_full_closed_eval_coarse_100_budget.conf b/run_specs_closed_100_budget.conf similarity index 51% rename from run_specs_full_closed_eval_coarse_100_budget.conf rename to run_specs_closed_100_budget.conf index 172f0c29..c267ca12 100644 --- a/run_specs_full_closed_eval_coarse_100_budget.conf +++ 
b/run_specs_closed_100_budget.conf @@ -14,20 +14,6 @@ entries: [ {description: "ethics_utilitarianism:model=neurips/local,data_augmentation=canonical,max_eval_instances=4",priority: 1} # math -{description: "math:model=neurips/local,subject=number_theory,level=1,use_official_examples=True,max_eval_instances=0",priority: 1} -{description: "math:model=neurips/local,subject=intermediate_algebra,level=1,use_official_examples=True,max_eval_instances=0",priority: 1} -{description: "math:model=neurips/local,subject=algebra,level=1,use_official_examples=True,max_eval_instances=0",priority: 1} -{description: "math:model=neurips/local,subject=prealgebra,level=1,use_official_examples=True,max_eval_instances=0",priority: 1} -{description: "math:model=neurips/local,subject=geometry,level=1,use_official_examples=True,max_eval_instances=0",priority: 1} -{description: "math:model=neurips/local,subject=counting_and_probability,level=1,use_official_examples=True,max_eval_instances=0",priority: 1} -{description: "math:model=neurips/local,subject=precalculus,level=1,use_official_examples=True,max_eval_instances=0",priority: 1} -{description: "math:model=neurips/local,subject=number_theory,level=5,use_official_examples=True,max_eval_instances=0",priority: 1} -{description: "math:model=neurips/local,subject=intermediate_algebra,level=5,use_official_examples=True,max_eval_instances=1",priority: 1} -{description: "math:model=neurips/local,subject=algebra,level=5,use_official_examples=True,max_eval_instances=1",priority: 1} -{description: "math:model=neurips/local,subject=prealgebra,level=5,use_official_examples=True,max_eval_instances=1",priority: 1} -{description: "math:model=neurips/local,subject=geometry,level=5,use_official_examples=True,max_eval_instances=1",priority: 1} -{description: "math:model=neurips/local,subject=counting_and_probability,level=5,use_official_examples=True,max_eval_instances=1",priority: 1} -{description: "math:model=neurips/local,subject=precalculus,level=5,use_official_examples=True,max_eval_instances=1",priority: 1} {description: "math:model=neurips/local,subject=number_theory,level=1,use_chain_of_thought=True,max_eval_instances=1",priority: 1} {description: "math:model=neurips/local,subject=intermediate_algebra,level=1,use_chain_of_thought=True,max_eval_instances=1",priority: 1} {description: "math:model=neurips/local,subject=algebra,level=1,use_chain_of_thought=True,max_eval_instances=1",priority: 1} @@ -36,12 +22,12 @@ entries: [ {description: "math:model=neurips/local,subject=counting_and_probability,level=1,use_chain_of_thought=True,max_eval_instances=1",priority: 1} {description: "math:model=neurips/local,subject=precalculus,level=1,use_chain_of_thought=True,max_eval_instances=1",priority: 1} {description: "math:model=neurips/local,subject=number_theory,level=5,use_chain_of_thought=True,max_eval_instances=1",priority: 1} -{description: "math:model=neurips/local,subject=intermediate_algebra,level=5,use_chain_of_thought=True,max_eval_instances=1",priority: 1} -{description: "math:model=neurips/local,subject=algebra,level=5,use_chain_of_thought=True,max_eval_instances=1",priority: 1} -{description: "math:model=neurips/local,subject=prealgebra,level=5,use_chain_of_thought=True,max_eval_instances=1",priority: 1} -{description: "math:model=neurips/local,subject=geometry,level=5,use_chain_of_thought=True,max_eval_instances=1",priority: 1} -{description: "math:model=neurips/local,subject=counting_and_probability,level=5,use_chain_of_thought=True,max_eval_instances=1",priority: 1} 
-{description: "math:model=neurips/local,subject=precalculus,level=5,use_chain_of_thought=True,max_eval_instances=1",priority: 1} +{description: "math:model=neurips/local,subject=intermediate_algebra,level=5,use_chain_of_thought=True,max_eval_instances=2",priority: 1} +{description: "math:model=neurips/local,subject=algebra,level=5,use_chain_of_thought=True,max_eval_instances=2",priority: 1} +{description: "math:model=neurips/local,subject=prealgebra,level=5,use_chain_of_thought=True,max_eval_instances=2",priority: 1} +{description: "math:model=neurips/local,subject=geometry,level=5,use_chain_of_thought=True,max_eval_instances=2",priority: 1} +{description: "math:model=neurips/local,subject=counting_and_probability,level=5,use_chain_of_thought=True,max_eval_instances=2",priority: 1} +{description: "math:model=neurips/local,subject=precalculus,level=5,use_chain_of_thought=True,max_eval_instances=2",priority: 1} # cnn {description: "summarization_cnndm:model=neurips/local,max_eval_instances=20",priority: 1} diff --git a/run_specs_full_closed_eval_coarse_3000_budget.conf b/run_specs_full_closed_eval_coarse_3000_budget.conf deleted file mode 100644 index e13884eb..00000000 --- a/run_specs_full_closed_eval_coarse_3000_budget.conf +++ /dev/null @@ -1,48 +0,0 @@ -entries: [ - -# summarization -{description: "sam_sum:model=neurips/local,max_eval_instances=600",priority: 1} - -# causation -{description: "corr2cause:model=neurips/local,max_train_instances=1,max_eval_instances=600",priority: 1} - -# ethics -{description: "ethics_justice:model=neurips/local,data_augmentation=canonical,max_eval_instances=120",priority: 1} -{description: "ethics_commonsense:model=neurips/local,data_augmentation=canonical,max_eval_instances=120",priority: 1} -{description: "ethics_virtue:model=neurips/local,data_augmentation=canonical,max_eval_instances=120",priority: 1} -{description: "ethics_deontology:model=neurips/local,data_augmentation=canonical,max_eval_instances=120",priority: 1} -{description: "ethics_utilitarianism:model=neurips/local,data_augmentation=canonical,max_eval_instances=120",priority: 1} - -# math -{description: "math:model=neurips/local,subject=number_theory,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} -{description: "math:model=neurips/local,subject=intermediate_algebra,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} -{description: "math:model=neurips/local,subject=algebra,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} -{description: "math:model=neurips/local,subject=prealgebra,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} -{description: "math:model=neurips/local,subject=geometry,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} -{description: "math:model=neurips/local,subject=counting_and_probability,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} -{description: "math:model=neurips/local,subject=precalculus,level=1,use_official_examples=True,max_eval_instances=21",priority: 1} -{description: "math:model=neurips/local,subject=number_theory,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} -{description: "math:model=neurips/local,subject=intermediate_algebra,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} -{description: "math:model=neurips/local,subject=algebra,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} -{description: 
"math:model=neurips/local,subject=prealgebra,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} -{description: "math:model=neurips/local,subject=geometry,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} -{description: "math:model=neurips/local,subject=counting_and_probability,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} -{description: "math:model=neurips/local,subject=precalculus,level=5,use_official_examples=True,max_eval_instances=21",priority: 1} -{description: "math:model=neurips/local,subject=number_theory,level=1,use_chain_of_thought=True,max_eval_instances=21",priority: 1} -{description: "math:model=neurips/local,subject=intermediate_algebra,level=1,use_chain_of_thought=True,max_eval_instances=21",priority: 1} -{description: "math:model=neurips/local,subject=algebra,level=1,use_chain_of_thought=True,max_eval_instances=22",priority: 1} -{description: "math:model=neurips/local,subject=prealgebra,level=1,use_chain_of_thought=True,max_eval_instances=22",priority: 1} -{description: "math:model=neurips/local,subject=geometry,level=1,use_chain_of_thought=True,max_eval_instances=22",priority: 1} -{description: "math:model=neurips/local,subject=counting_and_probability,level=1,use_chain_of_thought=True,max_eval_instances=22",priority: 1} -{description: "math:model=neurips/local,subject=precalculus,level=1,use_chain_of_thought=True,max_eval_instances=22",priority: 1} -{description: "math:model=neurips/local,subject=number_theory,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} -{description: "math:model=neurips/local,subject=intermediate_algebra,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} -{description: "math:model=neurips/local,subject=algebra,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} -{description: "math:model=neurips/local,subject=prealgebra,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} -{description: "math:model=neurips/local,subject=geometry,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} -{description: "math:model=neurips/local,subject=counting_and_probability,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} -{description: "math:model=neurips/local,subject=precalculus,level=5,use_chain_of_thought=True,max_eval_instances=22",priority: 1} - -# cnn -{description: "summarization_cnndm:model=neurips/local,max_eval_instances=600",priority: 1} -] \ No newline at end of file diff --git a/run_specs_open_100_budget.conf b/run_specs_open_100_budget.conf new file mode 100644 index 00000000..a832caba --- /dev/null +++ b/run_specs_open_100_budget.conf @@ -0,0 +1,155 @@ +entries: [ + +# auto_debugging +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=auto_debugging,subtask=,max_eval_instances=3",priority: 1} + +# code_line_description +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=code_line_description,subtask=,max_eval_instances=3",priority: 1} + +# conceptual_combinations +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=contradictions,max_eval_instances=0",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=emergent_properties,max_eval_instances=0",priority: 1} +{description: 
"big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=fanciful_fictional_combinations,max_eval_instances=1",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=homonyms,max_eval_instances=1",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=conceptual_combinations,subtask=invented_words,max_eval_instances=1",priority: 1} + +# emoji_movie +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=emoji_movie,subtask=,max_eval_instances=3",priority: 1} + +# formal_fallacies_syllogisms_negation +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=formal_fallacies_syllogisms_negation,subtask=,max_eval_instances=3",priority: 1} + +# known_unknowns +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=known_unknowns,subtask=,max_eval_instances=3",priority: 1} + +# linguistics_puzzles +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=linguistics_puzzles,subtask=,max_eval_instances=3",priority: 1} + +# logic_grid_puzzle +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=logic_grid_puzzle,subtask=,max_eval_instances=3",priority: 1} + +# logical_deduction +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=logical_deduction,subtask=three_objects,max_eval_instances=1",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=logical_deduction,subtask=five_objects,max_eval_instances=1",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=logical_deduction,subtask=seven_objects,max_eval_instances=1",priority: 1} + +# novel_concepts +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=novel_concepts,subtask=,max_eval_instances=3",priority: 1} + +# operator +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=operators,subtask=,max_eval_instances=3",priority: 1} + +# play_dialog_same_or_different +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=play_dialog_same_or_different,subtask=,max_eval_instances=3",priority: 1} + +# repeat_copy_logic +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=repeat_copy_logic,subtask=,max_eval_instances=3",priority: 1} + +# strange_stories +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=strange_stories,subtask=boolean,max_eval_instances=1",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=strange_stories,subtask=multiple_choice,max_eval_instances=2",priority: 1} + +# strategyqa +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=strategyqa,subtask=,max_eval_instances=3",priority: 1} + +# symbol_interpretation +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=adversarial,max_eval_instances=0",priority: 1} +{description: 
"big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=emoji_agnostic,max_eval_instances=0",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=name_agnostic,max_eval_instances=1",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=plain,max_eval_instances=1",priority: 1} +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=symbol_interpretation,subtask=tricky,max_eval_instances=1",priority: 1} + +# vitaminc_fact_verification +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=vitaminc_fact_verification,subtask=,max_eval_instances=3",priority: 1} + +# winowhy +{description: "big_bench:model=neurips/local,max_train_instances=big_bench_few_shot_setting,task=winowhy,subtask=,max_eval_instances=4",priority: 1} + +# medicine_biology +{description: "mmlu:model=neurips/local,subject=anatomy,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=college_medicine,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=college_biology,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_biology,data_augmentation=canonical,max_eval_instances=1",priority: 1} + +# computer_science +{description: "mmlu:model=neurips/local,subject=college_computer_science,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_computer_science,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=computer_security,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=electrical_engineering,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=machine_learning,data_augmentation=canonical,max_eval_instances=1",priority: 1} + +# math +{description: "mmlu:model=neurips/local,subject=high_school_mathematics,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=college_mathematics,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=abstract_algebra,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_statistics,data_augmentation=canonical,max_eval_instances=1",priority: 1} + +# physics_chemistry +{description: "mmlu:model=neurips/local,subject=college_chemistry,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_chemistry,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_physics,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=college_physics,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=astronomy,data_augmentation=canonical,max_eval_instances=1",priority: 1} + +# formal_reasoning +{description: 
"mmlu:model=neurips/local,subject=formal_logic,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=logical_fallacies,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=philosophy,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=moral_disputes,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=moral_scenarios,data_augmentation=canonical,max_eval_instances=1",priority: 1} + +# law +{description: "mmlu:model=neurips/local,subject=professional_law,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=international_law,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=jurisprudence,data_augmentation=canonical,max_eval_instances=1",priority: 1} + +# history +{description: "mmlu:model=neurips/local,subject=high_school_european_history,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_us_history,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_world_history,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=prehistory,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=world_religions,data_augmentation=canonical,max_eval_instances=1",priority: 1} + +# business +{description: "mmlu:model=neurips/local,subject=business_ethics,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=global_facts,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=management,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=marketing,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=miscellaneous,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=professional_accounting,data_augmentation=canonical,max_eval_instances=1",priority: 1} + +# health +{description: "mmlu:model=neurips/local,subject=nutrition,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=human_aging,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=clinical_knowledge,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=medical_genetics,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=professional_medicine,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=virology,data_augmentation=canonical,max_eval_instances=1",priority: 1} + +# social_studies +{description: "mmlu:model=neurips/local,subject=high_school_government_and_politics,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_geography,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: 
"mmlu:model=neurips/local,subject=us_foreign_policy,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=public_relations,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=security_studies,data_augmentation=canonical,max_eval_instances=1",priority: 1} + +# human_behavior +{description: "mmlu:model=neurips/local,subject=high_school_psychology,data_augmentation=canonical,max_eval_instances=0",priority: 1} +{description: "mmlu:model=neurips/local,subject=human_sexuality,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=professional_psychology,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=sociology,data_augmentation=canonical,max_eval_instances=1",priority: 1} + +# economics +{description: "mmlu:model=neurips/local,subject=high_school_microeconomics,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=econometrics,data_augmentation=canonical,max_eval_instances=1",priority: 1} +{description: "mmlu:model=neurips/local,subject=high_school_macroeconomics,data_augmentation=canonical,max_eval_instances=1",priority: 1} + +# truthful_qa +{description: "truthful_qa:task=mc_single,model=neurips/local,data_augmentation=canonical,max_eval_instances=3",priority: 1} + +# gsm +{description: "gsm:model=neurips/local,max_eval_instances=3",priority: 1} + +# bbq +{description: "bbq:subject=all,model=neurips/local,max_eval_instances=3",priority: 1} +] \ No newline at end of file From ace1aaff0449932d3afabbb07ad57c0655a37e35 Mon Sep 17 00:00:00 2001 From: Weiwei Yang Date: Sun, 19 Nov 2023 04:27:18 +0000 Subject: [PATCH 2/6] mv smaller config to conf/dir --- .../run_specs_closed_100_budget.conf | 0 .../run_specs_open_100_budget.conf | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename run_specs_closed_100_budget.conf => configs/run_specs_closed_100_budget.conf (100%) rename run_specs_open_100_budget.conf => configs/run_specs_open_100_budget.conf (100%) diff --git a/run_specs_closed_100_budget.conf b/configs/run_specs_closed_100_budget.conf similarity index 100% rename from run_specs_closed_100_budget.conf rename to configs/run_specs_closed_100_budget.conf diff --git a/run_specs_open_100_budget.conf b/configs/run_specs_open_100_budget.conf similarity index 100% rename from run_specs_open_100_budget.conf rename to configs/run_specs_open_100_budget.conf From 57a59e921bad3693e020e4f77a046a4b03721edd Mon Sep 17 00:00:00 2001 From: Weiwei Yang Date: Sun, 19 Nov 2023 04:42:48 +0000 Subject: [PATCH 3/6] For final eval constrain the configs to run as a big or small set Small set is intended only for quick testing Big set is the one to run for the last stage of 2023-111 compo --- do-run.sh | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/do-run.sh b/do-run.sh index 659fc48c..e49e807f 100755 --- a/do-run.sh +++ b/do-run.sh @@ -1,22 +1,51 @@ #!/usr/bin/env bash -set -x source /helm/private_helm_env/bin/activate -export CONF_NAME=$(basename -s .conf "$1") +if [[ "$1" == "big" ]]; then + export FIRST_CONF="/helm/configs/run_specs_open_2000_budget.conf" + export SECOND_CONF="/helm/configs/run_specs_closed_5000_budget.conf" +elif [[ "$1" == "small" ]]; then + export FIRST_CONF="/helm/configs/run_specs_open_100_budget.conf" + export 
SECOND_CONF="/helm/configs/run_specs_closed_100_budget.conf" +else + echo "Either big or small for the first arg" + exit 1 +fi + +echo "Going to run $FIRST_CONF then $SECOND_CONF" + +date > "/results/helm-run-open-set-$CONF_NAME-$2.log" timeout --foreground 300m helm-run \ --output-path /results \ - --conf-paths "$1" \ + --conf-paths "$FIRST_CONF" \ + --suite "$2" \ + --max-eval-instances 100 \ + -n 30 | tee "/results/helm-run-open-set-$CONF_NAME-$2.log" + +date >> "/results/helm-run-open-set-$CONF_NAME-$2.log" + +date > "/results/helm-run-hidden-set-$CONF_NAME-$2.log" + +timeout --foreground 600m helm-run \ + --output-path /results \ + --conf-paths "$SECOND_CONF" \ --suite "$2" \ --max-eval-instances 100 \ - -n 30 | tee "/results/helm-run-$CONF_NAME-$2.log" + -n 30 | tee "/results/helm-run-hidden-set-$CONF_NAME-$2.log" + +date >> "/results/helm-run-hidden-set-$CONF_NAME-$2.log" helm-summarize \ --output-path /results \ --suite "$2" \ -n 30 | tee "/results/helm-summarize-$CONF_NAME-$2.log" +python3 /helm/neurIPS_eval_scripts/process_helm.py \ + --dir /results \ + --idx "$2" + python3 /helm/neurIPS_eval_scripts/process_helm.py \ --dir /results \ --idx "$2" \ From f986e2ac674b27f910359b8e092afaf6f0d3fef3 Mon Sep 17 00:00:00 2001 From: Weiwei Yang Date: Mon, 20 Nov 2023 00:19:31 +0000 Subject: [PATCH 4/6] comment out the wrong math task --- neurIPS_eval_scripts/eval_metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neurIPS_eval_scripts/eval_metrics.py b/neurIPS_eval_scripts/eval_metrics.py index 53d796ea..0fce87a5 100755 --- a/neurIPS_eval_scripts/eval_metrics.py +++ b/neurIPS_eval_scripts/eval_metrics.py @@ -41,8 +41,8 @@ ("core_scenarios.json", "ethics_virtue - EM", False), ("core_scenarios.json", "ethics_deontology - EM", False), ("core_scenarios.json", "ethics_utilitarianism - EM", False), - #("core_scenarios.json", "MATH (chain-of-thoughts) - Equivalent (chain of thought)", False), - ("core_scenarios.json", "MATH - Equivalent", False), + ("core_scenarios.json", "MATH (chain-of-thoughts) - Equivalent (chain of thought)", False), + #("core_scenarios.json", "MATH - Equivalent", False), ], "Robustness": [ From 6033e5b4fba3ec401178e4c75dbf14a757b01846 Mon Sep 17 00:00:00 2001 From: Weiwei Yang Date: Mon, 20 Nov 2023 02:47:22 +0000 Subject: [PATCH 5/6] add script to combine final ranking --- neurIPS_eval_scripts/combin_ranks.py | 46 ++++++++++ neurIPS_eval_scripts/process_helm_all.py | 84 ++++++++++++++++++ neurIPS_eval_scripts/rank_submissions.py | 104 ++++++++++++++++++----- neurIPS_eval_scripts/ranking_metrics.py | 78 +++++++++++++++++ 4 files changed, 289 insertions(+), 23 deletions(-) create mode 100755 neurIPS_eval_scripts/combin_ranks.py create mode 100755 neurIPS_eval_scripts/process_helm_all.py create mode 100755 neurIPS_eval_scripts/ranking_metrics.py diff --git a/neurIPS_eval_scripts/combin_ranks.py b/neurIPS_eval_scripts/combin_ranks.py new file mode 100755 index 00000000..2a963b46 --- /dev/null +++ b/neurIPS_eval_scripts/combin_ranks.py @@ -0,0 +1,46 @@ +import json +import argparse + + +OPEN_WEIGHT = 1/3 +HIDDEN_WEIGHT = 2/3 + +#combines open and hidden results based on the specified weights, then rank the submissions based on combined score +if __name__ == "__main__": + + + parser = argparse.ArgumentParser(description="combin open close eval results") + parser.add_argument("--path", type=str, help='ranked submissions json path', required=True) + parser.add_argument('--track', type=str, default='A100', required=False) + + args = parser.parse_args() 
+
+    open_rank = json.load(
+        open(f"{args.path}/{args.track}_open_rank.json", 'r'))
+    open_rank = {x[0]: x[1] for x in open_rank}
+
+    close_rank = json.load(
+        open(f"{args.path}/{args.track}_hidden_rank.json", 'r'))
+    close_rank = {x[0]: x[1] for x in close_rank}
+
+    print(f'open_size:{len(open_rank)}, close_size:{len(close_rank)}')
+
+    full_rank = []
+    for idx, open_res in open_rank.items():
+        close_res = close_rank[f'{idx}_hidden']
+        score_open = open_res["Score"]
+        score_close = close_res['Score']
+        weighted_score = OPEN_WEIGHT * score_open + HIDDEN_WEIGHT * score_close
+        # Merge the open and hidden rows, then replace the per-set Score with the weighted combination.
+        res = {**open_res, **close_res}
+        del res["Score"]
+        res['Score_full'] = weighted_score
+        res['Score_open'] = open_res['Score']
+        res['Score_hidden'] = close_res['Score']
+
+        full_rank.append((idx, res))
+
+    full_rank = sorted(full_rank, key=lambda x: x[1]['Score_full'], reverse=True)
+
+    with open(f'{args.track}_full_ranks.json', 'w') as handle:
+        json.dump(full_rank, handle, indent=4)
diff --git a/neurIPS_eval_scripts/process_helm_all.py b/neurIPS_eval_scripts/process_helm_all.py
new file mode 100755
index 00000000..80064aa7
--- /dev/null
+++ b/neurIPS_eval_scripts/process_helm_all.py
@@ -0,0 +1,84 @@
+import json
+import os
+import sys
+import argparse
+from eval_metrics import Open_eval_metrics as open_metrics
+from eval_metrics import Hidden_eval_metrics as hidden_metrics
+
+'''
+Parse results from helm-summarize under helm_output_dir/runs/submission_id for every submission.
+--path helm benchmark dir
+--hidden use the hidden eval metrics
+'''
+
+# This is taken from https://github.com/Lightning-AI/llm-efficiency-challenge-eval/blob/main/agents/agents.py#L182
+def process_helm_results(root_path: str, suite: str, METRICS: dict = open_metrics) -> dict:
+    path = f"{root_path}/{suite}/groups/"
+    output = {}
+
+    for scenario, scenario_metrics in METRICS.items():
+        scenario_output = {}
+        prev_filename = None
+        for filename, metric, _ in scenario_metrics:
+            print(filename, metric)
+            # Only re-read the groups file when it changes between metrics.
+            if filename != prev_filename:
+                with open(os.path.join(path, filename), "r") as f:
+                    data = json.load(f)
+                prev_filename = filename
+            scenario_data = [el for el in data if el["title"] == scenario][0]
+            metric_idx = None
+            for i, header in enumerate(scenario_data["header"]):
+                if header["value"] == metric:
+                    metric_idx = i
+                    break
+            value = scenario_data["rows"][0][metric_idx].get("value")
+            if value is not None:
+                scenario_output[metric] = value
+            else:
+                print(f'{metric} is None')
+        output[scenario] = scenario_output
+
+    return output
+
+
+if __name__ == "__main__":
+
+    try:
+        parser = argparse.ArgumentParser(description="Parse helm-summarize results")
+        parser.add_argument("--path", type=str, help='Helm benchmark dir', required=True)
+        parser.add_argument('--hidden', action='store_true', help="use hidden eval metrics", required=False)
+        args = parser.parse_args()
+
+        use_metrics = open_metrics
+        if args.hidden:
+            use_metrics = hidden_metrics
+
+        path = args.path
+        for idx in os.listdir(path):
+            try:
+                print(f'processing {idx}')
+                run_results = process_helm_results(path, idx, METRICS=use_metrics)
+                print(run_results)
+
+                results_dir = "./submission_results"
+                os.makedirs(results_dir, exist_ok=True)
+
+                out_name = f"{idx}.json"
+                if args.hidden:
+                    out_name = f"{idx}_hidden.json"
+
+                result_json = os.path.join(results_dir, out_name)
+
+                with open(result_json, 'w') as handle:
+                    json.dump(run_results, handle, indent=4)
+
+                print(f'wrote file {out_name}')
+            except Exception as e:
+                print(e)
+
+    except Exception as e:
+        print(e)
+        print("--help for usage")
+        sys.exit(2)
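For orientation, the scripts in this patch (together with the updated rank_submissions.py below) chain together roughly as follows. This is a minimal sketch, not part of the patch itself: the /results/runs layout written by helm-summarize and the A100 track name are assumptions.

```bash
# Parse helm-summarize output into one JSON per submission (open and hidden metrics).
python3 neurIPS_eval_scripts/process_helm_all.py --path /results/runs
python3 neurIPS_eval_scripts/process_helm_all.py --path /results/runs --hidden

# Rank the open and hidden sets separately, writing A100_open_rank.json and A100_hidden_rank.json.
python3 neurIPS_eval_scripts/rank_submissions.py --dir ./submission_results --track A100
python3 neurIPS_eval_scripts/rank_submissions.py --dir ./submission_results --track A100 --hidden

# Combine the two rankings with the 1/3 open, 2/3 hidden weighting into A100_full_ranks.json.
python3 neurIPS_eval_scripts/combin_ranks.py --path . --track A100
```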
diff --git a/neurIPS_eval_scripts/rank_submissions.py b/neurIPS_eval_scripts/rank_submissions.py
index f94d3574..256d17e5 100755
--- a/neurIPS_eval_scripts/rank_submissions.py
+++ b/neurIPS_eval_scripts/rank_submissions.py
@@ -2,14 +2,21 @@
 import json
 import math
 import statistics
-from eval_metrics import Open_eval_metrics as METRICS
+from ranking_metrics import *
 import argparse
+from collections import defaultdict, Counter
 
-def load_run_results(run_result_dir:str):
+def load_run_results(run_result_dir: str, hidden_dataset=False):
     results = {}
     for filename in os.listdir(run_result_dir):
+        # Keep only the open (or only the hidden) result files, based on the "_hidden" suffix.
+        if not hidden_dataset and "hidden" in filename:
+            continue
+
+        if hidden_dataset and "hidden" not in filename:
+            continue
+
         filepath = os.path.join(run_result_dir, filename)
         with open(filepath, 'r') as handle:
             res = json.load(handle)
@@ -18,6 +25,47 @@
     return results
 
+
+def transpose_results(results):
+    # Regroup each submission's metrics by dataset so that ranking can be done per dataset.
+    data_sets = ['CNN/DailyMail', "sam_sum", "corr2cause", 'ethics', 'MATH', "MMLU", "TruthfulQA", "BIG-bench", "GSM8K", "BBQ"]
+    transposed_results = {}
+    for name, res in results.items():
+        t_res = defaultdict(dict)
+        for val in res.values():
+            for d in data_sets:
+                for k, v in val.items():
+                    if d in k:
+                        t_res[d][k] = v
+                        continue
+        transposed_results[name] = t_res
+    return transposed_results
+
+
+def calc_win_rate(values, lower_is_better=False, verbose=False):
+    # Calculates win rates, allowing entries in values to repeat, such as [1, 1, 1, 3, 4, 5, 1].
+    # In that case the repeated values all get the same win rate, which is 1/(n_repeats) + count(lower_rank).
+    counts = Counter(values)
+    win_rate = {idx: 0.0 for idx in range(len(values))}
+
+    for i, v in enumerate(values):
+        for j, vv in enumerate(values):
+            if i == j:
+                continue
+            if not lower_is_better and v > vv:
+                win_rate[i] += 1
+            elif lower_is_better and v < vv:
+                win_rate[i] += 1
+            elif v == vv:
+                win_rate[i] += 1.0 / (counts[v] * (counts[v] - 1))
+    win_rate = [(k, v / len(values)) for k, v in win_rate.items()]
+    if verbose:
+        print([(i, v) for i, v in enumerate(values)])
+        print(win_rate)
+    win_rate = [x[1] for x in sorted(win_rate, key=lambda k: k[0])]
+    if verbose:
+        print(win_rate)
+    return win_rate
+
 # taken from https://github.com/Lightning-AI/llm-efficiency-challenge-eval/blob/main/agents/helm_postprocessing.py
 def rank_results(data:dict, metrics_config:dict):
     # mean win rate to be computed here
@@ -46,21 +94,20 @@
         win_rates_per_row = [[] for _ in submission_ids]
         metrics = [metric for _, metric, _ in metrics_config[scenario]]
         for metric in metrics:
+
             lower_is_better = lower_is_better_map[scenario][metric]
-            default_value = 0.0 if not lower_is_better else 1000.0
-            values = [(data[submission_id].get(scenario, {metric: default_value}).get(metric, 0.0), j) for j, submission_id in enumerate(submission_ids)]
-            # temporary fix for populating lower is better entries with 0.0's;
-            # this has been fixed in agents.py, but it's needed for older submissions;
-            # we can remove once we move to flash helm
+            default_value = 0.0
             if lower_is_better:
-                values = [(default_value, j) if val == 0.0 else (val, j) for val, j in values]
-            for wins, (v, j) in enumerate(sorted(values, reverse=lower_is_better)):
-                win_rate = wins / (len(values) - 1) if len(values) > 1 else 1.0 # normalize to [0, 1]
-                win_rates_per_row[j].append(win_rate)
-
+                default_value = 1000.0
+
+            values = [(data[submission_id].get(scenario, {metric: default_value}).get(metric, default_value), j) for j, submission_id in enumerate(submission_ids)]
+            vv = [x[0] for x in values]
+            win_rates = calc_win_rate(vv, lower_is_better=lower_is_better)
+            for (win, (val, j)) in zip(win_rates, values):
+                win_rates_per_row[j].append(win)
+
         for submission_id, win_rates in zip(submission_ids, win_rates_per_row):
-            if not win_rates:
-                continue
             mean_win_rates[submission_id][scenario] = statistics.mean(win_rates)
 
     # mean_win_rates layout
@@ -86,11 +133,6 @@
                 value = None
                 if scenario in data[submission_id] and metric in data[submission_id][scenario]:
                     value = data[submission_id][scenario][metric]
-                # temporary fix for populating lower is better entries with 0.0's;
-                # this has been fixed in agents.py, but it's needed for older submissions;
-                # we can remove once we move to flash helm
-                if lower_is_better and value == 0.0:
-                    value = None
                 row[metric] = value
             row[f"{scenario} Mean Win Rate"] = mean_win_rates[submission_id][scenario]
             row[score_key] = scores[submission_id]
@@ -107,14 +149,30 @@
     try:
         parser = argparse.ArgumentParser(description="rank helm evaluation results")
        parser.add_argument("--dir", type=str, help='helm evaluation dir for all submissions', required=True)
-        parser.add_argument('--name', type=str, help='evaluation_name', default='open')
+        parser.add_argument('--hidden', action='store_true', help="hidden eval metrics", required=False)
+        parser.add_argument('--track', type=str, default='A100', required=False)
         args = parser.parse_args()
 
-        submission_results = load_run_results(args.dir)
-        ranked_results = rank_results(submission_results, METRICS)
-        with open(f"{args.name}_full_rank.json", 'w') as handle:
+        METRICS = Open_dataset_centric_metrics
+
+        name = 'open'
+        if args.hidden:
+            name = 'hidden'
+            METRICS = Hidden_dataset_centric_eval_metrics
+            submission_results = load_run_results(args.dir, hidden_dataset=True)
+        else:
+            submission_results = load_run_results(args.dir)
+
+        submission_results = transpose_results(submission_results)
+
+        name = f"{args.track}_{name}"
+
+        ranked_results = rank_results(submission_results, METRICS)
+        with open(f"{name}_rank.json", 'w') as handle:
             json.dump(ranked_results, handle, indent=4)
+        print(f"{name}_rank.json")
     except Exception as e:
         print(e)
diff --git a/neurIPS_eval_scripts/ranking_metrics.py b/neurIPS_eval_scripts/ranking_metrics.py
new file mode 100755
index 00000000..c14c87a9
--- /dev/null
+++ b/neurIPS_eval_scripts/ranking_metrics.py
@@ -0,0 +1,78 @@
+
+
+Open_dataset_centric_metrics = {
+    "MMLU": [
+        ("core_scenarios.json", "MMLU - EM", False),
+        ("core_scenarios.json", "MMLU - EM (Robustness)", False),
+        ("core_scenarios.json", "MMLU - EM (Fairness)", False),
+    ],
+    "TruthfulQA": [
+        ("core_scenarios.json", "TruthfulQA - EM", False),
+        ("core_scenarios.json", "TruthfulQA - EM (Robustness)", False),
+        ("core_scenarios.json", "TruthfulQA - EM (Fairness)", False),
+    ],
+    "BIG-bench": [
+        ("core_scenarios.json", "BIG-bench - EM", False),
+    ],
+    "GSM8K": [
+        ("core_scenarios.json", "GSM8K - EM", False),
+    ],
+    "BBQ": [
+        ("targeted_evaluations.json", "BBQ - EM", False),
+    ],
+}
+
+
+Hidden_dataset_centric_eval_metrics = {
+    # "CNN/DailyMail": [
+    #     ("core_scenarios.json", "CNN/DailyMail - ROUGE-2", False),
+    #     ("core_scenarios.json", "CNN/DailyMail - Stereotypes (race)", True),
+    #     ("core_scenarios.json", "CNN/DailyMail - Stereotypes (gender)", True),
+    #     ("core_scenarios.json", "CNN/DailyMail - Representation (race)", True),
+    #     ("core_scenarios.json", "CNN/DailyMail - Representation (gender)", True),
+    # ],
+
+    "sam_sum": [
+        ("core_scenarios.json", "sam_sum - ROUGE-2", False),
+        ("core_scenarios.json", "sam_sum - Stereotypes (race)", True),
+        ("core_scenarios.json", "sam_sum - Stereotypes (gender)", True),
+        ("core_scenarios.json", "sam_sum - Representation (race)", True),
+        ("core_scenarios.json", "sam_sum - Representation (gender)", True),
+    ],
+
+    "corr2cause": [
+        ("core_scenarios.json", "corr2cause - EM", False),
+    ],
+
+    'MATH': [
+        ("core_scenarios.json", "MATH (chain-of-thoughts) - Equivalent (chain of thought)", False),
+    ],
+
+    "ethics": [
+        ("core_scenarios.json", "ethics_justice - EM", False),
+        ("core_scenarios.json", "ethics_justice - EM (Robustness)", False),
+        ("core_scenarios.json", "ethics_justice - EM (Fairness)", False),
+
+        ("core_scenarios.json", "ethics_commonsense - EM", False),
+        ("core_scenarios.json", "ethics_commonsense - EM (Robustness)", False),
+        ("core_scenarios.json", "ethics_commonsense - EM (Fairness)", False),
+
+        ("core_scenarios.json", "ethics_virtue - EM", False),
+        ("core_scenarios.json", "ethics_virtue - EM (Robustness)", False),
+        ("core_scenarios.json", "ethics_virtue - EM (Fairness)", False),
+
+        ("core_scenarios.json", "ethics_deontology - EM", False),
+        ("core_scenarios.json", "ethics_deontology - EM (Robustness)", False),
+        ("core_scenarios.json", "ethics_deontology - EM (Fairness)", False),
+
+        ("core_scenarios.json", "ethics_utilitarianism - EM", False),
+        ("core_scenarios.json", "ethics_utilitarianism - EM (Robustness)", False),
+        ("core_scenarios.json", "ethics_utilitarianism - EM (Fairness)", False),
+    ],
+}

From d5a6297850dc6a03ce26721faf63f7f7d595556f Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Thu, 21 Nov 2024 10:44:14 -0800
Subject: [PATCH 6/6] Update README.md

---
 README.md | 53 ++++++++---------------------------------------------
 1 file changed, 8 insertions(+), 45 deletions(-)

diff --git a/README.md b/README.md
index 068b8604..a5bda687 100644
--- a/README.md
+++ b/README.md
@@ -2,50 +2,13 @@
 # Holistic Evaluation of Language Models
 
-[comment]: <> (When using the img tag, which allows us to specify size, src has to be a URL.)
-
+This is a fork of https://github.com/stanford-crfm/helm which we used for the 2023 NeurIPS LLM efficiency competition https://llm-efficiency-challenge.github.io/
 
-Welcome! The **`crfm-helm`** Python package contains code used in the **Holistic Evaluation of Language Models** project ([paper](https://arxiv.org/abs/2211.09110), [website](https://crfm.stanford.edu/helm/latest/)) by [Stanford CRFM](https://crfm.stanford.edu/). This package includes the following features:
+It was private because the tasks we were testing on had to be undisclosed to the final participants; they included:
+* Math
+* Corr2cause
+* Justice
+* Samsum
+* Ethics
 
-- Collection of datasets in a standard format (e.g., NaturalQuestions)
-- Collection of models accessible via a unified API (e.g., GPT-3, MT-NLG, OPT, BLOOM)
-- Collection of metrics beyond accuracy (efficiency, bias, toxicity, etc.)
-- Collection of perturbations for evaluating robustness and fairness (e.g., typos, dialect)
-- Modular framework for constructing prompts from datasets
-- Proxy server for managing accounts and providing unified interface to access models
-
-To get started, refer to [the documentation on Read the Docs](https://crfm-helm.readthedocs.io/) for how to install and run the package.
-
-## Directory Structure
-
-The directory structure for this repo is as follows
-
-```
-├── docs # MD used to generate readthedocs
-│
-├── scripts # Python utility scripts for HELM
-│   ├── cache
-│   ├── data_overlap # Calculate train test overlap
-│   │   ├── common
-│   │   ├── scenarios
-│   │   └── test
-│   ├── efficiency
-│   ├── fact_completion
-│   ├── offline_eval
-│   └── scale
-└── src
-    ├── helm # Benchmarking Scripts for HELM
-    │   │
-    │   ├── benchmark # Main Python code for running HELM
-    │   │   │
-    │   │   └── static # Current JS (Jquery) code for rendering front-end
-    │   │       │
-    │   │       └── ...
-    │   │
-    │   ├── common # Additional Python code for running HELM
-    │   │
-    │   └── proxy # Python code for external web requests
-    │
-    └── helm-frontend # New React Front-end
-```
+If you're interested in using these tasks in your own work, please feel free to copy-paste them.
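For anyone rerunning the final evaluation from this fork, the entry point is do-run.sh from PATCH 3. A hypothetical invocation is sketched below; the suite names are placeholders, not values defined in this repo.

```bash
# Smoke-test the pipeline on the 100-budget configs first, then do the full run.
./do-run.sh small smoke-test-suite
./do-run.sh big final-eval-suite
```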