From 0ef848e5c0ffcde41d87ff903fd7aaeefbbb59a7 Mon Sep 17 00:00:00 2001
From: Zach
Date: Tue, 18 Feb 2025 10:04:56 -0500
Subject: [PATCH] Adding graph code for LM Arena/Training Compute graphs in paper (Figure 1)

---
 paper/arena_tc_graphs/graphs.R                | 436 ++++++++++++++++++
 paper/arena_tc_graphs/model_names.csv         | 131 ++++++
 .../training_compute_epoch.csv                |  18 +
 3 files changed, 585 insertions(+)
 create mode 100644 paper/arena_tc_graphs/graphs.R
 create mode 100644 paper/arena_tc_graphs/model_names.csv
 create mode 100644 paper/arena_tc_graphs/training_compute_epoch.csv

diff --git a/paper/arena_tc_graphs/graphs.R b/paper/arena_tc_graphs/graphs.R
new file mode 100644
index 00000000..99da7f5f
--- /dev/null
+++ b/paper/arena_tc_graphs/graphs.R
@@ -0,0 +1,436 @@
+# Figure 1 graphs: ForecastBench overall score against LM Arena rating and
+# against estimated training compute.
+#
+# Downloads the ForecastBench human leaderboard and the most recent LM Arena
+# elo_results pickle, links the two through model_names.csv, and writes
+# arena_v_overall.png and tc_v_overall.png. model_names.csv and
+# training_compute_epoch.csv are read from the working directory.
+library(rvest)
+library(janitor)
+library(reticulate)
+library(dplyr)
+library(httr)
+library(jsonlite)
+library(lubridate)
+library(ggplot2)
+library(ggrepel)
+library(scales)
+pd <- import("pandas")
+io <- import("io")
+
+human_leaderboard_url <- "https://github.com/forecastingresearch/forecastbench-datasets/raw/refs/heads/main/leaderboards/csv/human_leaderboard_overall.csv"
+
+# Get leaderboard
+response <- GET(human_leaderboard_url)
+stop_for_status(response)
+
+# Convert response content to a data frame. Passing the body through `text =`
+# lets read.csv() open and close its own connection.
+human_leaderboard <- read.csv(
+  text = content(response, as = "text", encoding = "UTF-8"),
+  stringsAsFactors = FALSE
+)
+
+human_leaderboard <- human_leaderboard %>%
+  clean_names()
+# Strip "_n_" followed by digits (plus any "_ddd" groups) from the column names
+names(human_leaderboard) <- gsub("_n_\\d{1,3}(_\\d{3})*\\b", "", names(human_leaderboard))
+
+# Overall score of the superforecaster median forecast; the graphs mark where
+# the fitted trend reaches this value
+supers_brier <- human_leaderboard %>%
+  filter(model == "Superforecaster median forecast") %>%
+  select(overall_score) %>%
+  unlist()
+if (length(supers_brier) != 1) {
+  stop("Expected exactly one superforecaster median row", call. = FALSE)
+}
+
+# Import arena
+repo_owner <- "lmarena-ai"
+repo_name <- "chatbot-arena-leaderboard"
+branch <- "main"
+
+# Fetch list of files in repo
+api_url <- paste0("https://huggingface.co/api/spaces/", repo_owner, "/", repo_name, "/tree/", branch)
+response <- GET(api_url)
+stop_for_status(response)
+file_list <- content(response, as = "parsed", simplifyVector = TRUE)
+
+# Filter to elo_results .pkl files and get dates from the .pkl paths. The
+# prefix and extension are removed as fixed strings: read as a regex, ".pkl"
+# would match any character followed by "pkl". ymd() is vectorised, so the
+# rows do not need rowwise().
+pickle_files <- file_list %>%
+  filter(grepl("elo_results", path)) %>%
+  mutate(date = ymd(
+    gsub(
+      ".pkl",
+      "",
+      gsub("elo_results_", "", path, fixed = TRUE),
+      fixed = TRUE
+    )
+  )) %>%
+  filter(!is.na(date))
+if (nrow(pickle_files) == 0) {
+  stop("No dated elo_results .pkl files listed at ", api_url, call. = FALSE)
+}
+
+# Keep a single file even when several share the latest date, so download_url
+# below is always one URL
+most_recent_pkl <- pickle_files %>%
+  slice_max(date, n = 1, with_ties = FALSE)
+
+# Download pkl
+download_url <- paste0("https://huggingface.co/spaces/", repo_owner, "/", repo_name, "/resolve/", branch, "/", most_recent_pkl$path, "?download=true")
+
+response <- GET(download_url)
+stop_for_status(response)
+
+pickle_data <- content(response, "raw")
+
+# NOTE(review): unpickling can run arbitrary code from the downloaded file;
+# keep this pointed at a source you trust.
+pickle_obj <- pd$read_pickle(io$BytesIO(pickle_data))
+
+# Model names are the row names of the table. Copy them into a column before
+# any dplyr verb runs so they stay attached to their rows.
+arena <- pickle_obj$text$full$leaderboard_table_df
+arena$model <- rownames(arena)
+arena <- arena %>%
+  arrange(final_ranking)
+rownames(arena) <- NULL
+
+# Import model names table (columns: arena, leaderboard, pretty_name)
+models <- read.csv("model_names.csv")
+arena <- arena %>%
+  filter(model %in% models$arena)
+human_leaderboard <- human_leaderboard %>%
+  filter(model %in% models$leaderboard)
+
+# Every leaderboard model must have an arena rating; fail loudly here rather
+# than carry missing scores into the fits
+unrated <- setdiff(
+  models$arena[models$leaderboard %in% human_leaderboard$model],
+  arena$model
+)
+if (length(unrated) > 0) {
+  stop("No LM Arena rating for: ", paste(unrated, collapse = ", "), call. = FALSE)
+}
+
+# Attach arena_model_name and score to leaderboards. match() looks up the
+# whole column at once, so the result is an ordinary ungrouped data frame.
+human_leaderboard <- human_leaderboard %>%
+  mutate(
+    arena_model_name = models$arena[match(model, models$leaderboard)],
+    arena_score = arena$rating[match(arena_model_name, arena$model)]
+  )
+
+# Separate parentheticals from models.
+#
+# leaderboard: data frame with a `model` column such as
+#   "GPT-4o (scratchpad with news with freeze values)".
+# Returns `leaderboard` with `prompt` ("scratchpad", "zero shot",
+#   "superforecaster", "superforecaster 1"/"2"/"3", or "?" when none is named),
+#   logical `freeze_values` and `news`, and the parenthetical removed from
+#   `model`. Zero-row input is returned with the new (empty) columns.
+parenthetical_separation <- function(leaderboard) {
+  model <- leaderboard$model
+
+  # Get prompt. Assigned from lowest to highest priority so that "scratchpad"
+  # wins over "zero shot", which wins over "superforecaster".
+  prompt <- rep("?", length(model))
+  prompt[grepl("superforecaster", model)] <- "superforecaster"
+  prompt[grepl("zero shot", model)] <- "zero shot"
+  prompt[grepl("scratchpad", model)] <- "scratchpad"
+  if (any(prompt == "?")) {
+    warning(
+      "No prompt found for: ", paste(model[prompt == "?"], collapse = ", "),
+      call. = FALSE
+    )
+  }
+
+  # Get news. Superforecaster prompts also record which news variant was used;
+  # a prompt that has been tagged no longer matches, so the first variant wins.
+  news <- grepl("news", model)
+  for (variant in c("1", "2", "3")) {
+    tagged <- prompt == "superforecaster" & grepl(paste("news", variant), model)
+    prompt[tagged] <- paste("superforecaster", variant)
+  }
+
+  # Add results to leaderboard
+  leaderboard$prompt <- prompt
+  leaderboard$freeze_values <- grepl("freeze values", model)
+  leaderboard$news <- news
+  # Get rid of parentheticals from model names
+  leaderboard$model <- gsub(" \\(.*?\\)", "", model)
+  leaderboard
+}
+
+human_leaderboard <- parenthetical_separation(human_leaderboard)
+# llm_leaderboard <- parenthetical_separation(llm_leaderboard)
+
+# Add in pretty model names.
+#
+# leaderboard: data frame with an `arena_model_name` column.
+# Returns `leaderboard` with `pretty_name` looked up in the `models` table.
+add_pretty_model_names <- function(leaderboard) {
+  leaderboard$pretty_name <-
+    models$pretty_name[match(leaderboard$arena_model_name, models$arena)]
+  leaderboard
+}
+
+human_leaderboard <- add_pretty_model_names(human_leaderboard)
+# llm_leaderboard <- add_pretty_model_names(llm_leaderboard)
+
+# Produce the overall score vs. LM Arena rating graph.
+#
+# leaderboard: data frame with `overall_score`, `arena_score`, `prompt`,
+#   `freeze_values`, `news` and `pretty_name`; only rows with the scratchpad
+#   prompt, freeze values and no news are used.
+# Prints the linear fit, the bootstrap interval and the correlation test, and
+# returns the ggplot object. Reads `supers_brier` from the enclosing script.
+leaderboard_arena_graphs <- function(leaderboard) {
+  leaderboard <- leaderboard %>%
+    filter(prompt == "scratchpad") %>%
+    filter(freeze_values == TRUE) %>%
+    filter(news == FALSE)
+  n_models <- nrow(leaderboard)
+
+  # Fit a linear model to find the equation of the smooth line
+  lm_fit <- lm(overall_score ~ arena_score, data = leaderboard)
+
+  # Extract coefficients
+  intercept <- coef(lm_fit)[1]
+  slope <- coef(lm_fit)[2]
+
+  print(summary(lm_fit))
+
+  # Calculate the x-coordinate where the fitted line reaches the
+  # superforecaster score
+  y_intercept <- supers_brier
+  x_intersect <- (y_intercept - intercept) / slope
+
+  # Bootstrap to calculate confidence intervals for the intersection
+  set.seed(123) # For reproducibility
+  n_bootstraps <- 1000
+  bootstrap_intersects <- replicate(n_bootstraps, {
+    # Resample the data with replacement
+    sample_data <- leaderboard[sample(seq_len(n_models), size = n_models, replace = TRUE), ]
+
+    # Fit the linear model to the resampled data
+    lm_boot <- lm(overall_score ~ arena_score, data = sample_data)
+
+    # Extract coefficients
+    intercept_boot <- coef(lm_boot)[1]
+    slope_boot <- coef(lm_boot)[2]
+
+    # Calculate the x-coordinate of the intersection
+    (y_intercept - intercept_boot) / slope_boot
+  })
+
+  # Compute the 95% confidence interval for the intersections
+  ci_lower <- quantile(bootstrap_intersects, 0.025)
+  ci_upper <- quantile(bootstrap_intersects, 0.975)
+  print(ci_lower)
+  print(ci_upper)
+
+  label_data <- data.frame(
+    x = x_intersect,
+    y = y_intercept,
+    label = "Superforecasters"
+  )
+
+  # The intersection point and error bar use constant aesthetics on top of the
+  # `leaderboard` data, so ggplot draws them once per row; presumably that
+  # overplotting is why the error bar alpha is so low.
+  p <- ggplot(leaderboard, aes(y = overall_score, x = arena_score)) +
+    geom_point(size = 3, color = "#F8766D") + # Adjust point size
+    geom_abline(
+      intercept = intercept, slope = slope,
+      color = "gray", linetype = "dashed"
+    ) + # Linear fit line
+    geom_text_repel(aes(label = pretty_name), size = 3, max.overlaps = Inf, box.padding = 0.75) +
+    geom_hline(yintercept = supers_brier, linetype = "dotted", color = "blue", size = 1) + # Horizontal line
+    geom_point(aes(x = x_intersect, y = y_intercept), color = "red", size = 3) + # Intersection point
+    geom_errorbarh(
+      aes(
+        xmin = ci_lower,
+        xmax = ci_upper,
+        y = y_intercept
+      ),
+      color = "red", height = 0.01, size = 1, alpha = 0.03
+    ) +
+    geom_text_repel(
+      data = label_data,
+      aes(x = x, y = y, label = label),
+      color = "red",
+      segment.color = "black",
+      size = 3,
+      hjust = -0.1, # Optional adjustment
+      vjust = -1.5 # Optional adjustment
+    ) +
+    # labs(
+    #   title = "Model Performance: LLM Arena Score vs. Brier Score",
+    #   x = "Arena Score (higher is better)",
+    #   y = "Brier Score (lower is better)",
+    #   color = "Prompt Type",
+    #   shape = "Freeze Values"
+    # ) +
+    theme_minimal() +
+    coord_cartesian( # clip = "off",
+      xlim = c(NA, 1625),
+      ylim = c(0, 0.25)
+    ) +
+    theme(
+      axis.title.x = element_blank(),
+      axis.title.y = element_blank()
+    )
+
+  print(cor.test(leaderboard$arena_score, leaderboard$overall_score))
+
+  p
+}
+
+p <- leaderboard_arena_graphs(human_leaderboard)
+
+ggsave("arena_v_overall.png", p, units = c("px"), width = 1900, height = 1479, bg = "white")
+
+# Import training compute guesses (columns: model, tc, source, assumption)
+training_compute <- read.csv("training_compute_epoch.csv")
+
+# Produce the overall score vs. estimated training compute graph.
+#
+# leaderboard: data frame with `model`, `overall_score`, `arena_score`,
+#   `prompt`, `freeze_values`, `news` and `pretty_name`; only rows with the
+#   scratchpad prompt, freeze values and no news are used.
+# Prints the fit of overall score on log compute, the bootstrap interval and
+# the correlation test, and returns the ggplot object. Reads `supers_brier`
+# and `training_compute` from the enclosing script.
+leaderboard_compute_graphs <- function(leaderboard) {
+  # match() keeps the lookup correct whether or not `leaderboard` is grouped
+  # row by row; models without an entry get NA
+  leaderboard <- leaderboard %>%
+    select(model, overall_score, arena_score, prompt, freeze_values, news, pretty_name) %>%
+    mutate(tc = training_compute$tc[match(model, training_compute$model)])
+  leaderboard <- leaderboard %>%
+    filter(prompt == "scratchpad") %>%
+    filter(freeze_values == TRUE) %>%
+    filter(news == FALSE)
+
+  # Only models with a compute estimate can enter the fit. lm() would drop the
+  # others anyway, but resampling them would leave each bootstrap replicate
+  # with a different number of usable points.
+  fit_data <- leaderboard[!is.na(leaderboard$tc), ]
+  n_models <- nrow(fit_data)
+
+  # Fit a linear model to find the equation of the smooth line
+  lm_fit <- lm(overall_score ~ log(tc), data = fit_data)
+  print(summary(lm_fit))
+
+  # Extract coefficients
+  intercept <- coef(lm_fit)[1]
+  slope <- coef(lm_fit)[2]
+
+  # Calculate the x-coordinate where the fitted line reaches the
+  # superforecaster score (back-transformed from the log scale)
+  y_intercept <- supers_brier
+  x_intersect <- exp((y_intercept - intercept) / slope)
+
+  # Bootstrap to calculate confidence intervals for the intersection
+  set.seed(123) # For reproducibility
+  n_bootstraps <- 1000
+  bootstrap_intersects <- replicate(n_bootstraps, {
+    # Resample the data with replacement
+    sample_data <- fit_data[sample(seq_len(n_models), size = n_models, replace = TRUE), ]
+
+    # Fit the linear model to the resampled data
+    lm_boot <- lm(overall_score ~ log(tc), data = sample_data)
+
+    # Extract coefficients
+    intercept_boot <- coef(lm_boot)[1]
+    slope_boot <- coef(lm_boot)[2]
+
+    # Calculate the x-coordinate of the intersection
+    exp((y_intercept - intercept_boot) / slope_boot)
+  })
+
+  # Compute the 95% confidence interval for the intersections
+  ci_lower <- quantile(bootstrap_intersects, 0.025)
+  ci_upper <- quantile(bootstrap_intersects, 0.975)
+  print(ci_lower)
+  print(ci_upper)
+
+  label_data <- data.frame(
+    x = x_intersect,
+    y = y_intercept,
+    label = "Superforecasters"
+  )
+  print(label_data)
+
+  # Plot every filtered model (rows without compute are dropped by ggplot).
+  # The intersection point and error bar use constant aesthetics on top of
+  # `leaderboard`, so ggplot draws them once per row.
+  p <- ggplot(leaderboard, aes(y = overall_score, x = tc)) +
+    geom_point(size = 3, color = "#F8766D") + # Adjust point size
+    stat_function(
+      fun = function(x) intercept + slope * log(x), # Define the abline as a function
+      color = "gray", linetype = "dashed"
+    ) + # Linear fit line
+    geom_text_repel(aes(label = pretty_name),
+      size = 3,
+      max.overlaps = Inf,
+      hjust = -0.1, # Optional adjustment
+      vjust = -1.5 # Optional adjustment
+    ) +
+    geom_hline(yintercept = supers_brier, linetype = "dotted", color = "blue", size = 1) + # Horizontal line
+    geom_point(aes(x = x_intersect, y = y_intercept), color = "red", size = 3) + # Intersection point
+
+    geom_errorbarh(
+      aes(
+        xmin = ci_lower,
+        xmax = ci_upper,
+        y = y_intercept
+      ),
+      color = "red", height = 0.01, size = 1, alpha = 0.03
+    ) +
+    # annotate("text", x = x_intersect, y = y_intercept,
+    #          label = "Superforecasters",
+    #          hjust = 1, vjust = -0.5, color = "red", size = 3) + # Annotate the intersection point
+    geom_text_repel(
+      data = label_data,
+      aes(x = x, y = y, label = label),
+      color = "red",
+      segment.color = "black",
+      size = 3,
+      hjust = -0.1, # Optional adjustment
+      vjust = -1.5 # Optional adjustment
+    ) +
+    labs(
+      # title = "Model Performance: Estimated Training Compute vs.\nOverall Score",
+      x = "Estimated Training Compute",
+      y = "Overall Score",
+      color = "Prompt Type",
+      shape = "Freeze Values"
+    ) +
+    theme_minimal() +
+    coord_cartesian(
+      clip = "off",
+      xlim = c(NA, 1e+28),
+      ylim = c(0, 0.25)
+    ) +
+    scale_x_continuous(
+      trans = pseudo_log_trans(base = 10),
+      breaks = c(1e+23, 1e+24, 1e+25, 1e+26, 1e+27, 1e+28)
+    ) +
+    theme(
+      axis.title.x = element_blank(),
+      axis.title.y = element_blank()
+    )
+  print(cor.test(log(leaderboard$tc), leaderboard$overall_score))
+  p
+}
+
+p <- leaderboard_compute_graphs(human_leaderboard)
+
+ggsave("tc_v_overall.png", p, units = c("px"), width = 1900, height = 1479, bg = "white")
diff --git a/paper/arena_tc_graphs/model_names.csv b/paper/arena_tc_graphs/model_names.csv
new file mode 100644
index 00000000..26d47d14
--- /dev/null
+++ b/paper/arena_tc_graphs/model_names.csv
@@ -0,0 +1,131 @@
+arena,leaderboard,pretty_name
+claude-3-5-sonnet-20240620,Claude-3-5-Sonnet-20240620 (scratchpad with freeze values),Claude-3.5-Sonnet
+claude-3-5-sonnet-20240620,Claude-3-5-Sonnet-20240620 (scratchpad with news with freeze values),Claude-3.5-Sonnet
+gpt-4-turbo-2024-04-09,GPT-4-Turbo-2024-04-09 (zero shot with freeze values),GPT-4-Turbo
+claude-3-5-sonnet-20240620,Claude-3-5-Sonnet-20240620 (zero shot with freeze values),Claude-3.5-Sonnet
+gpt-4-0613,GPT-4 (zero shot with freeze values),GPT-4
+gpt-4o-2024-05-13,GPT-4o (scratchpad with news with freeze values),GPT-4o
+claude-3-5-sonnet-20240620,Claude-3-5-Sonnet-20240620 (scratchpad),Claude-3.5-Sonnet
+gpt-4o-2024-05-13,GPT-4o (scratchpad with freeze values),GPT-4o
+claude-3-5-sonnet-20240620,Claude-3-5-Sonnet-20240620 (scratchpad with news),Claude-3.5-Sonnet
+claude-3-opus-20240229,Claude-3-Opus-20240229 (zero shot with freeze values),Claude-3-Opus
+claude-3-5-sonnet-20240620,Claude-3-5-Sonnet-20240620 (superforecaster with news 3),Claude-3.5-Sonnet
+gpt-4o-2024-05-13,GPT-4o (scratchpad),GPT-4o
+claude-3-5-sonnet-20240620,Claude-3-5-Sonnet-20240620 (superforecaster with news 1),Claude-3.5-Sonnet
+gpt-4o-2024-05-13,GPT-4o (scratchpad with news),GPT-4o
+claude-3-opus-20240229,Claude-3-Opus-20240229 (scratchpad with freeze values),Claude-3-Opus
+mistral-large-2402,Mistral-Large-Latest (zero shot with freeze values),Mistral-Large-Latest +gemini-1.5-pro-001,Gemini-1.5-Pro (scratchpad with news with freeze values),Gemini-1.5-Pro +gemini-1.5-pro-001,Gemini-1.5-Pro (scratchpad),Gemini-1.5-Pro +mistral-large-2402,Mistral-Large-Latest (scratchpad with freeze values),Mistral-Large-Latest +gpt-4-turbo-2024-04-09,GPT-4-Turbo-2024-04-09 (zero shot),GPT-4-Turbo +gpt-4-turbo-2024-04-09,GPT-4-Turbo-2024-04-09 (scratchpad with freeze values),GPT-4-Turbo +gpt-4-0613,GPT-4 (scratchpad with freeze values),GPT-4 +gemini-1.5-pro-001,Gemini-1.5-Pro (scratchpad with freeze values),Gemini-1.5-Pro +claude-3-5-sonnet-20240620,Claude-3-5-Sonnet-20240620 (zero shot),Claude-3.5-Sonnet +gemini-1.5-pro-001,Gemini-1.5-Pro (scratchpad with news),Gemini-1.5-Pro +claude-3-opus-20240229,Claude-3-Opus-20240229 (superforecaster with news 1),Claude-3-Opus +gpt-4-turbo-2024-04-09,GPT-4-Turbo-2024-04-09 (scratchpad with news with freeze values),GPT-4-Turbo +gpt-4o-2024-05-13,GPT-4o (zero shot with freeze values),GPT-4o +llama-3-70b-instruct,Llama-3-70b-Chat-Hf (scratchpad with freeze values),Llama-3-70b +mixtral-8x22b-instruct-v0.1,Mixtral-8x22B-Instruct-V0.1 (scratchpad with freeze values),Mixtral-8x22b +gpt-4-turbo-2024-04-09,GPT-4-Turbo-2024-04-09 (scratchpad),GPT-4-Turbo +gemini-1.5-pro-001,Gemini-1.5-Pro (zero shot with freeze values),Gemini-1.5-Pro +claude-3-opus-20240229,Claude-3-Opus-20240229 (scratchpad),Claude-3-Opus +gpt-4-0613,GPT-4 (scratchpad),GPT-4 +qwen1.5-110b-chat,Qwen1.5-110B-Chat (zero shot with freeze values),Qwen1.5-110b +claude-2.1,Claude-2.1 (scratchpad with freeze values),Claude-2.1 +gemini-1.5-flash-001,Gemini-1.5-Flash (zero shot with freeze values),Gemini-1.5-Flash +mixtral-8x22b-instruct-v0.1,Mixtral-8x22B-Instruct-V0.1 (zero shot with freeze values),Mixtral-8x22b +llama-3-70b-instruct,Llama-3-70b-Chat-Hf (zero shot with freeze values),Llama-3-70b +gpt-4-turbo-2024-04-09,GPT-4-Turbo-2024-04-09 (scratchpad with 
news),GPT-4-Turbo +qwen1.5-110b-chat,Qwen1.5-110B-Chat (scratchpad with freeze values),Qwen1.5-110b +claude-2.1,Claude-2.1 (scratchpad),Claude-2.1 +qwen1.5-110b-chat,Qwen1.5-110B-Chat (scratchpad with news with freeze values),Qwen1.5-110b +claude-2.1,Claude-2.1 (zero shot with freeze values),Claude-2.1 +mistral-large-2402,Mistral-Large-Latest (scratchpad),Mistral-Large-Latest +claude-3-5-sonnet-20240620,Claude-3-5-Sonnet-20240620 (scratchpad with SECOND news),Claude-3.5-Sonnet +claude-3-opus-20240229,Claude-3-Opus-20240229 (zero shot),Claude-3-Opus +gpt-4o-2024-05-13,GPT-4o (superforecaster with news 3),GPT-4o +gpt-4-0613,GPT-4 (zero shot),GPT-4 +gemini-1.5-pro-001,Gemini-1.5-Pro (superforecaster with news 3),Gemini-1.5-Pro +mixtral-8x22b-instruct-v0.1,Mixtral-8x22B-Instruct-V0.1 (scratchpad with news with freeze values),Mixtral-8x22b +mixtral-8x22b-instruct-v0.1,Mixtral-8x22B-Instruct-V0.1 (scratchpad with news),Mixtral-8x22b +gpt-4o-2024-05-13,GPT-4o (zero shot),GPT-4o +qwen1.5-110b-chat,Qwen1.5-110B-Chat (scratchpad with news),Qwen1.5-110b +mixtral-8x22b-instruct-v0.1,Mixtral-8x22B-Instruct-V0.1 (scratchpad),Mixtral-8x22b +qwen1.5-110b-chat,Qwen1.5-110B-Chat (scratchpad),Qwen1.5-110b +claude-3-opus-20240229,Claude-3-Opus-20240229 (scratchpad with news with freeze values),Claude-3-Opus +claude-3-opus-20240229,Claude-3-Opus-20240229 (superforecaster with news 3),Claude-3-Opus +claude-3-5-sonnet-20240620,Claude-3-5-Sonnet-20240620 (superforecaster with news 2),Claude-3.5-Sonnet +llama-3-8b-instruct,Llama-3-8b-Chat-Hf (zero shot with freeze values),Llama-3-8b +gpt-4-turbo-2024-04-09,GPT-4-Turbo-2024-04-09 (superforecaster with news 3),GPT-4-Turbo +mixtral-8x7b-instruct-v0.1,Mixtral-8x7B-Instruct-V0.1 (scratchpad),Mixtral-8x7b +claude-3-opus-20240229,Claude-3-Opus-20240229 (scratchpad with news),Claude-3-Opus +gemini-1.5-flash-001,Gemini-1.5-Flash (scratchpad with freeze values),Gemini-1.5-Flash +mistral-large-2402,Mistral-Large-Latest (zero 
shot),Mistral-Large-Latest +llama-3-8b-instruct,Llama-3-8b-Chat-Hf (scratchpad with freeze values),Llama-3-8b +claude-3-opus-20240229,Claude-3-Opus-20240229 (superforecaster with news 2),Claude-3-Opus +qwen1.5-110b-chat,Qwen1.5-110B-Chat (superforecaster with news 1),Qwen1.5-110b +gemini-1.5-pro-001,Gemini-1.5-Pro (zero shot),Gemini-1.5-Pro +mixtral-8x7b-instruct-v0.1,Mixtral-8x7B-Instruct-V0.1 (zero shot with freeze values),Mixtral-8x7b +mistral-large-2402,Mistral-Large-Latest (scratchpad with news with freeze values),Mistral-Large-Latest +mixtral-8x22b-instruct-v0.1,Mixtral-8x22B-Instruct-V0.1 (superforecaster with news 3),Mixtral-8x22b +qwen1.5-110b-chat,Qwen1.5-110B-Chat (zero shot),Qwen1.5-110b +claude-2.1,Claude-2.1 (scratchpad with news with freeze values),Claude-2.1 +gpt-4o-2024-05-13,GPT-4o (superforecaster with news 1),GPT-4o +gemini-1.5-flash-001,Gemini-1.5-Flash (scratchpad with news with freeze values),Gemini-1.5-Flash +gpt-4o-2024-05-13,GPT-4o (scratchpad with SECOND news),GPT-4o +llama-3-8b-instruct,Llama-3-8b-Chat-Hf (zero shot),Llama-3-8b +mixtral-8x22b-instruct-v0.1,Mixtral-8x22B-Instruct-V0.1 (superforecaster with news 1),Mixtral-8x22b +claude-2.1,Claude-2.1 (scratchpad with news),Claude-2.1 +gemini-1.5-pro-001,Gemini-1.5-Pro (superforecaster with news 1),Gemini-1.5-Pro +gpt-4-turbo-2024-04-09,GPT-4-Turbo-2024-04-09 (superforecaster with news 1),GPT-4-Turbo +llama-3-70b-instruct,Llama-3-70b-Chat-Hf (zero shot),Llama-3-70b +mistral-large-2402,Mistral-Large-Latest (scratchpad with news),Mistral-Large-Latest +gpt-4o-2024-05-13,GPT-4o (superforecaster with news 2),GPT-4o +llama-3-70b-instruct,Llama-3-70b-Chat-Hf (scratchpad),Llama-3-70b +mixtral-8x22b-instruct-v0.1,Mixtral-8x22B-Instruct-V0.1 (zero shot),Mixtral-8x22b +gemini-1.5-flash-001,Gemini-1.5-Flash (scratchpad),Gemini-1.5-Flash +gemini-1.5-flash-001,Gemini-1.5-Flash (zero shot),Gemini-1.5-Flash +gemini-1.5-flash-001,Gemini-1.5-Flash (scratchpad with news),Gemini-1.5-Flash 
+mistral-large-2402,Mistral-Large-Latest (superforecaster with news 1),Mistral-Large-Latest +qwen1.5-110b-chat,Qwen1.5-110B-Chat (superforecaster with news 3),Qwen1.5-110b +mixtral-8x7b-instruct-v0.1,Mixtral-8x7B-Instruct-V0.1 (zero shot),Mixtral-8x7b +mixtral-8x7b-instruct-v0.1,Mixtral-8x7B-Instruct-V0.1 (superforecaster with news 2),Mixtral-8x7b +claude-2.1,Claude-2.1 (zero shot),Claude-2.1 +gpt-4-turbo-2024-04-09,GPT-4-Turbo-2024-04-09 (superforecaster with news 2),GPT-4-Turbo +llama-3-8b-instruct,Llama-3-8b-Chat-Hf (scratchpad),Llama-3-8b +mixtral-8x22b-instruct-v0.1,Mixtral-8x22B-Instruct-V0.1 (superforecaster with news 2),Mixtral-8x22b +mistral-large-2402,Mistral-Large-Latest (superforecaster with news 3),Mistral-Large-Latest +mistral-large-2402,Mistral-Large-Latest (superforecaster with news 2),Mistral-Large-Latest +claude-2.1,Claude-2.1 (superforecaster with news 3),Claude-2.1 +mixtral-8x7b-instruct-v0.1,Mixtral-8x7B-Instruct-V0.1 (superforecaster with news 1),Mixtral-8x7b +mixtral-8x7b-instruct-v0.1,Mixtral-8x7B-Instruct-V0.1 (scratchpad with freeze values),Mixtral-8x7b +gemini-1.5-pro-001,Gemini-1.5-Pro (superforecaster with news 2),Gemini-1.5-Pro +qwen1.5-110b-chat,Qwen1.5-110B-Chat (superforecaster with news 2),Qwen1.5-110b +llama-2-70b-chat,Llama-2-70b-Chat-Hf (zero shot with freeze values),Llama-2-70b +gemini-1.5-flash-001,Gemini-1.5-Flash (superforecaster with news 3),Gemini-1.5-Flash +llama-2-70b-chat,Llama-2-70b-Chat-Hf (scratchpad with freeze values),Llama-2-70b +gemini-1.5-flash-001,Gemini-1.5-Flash (superforecaster with news 2),Gemini-1.5-Flash +mixtral-8x7b-instruct-v0.1,Mixtral-8x7B-Instruct-V0.1 (scratchpad with news with freeze values),Mixtral-8x7b +claude-3-haiku-20240307,Claude-3-Haiku-20240307 (scratchpad with freeze values),Claude-3-Haiku +claude-3-haiku-20240307,Claude-3-Haiku-20240307 (superforecaster with news 2),Claude-3-Haiku +mixtral-8x7b-instruct-v0.1,Mixtral-8x7B-Instruct-V0.1 (superforecaster with news 3),Mixtral-8x7b 
+gemini-1.5-flash-001,Gemini-1.5-Flash (superforecaster with news 1),Gemini-1.5-Flash +claude-3-haiku-20240307,Claude-3-Haiku-20240307 (scratchpad),Claude-3-Haiku +claude-3-haiku-20240307,Claude-3-Haiku-20240307 (zero shot with freeze values),Claude-3-Haiku +llama-2-70b-chat,Llama-2-70b-Chat-Hf (scratchpad),Llama-2-70b +claude-2.1,Claude-2.1 (superforecaster with news 2),Claude-2.1 +claude-2.1,Claude-2.1 (superforecaster with news 1),Claude-2.1 +gpt-3.5-turbo-0125,GPT-3.5-Turbo-0125 (scratchpad with freeze values),GPT-3.5-Turbo +claude-3-haiku-20240307,Claude-3-Haiku-20240307 (scratchpad with news with freeze values),Claude-3-Haiku +claude-3-haiku-20240307,Claude-3-Haiku-20240307 (scratchpad with news),Claude-3-Haiku +mixtral-8x7b-instruct-v0.1,Mixtral-8x7B-Instruct-V0.1 (scratchpad with news),Mixtral-8x7b +gpt-3.5-turbo-0125,GPT-3.5-Turbo-0125 (scratchpad),GPT-3.5-Turbo +claude-3-haiku-20240307,Claude-3-Haiku-20240307 (zero shot),Claude-3-Haiku +claude-3-haiku-20240307,Claude-3-Haiku-20240307 (superforecaster with news 3),Claude-3-Haiku +llama-2-70b-chat,Llama-2-70b-Chat-Hf (zero shot),Llama-2-70b +claude-3-haiku-20240307,Claude-3-Haiku-20240307 (superforecaster with news 1),Claude-3-Haiku +gpt-3.5-turbo-0125,GPT-3.5-Turbo-0125 (zero shot with freeze values),GPT-3.5-Turbo +gpt-3.5-turbo-0125,GPT-3.5-Turbo-0125 (zero shot),GPT-3.5-Turbo \ No newline at end of file diff --git a/paper/arena_tc_graphs/training_compute_epoch.csv b/paper/arena_tc_graphs/training_compute_epoch.csv new file mode 100644 index 00000000..6faaaab9 --- /dev/null +++ b/paper/arena_tc_graphs/training_compute_epoch.csv @@ -0,0 +1,18 @@ +model,tc,source,assumption +Claude-3-5-Sonnet-20240620,NA,NA,NA +GPT-4,NA,NA,NA +GPT-4-Turbo-2024-04-09,NA,NA,NA +GPT-4o,3.81E+25,Epoch, +Mistral-Large-Latest,1.12E+25,Epoch, +Claude-3-Opus-20240229,NA,NA,NA +Gemini-1.5-Pro,1.58E+25,Epoch, +Llama-3-70b-Chat-Hf,7.86E+24,Epoch, +Claude-2.1,3.87E+24,Epoch,Claude 2 +Gemini-1.5-Flash,NA,NA,NA 
+Qwen1.5-110B-Chat,1.30E+24,Epoch,Qwen1.5 72B +Mixtral-8x22B-Instruct-V0.1,NA,NA,NA +Llama-3-8b-Chat-Hf,7.20E+23,Halawi estimate, +Mixtral-8x7B-Instruct-V0.1,NA,NA,NA +Llama-2-70b-Chat-Hf,8.10E+23,Epoch, +Claude-3-Haiku-20240307,NA,NA,NA +GPT-3.5-Turbo-0125,2.58E+24,Epoch,GPT-3.5 \ No newline at end of file