From 948ffe89c0c62b7e4f3325bc427507fe440db6c4 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Tue, 16 Dec 2025 14:44:06 -0800
Subject: [PATCH 01/17] pull in all changes from #561 made by @dajmcdon

---
 DESCRIPTION                       |   3 +-
 NAMESPACE                         |   3 +
 R/check-metric.R                  |  14 +++
 R/quant-weighted_internal_score.R | 173 ++++++++++++++++++++++++++++++
 R/validation.R                    |  40 +++++++
 man/check_metric.Rd               |   5 +
 man/weighted_interval_score.Rd    | 105 ++++++++++++++++++
 7 files changed, 342 insertions(+), 1 deletion(-)
 create mode 100644 R/quant-weighted_internal_score.R
 create mode 100644 man/weighted_interval_score.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 419fb8e6..9c488069 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -23,7 +23,7 @@ Imports:
     cli,
     dplyr (>= 1.1.0),
     generics (>= 0.1.2),
-    hardhat (>= 1.3.0),
+    hardhat (>= 1.4.2.9000),
     lifecycle (>= 1.0.3),
     rlang (>= 1.1.4),
     tibble,
@@ -116,6 +116,7 @@ Collate:
     'prob-roc_aunp.R'
     'prob-roc_aunu.R'
     'prob-roc_curve.R'
+    'quant-weighted_internal_score.R'
     'reexports.R'
     'surv-brier_survival.R'
     'surv-brier_survival_integrated.R'
diff --git a/NAMESPACE b/NAMESPACE
index e998640c..79c42a25 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -150,6 +150,7 @@ export(check_linear_pred_survival_metric)
 export(check_numeric_metric)
 export(check_ordered_prob_metric)
 export(check_prob_metric)
+export(check_quantile_metric)
 export(check_static_survival_metric)
 export(class_metric_summarizer)
 export(classification_cost)
@@ -265,6 +266,8 @@ export(specificity_vec)
 export(static_survival_metric_summarizer)
 export(tidy)
 export(validate_estimator)
+export(weighted_interval_score)
+export(weighted_interval_score_vec)
 export(yardstick_any_missing)
 export(yardstick_remove_missing)
 import(rlang)
diff --git a/R/check-metric.R b/R/check-metric.R
index 9aee511d..5ae45197 100644
--- a/R/check-metric.R
+++ b/R/check-metric.R
@@ -15,6 +15,7 @@
 #'   - For `check_ordered_prob_metric()`, an ordered factor.
 #'   - For `check_dynamic_survival_metric()`, a Surv object.
 #'   - For `check_static_survival_metric()`, a Surv object.
+#'   - For `check_quantile_metric()`, a numeric vector.
 #'
 #' @param estimate The realized `estimate` result.
 #'   - For `check_numeric_metric()`, a numeric vector.
@@ -25,6 +26,7 @@
 #'     a numeric matrix for multic-class `truth`.
 #'   - For `check_dynamic_survival_metric()`, list-column of data.frames.
 #'   - For `check_static_survival_metric()`, a numeric vector.
+#'   - For `check_quantile_metric()`, a `hardhat::quantile_pred` vector.
 #'
 #' @param case_weights The realized case weights, as a numeric vector. This must
 #'   be the same length as `truth`.
@@ -132,3 +134,15 @@ check_linear_pred_survival_metric <- function(
   validate_case_weights(case_weights, size = nrow(truth), call = call)
   validate_surv_truth_numeric_estimate(truth, estimate, call = call)
 }
+
+#' @rdname check_metric
+#' @export
+check_quantile_metric <- function(
+  truth,
+  estimate,
+  case_weights,
+  call = caller_env()
+) {
+  validate_numeric_truth_quantile_estimate(truth, estimate, call = call)
+  validate_case_weights(case_weights, size = length(truth), call = call)
+}
diff --git a/R/quant-weighted_internal_score.R b/R/quant-weighted_internal_score.R
new file mode 100644
index 00000000..5279d1b7
--- /dev/null
+++ b/R/quant-weighted_internal_score.R
@@ -0,0 +1,173 @@
+#' Compute weighted interval score
+#'
+#' Weighted interval score (WIS), a well-known quantile-based
+#' approximation of the commonly-used continuous ranked probability score
+#' (CRPS). WIS is a proper score, and can be thought of as a distributional
+#' generalization of absolute error. For example, see [Bracher et
+#' al. (2020)](https://arxiv.org/abs/2005.12881) for discussion in the context
+#' of COVID-19 forecasting.
+#'
+#' @param x A vector of class `quantile_pred`.
+#' @param actual double. Actual value(s)
+#' @param quantile_levels probabilities. If specified, the score will be
+#'   computed at this set of levels. Otherwise, those present in `x` will be
+#'   used. If `quantile_levels` do not exactly match those available in `x`,
+#'   then some quantiles will have implicit missingness. Handling of these
+#'   is determined by `quantile_estimate_nas`.
+#' @param quantile_estimate_nas character. This argument applies only to `x`.
+#'   It handles imputation of individual `quantile_levels` that are necessary to
+#'   compute a score. Because each element of `x` is a [hardhat::quantile_pred],
+#'   it is possible for these to be missing for particular
+#'   `quantile_levels`. There are a number of different possibilities for such
+#'   missingness. The options are as follows:
+#'   * For `"impute"`, both explicit and implicit missing values will be imputed
+#'   using [hardhat::impute_quantiles()] prior to the calculation of the score.
+#'   So the score will be `NA` only if imputation fails.
+#'   * For `"drop"`, any explicit missing values will be removed
+#'   before calculating the score for a particular prediction. This may be
+#'   reasonable due to the weighting. For example, if the estimate has
+#'   `quantile_levels = c(.25, .5, .75)` but the median is `NA` for a particular
+#'   prediction, it may be reasonable to average the accuracy of `c(.25, .75)`
+#'   for that prediction with others that don't have missingness. This option
+#'   is only works if `quantile_levels = NULL` or is a subset of the
+#'   `quantile_levels` in `x`.
+#'   * For `"propagate"`, any missing value predictions will result in that
+#'   element of `x` having a score of `NA`. If `na_rm = TRUE`, then these will
+#'   be removed before averaging.
+#' @param na_rm logical. If `TRUE`, missing values in `actual` or both implicit and
+#'   explicit (values of `NA` present in `x`), will be ignored (dropped) in the
+#'   calculation of the summary score. If `FALSE` (the default), any `NA`s will
+#'   result in the summary being `NA`.
+#' @param ... not used
+#'
+#' @return a vector of nonnegative scores.
+#'
+#' @export
+#' @examples
+#' quantile_levels <- c(.2, .4, .6, .8)
+#' pred1 <- 1:4
+#' pred2 <- 8:11
+#' preds <- quantile_pred(rbind(pred1, pred2), quantile_levels)
+#' truth <- c(3.3, 7.1)
+#' weighted_interval_score_vec(truth, preds)
+#' weighted_interval_score_vec(truth, preds, quantile_levels = c(.25, .5, .75))
+#'
+#' # Missing value behaviours
+#'
+#' preds_na <- quantile_pred(rbind(pred1, c(1, 2, NA, 4)), 1:4 / 5)
+#' truth <- c(2.5, 2.5)
+#' weighted_interval_score_vec(truth, preds_na)
+#' weighted_interval_score_vec(truth, preds_na, quantile_levels = 1:9 / 10)
+#' expect_error(weighted_interval_score_vec(
+#'   truth,
+#'   preds_na,
+#'   quantile_levels = 1:9 / 10,
+#'   quantile_estimate_nas = "drop"
+#' ))
+#' weighted_interval_score_vec(
+#'   truth,
+#'   preds_na,
+#'   quantile_levels = c(2, 3) / 5,
+#'   quantile_estimate_nas = "drop"
+#' )
+#' weighted_interval_score_vec(
+#'   truth, preds_na, na_rm = TRUE, quantile_estimate_nas = "propagate"
+#' )
+#' weighted_interval_score_vec(
+#'   truth, preds_na, quantile_estimate_nas = "propagate"
+#' )
+#'
+weighted_interval_score <- function(data, ...) {
+  UseMethod("weighted_interval_score")
+}
+weighted_interval_score <- new_numeric_metric(
+  mae,
+  direction = "minimize"
+)
+
+#' @export
+#' @rdname weighted_interval_score
+weighted_interval_score_vec <- function(
+  truth,
+  estimate,
+  quantile_levels = NULL,
+  na_rm = FALSE,
+  quantile_estimate_nas = c("impute", "drop", "propagate"),
+  case_weights = NULL,
+  ...
+) {
+  check_quantile_metric(truth, estimate, case_weights)
+  estimate_quantile_levels <- hardhat::extract_quantile_levels(estimate)
+  quantile_estimate_nas <- rlang::arg_match(quantile_estimate_nas)
+  if (!is.null(quantile_levels)) {
+    hardhat::check_quantile_levels(quantile_levels)
+    all_levels_estimated <- all(quantile_levels %in% estimate_quantile_levels)
+    if (quantile_estimate_nas == "drop" && !all_levels_estimated) {
+      cli::cli_abort(
+        "When `quantile_levels` is not a subset of those available in `estimate`, 
+      `quantile_estimate_nas` may not be `'drop'`."
+      )
+    }
+    if (!all_levels_estimated && (quantile_estimate_nas == "propagate")) {
+      # We requested particular levels, but the levels aren't all there,
+      # and NAs propagate, so return NA
+      return(NA_real_)
+    }
+  }
+
+  quantile_levels <- quantile_levels %||% estimate_quantile_levels
+  if (quantile_estimate_nas %in% c("drop", "propagate")) {
+    levels_estimated <- estimate_quantile_levels %in% quantile_levels
+    estimate <- as.matrix(estimate)[, levels_estimated, drop = FALSE]
+  } else {
+    estimate <- as.matrix(hardhat::impute_quantiles(estimate, quantile_levels))
+  }
+
+  vec_wis <- wis_impl(
+    truth = truth,
+    estimate = estimate,
+    quantile_levels = quantile_levels,
+    rowwise_na_rm = (quantile_estimate_nas == "drop")
+  )
+
+  if (na_rm) {
+    result <- yardstick_remove_missing(truth, vec_wis, case_weights)
+
+    truth <- result$truth
+    vec_wis <- result$estimate
+    case_weights <- result$case_weights
+  } else if (yardstick_any_missing(truth, vec_wis, case_weights)) {
+    return(NA_real_)
+  }
+
+  yardstick_mean(vec_wis, case_weights = case_weights)
+}
+
+wis_impl <- function(
+  truth,
+  estimate,
+  quantile_levels,
+  rowwise_na_rm = TRUE
+) {
+  as.vector(
+    mapply(
+      FUN = function(.x, .y) {
+        wis_one_quantile(.x, quantile_levels, .y, rowwise_na_rm)
+      },
+      vctrs::vec_chop(estimate),
+      truth
+    ),
+    "double"
+  )
+}
+
+wis_one_quantile <- function(values, quantile_levels, truth, na_rm) {
+  2 *
+    mean(
+      pmax(
+        quantile_levels * (truth - values),
+        (1 - quantile_levels) * (values - truth)
+      ),
+      na.rm = na_rm
+    )
+}
diff --git a/R/validation.R b/R/validation.R
index 547e072a..d69bb4e3 100644
--- a/R/validation.R
+++ b/R/validation.R
@@ -443,3 +443,43 @@ validate_case_weights <- function(case_weights, size, call = caller_env()) {
 
   invisible(NULL)
 }
+
+validate_numeric_truth_quantile_estimate <- function(
+  truth,
+  estimate,
+  call = caller_env()
+) {
+  if (!is.numeric(truth)) {
+    cli::cli_abort(
+      "{.arg truth} should be a numeric vector,
+      not {.obj_type_friendly {truth}}.",
+      call = call
+    )
+  }
+
+  if (!inherits(estimate, "quantile_pred")) {
+    cli::cli_abort(
+      "{.arg estimate} should be a {.cls quantile_pred} object,
+      not {.obj_type_friendly {estimate}}.",
+      call = call
+    )
+  }
+
+  if (is.matrix(truth)) {
+    cli::cli_abort(
+      "{.arg truth} should be a numeric vector, not a numeric matrix.",
+      call = call
+    )
+  }
+
+  n_truth <- length(truth)
+  n_estimate <- vctrs::vec_size(estimate)
+
+  if (n_truth != n_estimate) {
+    cli::cli_abort(
+      "{.arg truth} ({n_truth}) and
+      {.arg estimate} ({n_estimate}) must be the same length.",
+      call = call
+    )
+  }
+}
diff --git a/man/check_metric.Rd b/man/check_metric.Rd
index 19c4b51c..7c302d00 100644
--- a/man/check_metric.Rd
+++ b/man/check_metric.Rd
@@ -9,6 +9,7 @@
 \alias{check_dynamic_survival_metric}
 \alias{check_static_survival_metric}
 \alias{check_linear_pred_survival_metric}
+\alias{check_quantile_metric}
 \title{Developer function for checking inputs in new metrics}
 \usage{
 check_numeric_metric(truth, estimate, case_weights, call = caller_env())
@@ -57,6 +58,8 @@ check_linear_pred_survival_metric(
   case_weights,
   call = caller_env()
 )
+
+check_quantile_metric(truth, estimate, case_weights, call = caller_env())
 }
 \arguments{
 \item{truth}{The realized vector of \code{truth}.
@@ -67,6 +70,7 @@ check_linear_pred_survival_metric(
 \item For \code{check_ordered_prob_metric()}, an ordered factor.
 \item For \code{check_dynamic_survival_metric()}, a Surv object.
 \item For \code{check_static_survival_metric()}, a Surv object.
+\item For \code{check_quantile_metric()}, a numeric vector.
 }}
 
 \item{estimate}{The realized \code{estimate} result.
@@ -79,6 +83,7 @@ a numeric matrix for multic-class \code{truth}.
 a numeric matrix for multic-class \code{truth}.
 \item For \code{check_dynamic_survival_metric()}, list-column of data.frames.
 \item For \code{check_static_survival_metric()}, a numeric vector.
+\item For \code{check_quantile_metric()}, a \code{hardhat::quantile_pred} vector.
 }}
 
 \item{case_weights}{The realized case weights, as a numeric vector. This must
diff --git a/man/weighted_interval_score.Rd b/man/weighted_interval_score.Rd
new file mode 100644
index 00000000..a8dc6b95
--- /dev/null
+++ b/man/weighted_interval_score.Rd
@@ -0,0 +1,105 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/quant-weighted_internal_score.R
+\name{weighted_interval_score}
+\alias{weighted_interval_score}
+\alias{weighted_interval_score_vec}
+\title{Compute weighted interval score}
+\usage{
+weighted_interval_score(data, ...)
+
+weighted_interval_score_vec(
+  truth,
+  estimate,
+  quantile_levels = NULL,
+  na_rm = FALSE,
+  quantile_estimate_nas = c("impute", "drop", "propagate"),
+  case_weights = NULL,
+  ...
+)
+}
+\arguments{
+\item{...}{not used}
+
+\item{quantile_levels}{probabilities. If specified, the score will be
+computed at this set of levels. Otherwise, those present in \code{x} will be
+used. If \code{quantile_levels} do not exactly match those available in \code{x},
+then some quantiles will have implicit missingness. Handling of these
+is determined by \code{quantile_estimate_nas}.}
+
+\item{na_rm}{logical. If \code{TRUE}, missing values in \code{actual} or both implicit and
+explicit (values of \code{NA} present in \code{x}), will be ignored (dropped) in the
+calculation of the summary score. If \code{FALSE} (the default), any \code{NA}s will
+result in the summary being \code{NA}.}
+
+\item{quantile_estimate_nas}{character. This argument applies only to \code{x}.
+It handles imputation of individual \code{quantile_levels} that are necessary to
+compute a score. Because each element of \code{x} is a \link[hardhat:quantile_pred]{hardhat::quantile_pred},
+it is possible for these to be missing for particular
+\code{quantile_levels}. There are a number of different possibilities for such
+missingness. The options are as follows:
+\itemize{
+\item For \code{"impute"}, both explicit and implicit missing values will be imputed
+using \code{\link[hardhat:impute_quantiles]{hardhat::impute_quantiles()}} prior to the calculation of the score.
+So the score will be \code{NA} only if imputation fails.
+\item For \code{"drop"}, any explicit missing values will be removed
+before calculating the score for a particular prediction. This may be
+reasonable due to the weighting. For example, if the estimate has
+\code{quantile_levels = c(.25, .5, .75)} but the median is \code{NA} for a particular
+prediction, it may be reasonable to average the accuracy of \code{c(.25, .75)}
+for that prediction with others that don't have missingness. This option
+is only works if \code{quantile_levels = NULL} or is a subset of the
+\code{quantile_levels} in \code{x}.
+\item For \code{"propagate"}, any missing value predictions will result in that
+element of \code{x} having a score of \code{NA}. If \code{na_rm = TRUE}, then these will
+be removed before averaging.
+}}
+
+\item{x}{A vector of class \code{quantile_pred}.}
+
+\item{actual}{double. Actual value(s)}
+}
+\value{
+a vector of nonnegative scores.
+}
+\description{
+Weighted interval score (WIS), a well-known quantile-based
+approximation of the commonly-used continuous ranked probability score
+(CRPS). WIS is a proper score, and can be thought of as a distributional
+generalization of absolute error. For example, see \href{https://arxiv.org/abs/2005.12881}{Bracher et al. (2020)} for discussion in the context
+of COVID-19 forecasting.
+}
+\examples{
+quantile_levels <- c(.2, .4, .6, .8)
+pred1 <- 1:4
+pred2 <- 8:11
+preds <- quantile_pred(rbind(pred1, pred2), quantile_levels)
+truth <- c(3.3, 7.1)
+weighted_interval_score_vec(truth, preds)
+weighted_interval_score_vec(truth, preds, quantile_levels = c(.25, .5, .75))
+
+# Missing value behaviours
+
+preds_na <- quantile_pred(rbind(pred1, c(1, 2, NA, 4)), 1:4 / 5)
+truth <- c(2.5, 2.5)
+weighted_interval_score_vec(truth, preds_na)
+weighted_interval_score_vec(truth, preds_na, quantile_levels = 1:9 / 10)
+expect_error(weighted_interval_score_vec(
+  truth,
+  preds_na,
+  quantile_levels = 1:9 / 10,
+  quantile_estimate_nas = "drop"
+))
+weighted_interval_score_vec(
+  truth,
+  preds_na,
+  quantile_levels = c(2, 3) / 5,
+  quantile_estimate_nas = "drop"
+)
+weighted_interval_score_vec(
+  truth, preds_na, na_rm = TRUE, quantile_estimate_nas = "propagate"
+)
+weighted_interval_score_vec(
+  truth, preds_na, quantile_estimate_nas = "propagate"
+)
+
+}

From d2959f53ada37c71129ed4db0842d68ace932005 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Tue, 16 Dec 2025 14:56:01 -0800
Subject: [PATCH 02/17] make R CMD Check run clean

---
 R/quant-weighted_internal_score.R | 19 ++++++++++++++++---
 man/weighted_interval_score.Rd    | 18 ++++++++++++++----
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/R/quant-weighted_internal_score.R b/R/quant-weighted_internal_score.R
index 5279d1b7..0221f5f5 100644
--- a/R/quant-weighted_internal_score.R
+++ b/R/quant-weighted_internal_score.R
@@ -7,8 +7,13 @@
 #' al. (2020)](https://arxiv.org/abs/2005.12881) for discussion in the context
 #' of COVID-19 forecasting.
 #'
-#' @param x A vector of class `quantile_pred`.
-#' @param actual double. Actual value(s)
+#' @param data A `data.frame` containing the columns specified by the `truth`
+#' and `estimate` arguments.
+#'
+#' @param truth double. Actual value(s)
+#'
+#' @param estimate A vector of class `quantile_pred`.
+#'
 #' @param quantile_levels probabilities. If specified, the score will be
 #'   computed at this set of levels. Otherwise, those present in `x` will be
 #'   used. If `quantile_levels` do not exactly match those available in `x`,
@@ -38,12 +43,20 @@
 #'   explicit (values of `NA` present in `x`), will be ignored (dropped) in the
 #'   calculation of the summary score. If `FALSE` (the default), any `NA`s will
 #'   result in the summary being `NA`.
+#'
+#' @param case_weights The optional column identifier for case weights. This
+#' should be an unquoted column name that evaluates to a numeric column in
+#' `data`. For `_vec()` functions, a numeric vector,
+#' [hardhat::importance_weights()], or [hardhat::frequency_weights()].
+#'
 #' @param ... not used
 #'
 #' @return a vector of nonnegative scores.
 #'
 #' @export
 #' @examples
+#' library(hardhat)
+#'
 #' quantile_levels <- c(.2, .4, .6, .8)
 #' pred1 <- 1:4
 #' pred2 <- 8:11
@@ -58,7 +71,7 @@
 #' truth <- c(2.5, 2.5)
 #' weighted_interval_score_vec(truth, preds_na)
 #' weighted_interval_score_vec(truth, preds_na, quantile_levels = 1:9 / 10)
-#' expect_error(weighted_interval_score_vec(
+#' try(weighted_interval_score_vec(
 #'   truth,
 #'   preds_na,
 #'   quantile_levels = 1:9 / 10,
diff --git a/man/weighted_interval_score.Rd b/man/weighted_interval_score.Rd
index a8dc6b95..19312301 100644
--- a/man/weighted_interval_score.Rd
+++ b/man/weighted_interval_score.Rd
@@ -18,8 +18,15 @@ weighted_interval_score_vec(
 )
 }
 \arguments{
+\item{data}{A \code{data.frame} containing the columns specified by the \code{truth}
+and \code{estimate} arguments.}
+
 \item{...}{not used}
 
+\item{truth}{double. Actual value(s)}
+
+\item{estimate}{A vector of class \code{quantile_pred}.}
+
 \item{quantile_levels}{probabilities. If specified, the score will be
 computed at this set of levels. Otherwise, those present in \code{x} will be
 used. If \code{quantile_levels} do not exactly match those available in \code{x},
@@ -54,9 +61,10 @@ element of \code{x} having a score of \code{NA}. If \code{na_rm = TRUE}, then th
 be removed before averaging.
 }}
 
-\item{x}{A vector of class \code{quantile_pred}.}
-
-\item{actual}{double. Actual value(s)}
+\item{case_weights}{The optional column identifier for case weights. This
+should be an unquoted column name that evaluates to a numeric column in
+\code{data}. For \verb{_vec()} functions, a numeric vector,
+\code{\link[hardhat:importance_weights]{hardhat::importance_weights()}}, or \code{\link[hardhat:frequency_weights]{hardhat::frequency_weights()}}.}
 }
 \value{
 a vector of nonnegative scores.
@@ -69,6 +77,8 @@ generalization of absolute error. For example, see \href{https://arxiv.org/abs/2
 of COVID-19 forecasting.
 }
 \examples{
+library(hardhat)
+
 quantile_levels <- c(.2, .4, .6, .8)
 pred1 <- 1:4
 pred2 <- 8:11
@@ -83,7 +93,7 @@ preds_na <- quantile_pred(rbind(pred1, c(1, 2, NA, 4)), 1:4 / 5)
 truth <- c(2.5, 2.5)
 weighted_interval_score_vec(truth, preds_na)
 weighted_interval_score_vec(truth, preds_na, quantile_levels = 1:9 / 10)
-expect_error(weighted_interval_score_vec(
+try(weighted_interval_score_vec(
   truth,
   preds_na,
   quantile_levels = 1:9 / 10,

From 8bfc2f195a394d8e0ec2c194b77817b9a970ab8d Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Tue, 16 Dec 2025 15:07:47 -0800
Subject: [PATCH 03/17] apply new_numeric_metric to weighted_interval_score
 itself

---
 R/quant-weighted_internal_score.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/quant-weighted_internal_score.R b/R/quant-weighted_internal_score.R
index 0221f5f5..62f77061 100644
--- a/R/quant-weighted_internal_score.R
+++ b/R/quant-weighted_internal_score.R
@@ -94,7 +94,7 @@ weighted_interval_score <- function(data, ...) {
   UseMethod("weighted_interval_score")
 }
 weighted_interval_score <- new_numeric_metric(
-  mae,
+  weighted_interval_score,
   direction = "minimize"
 )
 

From b7990f4ec4c4f265d90ba3cc61af4495ae1e556b Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Tue, 16 Dec 2025 15:07:57 -0800
Subject: [PATCH 04/17] add pkgdown

---
 _pkgdown.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/_pkgdown.yml b/_pkgdown.yml
index 2d123afa..2415c8e6 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -94,6 +94,10 @@ reference:
   contents:
   - roc_curve_survival
 
+- title: Quantile Metrics
+  contents:
+  - weighted_interval_score
+
 - title: Curve Functions
   contents:
   - roc_curve

From 36e5fc03a972ddb7e1ae6d37152c8bbc4a9e79e1 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Tue, 16 Dec 2025 15:08:06 -0800
Subject: [PATCH 05/17] align documentation

---
 R/quant-weighted_internal_score.R | 45 ++++++++++++++++++-------------
 man/weighted_interval_score.Rd    | 34 ++++++++++++++---------
 2 files changed, 48 insertions(+), 31 deletions(-)

diff --git a/R/quant-weighted_internal_score.R b/R/quant-weighted_internal_score.R
index 62f77061..facf9ad3 100644
--- a/R/quant-weighted_internal_score.R
+++ b/R/quant-weighted_internal_score.R
@@ -1,24 +1,32 @@
 #' Compute weighted interval score
 #'
-#' Weighted interval score (WIS), a well-known quantile-based
-#' approximation of the commonly-used continuous ranked probability score
-#' (CRPS). WIS is a proper score, and can be thought of as a distributional
-#' generalization of absolute error. For example, see [Bracher et
-#' al. (2020)](https://arxiv.org/abs/2005.12881) for discussion in the context
-#' of COVID-19 forecasting.
+#' Weighted interval score (WIS), a well-known quantile-based approximation of
+#' the commonly-used continuous ranked probability score (CRPS). WIS is a proper
+#' score, and can be thought of as a distributional generalization of absolute
+#' error. For example, see
+#' [Bracher et al. (2020)](https://arxiv.org/abs/2005.12881) for discussion in
+#' the context of COVID-19 forecasting.
 #'
 #' @param data A `data.frame` containing the columns specified by the `truth`
-#' and `estimate` arguments.
+#'   and `estimate` arguments.
 #'
-#' @param truth double. Actual value(s)
+#' @param truth The column identifier for the true class results
+#'   (that is a `numeric`). This should be an unquoted column name although
+#'   this argument is passed by expression and supports
+#'   [quasiquotation][rlang::quasiquotation] (you can unquote column
+#'   names). For `_vec()` functions, a `factor` vector.
 #'
-#' @param estimate A vector of class `quantile_pred`.
+#' @param estimate The column identifier for the predicted class results
+#'   (that is also `quantile_pred`). As with `truth` this can be specified
+#'   different ways but the primary method is to use an unquoted variable name.
+#'   For `_vec()` functions, a `quantile_pred` vector.
 #'
 #' @param quantile_levels probabilities. If specified, the score will be
 #'   computed at this set of levels. Otherwise, those present in `x` will be
 #'   used. If `quantile_levels` do not exactly match those available in `x`,
-#'   then some quantiles will have implicit missingness. Handling of these
-#'   is determined by `quantile_estimate_nas`.
+#'   then some quantiles will have implicit missingness. Handling of these is
+#'   determined by `quantile_estimate_nas`.
+#'
 #' @param quantile_estimate_nas character. This argument applies only to `x`.
 #'   It handles imputation of individual `quantile_levels` that are necessary to
 #'   compute a score. Because each element of `x` is a [hardhat::quantile_pred],
@@ -39,15 +47,16 @@
 #'   * For `"propagate"`, any missing value predictions will result in that
 #'   element of `x` having a score of `NA`. If `na_rm = TRUE`, then these will
 #'   be removed before averaging.
-#' @param na_rm logical. If `TRUE`, missing values in `actual` or both implicit and
-#'   explicit (values of `NA` present in `x`), will be ignored (dropped) in the
-#'   calculation of the summary score. If `FALSE` (the default), any `NA`s will
-#'   result in the summary being `NA`.
+#'
+#' @param na_rm logical. If `TRUE`, missing values in `actual` or both implicit
+#'   and explicit (values of `NA` present in `x`), will be ignored (dropped) in
+#'   the calculation of the summary score. If `FALSE` (the default), any `NA`s
+#'   will result in the summary being `NA`.
 #'
 #' @param case_weights The optional column identifier for case weights. This
-#' should be an unquoted column name that evaluates to a numeric column in
-#' `data`. For `_vec()` functions, a numeric vector,
-#' [hardhat::importance_weights()], or [hardhat::frequency_weights()].
+#'   should be an unquoted column name that evaluates to a numeric column in
+#'   `data`. For `_vec()` functions, a numeric vector,
+#'   [hardhat::importance_weights()], or [hardhat::frequency_weights()].
 #'
 #' @param ... not used
 #'
diff --git a/man/weighted_interval_score.Rd b/man/weighted_interval_score.Rd
index 19312301..ff5c3695 100644
--- a/man/weighted_interval_score.Rd
+++ b/man/weighted_interval_score.Rd
@@ -23,20 +23,27 @@ and \code{estimate} arguments.}
 
 \item{...}{not used}
 
-\item{truth}{double. Actual value(s)}
+\item{truth}{The column identifier for the true class results
+(that is a \code{numeric}). This should be an unquoted column name although
+this argument is passed by expression and supports
+\link[rlang:topic-inject]{quasiquotation} (you can unquote column
+names). For \verb{_vec()} functions, a \code{factor} vector.}
 
-\item{estimate}{A vector of class \code{quantile_pred}.}
+\item{estimate}{The column identifier for the predicted class results
+(that is also \code{quantile_pred}). As with \code{truth} this can be specified
+different ways but the primary method is to use an unquoted variable name.
+For \verb{_vec()} functions, a \code{quantile_pred} vector.}
 
 \item{quantile_levels}{probabilities. If specified, the score will be
 computed at this set of levels. Otherwise, those present in \code{x} will be
 used. If \code{quantile_levels} do not exactly match those available in \code{x},
-then some quantiles will have implicit missingness. Handling of these
-is determined by \code{quantile_estimate_nas}.}
+then some quantiles will have implicit missingness. Handling of these is
+determined by \code{quantile_estimate_nas}.}
 
-\item{na_rm}{logical. If \code{TRUE}, missing values in \code{actual} or both implicit and
-explicit (values of \code{NA} present in \code{x}), will be ignored (dropped) in the
-calculation of the summary score. If \code{FALSE} (the default), any \code{NA}s will
-result in the summary being \code{NA}.}
+\item{na_rm}{logical. If \code{TRUE}, missing values in \code{actual} or both implicit
+and explicit (values of \code{NA} present in \code{x}), will be ignored (dropped) in
+the calculation of the summary score. If \code{FALSE} (the default), any \code{NA}s
+will result in the summary being \code{NA}.}
 
 \item{quantile_estimate_nas}{character. This argument applies only to \code{x}.
 It handles imputation of individual \code{quantile_levels} that are necessary to
@@ -70,11 +77,12 @@ should be an unquoted column name that evaluates to a numeric column in
 a vector of nonnegative scores.
 }
 \description{
-Weighted interval score (WIS), a well-known quantile-based
-approximation of the commonly-used continuous ranked probability score
-(CRPS). WIS is a proper score, and can be thought of as a distributional
-generalization of absolute error. For example, see \href{https://arxiv.org/abs/2005.12881}{Bracher et al. (2020)} for discussion in the context
-of COVID-19 forecasting.
+Weighted interval score (WIS), a well-known quantile-based approximation of
+the commonly-used continuous ranked probability score (CRPS). WIS is a proper
+score, and can be thought of as a distributional generalization of absolute
+error. For example, see
+\href{https://arxiv.org/abs/2005.12881}{Bracher et al. (2020)} for discussion in
+the context of COVID-19 forecasting.
 }
 \examples{
 library(hardhat)

From 2222f7f0af37f21c7243a2eb96a36c26106987d3 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Tue, 16 Dec 2025 15:27:32 -0800
Subject: [PATCH 06/17] add remotes hardhat

---
 DESCRIPTION | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/DESCRIPTION b/DESCRIPTION
index 9c488069..688dc76a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -126,3 +126,5 @@ Collate:
     'template.R'
     'validation.R'
     'yardstick-package.R'
+Remotes:
+    tidymodels/hardhat
\ No newline at end of file

From 0435183cbc0288e4dbbbcce27103e501660196b6 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Wed, 17 Dec 2025 14:28:35 -0800
Subject: [PATCH 07/17] add missing newline

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 688dc76a..ff661dfd 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -127,4 +127,4 @@ Collate:
     'validation.R'
     'yardstick-package.R'
 Remotes:
-    tidymodels/hardhat
\ No newline at end of file
+    tidymodels/hardhat

From 70f1dfb1a56456848db3e480ba0f8a4d95862c84 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Wed, 17 Dec 2025 14:29:15 -0800
Subject: [PATCH 08/17] correctly document truth to be numeric in
 weighted_interval_score()

---
 R/quant-weighted_internal_score.R | 2 +-
 man/weighted_interval_score.Rd    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/quant-weighted_internal_score.R b/R/quant-weighted_internal_score.R
index facf9ad3..a10bee5d 100644
--- a/R/quant-weighted_internal_score.R
+++ b/R/quant-weighted_internal_score.R
@@ -14,7 +14,7 @@
 #'   (that is a `numeric`). This should be an unquoted column name although
 #'   this argument is passed by expression and supports
 #'   [quasiquotation][rlang::quasiquotation] (you can unquote column
-#'   names). For `_vec()` functions, a `factor` vector.
+#'   names). For `_vec()` functions, a `numeric` vector.
 #'
 #' @param estimate The column identifier for the predicted class results
 #'   (that is also `quantile_pred`). As with `truth` this can be specified
diff --git a/man/weighted_interval_score.Rd b/man/weighted_interval_score.Rd
index ff5c3695..09f3ad39 100644
--- a/man/weighted_interval_score.Rd
+++ b/man/weighted_interval_score.Rd
@@ -27,7 +27,7 @@ and \code{estimate} arguments.}
 (that is a \code{numeric}). This should be an unquoted column name although
 this argument is passed by expression and supports
 \link[rlang:topic-inject]{quasiquotation} (you can unquote column
-names). For \verb{_vec()} functions, a \code{factor} vector.}
+names). For \verb{_vec()} functions, a \code{numeric} vector.}
 
 \item{estimate}{The column identifier for the predicted class results
 (that is also \code{quantile_pred}). As with \code{truth} this can be specified

From db031c6733b57e3f4db75d8837dac527fd435b79 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Wed, 17 Dec 2025 14:31:11 -0800
Subject: [PATCH 09/17] add quantile metric infrastructure

---
 NAMESPACE                         |   2 +
 R/aaa-metrics.R                   |  54 ++++++++++++-
 R/aaa-new.R                       |   7 ++
 R/quant-weighted_internal_score.R |   2 +-
 R/template.R                      | 123 ++++++++++++++++++++++++++----
 man/metric-summarizers.Rd         |  42 +++++++---
 man/new-metric.Rd                 |   3 +
 vignettes/metric-types.Rmd        |  10 +++
 8 files changed, 216 insertions(+), 27 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 79c42a25..645afe8e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -214,6 +214,7 @@ export(new_linear_pred_survival_metric)
 export(new_numeric_metric)
 export(new_ordered_prob_metric)
 export(new_prob_metric)
+export(new_quantile_metric)
 export(new_static_survival_metric)
 export(npv)
 export(npv_vec)
@@ -229,6 +230,7 @@ export(pr_curve)
 export(precision)
 export(precision_vec)
 export(prob_metric_summarizer)
+export(quantile_metric_summarizer)
 export(ranked_prob_score)
 export(ranked_prob_score_vec)
 export(recall)
diff --git a/R/aaa-metrics.R b/R/aaa-metrics.R
index add35de0..4d48d7c3 100644
--- a/R/aaa-metrics.R
+++ b/R/aaa-metrics.R
@@ -267,6 +267,8 @@ metric_set <- function(...) {
     fn_cls %in% c("prob_metric", "class_metric", "ordered_prob_metric")
   ) {
     make_prob_class_metric_function(fns)
+  } else if (fn_cls == "quantile_metric") {
+    make_quantile_metric_function(fns)
   } else if (
     fn_cls %in%
       c(
@@ -663,6 +665,55 @@ make_survival_metric_function <- function(fns) {
   metric_function
 }
 
+make_quantile_metric_function <- function(fns) {
+  metric_function <- function(
+    data,
+    truth,
+    estimate,
+    na_rm = TRUE,
+    case_weights = NULL,
+    ...
+  ) {
+    # Construct common argument set for each metric call
+    # Doing this dynamically inside the generated function means
+    # we capture the correct arguments
+    call_args <- quos(
+      data = data,
+      truth = !!enquo(truth),
+      estimate = !!enquo(estimate),
+      na_rm = na_rm,
+      case_weights = !!enquo(case_weights),
+      ... = ...
+    )
+
+    # Construct calls from the functions + arguments
+    calls <- lapply(fns, call2, !!!call_args)
+
+    calls <- mapply(call_remove_static_arguments, calls, fns)
+
+    # Evaluate
+    metric_list <- mapply(
+      FUN = eval_safely,
+      calls, # .x
+      names(calls), # .y
+      SIMPLIFY = FALSE,
+      USE.NAMES = FALSE
+    )
+
+    dplyr::bind_rows(metric_list)
+  }
+
+  class(metric_function) <- c(
+    "quantile_metric_set",
+    "metric_set",
+    class(metric_function)
+  )
+
+  attr(metric_function, "metrics") <- fns
+
+  metric_function
+}
+
 validate_not_empty <- function(x, call = caller_env()) {
   if (is_empty(x)) {
     cli::cli_abort(
@@ -705,7 +756,8 @@ validate_function_class <- function(fns) {
     "dynamic_survival_metric",
     "static_survival_metric",
     "integrated_survival_metric",
-    "linear_pred_survival_metric"
+    "linear_pred_survival_metric",
+    "quantile_metric"
   )
 
   if (n_unique == 1L) {
diff --git a/R/aaa-new.R b/R/aaa-new.R
index 0e4c4ec7..0151382b 100644
--- a/R/aaa-new.R
+++ b/R/aaa-new.R
@@ -76,6 +76,12 @@ new_linear_pred_survival_metric <- function(fn, direction) {
   new_metric(fn, direction, class = "linear_pred_survival_metric")
 }
 
+#' @rdname new-metric
+#' @export
+new_quantile_metric <- function(fn, direction) {
+  new_metric(fn, direction, class = "quantile_metric")
+}
+
 #' @include import-standalone-types-check.R
 new_metric <- function(fn, direction, class = NULL, call = caller_env()) {
   check_function(fn, call = call)
@@ -128,6 +134,7 @@ format.metric <- function(x, ...) {
       "static_survival_metric" = "static survival metric",
       "integrated_survival_metric" = "integrated survival metric",
       "linear_pred_survival_metric" = "linear predictor survival metric",
+      "quantile_metric" = "quantile metric",
       "metric"
     )
 
diff --git a/R/quant-weighted_internal_score.R b/R/quant-weighted_internal_score.R
index a10bee5d..c622cc61 100644
--- a/R/quant-weighted_internal_score.R
+++ b/R/quant-weighted_internal_score.R
@@ -102,7 +102,7 @@
 weighted_interval_score <- function(data, ...) {
   UseMethod("weighted_interval_score")
 }
-weighted_interval_score <- new_numeric_metric(
+weighted_interval_score <- new_quantile_metric(
   weighted_interval_score,
   direction = "minimize"
 )
diff --git a/R/template.R b/R/template.R
index 5b54ea59..d63c533c 100644
--- a/R/template.R
+++ b/R/template.R
@@ -2,22 +2,24 @@
 #'
 #' `numeric_metric_summarizer()`, `class_metric_summarizer()`,
 #' `prob_metric_summarizer()`, `curve_metric_summarizer()`,
-#' `dynamic_survival_metric_summarizer()`, and
-#' `static_survival_metric_summarizer()` are useful alongside [check_metric] and
-#' [yardstick_remove_missing] for implementing new custom metrics. These
-#' functions call the metric function inside `dplyr::summarise()` or
-#' `dplyr::reframe()` for `curve_metric_summarizer()`. See [Custom performance
-#' metrics](https://www.tidymodels.org/learn/develop/metrics/) for more
-#' information.
+#' `dynamic_survival_metric_summarizer()`,
+#' `static_survival_metric_summarizer()`, and `quantile_metric_summarizer()` are
+#' useful alongside [check_metric] and [yardstick_remove_missing] for
+#' implementing new custom metrics. These functions call the metric function
+#' inside `dplyr::summarise()` or `dplyr::reframe()` for
+#' `curve_metric_summarizer()`. See
+#' [Custom performance metrics](https://www.tidymodels.org/learn/develop/metrics/)
+#' for more information.
 #'
 #' @details
 #'
 #' `numeric_metric_summarizer()`, `class_metric_summarizer()`,
 #' `prob_metric_summarizer()`, `curve_metric_summarizer()`,
-#' `dynamic_survival_metric_summarizer()`, and
-#' `dynamic_survival_metric_summarizer()` are generally called from the data
-#' frame version of your metric function. It knows how to call your metric over
-#' grouped data frames and returns a `tibble` consistent with other metrics.
+#' `dynamic_survival_metric_summarizer()`,
+#' `static_survival_metric_summarizer()`, and `quantile_metric_summarizer()`
+#' are generally called from the data frame version of your metric function. It
+#' knows how to call your metric over grouped data frames and returns a `tibble`
+#' consistent with other metrics.
 #'
 #' @inheritParams rlang::args_dots_empty
 #' @inheritParams rlang::args_error_context
@@ -34,8 +36,9 @@
 #'   the data frame version of your metric function that called
 #'   `numeric_metric_summarizer()`, `class_metric_summarizer()`,
 #'   `prob_metric_summarizer()`, `curve_metric_summarizer()`,
-#'   `dynamic_survival_metric_summarizer()`, or
-#'   `static_survival_metric_summarizer()`.
+#'   `dynamic_survival_metric_summarizer()`,
+#'   `static_survival_metric_summarizer()`, or
+#'   `quantile_metric_summarizer()`.
 #'
 #' @param truth The unquoted column name corresponding to the `truth` column.
 #'
@@ -918,6 +921,100 @@ linear_pred_survival_metric_summarizer <- function(
   out
 }
 
+#' @rdname metric-summarizers
+#' @export
+quantile_metric_summarizer <- function(
+  name,
+  fn,
+  data,
+  truth,
+  estimate,
+  ...,
+  na_rm = TRUE,
+  case_weights = NULL,
+  fn_options = list(),
+  error_call = caller_env()
+) {
+  check_dots_empty(call = error_call)
+
+  truth <- enquo(truth)
+  estimate <- enquo(estimate)
+  case_weights <- enquo(case_weights)
+
+  truth <- yardstick_eval_select(
+    expr = truth,
+    data = data,
+    arg = "truth",
+    error_call = error_call
+  )
+  estimate <- yardstick_eval_select(
+    expr = estimate,
+    data = data,
+    arg = "estimate",
+    error_call = error_call
+  )
+
+  if (quo_is_null(case_weights)) {
+    group_case_weights <- NULL
+  } else {
+    case_weights <- yardstick_eval_select(
+      expr = case_weights,
+      data = data,
+      arg = "case_weights",
+      error_call = error_call
+    )
+  }
+
+  group_rows <- dplyr::group_rows(data)
+  group_keys <- dplyr::group_keys(data)
+  data <- dplyr::ungroup(data)
+  groups <- vec_chop(data, indices = group_rows)
+  out <- vector("list", length = length(groups))
+
+  for (i in seq_along(groups)) {
+    group <- groups[[i]]
+
+    group_truth <- group[[truth]]
+    group_estimate <- group[[estimate]]
+
+    if (is_string(case_weights)) {
+      group_case_weights <- group[[case_weights]]
+    }
+
+    elt_out <- list(
+      .metric = name,
+      .estimator = finalize_estimator(
+        group_truth,
+        metric_class = name,
+        call = error_call
+      ),
+      .estimate = inject(
+        withCallingHandlers(
+          fn(
+            truth = group_truth,
+            estimate = group_estimate,
+            case_weights = group_case_weights,
+            na_rm = na_rm,
+            !!!fn_options
+          ),
+          error = function(cnd) {
+            cnd$call <- error_call
+            cnd_signal(cnd)
+          }
+        )
+      )
+    )
+
+    out[[i]] <- tibble::new_tibble(elt_out)
+  }
+
+  group_keys <- vec_rep_each(group_keys, times = list_sizes(out))
+  out <- vec_rbind(!!!out)
+  out <- vec_cbind(group_keys, out)
+
+  out
+}
+
 prob_estimate_convert <- function(estimate) {
   check_data_frame(estimate, .internal = TRUE)
 
diff --git a/man/metric-summarizers.Rd b/man/metric-summarizers.Rd
index eb2f7149..ea513087 100644
--- a/man/metric-summarizers.Rd
+++ b/man/metric-summarizers.Rd
@@ -11,6 +11,7 @@
 \alias{static_survival_metric_summarizer}
 \alias{curve_survival_metric_summarizer}
 \alias{linear_pred_survival_metric_summarizer}
+\alias{quantile_metric_summarizer}
 \title{Developer function for summarizing new metrics}
 \usage{
 numeric_metric_summarizer(
@@ -132,6 +133,19 @@ linear_pred_survival_metric_summarizer(
   fn_options = list(),
   error_call = caller_env()
 )
+
+quantile_metric_summarizer(
+  name,
+  fn,
+  data,
+  truth,
+  estimate,
+  ...,
+  na_rm = TRUE,
+  case_weights = NULL,
+  fn_options = list(),
+  error_call = caller_env()
+)
 }
 \arguments{
 \item{name}{A single character representing the name of the metric to
@@ -146,8 +160,9 @@ needed to calculate the metric.}
 the data frame version of your metric function that called
 \code{numeric_metric_summarizer()}, \code{class_metric_summarizer()},
 \code{prob_metric_summarizer()}, \code{curve_metric_summarizer()},
-\code{dynamic_survival_metric_summarizer()}, or
-\code{static_survival_metric_summarizer()}.}
+\code{dynamic_survival_metric_summarizer()},
+\code{static_survival_metric_summarizer()}, or
+\code{quantile_metric_summarizer()}.}
 
 \item{truth}{The unquoted column name corresponding to the \code{truth} column.}
 
@@ -186,20 +201,23 @@ to pass along describing which level should be considered the "event".}
 \description{
 \code{numeric_metric_summarizer()}, \code{class_metric_summarizer()},
 \code{prob_metric_summarizer()}, \code{curve_metric_summarizer()},
-\code{dynamic_survival_metric_summarizer()}, and
-\code{static_survival_metric_summarizer()} are useful alongside \link{check_metric} and
-\link{yardstick_remove_missing} for implementing new custom metrics. These
-functions call the metric function inside \code{dplyr::summarise()} or
-\code{dplyr::reframe()} for \code{curve_metric_summarizer()}. See \href{https://www.tidymodels.org/learn/develop/metrics/}{Custom performance metrics} for more
-information.
+\code{dynamic_survival_metric_summarizer()},
+\code{static_survival_metric_summarizer()}, and \code{quantile_metric_summarizer()} are
+useful alongside \link{check_metric} and \link{yardstick_remove_missing} for
+implementing new custom metrics. These functions call the metric function
+inside \code{dplyr::summarise()} or \code{dplyr::reframe()} for
+\code{curve_metric_summarizer()}. See
+\href{https://www.tidymodels.org/learn/develop/metrics/}{Custom performance metrics}
+for more information.
 }
 \details{
 \code{numeric_metric_summarizer()}, \code{class_metric_summarizer()},
 \code{prob_metric_summarizer()}, \code{curve_metric_summarizer()},
-\code{dynamic_survival_metric_summarizer()}, and
-\code{dynamic_survival_metric_summarizer()} are generally called from the data
-frame version of your metric function. It knows how to call your metric over
-grouped data frames and returns a \code{tibble} consistent with other metrics.
+\code{dynamic_survival_metric_summarizer()},
+\code{static_survival_metric_summarizer()}, and \code{quantile_metric_summarizer()}
+are generally called from the data frame version of your metric function. It
+knows how to call your metric over grouped data frames and returns a \code{tibble}
+consistent with other metrics.
 }
 \seealso{
 \link{check_metric} \link{yardstick_remove_missing} \code{\link[=finalize_estimator]{finalize_estimator()}} \code{\link[=dots_to_estimate]{dots_to_estimate()}}
diff --git a/man/new-metric.Rd b/man/new-metric.Rd
index ce027523..8085afb7 100644
--- a/man/new-metric.Rd
+++ b/man/new-metric.Rd
@@ -10,6 +10,7 @@
 \alias{new_integrated_survival_metric}
 \alias{new_static_survival_metric}
 \alias{new_linear_pred_survival_metric}
+\alias{new_quantile_metric}
 \title{Construct a new metric function}
 \usage{
 new_class_metric(fn, direction)
@@ -27,6 +28,8 @@ new_integrated_survival_metric(fn, direction)
 new_static_survival_metric(fn, direction)
 
 new_linear_pred_survival_metric(fn, direction)
+
+new_quantile_metric(fn, direction)
 }
 \arguments{
 \item{fn}{A function. The metric function to attach a metric-specific class
diff --git a/vignettes/metric-types.Rmd b/vignettes/metric-types.Rmd
index 63177d7a..f8bc8e68 100644
--- a/vignettes/metric-types.Rmd
+++ b/vignettes/metric-types.Rmd
@@ -69,6 +69,12 @@ metrics, along with the types of the inputs they take.
 
     - `estimate` - numeric
 
+8) **Quantile metrics**
+
+    - `truth` - numeric
+
+    - `estimate` - quantile_pred
+
 ## Example
 
 In the following example, the `hpc_cv` data set is used. It contains class
@@ -151,6 +157,10 @@ all_metrics <- bind_rows(
   tibble(
     type = "static survival",
     metric = get_metrics(fns, "static_survival_metric")
+  ),
+  tibble(
+    type = "quantile",
+    metric = get_metrics(fns, "quantile_metric")
   )
 )
 

From c2049dbe644c242de3799d81893a3bee20a44976 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Wed, 17 Dec 2025 14:31:16 -0800
Subject: [PATCH 10/17] update news

---
 NEWS.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 78690207..9b7bb30d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -8,6 +8,10 @@
 
 * Added infrastructure for survival metrics on the linear predictor. (#551)
 
+* Added infrastructure for quantile metrics. (#569)
+
+* Added quantile metric `weighted_interval_score()`. (#569)
+
 # yardstick 1.3.2
 
 * All messages, warnings and errors has been translated to use {cli} package (#517, #522).

From 3080c95a413b4e736ceec9571501e02aab295c84 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Wed, 17 Dec 2025 15:44:29 -0800
Subject: [PATCH 11/17] update check_quantile_metric() to work correctly with
 case_weights

---
 R/check-metric.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/check-metric.R b/R/check-metric.R
index 5ae45197..6f84ba34 100644
--- a/R/check-metric.R
+++ b/R/check-metric.R
@@ -144,5 +144,5 @@ check_quantile_metric <- function(
   call = caller_env()
 ) {
   validate_numeric_truth_quantile_estimate(truth, estimate, call = call)
-  validate_case_weights(case_weights, size = nrow(truth), call = call)
+  validate_case_weights(case_weights, size = length(truth), call = call)
 }

From e8bfbe4606fb433284a12fac84dab2a02ecb2d72 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Wed, 17 Dec 2025 15:44:38 -0800
Subject: [PATCH 12/17] test quantile_metric_summarizer()

---
 tests/testthat/_snaps/template.md |  31 ++++
 tests/testthat/test-template.R    | 230 ++++++++++++++++++++++++++++++
 2 files changed, 261 insertions(+)

diff --git a/tests/testthat/_snaps/template.md b/tests/testthat/_snaps/template.md
index 574e4b68..b8712cc2 100644
--- a/tests/testthat/_snaps/template.md
+++ b/tests/testthat/_snaps/template.md
@@ -116,6 +116,37 @@
       x This metric doesn't use the `estimate` argument.
       i Specify the columns without `estimate = `.
 
+# quantile_metric_summarizer()'s errors when wrong things are passes
+
+    Code
+      quantile_metric_summarizer(name = "weighted_interval_score", fn = weighted_interval_score_vec,
+        data = example, truth = not_a_real_column_name, estimate = preds)
+    Condition
+      Error:
+      ! Can't select columns that don't exist.
+      x Column `not_a_real_column_name` doesn't exist.
+
+---
+
+    Code
+      quantile_metric_summarizer(name = "weighted_interval_score", fn = weighted_interval_score_vec,
+        data = example, truth = truth, estimate = not_a_real_column_name)
+    Condition
+      Error:
+      ! Can't select columns that don't exist.
+      x Column `not_a_real_column_name` doesn't exist.
+
+---
+
+    Code
+      quantile_metric_summarizer(name = "weighted_interval_score", fn = weighted_interval_score_vec,
+        data = example, truth = truth, estimate = preds, obviouslywrong = TRUE)
+    Condition
+      Error:
+      ! `...` must be empty.
+      x Problematic argument:
+      * obviouslywrong = TRUE
+
 # curve_metric_summarizer()'s na_rm argument work
 
     Code
diff --git a/tests/testthat/test-template.R b/tests/testthat/test-template.R
index e9d8d650..e69961a0 100644
--- a/tests/testthat/test-template.R
+++ b/tests/testthat/test-template.R
@@ -789,6 +789,236 @@ test_that("prob_metric_summarizer() handles column name collisions", {
   expect_identical(roc_auc_res, roc_auc_exp)
 })
 
+## quantile_metric_summarizer --------------------------------------------------
+
+test_that("quantile_metric_summarizer() works as expected", {
+  example <- dplyr::tibble(
+    preds = hardhat::quantile_pred(rbind(1:4, 8:11), c(0.2, 0.4, 0.6, 0.8)),
+    truth = c(3.3, 7.1)
+  )
+
+  wis_res <- quantile_metric_summarizer(
+    name = "weighted_interval_score",
+    fn = weighted_interval_score_vec,
+    data = example,
+    truth = truth,
+    estimate = preds,
+    na_rm = TRUE,
+    case_weights = NULL
+  )
+
+  wis_exp <- dplyr::tibble(
+    .metric = "weighted_interval_score",
+    .estimator = "standard",
+    .estimate = weighted_interval_score_vec(example$truth, example$preds)
+  )
+
+  expect_identical(wis_res, wis_exp)
+})
+
+test_that("quantile_metric_summarizer() works with grouped input", {
+  example <- dplyr::tibble(
+    preds = hardhat::quantile_pred(rbind(1:4, 8:11), c(0.2, 0.4, 0.6, 0.8)),
+    truth = c(3.3, 7.1),
+    group = c(1, 2)
+  )
+
+  wis_res <- quantile_metric_summarizer(
+    name = "weighted_interval_score",
+    fn = weighted_interval_score_vec,
+    data = dplyr::group_by(example, group),
+    truth = truth,
+    estimate = preds,
+    na_rm = TRUE,
+    case_weights = NULL
+  )
+
+  example_split <- vctrs::vec_split(example, example$group)
+
+  wis_exp <- dplyr::tibble(
+    group = example_split$key,
+    .metric = "weighted_interval_score",
+    .estimator = "standard",
+    .estimate = vapply(
+      example_split$val,
+      function(x) weighted_interval_score_vec(x$truth, x$preds),
+      FUN.VALUE = numeric(1)
+    )
+  )
+
+  expect_identical(wis_res, wis_exp)
+})
+
+test_that("quantile_metric_summarizer()'s na_rm argument work", {
+  example <- dplyr::tibble(
+    preds = hardhat::quantile_pred(rbind(1:4, 8:11), c(0.2, 0.4, 0.6, 0.8)),
+    truth = c(3.3, NA)
+  )
+
+  wis_res <- quantile_metric_summarizer(
+    name = "weighted_interval_score",
+    fn = weighted_interval_score_vec,
+    data = example,
+    truth = truth,
+    estimate = preds,
+    na_rm = TRUE,
+    case_weights = NULL
+  )
+
+  wis_exp <- dplyr::tibble(
+    .metric = "weighted_interval_score",
+    .estimator = "standard",
+    .estimate = weighted_interval_score_vec(
+      example$truth[-2],
+      example$preds[-2]
+    )
+  )
+
+  expect_identical(wis_res, wis_exp)
+
+  wis_res <- quantile_metric_summarizer(
+    name = "weighted_interval_score",
+    fn = weighted_interval_score_vec,
+    data = example,
+    truth = truth,
+    estimate = preds,
+    na_rm = FALSE,
+    case_weights = NULL
+  )
+
+  wis_exp <- dplyr::tibble(
+    .metric = "weighted_interval_score",
+    .estimator = "standard",
+    .estimate = na_dbl
+  )
+
+  expect_identical(wis_res, wis_exp)
+})
+
+test_that("quantile_metric_summarizer()'s case_weights argument work", {
+  example <- dplyr::tibble(
+    preds = hardhat::quantile_pred(rbind(1:4, 8:11), c(0.2, 0.4, 0.6, 0.8)),
+    truth = c(3.3, 7.1),
+    wts = c(1, 2)
+  )
+
+  wis_res <- quantile_metric_summarizer(
+    name = "weighted_interval_score",
+    fn = weighted_interval_score_vec,
+    data = example,
+    truth = truth,
+    estimate = preds,
+    na_rm = TRUE,
+    case_weights = wts
+  )
+
+  wis_exp <- dplyr::tibble(
+    .metric = "weighted_interval_score",
+    .estimator = "standard",
+    .estimate = weighted_interval_score_vec(
+      example$truth,
+      example$preds,
+      case_weights = example$wts
+    )
+  )
+
+  expect_identical(wis_res, wis_exp)
+})
+
+test_that("quantile_metric_summarizer()'s errors when wrong things are passes", {
+  example <- dplyr::tibble(
+    preds = hardhat::quantile_pred(rbind(1:4, 8:11), c(0.2, 0.4, 0.6, 0.8)),
+    truth = c(3.3, 7.1)
+  )
+
+  expect_snapshot(
+    error = TRUE,
+    quantile_metric_summarizer(
+      name = "weighted_interval_score",
+      fn = weighted_interval_score_vec,
+      data = example,
+      truth = not_a_real_column_name,
+      estimate = preds
+    )
+  )
+
+  expect_snapshot(
+    error = TRUE,
+    quantile_metric_summarizer(
+      name = "weighted_interval_score",
+      fn = weighted_interval_score_vec,
+      data = example,
+      truth = truth,
+      estimate = not_a_real_column_name
+    )
+  )
+
+  expect_snapshot(
+    error = TRUE,
+    quantile_metric_summarizer(
+      name = "weighted_interval_score",
+      fn = weighted_interval_score_vec,
+      data = example,
+      truth = truth,
+      estimate = preds,
+      obviouslywrong = TRUE
+    )
+  )
+})
+
+test_that("quantile_metric_summarizer() deals with characters in truth and estimate", {
+  example <- dplyr::tibble(
+    preds = hardhat::quantile_pred(rbind(1:4, 8:11), c(0.2, 0.4, 0.6, 0.8)),
+    truth = c(3.3, 7.1)
+  )
+
+  wis_res <- quantile_metric_summarizer(
+    name = "weighted_interval_score",
+    fn = weighted_interval_score_vec,
+    data = example,
+    truth = "truth",
+    estimate = "preds"
+  )
+
+  wis_exp <- dplyr::tibble(
+    .metric = "weighted_interval_score",
+    .estimator = "standard",
+    .estimate = weighted_interval_score_vec(example$truth, example$preds)
+  )
+
+  expect_identical(wis_res, wis_exp)
+})
+
+test_that("quantile_metric_summarizer() handles column name collisions", {
+  example <- dplyr::tibble(
+    preds = hardhat::quantile_pred(rbind(1:4, 8:11), c(0.2, 0.4, 0.6, 0.8)),
+    truth = c(3.3, 7.1)
+  )
+
+  example$name <- example$truth
+  example$estimator <- example$truth
+  example$event_level <- example$truth
+  example$na_rm <- example$truth
+  example$estimate <- example$truth
+
+  wis_res <- quantile_metric_summarizer(
+    name = "weighted_interval_score",
+    fn = weighted_interval_score_vec,
+    data = example,
+    truth = truth,
+    estimate = preds
+  )
+
+  wis_exp <- dplyr::tibble(
+    .metric = "weighted_interval_score",
+    .estimator = "standard",
+    .estimate = weighted_interval_score_vec(example$truth, example$preds)
+  )
+
+  expect_identical(wis_res, wis_exp)
+})
+
+
 ## curve_metric_summarizer --------------------------------------------------
 
 test_that("curve_metric_summarizer() works as expected", {

From b1b62f5783ff2711e663e4b1f1af8481e804bf04 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Wed, 17 Dec 2025 17:23:55 -0800
Subject: [PATCH 13/17] typo name

---
 tests/testthat/test-validation.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/test-validation.R b/tests/testthat/test-validation.R
index 2e096a26..27ce7834 100644
--- a/tests/testthat/test-validation.R
+++ b/tests/testthat/test-validation.R
@@ -373,7 +373,7 @@ test_that("validate_ordered_truth_matrix_estimate errors as expected for non-bin
   )
 })
 
-test_that("validate_numeric_truth_numeric_estimate errors as expected", {
+test_that("validate_binary_estimator errors as expected", {
   expect_no_error(
     validate_binary_estimator(
       factor(c("a", "b", "a"), levels = c("a", "b", "c")),

From 764b9eb4ae86523e0ba0cfb63b275c75a217d5c1 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Wed, 17 Dec 2025 17:24:18 -0800
Subject: [PATCH 14/17] add tests for quantile infrastructure

---
 tests/testthat/_snaps/check_metric.md |  9 +++++
 tests/testthat/_snaps/validation.md   | 44 +++++++++++++++++-----
 tests/testthat/test-aaa-new.R         |  6 +++
 tests/testthat/test-check_metric.R    | 16 ++++++++
 tests/testthat/test-validation.R      | 54 +++++++++++++++++++++++++++
 5 files changed, 120 insertions(+), 9 deletions(-)

diff --git a/tests/testthat/_snaps/check_metric.md b/tests/testthat/_snaps/check_metric.md
index d0d44040..b48f29d8 100644
--- a/tests/testthat/_snaps/check_metric.md
+++ b/tests/testthat/_snaps/check_metric.md
@@ -93,3 +93,12 @@
       Error:
       ! `truth` (228) and `case_weights` (150) must be the same length.
 
+# check_quantile_metric() validates inputs
+
+    Code
+      check_quantile_metric(truth = as.character(example$truth), estimate = example$
+        preds, case_weights = 1:2)
+    Condition
+      Error:
+      ! `truth` should be a numeric vector, not a character vector.
+
diff --git a/tests/testthat/_snaps/validation.md b/tests/testthat/_snaps/validation.md
index fc8ccb4f..c278e0df 100644
--- a/tests/testthat/_snaps/validation.md
+++ b/tests/testthat/_snaps/validation.md
@@ -38,15 +38,6 @@
       Error:
       ! `truth` (4) and `estimate` (5) must be the same length.
 
----
-
-    Code
-      validate_binary_estimator(factor(c("a", "b", "a"), levels = c("a", "b", "c")),
-      estimator = "binary")
-    Condition
-      Error:
-      ! `estimator` is binary, only two class `truth` factors are allowed. A factor with 3 levels was provided.
-
 # validate_factor_truth_factor_estimate errors as expected
 
     Code
@@ -244,6 +235,15 @@
       Error:
       ! The number of levels in `truth` (2) must match the number of columns supplied in `...` (5).
 
+# validate_binary_estimator errors as expected
+
+    Code
+      validate_binary_estimator(factor(c("a", "b", "a"), levels = c("a", "b", "c")),
+      estimator = "binary")
+    Condition
+      Error:
+      ! `estimator` is binary, only two class `truth` factors are allowed. A factor with 3 levels was provided.
+
 # validate_surv_truth_numeric_estimate errors as expected
 
     Code
@@ -380,3 +380,29 @@
       Error:
       ! `truth` (11) and `case_weights` (10) must be the same length.
 
+# validate_numeric_truth_quantile_estimate errors as expected
+
+    Code
+      validate_numeric_truth_quantile_estimate("4", hardhat::quantile_pred(rbind(1:4),
+      c(0.2, 0.4, 0.6, 0.8)))
+    Condition
+      Error:
+      ! `truth` should be a numeric vector, not a string.
+
+---
+
+    Code
+      validate_numeric_truth_quantile_estimate(3.3, 3.4)
+    Condition
+      Error:
+      ! `estimate` should be a <quantile_pred> object, not a number.
+
+---
+
+    Code
+      validate_numeric_truth_quantile_estimate(c(3.3, 7.1, 1), hardhat::quantile_pred(
+        rbind(1:4, 8:11), c(0.2, 0.4, 0.6, 0.8)))
+    Condition
+      Error:
+      ! `truth` (3) and `estimate` (2) must be the same length.
+
diff --git a/tests/testthat/test-aaa-new.R b/tests/testthat/test-aaa-new.R
index 488cd24c..54f39432 100644
--- a/tests/testthat/test-aaa-new.R
+++ b/tests/testthat/test-aaa-new.R
@@ -6,6 +6,7 @@ test_that("can create metric functions", {
   fn4 <- new_dynamic_survival_metric(function() 1, "minimize")
   fn5 <- new_static_survival_metric(function() 1, "minimize")
   fn6 <- new_integrated_survival_metric(function() 1, "minimize")
+  fn7 <- new_quantile_metric(function() 1, "minimize")
 
   expect_identical(class(fn1), c("class_metric", "metric", "function"))
   expect_identical(class(fn2), c("prob_metric", "metric", "function"))
@@ -23,6 +24,10 @@ test_that("can create metric functions", {
     class(fn6),
     c("integrated_survival_metric", "metric", "function")
   )
+  expect_identical(
+    class(fn7),
+    c("quantile_metric", "metric", "function")
+  )
 
   expect_identical(attr(fn1, "direction"), "maximize")
   expect_identical(attr(fn2, "direction"), "maximize")
@@ -31,6 +36,7 @@ test_that("can create metric functions", {
   expect_identical(attr(fn4, "direction"), "minimize")
   expect_identical(attr(fn5, "direction"), "minimize")
   expect_identical(attr(fn6, "direction"), "minimize")
+  expect_identical(attr(fn7, "direction"), "minimize")
 })
 
 test_that("`fn` is validated", {
diff --git a/tests/testthat/test-check_metric.R b/tests/testthat/test-check_metric.R
index ab04a7d6..060ee630 100644
--- a/tests/testthat/test-check_metric.R
+++ b/tests/testthat/test-check_metric.R
@@ -111,3 +111,19 @@ test_that("check_static_survival_metric() validates inputs", {
     )
   )
 })
+
+test_that("check_quantile_metric() validates inputs", {
+  example <- dplyr::tibble(
+    preds = hardhat::quantile_pred(rbind(1:4, 8:11), c(0.2, 0.4, 0.6, 0.8)),
+    truth = c(3.3, 7.1)
+  )
+
+  expect_snapshot(
+    error = TRUE,
+    check_quantile_metric(
+      truth = as.character(example$truth),
+      estimate = example$preds,
+      case_weights = 1:2
+    )
+  )
+})
diff --git a/tests/testthat/test-validation.R b/tests/testthat/test-validation.R
index 27ce7834..cb1be6f2 100644
--- a/tests/testthat/test-validation.R
+++ b/tests/testthat/test-validation.R
@@ -594,3 +594,57 @@ test_that("validate_case_weights errors as expected", {
     validate_case_weights(1:10, 11)
   )
 })
+
+test_that("validate_numeric_truth_quantile_estimate errors as expected", {
+  expect_no_error(
+    validate_numeric_truth_quantile_estimate(
+      c(3.3, 7.1),
+      hardhat::quantile_pred(rbind(1:4, 8:11), c(0.2, 0.4, 0.6, 0.8))
+    )
+  )
+
+  expect_no_error(
+    validate_numeric_truth_quantile_estimate(
+      3.3,
+      hardhat::quantile_pred(rbind(1:4), c(0.2, 0.4, 0.6, 0.8))
+    )
+  )
+
+  expect_no_error(
+    validate_numeric_truth_quantile_estimate(
+      3L,
+      hardhat::quantile_pred(rbind(1:4), c(0.2, 0.4, 0.6, 0.8))
+    )
+  )
+
+  expect_no_error(
+    validate_numeric_truth_quantile_estimate(
+      numeric(),
+      hardhat::quantile_pred(matrix(nrow = 0, ncol = 4), c(0.2, 0.4, 0.6, 0.8))
+    )
+  )
+
+  expect_snapshot(
+    error = TRUE,
+    validate_numeric_truth_quantile_estimate(
+      "4",
+      hardhat::quantile_pred(rbind(1:4), c(0.2, 0.4, 0.6, 0.8))
+    )
+  )
+
+  expect_snapshot(
+    error = TRUE,
+    validate_numeric_truth_quantile_estimate(
+      3.3,
+      3.4
+    )
+  )
+
+  expect_snapshot(
+    error = TRUE,
+    validate_numeric_truth_quantile_estimate(
+      c(3.3, 7.1, 1),
+      hardhat::quantile_pred(rbind(1:4, 8:11), c(0.2, 0.4, 0.6, 0.8))
+    )
+  )
+})

From 7f9bd1dbcec23f1bc0f93c1ccbfb5f1a525a0f31 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Wed, 17 Dec 2025 17:43:38 -0800
Subject: [PATCH 15/17] add weighted_interval_score.data.frame() method

---
 NAMESPACE                         |  1 +
 R/quant-weighted_internal_score.R | 60 ++++++++++++++++++++++---------
 man/weighted_interval_score.Rd    | 12 +++++++
 3 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 645afe8e..c547ef58 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -130,6 +130,7 @@ S3method(tidy,conf_mat)
 S3method(validate_truth_estimate_types,default)
 S3method(validate_truth_estimate_types,factor)
 S3method(validate_truth_estimate_types,numeric)
+S3method(weighted_interval_score,data.frame)
 export(accuracy)
 export(accuracy_vec)
 export(average_precision)
diff --git a/R/quant-weighted_internal_score.R b/R/quant-weighted_internal_score.R
index c622cc61..d0f04dbe 100644
--- a/R/quant-weighted_internal_score.R
+++ b/R/quant-weighted_internal_score.R
@@ -107,6 +107,33 @@ weighted_interval_score <- new_quantile_metric(
   direction = "minimize"
 )
 
+#' @rdname weighted_interval_score
+#' @export
+weighted_interval_score.data.frame <- function(
+  data,
+  truth,
+  estimate,
+  quantile_levels = NULL,
+  na_rm = TRUE,
+  quantile_estimate_nas = c("impute", "drop", "propagate"),
+  case_weights = NULL,
+  ...
+) {
+  quantile_metric_summarizer(
+    name = "weighted_interval_score",
+    fn = weighted_interval_score_vec,
+    data = data,
+    truth = !!enquo(truth),
+    estimate = !!enquo(estimate),
+    na_rm = na_rm,
+    case_weights = !!enquo(case_weights),
+    fn_options = list(
+      quantile_levels = quantile_levels,
+      quantile_estimate_nas = quantile_estimate_nas
+    )
+  )
+}
+
 #' @export
 #' @rdname weighted_interval_score
 weighted_interval_score_vec <- function(
@@ -119,8 +146,10 @@ weighted_interval_score_vec <- function(
   ...
 ) {
   check_quantile_metric(truth, estimate, case_weights)
+
   estimate_quantile_levels <- hardhat::extract_quantile_levels(estimate)
   quantile_estimate_nas <- rlang::arg_match(quantile_estimate_nas)
+
   if (!is.null(quantile_levels)) {
     hardhat::check_quantile_levels(quantile_levels)
     all_levels_estimated <- all(quantile_levels %in% estimate_quantile_levels)
@@ -171,25 +200,22 @@ wis_impl <- function(
   quantile_levels,
   rowwise_na_rm = TRUE
 ) {
-  as.vector(
-    mapply(
-      FUN = function(.x, .y) {
-        wis_one_quantile(.x, quantile_levels, .y, rowwise_na_rm)
-      },
-      vctrs::vec_chop(estimate),
-      truth
-    ),
-    "double"
+  res <- mapply(
+    FUN = function(.x, .y) {
+      wis_one_quantile(.x, quantile_levels, .y, rowwise_na_rm)
+    },
+    vctrs::vec_chop(estimate),
+    truth
   )
+
+  as.vector(res, "double")
 }
 
 wis_one_quantile <- function(values, quantile_levels, truth, na_rm) {
-  2 *
-    mean(
-      pmax(
-        quantile_levels * (truth - values),
-        (1 - quantile_levels) * (values - truth)
-      ),
-      na.rm = na_rm
-    )
+  res <- pmax(
+    quantile_levels * (truth - values),
+    (1 - quantile_levels) * (values - truth)
+  )
+
+  2 * mean(res, na.rm = na_rm)
 }
diff --git a/man/weighted_interval_score.Rd b/man/weighted_interval_score.Rd
index 09f3ad39..1cf548cf 100644
--- a/man/weighted_interval_score.Rd
+++ b/man/weighted_interval_score.Rd
@@ -2,11 +2,23 @@
 % Please edit documentation in R/quant-weighted_internal_score.R
 \name{weighted_interval_score}
 \alias{weighted_interval_score}
+\alias{weighted_interval_score.data.frame}
 \alias{weighted_interval_score_vec}
 \title{Compute weighted interval score}
 \usage{
 weighted_interval_score(data, ...)
 
+\method{weighted_interval_score}{data.frame}(
+  data,
+  truth,
+  estimate,
+  quantile_levels = NULL,
+  na_rm = TRUE,
+  quantile_estimate_nas = c("impute", "drop", "propagate"),
+  case_weights = NULL,
+  ...
+)
+
 weighted_interval_score_vec(
   truth,
   estimate,

From d798845923698efe96e1a37dae56af171ab72dfe Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Wed, 17 Dec 2025 18:15:07 -0800
Subject: [PATCH 16/17] add tests for weighted_interval_score

---
 .../_snaps/quant-weighted_internal_score.md   |  9 ++
 .../test-quant-weighted_internal_score.R      | 86 +++++++++++++++++++
 2 files changed, 95 insertions(+)
 create mode 100644 tests/testthat/_snaps/quant-weighted_internal_score.md
 create mode 100644 tests/testthat/test-quant-weighted_internal_score.R

diff --git a/tests/testthat/_snaps/quant-weighted_internal_score.md b/tests/testthat/_snaps/quant-weighted_internal_score.md
new file mode 100644
index 00000000..22df78b9
--- /dev/null
+++ b/tests/testthat/_snaps/quant-weighted_internal_score.md
@@ -0,0 +1,9 @@
+# Missing value behaviours works
+
+    Code
+      weighted_interval_score_vec(truth, preds_na, quantile_levels = 1:9 / 10,
+      quantile_estimate_nas = "drop")
+    Condition
+      Error in `weighted_interval_score_vec()`:
+      ! When `quantile_levels` is not a subset of those available in `estimate`, `quantile_estimate_nas` may not be `'drop'`.
+
diff --git a/tests/testthat/test-quant-weighted_internal_score.R b/tests/testthat/test-quant-weighted_internal_score.R
new file mode 100644
index 00000000..dae000e1
--- /dev/null
+++ b/tests/testthat/test-quant-weighted_internal_score.R
@@ -0,0 +1,86 @@
+test_that("weighted_interval_score_vec works", {
+  quantile_levels <- c(.2, .4, .6, .8)
+  pred1 <- 1:4
+  pred2 <- 8:11
+  example <- dplyr::tibble(
+    preds = hardhat::quantile_pred(rbind(pred1, pred2), quantile_levels),
+    truth = c(3.3, 7.1)
+  )
+
+  expect_identical(
+    weighted_interval_score(example, truth = truth, estimate = preds)[[
+      ".estimate"
+    ]],
+    weighted_interval_score_vec(example$truth, example$preds)
+  )
+})
+
+test_that("quantile_levels argument works", {
+  quantile_levels <- c(.2, .4, .6, .8)
+  pred1 <- 1:4
+  pred2 <- 8:11
+  example <- dplyr::tibble(
+    preds = hardhat::quantile_pred(rbind(pred1, pred2), quantile_levels),
+    truth = c(3.3, 7.1)
+  )
+
+  levels_set <- weighted_interval_score(
+    example,
+    truth = truth,
+    estimate = preds,
+    quantile_levels = c(.25, .5, .75)
+  )
+
+  levels_default <- weighted_interval_score(
+    example,
+    truth = truth,
+    estimate = preds
+  )
+
+  expect_true(levels_set$.estimate != levels_default$.estimate)
+})
+
+test_that("Missing value behaviours works", {
+  pred1 <- 1:4
+  preds_na <- hardhat::quantile_pred(rbind(pred1, c(1, 2, NA, 4)), 1:4 / 5)
+  truth <- c(2.5, 2.5)
+
+  expect_snapshot(
+    error = TRUE,
+    weighted_interval_score_vec(
+      truth,
+      preds_na,
+      quantile_levels = 1:9 / 10,
+      quantile_estimate_nas = "drop"
+    )
+  )
+
+  expect_identical(
+    weighted_interval_score_vec(
+      truth,
+      preds_na,
+      quantile_levels = c(2, 3) / 5,
+      quantile_estimate_nas = "drop"
+    ),
+    0.4
+  )
+
+  expect_identical(
+    weighted_interval_score_vec(
+      truth,
+      preds_na,
+      na_rm = TRUE,
+      quantile_estimate_nas = "propagate"
+    ),
+    0.5
+  )
+
+  expect_identical(
+    weighted_interval_score_vec(
+      truth,
+      preds_na,
+      quantile_estimate_nas = "propagate"
+    ),
+    NA_real_
+  )
+})

From b1fd61f5753aba9eea452f6a69f48f592e6657e5 Mon Sep 17 00:00:00 2001
From: Emil Hvitfeldt <emilhhvitfeldt@gmail.com>
Date: Wed, 17 Dec 2025 18:20:16 -0800
Subject: [PATCH 17/17] fix file name typo

---
 DESCRIPTION                                                     | 2 +-
 ...eighted_internal_score.R => quant-weighted_interval_score.R} | 0
 man/weighted_interval_score.Rd                                  | 2 +-
 ...ghted_internal_score.md => quant-weighted_interval_score.md} | 0
 ...ed_internal_score.R => test-quant-weighted_interval_score.R} | 0
 5 files changed, 2 insertions(+), 2 deletions(-)
 rename R/{quant-weighted_internal_score.R => quant-weighted_interval_score.R} (100%)
 rename tests/testthat/_snaps/{quant-weighted_internal_score.md => quant-weighted_interval_score.md} (100%)
 rename tests/testthat/{test-quant-weighted_internal_score.R => test-quant-weighted_interval_score.R} (100%)

diff --git a/DESCRIPTION b/DESCRIPTION
index ff661dfd..bbafe663 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -116,7 +116,7 @@ Collate:
     'prob-roc_aunp.R'
     'prob-roc_aunu.R'
     'prob-roc_curve.R'
-    'quant-weighted_internal_score.R'
+    'quant-weighted_interval_score.R'
     'reexports.R'
     'surv-brier_survival.R'
     'surv-brier_survival_integrated.R'
diff --git a/R/quant-weighted_internal_score.R b/R/quant-weighted_interval_score.R
similarity index 100%
rename from R/quant-weighted_internal_score.R
rename to R/quant-weighted_interval_score.R
diff --git a/man/weighted_interval_score.Rd b/man/weighted_interval_score.Rd
index 1cf548cf..3023f22a 100644
--- a/man/weighted_interval_score.Rd
+++ b/man/weighted_interval_score.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/quant-weighted_internal_score.R
+% Please edit documentation in R/quant-weighted_interval_score.R
 \name{weighted_interval_score}
 \alias{weighted_interval_score}
 \alias{weighted_interval_score.data.frame}
diff --git a/tests/testthat/_snaps/quant-weighted_internal_score.md b/tests/testthat/_snaps/quant-weighted_interval_score.md
similarity index 100%
rename from tests/testthat/_snaps/quant-weighted_internal_score.md
rename to tests/testthat/_snaps/quant-weighted_interval_score.md
diff --git a/tests/testthat/test-quant-weighted_internal_score.R b/tests/testthat/test-quant-weighted_interval_score.R
similarity index 100%
rename from tests/testthat/test-quant-weighted_internal_score.R
rename to tests/testthat/test-quant-weighted_interval_score.R