diff --git a/R/get_nhgis_crosswalk.R b/R/get_nhgis_crosswalk.R index 59680f7..979e660 100644 --- a/R/get_nhgis_crosswalk.R +++ b/R/get_nhgis_crosswalk.R @@ -428,10 +428,13 @@ is_nhgis_crosswalk_available <- function( source_geography, target_geography, source_year, - target_year) { + target_year, + available_crosswalks = NULL) { - # Get the list of available crosswalks - available_crosswalks <- list_nhgis_crosswalks() + # Get the list of available crosswalks if not provided + if (is.null(available_crosswalks)) { + available_crosswalks <- list_nhgis_crosswalks() + } # Standardize inputs for matching source_year_chr <- as.character(source_year) diff --git a/R/plan_crosswalk_chain.R b/R/plan_crosswalk_chain.R index fae0366..5c4535a 100644 --- a/R/plan_crosswalk_chain.R +++ b/R/plan_crosswalk_chain.R @@ -81,19 +81,84 @@ plan_crosswalk_chain <- function( return(result) } - # Case 2: Same geography, different years - single NHGIS crosswalk + # Cache available crosswalks for reuse across checks + available_crosswalks <- list_nhgis_crosswalks() + + # Case 2: Same geography, different years - temporal crosswalk(s) if (!geography_changes && year_changes) { - result$steps <- tibble::tibble( - step_number = 1L, - source_geography = source_geography, - source_year = source_year_chr, - target_geography = target_geography, - target_year = target_year_chr, - crosswalk_source = determine_temporal_source(source_year_chr, target_year_chr), - description = stringr::str_c( - source_year_chr, " ", source_geog_std, " -> ", - target_year_chr, " ", target_geog_std, " (inter-temporal)")) - result$composition_note <- "Single crosswalk; use allocation_factor_source_to_target directly." + # Check for special sources first (e.g. CTData 2020<->2022) + temporal_source <- determine_temporal_source(source_year_chr, target_year_chr) + if (temporal_source != "nhgis") { + result$steps <- tibble::tibble( + step_number = 1L, + source_geography = source_geography, + source_year = source_year_chr, + target_geography = target_geography, + target_year = target_year_chr, + crosswalk_source = temporal_source, + description = stringr::str_c( + source_year_chr, " ", source_geog_std, " -> ", + target_year_chr, " ", target_geog_std, " (inter-temporal)")) + result$composition_note <- "Single crosswalk; use allocation_factor_source_to_target directly." + return(result) + } + + # Find temporal path via NHGIS same-geography crosswalks + temporal_path <- find_temporal_path( + source_geog_std, source_year_chr, target_year_chr, available_crosswalks) + + if (is.null(temporal_path)) { + result$error <- stringr::str_c( + "No temporal crosswalk path found from ", source_year_chr, " to ", + target_year_chr, " for geography '", source_geography, + "'. NHGIS may not provide same-geography crosswalks for this ", + "geography or year combination.") + return(result) + } + + n_hops <- length(temporal_path) + + if (n_hops == 1L) { + # Single temporal hop + result$steps <- tibble::tibble( + step_number = 1L, + source_geography = source_geography, + source_year = source_year_chr, + target_geography = target_geography, + target_year = target_year_chr, + crosswalk_source = "nhgis", + description = stringr::str_c( + source_year_chr, " ", source_geog_std, " -> ", + target_year_chr, " ", target_geog_std, " (inter-temporal)")) + result$composition_note <- "Single crosswalk; use allocation_factor_source_to_target directly." + } else { + # Multi-hop temporal chain + result$is_multi_step <- TRUE + result$intermediate_geography <- source_geography[1] + result$intermediate_year <- purrr::map_chr( + temporal_path[-n_hops], ~ .x$target_year) + + result$steps <- tibble::tibble( + step_number = seq_len(n_hops), + source_geography = rep(source_geography[1], n_hops), + source_year = purrr::map_chr(temporal_path, ~ .x$source_year), + target_geography = rep(target_geography[1], n_hops), + target_year = purrr::map_chr(temporal_path, ~ .x$target_year), + crosswalk_source = rep("nhgis", n_hops), + description = purrr::map_chr(seq_len(n_hops), ~ stringr::str_c( + temporal_path[[.x]]$source_year, " ", source_geog_std, " -> ", + temporal_path[[.x]]$target_year, " ", source_geog_std, + " (inter-temporal via nhgis)"))) + + result$composition_note <- stringr::str_c( + "Compose crosswalks by joining on intermediate geography (", + source_geog_std, " ", + paste(result$intermediate_year, collapse = ", "), + ") and multiplying allocation factors: final_allocation = ", + paste( + purrr::map_chr(seq_len(n_hops), ~ stringr::str_c("step", .x, "_allocation")), + collapse = " * ")) + } return(result) } @@ -126,7 +191,8 @@ plan_crosswalk_chain <- function( source_geography = source_geography, target_geography = target_geography, source_year = source_year, - target_year = target_year) + target_year = target_year, + available_crosswalks = available_crosswalks) if (nhgis_direct_available) { # Single-step NHGIS crosswalk available @@ -162,16 +228,6 @@ plan_crosswalk_chain <- function( return(result) } - # Determine the intermediate point (year-first approach) - # Step 1: source_geog(source_year) -> source_geog(target_year) via NHGIS - # Step 2: source_geog(target_year) -> target_geog(target_year) via Geocorr - - result$intermediate_geography <- source_geography[1] - result$intermediate_year <- target_year_chr[1] - - # Determine temporal crosswalk source for step 1 - step1_source <- determine_temporal_source(source_year_chr[1], target_year_chr[1]) - # Use first element to ensure scalar values src_geog <- source_geography[1] tgt_geog <- target_geography[1] @@ -180,25 +236,66 @@ plan_crosswalk_chain <- function( src_std <- source_geog_std[1] tgt_std <- target_geog_std[1] - result$steps <- tibble::tibble( - step_number = c(1L, 2L), - source_geography = c(src_geog, src_geog), - source_year = c(src_year, tgt_year), - target_geography = c(src_geog, tgt_geog), - target_year = c(tgt_year, tgt_year), - crosswalk_source = c(step1_source, "geocorr"), - description = c( - stringr::str_c( - src_year, " ", src_std, " -> ", - tgt_year, " ", src_std, " (inter-temporal via ", step1_source, ")"), - stringr::str_c( - tgt_year, " ", src_std, " -> ", - tgt_year, " ", tgt_std, " (inter-geography via Geocorr)"))) + # Find temporal path at source geography from source_year to target_year + temporal_path <- find_temporal_path( + src_std, src_year, tgt_year, available_crosswalks) + + if (is.null(temporal_path)) { + result$error <- stringr::str_c( + "Multi-step crosswalk not possible: no temporal crosswalk path found from ", + src_year, " to ", tgt_year, " for geography '", source_geography, + "'. NHGIS may not provide same-geography crosswalks for this ", + "geography or year combination.") + return(result) + } + + n_temporal <- length(temporal_path) + n_total <- n_temporal + 1L # temporal hops + 1 Geocorr geography step + + # Build intermediate year vector (all bridge points between steps) + # For temporal hops: the target year of each hop except the last becomes a bridge + + # The target year of the last temporal hop is also a bridge to the Geocorr step + intermediate_years <- purrr::map_chr(temporal_path, ~ .x$target_year) + # The last intermediate year (= tgt_year) is the bridge to the Geocorr step + result$intermediate_geography <- src_geog + result$intermediate_year <- intermediate_years + + # Build temporal step rows + temporal_steps <- tibble::tibble( + step_number = seq_len(n_temporal), + source_geography = rep(src_geog, n_temporal), + source_year = purrr::map_chr(temporal_path, ~ .x$source_year), + target_geography = rep(src_geog, n_temporal), + target_year = purrr::map_chr(temporal_path, ~ .x$target_year), + crosswalk_source = rep("nhgis", n_temporal), + description = purrr::map_chr(seq_len(n_temporal), ~ stringr::str_c( + temporal_path[[.x]]$source_year, " ", src_std, " -> ", + temporal_path[[.x]]$target_year, " ", src_std, + " (inter-temporal via nhgis)"))) + + # Geocorr geography step (final step) + geocorr_step <- tibble::tibble( + step_number = n_total, + source_geography = src_geog, + source_year = tgt_year, + target_geography = tgt_geog, + target_year = tgt_year, + crosswalk_source = "geocorr", + description = stringr::str_c( + tgt_year, " ", src_std, " -> ", + tgt_year, " ", tgt_std, " (inter-geography via Geocorr)")) + + result$steps <- dplyr::bind_rows(temporal_steps, geocorr_step) result$composition_note <- stringr::str_c( "Compose crosswalks by joining on intermediate geography (", - src_std, " ", tgt_year, ") and multiplying allocation factors: ", - "final_allocation = step1_allocation * step2_allocation") + src_std, " ", + paste(intermediate_years, collapse = ", "), + ") and multiplying allocation factors: final_allocation = ", + paste( + purrr::map_chr(seq_len(n_total), ~ stringr::str_c("step", .x, "_allocation")), + collapse = " * ")) return(result) } @@ -293,6 +390,83 @@ format_chain_plan_message <- function(plan) { stringr::str_c( "Multi-step crosswalk required:\n", paste(step_lines, collapse = "\n"), "\n", - "\nIntermediate: ", plan$intermediate_geography, " (", plan$intermediate_year, ")\n", + "\nIntermediate: ", plan$intermediate_geography, + " (", paste(plan$intermediate_year, collapse = ", "), ")\n", "\n", plan$composition_note) } + + +#' Find Temporal Path Between Years via NHGIS Same-Geography Crosswalks +#' +#' Uses BFS to find the shortest sequence of same-geography NHGIS crosswalks +#' bridging a temporal span. The graph is built from `list_nhgis_crosswalks()` +#' so future NHGIS additions are picked up automatically. +#' +#' @param geography_std Character. Standardized geography name (e.g. "tract", +#' "block", "block_group") matching `list_nhgis_crosswalks()` output. +#' @param source_year Character or numeric. Source year. +#' @param target_year Character or numeric. Target year. +#' @param available_crosswalks Tibble or NULL. Pre-fetched output of +#' `list_nhgis_crosswalks()` to avoid redundant calls. +#' +#' @return A list of `list(source_year, target_year)` hops representing the +#' shortest path, or `NULL` if no path exists. +#' @keywords internal +#' @noRd +find_temporal_path <- function(geography_std, + source_year, + target_year, + available_crosswalks = NULL) { + if (is.null(available_crosswalks)) { + available_crosswalks <- list_nhgis_crosswalks() + } + + source_year_chr <- as.character(source_year) + target_year_chr <- as.character(target_year) + + # Filter to same-geography temporal edges + same_geog <- available_crosswalks |> + dplyr::filter( + source_geography == geography_std, + target_geography == geography_std) + + if (nrow(same_geog) == 0) return(NULL) + + # Build adjacency list: year -> list of reachable years + adj <- list() + for (i in seq_len(nrow(same_geog))) { + from <- same_geog$source_year[i] + to <- same_geog$target_year[i] + adj[[from]] <- unique(c(adj[[from]], to)) + } + + # BFS from source_year to target_year + queue <- list(list(year = source_year_chr, path = list())) + visited <- character() + + while (length(queue) > 0) { + current <- queue[[1]] + queue <- queue[-1] + + if (current$year == target_year_chr) { + return(current$path) + } + + if (current$year %in% visited) next + visited <- c(visited, current$year) + + neighbors <- adj[[current$year]] + if (is.null(neighbors)) next + + for (neighbor in neighbors) { + if (!neighbor %in% visited) { + new_path <- c( + current$path, + list(list(source_year = current$year, target_year = neighbor))) + queue <- c(queue, list(list(year = neighbor, path = new_path))) + } + } + } + + return(NULL) +} diff --git a/README.md b/README.md index 203b8e3..6505ad4 100644 --- a/README.md +++ b/README.md @@ -188,12 +188,13 @@ The list contains three elements: ### Multi-Step Crosswalks For some source year/geography -\> target year/geography combinations, -there is not a single direct crosswalk. In such cases, we need two -crosswalks. The package automatically plans and fetches the required -crosswalks: +there is not a single direct crosswalk. The package automatically plans +and fetches the required chain of crosswalks, using a year-first +strategy: -1. **Step 1 (NHGIS)**: Change year, keep geography constant -2. **Step 2 (Geocorr)**: Change geography at target year +1. **NHGIS step(s)**: Change year while keeping geography constant + (multiple hops if the temporal span requires it, e.g. 1990→2010→2020) +2. **Geocorr step**: Change geography at the target year ``` r result <- get_crosswalk( @@ -207,6 +208,12 @@ result <- get_crosswalk( # Two crosswalks are returned # Step 1: 2010 tracts -> 2020 tracts (NHGIS) # Step 2: 2020 tracts -> 2020 ZCTAs (Geocorr) + +# Longer chains are produced when needed, e.g. +# 2000 tracts -> 2020 ZCTAs produces three steps: +# Step 1: 2000 tracts -> 2010 tracts (NHGIS) +# Step 2: 2010 tracts -> 2020 tracts (NHGIS) +# Step 3: 2020 tracts -> 2020 ZCTAs (Geocorr) ``` ### Crosswalk Structure diff --git a/tests/testthat/test-plan_crosswalk_chain.R b/tests/testthat/test-plan_crosswalk_chain.R index 3ca85b5..9b06d68 100644 --- a/tests/testthat/test-plan_crosswalk_chain.R +++ b/tests/testthat/test-plan_crosswalk_chain.R @@ -353,3 +353,240 @@ test_that("plan_crosswalk_chain describes GeoCorr 2022 when no years specified", expect_false(plan$is_multi_step) expect_true(stringr::str_detect(plan$steps$description[1], "Geocorr 2022")) }) + +# ============================================================================== +# find_temporal_path() unit tests +# ============================================================================== + +test_that("find_temporal_path finds direct single-hop path (2010->2020 tract)", { + path <- crosswalk:::find_temporal_path("tract", "2010", "2020") + + expect_equal(length(path), 1) + expect_equal(path[[1]]$source_year, "2010") + expect_equal(path[[1]]$target_year, "2020") +}) + +test_that("find_temporal_path finds multi-hop path (1990->2020 tract)", { + path <- crosswalk:::find_temporal_path("tract", "1990", "2020") + + expect_equal(length(path), 2) + expect_equal(path[[1]]$source_year, "1990") + expect_equal(path[[1]]$target_year, "2010") + expect_equal(path[[2]]$source_year, "2010") + expect_equal(path[[2]]$target_year, "2020") +}) + +test_that("find_temporal_path finds multi-hop path (2000->2020 tract)", { + path <- crosswalk:::find_temporal_path("tract", "2000", "2020") + + expect_equal(length(path), 2) + expect_equal(path[[1]]$source_year, "2000") + expect_equal(path[[1]]$target_year, "2010") + expect_equal(path[[2]]$source_year, "2010") + expect_equal(path[[2]]$target_year, "2020") +}) + +test_that("find_temporal_path finds multi-hop path (1990->2020 block)", { + path <- crosswalk:::find_temporal_path("block", "1990", "2020") + + expect_equal(length(path), 2) + expect_equal(path[[1]]$source_year, "1990") + expect_equal(path[[1]]$target_year, "2010") + expect_equal(path[[2]]$source_year, "2010") + expect_equal(path[[2]]$target_year, "2020") +}) + +test_that("find_temporal_path returns NULL for impossible path (2020->1990 tract)", { + path <- crosswalk:::find_temporal_path("tract", "2020", "1990") + + expect_null(path) +}) + +test_that("find_temporal_path returns NULL for unsupported geography (zcta)", { + path <- crosswalk:::find_temporal_path("zcta", "2010", "2020") + + expect_null(path) +}) + +test_that("find_temporal_path accepts pre-fetched available_crosswalks", { + available <- crosswalk:::list_nhgis_crosswalks() + path <- crosswalk:::find_temporal_path("tract", "2010", "2020", available) + + expect_equal(length(path), 1) + expect_equal(path[[1]]$source_year, "2010") + expect_equal(path[[1]]$target_year, "2020") +}) + +# ============================================================================== +# Multi-hop year-only plan tests +# ============================================================================== + +test_that("plan_crosswalk_chain produces 2-step plan for 1990->2020 tract", { + plan <- plan_crosswalk_chain( + source_geography = "tract", + target_geography = "tract", + source_year = 1990, + target_year = 2020) + + expect_true(plan$is_multi_step) + expect_equal(nrow(plan$steps), 2) + expect_equal(plan$steps$crosswalk_source[1], "nhgis") + expect_equal(plan$steps$crosswalk_source[2], "nhgis") + + # Step 1: 1990 tract -> 2010 tract + expect_equal(plan$steps$source_year[1], "1990") + expect_equal(plan$steps$target_year[1], "2010") + + # Step 2: 2010 tract -> 2020 tract + expect_equal(plan$steps$source_year[2], "2010") + expect_equal(plan$steps$target_year[2], "2020") + + expect_equal(plan$intermediate_geography, "tract") + expect_equal(plan$intermediate_year, "2010") +}) + +test_that("plan_crosswalk_chain produces 2-step plan for 2000->2020 tract", { + plan <- plan_crosswalk_chain( + source_geography = "tract", + target_geography = "tract", + source_year = 2000, + target_year = 2020) + + expect_true(plan$is_multi_step) + expect_equal(nrow(plan$steps), 2) + expect_equal(plan$steps$source_year[1], "2000") + expect_equal(plan$steps$target_year[1], "2010") + expect_equal(plan$steps$source_year[2], "2010") + expect_equal(plan$steps$target_year[2], "2020") +}) + +# ============================================================================== +# 3-step combined plan tests (multi-hop temporal + geography change) +# ============================================================================== + +test_that("plan_crosswalk_chain produces 3-step plan for 2000 tract -> 2020 ZCTA", { + plan <- plan_crosswalk_chain( + source_geography = "tract", + target_geography = "zcta", + source_year = 2000, + target_year = 2020) + + expect_true(plan$is_multi_step) + expect_equal(nrow(plan$steps), 3) + + # Step 1: 2000 tract -> 2010 tract (NHGIS) + expect_equal(plan$steps$crosswalk_source[1], "nhgis") + expect_equal(plan$steps$source_year[1], "2000") + expect_equal(plan$steps$target_year[1], "2010") + expect_equal(plan$steps$source_geography[1], "tract") + expect_equal(plan$steps$target_geography[1], "tract") + + # Step 2: 2010 tract -> 2020 tract (NHGIS) + expect_equal(plan$steps$crosswalk_source[2], "nhgis") + expect_equal(plan$steps$source_year[2], "2010") + expect_equal(plan$steps$target_year[2], "2020") + expect_equal(plan$steps$source_geography[2], "tract") + expect_equal(plan$steps$target_geography[2], "tract") + + # Step 3: 2020 tract -> 2020 ZCTA (Geocorr) + expect_equal(plan$steps$crosswalk_source[3], "geocorr") + expect_equal(plan$steps$source_year[3], "2020") + expect_equal(plan$steps$target_year[3], "2020") + expect_equal(plan$steps$source_geography[3], "tract") + expect_equal(plan$steps$target_geography[3], "zcta") + + expect_equal(plan$intermediate_geography, "tract") + expect_equal(plan$intermediate_year, c("2010", "2020")) +}) + +test_that("plan_crosswalk_chain produces 3-step plan for 1990 tract -> 2020 PUMA", { + plan <- plan_crosswalk_chain( + source_geography = "tract", + target_geography = "puma", + source_year = 1990, + target_year = 2020) + + expect_true(plan$is_multi_step) + expect_equal(nrow(plan$steps), 3) + expect_equal(plan$steps$crosswalk_source[1], "nhgis") + expect_equal(plan$steps$crosswalk_source[2], "nhgis") + expect_equal(plan$steps$crosswalk_source[3], "geocorr") +}) + +# ============================================================================== +# Error cases for impossible temporal paths +# ============================================================================== + +test_that("plan_crosswalk_chain errors for impossible temporal path (2020->1990 tract)", { + plan <- plan_crosswalk_chain( + source_geography = "tract", + target_geography = "tract", + source_year = 2020, + target_year = 1990) + + expect_false(is.null(plan$error)) + expect_true(stringr::str_detect(plan$error, "No temporal crosswalk path")) +}) + +test_that("plan_crosswalk_chain errors for impossible temporal path in combined plan", { + plan <- plan_crosswalk_chain( + source_geography = "tract", + target_geography = "zcta", + source_year = 2020, + target_year = 1990) + + expect_false(is.null(plan$error)) + expect_true(stringr::str_detect(plan$error, "no temporal crosswalk path")) +}) + +# ============================================================================== +# Format message tests for multi-hop plans +# ============================================================================== + +test_that("format_chain_plan_message formats 3-step plan correctly", { + plan <- plan_crosswalk_chain( + source_geography = "tract", + target_geography = "zcta", + source_year = 2000, + target_year = 2020) + + message <- crosswalk:::format_chain_plan_message(plan) + + expect_type(message, "character") + expect_true(stringr::str_detect(message, "Multi-step")) + expect_true(stringr::str_detect(message, "Step 1")) + expect_true(stringr::str_detect(message, "Step 2")) + expect_true(stringr::str_detect(message, "Step 3")) + expect_true(stringr::str_detect(message, "2010, 2020")) +}) + +test_that("format_chain_plan_message formats 2-step year-only plan correctly", { + plan <- plan_crosswalk_chain( + source_geography = "tract", + target_geography = "tract", + source_year = 1990, + target_year = 2020) + + message <- crosswalk:::format_chain_plan_message(plan) + + expect_type(message, "character") + expect_true(stringr::str_detect(message, "Multi-step")) + expect_true(stringr::str_detect(message, "Step 1")) + expect_true(stringr::str_detect(message, "Step 2")) +}) + +# ============================================================================== +# Composition note tests for multi-hop plans +# ============================================================================== + +test_that("plan_crosswalk_chain composition note covers all steps in 3-step plan", { + plan <- plan_crosswalk_chain( + source_geography = "tract", + target_geography = "zcta", + source_year = 2000, + target_year = 2020) + + expect_true(stringr::str_detect(plan$composition_note, "step1_allocation")) + expect_true(stringr::str_detect(plan$composition_note, "step2_allocation")) + expect_true(stringr::str_detect(plan$composition_note, "step3_allocation")) +})