From af938e9ea0d8c17742ef05514bd8a48f972671da Mon Sep 17 00:00:00 2001 From: Antoine Fabri Date: Fri, 22 Oct 2021 18:30:24 +0200 Subject: [PATCH 01/10] Extract big step into get_matches* functions --- R/fuzzy_join.R | 196 +++--------------------------------------------- R/get_matches.R | 190 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 200 insertions(+), 186 deletions(-) create mode 100644 R/get_matches.R diff --git a/R/fuzzy_join.R b/R/fuzzy_join.R index 62c159e..314dc7e 100644 --- a/R/fuzzy_join.R +++ b/R/fuzzy_join.R @@ -60,195 +60,19 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL, mode <- match.arg(mode, c("inner", "left", "right", "full", "semi", "anti")) - non_nulls <- (!is.null(multi_match_fun)) + - (!is.null(match_fun)) + - (!is.null(index_match_fun)) - if (sum(non_nulls) != 1) { + match_fun_type <- c("match_fun", "multi_match_fun", "index_match_fun")[ + c(!is.null(match_fun), !is.null(multi_match_fun), !is.null(index_match_fun)) + ] + if (length(match_fun_type) > 1) { stop("Must give exactly one of match_fun, multi_match_fun, and index_match_fun") } - if (!is.null(match_fun)) { - by <- common_by(by, x, y) - - # Support formula notation for functions - if (is.list(match_fun)) { - match_fun <- purrr::map(match_fun, purrr::as_mapper) - } else { - match_fun <- purrr::as_mapper(match_fun) - } - - if (length(match_fun) == 1) { - match_fun <- rep(c(match_fun), length(by$x)) - } - if (length(match_fun) != length(by$x)) { - stop("Length of match_fun not equal to columns specified in 'by'.", call. = FALSE) - } - - matches <- dplyr::bind_rows(lapply(seq_along(by$x), function(i) { - col_x <- x[[by$x[i]]] - col_y <- y[[by$y[i]]] - - indices_x <- tibble::tibble( - col = col_x, - indices = seq_along(col_x) - ) %>% - dplyr::group_by(col) %>% - tidyr::nest() %>% - dplyr::mutate(indices = purrr::map(data, "indices")) - - indices_y <- tibble::tibble( - col = col_y, - indices = seq_along(col_y) - ) %>% - dplyr::group_by(col) %>% - tidyr::nest() %>% - dplyr::mutate(indices = purrr::map(data, "indices")) - - u_x <- indices_x$col - u_y <- indices_y$col - - if (!is.null(names(match_fun))) { - # match_fun is a named list, use the names in x - mf <- match_fun[[by$x[[i]]]] - } else { - mf <- match_fun[[i]] - } - - extra_cols <- NULL - - n_x <- length(u_x) - n_y <- length(u_y) - m <- mf(rep(u_x, n_y), rep(u_y, each = n_x), ...) - - if (is.data.frame(m)) { - if (ncol(m) > 1) { - # first column is logical, others are included as distance columns - extra_cols <- m[, -1, drop = FALSE] - } - m <- m[[1]] - } - - # return as a data frame of x and y indices that match - w <- which(m) - 1 - - if (length(w) == 0) { - # there are no matches - ret <- tibble::tibble(i = numeric(0), x = numeric(0), y = numeric(0)) - return(ret) - } - - x_indices_l <- indices_x$indices[w %% n_x + 1] - y_indices_l <- indices_y$indices[w %/% n_x + 1] - - xls <- sapply(x_indices_l, length) - yls <- sapply(y_indices_l, length) - - x_rep <- unlist(purrr::map2(x_indices_l, yls, function(x, y) rep(x, each = y))) - y_rep <- unlist(purrr::map2(y_indices_l, xls, function(y, x) rep(y, x))) - - ret <- tibble::tibble(i = i, x = x_rep, y = y_rep) - - if (!is.null(extra_cols)) { - extra_indices <- rep(w, xls * yls) - extra_cols_rep <- extra_cols[extra_indices + 1, , drop = FALSE] - ret <- dplyr::bind_cols(ret, extra_cols_rep) - } - - ret - })) - - if (length(by$x) > 1) { - # only take cases where all pairs have matches - accept <- matches %>% - dplyr::count(x, y) %>% - dplyr::ungroup() %>% - dplyr::filter(n == length(by$x)) - - matches <- matches %>% - dplyr::semi_join(accept, by = c("x", "y")) - - if (ncol(matches) > 3) { - # include one for each - matches <- matches %>% - dplyr::semi_join(accept, by = c("x", "y")) %>% - dplyr::mutate(name = by$x[i]) %>% - dplyr::select(-i) %>% - tidyr::gather(key, value, -x, -y, -name) %>% - tidyr::unite(newname, name, key, sep = ".") %>% - tidyr::spread(newname, value) - } else { - matches <- dplyr::distinct(matches, x, y) - } - } - } else if (!is.null(multi_match_fun)) { - multi_match_fun <- purrr::as_mapper(multi_match_fun) - - # use multiple matches - by <- common_by(multi_by, x, y) - - number_x_rows <- nrow(x) - number_y_rows <- nrow(y) - - indices_x <- x %>% - dplyr::select_at(by$x) %>% - dplyr::mutate(indices = seq_len(number_x_rows)) %>% - dplyr::group_by_at(dplyr::vars(-dplyr::one_of("indices"))) %>% - tidyr::nest() %>% - dplyr::mutate(indices = purrr::map(data, "indices")) - indices_y <- y %>% - dplyr::select_at(by$y) %>% - dplyr::mutate(indices = seq_len(number_y_rows)) %>% - dplyr::group_by_at(dplyr::vars(-dplyr::one_of("indices"))) %>% - tidyr::nest() %>% - dplyr::mutate(indices = purrr::map(data, "indices")) - - ux <- as.matrix(indices_x[by$x]) - uy <- as.matrix(indices_y[by$y]) - - pairs <- matrix(NA, nrow(ux), nrow(uy)) - ix <- row(pairs) - iy <- col(pairs) - ux_input <- ux[ix, ] - uy_input <- uy[iy, ] - - m <- multi_match_fun(ux_input, uy_input) - - extra_cols <- NULL - if (is.data.frame(m)) { - if (ncol(m) > 1) { - extra_cols <- m[, -1, drop = FALSE] - } - m <- m[[1]] - } - - if (sum(m) == 0) { - # there are no matches - matches <- tibble::tibble(x = numeric(0), y = numeric(0)) - } else { - x_indices_l <- indices_x$indices[ix[m]] - y_indices_l <- indices_y$indices[iy[m]] - xls <- purrr::map_dbl(x_indices_l, length) - yls <- purrr::map_dbl(y_indices_l, length) - x_rep <- unlist(purrr::map2(x_indices_l, yls, function(x, y) rep(x, each = y))) - y_rep <- unlist(purrr::map2(y_indices_l, xls, function(y, x) rep(y, x))) - - matches <- tibble::tibble(x = x_rep, y = y_rep) - if (!is.null(extra_cols)) { - extra_indices <- rep(which(m), xls * yls) - extra_cols_rep <- extra_cols[extra_indices, , drop = FALSE] - matches <- dplyr::bind_cols(matches, extra_cols_rep) - } - } - } else { - # raw index-index function - index_match_fun <- purrr::as_mapper(index_match_fun) - by <- common_by(multi_by, x, y) - - d1 <- x[, by$x, drop = FALSE] - d2 <- y[, by$y, drop = FALSE] - - matches <- index_match_fun(d1, d2) - } + matches <- switch( + match_fun_type, + match_fun = get_matches(x, y, by, match_fun, ...), + multi_match_fun = get_matches_multi(x, y, by, multi_match_fun, multi_by), + index_match_fun = get_matches_index(x, y, by, index_match_fun, multi_by) + ) matches$i <- NULL if (mode == "semi") { diff --git a/R/get_matches.R b/R/get_matches.R new file mode 100644 index 0000000..47aeb17 --- /dev/null +++ b/R/get_matches.R @@ -0,0 +1,190 @@ +get_matches <- function(x, y, by, match_fun, ...) { + by <- common_by(by, x, y) + + # Support formula notation for functions + if (is.list(match_fun)) { + match_fun <- purrr::map(match_fun, purrr::as_mapper) + } else { + match_fun <- purrr::as_mapper(match_fun) + } + + if (length(match_fun) == 1) { + match_fun <- rep(c(match_fun), length(by$x)) + } + if (length(match_fun) != length(by$x)) { + stop("Length of match_fun not equal to columns specified in 'by'.", call. = FALSE) + } + + matches <- dplyr::bind_rows(lapply(seq_along(by$x), function(i) { + col_x <- x[[by$x[i]]] + col_y <- y[[by$y[i]]] + + indices_x <- tibble::tibble( + col = col_x, + indices = seq_along(col_x) + ) %>% + dplyr::group_by(col) %>% + tidyr::nest() %>% + dplyr::mutate(indices = purrr::map(data, "indices")) + + indices_y <- tibble::tibble( + col = col_y, + indices = seq_along(col_y) + ) %>% + dplyr::group_by(col) %>% + tidyr::nest() %>% + dplyr::mutate(indices = purrr::map(data, "indices")) + + u_x <- indices_x$col + u_y <- indices_y$col + + if (!is.null(names(match_fun))) { + # match_fun is a named list, use the names in x + mf <- match_fun[[by$x[[i]]]] + } else { + mf <- match_fun[[i]] + } + + extra_cols <- NULL + + n_x <- length(u_x) + n_y <- length(u_y) + m <- mf(rep(u_x, n_y), rep(u_y, each = n_x), ...) + + if (is.data.frame(m)) { + if (ncol(m) > 1) { + # first column is logical, others are included as distance columns + extra_cols <- m[, -1, drop = FALSE] + } + m <- m[[1]] + } + + # return as a data frame of x and y indices that match + w <- which(m) - 1 + + if (length(w) == 0) { + # there are no matches + ret <- tibble::tibble(i = numeric(0), x = numeric(0), y = numeric(0)) + return(ret) + } + + x_indices_l <- indices_x$indices[w %% n_x + 1] + y_indices_l <- indices_y$indices[w %/% n_x + 1] + + xls <- sapply(x_indices_l, length) + yls <- sapply(y_indices_l, length) + + x_rep <- unlist(purrr::map2(x_indices_l, yls, function(x, y) rep(x, each = y))) + y_rep <- unlist(purrr::map2(y_indices_l, xls, function(y, x) rep(y, x))) + + ret <- tibble::tibble(i = i, x = x_rep, y = y_rep) + + if (!is.null(extra_cols)) { + extra_indices <- rep(w, xls * yls) + extra_cols_rep <- extra_cols[extra_indices + 1, , drop = FALSE] + ret <- dplyr::bind_cols(ret, extra_cols_rep) + } + + ret + })) + + if (length(by$x) > 1) { + # only take cases where all pairs have matches + accept <- matches %>% + dplyr::count(x, y) %>% + dplyr::ungroup() %>% + dplyr::filter(n == length(by$x)) + + matches <- matches %>% + dplyr::semi_join(accept, by = c("x", "y")) + + if (ncol(matches) > 3) { + # include one for each + matches <- matches %>% + dplyr::semi_join(accept, by = c("x", "y")) %>% + dplyr::mutate(name = by$x[i]) %>% + dplyr::select(-i) %>% + tidyr::gather(key, value, -x, -y, -name) %>% + tidyr::unite(newname, name, key, sep = ".") %>% + tidyr::spread(newname, value) + } else { + matches <- dplyr::distinct(matches, x, y) + } + } + matches +} + +get_matches_multi <- function(x, y, by, multi_match_fun, multi_by) { + multi_match_fun <- purrr::as_mapper(multi_match_fun) + + # use multiple matches + by <- common_by(multi_by, x, y) + + number_x_rows <- nrow(x) + number_y_rows <- nrow(y) + + indices_x <- x %>% + dplyr::select_at(by$x) %>% + dplyr::mutate(indices = seq_len(number_x_rows)) %>% + dplyr::group_by_at(dplyr::vars(-dplyr::one_of("indices"))) %>% + tidyr::nest() %>% + dplyr::mutate(indices = purrr::map(data, "indices")) + indices_y <- y %>% + dplyr::select_at(by$y) %>% + dplyr::mutate(indices = seq_len(number_y_rows)) %>% + dplyr::group_by_at(dplyr::vars(-dplyr::one_of("indices"))) %>% + tidyr::nest() %>% + dplyr::mutate(indices = purrr::map(data, "indices")) + + ux <- as.matrix(indices_x[by$x]) + uy <- as.matrix(indices_y[by$y]) + + pairs <- matrix(NA, nrow(ux), nrow(uy)) + ix <- row(pairs) + iy <- col(pairs) + ux_input <- ux[ix, ] + uy_input <- uy[iy, ] + + m <- multi_match_fun(ux_input, uy_input) + + extra_cols <- NULL + if (is.data.frame(m)) { + if (ncol(m) > 1) { + extra_cols <- m[, -1, drop = FALSE] + } + m <- m[[1]] + } + + if (sum(m) == 0) { + # there are no matches + matches <- tibble::tibble(x = numeric(0), y = numeric(0)) + } else { + x_indices_l <- indices_x$indices[ix[m]] + y_indices_l <- indices_y$indices[iy[m]] + xls <- purrr::map_dbl(x_indices_l, length) + yls <- purrr::map_dbl(y_indices_l, length) + x_rep <- unlist(purrr::map2(x_indices_l, yls, function(x, y) rep(x, each = y))) + y_rep <- unlist(purrr::map2(y_indices_l, xls, function(y, x) rep(y, x))) + + matches <- tibble::tibble(x = x_rep, y = y_rep) + if (!is.null(extra_cols)) { + extra_indices <- rep(which(m), xls * yls) + extra_cols_rep <- extra_cols[extra_indices, , drop = FALSE] + matches <- dplyr::bind_cols(matches, extra_cols_rep) + } + } + matches +} + + +get_matches_index <- function(x, y, by, index_match_fun, multi_by) { + # raw index-index function + index_match_fun <- purrr::as_mapper(index_match_fun) + by <- common_by(multi_by, x, y) + + d1 <- x[, by$x, drop = FALSE] + d2 <- y[, by$y, drop = FALSE] + + matches <- index_match_fun(d1, d2) + matches +} From ea3262225b969f155fb643b1be9724a3ad4d03d9 Mon Sep 17 00:00:00 2001 From: Antoine Fabri Date: Fri, 22 Oct 2021 18:47:33 +0200 Subject: [PATCH 02/10] Extract code to get matches for one key pair into `get_matches1` --- R/get_matches.R | 148 +++++++++++++++++++++++++----------------------- 1 file changed, 76 insertions(+), 72 deletions(-) diff --git a/R/get_matches.R b/R/get_matches.R index 47aeb17..138f6ed 100644 --- a/R/get_matches.R +++ b/R/get_matches.R @@ -15,78 +15,8 @@ get_matches <- function(x, y, by, match_fun, ...) { stop("Length of match_fun not equal to columns specified in 'by'.", call. = FALSE) } - matches <- dplyr::bind_rows(lapply(seq_along(by$x), function(i) { - col_x <- x[[by$x[i]]] - col_y <- y[[by$y[i]]] - - indices_x <- tibble::tibble( - col = col_x, - indices = seq_along(col_x) - ) %>% - dplyr::group_by(col) %>% - tidyr::nest() %>% - dplyr::mutate(indices = purrr::map(data, "indices")) - - indices_y <- tibble::tibble( - col = col_y, - indices = seq_along(col_y) - ) %>% - dplyr::group_by(col) %>% - tidyr::nest() %>% - dplyr::mutate(indices = purrr::map(data, "indices")) - - u_x <- indices_x$col - u_y <- indices_y$col - - if (!is.null(names(match_fun))) { - # match_fun is a named list, use the names in x - mf <- match_fun[[by$x[[i]]]] - } else { - mf <- match_fun[[i]] - } - - extra_cols <- NULL - - n_x <- length(u_x) - n_y <- length(u_y) - m <- mf(rep(u_x, n_y), rep(u_y, each = n_x), ...) - - if (is.data.frame(m)) { - if (ncol(m) > 1) { - # first column is logical, others are included as distance columns - extra_cols <- m[, -1, drop = FALSE] - } - m <- m[[1]] - } - - # return as a data frame of x and y indices that match - w <- which(m) - 1 - - if (length(w) == 0) { - # there are no matches - ret <- tibble::tibble(i = numeric(0), x = numeric(0), y = numeric(0)) - return(ret) - } - - x_indices_l <- indices_x$indices[w %% n_x + 1] - y_indices_l <- indices_y$indices[w %/% n_x + 1] - - xls <- sapply(x_indices_l, length) - yls <- sapply(y_indices_l, length) - - x_rep <- unlist(purrr::map2(x_indices_l, yls, function(x, y) rep(x, each = y))) - y_rep <- unlist(purrr::map2(y_indices_l, xls, function(y, x) rep(y, x))) - - ret <- tibble::tibble(i = i, x = x_rep, y = y_rep) - - if (!is.null(extra_cols)) { - extra_indices <- rep(w, xls * yls) - extra_cols_rep <- extra_cols[extra_indices + 1, , drop = FALSE] - ret <- dplyr::bind_cols(ret, extra_cols_rep) - } - - ret - })) + # for each pair of key columns build a match data frame, and bind them + matches <- purrr::map_dfr(seq_along(by$x), get_matches1, x, y, by, match_fun, ...) if (length(by$x) > 1) { # only take cases where all pairs have matches @@ -188,3 +118,77 @@ get_matches_index <- function(x, y, by, index_match_fun, multi_by) { matches <- index_match_fun(d1, d2) matches } + + +get_matches1 <- function(i, x, y, by, match_fun, ...) { + col_x <- x[[by$x[i]]] + col_y <- y[[by$y[i]]] + + indices_x <- tibble::tibble( + col = col_x, + indices = seq_along(col_x) + ) %>% + dplyr::group_by(col) %>% + tidyr::nest() %>% + dplyr::mutate(indices = purrr::map(data, "indices")) + + indices_y <- tibble::tibble( + col = col_y, + indices = seq_along(col_y) + ) %>% + dplyr::group_by(col) %>% + tidyr::nest() %>% + dplyr::mutate(indices = purrr::map(data, "indices")) + + u_x <- indices_x$col + u_y <- indices_y$col + + if (!is.null(names(match_fun))) { + # match_fun is a named list, use the names in x + mf <- match_fun[[by$x[[i]]]] + } else { + mf <- match_fun[[i]] + } + + extra_cols <- NULL + + n_x <- length(u_x) + n_y <- length(u_y) + m <- mf(rep(u_x, n_y), rep(u_y, each = n_x), ...) + + if (is.data.frame(m)) { + if (ncol(m) > 1) { + # first column is logical, others are included as distance columns + extra_cols <- m[, -1, drop = FALSE] + } + m <- m[[1]] + } + + # return as a data frame of x and y indices that match + w <- which(m) - 1 + + if (length(w) == 0) { + # there are no matches + ret <- tibble::tibble(i = numeric(0), x = numeric(0), y = numeric(0)) + return(ret) + } + + x_indices_l <- indices_x$indices[w %% n_x + 1] + y_indices_l <- indices_y$indices[w %/% n_x + 1] + + xls <- sapply(x_indices_l, length) + yls <- sapply(y_indices_l, length) + + x_rep <- unlist(purrr::map2(x_indices_l, yls, function(x, y) rep(x, each = y))) + y_rep <- unlist(purrr::map2(y_indices_l, xls, function(y, x) rep(y, x))) + + ret <- tibble::tibble(i = i, x = x_rep, y = y_rep) + + if (!is.null(extra_cols)) { + extra_indices <- rep(w, xls * yls) + extra_cols_rep <- extra_cols[extra_indices + 1, , drop = FALSE] + ret <- dplyr::bind_cols(ret, extra_cols_rep) + } + + ret +} From 45e9d215a3c887698124f8e83e788a912c626712 Mon Sep 17 00:00:00 2001 From: Antoine Fabri Date: Fri, 22 Oct 2021 20:15:53 +0200 Subject: [PATCH 03/10] return early --- R/get_matches.R | 47 +++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/R/get_matches.R b/R/get_matches.R index 138f6ed..44bc709 100644 --- a/R/get_matches.R +++ b/R/get_matches.R @@ -18,29 +18,32 @@ get_matches <- function(x, y, by, match_fun, ...) { # for each pair of key columns build a match data frame, and bind them matches <- purrr::map_dfr(seq_along(by$x), get_matches1, x, y, by, match_fun, ...) - if (length(by$x) > 1) { - # only take cases where all pairs have matches - accept <- matches %>% - dplyr::count(x, y) %>% - dplyr::ungroup() %>% - dplyr::filter(n == length(by$x)) - - matches <- matches %>% - dplyr::semi_join(accept, by = c("x", "y")) - - if (ncol(matches) > 3) { - # include one for each - matches <- matches %>% - dplyr::semi_join(accept, by = c("x", "y")) %>% - dplyr::mutate(name = by$x[i]) %>% - dplyr::select(-i) %>% - tidyr::gather(key, value, -x, -y, -name) %>% - tidyr::unite(newname, name, key, sep = ".") %>% - tidyr::spread(newname, value) - } else { - matches <- dplyr::distinct(matches, x, y) - } + if (length(by$x) == 1) { + return(matches) } + + # only take cases where all pairs have matches + accept <- matches %>% + dplyr::count(x, y) %>% + dplyr::ungroup() %>% + dplyr::filter(n == length(by$x)) + + matches <- matches %>% + dplyr::semi_join(accept, by = c("x", "y")) + + if (ncol(matches) == 3) { + return(dplyr::distinct(matches, x, y)) + } + + # include one for each + matches <- matches %>% + dplyr::semi_join(accept, by = c("x", "y")) %>% + dplyr::mutate(name = by$x[i]) %>% + dplyr::select(-i) %>% + tidyr::gather(key, value, -x, -y, -name) %>% + tidyr::unite(newname, name, key, sep = ".") %>% + tidyr::spread(newname, value) + matches } From a7f8c28262e4c960575765b8a4e953fe1f7e4853 Mon Sep 17 00:00:00 2001 From: Antoine Fabri Date: Fri, 22 Oct 2021 20:37:44 +0200 Subject: [PATCH 04/10] Extract code into `complete_matches()` function --- R/fuzzy_join.R | 20 +++----------------- R/get_matches.R | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/R/fuzzy_join.R b/R/fuzzy_join.R index 314dc7e..90a94b9 100644 --- a/R/fuzzy_join.R +++ b/R/fuzzy_join.R @@ -73,7 +73,8 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL, multi_match_fun = get_matches_multi(x, y, by, multi_match_fun, multi_by), index_match_fun = get_matches_index(x, y, by, index_match_fun, multi_by) ) - matches$i <- NULL + + matches <- complete_matches(matches, mode, nrow(x), nrow(y)) if (mode == "semi") { # just use the x indices to include @@ -87,31 +88,16 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL, return(regroup(x[-sort(unique(matches$x)), , drop = FALSE])) } - matches <- dplyr::arrange(matches, x, y) - # in cases where columns share a name, rename each to .x and .y n <- intersect(colnames(x), colnames(y)) x <- dplyr::rename_at(x, .vars = n, ~ paste0(.x, ".x")) y <- dplyr::rename_at(y, .vars = n, ~ paste0(.x, ".y")) - # fill in indices of the x, y, or both - # curious if there's a higher performance approach - if (mode == "left") { - matches <- tibble::tibble(x = seq_len(nrow(x))) %>% - dplyr::left_join(matches, by = "x") - } else if (mode == "right") { - matches <- tibble::tibble(y = seq_len(nrow(y))) %>% - dplyr::left_join(matches, by = "y") - } else if (mode == "full") { - matches <- matches %>% - dplyr::full_join(tibble::tibble(x = seq_len(nrow(x))), by = "x") %>% - dplyr::full_join(tibble::tibble(y = seq_len(nrow(y))), by = "y") - } - ret <- dplyr::bind_cols( unrowwname(x[matches$x, , drop = FALSE]), unrowwname(y[matches$y, , drop = FALSE]) ) + if (ncol(matches) > 2) { extra_cols <- unrowwname(matches[, -(1:2), drop = FALSE]) ret <- dplyr::bind_cols(ret, extra_cols) diff --git a/R/get_matches.R b/R/get_matches.R index 44bc709..317afba 100644 --- a/R/get_matches.R +++ b/R/get_matches.R @@ -195,3 +195,26 @@ get_matches1 <- function(i, x, y, by, match_fun, ...) { ret } + +complete_matches <- function(matches, mode, n_x, n_y) { + matches$i <- NULL + if(mode %in% c("semi", "anti")) { + return(matches) + } + matches <- dplyr::arrange(matches, x, y) + + # fill in indices of the x, y, or both + # curious if there's a higher performance approach + if (mode == "left") { + matches <- tibble::tibble(x = seq_len(n_x)) %>% + dplyr::left_join(matches, by = "x") + } else if (mode == "right") { + matches <- tibble::tibble(y = seq_len(n_y)) %>% + dplyr::left_join(matches, by = "y") + } else if (mode == "full") { + matches <- matches %>% + dplyr::full_join(tibble::tibble(x = seq_len(n_x)), by = "x") %>% + dplyr::full_join(tibble::tibble(y = seq_len(n_y)), by = "y") + } + matches +} From 8dd86863d39d59b2c3fbc8eb132aa6a793059589 Mon Sep 17 00:00:00 2001 From: Antoine Fabri Date: Fri, 22 Oct 2021 20:50:13 +0200 Subject: [PATCH 05/10] Extract code that builds the output from matches into `build_output()` --- R/fuzzy_join.R | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/R/fuzzy_join.R b/R/fuzzy_join.R index 90a94b9..3bc7bce 100644 --- a/R/fuzzy_join.R +++ b/R/fuzzy_join.R @@ -76,16 +76,30 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL, matches <- complete_matches(matches, mode, nrow(x), nrow(y)) + ret <- build_output(x, y, matches, mode) + ret <- regroup(ret) + + # Base the type (data.frame vs tbl_df) on x, not on y + if (!inherits(x, "tbl_df")) { + ret <- as.data.frame(ret) + } + + ret +} + + +build_output <- function(x, y, matches, mode) { if (mode == "semi") { # just use the x indices to include - return(regroup(x[sort(unique(matches$x)), , drop = FALSE])) + return(x[sort(unique(matches$x)), , drop = FALSE]) } + if (mode == "anti") { if (nrow(matches) == 0) { - return(regroup(x)) + return(x) } # just use the x indices to exclude - return(regroup(x[-sort(unique(matches$x)), , drop = FALSE])) + return(x[-sort(unique(matches$x)), , drop = FALSE]) } # in cases where columns share a name, rename each to .x and .y @@ -98,22 +112,17 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL, unrowwname(y[matches$y, , drop = FALSE]) ) + # bind extra columns if (ncol(matches) > 2) { extra_cols <- unrowwname(matches[, -(1:2), drop = FALSE]) ret <- dplyr::bind_cols(ret, extra_cols) } - ret <- regroup(ret) - - # Base the type (data.frame vs tbl_df) on x, not on y - if (!inherits(x, "tbl_df")) { - ret <- as.data.frame(ret) - } - ret } + #' @rdname fuzzy_join #' @export fuzzy_inner_join <- function(x, y, by = NULL, match_fun, ...) { From c74eecdba03963b4972facf7b9428187fad49475 Mon Sep 17 00:00:00 2001 From: Antoine Fabri Date: Fri, 22 Oct 2021 20:55:32 +0200 Subject: [PATCH 06/10] Move `regroup()` out of `fuzzy_join()` --- R/fuzzy_join.R | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/R/fuzzy_join.R b/R/fuzzy_join.R index 3bc7bce..67dc73e 100644 --- a/R/fuzzy_join.R +++ b/R/fuzzy_join.R @@ -45,18 +45,6 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL, # preserve the grouping of x x_groups <- dplyr::groups(x) x <- dplyr::ungroup(x) - regroup <- function(d) { - if (length(x_groups) == 0) { - return(d) - } - - g <- purrr::map_chr(x_groups, as.character) - missing <- !(g %in% colnames(d)) - # add .x to those that are missing; they've been renamed - g[missing] <- paste0(g[missing], ".x") - - dplyr::group_by_at(d, g) - } mode <- match.arg(mode, c("inner", "left", "right", "full", "semi", "anti")) @@ -77,7 +65,7 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL, matches <- complete_matches(matches, mode, nrow(x), nrow(y)) ret <- build_output(x, y, matches, mode) - ret <- regroup(ret) + ret <- regroup(ret, x_groups) # Base the type (data.frame vs tbl_df) on x, not on y if (!inherits(x, "tbl_df")) { @@ -87,6 +75,18 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL, ret } +regroup <- function(d, x_groups) { + if (length(x_groups) == 0) { + return(d) + } + + g <- purrr::map_chr(x_groups, as.character) + missing <- !(g %in% colnames(d)) + # add .x to those that are missing; they've been renamed + g[missing] <- paste0(g[missing], ".x") + + dplyr::group_by_at(d, g) +} build_output <- function(x, y, matches, mode) { if (mode == "semi") { From 0caf939aef9059f0ead01b66b233e3c394dffc52 Mon Sep 17 00:00:00 2001 From: Antoine Fabri Date: Fri, 22 Oct 2021 20:56:49 +0200 Subject: [PATCH 07/10] Put match.arg() first --- R/fuzzy_join.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/fuzzy_join.R b/R/fuzzy_join.R index 67dc73e..15999c5 100644 --- a/R/fuzzy_join.R +++ b/R/fuzzy_join.R @@ -42,12 +42,12 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL, multi_by = NULL, multi_match_fun = NULL, index_match_fun = NULL, mode = "inner", ...) { + mode <- match.arg(mode, c("inner", "left", "right", "full", "semi", "anti")) + # preserve the grouping of x x_groups <- dplyr::groups(x) x <- dplyr::ungroup(x) - mode <- match.arg(mode, c("inner", "left", "right", "full", "semi", "anti")) - match_fun_type <- c("match_fun", "multi_match_fun", "index_match_fun")[ c(!is.null(match_fun), !is.null(multi_match_fun), !is.null(index_match_fun)) ] From 8ff3cf5a4932c8e7f0baa12c8e792d99d6eadbde Mon Sep 17 00:00:00 2001 From: Antoine Fabri Date: Fri, 22 Oct 2021 21:00:13 +0200 Subject: [PATCH 08/10] Improve readability --- R/fuzzy_join.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/fuzzy_join.R b/R/fuzzy_join.R index 15999c5..ad55d0a 100644 --- a/R/fuzzy_join.R +++ b/R/fuzzy_join.R @@ -48,9 +48,13 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL, x_groups <- dplyr::groups(x) x <- dplyr::ungroup(x) - match_fun_type <- c("match_fun", "multi_match_fun", "index_match_fun")[ - c(!is.null(match_fun), !is.null(multi_match_fun), !is.null(index_match_fun)) + match_fun_types <- c("match_fun", "multi_match_fun", "index_match_fun") + match_fun_type <- match_fun_types[ + c(!is.null(match_fun), + !is.null(multi_match_fun), + !is.null(index_match_fun)) ] + if (length(match_fun_type) > 1) { stop("Must give exactly one of match_fun, multi_match_fun, and index_match_fun") } @@ -61,9 +65,7 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL, multi_match_fun = get_matches_multi(x, y, by, multi_match_fun, multi_by), index_match_fun = get_matches_index(x, y, by, index_match_fun, multi_by) ) - matches <- complete_matches(matches, mode, nrow(x), nrow(y)) - ret <- build_output(x, y, matches, mode) ret <- regroup(ret, x_groups) From 94dcc0f8550c4df3da6606a3a2d1955bf350e759 Mon Sep 17 00:00:00 2001 From: Antoine Fabri Date: Fri, 22 Oct 2021 21:06:06 +0200 Subject: [PATCH 09/10] Return early --- R/get_matches.R | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/R/get_matches.R b/R/get_matches.R index 317afba..fdcdc97 100644 --- a/R/get_matches.R +++ b/R/get_matches.R @@ -62,6 +62,7 @@ get_matches_multi <- function(x, y, by, multi_match_fun, multi_by) { dplyr::group_by_at(dplyr::vars(-dplyr::one_of("indices"))) %>% tidyr::nest() %>% dplyr::mutate(indices = purrr::map(data, "indices")) + indices_y <- y %>% dplyr::select_at(by$y) %>% dplyr::mutate(indices = seq_len(number_y_rows)) %>% @@ -91,21 +92,23 @@ get_matches_multi <- function(x, y, by, multi_match_fun, multi_by) { if (sum(m) == 0) { # there are no matches matches <- tibble::tibble(x = numeric(0), y = numeric(0)) - } else { - x_indices_l <- indices_x$indices[ix[m]] - y_indices_l <- indices_y$indices[iy[m]] - xls <- purrr::map_dbl(x_indices_l, length) - yls <- purrr::map_dbl(y_indices_l, length) - x_rep <- unlist(purrr::map2(x_indices_l, yls, function(x, y) rep(x, each = y))) - y_rep <- unlist(purrr::map2(y_indices_l, xls, function(y, x) rep(y, x))) - - matches <- tibble::tibble(x = x_rep, y = y_rep) - if (!is.null(extra_cols)) { - extra_indices <- rep(which(m), xls * yls) - extra_cols_rep <- extra_cols[extra_indices, , drop = FALSE] - matches <- dplyr::bind_cols(matches, extra_cols_rep) - } + return(matches) } + + x_indices_l <- indices_x$indices[ix[m]] + y_indices_l <- indices_y$indices[iy[m]] + xls <- purrr::map_dbl(x_indices_l, length) + yls <- purrr::map_dbl(y_indices_l, length) + x_rep <- unlist(purrr::map2(x_indices_l, yls, function(x, y) rep(x, each = y))) + y_rep <- unlist(purrr::map2(y_indices_l, xls, function(y, x) rep(y, x))) + + matches <- tibble::tibble(x = x_rep, y = y_rep) + if (!is.null(extra_cols)) { + extra_indices <- rep(which(m), xls * yls) + extra_cols_rep <- extra_cols[extra_indices, , drop = FALSE] + matches <- dplyr::bind_cols(matches, extra_cols_rep) + } + matches } From 8a0d52154a0b084e9f519f5c94b8218af1e8da8d Mon Sep 17 00:00:00 2001 From: Antoine Fabri Date: Fri, 22 Oct 2021 22:40:53 +0200 Subject: [PATCH 10/10] Remove unnecessary by arg --- R/fuzzy_join.R | 4 ++-- R/get_matches.R | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/R/fuzzy_join.R b/R/fuzzy_join.R index ad55d0a..da2bfc7 100644 --- a/R/fuzzy_join.R +++ b/R/fuzzy_join.R @@ -62,8 +62,8 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL, matches <- switch( match_fun_type, match_fun = get_matches(x, y, by, match_fun, ...), - multi_match_fun = get_matches_multi(x, y, by, multi_match_fun, multi_by), - index_match_fun = get_matches_index(x, y, by, index_match_fun, multi_by) + multi_match_fun = get_matches_multi(x, y, multi_by, multi_match_fun), + index_match_fun = get_matches_index(x, y, multi_by, index_match_fun) ) matches <- complete_matches(matches, mode, nrow(x), nrow(y)) ret <- build_output(x, y, matches, mode) diff --git a/R/get_matches.R b/R/get_matches.R index fdcdc97..1f3b16e 100644 --- a/R/get_matches.R +++ b/R/get_matches.R @@ -47,7 +47,7 @@ get_matches <- function(x, y, by, match_fun, ...) { matches } -get_matches_multi <- function(x, y, by, multi_match_fun, multi_by) { +get_matches_multi <- function(x, y, multi_by, multi_match_fun) { multi_match_fun <- purrr::as_mapper(multi_match_fun) # use multiple matches @@ -76,6 +76,7 @@ get_matches_multi <- function(x, y, by, multi_match_fun, multi_by) { pairs <- matrix(NA, nrow(ux), nrow(uy)) ix <- row(pairs) iy <- col(pairs) + # we should have drop = FALSE here ux_input <- ux[ix, ] uy_input <- uy[iy, ] @@ -113,7 +114,7 @@ get_matches_multi <- function(x, y, by, multi_match_fun, multi_by) { } -get_matches_index <- function(x, y, by, index_match_fun, multi_by) { +get_matches_index <- function(x, y, multi_by, index_match_fun) { # raw index-index function index_match_fun <- purrr::as_mapper(index_match_fun) by <- common_by(multi_by, x, y)