From af938e9ea0d8c17742ef05514bd8a48f972671da Mon Sep 17 00:00:00 2001
From: Antoine Fabri <antoine.fabri@gmail.com>
Date: Fri, 22 Oct 2021 18:30:24 +0200
Subject: [PATCH 01/10] Extract big step into get_matches* functions

---
 R/fuzzy_join.R  | 196 +++---------------------------------------------
 R/get_matches.R | 190 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 200 insertions(+), 186 deletions(-)
 create mode 100644 R/get_matches.R

diff --git a/R/fuzzy_join.R b/R/fuzzy_join.R
index 62c159e..314dc7e 100644
--- a/R/fuzzy_join.R
+++ b/R/fuzzy_join.R
@@ -60,195 +60,19 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL,
 
   mode <- match.arg(mode, c("inner", "left", "right", "full", "semi", "anti"))
 
-  non_nulls <- (!is.null(multi_match_fun)) +
-    (!is.null(match_fun)) +
-    (!is.null(index_match_fun))
-  if (sum(non_nulls) != 1) {
+  match_fun_type <- c("match_fun", "multi_match_fun", "index_match_fun")[
+    c(!is.null(match_fun), !is.null(multi_match_fun), !is.null(index_match_fun))
+  ]
+  if (length(match_fun_type) > 1) {
     stop("Must give exactly one of match_fun, multi_match_fun, and index_match_fun")
   }
 
-  if (!is.null(match_fun)) {
-    by <- common_by(by, x, y)
-
-    # Support formula notation for functions
-    if (is.list(match_fun)) {
-      match_fun <- purrr::map(match_fun, purrr::as_mapper)
-    } else {
-      match_fun <- purrr::as_mapper(match_fun)
-    }
-
-    if (length(match_fun) == 1) {
-      match_fun <- rep(c(match_fun), length(by$x))
-    }
-    if (length(match_fun) != length(by$x)) {
-      stop("Length of match_fun not equal to columns specified in 'by'.", call. = FALSE)
-    }
-
-    matches <- dplyr::bind_rows(lapply(seq_along(by$x), function(i) {
-      col_x <- x[[by$x[i]]]
-      col_y <- y[[by$y[i]]]
-
-      indices_x <- tibble::tibble(
-        col = col_x,
-        indices = seq_along(col_x)
-      ) %>%
-        dplyr::group_by(col) %>%
-        tidyr::nest() %>%
-        dplyr::mutate(indices = purrr::map(data, "indices"))
-
-      indices_y <- tibble::tibble(
-        col = col_y,
-        indices = seq_along(col_y)
-      ) %>%
-        dplyr::group_by(col) %>%
-        tidyr::nest() %>%
-        dplyr::mutate(indices = purrr::map(data, "indices"))
-
-      u_x <- indices_x$col
-      u_y <- indices_y$col
-
-      if (!is.null(names(match_fun))) {
-        # match_fun is a named list, use the names in x
-        mf <- match_fun[[by$x[[i]]]]
-      } else {
-        mf <- match_fun[[i]]
-      }
-
-      extra_cols <- NULL
-
-      n_x <- length(u_x)
-      n_y <- length(u_y)
-      m <- mf(rep(u_x, n_y), rep(u_y, each = n_x), ...)
-
-      if (is.data.frame(m)) {
-        if (ncol(m) > 1) {
-          # first column is logical, others are included as distance columns
-          extra_cols <- m[, -1, drop = FALSE]
-        }
-        m <- m[[1]]
-      }
-
-      # return as a data frame of x and y indices that match
-      w <- which(m) - 1
-
-      if (length(w) == 0) {
-        # there are no matches
-        ret <- tibble::tibble(i = numeric(0), x = numeric(0), y = numeric(0))
-        return(ret)
-      }
-
-      x_indices_l <- indices_x$indices[w %% n_x + 1]
-      y_indices_l <- indices_y$indices[w %/% n_x + 1]
-
-      xls <- sapply(x_indices_l, length)
-      yls <- sapply(y_indices_l, length)
-
-      x_rep <- unlist(purrr::map2(x_indices_l, yls, function(x, y) rep(x, each = y)))
-      y_rep <- unlist(purrr::map2(y_indices_l, xls, function(y, x) rep(y, x)))
-
-      ret <- tibble::tibble(i = i, x = x_rep, y = y_rep)
-
-      if (!is.null(extra_cols)) {
-        extra_indices <- rep(w, xls * yls)
-        extra_cols_rep <- extra_cols[extra_indices + 1, , drop = FALSE]
-        ret <- dplyr::bind_cols(ret, extra_cols_rep)
-      }
-
-      ret
-    }))
-
-    if (length(by$x) > 1) {
-      # only take cases where all pairs have matches
-      accept <- matches %>%
-        dplyr::count(x, y) %>%
-        dplyr::ungroup() %>%
-        dplyr::filter(n == length(by$x))
-
-      matches <- matches %>%
-        dplyr::semi_join(accept, by = c("x", "y"))
-
-      if (ncol(matches) > 3) {
-        # include one for each
-        matches <- matches %>%
-          dplyr::semi_join(accept, by = c("x", "y")) %>%
-          dplyr::mutate(name = by$x[i]) %>%
-          dplyr::select(-i) %>%
-          tidyr::gather(key, value, -x, -y, -name) %>%
-          tidyr::unite(newname, name, key, sep = ".") %>%
-          tidyr::spread(newname, value)
-      } else {
-        matches <- dplyr::distinct(matches, x, y)
-      }
-    }
-  } else if (!is.null(multi_match_fun)) {
-    multi_match_fun <- purrr::as_mapper(multi_match_fun)
-
-    # use multiple matches
-    by <- common_by(multi_by, x, y)
-
-    number_x_rows <- nrow(x)
-    number_y_rows <- nrow(y)
-
-    indices_x <- x %>%
-      dplyr::select_at(by$x) %>%
-      dplyr::mutate(indices = seq_len(number_x_rows)) %>%
-      dplyr::group_by_at(dplyr::vars(-dplyr::one_of("indices"))) %>%
-      tidyr::nest() %>%
-      dplyr::mutate(indices = purrr::map(data, "indices"))
-    indices_y <- y %>%
-      dplyr::select_at(by$y) %>%
-      dplyr::mutate(indices = seq_len(number_y_rows)) %>%
-      dplyr::group_by_at(dplyr::vars(-dplyr::one_of("indices"))) %>%
-      tidyr::nest() %>%
-      dplyr::mutate(indices = purrr::map(data, "indices"))
-
-    ux <- as.matrix(indices_x[by$x])
-    uy <- as.matrix(indices_y[by$y])
-
-    pairs <- matrix(NA, nrow(ux), nrow(uy))
-    ix <- row(pairs)
-    iy <- col(pairs)
-    ux_input <- ux[ix, ]
-    uy_input <- uy[iy, ]
-
-    m <- multi_match_fun(ux_input, uy_input)
-
-    extra_cols <- NULL
-    if (is.data.frame(m)) {
-      if (ncol(m) > 1) {
-        extra_cols <- m[, -1, drop = FALSE]
-      }
-      m <- m[[1]]
-    }
-
-    if (sum(m) == 0) {
-      # there are no matches
-      matches <- tibble::tibble(x = numeric(0), y = numeric(0))
-    } else {
-      x_indices_l <- indices_x$indices[ix[m]]
-      y_indices_l <- indices_y$indices[iy[m]]
-      xls <- purrr::map_dbl(x_indices_l, length)
-      yls <- purrr::map_dbl(y_indices_l, length)
-      x_rep <- unlist(purrr::map2(x_indices_l, yls, function(x, y) rep(x, each = y)))
-      y_rep <- unlist(purrr::map2(y_indices_l, xls, function(y, x) rep(y, x)))
-
-      matches <- tibble::tibble(x = x_rep, y = y_rep)
-      if (!is.null(extra_cols)) {
-        extra_indices <- rep(which(m), xls * yls)
-        extra_cols_rep <- extra_cols[extra_indices, , drop = FALSE]
-        matches <- dplyr::bind_cols(matches, extra_cols_rep)
-      }
-    }
-  } else {
-    # raw index-index function
-    index_match_fun <- purrr::as_mapper(index_match_fun)
-    by <- common_by(multi_by, x, y)
-
-    d1 <- x[, by$x, drop = FALSE]
-    d2 <- y[, by$y, drop = FALSE]
-
-    matches <- index_match_fun(d1, d2)
-  }
+  matches <- switch(
+    match_fun_type,
+    match_fun = get_matches(x, y, by, match_fun, ...),
+    multi_match_fun = get_matches_multi(x, y, by, multi_match_fun, multi_by),
+    index_match_fun = get_matches_index(x, y, by, index_match_fun, multi_by)
+  )
   matches$i <- NULL
 
   if (mode == "semi") {
diff --git a/R/get_matches.R b/R/get_matches.R
new file mode 100644
index 0000000..47aeb17
--- /dev/null
+++ b/R/get_matches.R
@@ -0,0 +1,190 @@
+get_matches <- function(x, y, by, match_fun, ...) {
+  by <- common_by(by, x, y)
+
+  # Support formula notation for functions
+  if (is.list(match_fun)) {
+    match_fun <- purrr::map(match_fun, purrr::as_mapper)
+  } else {
+    match_fun <- purrr::as_mapper(match_fun)
+  }
+
+  if (length(match_fun) == 1) {
+    match_fun <- rep(c(match_fun), length(by$x))
+  }
+  if (length(match_fun) != length(by$x)) {
+    stop("Length of match_fun not equal to columns specified in 'by'.", call. = FALSE)
+  }
+
+  matches <- dplyr::bind_rows(lapply(seq_along(by$x), function(i) {
+    col_x <- x[[by$x[i]]]
+    col_y <- y[[by$y[i]]]
+
+    indices_x <- tibble::tibble(
+      col = col_x,
+      indices = seq_along(col_x)
+    ) %>%
+      dplyr::group_by(col) %>%
+      tidyr::nest() %>%
+      dplyr::mutate(indices = purrr::map(data, "indices"))
+
+    indices_y <- tibble::tibble(
+      col = col_y,
+      indices = seq_along(col_y)
+    ) %>%
+      dplyr::group_by(col) %>%
+      tidyr::nest() %>%
+      dplyr::mutate(indices = purrr::map(data, "indices"))
+
+    u_x <- indices_x$col
+    u_y <- indices_y$col
+
+    if (!is.null(names(match_fun))) {
+      # match_fun is a named list, use the names in x
+      mf <- match_fun[[by$x[[i]]]]
+    } else {
+      mf <- match_fun[[i]]
+    }
+
+    extra_cols <- NULL
+
+    n_x <- length(u_x)
+    n_y <- length(u_y)
+    m <- mf(rep(u_x, n_y), rep(u_y, each = n_x), ...)
+
+    if (is.data.frame(m)) {
+      if (ncol(m) > 1) {
+        # first column is logical, others are included as distance columns
+        extra_cols <- m[, -1, drop = FALSE]
+      }
+      m <- m[[1]]
+    }
+
+    # return as a data frame of x and y indices that match
+    w <- which(m) - 1
+
+    if (length(w) == 0) {
+      # there are no matches
+      ret <- tibble::tibble(i = numeric(0), x = numeric(0), y = numeric(0))
+      return(ret)
+    }
+
+    x_indices_l <- indices_x$indices[w %% n_x + 1]
+    y_indices_l <- indices_y$indices[w %/% n_x + 1]
+
+    xls <- sapply(x_indices_l, length)
+    yls <- sapply(y_indices_l, length)
+
+    x_rep <- unlist(purrr::map2(x_indices_l, yls, function(x, y) rep(x, each = y)))
+    y_rep <- unlist(purrr::map2(y_indices_l, xls, function(y, x) rep(y, x)))
+
+    ret <- tibble::tibble(i = i, x = x_rep, y = y_rep)
+
+    if (!is.null(extra_cols)) {
+      extra_indices <- rep(w, xls * yls)
+      extra_cols_rep <- extra_cols[extra_indices + 1, , drop = FALSE]
+      ret <- dplyr::bind_cols(ret, extra_cols_rep)
+    }
+
+    ret
+  }))
+
+  if (length(by$x) > 1) {
+    # only take cases where all pairs have matches
+    accept <- matches %>%
+      dplyr::count(x, y) %>%
+      dplyr::ungroup() %>%
+      dplyr::filter(n == length(by$x))
+
+    matches <- matches %>%
+      dplyr::semi_join(accept, by = c("x", "y"))
+
+    if (ncol(matches) > 3) {
+      # include one for each
+      matches <- matches %>%
+        dplyr::semi_join(accept, by = c("x", "y")) %>%
+        dplyr::mutate(name = by$x[i]) %>%
+        dplyr::select(-i) %>%
+        tidyr::gather(key, value, -x, -y, -name) %>%
+        tidyr::unite(newname, name, key, sep = ".") %>%
+        tidyr::spread(newname, value)
+    } else {
+      matches <- dplyr::distinct(matches, x, y)
+    }
+  }
+  matches
+}
+
+get_matches_multi <- function(x, y, by, multi_match_fun, multi_by) {
+  multi_match_fun <- purrr::as_mapper(multi_match_fun)
+
+  # use multiple matches
+  by <- common_by(multi_by, x, y)
+
+  number_x_rows <- nrow(x)
+  number_y_rows <- nrow(y)
+
+  indices_x <- x %>%
+    dplyr::select_at(by$x) %>%
+    dplyr::mutate(indices = seq_len(number_x_rows)) %>%
+    dplyr::group_by_at(dplyr::vars(-dplyr::one_of("indices"))) %>%
+    tidyr::nest() %>%
+    dplyr::mutate(indices = purrr::map(data, "indices"))
+  indices_y <- y %>%
+    dplyr::select_at(by$y) %>%
+    dplyr::mutate(indices = seq_len(number_y_rows)) %>%
+    dplyr::group_by_at(dplyr::vars(-dplyr::one_of("indices"))) %>%
+    tidyr::nest() %>%
+    dplyr::mutate(indices = purrr::map(data, "indices"))
+
+  ux <- as.matrix(indices_x[by$x])
+  uy <- as.matrix(indices_y[by$y])
+
+  pairs <- matrix(NA, nrow(ux), nrow(uy))
+  ix <- row(pairs)
+  iy <- col(pairs)
+  ux_input <- ux[ix, ]
+  uy_input <- uy[iy, ]
+
+  m <- multi_match_fun(ux_input, uy_input)
+
+  extra_cols <- NULL
+  if (is.data.frame(m)) {
+    if (ncol(m) > 1) {
+      extra_cols <- m[, -1, drop = FALSE]
+    }
+    m <- m[[1]]
+  }
+
+  if (sum(m) == 0) {
+    # there are no matches
+    matches <- tibble::tibble(x = numeric(0), y = numeric(0))
+  } else {
+    x_indices_l <- indices_x$indices[ix[m]]
+    y_indices_l <- indices_y$indices[iy[m]]
+    xls <- purrr::map_dbl(x_indices_l, length)
+    yls <- purrr::map_dbl(y_indices_l, length)
+    x_rep <- unlist(purrr::map2(x_indices_l, yls, function(x, y) rep(x, each = y)))
+    y_rep <- unlist(purrr::map2(y_indices_l, xls, function(y, x) rep(y, x)))
+
+    matches <- tibble::tibble(x = x_rep, y = y_rep)
+    if (!is.null(extra_cols)) {
+      extra_indices <- rep(which(m), xls * yls)
+      extra_cols_rep <- extra_cols[extra_indices, , drop = FALSE]
+      matches <- dplyr::bind_cols(matches, extra_cols_rep)
+    }
+  }
+  matches
+}
+
+
+get_matches_index <- function(x, y, by, index_match_fun, multi_by) {
+  # raw index-index function
+  index_match_fun <- purrr::as_mapper(index_match_fun)
+  by <- common_by(multi_by, x, y)
+
+  d1 <- x[, by$x, drop = FALSE]
+  d2 <- y[, by$y, drop = FALSE]
+
+  matches <- index_match_fun(d1, d2)
+  matches
+}

From ea3262225b969f155fb643b1be9724a3ad4d03d9 Mon Sep 17 00:00:00 2001
From: Antoine Fabri <antoine.fabri@gmail.com>
Date: Fri, 22 Oct 2021 18:47:33 +0200
Subject: [PATCH 02/10] Extract code to get matches for one key pair into
 `get_matches1`

---
 R/get_matches.R | 148 +++++++++++++++++++++++++-----------------------
 1 file changed, 76 insertions(+), 72 deletions(-)

diff --git a/R/get_matches.R b/R/get_matches.R
index 47aeb17..138f6ed 100644
--- a/R/get_matches.R
+++ b/R/get_matches.R
@@ -15,78 +15,8 @@ get_matches <- function(x, y, by, match_fun, ...) {
     stop("Length of match_fun not equal to columns specified in 'by'.", call. = FALSE)
   }
 
-  matches <- dplyr::bind_rows(lapply(seq_along(by$x), function(i) {
-    col_x <- x[[by$x[i]]]
-    col_y <- y[[by$y[i]]]
-
-    indices_x <- tibble::tibble(
-      col = col_x,
-      indices = seq_along(col_x)
-    ) %>%
-      dplyr::group_by(col) %>%
-      tidyr::nest() %>%
-      dplyr::mutate(indices = purrr::map(data, "indices"))
-
-    indices_y <- tibble::tibble(
-      col = col_y,
-      indices = seq_along(col_y)
-    ) %>%
-      dplyr::group_by(col) %>%
-      tidyr::nest() %>%
-      dplyr::mutate(indices = purrr::map(data, "indices"))
-
-    u_x <- indices_x$col
-    u_y <- indices_y$col
-
-    if (!is.null(names(match_fun))) {
-      # match_fun is a named list, use the names in x
-      mf <- match_fun[[by$x[[i]]]]
-    } else {
-      mf <- match_fun[[i]]
-    }
-
-    extra_cols <- NULL
-
-    n_x <- length(u_x)
-    n_y <- length(u_y)
-    m <- mf(rep(u_x, n_y), rep(u_y, each = n_x), ...)
-
-    if (is.data.frame(m)) {
-      if (ncol(m) > 1) {
-        # first column is logical, others are included as distance columns
-        extra_cols <- m[, -1, drop = FALSE]
-      }
-      m <- m[[1]]
-    }
-
-    # return as a data frame of x and y indices that match
-    w <- which(m) - 1
-
-    if (length(w) == 0) {
-      # there are no matches
-      ret <- tibble::tibble(i = numeric(0), x = numeric(0), y = numeric(0))
-      return(ret)
-    }
-
-    x_indices_l <- indices_x$indices[w %% n_x + 1]
-    y_indices_l <- indices_y$indices[w %/% n_x + 1]
-
-    xls <- sapply(x_indices_l, length)
-    yls <- sapply(y_indices_l, length)
-
-    x_rep <- unlist(purrr::map2(x_indices_l, yls, function(x, y) rep(x, each = y)))
-    y_rep <- unlist(purrr::map2(y_indices_l, xls, function(y, x) rep(y, x)))
-
-    ret <- tibble::tibble(i = i, x = x_rep, y = y_rep)
-
-    if (!is.null(extra_cols)) {
-      extra_indices <- rep(w, xls * yls)
-      extra_cols_rep <- extra_cols[extra_indices + 1, , drop = FALSE]
-      ret <- dplyr::bind_cols(ret, extra_cols_rep)
-    }
-
-    ret
-  }))
+  # for each pair of key columns build a match data frame, and bind them
+  matches <- purrr::map_dfr(seq_along(by$x), get_matches1, x, y, by, match_fun, ...)
 
   if (length(by$x) > 1) {
     # only take cases where all pairs have matches
@@ -188,3 +118,77 @@ get_matches_index <- function(x, y, by, index_match_fun, multi_by) {
   matches <- index_match_fun(d1, d2)
   matches
 }
+
+
+get_matches1 <- function(i, x, y, by, match_fun, ...) {
+  col_x <- x[[by$x[i]]]
+  col_y <- y[[by$y[i]]]
+
+  indices_x <- tibble::tibble(
+    col = col_x,
+    indices = seq_along(col_x)
+  ) %>%
+    dplyr::group_by(col) %>%
+    tidyr::nest() %>%
+    dplyr::mutate(indices = purrr::map(data, "indices"))
+
+  indices_y <- tibble::tibble(
+    col = col_y,
+    indices = seq_along(col_y)
+  ) %>%
+    dplyr::group_by(col) %>%
+    tidyr::nest() %>%
+    dplyr::mutate(indices = purrr::map(data, "indices"))
+
+  u_x <- indices_x$col
+  u_y <- indices_y$col
+
+  if (!is.null(names(match_fun))) {
+    # match_fun is a named list, use the names in x
+    mf <- match_fun[[by$x[[i]]]]
+  } else {
+    mf <- match_fun[[i]]
+  }
+
+  extra_cols <- NULL
+
+  n_x <- length(u_x)
+  n_y <- length(u_y)
+  m <- mf(rep(u_x, n_y), rep(u_y, each = n_x), ...)
+
+  if (is.data.frame(m)) {
+    if (ncol(m) > 1) {
+      # first column is logical, others are included as distance columns
+      extra_cols <- m[, -1, drop = FALSE]
+    }
+    m <- m[[1]]
+  }
+
+  # return as a data frame of x and y indices that match
+  w <- which(m) - 1
+
+  if (length(w) == 0) {
+    # there are no matches
+    ret <- tibble::tibble(i = numeric(0), x = numeric(0), y = numeric(0))
+    return(ret)
+  }
+
+  x_indices_l <- indices_x$indices[w %% n_x + 1]
+  y_indices_l <- indices_y$indices[w %/% n_x + 1]
+
+  xls <- sapply(x_indices_l, length)
+  yls <- sapply(y_indices_l, length)
+
+  x_rep <- unlist(purrr::map2(x_indices_l, yls, function(x, y) rep(x, each = y)))
+  y_rep <- unlist(purrr::map2(y_indices_l, xls, function(y, x) rep(y, x)))
+
+  ret <- tibble::tibble(i = i, x = x_rep, y = y_rep)
+
+  if (!is.null(extra_cols)) {
+    extra_indices <- rep(w, xls * yls)
+    extra_cols_rep <- extra_cols[extra_indices + 1, , drop = FALSE]
+    ret <- dplyr::bind_cols(ret, extra_cols_rep)
+  }
+
+  ret
+}

From 45e9d215a3c887698124f8e83e788a912c626712 Mon Sep 17 00:00:00 2001
From: Antoine Fabri <antoine.fabri@gmail.com>
Date: Fri, 22 Oct 2021 20:15:53 +0200
Subject: [PATCH 03/10] return early

---
 R/get_matches.R | 47 +++++++++++++++++++++++++----------------------
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/R/get_matches.R b/R/get_matches.R
index 138f6ed..44bc709 100644
--- a/R/get_matches.R
+++ b/R/get_matches.R
@@ -18,29 +18,32 @@ get_matches <- function(x, y, by, match_fun, ...) {
   # for each pair of key columns build a match data frame, and bind them
   matches <- purrr::map_dfr(seq_along(by$x), get_matches1, x, y, by, match_fun, ...)
 
-  if (length(by$x) > 1) {
-    # only take cases where all pairs have matches
-    accept <- matches %>%
-      dplyr::count(x, y) %>%
-      dplyr::ungroup() %>%
-      dplyr::filter(n == length(by$x))
-
-    matches <- matches %>%
-      dplyr::semi_join(accept, by = c("x", "y"))
-
-    if (ncol(matches) > 3) {
-      # include one for each
-      matches <- matches %>%
-        dplyr::semi_join(accept, by = c("x", "y")) %>%
-        dplyr::mutate(name = by$x[i]) %>%
-        dplyr::select(-i) %>%
-        tidyr::gather(key, value, -x, -y, -name) %>%
-        tidyr::unite(newname, name, key, sep = ".") %>%
-        tidyr::spread(newname, value)
-    } else {
-      matches <- dplyr::distinct(matches, x, y)
-    }
+  if (length(by$x) == 1) {
+    return(matches)
   }
+
+  # only take cases where all pairs have matches
+  accept <- matches %>%
+    dplyr::count(x, y) %>%
+    dplyr::ungroup() %>%
+    dplyr::filter(n == length(by$x))
+
+  matches <- matches %>%
+    dplyr::semi_join(accept, by = c("x", "y"))
+
+  if (ncol(matches) == 3) {
+    return(dplyr::distinct(matches, x, y))
+  }
+
+  # include one for each
+  matches <- matches %>%
+    dplyr::semi_join(accept, by = c("x", "y")) %>%
+    dplyr::mutate(name = by$x[i]) %>%
+    dplyr::select(-i) %>%
+    tidyr::gather(key, value, -x, -y, -name) %>%
+    tidyr::unite(newname, name, key, sep = ".") %>%
+    tidyr::spread(newname, value)
+
   matches
 }
 

From a7f8c28262e4c960575765b8a4e953fe1f7e4853 Mon Sep 17 00:00:00 2001
From: Antoine Fabri <antoine.fabri@gmail.com>
Date: Fri, 22 Oct 2021 20:37:44 +0200
Subject: [PATCH 04/10] Extract code into `complete_matches()` function

---
 R/fuzzy_join.R  | 20 +++-----------------
 R/get_matches.R | 23 +++++++++++++++++++++++
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/R/fuzzy_join.R b/R/fuzzy_join.R
index 314dc7e..90a94b9 100644
--- a/R/fuzzy_join.R
+++ b/R/fuzzy_join.R
@@ -73,7 +73,8 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL,
     multi_match_fun = get_matches_multi(x, y, by, multi_match_fun, multi_by),
     index_match_fun = get_matches_index(x, y, by, index_match_fun, multi_by)
   )
-  matches$i <- NULL
+
+  matches <- complete_matches(matches, mode, nrow(x), nrow(y))
 
   if (mode == "semi") {
     # just use the x indices to include
@@ -87,31 +88,16 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL,
     return(regroup(x[-sort(unique(matches$x)), , drop = FALSE]))
   }
 
-  matches <- dplyr::arrange(matches, x, y)
-
   # in cases where columns share a name, rename each to .x and .y
   n <- intersect(colnames(x), colnames(y))
   x <- dplyr::rename_at(x, .vars = n, ~ paste0(.x, ".x"))
   y <- dplyr::rename_at(y, .vars = n, ~ paste0(.x, ".y"))
 
-  # fill in indices of the x, y, or both
-  # curious if there's a higher performance approach
-  if (mode == "left") {
-    matches <- tibble::tibble(x = seq_len(nrow(x))) %>%
-      dplyr::left_join(matches, by = "x")
-  } else if (mode == "right") {
-    matches <- tibble::tibble(y = seq_len(nrow(y))) %>%
-      dplyr::left_join(matches, by = "y")
-  } else if (mode == "full") {
-    matches <- matches %>%
-      dplyr::full_join(tibble::tibble(x = seq_len(nrow(x))), by = "x") %>%
-      dplyr::full_join(tibble::tibble(y = seq_len(nrow(y))), by = "y")
-  }
-
   ret <- dplyr::bind_cols(
     unrowwname(x[matches$x, , drop = FALSE]),
     unrowwname(y[matches$y, , drop = FALSE])
   )
+
   if (ncol(matches) > 2) {
     extra_cols <- unrowwname(matches[, -(1:2), drop = FALSE])
     ret <- dplyr::bind_cols(ret, extra_cols)
diff --git a/R/get_matches.R b/R/get_matches.R
index 44bc709..317afba 100644
--- a/R/get_matches.R
+++ b/R/get_matches.R
@@ -195,3 +195,26 @@ get_matches1 <- function(i, x, y, by, match_fun, ...) {
 
   ret
 }
+
+complete_matches <- function(matches, mode, n_x, n_y) {
+  matches$i <- NULL
+  if(mode %in% c("semi", "anti")) {
+    return(matches)
+  }
+  matches <- dplyr::arrange(matches, x, y)
+
+  # fill in indices of the x, y, or both
+  # curious if there's a higher performance approach
+  if (mode == "left") {
+    matches <- tibble::tibble(x = seq_len(n_x)) %>%
+      dplyr::left_join(matches, by = "x")
+  } else if (mode == "right") {
+    matches <- tibble::tibble(y = seq_len(n_y)) %>%
+      dplyr::left_join(matches, by = "y")
+  } else if (mode == "full") {
+    matches <- matches %>%
+      dplyr::full_join(tibble::tibble(x = seq_len(n_x)), by = "x") %>%
+      dplyr::full_join(tibble::tibble(y = seq_len(n_y)), by = "y")
+  }
+  matches
+}

From 8dd86863d39d59b2c3fbc8eb132aa6a793059589 Mon Sep 17 00:00:00 2001
From: Antoine Fabri <antoine.fabri@gmail.com>
Date: Fri, 22 Oct 2021 20:50:13 +0200
Subject: [PATCH 05/10] Extract code that builds the output from matches into
 `build_output()`

---
 R/fuzzy_join.R | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/R/fuzzy_join.R b/R/fuzzy_join.R
index 90a94b9..3bc7bce 100644
--- a/R/fuzzy_join.R
+++ b/R/fuzzy_join.R
@@ -76,16 +76,30 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL,
 
   matches <- complete_matches(matches, mode, nrow(x), nrow(y))
 
+  ret <- build_output(x, y, matches, mode)
+  ret <- regroup(ret)
+
+  # Base the type (data.frame vs tbl_df) on x, not on y
+  if (!inherits(x, "tbl_df")) {
+    ret <- as.data.frame(ret)
+  }
+
+  ret
+}
+
+
+build_output <- function(x, y, matches, mode) {
   if (mode == "semi") {
     # just use the x indices to include
-    return(regroup(x[sort(unique(matches$x)), , drop = FALSE]))
+    return(x[sort(unique(matches$x)), , drop = FALSE])
   }
+
   if (mode == "anti") {
     if (nrow(matches) == 0) {
-      return(regroup(x))
+      return(x)
     }
     # just use the x indices to exclude
-    return(regroup(x[-sort(unique(matches$x)), , drop = FALSE]))
+    return(x[-sort(unique(matches$x)), , drop = FALSE])
   }
 
   # in cases where columns share a name, rename each to .x and .y
@@ -98,22 +112,17 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL,
     unrowwname(y[matches$y, , drop = FALSE])
   )
 
+  # bind extra columns
   if (ncol(matches) > 2) {
     extra_cols <- unrowwname(matches[, -(1:2), drop = FALSE])
     ret <- dplyr::bind_cols(ret, extra_cols)
   }
 
-  ret <- regroup(ret)
-
-  # Base the type (data.frame vs tbl_df) on x, not on y
-  if (!inherits(x, "tbl_df")) {
-    ret <- as.data.frame(ret)
-  }
-
   ret
 }
 
 
+
 #' @rdname fuzzy_join
 #' @export
 fuzzy_inner_join <- function(x, y, by = NULL, match_fun, ...) {

From c74eecdba03963b4972facf7b9428187fad49475 Mon Sep 17 00:00:00 2001
From: Antoine Fabri <antoine.fabri@gmail.com>
Date: Fri, 22 Oct 2021 20:55:32 +0200
Subject: [PATCH 06/10] Move `regroup()` out of `fuzzy_join()`

---
 R/fuzzy_join.R | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/R/fuzzy_join.R b/R/fuzzy_join.R
index 3bc7bce..67dc73e 100644
--- a/R/fuzzy_join.R
+++ b/R/fuzzy_join.R
@@ -45,18 +45,6 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL,
   # preserve the grouping of x
   x_groups <- dplyr::groups(x)
   x <- dplyr::ungroup(x)
-  regroup <- function(d) {
-    if (length(x_groups) == 0) {
-      return(d)
-    }
-
-    g <- purrr::map_chr(x_groups, as.character)
-    missing <- !(g %in% colnames(d))
-    # add .x to those that are missing; they've been renamed
-    g[missing] <- paste0(g[missing], ".x")
-
-    dplyr::group_by_at(d, g)
-  }
 
   mode <- match.arg(mode, c("inner", "left", "right", "full", "semi", "anti"))
 
@@ -77,7 +65,7 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL,
   matches <- complete_matches(matches, mode, nrow(x), nrow(y))
 
   ret <- build_output(x, y, matches, mode)
-  ret <- regroup(ret)
+  ret <- regroup(ret, x_groups)
 
   # Base the type (data.frame vs tbl_df) on x, not on y
   if (!inherits(x, "tbl_df")) {
@@ -87,6 +75,18 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL,
   ret
 }
 
+regroup <- function(d, x_groups) {
+  if (length(x_groups) == 0) {
+    return(d)
+  }
+
+  g <- purrr::map_chr(x_groups, as.character)
+  missing <- !(g %in% colnames(d))
+  # add .x to those that are missing; they've been renamed
+  g[missing] <- paste0(g[missing], ".x")
+
+  dplyr::group_by_at(d, g)
+}
 
 build_output <- function(x, y, matches, mode) {
   if (mode == "semi") {

From 0caf939aef9059f0ead01b66b233e3c394dffc52 Mon Sep 17 00:00:00 2001
From: Antoine Fabri <antoine.fabri@gmail.com>
Date: Fri, 22 Oct 2021 20:56:49 +0200
Subject: [PATCH 07/10] Put match.arg() first

---
 R/fuzzy_join.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/fuzzy_join.R b/R/fuzzy_join.R
index 67dc73e..15999c5 100644
--- a/R/fuzzy_join.R
+++ b/R/fuzzy_join.R
@@ -42,12 +42,12 @@
 fuzzy_join <- function(x, y, by = NULL, match_fun = NULL,
                        multi_by = NULL, multi_match_fun = NULL,
                        index_match_fun = NULL, mode = "inner", ...) {
+  mode <- match.arg(mode, c("inner", "left", "right", "full", "semi", "anti"))
+
   # preserve the grouping of x
   x_groups <- dplyr::groups(x)
   x <- dplyr::ungroup(x)
 
-  mode <- match.arg(mode, c("inner", "left", "right", "full", "semi", "anti"))
-
   match_fun_type <- c("match_fun", "multi_match_fun", "index_match_fun")[
     c(!is.null(match_fun), !is.null(multi_match_fun), !is.null(index_match_fun))
   ]

From 8ff3cf5a4932c8e7f0baa12c8e792d99d6eadbde Mon Sep 17 00:00:00 2001
From: Antoine Fabri <antoine.fabri@gmail.com>
Date: Fri, 22 Oct 2021 21:00:13 +0200
Subject: [PATCH 08/10] Improve readability

---
 R/fuzzy_join.R | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/R/fuzzy_join.R b/R/fuzzy_join.R
index 15999c5..ad55d0a 100644
--- a/R/fuzzy_join.R
+++ b/R/fuzzy_join.R
@@ -48,9 +48,13 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL,
   x_groups <- dplyr::groups(x)
   x <- dplyr::ungroup(x)
 
-  match_fun_type <- c("match_fun", "multi_match_fun", "index_match_fun")[
-    c(!is.null(match_fun), !is.null(multi_match_fun), !is.null(index_match_fun))
+  match_fun_types <- c("match_fun", "multi_match_fun", "index_match_fun")
+  match_fun_type <- match_fun_types[
+    c(!is.null(match_fun),
+      !is.null(multi_match_fun),
+      !is.null(index_match_fun))
   ]
+
   if (length(match_fun_type) > 1) {
     stop("Must give exactly one of match_fun, multi_match_fun, and index_match_fun")
   }
@@ -61,9 +65,7 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL,
     multi_match_fun = get_matches_multi(x, y, by, multi_match_fun, multi_by),
     index_match_fun = get_matches_index(x, y, by, index_match_fun, multi_by)
   )
-
   matches <- complete_matches(matches, mode, nrow(x), nrow(y))
-
   ret <- build_output(x, y, matches, mode)
   ret <- regroup(ret, x_groups)
 

From 94dcc0f8550c4df3da6606a3a2d1955bf350e759 Mon Sep 17 00:00:00 2001
From: Antoine Fabri <antoine.fabri@gmail.com>
Date: Fri, 22 Oct 2021 21:06:06 +0200
Subject: [PATCH 09/10] Return early

---
 R/get_matches.R | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/R/get_matches.R b/R/get_matches.R
index 317afba..fdcdc97 100644
--- a/R/get_matches.R
+++ b/R/get_matches.R
@@ -62,6 +62,7 @@ get_matches_multi <- function(x, y, by, multi_match_fun, multi_by) {
     dplyr::group_by_at(dplyr::vars(-dplyr::one_of("indices"))) %>%
     tidyr::nest() %>%
     dplyr::mutate(indices = purrr::map(data, "indices"))
+
   indices_y <- y %>%
     dplyr::select_at(by$y) %>%
     dplyr::mutate(indices = seq_len(number_y_rows)) %>%
@@ -91,21 +92,23 @@ get_matches_multi <- function(x, y, by, multi_match_fun, multi_by) {
   if (sum(m) == 0) {
     # there are no matches
     matches <- tibble::tibble(x = numeric(0), y = numeric(0))
-  } else {
-    x_indices_l <- indices_x$indices[ix[m]]
-    y_indices_l <- indices_y$indices[iy[m]]
-    xls <- purrr::map_dbl(x_indices_l, length)
-    yls <- purrr::map_dbl(y_indices_l, length)
-    x_rep <- unlist(purrr::map2(x_indices_l, yls, function(x, y) rep(x, each = y)))
-    y_rep <- unlist(purrr::map2(y_indices_l, xls, function(y, x) rep(y, x)))
-
-    matches <- tibble::tibble(x = x_rep, y = y_rep)
-    if (!is.null(extra_cols)) {
-      extra_indices <- rep(which(m), xls * yls)
-      extra_cols_rep <- extra_cols[extra_indices, , drop = FALSE]
-      matches <- dplyr::bind_cols(matches, extra_cols_rep)
-    }
+    return(matches)
   }
+
+  x_indices_l <- indices_x$indices[ix[m]]
+  y_indices_l <- indices_y$indices[iy[m]]
+  xls <- purrr::map_dbl(x_indices_l, length)
+  yls <- purrr::map_dbl(y_indices_l, length)
+  x_rep <- unlist(purrr::map2(x_indices_l, yls, function(x, y) rep(x, each = y)))
+  y_rep <- unlist(purrr::map2(y_indices_l, xls, function(y, x) rep(y, x)))
+
+  matches <- tibble::tibble(x = x_rep, y = y_rep)
+  if (!is.null(extra_cols)) {
+    extra_indices <- rep(which(m), xls * yls)
+    extra_cols_rep <- extra_cols[extra_indices, , drop = FALSE]
+    matches <- dplyr::bind_cols(matches, extra_cols_rep)
+  }
+
   matches
 }
 

From 8a0d52154a0b084e9f519f5c94b8218af1e8da8d Mon Sep 17 00:00:00 2001
From: Antoine Fabri <antoine.fabri@gmail.com>
Date: Fri, 22 Oct 2021 22:40:53 +0200
Subject: [PATCH 10/10] Remove unnecessary by arg

---
 R/fuzzy_join.R  | 4 ++--
 R/get_matches.R | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/R/fuzzy_join.R b/R/fuzzy_join.R
index ad55d0a..da2bfc7 100644
--- a/R/fuzzy_join.R
+++ b/R/fuzzy_join.R
@@ -62,8 +62,8 @@ fuzzy_join <- function(x, y, by = NULL, match_fun = NULL,
   matches <- switch(
     match_fun_type,
     match_fun = get_matches(x, y, by, match_fun, ...),
-    multi_match_fun = get_matches_multi(x, y, by, multi_match_fun, multi_by),
-    index_match_fun = get_matches_index(x, y, by, index_match_fun, multi_by)
+    multi_match_fun = get_matches_multi(x, y, multi_by, multi_match_fun),
+    index_match_fun = get_matches_index(x, y, multi_by, index_match_fun)
   )
   matches <- complete_matches(matches, mode, nrow(x), nrow(y))
   ret <- build_output(x, y, matches, mode)
diff --git a/R/get_matches.R b/R/get_matches.R
index fdcdc97..1f3b16e 100644
--- a/R/get_matches.R
+++ b/R/get_matches.R
@@ -47,7 +47,7 @@ get_matches <- function(x, y, by, match_fun, ...) {
   matches
 }
 
-get_matches_multi <- function(x, y, by, multi_match_fun, multi_by) {
+get_matches_multi <- function(x, y, multi_by, multi_match_fun) {
   multi_match_fun <- purrr::as_mapper(multi_match_fun)
 
   # use multiple matches
@@ -76,6 +76,7 @@ get_matches_multi <- function(x, y, by, multi_match_fun, multi_by) {
   pairs <- matrix(NA, nrow(ux), nrow(uy))
   ix <- row(pairs)
   iy <- col(pairs)
+  # we should have drop = FALSE here
   ux_input <- ux[ix, ]
   uy_input <- uy[iy, ]
 
@@ -113,7 +114,7 @@ get_matches_multi <- function(x, y, by, multi_match_fun, multi_by) {
 }
 
 
-get_matches_index <- function(x, y, by, index_match_fun, multi_by) {
+get_matches_index <- function(x, y, multi_by, index_match_fun) {
   # raw index-index function
   index_match_fun <- purrr::as_mapper(index_match_fun)
   by <- common_by(multi_by, x, y)