diff --git a/DESCRIPTION b/DESCRIPTION index b4080c8..593786e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -34,4 +34,4 @@ Suggests: maps, IRanges, covr -RoxygenNote: 6.0.1 +RoxygenNote: 6.1.1 diff --git a/NAMESPACE b/NAMESPACE index d635caa..9cdd734 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -57,4 +57,6 @@ export(stringdist_left_join) export(stringdist_right_join) export(stringdist_semi_join) importFrom(dplyr,"%>%") +importFrom(dplyr,common_by) +importFrom(dplyr,data_frame) importFrom(utils,data) diff --git a/R/geo_join.R b/R/geo_join.R index d3393b0..a6833ec 100644 --- a/R/geo_join.R +++ b/R/geo_join.R @@ -8,7 +8,9 @@ #' #' @param x A tbl #' @param y A tbl -#' @param by Columns by which to join the two tables +#' @param by Columns by which to join the two tables. Must contain 'lon' +#' for longitude, and 'lat' for latitude, such as 'lat', or +#' 'lat.y', or 'latitude'. #' @param max_dist Maximum distance to use for joining #' @param method Method to use for computing distance: one of #' "haversine" (default), "geo", "cosine", "meeus", "vincentysphere", @@ -29,6 +31,7 @@ #' \code{fuzzy_join}. 
#' #' @importFrom utils data +#' @importFrom dplyr common_by data_frame #' #' @examples #' @@ -74,7 +77,7 @@ geo_join <- function(x, y, by = NULL, max_dist, unit <- match.arg(unit) # make sure longitude and latitude are in the right order - by <- common_by(by, x, y) + by <- dplyr::common_by(by, x, y) by <- lapply(by, function(e) { if (length(e) != 2) { stop("Trying to join on ", paste(e, collapse = ", "), diff --git a/man/geo_join.Rd b/man/geo_join.Rd index 74681c3..47e2f9a 100644 --- a/man/geo_join.Rd +++ b/man/geo_join.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/geo_join.R +% Please edit documentation in R/geo_join.R, R/geo_join2.R \name{geo_join} \alias{geo_join} \alias{geo_inner_join} @@ -10,9 +10,31 @@ \alias{geo_anti_join} \title{Join two tables based on a geo distance of longitudes and latitudes} \usage{ -geo_join(x, y, by = NULL, max_dist, method = c("haversine", "geo", "cosine", - "meeus", "vincentysphere", "vincentyellipsoid"), unit = c("miles", "km"), - mode = "inner", distance_col = NULL, ...) +geo_join(x, y, by = NULL, max_dist, method = c("haversine", "geo", + "cosine", "meeus", "vincentysphere", "vincentyellipsoid"), + unit = c("miles", "km"), mode = "inner", distance_col = NULL, ...) + +geo_inner_join(x, y, by = NULL, method = "haversine", max_dist = 1, + distance_col = NULL, ...) + +geo_left_join(x, y, by = NULL, method = "haversine", max_dist = 1, + distance_col = NULL, ...) + +geo_right_join(x, y, by = NULL, method = "haversine", max_dist = 1, + distance_col = NULL, ...) + +geo_full_join(x, y, by = NULL, method = "haversine", max_dist = 1, + distance_col = NULL, ...) + +geo_semi_join(x, y, by = NULL, method = "haversine", max_dist = 1, + distance_col = NULL, ...) + +geo_anti_join(x, y, by = NULL, method = "haversine", max_dist = 1, + distance_col = NULL, ...) 
+ +geo_join(x, y, by = NULL, max_dist, method = c("haversine", "geo", + "cosine", "meeus", "vincentysphere", "vincentyellipsoid"), + unit = c("miles", "km"), mode = "inner", distance_col = NULL, ...) geo_inner_join(x, y, by = NULL, method = "haversine", max_dist = 1, distance_col = NULL, ...) @@ -37,6 +59,29 @@ geo_anti_join(x, y, by = NULL, method = "haversine", max_dist = 1, \item{y}{A tbl} +\item{by}{Columns by which to join the two tables. Must contain 'lon' +for longitude, and 'lat' for latitude, such as 'lat', or +'lat.y', or 'latitude'.} + +\item{max_dist}{Maximum distance to use for joining} + +\item{method}{Method to use for computing distance: one of +"haversine" (default), "geo", "cosine", "meeus", "vincentysphere", +"vincentyellipsoid"} + +\item{unit}{Unit of distance for threshold (default "miles")} + +\item{mode}{One of "inner", "left", "right", "full", "semi", or "anti"} + +\item{distance_col}{If given, will add a column with this +name containing the geographical distance between the two} + +\item{...}{Extra arguments passed on to the distance method} + +\item{x}{A tbl} + +\item{y}{A tbl} + \item{by}{Columns by which to join the two tables} \item{max_dist}{Maximum distance to use for joining} @@ -55,6 +100,12 @@ name containing the geographical distance between the two} \item{...}{Extra arguments passed on to the distance method} } \description{ +This allows joining based on combinations of longitudes and latitudes. If +you are using a distance metric that is *not* based on latitude and +longitude, use \code{\link{distance_join}} instead. Distances are +calculated based on the \code{distHaversine}, \code{distGeo}, +\code{distCosine}, etc methods in the geosphere package. + This allows joining based on combinations of longitudes and latitudes. If you are using a distance metric that is *not* based on latitude and longitude, use \code{\link{distance_join}} instead. Distances are @@ -67,12 +118,52 @@ approximately the fastest method. 
Note that by far the slowest method is vincentyellipsoid, and on fuzzy joins should only be used when there are very few pairs and accuracy is imperative. +If you need to use a custom geo method, you may want to write it directly +with the \code{multi_by} and \code{multi_match_fun} arguments to +\code{fuzzy_join}. + +"Haversine" was chosen as default since in some tests it is +approximately the fastest method. Note that by far the slowest method is +vincentyellipsoid, and on fuzzy joins should only be used when there are +very few pairs and accuracy is imperative. + If you need to use a custom geo method, you may want to write it directly with the \code{multi_by} and \code{multi_match_fun} arguments to \code{fuzzy_join}. } \examples{ +library(dplyr) +data("state") + +# find pairs of US states whose centers are within +# 200 miles of each other +states <- data_frame(state = state.name, + longitude = state.center$x, + latitude = state.center$y) + +s1 <- rename(states, state1 = state) +s2 <- rename(states, state2 = state) + +pairs <- s1 \%>\% + geo_inner_join(s2, max_dist = 200) \%>\% + filter(state1 != state2) + +pairs + +# plot them +library(ggplot2) +ggplot(pairs, aes(x = longitude.x, y = latitude.x, + xend = longitude.y, yend = latitude.y)) + + geom_segment(color = "red") + + borders("state") + + theme_void() + +# also get distances +s1 \%>\% + geo_inner_join(s2, max_dist = 200, distance_col = "distance") + + library(dplyr) data("state")