pyjanitor-devs
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 1 addition & 1 deletion b/‎CHANGELOG.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎janitor/functions/_conditional_join/_greater_than_indices.py‎
Lines changed: 141 additions & 0 deletions b/‎janitor/functions/_conditional_join/_greater_than_indices.py‎
Lines changed: 141 additions & 0 deletions
diff --git a/‎janitor/functions/_conditional_join/_helpers.py‎
Lines changed: 82 additions & 0 deletions b/‎janitor/functions/_conditional_join/_helpers.py‎
Lines changed: 82 additions & 0 deletions
diff --git a/‎janitor/functions/_conditional_join/_less_than_indices.py‎
Lines changed: 150 additions & 0 deletions b/‎janitor/functions/_conditional_join/_less_than_indices.py‎
Lines changed: 150 additions & 0 deletions
@@ -143,3 +143,4 @@ tags
 *.profraw
 /scratch.py
 midpoint.csv
+examples/notebooks/cond_join.ipynb
@@ -1,7 +1,7 @@
 # Changelog
 
 ## [Unreleased]
--   [ENH] Added `row_count` parameter for janitor.conditional_join - Issue #1269 @samukweku
+-   [ENH] `return_ragged_arrays` deprecated; get_join_indices function now returns a dictionary - Issue #520 @samukweku
 -   [ENH] Reverse deprecation of `pivot_wider()` -- Issue #1464
 -   [ENH] Add accessor and method for pandas DataFrameGroupBy objects. - Issue #587 @samukweku
 -   [ENH] Call mutate/summarise directly on groupby objects instead. Also add `ungroup` method to expose underlying dataframe of a grouped object. - Issue #1511 @samukweku
 
@@ -0,0 +1,141 @@
+# helper functions for >/>=
+import numpy as np
+import pandas as pd
+
+from janitor.functions._conditional_join._helpers import (
+    _null_checks_cond_join,
+    _sort_if_not_monotonic,
+)
+
+
+def _ge_gt_indices(
+    left: pd.array,
+    left_index: np.ndarray,
+    right: pd.array,
+    strict: bool,
+) -> tuple | None:
+    """
+    Use binary search to get indices where left
+    is greater than or equal to right.
+
+    If strict is True, then only indices
+    where `left` is greater than
+    (but not equal to) `right` are returned.
+
+    `right` is searched with `searchsorted`, so it has to be sorted
+    in ascending order; `left_index` holds the index label
+    for each entry in `left`.
+    NOTE(review): both arrays are presumably free of nulls
+    (the caller in this module strips them first) - confirm
+    for any other caller.
+
+    Returns None if no entry in `left` has a match in `right`.
+    Otherwise returns a tuple of (left_index, search_indices),
+    restricted to the entries of `left` that have at least one match.
+    Each value in search_indices is an exclusive end position:
+    the matches for that entry are right[:search_index].
+    """
+    search_indices = right.searchsorted(left, side="right")
+    # if any of the positions in `search_indices`
+    # is equal to 0 (less than 1), it implies that
+    # left[position] is not greater than any value
+    # in right
+    booleans = search_indices > 0
+    if not booleans.any():
+        return None
+    if not booleans.all():
+        # keep the three arrays aligned with each other
+        left = left[booleans]
+        left_index = left_index[booleans]
+        search_indices = search_indices[booleans]
+    # the idea here is that if there are any equal values
+    # shift downwards to the immediate next position
+    # that is not equal
+    if strict:
+        # search_indices - 1 is the position of the largest value
+        # in right that is <= left; check if it is an exact tie
+        booleans = left == right[search_indices - 1]
+        # replace positions where rows are equal with
+        # searchsorted('left');
+        # this works fine since we will be using the value
+        # as the right side of a slice, which is not included
+        # in the final computed value
+        if booleans.any():
+            replacements = right.searchsorted(left, side="left")
+            # now we can safely replace values
+            # with strictly greater than positions
+            search_indices = np.where(booleans, replacements, search_indices)
+        # any value less than 1 should be discarded
+        # since the lowest value for binary search
+        # with side='right' should be 1
+        booleans = search_indices > 0
+        if not booleans.any():
+            return None
+        if not booleans.all():
+            # `left` is not needed beyond this point,
+            # so only the returned arrays are filtered
+            left_index = left_index[booleans]
+            search_indices = search_indices[booleans]
+    return left_index, search_indices
+
+
+def _greater_than_indices(
+    left: pd.Series,
+    right: pd.Series,
+    strict: bool,
+    keep: str,
+    return_matching_indices: bool,
+) -> dict | None:
+    """
+    Use binary search to get indices where left
+    is greater than or equal to right.
+
+    If strict is True, then only indices
+    where `left` is greater than
+    (but not equal to) `right` are returned.
+    """
+    # quick break, avoiding the hassle
+    if left.max() < right.min():
+        return {
+            "left_index": np.array([], dtype=np.intp),
+            "right_index": np.array([], dtype=np.intp),
+        }
+    outcome = _null_checks_cond_join(series=left)
+    if outcome is None:
+        return {
+            "left_index": np.array([], dtype=np.intp),
+            "right_index": np.array([], dtype=np.intp),
+        }
+    left, _ = outcome
+    outcome = _null_checks_cond_join(series=right)
+    if outcome is None:
+        return {
+            "left_index": np.array([], dtype=np.intp),
+            "right_index": np.array([], dtype=np.intp),
+        }
+    right, any_nulls = outcome
+    right, right_is_sorted = _sort_if_not_monotonic(series=right)
+    outcome = _ge_gt_indices(
+        left=left.array,
+        right=right.array,
+        left_index=left.index._values,
+        strict=strict,
+    )
+    if outcome is None:
+        return {
+            "left_index": np.array([], dtype=np.intp),
+            "right_index": np.array([], dtype=np.intp),
+        }
+    left_index, search_indices = outcome
+    right_index = right.index._values
+    if right_is_sorted & (keep == "first"):
+        indexer = np.zeros_like(search_indices)
+        return {"left_index": left_index, "right_index": right_index[indexer]}
+    if right_is_sorted & (keep == "last") & any_nulls:
+        return {
+            "left_index": left_index,
+            "right_index": right_index[search_indices - 1],
+        }
+    if right_is_sorted & (keep == "last"):
+        return {"left_index": left_index, "right_index": search_indices - 1}
+    if keep == "first":
+        right = [right_index[:ind] for ind in search_indices]
+        right = [arr.min() for arr in right]
+        return {"left_index": left_index, "right_index": right}
+    if keep == "last":
+        right = [right_index[:ind] for ind in search_indices]
+        right = [arr.max() for arr in right]
+        return {"left_index": left_index, "right_index": right}
+    if return_matching_indices:
+        return dict(
+            left_index=left_index,
+            right_index=right_index,
+            starts=np.repeat(0, search_indices.size),
+            ends=search_indices,
+        )
+    right = [right_index[:ind] for ind in search_indices]
+    right = np.concatenate(right)
+    left = left_index.repeat(search_indices)
+    return {"left_index": left, "right_index": right}
@@ -0,0 +1,82 @@
+# helper functions for conditional_join.py
+
+from enum import Enum
+from typing import Sequence
+
+import numpy as np
+import pandas as pd
+
+
+class _JoinOperator(Enum):
+    """
+    List of operators used in conditional_join.
+
+    The value of each member is the operator's string symbol.
+    """
+
+    GREATER_THAN = ">"
+    LESS_THAN = "<"
+    GREATER_THAN_OR_EQUAL = ">="
+    LESS_THAN_OR_EQUAL = "<="
+    STRICTLY_EQUAL = "=="
+    NOT_EQUAL = "!="
+
+
+# string symbols of the less-than family of operators: "<" and "<="
+less_than_join_types = {
+    _JoinOperator.LESS_THAN.value,
+    _JoinOperator.LESS_THAN_OR_EQUAL.value,
+}
+# string symbols of the greater-than family of operators: ">" and ">="
+greater_than_join_types = {
+    _JoinOperator.GREATER_THAN.value,
+    _JoinOperator.GREATER_THAN_OR_EQUAL.value,
+}
+
+
+def _maybe_remove_nulls_from_dataframe(
+    df: pd.DataFrame, columns: Sequence, return_bools: bool = False
+):
+    """
+    Remove rows that have a null in any of `columns`.
+
+    NOTE(review): the original note here read
+    "Remove nulls if op is not !=" - the operator is not
+    a parameter of this function, so presumably the caller
+    decides when to call it; confirm against the caller.
+
+    Returns None if every row has a null in at least one of `columns`.
+    If `return_bools` is True, returns a boolean Series
+    that is True for the rows without nulls in `columns`.
+    Otherwise returns the dataframe without the null rows
+    (`df` itself if there is nothing to remove).
+    """
+    # True for each row with a null in at least one of `columns`
+    any_nulls = df.loc[:, [*columns]].isna().any(axis=1)
+    if any_nulls.all():
+        return None
+    if return_bools:
+        # flip the mask, so that True marks the rows to keep
+        any_nulls = ~any_nulls
+        return any_nulls
+    if any_nulls.any():
+        df = df.loc[~any_nulls]
+    return df
+
+
+def _null_checks_cond_join(series: pd.Series) -> tuple | None:
+    """
+    Checks for nulls in the pandas series before conducting binary search.
+
+    Returns None if every value in `series` is null.
+    Otherwise returns a tuple of the series with the nulls removed,
+    and a boolean flag indicating if `series` had any nulls.
+    """
+    any_nulls = series.isna()
+    if any_nulls.all():
+        return None
+    if any_nulls.any():
+        series = series[~any_nulls]
+    return series, any_nulls.any()
+
+
+def _sort_if_not_monotonic(series: pd.Series) -> tuple[pd.Series, bool]:
+    """
+    Sort the pandas `series` if it is not monotonic increasing
+
+    Returns a tuple of the (possibly sorted) series, and a boolean
+    indicating if the original `series` was already
+    monotonic increasing (i.e. no sorting was required).
+    """
+
+    is_sorted = series.is_monotonic_increasing
+    if not is_sorted:
+        # a stable sort keeps the original relative order of ties
+        series = series.sort_values(kind="stable")
+    return series, is_sorted
+
+
+def _keep_output(keep: str, left: np.ndarray, right: np.ndarray):
+    """
+    Return indices for left and right index based on the value of `keep`.
+
+    If `keep` is "all", `left` and `right` are returned unchanged.
+    Otherwise `right` is grouped by `left` (groups are not sorted),
+    and one value is kept per distinct entry in `left`:
+    the minimum if `keep` is "first", the maximum for any other value.
+    In that case the returned pair is the group keys
+    and the aggregated values.
+    """
+    if keep == "all":
+        return left, right
+    grouped = pd.Series(right).groupby(left, sort=False)
+    if keep == "first":
+        grouped = grouped.min()
+        return grouped.index, grouped._values
+    # anything that is neither "all" nor "first" is handled as "last"
+    grouped = grouped.max()
+    return grouped.index, grouped._values
@@ -0,0 +1,150 @@
+# helper functions for </<=
+import numpy as np
+import pandas as pd
+
+from janitor.functions._conditional_join._helpers import (
+    _null_checks_cond_join,
+    _sort_if_not_monotonic,
+)
+
+
+def _le_lt_indices(
+    left: pd.array,
+    left_index: np.ndarray,
+    right: pd.array,
+    strict: bool,
+) -> tuple | None:
+    """
+    Use binary search to get indices where left
+    is less than or equal to right.
+
+    If strict is True, then only indices
+    where `left` is less than
+    (but not equal to) `right` are returned.
+
+    `right` is searched with `searchsorted`, so it has to be sorted
+    in ascending order; `left_index` holds the index label
+    for each entry in `left`.
+    NOTE(review): both arrays are presumably free of nulls
+    (the caller in this module strips them first) - confirm
+    for any other caller.
+
+    Returns the left index and the binary search positions for left in right.
+    Only entries of `left` with at least one match are included;
+    each search position is an inclusive start, i.e. the matches
+    for that entry are right[search_index:].
+    Returns None if no entry in `left` has a match in `right`.
+    """
+    search_indices = right.searchsorted(left, side="left")
+    # if any of the positions in `search_indices`
+    # is equal to the length of `right`,
+    # that means every value in `right` is smaller than
+    # the respective value in `left`;
+    # such a row has no match and should therefore be discarded
+    len_right = right.size
+    booleans = search_indices < len_right
+    if not booleans.any():
+        return None
+    if not booleans.all():
+        # keep the three arrays aligned with each other
+        left = left[booleans]
+        left_index = left_index[booleans]
+        search_indices = search_indices[booleans]
+    # the idea here is that if there are any equal values
+    # shift to the right to the immediate next position
+    # that is not equal
+    if strict:
+        booleans = left == right[search_indices]
+        # replace positions where rows are equal
+        # with positions from searchsorted('right')
+        # positions from searchsorted('right') will never
+        # be equal and will be the furthermost in terms of position
+        # example : right -> [2, 2, 2, 3], and we need
+        # positions where values are not equal for 2;
+        # the furthermost will be 3, and searchsorted('right')
+        # will return position 3.
+        if booleans.any():
+            replacements = right.searchsorted(left, side="right")
+            # now we can safely replace values
+            # with strictly less than positions
+            search_indices = np.where(booleans, replacements, search_indices)
+        # check again if any of the values
+        # have become equal to length of right
+        # and get rid of them
+        booleans = search_indices < len_right
+        if not booleans.any():
+            return None
+        if not booleans.all():
+            # `left` is not needed beyond this point,
+            # so only the returned arrays are filtered
+            left_index = left_index[booleans]
+            search_indices = search_indices[booleans]
+    return left_index, search_indices
+
+
+def _less_than_indices(
+    left: pd.Series,
+    right: pd.Series,
+    strict: bool,
+    keep: str,
+    return_matching_indices: bool,
+) -> dict | None:
+    """
+    Use binary search to get indices where left
+    is less than or equal to right.
+
+    If strict is True, then only indices
+    where `left` is less than
+    (but not equal to) `right` are returned.
+    """
+    # no point going through all the hassle
+    if left.min() > right.max():
+        return {
+            "left_index": np.array([], dtype=np.intp),
+            "right_index": np.array([], dtype=np.intp),
+        }
+    outcome = _null_checks_cond_join(series=left)
+    if not outcome:
+        return {
+            "left_index": np.array([], dtype=np.intp),
+            "right_index": np.array([], dtype=np.intp),
+        }
+    left, _ = outcome
+    outcome = _null_checks_cond_join(series=right)
+    if not outcome:
+        return {
+            "left_index": np.array([], dtype=np.intp),
+            "right_index": np.array([], dtype=np.intp),
+        }
+    right, any_nulls = outcome
+    right, right_is_sorted = _sort_if_not_monotonic(series=right)
+    outcome = _le_lt_indices(
+        left=left.array,
+        right=right.array,
+        left_index=left.index._values,
+        strict=strict,
+    )
+    if not outcome:
+        return {
+            "left_index": np.array([], dtype=np.intp),
+            "right_index": np.array([], dtype=np.intp),
+        }
+    left_index, search_indices = outcome
+    len_right = right.size
+    right_index = right.index._values
+    if right_is_sorted & (keep == "last"):
+        indexer = np.empty_like(search_indices)
+        indexer[:] = len_right - 1
+        return {"left_index": left_index, "right_index": right_index[indexer]}
+    if right_is_sorted & (keep == "first") & any_nulls:
+        return {
+            "left_index": left_index,
+            "right_index": right_index[search_indices],
+        }
+    if right_is_sorted & (keep == "first"):
+        return {"left_index": left_index, "right_index": search_indices}
+    if keep == "first":
+        right = [right_index[ind:len_right] for ind in search_indices]
+        right = [arr.min() for arr in right]
+        return {"left_index": left_index, "right_index": right}
+    if keep == "last":
+        right = [right_index[ind:len_right] for ind in search_indices]
+        right = [arr.max() for arr in right]
+        return {"left_index": left_index, "right_index": right}
+    if return_matching_indices:
+        return dict(
+            left_index=left_index,
+            right_index=right_index,
+            starts=search_indices,
+            ends=np.repeat(len_right, search_indices.size),
+        )
+    right = [right_index[ind:len_right] for ind in search_indices]
+    right = np.concatenate(right)
+    left = left_index.repeat(len_right - search_indices)
+    return {"left_index": left, "right_index": right}