Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ import editdistance.osa
# Example usage:
str1 = "kitten"
str2 = "sitting"
distance = editdistance.osa.calculate_distance(str1, str2)
distance = editdistance.osa.calculate_distance(str1, str2, swap_weight=0.1)
print(f"The edit distance between '{{}}' and '{{}}' is {{}}".format(str1, str2, distance))
```

Expand Down
44 changes: 22 additions & 22 deletions editdistance/_edit_distance_osa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
std::vector<std::vector<double>> compute_dp_table(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map
const std::map<CppEditopName, double>& cost_map
) {
int len_a = a.length();
int len_b = b.length();
Expand All @@ -30,7 +30,7 @@ std::vector<std::vector<double>> compute_dp_table(
if (i > 1 && j > 1 &&
a[i-1] == b[j-2] && a[i-2] == b[j-1]) {
dp[i][j] = std::min(dp[i][j],
dp[i-2][j-2] + cost_map.at(TRANSPOSE));
dp[i-2][j-2] + cost_map.at(SWAP));
}
}
}
Expand All @@ -42,42 +42,42 @@ std::vector<std::vector<double>> compute_dp_table(
double cpp_compute_distance(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map
const std::map<CppEditopName, double>& cost_map
) {
auto dp = compute_dp_table(a, b, cost_map);
return dp[a.length()][b.length()];
}

std::vector<std::vector<Editop>> backtrack_all_paths(
std::vector<std::vector<CppEditop>> backtrack_all_paths(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map,
const std::map<CppEditopName, double>& cost_map,
const std::vector<std::vector<double>>& dp,
int i,
int j,
std::vector<Editop>& current_path
std::vector<CppEditop>& current_path
) {
if (i == 0 && j == 0) {
std::vector<Editop> reversed_path = current_path;
std::vector<CppEditop> reversed_path = current_path;
std::reverse(reversed_path.begin(), reversed_path.end());
return {reversed_path};
}

std::vector<std::vector<Editop>> all_paths;
std::vector<std::vector<CppEditop>> all_paths;
double current_cost = dp[i][j];
const double tol = 1e-6;


if (i > 0 && std::abs((dp[i-1][j] + cost_map.at(DELETE)) - current_cost) < tol) {
Editop op(DELETE, i-1, i-1, cost_map.at(DELETE), std::string(1, a[i-1]));
CppEditop op(DELETE, i-1, i-1, cost_map.at(DELETE), std::string(1, a[i-1]));
current_path.push_back(op);
auto paths = backtrack_all_paths(a, b, cost_map, dp, i-1, j, current_path);
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
current_path.pop_back();
}

if (j > 0 && std::abs((dp[i][j-1] + cost_map.at(INSERT)) - current_cost) < tol) {
Editop op(INSERT, i, i, cost_map.at(INSERT), std::string(1, b[j-1]));
CppEditop op(INSERT, i, i, cost_map.at(INSERT), std::string(1, b[j-1]));
current_path.push_back(op);
auto paths = backtrack_all_paths(a, b, cost_map, dp, i, j-1, current_path);
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
Expand All @@ -89,7 +89,7 @@ std::vector<std::vector<Editop>> backtrack_all_paths(
double sub_cost = (a[i-1] == b[j-1]) ? 0.0 : cost_map.at(REPLACE);
if (std::abs((dp[i-1][j-1] + sub_cost) - current_cost) < tol) {
std::string out_char = (sub_cost == 0.0) ? std::string(1, a[i-1]) : std::string(1, b[j-1]);
Editop op(REPLACE, i-1, j-1, sub_cost, out_char);
CppEditop op(REPLACE, i-1, j-1, sub_cost, out_char);
current_path.push_back(op);
auto paths = backtrack_all_paths(a, b, cost_map, dp, i-1, j-1, current_path);
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
Expand All @@ -100,9 +100,9 @@ std::vector<std::vector<Editop>> backtrack_all_paths(

if (i > 1 && j > 1 &&
a[i-1] == b[j-2] && a[i-2] == b[j-1] &&
std::abs((dp[i-2][j-2] + cost_map.at(TRANSPOSE)) - current_cost) < tol) {
std::string transpose_str = std::string(1, b[j-2]) + std::string(1, b[j-1]);
Editop op(TRANSPOSE, i-2, j-2, cost_map.at(TRANSPOSE), transpose_str);
std::abs((dp[i-2][j-2] + cost_map.at(SWAP)) - current_cost) < tol) {
std::string swap_str = std::string(1, b[j-2]) + std::string(1, b[j-1]);
CppEditop op(SWAP, i-2, j-2, cost_map.at(SWAP), swap_str);
current_path.push_back(op);
auto paths = backtrack_all_paths(a, b, cost_map, dp, i-2, j-2, current_path);
all_paths.insert(all_paths.end(), paths.begin(), paths.end());
Expand All @@ -113,21 +113,21 @@ std::vector<std::vector<Editop>> backtrack_all_paths(
}


std::vector<std::vector<Editop>> cpp_compute_all_paths(
std::vector<std::vector<CppEditop>> cpp_compute_all_paths(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map
const std::map<CppEditopName, double>& cost_map
) {
auto dp = compute_dp_table(a, b, cost_map);
std::vector<Editop> current_path;
std::vector<CppEditop> current_path;
return backtrack_all_paths(a, b, cost_map, dp, a.length(), b.length(), current_path);
}


void cpp_print_all_paths(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map
const std::map<CppEditopName, double>& cost_map
) {
auto paths = cpp_compute_all_paths(a, b, cost_map);
double distance = cpp_compute_distance(a, b, cost_map);
Expand All @@ -145,18 +145,18 @@ void cpp_print_all_paths(
}
}

std::string editop_name_to_string(EditopName name) {
std::string editop_name_to_string(CppEditopName name) {
switch (name) {
case INSERT: return "INSERT";
case DELETE: return "DELETE";
case REPLACE: return "REPLACE";
case TRANSPOSE: return "TRANSPOSE";
case SWAP: return "SWAP";
default: return "UNKNOWN";
}
}

std::ostream& operator<<(std::ostream& os, const Editop& op) {
os << "Editop(name=" << editop_name_to_string(op.name)
std::ostream& operator<<(std::ostream& os, const CppEditop& op) {
os << "CppEditop(name=" << editop_name_to_string(op.name)
<< ", src_idx=" << op.src_idx
<< ", dst_idx=" << op.dst_idx
<< ", cost=" << op.cost
Expand Down
32 changes: 16 additions & 16 deletions editdistance/_edit_distance_osa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,66 +7,66 @@
#include <iostream>


enum EditopName {
enum CppEditopName {
INSERT,
DELETE,
REPLACE,
TRANSPOSE
SWAP
};

struct Editop {
EditopName name;
struct CppEditop {
CppEditopName name;
int src_idx;
int dst_idx;
double cost;
std::string output_string;

Editop() : name(INSERT), src_idx(0), dst_idx(0), cost(0.0), output_string("") {}
Editop(EditopName n, int si, int di, double c, const std::string& os)
CppEditop() : name(INSERT), src_idx(0), dst_idx(0), cost(0.0), output_string("") {}
CppEditop(CppEditopName n, int si, int di, double c, const std::string& os)
: name(n), src_idx(si), dst_idx(di), cost(c), output_string(os) {}
};


std::vector<std::vector<double>> compute_dp_table(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map
const std::map<CppEditopName, double>& cost_map
);


double cpp_compute_distance(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map
const std::map<CppEditopName, double>& cost_map
);


std::vector<std::vector<Editop>> backtrack_all_paths(
std::vector<std::vector<CppEditop>> backtrack_all_paths(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map,
const std::map<CppEditopName, double>& cost_map,
const std::vector<std::vector<double>>& dp,
int i,
int j,
std::vector<Editop>& current_path
std::vector<CppEditop>& current_path
);


std::vector<std::vector<Editop>> cpp_compute_all_paths(
std::vector<std::vector<CppEditop>> cpp_compute_all_paths(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map
const std::map<CppEditopName, double>& cost_map
);


void cpp_print_all_paths(
const std::string& a,
const std::string& b,
const std::map<EditopName, double>& cost_map
const std::map<CppEditopName, double>& cost_map
);


std::string editop_name_to_string(EditopName name);
std::ostream& operator<<(std::ostream& os, const Editop& op);
std::string editop_name_to_string(CppEditopName name);
std::ostream& operator<<(std::ostream& os, const CppEditop& op);

#endif // EDIT_DISTANCE_OSA_HPP
111 changes: 76 additions & 35 deletions editdistance/edit_distance_osa.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,32 +9,32 @@ from enum import Enum


cdef extern from "_edit_distance_osa.hpp":
cdef enum EditopName:
cdef enum CppEditopName:
INSERT
DELETE
REPLACE
TRANSPOSE
SWAP

cdef struct Editop:
EditopName name
cdef struct CppEditop:
CppEditopName name
int src_idx
int dst_idx
double cost
string output_string

vector[vector[Editop]] cpp_compute_all_paths(const string& a, const string& b, const map[EditopName, double]& cost_map)
void cpp_print_all_paths(const string& a, const string& b, const map[EditopName, double]& cost_map)
double cpp_compute_distance(const string& a, const string& b, const map[EditopName, double]& cost_map)
vector[vector[CppEditop]] cpp_compute_all_paths(const string& a, const string& b, const map[CppEditopName, double]& cost_map)
void cpp_print_all_paths(const string& a, const string& b, const map[CppEditopName, double]& cost_map)
double cpp_compute_distance(const string& a, const string& b, const map[CppEditopName, double]& cost_map)


class PyEditopName(Enum):
class EditopName(Enum):
INSERT = 0
DELETE = 1
REPLACE = 2
TRANSPOSE = 3
SWAP = 3


cdef class PyEditop:
cdef class Editop:
cdef readonly object name
cdef readonly int src_idx
cdef readonly int dst_idx
Expand All @@ -52,41 +52,56 @@ cdef class PyEditop:
return f"Editop(name={self.name}, src_idx={self.src_idx}, dst_idx={self.dst_idx}, cost={self.cost}, output_string='{self.output_string}')"


cdef map[EditopName, double] _convert_cost_map(dict cost_map):
cdef map[EditopName, double] cpp_cost_map
if PyEditopName.INSERT in cost_map:
cpp_cost_map[INSERT] = cost_map[PyEditopName.INSERT]
if PyEditopName.DELETE in cost_map:
cpp_cost_map[DELETE] = cost_map[PyEditopName.DELETE]
if PyEditopName.REPLACE in cost_map:
cpp_cost_map[REPLACE] = cost_map[PyEditopName.REPLACE]
if PyEditopName.TRANSPOSE in cost_map:
cpp_cost_map[TRANSPOSE] = cost_map[PyEditopName.TRANSPOSE]
cdef map[CppEditopName, double] _convert_cost_map(dict cost_map):
cdef map[CppEditopName, double] cpp_cost_map
if EditopName.INSERT in cost_map:
cpp_cost_map[INSERT] = cost_map[EditopName.INSERT]
if EditopName.DELETE in cost_map:
cpp_cost_map[DELETE] = cost_map[EditopName.DELETE]
if EditopName.REPLACE in cost_map:
cpp_cost_map[REPLACE] = cost_map[EditopName.REPLACE]
if EditopName.SWAP in cost_map:
cpp_cost_map[SWAP] = cost_map[EditopName.SWAP]
return cpp_cost_map


def compute_with_all_paths(str a, str b, dict cost_map):
def get_all_paths(
str a,
str b,
double replace_weight=1.0,
double insert_weight=1.0,
double delete_weight=1.0,
double swap_weight=1.0
):
cdef dict cost_map = {
EditopName.REPLACE: replace_weight,
EditopName.INSERT: insert_weight,
EditopName.DELETE: delete_weight,
EditopName.SWAP: swap_weight
}
cdef string cpp_a = a.encode("utf-8")
cdef string cpp_b = b.encode("utf-8")
cdef map[EditopName, double] cpp_cost_map = _convert_cost_map(cost_map)
cdef vector[vector[Editop]] cpp_paths = cpp_compute_all_paths(cpp_a, cpp_b, cpp_cost_map)
cdef map[CppEditopName, double] cpp_cost_map = _convert_cost_map(cost_map)
cdef vector[vector[CppEditop]] cpp_paths = cpp_compute_all_paths(cpp_a, cpp_b, cpp_cost_map)
python_paths = []
cdef vector[Editop] cpp_path
cdef Editop cpp_op
cdef vector[CppEditop] cpp_path
cdef CppEditop cpp_op
for cpp_path in cpp_paths:
python_path = []
for cpp_op in cpp_path:
if cpp_op.cost == 0:
continue
if cpp_op.name == INSERT:
py_name = PyEditopName.INSERT
py_name = EditopName.INSERT
elif cpp_op.name == DELETE:
py_name = PyEditopName.DELETE
py_name = EditopName.DELETE
elif cpp_op.name == REPLACE:
py_name = PyEditopName.REPLACE
elif cpp_op.name == TRANSPOSE:
py_name = PyEditopName.TRANSPOSE
py_name = EditopName.REPLACE
elif cpp_op.name == SWAP:
py_name = EditopName.SWAP
else:
py_name = None
python_path.append(PyEditop(
python_path.append(Editop(
py_name,
cpp_op.src_idx,
cpp_op.dst_idx,
Expand All @@ -97,15 +112,41 @@ def compute_with_all_paths(str a, str b, dict cost_map):
return python_paths


def print_all_paths(str a, str b, dict cost_map):
def print_all_paths(
str a,
str b,
double replace_weight=1.0,
double insert_weight=1.0,
double delete_weight=1.0,
double swap_weight=1.0
):
cdef dict cost_map = {
EditopName.REPLACE: replace_weight,
EditopName.INSERT: insert_weight,
EditopName.DELETE: delete_weight,
EditopName.SWAP: swap_weight
}
cdef string cpp_a = a.encode("utf-8")
cdef string cpp_b = b.encode("utf-8")
cdef map[EditopName, double] cpp_cost_map = _convert_cost_map(cost_map)
cdef map[CppEditopName, double] cpp_cost_map = _convert_cost_map(cost_map)
cpp_print_all_paths(cpp_a, cpp_b, cpp_cost_map)


def compute_distance(str a, str b, dict cost_map):
def compute_distance(
str a,
str b,
double replace_weight=1.0,
double insert_weight=1.0,
double delete_weight=1.0,
double swap_weight=1.0
):
cdef dict cost_map = {
EditopName.REPLACE: replace_weight,
EditopName.INSERT: insert_weight,
EditopName.DELETE: delete_weight,
EditopName.SWAP: swap_weight
}
cdef string cpp_a = a.encode("utf-8")
cdef string cpp_b = b.encode("utf-8")
cdef map[EditopName, double] cpp_cost_map = _convert_cost_map(cost_map)
cdef map[CppEditopName, double] cpp_cost_map = _convert_cost_map(cost_map)
return cpp_compute_distance(cpp_a, cpp_b, cpp_cost_map)
Loading