diff --git a/README.md b/README.md index 15c0b5f..2d1d5dc 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ import editdistance.osa # Example usage: str1 = "kitten" str2 = "sitting" -distance = editdistance.osa.calculate_distance(str1, str2) +distance = editdistance.osa.calculate_distance(str1, str2, swap_weight=0.1) print(f"The edit distance between '{{}}' and '{{}}' is {{}}".format(str1, str2, distance)) ``` diff --git a/editdistance/_edit_distance_osa.cpp b/editdistance/_edit_distance_osa.cpp index 952f11e..b24eca1 100644 --- a/editdistance/_edit_distance_osa.cpp +++ b/editdistance/_edit_distance_osa.cpp @@ -5,7 +5,7 @@ std::vector> compute_dp_table( const std::string& a, const std::string& b, - const std::map& cost_map + const std::map& cost_map ) { int len_a = a.length(); int len_b = b.length(); @@ -30,7 +30,7 @@ std::vector> compute_dp_table( if (i > 1 && j > 1 && a[i-1] == b[j-2] && a[i-2] == b[j-1]) { dp[i][j] = std::min(dp[i][j], - dp[i-2][j-2] + cost_map.at(TRANSPOSE)); + dp[i-2][j-2] + cost_map.at(SWAP)); } } } @@ -42,34 +42,34 @@ std::vector> compute_dp_table( double cpp_compute_distance( const std::string& a, const std::string& b, - const std::map& cost_map + const std::map& cost_map ) { auto dp = compute_dp_table(a, b, cost_map); return dp[a.length()][b.length()]; } -std::vector> backtrack_all_paths( +std::vector> backtrack_all_paths( const std::string& a, const std::string& b, - const std::map& cost_map, + const std::map& cost_map, const std::vector>& dp, int i, int j, - std::vector& current_path + std::vector& current_path ) { if (i == 0 && j == 0) { - std::vector reversed_path = current_path; + std::vector reversed_path = current_path; std::reverse(reversed_path.begin(), reversed_path.end()); return {reversed_path}; } - std::vector> all_paths; + std::vector> all_paths; double current_cost = dp[i][j]; const double tol = 1e-6; if (i > 0 && std::abs((dp[i-1][j] + cost_map.at(DELETE)) - current_cost) < tol) { - Editop op(DELETE, i-1, i-1, cost_map.at(DELETE), std::string(1, a[i-1])); + CppEditop op(DELETE, i-1, i-1, cost_map.at(DELETE), std::string(1, a[i-1])); current_path.push_back(op); auto paths = backtrack_all_paths(a, b, cost_map, dp, i-1, j, current_path); all_paths.insert(all_paths.end(), paths.begin(), paths.end()); @@ -77,7 +77,7 @@ std::vector> backtrack_all_paths( } if (j > 0 && std::abs((dp[i][j-1] + cost_map.at(INSERT)) - current_cost) < tol) { - Editop op(INSERT, i, i, cost_map.at(INSERT), std::string(1, b[j-1])); + CppEditop op(INSERT, i, i, cost_map.at(INSERT), std::string(1, b[j-1])); current_path.push_back(op); auto paths = backtrack_all_paths(a, b, cost_map, dp, i, j-1, current_path); all_paths.insert(all_paths.end(), paths.begin(), paths.end()); @@ -89,7 +89,7 @@ std::vector> backtrack_all_paths( double sub_cost = (a[i-1] == b[j-1]) ? 0.0 : cost_map.at(REPLACE); if (std::abs((dp[i-1][j-1] + sub_cost) - current_cost) < tol) { std::string out_char = (sub_cost == 0.0) ? std::string(1, a[i-1]) : std::string(1, b[j-1]); - Editop op(REPLACE, i-1, j-1, sub_cost, out_char); + CppEditop op(REPLACE, i-1, j-1, sub_cost, out_char); current_path.push_back(op); auto paths = backtrack_all_paths(a, b, cost_map, dp, i-1, j-1, current_path); all_paths.insert(all_paths.end(), paths.begin(), paths.end()); @@ -100,9 +100,9 @@ std::vector> backtrack_all_paths( if (i > 1 && j > 1 && a[i-1] == b[j-2] && a[i-2] == b[j-1] && - std::abs((dp[i-2][j-2] + cost_map.at(TRANSPOSE)) - current_cost) < tol) { - std::string transpose_str = std::string(1, b[j-2]) + std::string(1, b[j-1]); - Editop op(TRANSPOSE, i-2, j-2, cost_map.at(TRANSPOSE), transpose_str); + std::abs((dp[i-2][j-2] + cost_map.at(SWAP)) - current_cost) < tol) { + std::string swap_str = std::string(1, b[j-2]) + std::string(1, b[j-1]); + CppEditop op(SWAP, i-2, j-2, cost_map.at(SWAP), swap_str); current_path.push_back(op); auto paths = backtrack_all_paths(a, b, cost_map, dp, i-2, j-2, current_path); all_paths.insert(all_paths.end(), paths.begin(), paths.end()); @@ -113,13 +113,13 @@ std::vector> backtrack_all_paths( } -std::vector> cpp_compute_all_paths( +std::vector> cpp_compute_all_paths( const std::string& a, const std::string& b, - const std::map& cost_map + const std::map& cost_map ) { auto dp = compute_dp_table(a, b, cost_map); - std::vector current_path; + std::vector current_path; return backtrack_all_paths(a, b, cost_map, dp, a.length(), b.length(), current_path); } @@ -127,7 +127,7 @@ std::vector> cpp_compute_all_paths( void cpp_print_all_paths( const std::string& a, const std::string& b, - const std::map& cost_map + const std::map& cost_map ) { auto paths = cpp_compute_all_paths(a, b, cost_map); double distance = cpp_compute_distance(a, b, cost_map); @@ -145,18 +145,18 @@ void cpp_print_all_paths( } } -std::string editop_name_to_string(EditopName name) { +std::string editop_name_to_string(CppEditopName name) { switch (name) { case INSERT: return "INSERT"; case DELETE: return "DELETE"; case REPLACE: return "REPLACE"; - case TRANSPOSE: return "TRANSPOSE"; + case SWAP: return "SWAP"; default: return "UNKNOWN"; } } -std::ostream& operator<<(std::ostream& os, const Editop& op) { - os << "Editop(name=" << editop_name_to_string(op.name) +std::ostream& operator<<(std::ostream& os, const CppEditop& op) { + os << "CppEditop(name=" << editop_name_to_string(op.name) << ", src_idx=" << op.src_idx << ", dst_idx=" << op.dst_idx << ", cost=" << op.cost diff --git a/editdistance/_edit_distance_osa.hpp b/editdistance/_edit_distance_osa.hpp index b7f86a9..2c82ff2 100644 --- a/editdistance/_edit_distance_osa.hpp +++ b/editdistance/_edit_distance_osa.hpp @@ -7,22 +7,22 @@ #include -enum EditopName { +enum CppEditopName { INSERT, DELETE, REPLACE, - TRANSPOSE + SWAP }; -struct Editop { - EditopName name; +struct CppEditop { + CppEditopName name; int src_idx; int dst_idx; double cost; std::string output_string; - Editop() : name(INSERT), src_idx(0), dst_idx(0), cost(0.0), output_string("") {} - Editop(EditopName n, int si, int di, double c, const std::string& os) + CppEditop() : name(INSERT), src_idx(0), dst_idx(0), cost(0.0), output_string("") {} + CppEditop(CppEditopName n, int si, int di, double c, const std::string& os) : name(n), src_idx(si), dst_idx(di), cost(c), output_string(os) {} }; @@ -30,43 +30,43 @@ struct Editop { std::vector> compute_dp_table( const std::string& a, const std::string& b, - const std::map& cost_map + const std::map& cost_map ); double cpp_compute_distance( const std::string& a, const std::string& b, - const std::map& cost_map + const std::map& cost_map ); -std::vector> backtrack_all_paths( +std::vector> backtrack_all_paths( const std::string& a, const std::string& b, - const std::map& cost_map, + const std::map& cost_map, const std::vector>& dp, int i, int j, - std::vector& current_path + std::vector& current_path ); -std::vector> cpp_compute_all_paths( +std::vector> cpp_compute_all_paths( const std::string& a, const std::string& b, - const std::map& cost_map + const std::map& cost_map ); void cpp_print_all_paths( const std::string& a, const std::string& b, - const std::map& cost_map + const std::map& cost_map ); -std::string editop_name_to_string(EditopName name); -std::ostream& operator<<(std::ostream& os, const Editop& op); +std::string editop_name_to_string(CppEditopName name); +std::ostream& operator<<(std::ostream& os, const CppEditop& op); #endif // EDIT_DISTANCE_OSA_HPP \ No newline at end of file diff --git a/editdistance/edit_distance_osa.pyx b/editdistance/edit_distance_osa.pyx index 553c02d..76c81b9 100644 --- a/editdistance/edit_distance_osa.pyx +++ b/editdistance/edit_distance_osa.pyx @@ -9,32 +9,32 @@ from enum import Enum cdef extern from "_edit_distance_osa.hpp": - cdef enum EditopName: + cdef enum CppEditopName: INSERT DELETE REPLACE - TRANSPOSE + SWAP - cdef struct Editop: - EditopName name + cdef struct CppEditop: + CppEditopName name int src_idx int dst_idx double cost string output_string - vector[vector[Editop]] cpp_compute_all_paths(const string& a, const string& b, const map[EditopName, double]& cost_map) - void cpp_print_all_paths(const string& a, const string& b, const map[EditopName, double]& cost_map) - double cpp_compute_distance(const string& a, const string& b, const map[EditopName, double]& cost_map) + vector[vector[CppEditop]] cpp_compute_all_paths(const string& a, const string& b, const map[CppEditopName, double]& cost_map) + void cpp_print_all_paths(const string& a, const string& b, const map[CppEditopName, double]& cost_map) + double cpp_compute_distance(const string& a, const string& b, const map[CppEditopName, double]& cost_map) -class PyEditopName(Enum): +class EditopName(Enum): INSERT = 0 DELETE = 1 REPLACE = 2 - TRANSPOSE = 3 + SWAP = 3 -cdef class PyEditop: +cdef class Editop: cdef readonly object name cdef readonly int src_idx cdef readonly int dst_idx @@ -52,41 +52,56 @@ cdef class PyEditop: return f"Editop(name={self.name}, src_idx={self.src_idx}, dst_idx={self.dst_idx}, cost={self.cost}, output_string='{self.output_string}')" -cdef map[EditopName, double] _convert_cost_map(dict cost_map): - cdef map[EditopName, double] cpp_cost_map - if PyEditopName.INSERT in cost_map: - cpp_cost_map[INSERT] = cost_map[PyEditopName.INSERT] - if PyEditopName.DELETE in cost_map: - cpp_cost_map[DELETE] = cost_map[PyEditopName.DELETE] - if PyEditopName.REPLACE in cost_map: - cpp_cost_map[REPLACE] = cost_map[PyEditopName.REPLACE] - if PyEditopName.TRANSPOSE in cost_map: - cpp_cost_map[TRANSPOSE] = cost_map[PyEditopName.TRANSPOSE] +cdef map[CppEditopName, double] _convert_cost_map(dict cost_map): + cdef map[CppEditopName, double] cpp_cost_map + if EditopName.INSERT in cost_map: + cpp_cost_map[INSERT] = cost_map[EditopName.INSERT] + if EditopName.DELETE in cost_map: + cpp_cost_map[DELETE] = cost_map[EditopName.DELETE] + if EditopName.REPLACE in cost_map: + cpp_cost_map[REPLACE] = cost_map[EditopName.REPLACE] + if EditopName.SWAP in cost_map: + cpp_cost_map[SWAP] = cost_map[EditopName.SWAP] return cpp_cost_map -def compute_with_all_paths(str a, str b, dict cost_map): +def get_all_paths( + str a, + str b, + double replace_weight=1.0, + double insert_weight=1.0, + double delete_weight=1.0, + double swap_weight=1.0 +): + cdef dict cost_map = { + EditopName.REPLACE: replace_weight, + EditopName.INSERT: insert_weight, + EditopName.DELETE: delete_weight, + EditopName.SWAP: swap_weight + } cdef string cpp_a = a.encode("utf-8") cdef string cpp_b = b.encode("utf-8") - cdef map[EditopName, double] cpp_cost_map = _convert_cost_map(cost_map) - cdef vector[vector[Editop]] cpp_paths = cpp_compute_all_paths(cpp_a, cpp_b, cpp_cost_map) + cdef map[CppEditopName, double] cpp_cost_map = _convert_cost_map(cost_map) + cdef vector[vector[CppEditop]] cpp_paths = cpp_compute_all_paths(cpp_a, cpp_b, cpp_cost_map) python_paths = [] - cdef vector[Editop] cpp_path - cdef Editop cpp_op + cdef vector[CppEditop] cpp_path + cdef CppEditop cpp_op for cpp_path in cpp_paths: python_path = [] for cpp_op in cpp_path: + if cpp_op.cost == 0: + continue if cpp_op.name == INSERT: - py_name = PyEditopName.INSERT + py_name = EditopName.INSERT elif cpp_op.name == DELETE: - py_name = PyEditopName.DELETE + py_name = EditopName.DELETE elif cpp_op.name == REPLACE: - py_name = PyEditopName.REPLACE - elif cpp_op.name == TRANSPOSE: - py_name = PyEditopName.TRANSPOSE + py_name = EditopName.REPLACE + elif cpp_op.name == SWAP: + py_name = EditopName.SWAP else: py_name = None - python_path.append(PyEditop( + python_path.append(Editop( py_name, cpp_op.src_idx, cpp_op.dst_idx, @@ -97,15 +112,41 @@ def compute_with_all_paths(str a, str b, dict cost_map): return python_paths -def print_all_paths(str a, str b, dict cost_map): +def print_all_paths( + str a, + str b, + double replace_weight=1.0, + double insert_weight=1.0, + double delete_weight=1.0, + double swap_weight=1.0 +): + cdef dict cost_map = { + EditopName.REPLACE: replace_weight, + EditopName.INSERT: insert_weight, + EditopName.DELETE: delete_weight, + EditopName.SWAP: swap_weight + } cdef string cpp_a = a.encode("utf-8") cdef string cpp_b = b.encode("utf-8") - cdef map[EditopName, double] cpp_cost_map = _convert_cost_map(cost_map) + cdef map[CppEditopName, double] cpp_cost_map = _convert_cost_map(cost_map) cpp_print_all_paths(cpp_a, cpp_b, cpp_cost_map) -def compute_distance(str a, str b, dict cost_map): +def compute_distance( + str a, + str b, + double replace_weight=1.0, + double insert_weight=1.0, + double delete_weight=1.0, + double swap_weight=1.0 +): + cdef dict cost_map = { + EditopName.REPLACE: replace_weight, + EditopName.INSERT: insert_weight, + EditopName.DELETE: delete_weight, + EditopName.SWAP: swap_weight + } cdef string cpp_a = a.encode("utf-8") cdef string cpp_b = b.encode("utf-8") - cdef map[EditopName, double] cpp_cost_map = _convert_cost_map(cost_map) + cdef map[CppEditopName, double] cpp_cost_map = _convert_cost_map(cost_map) return cpp_compute_distance(cpp_a, cpp_b, cpp_cost_map) diff --git a/examples/osa_example.py b/examples/osa_example.py index 5b701aa..c9451d7 100644 --- a/examples/osa_example.py +++ b/examples/osa_example.py @@ -6,38 +6,31 @@ try: from editdistance.osa import ( - PyEditopName, compute_distance, - compute_with_all_paths, - print_all_paths, + get_all_paths, ) def main(): - # Define cost map - cost_map = { - PyEditopName.DELETE: 1.0, - PyEditopName.INSERT: 1.0, - PyEditopName.REPLACE: 1.0, - PyEditopName.TRANSPOSE: 1.0, - } - # Test case from original Python code print("Testing OSA distance with all paths:") - print_all_paths("aaaaaaaaaa", "abaabababa", cost_map) - # Additional test case - print("\nAdditional test case:") - paths = compute_with_all_paths("CA", "AX", cost_map) - distance = compute_distance("CA", "AX", cost_map) + test_cases = ( + ("aaaaaaaaaa", "abaabababa"), + ("CA", "AX"), + ) + + for source, target in test_cases: + print(f"\nComputing OSA distance from '{source}' to '{target}':") + distance = compute_distance(source, target) + print(f"Distance: {distance}") - print(f"OSA Distance from 'CA' to 'AX': {distance}") - print(f"Number of optimal edit sequences: {len(paths)}") - print() + paths = get_all_paths(source, target) + print(f"Number of optimal edit sequences: {len(paths)}") - for i, path in enumerate(paths, 1): - print(f"Path {i}:") - for op in path: - print(f" {op}") + for i, path in enumerate(paths, 1): + print(f"Path {i}:") + for op in path: + print(f" {op}") print() if __name__ == "__main__": diff --git a/tests/tests_osa.py b/tests/tests_osa.py index f6fa7d0..6035ab2 100644 --- a/tests/tests_osa.py +++ b/tests/tests_osa.py @@ -1,9 +1,8 @@ import unittest from editdistance.osa import ( - PyEditopName, compute_distance, - compute_with_all_paths, + get_all_paths, ) COMPUTE_DISTANCE_TEST_CASES = [ @@ -25,14 +24,6 @@ class TestOsaDistance(unittest.TestCase): - def setUp(self): - self.cost_map = { - PyEditopName.DELETE: 1.0, - PyEditopName.INSERT: 1.0, - PyEditopName.REPLACE: 1.0, - PyEditopName.TRANSPOSE: 1.0, - } - def test_compute_distance(self): for ( description, @@ -43,10 +34,10 @@ def test_compute_distance(self): with self.subTest( description, ): - distance = compute_distance(source, target, self.cost_map) + distance = compute_distance(source, target) self.assertEqual(distance, expected_distance) - def test_compute_with_all_paths(self): + def test_get_all_paths(self): for ( description, source, @@ -56,5 +47,5 @@ def test_compute_with_all_paths(self): with self.subTest( description, ): - paths = compute_with_all_paths(source, target, self.cost_map) + paths = get_all_paths(source, target) self.assertEqual(len(paths), expected_num_paths)