From f7bc9961d8688df9ea203fd82249b1535eb50c7f Mon Sep 17 00:00:00 2001 From: jackwadden Date: Fri, 22 Jan 2021 14:44:35 -0500 Subject: [PATCH 1/4] Refactor doubly nested 2D dynamic programming loops to save values. Provides a ~30% speedup in my swalign-dependent code. --- swalign/__init__.py | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/swalign/__init__.py b/swalign/__init__.py index 5f12b73..f1df340 100644 --- a/swalign/__init__.py +++ b/swalign/__init__.py @@ -103,6 +103,7 @@ def __init__(self, scoring_matrix, gap_penalty=-1, gap_extension_penalty=-1, gap self.full_query = full_query def align(self, ref, query, ref_name='', query_name='', rc=False): + orig_ref = ref orig_query = query @@ -122,38 +123,43 @@ def align(self, ref, query, ref_name='', query_name='', rc=False): # calculate matrix for row in range(1, matrix.rows): + + left = matrix.get(row, 0) + diag = matrix.get(row - 1, 0) + up = matrix.get(row - 1, 1) + for col in range(1, matrix.cols): - mm_val = matrix.get(row - 1, col - 1)[0] + self.scoring_matrix.score(query[row - 1], ref[col - 1], self.wildcard) + mm_val = diag[0] + self.scoring_matrix.score(query[row - 1], ref[col - 1], self.wildcard) ins_run = 0 del_run = 0 - if matrix.get(row - 1, col)[1] == 'i': - ins_run = matrix.get(row - 1, col)[2] - if matrix.get(row - 1, col)[0] == 0: + if up[1] == 'i': + ins_run = up[2] + if up[0] == 0: # no penalty to start the alignment ins_val = 0 else: if not self.gap_extension_decay: - ins_val = matrix.get(row - 1, col)[0] + self.gap_extension_penalty + ins_val = up[0] + self.gap_extension_penalty else: - ins_val = matrix.get(row - 1, col)[0] + min(0, self.gap_extension_penalty + ins_run * self.gap_extension_decay) + ins_val = up[0] + min(0, self.gap_extension_penalty + ins_run * self.gap_extension_decay) else: - ins_val = matrix.get(row - 1, col)[0] + self.gap_penalty + ins_val = up[0] + self.gap_penalty - if matrix.get(row, col - 1)[1] == 'd': - del_run = matrix.get(row, col - 1)[2] - if matrix.get(row, col - 1)[0] == 0: + if left[1] == 'd': + del_run = left[2] + if left[0] == 0: # no penalty to start the alignment del_val = 0 else: if not self.gap_extension_decay: - del_val = matrix.get(row, col - 1)[0] + self.gap_extension_penalty + del_val = left[0] + self.gap_extension_penalty else: - del_val = matrix.get(row, col - 1)[0] + min(0, self.gap_extension_penalty + del_run * self.gap_extension_decay) + del_val = left[0] + min(0, self.gap_extension_penalty + del_run * self.gap_extension_decay) else: - del_val = matrix.get(row, col - 1)[0] + self.gap_penalty + del_val = left[0] + self.gap_penalty if self.globalalign or self.full_query: cell_val = max(mm_val, del_val, ins_val) @@ -184,6 +190,12 @@ def align(self, ref, query, ref_name='', query_name='', rc=False): matrix.set(row, col, val) + # adjust all values + diag = up + left = val + if(col + 1 < matrix.cols): + up = matrix.get(row - 1, col + 1) + # backtrack if self.globalalign: # backtrack from last cell From e9ffe4c5d99e8e2c2297753b4d47ad6838a1630a Mon Sep 17 00:00:00 2001 From: jackwadden Date: Fri, 22 Jan 2021 14:48:20 -0500 Subject: [PATCH 2/4] Clarified comments --- swalign/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/swalign/__init__.py b/swalign/__init__.py index f1df340..34421f0 100644 --- a/swalign/__init__.py +++ b/swalign/__init__.py @@ -124,6 +124,7 @@ def align(self, ref, query, ref_name='', query_name='', rc=False): # calculate matrix for row in range(1, matrix.rows): + # saving matrix values and re-using them across columns as locals improves performance left = matrix.get(row, 0) diag = matrix.get(row - 1, 0) up = matrix.get(row - 1, 1) @@ -190,7 +191,7 @@ def align(self, ref, query, ref_name='', query_name='', rc=False): matrix.set(row, col, val) - # adjust all values + # adjust temp values to reduce total matrix accesses diag = up left = val if(col + 1 < matrix.cols): From 80e5b5dbc8840d7313cd1d841ae48fcfa65a2d13 Mon Sep 17 00:00:00 2001 From: jackwadden Date: Wed, 12 May 2021 12:41:49 -0400 Subject: [PATCH 3/4] Updated setup.py to enable cythonization. --- setup.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 968cbb7..6a447ed 100755 --- a/setup.py +++ b/setup.py @@ -1,8 +1,16 @@ -from setuptools import setup +from setuptools import setup, Extension +from Cython.Build import cythonize with open("README.md", "r") as fh: long_description = fh.read() +ext_modules = [ + Extension( + r'swalign', + [r'swalign/__init__.py'] + ), +] + setup(name='swalign', version='0.3.6', description='Smith-Waterman local aligner', @@ -10,9 +18,8 @@ long_description_content_type="text/markdown", author='Marcus Breese', author_email='marcus@breese.com', - url='http://github.com/mbreese/swalign/', packages=['swalign'], scripts=['bin/swalign'], python_requires='>=3.1', - + ext_modules=cythonize(ext_modules), ) From fc8e6afa90808a14d3a0da9a023b08aae00537d2 Mon Sep 17 00:00:00 2001 From: jackwadden Date: Wed, 12 May 2021 12:44:47 -0400 Subject: [PATCH 4/4] Updated README with cython compilation instructions. --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8ab1df5..ed1b293 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ subject to a decay to prioritize long gaps. The input files are FASTA format sequences, or strings of sequences. Here is some skeleton code to get you started: - +``` import swalign # choose your own values here… 2 and -1 are common. match = 2 @@ -19,5 +19,11 @@ Here is some skeleton code to get you started: sw = swalign.LocalAlignment(scoring) # you can also choose gap penalties, etc... alignment = sw.align('ACACACTA','AGCACACA') alignment.dump() +``` + +To cythonize this package for accelerated use, run the following and copy the .so file to the desired python library path. +``` +python setup.py build_ext --inplace +``` For other uses, see the script in bin/swalign or https://compgen.io/projects/swalign