Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions extensions/multiplacement/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,14 @@

setup(
name='multiplacement',
version='1.0.0',
version='1.0.1',
author='Quinn Mood',
author_email='qmood1@umbc.edu',
description='C extension to expedite calculations for complex transcription factor binding',
ext_modules=[Extension("_multiplacement",['src/_multiplacement.c'])],
ext_modules=[
Extension("_multiplacement",
['src/_interface.c', 'src/_multiplacement.c', 'src/_aux.c'],
include_dirs = ['src'],
)
],
)
116 changes: 116 additions & 0 deletions extensions/multiplacement/src/_aux.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#include "_aux.h"
int min(int n, int k){
if (n > k)
return k;
else
return n;
}

unsigned long long bin(unsigned long long n, unsigned long long k){
if (n == k)
return 1;
unsigned long long c = 1;
for (unsigned long long i = 1; i <= k; i++, n--) {

if (c/i > ULONG_MAX/n) // return 0 on potential overflow
return 0;

c = c / i * n + c % i * n / i; // split c * n / i into (c / i * i + c % i) * n / i
}

return c;
}

float norm_cdf(float x, float mu, float sigma){
float z = (x - mu) / fabs(sigma);
return (1 + erff(z / sqrtf(2.0))) / 2.00;
}

float norm_pf(float x, float mu, float sigma){
if (sigma != 0)
return norm_cdf(x + 0.5, mu, sigma) - norm_cdf(x - 0.5, mu, sigma);
if (x == mu)
return 1.00;
return 0.00;
}

float get_numerator(int dna_length, int distance, float mu, float sigma){
float numerator = norm_pf(distance, mu, sigma);
if (sigma == 0.00)
return numerator;

float auc = norm_cdf(dna_length - 1, mu, sigma) - norm_cdf(0, mu, sigma);
if (auc < 0.000001)
auc = 0.000001;

if (numerator < 0.00001)
numerator = 0.00001;

return numerator /= auc;

}

float get_denominator(int d, int N, int L){
if (1 <= d && d <= L - N + 1)
return (float) bin(L - d, N - 1) / (float) bin(L, N);
return 0.0001;
}

float get_score(float *arr, int dna_length, int effective_length, int num_rec,
int gap_size, int max_length, int curr_conn, bool is_precomputed){
if (is_precomputed == false){
return log2f(get_numerator(dna_length, gap_size, arr[curr_conn * 2], arr[curr_conn * 2 + 1]) /
get_denominator(gap_size + 1, num_rec, effective_length));
}

return log2(arr[curr_conn * max_length + gap_size]) -
(
(
NUMERATORS[(effective_length - (gap_size + 1)) - 1] -
NUMERATORS[(num_rec - 1) - 1] -
NUMERATORS[effective_length - (gap_size + 1) - (num_rec - 1) - 1]
) -

(
NUMERATORS[effective_length - 1] -
NUMERATORS[num_rec - 1] -
NUMERATORS[effective_length - num_rec - 1]
)
);
}

int get_forward_offset(int index, int cols[], int num_rec) {
// finds the first possible possition for a pssm
// based on number of columns of preceding pssms

int offset = 0;
for (int i = 0; i < index; i++) {
offset += cols[i];
}
return offset;
}

int get_reverse_offset(int index, int cols[], int num_rec) {
// finds the last possible possition for a pssm
// based on number of columns of subsequenct pssms

int offset = 0;
for (int i = num_rec - 1; i >= index; i--) {
offset += cols[i];
}

// this is important to get the right number of possible alignments
return offset - 1;
}

int max_index(float *arr, int size) {
int max_index = 0;
for (int i = 0; i < size; i++) {

if (arr[i] > arr[max_index]) {
max_index = i;
}
}
return max_index;
}

30 changes: 30 additions & 0 deletions extensions/multiplacement/src/_aux.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#ifndef _AUX_H
#define _AUX_H
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <stdbool.h>
#include <limits.h>
#include "_constants.h"

int min(int n, int k);

unsigned long long bin(unsigned long long n, unsigned long long k);

float norm_cdf(float x, float mu, float sigma);

float norm_pf(float x, float mu, float sigma);

float get_numerator(int dna_length, int distance, float mu, float sigma);

float get_denominator(int d, int N, int L);

float get_score(float *arr, int dna_length, int effective_length, int num_rec, int gap_size, int max_length, int curr_conn, bool is_precomputed);

int get_forward_offset(int index, int cols[], int num_rec);

int get_reverse_offset(int index, int cols[], int num_rec);

int max_index(float *arr, int size);

#endif
5 changes: 4 additions & 1 deletion extensions/multiplacement/src/_constants.h

Large diffs are not rendered by default.

147 changes: 147 additions & 0 deletions extensions/multiplacement/src/_interface.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#include "_interface.h"

static int matrix_converter(PyObject *object, void *address) {
const int flags = PyBUF_C_CONTIGUOUS | PyBUF_FORMAT;
char datatype;
Py_buffer *view = address;

if (object == NULL)
goto exit;
if (PyObject_GetBuffer(object, view, flags) == -1) {
PyErr_SetString(PyExc_RuntimeError,
"position-weight matrix is not an array");
return 0;
}
datatype = view->format[0];
switch (datatype) {
case '@':
case '=':
case '<':
case '>':
case '!':
datatype = view->format[1];
break;
default:
break;
}
return Py_CLEANUP_SUPPORTED;

exit:
PyBuffer_Release(view);
return 0;
}

static char calculate__doc__[] =
" calculate(sequence, recognizers, recognizer_lengths, connectors, "
"recognizer_scores, connector_scores, connector_legnths)\n"
"\n"
"This function computes optimal placement for a \n"
"transcription factor composed of PSSM recogniers\n"
"and variable length connectors.\n";

static PyObject *py_calculate(PyObject *self, PyObject *args,
PyObject *keywords) {
const char *seq;
static char *kwlist[] = {
"sequence", "rec_matrices", "rec_lengths", "con_matrices",
"rec_scores", "con_scores", "con_lengths", "max_length", NULL};
Py_ssize_t len_seq;
Py_ssize_t num_rec;
int max_length;
PyObject *result = Py_None;
Py_buffer rec_matrices;
Py_buffer con_matrices;
Py_buffer rec_lengths;
Py_buffer rec_scores;
Py_buffer con_scores;
Py_buffer con_lengths;

rec_matrices.obj = NULL;
con_matrices.obj = NULL;
rec_lengths.obj = NULL;
rec_scores.obj = NULL;
con_scores.obj = NULL;
con_lengths.obj = NULL;
if (!PyArg_ParseTupleAndKeywords(
args, keywords, "y#O&O&O&O&O&O&i", kwlist, &seq, &len_seq,
matrix_converter, &rec_matrices, matrix_converter, &rec_lengths,
matrix_converter, &con_matrices, matrix_converter, &rec_scores,
matrix_converter, &con_scores, matrix_converter, &con_lengths, &max_length))
return NULL;
// sequence: DNA sequence used for placement
// rec_matrices: one dimensional flattened representation of the
// scoring matrices for all recognizers
// rec_lengths: length of each recognizer (the number of columns)
// con_matrices: one dimensional flattened representation of pre-computed
// scores for each connector for each gap length
// rec_scores: buffer used to store our calculated scores for recognizers
// con_scores: buffer used to score our calculated scores for connectors
// con_lengths: buffer used to store the length of each connector for our
// placment
num_rec = rec_lengths.shape[0];
float *rec_matrices_ptr = rec_matrices.buf;
float *con_matrices_ptr = con_matrices.buf;
float *con_scores_ptr = con_scores.buf;
float *rec_scores_ptr = rec_scores.buf;
int *rec_lengths_ptr = rec_lengths.buf;
int *con_lengths_ptr = con_lengths.buf;
bool is_precomputed = true;
// getting the number of alignments is needed for calculating the size
// of the array we will use to store the scores for each recognizer alignment
int forward_offset = get_forward_offset(0, rec_lengths_ptr, num_rec);
int reverse_offset = get_reverse_offset(0, rec_lengths_ptr, num_rec);
int num_alignments = len_seq - forward_offset - reverse_offset;
float *score_matrix =
PyMem_Calloc(num_alignments * num_rec, sizeof(*rec_matrices_ptr));
fill_matrix(seq, len_seq, rec_matrices_ptr, rec_lengths_ptr, num_rec,
score_matrix, num_alignments);

if (con_matrices.shape[0] == (num_rec - 1) * 2)
is_precomputed = false;
// traceback function breaks when the number of recognizers is less than
// two since it opperates on the assumption of having at least one connector
if (num_rec == 1) {
con_lengths_ptr[0] =
max_index(score_matrix, len_seq - forward_offset - reverse_offset);
rec_scores_ptr[0] = score_matrix[con_lengths_ptr[0]];
con_scores_ptr[0] = 0.00;
} else {
fill_traceback_matrix(score_matrix, num_alignments, con_matrices_ptr, rec_lengths_ptr,
num_rec, len_seq, con_scores_ptr, rec_scores_ptr,
con_lengths_ptr, max_length, is_precomputed);
}

PyMem_Free(score_matrix);
Py_INCREF(Py_None);
result = Py_None;
matrix_converter(NULL, &rec_matrices);
matrix_converter(NULL, &rec_lengths);
matrix_converter(NULL, &con_matrices);
matrix_converter(NULL, &rec_scores);
matrix_converter(NULL, &con_scores);
matrix_converter(NULL, &con_lengths);
return result;
}

static struct PyMethodDef methods[] = {
{
"calculate",
(PyCFunction)py_calculate,
METH_VARARGS | METH_KEYWORDS,
PyDoc_STR(calculate__doc__),
},
{NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
"_multiplacement",
PyDoc_STR("Fast calculations involving multiple connected PSSMs"),
-1,
methods,
NULL,
NULL,
NULL,
NULL};

PyObject *PyInit__multiplacement(void) { return PyModule_Create(&moduledef); }
10 changes: 10 additions & 0 deletions extensions/multiplacement/src/_interface.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#ifndef _INTERFACE_H
#define _INTERFACE_H
#define PY_SSIZE_T_CLEAN
#include <python3.11/Python.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "_multiplacement.h"

#endif
Loading