Skip to content

Commit 95ac1b1

Browse files
authored
[ENH] aptacom database loader (#158)
Added: AptaCom loader - loads the AptaCom dataset from Hugging Face in one of two formats: pandas DataFrame or Hugging Face dataset.
1 parent a394f2a commit 95ac1b1

File tree

5 files changed

+195
-0
lines changed

5 files changed

+195
-0
lines changed

pyaptamer/datasets/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
"""Contains datasets along with their loaders."""
22

3+
from pyaptamer.datasets._loaders._aptacom_loader import (
4+
load_aptacom_full,
5+
load_aptacom_xy,
6+
)
37
from pyaptamer.datasets._loaders._csv_loader import load_csv_dataset
48
from pyaptamer.datasets._loaders._hf_loader import load_hf_dataset
59
from pyaptamer.datasets._loaders._one_gnh import load_1gnh, load_1gnh_structure
610
from pyaptamer.datasets._loaders._online_databank import load_from_rcsb
711
from pyaptamer.datasets._loaders._pfoa import load_pfoa, load_pfoa_structure
812

913
__all__ = [
14+
"load_aptacom_full",
15+
"load_aptacom_xy",
1016
"load_csv_dataset",
1117
"load_hf_dataset",
1218
"load_pfoa",

pyaptamer/datasets/_loaders/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,19 @@
11
"""Loaders for different data structures."""
22

3+
from pyaptamer.datasets._loaders._aptacom_loader import (
4+
load_aptacom_full,
5+
load_aptacom_xy,
6+
)
37
from pyaptamer.datasets._loaders._csv_loader import load_csv_dataset
48
from pyaptamer.datasets._loaders._hf_loader import load_hf_dataset
59
from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure
610
from pyaptamer.datasets._loaders._pfoa import load_pfoa_structure
711

812
__all__ = [
13+
"load_pfoa_structure",
14+
"load_1gnh_structure",
15+
"load_aptacom_full",
16+
"load_aptacom_xy",
917
"load_csv_dataset",
1018
"load_hf_dataset",
1119
"load_pfoa_structure",
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
__author__ = "rpgv"
2+
__all__ = ["load_aptacom_full", "load_aptacom_xy"]
3+
4+
from pyaptamer.datasets._loaders._hf_loader import load_hf_dataset
5+
6+
# Lookup table of named chemistry groups. Each key maps to a
# ``(column_name, accepted_values)`` pair, where ``column_name`` is an AptaCom
# column and ``accepted_values`` lists the labels belonging to that group.
# NOTE(review): not referenced by the loaders visible in this module;
# presumably intended for row filtering by chemistry -- confirm.
filter_map = {
    # Target-side groups.
    "protein_target": ("target_chemistry", ["Protein", "peptide"]),
    "small_target": (
        "target_chemistry",
        ["Small Organic", "Small Molecule", "Molecule"],
    ),
    # Aptamer-side groups.
    "dna_apt": (
        "aptamer_chemistry",
        [
            "DNA", "L-DNA", "ssDNA",
            "2',4'-BNA/LNA-DNA", "5-uracil-modified-DNA", "dsDNA",
        ],
    ),
    "rna_apt": (
        "aptamer_chemistry",
        [
            "RNA", "2'-F-RNA", "2'-NH2-RNA", "L-RNA",
            "2'-O-Me-RNA", "ssRNA", "2'-fluoro/amino-RNA", "2'-fluoro-RNA",
            "2'-amino-RNA", "2'-fluoro/O-Me-RNA", "5-uracil-modified-RNA",
            "4'-thio-RNA",
        ],
    ),
}
41+
42+
43+
def filter_columns(ds, columns=None):
    """Restrict a dataframe to the requested columns.

    Parameters
    ----------
    ds : pandas.DataFrame
        Dataframe to select columns from.
    columns : list, optional, default=None
        Column labels to keep. When ``None`` the input is returned
        unchanged (the very same object, not a copy).

    Returns
    -------
    pandas.DataFrame
        ``ds`` itself when ``columns`` is ``None``, otherwise the result of
        indexing ``ds`` with ``columns``.
    """
    return ds if columns is None else ds[columns]
62+
63+
64+
def prepare_xy(ds):
    """Reduce a dataframe to the complete rows of the three training columns.

    Parameters
    ----------
    ds : pandas.DataFrame
        Dataframe containing at least the columns ``"aptamer_sequence"``,
        ``"target_sequence"`` and ``"new_affinity"``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new dataframe holding only the three columns above, in that order,
        with every row that has a missing value in any of them removed.
        The original row index labels are preserved.
        (The original documentation reports 709 rows for AptaCom.)
    """
    training_columns = ["aptamer_sequence", "target_sequence", "new_affinity"]
    # Select first, then drop: both steps return new objects, so the caller's
    # dataframe is left untouched (the previous in-place dropna mutated it).
    return ds[training_columns].dropna()
81+
82+
83+
def load_aptacom_full(select_columns=None):
    """Load the AptaCom dataset, optionally restricted to some columns.

    The data is fetched through ``load_hf_dataset("AptaCom", store=False)``
    and then passed through ``filter_columns``.

    Parameters
    ----------
    select_columns : list, optional, default=None
        Column names to keep. ``None`` returns the complete dataset.
        Column names listed by the original documentation::

            ['reference', 'aptamer_chemistry', 'aptamer_name',
             'target_name', 'aptamer_sequence', 'origin',
             'target_chemistry', 'external_id', 'target_sequence',
             'new_affinity']

    Returns
    -------
    pandas.DataFrame
        The dataset, possibly reduced to ``select_columns``.
        (The original documentation reports 5556 rows in total.)
    """
    return filter_columns(
        load_hf_dataset("AptaCom", store=False),
        columns=select_columns,
    )
114+
115+
116+
def load_aptacom_xy(return_X_y=False):
    """Load the AptaCom dataset in a form ready for training.

    The data is fetched through ``load_hf_dataset("AptaCom", store=False)``
    and reduced by ``prepare_xy`` to the columns ``"aptamer_sequence"``,
    ``"target_sequence"`` and ``"new_affinity"``.

    Parameters
    ----------
    return_X_y : bool, optional, default=False
        If true, return the features and the target separately; otherwise
        return one dataframe containing all three columns.

    Returns
    -------
    pandas.DataFrame or tuple of (pandas.DataFrame, pandas.DataFrame)
        Either the three-column dataframe, or ``(X, y)`` where ``X`` holds
        ``"aptamer_sequence"`` and ``"target_sequence"`` and ``y`` holds the
        single column ``"new_affinity"``.
    """
    frame = prepare_xy(load_hf_dataset("AptaCom", store=False))
    if not return_X_y:
        return frame

    features = frame[["aptamer_sequence", "target_sequence"]]
    # Double brackets keep the target as a one-column dataframe.
    target = frame[["new_affinity"]]
    return features, target
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
__author__ = "rpgv"
2+
3+
import pytest
4+
from pandas import DataFrame
5+
6+
from pyaptamer.datasets import load_aptacom_full, load_aptacom_xy
7+
8+
9+
@pytest.mark.parametrize(
    "select_columns",
    [
        ["reference"],
        ["aptamer_chemistry"],
        ["aptamer_name"],
        ["target_name"],
        ["aptamer_sequence"],
        ["origin"],
        ["target_chemistry"],
        ["external_id"],
        ["target_sequence"],
        ["new_affinity"],
    ],
)
def test_load_aptacom_full(select_columns):
    """Check ``load_aptacom_full`` for each single-column selection.

    The result must be a pandas ``DataFrame`` whose columns are exactly the
    requested ones.
    """
    dataset = load_aptacom_full(select_columns)
    # Plain asserts give pytest's introspected failure output.
    assert isinstance(dataset, DataFrame), (
        f"Dataset format {type(dataset)} is not DataFrame"
    )
    assert list(dataset.columns) == select_columns
32+
33+
34+
@pytest.mark.parametrize("return_X_y", [True, False])
def test_download_aptacom_x_y(return_X_y):
    """Check the return shape of ``load_aptacom_xy`` for both flag values.

    With ``return_X_y=True`` the loader must return an ``(X, y)`` tuple of
    dataframes; with ``return_X_y=False`` it must return one dataframe with
    the three training columns.
    """
    result = load_aptacom_xy(return_X_y)

    if return_X_y:
        # Checking "tuple or DataFrame" would let the wrong shape pass, so
        # each flag value gets its own expectations.
        assert isinstance(result, tuple), (
            f"Dataset format {type(result)} is not X, y tuple"
        )
        assert len(result) == 2
        X, y = result
        assert isinstance(X, DataFrame)
        assert isinstance(y, DataFrame)
        assert list(X.columns) == ["aptamer_sequence", "target_sequence"]
        assert list(y.columns) == ["new_affinity"]
        assert len(X) == len(y)
    else:
        assert isinstance(result, DataFrame), (
            f"Dataset format {type(result)} is not DataFrame"
        )
        assert list(result.columns) == [
            "aptamer_sequence",
            "target_sequence",
            "new_affinity",
        ]

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ dependencies = [
2424
"scikit-learn>=1.3.0",
2525
"skorch",
2626
"imblearn",
27+
"datasets",
2728
]
2829

2930
[project.optional-dependencies]

0 commit comments

Comments
 (0)