-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path02_extract_features.py
More file actions
113 lines (95 loc) · 2.58 KB
/
02_extract_features.py
File metadata and controls
113 lines (95 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
Python 3.6
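This script will extract numerical features from each sequence:
- Amino acid frequency (count of each residue divided by sequence length)
- Average hydrophobicity
- Median hydrophobicity (listed here, but not computed by the code below)
- Average helical propensity
- Median helical propensity (listed here, but not computed by the code below)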
Libraries:
- Pandas
- NumPy
by MAS 06.2019
"""
import time
import pandas as pd
import numpy as np
# Wall-clock reference point; the run-time report at the bottom of the
# script is measured from here.
start_time = time.time()
# Load the sequence table and, in the same step, discard every row that has
# a missing value in any column (how="any"). Per the original note, a row
# has no classification when the subcellular localization is not annotated
# in the UniProt database.
DF = pd.read_csv("./ecoli_proteome.csv").dropna(how="any")
# Per-residue hydrophobicity score, keyed by one-letter amino acid code.
# NOTE(review): the values look like the Kyte-Doolittle hydropathy scale --
# confirm the source before citing it.
# Insertion order matters: AAS (and therefore the feature column order) is
# derived from these keys.
aa_hydro = {
    # Positive scores
    "I": 4.5, "V": 4.2, "L": 3.8, "F": 2.8, "C": 2.5, "M": 1.9, "A": 1.8,
    # Mildly negative scores
    "G": -0.4, "T": -0.7, "S": -0.8, "W": -0.9, "Y": -1.3, "P": -1.6,
    # Strongly negative scores
    "H": -3.2, "E": -3.5, "Q": -3.5, "D": -3.5, "N": -3.5, "K": -3.9,
    "R": -4.5,
    # Placeholder codes (presumably unknown residue / selenocysteine) are
    # scored 0, so they still count toward a sequence's length when averaged.
    "X": 0, "U": 0,
}
# Per-residue helical propensity score, keyed by one-letter amino acid code.
# NOTE(review): the values resemble a published helix-propensity table
# (possibly Levitt, 1978) -- confirm the source before citing it.
# Keys are listed in the same order as in aa_hydro.
aa_secon = {
    "I": 0.97, "V": 0.91, "L": 1.3, "F": 1.07, "C": 1.11, "M": 1.47,
    "A": 1.29,
    "G": 0.56, "T": 0.82, "S": 0.82, "W": 0.99, "Y": 0.72, "P": 0.52,
    "H": 1.22, "E": 1.44, "Q": 1.27, "D": 1.04, "N": 0.9, "K": 1.23,
    "R": 0.96,
    # Placeholder codes (presumably unknown residue / selenocysteine) are
    # scored 0, so they still count toward a sequence's length when averaged.
    "X": 0, "U": 0,
}
# Residue letters to tabulate, in the insertion order of aa_hydro's keys.
AAS = tuple(aa_hydro)
def aa_count(aa, dataframe):
    """Build the per-sequence feature table.

    For every row of ``dataframe`` this computes the fraction of the sequence
    made up by each residue in ``aa`` (occurrences divided by sequence
    length), followed by the two mean scores returned by ``score``.

    Args:
        aa: Iterable of residue letters; one output column is produced per
            letter, in the given order.
        dataframe: pandas DataFrame with a "Sequence" column of strings.

    Returns:
        A pandas DataFrame carrying the same index labels as ``dataframe``
        and the columns ``list(aa) + ["hydro_mean", "ss_mean"]``.

    Raises:
        ZeroDivisionError: If a sequence is empty.
        KeyError: Propagated from ``score`` when a sequence contains a
            residue that has no entry in the score tables.
    """
    residues = list(aa)
    columns = residues + ["hydro_mean", "ss_mean"]

    # Collect plain Python rows and build the frame once at the end.
    # Assigning to ``frame.loc[label]`` row by row re-allocates the frame on
    # every insert, which makes the loop quadratic in the number of rows.
    labels = []
    rows = []
    for label, sequence in dataframe["Sequence"].items():
        length = len(sequence)  # hoisted: reused for every residue below
        features = [sequence.count(residue) / length for residue in residues]
        features.extend(score(sequence))
        labels.append(label)
        rows.append(features)

    # Reusing the original index labels keeps the result alignable with
    # ``dataframe`` when the two are concatenated column-wise.
    return pd.DataFrame(rows, index=labels, columns=columns)
def score(sequence):
    """Return the mean hydrophobicity and mean helical propensity.

    Args:
        sequence: Iterable of one-letter residue codes (typically a string).

    Returns:
        A two-element list ``[hydro_mean, ss_mean]``: the NumPy means of the
        per-residue values looked up in ``aa_hydro`` and ``aa_secon``.

    Raises:
        KeyError: If a residue has no entry in the lookup tables.
    """
    means = []
    # Same order as the original: hydrophobicity first, then propensity.
    for table in (aa_hydro, aa_secon):
        per_residue = [table[residue] for residue in sequence]
        means.append(np.mean(per_residue))
    return means
def get_all_features(aas=AAS, df=DF):
    """Compute the feature columns and write the combined table to disk.

    The defaults are the module-level ``AAS`` tuple and ``DF`` frame, bound
    when this function is defined.

    Args:
        aas: Residue letters passed on to ``aa_count``.
        df: Input DataFrame; its columns are kept and the feature columns
            are appended to the right of them.

    Returns:
        None. The result is written to "./ecoli_proteome_features.csv"
        without the index column.
    """
    features = aa_count(aas, df)
    # axis=1 places the feature columns beside the input columns, aligned on
    # the index labels.
    pd.concat([df, features], axis=1).to_csv(
        "./ecoli_proteome_features.csv", index=False
    )
if __name__ == '__main__':
    # Run the full extraction, then report the wall-clock time elapsed since
    # start_time was taken at the top of the script (so it includes loading
    # the input CSV).
    get_all_features()
    elapsed = time.time() - start_time
    print('run time = ' + str(elapsed) + ' s')