-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
25 lines (18 loc) · 971 Bytes
/
preprocessing.py
File metadata and controls
25 lines (18 loc) · 971 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import os
import numpy as np
import pandas as pd
from utils import extract_data_from_Fasta, kmers, one_hot_encode_kmer
# Preprocessing script: load every file in the genomic data directory into a
# DataFrame, split each sequence into k-mers, and one-hot encode every k-mer.

genomic_filepath = './data/genomic/'
amr_filepath = './data/amr/'

# os.listdir returns entries in arbitrary order; sort them so the DataFrame
# rows come out in the same order on every run and every platform.
genomic_files = sorted(os.listdir(genomic_filepath))
# NOTE: the AMR files are listed but not processed yet. The same pipeline
# (extract -> kmers -> one-hot encode) was sketched for them and is disabled.
amr_files = sorted(os.listdir(amr_filepath))

# One row per genomic file, built from whatever extract_data_from_Fasta
# returns (presumably a record with at least 'sequence' and 'size' fields,
# since those columns are read below -- confirm against utils).
wgs_files = pd.DataFrame(
    [extract_data_from_Fasta(file, genomic_filepath) for file in genomic_files]
)

# Split each sequence into k-mers using the imported helper.
wgs_files['kmers'] = wgs_files['sequence'].apply(kmers)

# Encode every k-mer of every row. The lambda parameter is intentionally not
# named `kmers`, so it does not shadow the imported `kmers` helper.
wgs_files['encoded_kmers'] = wgs_files['kmers'].apply(
    lambda kmer_list: [one_hot_encode_kmer(kmer) for kmer in kmer_list]
)

print(wgs_files[['size', 'kmers', 'encoded_kmers']])

# NOTE: persisting the result is currently disabled. The options tried so far
# were wgs_files.to_hdf('data.h5', key='df', mode='w') and
# wgs_files.to_csv('data.csv').