Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions Emily_scripts/AA_coordinate-converter.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#! /bin/bash

# Requested by collaborators. From the phased mutation table generated in the
# DRM pipeline, this shell script maps the gag-pol coordinate and AA changes
# from WT to Sub into the protease, RTase, and integrase regions.
#
# Input : DRMtable.csv  (comma-separated, first line is a header and is skipped)
# Output: DRMoutput.csv (columns AAREF,AAPOS,AASUB,CDS)
#
# NOTE: this header has to be written with "#" comments. Bash has no
# triple-quote block comment; a triple-quoted paragraph is parsed as a single
# quoted word and executed as a command, which prints a "command not found"
# error on every run.

input_file="DRMtable.csv"
output_file="DRMoutput.csv"

# 1-based column numbers of the relevant fields in the input CSV.
AAPOS="3"
AAREF="5"
AASUB="6"

# The column numbers are handed to awk with -v instead of being spliced into
# the program text through gaps in the shell quoting, so the awk program below
# is one plain single-quoted literal.
awk -F',' -v pos="$AAPOS" -v ref="$AAREF" -v subcol="$AASUB" '
BEGIN {
    print "AAREF" "," "AAPOS" "," "AASUB" "," "CDS"
}
NR > 1 {
    # Each branch rebases the gag-pol position so the first residue of the
    # region becomes 1 and tags the row with the accession for that region.
    # Rows whose position is outside all three ranges produce no output.
    if ($pos >= 489 && $pos <= 587) {
        print $ref "," ($pos - 488) "," $subcol "," "NP_705926.1"
    } else if ($pos >= 588 && $pos <= 1147) {
        print $ref "," ($pos - 587) "," $subcol "," "NP_705927.1"
    } else if ($pos >= 1148 && $pos <= 1435) {
        print $ref "," ($pos - 1147) "," $subcol "," "NP_705928.1"
    }
}' "$input_file" > "$output_file"
29 changes: 29 additions & 0 deletions Emily_scripts/DRM_formula_printer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#DRM formula script
"""
For every row of the DRM table, combine the wildtype and mutation columns into
the JavaScript expression line required by the HIVE annotation table, and
export the table with that expression in an extra 'formula' column.
"""

import pandas as pd

# location of the input csv (alternate mock table kept for reference)

# path = "C:/Users/Emily/OneDrive/Desktop/School/Thesis/DRM_mock_table.csv"

path = "C:/Users/Emily/OneDrive/Desktop/School/Thesis/DRM_table_final.csv"

drmFile = pd.read_csv(path, sep=',')

# printing the column headers is a quick check that the file was found and read

print(drmFile.columns)

# pull out the four columns the expression is built from; accession and
# position are converted to text, the two amino-acid columns are used as read

accession = drmFile['refseq_protein_accession'].astype(str)
position = drmFile['amino_acid_pos_refseq'].astype(str)
wildtype = drmFile['wt_amino_acid']
mutant = drmFile['mut_amino_acid']

# build the expression left to right, one comparison per step

formula = "${seq}== '" + accession
formula = formula + "' && (int)${pos}==" + position
formula = formula + " && ${wt}=='" + wildtype
formula = formula + "' && ${mut}=='" + mutant
formula = formula + "'"

drmFile['formula'] = formula

print(drmFile['formula'])

# write the table, now including the formula column, to a new csv file

drmFile.to_csv('drm_table_export.csv', index=False)

24 changes: 24 additions & 0 deletions Emily_scripts/DRM_shifter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Remap gag-pol amino-acid positions onto the mature pol peptides.

Reads ``input_file.csv`` (which must have an ``AAPOS`` column), keeps only the
rows that fall in the protease, reverse transcriptase or integrase regions,
rewrites ``CDS`` and ``AAPOS`` for those rows, and writes ``outfile.csv``.
"""
import pandas as pd

# Region boundaries in gag-pol coordinates. Each region is the half-open
# interval [start, next start), so the last value is one past the final
# integrase residue.
BINS = [0, 489, 588, 1148, 1436]
LABELS = ['gag', 'protease', 'reverse transcriptase', 'integrase']

# For each retained region: the accession written to CDS and the offset that
# is subtracted from AAPOS so the region's first residue becomes position 1.
MATURE_PEPTIDES = {
    'protease': ('NP_705926.1', 488),
    'reverse transcriptase': ('NP_705927.1', 587),
    'integrase': ('NP_705928.1', 1147),
}


def map_to_mature_peptides(df):
    """Return the rows of *df* that lie in a mature peptide, remapped.

    Args:
        df: DataFrame with a numeric ``AAPOS`` column holding gag-pol
            positions. A ``CDS`` column is overwritten for retained rows
            (and created if absent).

    Returns:
        A new DataFrame containing only rows whose ``AAPOS`` falls in the
        protease, reverse transcriptase or integrase region, with ``CDS`` set
        to that region's accession and ``AAPOS`` rebased to start at 1.
        *df* itself is not modified.
    """
    out = df.copy()

    # right=False makes the intervals left-closed. With the default
    # right-closed intervals every boundary residue is assigned to the
    # preceding region (489 would be dropped as gag, 588 would become
    # protease position 100), which contradicts the offsets above.
    region = pd.cut(out['AAPOS'], bins=BINS, labels=LABELS, right=False)

    for label, (accession, offset) in MATURE_PEPTIDES.items():
        in_region = region == label
        out.loc[in_region, 'CDS'] = accession
        out.loc[in_region, 'AAPOS'] -= offset

    # Rows labelled gag, or outside every bin, are discarded. The region
    # labels live in a separate Series, so there is no helper column to drop.
    return out[region.isin(list(MATURE_PEPTIDES))]


def main():
    """Convert ``input_file.csv`` and save the result as ``outfile.csv``."""
    df = pd.read_csv('input_file.csv')
    map_to_mature_peptides(df).to_csv('outfile.csv', index=False)


if __name__ == '__main__':
    main()