GW-HIVE · penningtonea · May 26, 2023 · Jun 6, 2024
diff --git a/Emily_scripts/AA_coordinate-converter.sh b/Emily_scripts/AA_coordinate-converter.sh
@@ -0,0 +1,20 @@
+#! /bin/bash
+
+'''
+Requested by collaborators. From the phased mutation table generated in the DRM pipeline, this shell script maps the gag-pol coordinate and AA changes from WT to Sub into the protease, RTase, and integrase regions.
+'''
+
+input_file="DRMtable.csv"
+output_file="DRMoutput.csv"
+AAPOS="3"
+AAREF="5"
+AASUB="6"
+
+awk -F',' 'BEGIN {
+    print "AAREF" "," "AAPOS" "," "AASUB" "," "CDS" 
+}
+NR > 1 {
+    if ($'$AAPOS' >= 489 && $'$AAPOS' <= 587){print $'$AAREF' "," $'$AAPOS' - 488 "," $'$AASUB' "," "NP_705926.1"}
+    else if ($'$AAPOS' >= 588 && $'$AAPOS' <= 1147){print $'$AAREF' "," $'$AAPOS' - 587 "," $'$AASUB' "," "NP_705927.1"} 
+    else if ($'$AAPOS' >= 1148 && $'$AAPOS' <= 1435){print $'$AAREF' "," $'$AAPOS' - 1147 "," $'$AASUB' "," "NP_705928.1"}
+}' "$input_file" > "$output_file"
diff --git a/Emily_scripts/DRM_formula_printer.py b/Emily_scripts/DRM_formula_printer.py
@@ -0,0 +1,29 @@
+#DRM formula script
+'''
+This script is meant to synthesize wildtype and mutation information per each row of the DRM table and format the information into the JavaScript line necessary for the HIVE annotation table.
+'''
+
+import pandas as pd 
+
+# access the csv file 
+
+# path = "C:/Users/Emily/OneDrive/Desktop/School/Thesis/DRM_mock_table.csv"
+
+path = "C:/Users/Emily/OneDrive/Desktop/School/Thesis/DRM_table_final.csv"
+
+drmFile = pd.read_csv(path, delimiter = ',')
+
+# check to see if the file has been located by printing the column headers
+
+print(drmFile.columns)
+
+# extract cell values from rows and make into a single row 
+
+drmFile['formula'] = "${seq}== '" + drmFile['refseq_protein_accession'].astype(str) + "' && (int)${pos}==" + drmFile['amino_acid_pos_refseq'].astype(str) + " && ${wt}=='" + drmFile['wt_amino_acid'] + "' && ${mut}=='" + drmFile['mut_amino_acid'] + "'"
+
+print(drmFile['formula'])
+
+# save as new csv file with formula column
+
+drmFile.to_csv('drm_table_export.csv', index=False)
+
diff --git a/Emily_scripts/DRM_shifter.py b/Emily_scripts/DRM_shifter.py
@@ -0,0 +1,24 @@
+import pandas as pd 
+with open('input_file.csv') as inputFile:
+    df = pd.read_csv(inputFile)
+
+# categorize rows based on mature peptide
+bins=[0,489,588,1148,1436]
+labels=['gag','protease','reverse transcriptase','integrase']    
+df['labels']=pd.cut(df['AAPOS'], bins=bins, labels=labels, include_lowest=True) 
+
+# change the value in CDS to reflect the mature peptides
+df.loc[df['labels']=='protease', 'CDS'] = 'NP_705926.1'
+df.loc[df['labels']=='reverse transcriptase', 'CDS'] = 'NP_705927.1'
+df.loc[df['labels']=='integrase', 'CDS'] = 'NP_705928.1'
+
+# Change AAPOS to the correct value mapped to the mature peptide
+df.loc[df['labels'] == 'protease','AAPOS'] -= 488
+df.loc[df['labels'] == 'reverse transcriptase','AAPOS'] -= 587
+df.loc[df['labels'] == 'integrase','AAPOS'] -= 1147
+
+# remove gag-pol rows that are not in the mature peptide CDS regions and save to a new CSV file labeled "outfile"
+slabels = ['protease','reverse transcriptase','integrase']
+df1 = df[df['labels'].isin(slabels)]
+df1.drop('labels', axis=1, inplace=True)
+df1.to_csv('outfile.csv', index=False)