diff --git a/Emily_scripts/AA_coordinate-converter.sh b/Emily_scripts/AA_coordinate-converter.sh
new file mode 100644
index 0000000..9ddcb27
--- /dev/null
+++ b/Emily_scripts/AA_coordinate-converter.sh
@@ -0,0 +1,26 @@
+#! /bin/bash
+#
+# Requested by collaborators. From the phased mutation table generated in the
+# DRM pipeline, this shell script maps the gag-pol coordinate and AA changes
+# from WT to Sub into the protease, RTase, and integrase regions.
+#
+# Kept as '#' comments: bash has no triple-quote block comment, so a quoted
+# paragraph here would be executed as a command and fail.
+
+input_file="DRMtable.csv"
+output_file="DRMoutput.csv"
+# 1-based column numbers of the fields read from the input CSV.
+AAPOS="3"
+AAREF="5"
+AASUB="6"
+
+# Column numbers are handed to awk with -v rather than spliced into the program
+# text, so the awk source stays inside one pair of single quotes.
+awk -F',' -v pos="$AAPOS" -v ref="$AAREF" -v alt="$AASUB" 'BEGIN {
+    print "AAREF" "," "AAPOS" "," "AASUB" "," "CDS"
+}
+NR > 1 {
+    if ($pos >= 489 && $pos <= 587) {print $ref "," ($pos - 488) "," $alt "," "NP_705926.1"}
+    else if ($pos >= 588 && $pos <= 1147) {print $ref "," ($pos - 587) "," $alt "," "NP_705927.1"}
+    else if ($pos >= 1148 && $pos <= 1435) {print $ref "," ($pos - 1147) "," $alt "," "NP_705928.1"}
+}' "$input_file" > "$output_file"
diff --git a/Emily_scripts/DRM_formula_printer.py b/Emily_scripts/DRM_formula_printer.py
new file mode 100644
index 0000000..139a882
--- /dev/null
+++ b/Emily_scripts/DRM_formula_printer.py
@@ -0,0 +1,35 @@
+"""Build the HIVE annotation formula for each row of the DRM table.
+
+This script synthesizes the wildtype and mutation information in each row of
+the DRM table and formats it into the JavaScript line necessary for the HIVE
+annotation table. The table is written to drm_table_export.csv with an extra
+"formula" column.
+"""
+
+import sys
+
+import pandas as pd
+
+# Default input location; pass a different CSV path as the first command-line
+# argument to override it.
+DEFAULT_PATH = "C:/Users/Emily/OneDrive/Desktop/School/Thesis/DRM_table_final.csv"
+
+path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PATH
+
+drmFile = pd.read_csv(path, delimiter=',')
+
+# Check that the file has been located by printing the column headers.
+print(drmFile.columns)
+
+# Combine the cell values of each row into a single formula string.
+drmFile['formula'] = (
+    "${seq}== '" + drmFile['refseq_protein_accession'].astype(str)
+    + "' && (int)${pos}==" + drmFile['amino_acid_pos_refseq'].astype(str)
+    + " && ${wt}=='" + drmFile['wt_amino_acid']
+    + "' && ${mut}=='" + drmFile['mut_amino_acid'] + "'"
+)
+
+print(drmFile['formula'])
+
+# Save as a new CSV file that includes the formula column.
+drmFile.to_csv('drm_table_export.csv', index=False)
diff --git a/Emily_scripts/DRM_shifter.py b/Emily_scripts/DRM_shifter.py
new file mode 100644
index 0000000..ff140b8
--- /dev/null
+++ b/Emily_scripts/DRM_shifter.py
@@ -0,0 +1,37 @@
+"""Map gag-pol amino-acid positions onto the mature peptide coordinates.
+
+Reads input_file.csv, sets each row's CDS to the mature peptide accession
+(protease, reverse transcriptase or integrase), shifts AAPOS into that peptide's
+own numbering, drops rows outside those three regions and writes outfile.csv.
+"""
+
+import pandas as pd
+
+with open('input_file.csv') as inputFile:
+    df = pd.read_csv(inputFile)
+
+# Categorize rows based on mature peptide. The bins are left-closed
+# (right=False): gag 0-488, protease 489-587, reverse transcriptase 588-1147,
+# integrase 1148-1435. Right-closed bins would put the first residue of each
+# peptide in the preceding region and accept 1436 as integrase.
+bins = [0, 489, 588, 1148, 1436]
+labels = ['gag', 'protease', 'reverse transcriptase', 'integrase']
+df['labels'] = pd.cut(df['AAPOS'], bins=bins, labels=labels, right=False)
+
+# Accession and coordinate offset of each mature peptide.
+peptides = {
+    'protease': ('NP_705926.1', 488),
+    'reverse transcriptase': ('NP_705927.1', 587),
+    'integrase': ('NP_705928.1', 1147),
+}
+for label, (accession, offset) in peptides.items():
+    in_peptide = df['labels'] == label
+    # Change the value in CDS to reflect the mature peptide.
+    df.loc[in_peptide, 'CDS'] = accession
+    # Change AAPOS to the value mapped to the mature peptide.
+    df.loc[in_peptide, 'AAPOS'] -= offset
+
+# Remove gag-pol rows that are not in the mature peptide CDS regions and save
+# the result to a new CSV file. drop() returns a new frame, so no slice is mutated.
+df1 = df[df['labels'].isin(list(peptides))].drop(columns='labels')
+df1.to_csv('outfile.csv', index=False)