From 2cde6db21819891595eea6db0c994b5cc7c51b85 Mon Sep 17 00:00:00 2001
From: penningtonea <70979519+penningtonea@users.noreply.github.com>
Date: Fri, 26 May 2023 16:46:45 -0400
Subject: [PATCH 1/2] testing with 2 scripts

---
 Emily_scripts/AA_coordinate-converter.sh | 20 ++++++++++++++++
 Emily_scripts/DRM_formula_printer.py     | 29 ++++++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 Emily_scripts/AA_coordinate-converter.sh
 create mode 100644 Emily_scripts/DRM_formula_printer.py

diff --git a/Emily_scripts/AA_coordinate-converter.sh b/Emily_scripts/AA_coordinate-converter.sh
new file mode 100644
index 0000000..9ddcb27
--- /dev/null
+++ b/Emily_scripts/AA_coordinate-converter.sh
@@ -0,0 +1,20 @@
+#! /bin/bash
+
+# Requested by collaborators. From the phased mutation table generated in the
+# DRM pipeline, this shell script maps the gag-pol coordinate and AA changes
+# from WT to Sub into the protease, RTase, and integrase regions.
+
+input_file="DRMtable.csv"
+output_file="DRMoutput.csv"
+AAPOS="3"
+AAREF="5"
+AASUB="6"
+
+awk -F',' -v pos="$AAPOS" -v ref="$AAREF" -v alt="$AASUB" 'BEGIN {
+    print "AAREF" "," "AAPOS" "," "AASUB" "," "CDS"
+}
+NR > 1 {
+    if ($pos >= 489 && $pos <= 587){print $ref "," ($pos - 488) "," $alt "," "NP_705926.1"}
+    else if ($pos >= 588 && $pos <= 1147){print $ref "," ($pos - 587) "," $alt "," "NP_705927.1"}
+    else if ($pos >= 1148 && $pos <= 1435){print $ref "," ($pos - 1147) "," $alt "," "NP_705928.1"}
+}' "$input_file" > "$output_file"
\ No newline at end of file
diff --git a/Emily_scripts/DRM_formula_printer.py b/Emily_scripts/DRM_formula_printer.py
new file mode 100644
index 0000000..139a882
--- /dev/null
+++ b/Emily_scripts/DRM_formula_printer.py
@@ -0,0 +1,29 @@
+#DRM formula script
+'''
+This script is meant to synthesize wildtype and mutation information per each row of the DRM table and format the information into the JavaScript line necessary for the HIVE annotation table.
+'''
+
+import pandas as pd
+
+# access the csv file
+
+# path = "C:/Users/Emily/OneDrive/Desktop/School/Thesis/DRM_mock_table.csv"
+
+path = "C:/Users/Emily/OneDrive/Desktop/School/Thesis/DRM_table_final.csv"
+
+drmFile = pd.read_csv(path, delimiter = ',')
+
+# check to see if the file has been located by printing the column headers
+
+print(drmFile.columns)
+
+# extract cell values from rows and make into a single row
+
+drmFile['formula'] = "${seq}== '" + drmFile['refseq_protein_accession'].astype(str) + "' && (int)${pos}==" + drmFile['amino_acid_pos_refseq'].astype(str) + " && ${wt}=='" + drmFile['wt_amino_acid'] + "' && ${mut}=='" + drmFile['mut_amino_acid'] + "'"
+
+print(drmFile['formula'])
+
+# save as new csv file with formula column
+
+drmFile.to_csv('drm_table_export.csv', index=False)
+

From b1b6ab4213acc28cc39979ee9a5c41603fb395c8 Mon Sep 17 00:00:00 2001
From: penningtonea <70979519+penningtonea@users.noreply.github.com>
Date: Thu, 6 Jun 2024 18:35:11 -0400
Subject: [PATCH 2/2] Adding the DRM coordinate converter script

---
 Emily_scripts/DRM_shifter.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 Emily_scripts/DRM_shifter.py

diff --git a/Emily_scripts/DRM_shifter.py b/Emily_scripts/DRM_shifter.py
new file mode 100644
index 0000000..ff140b8
--- /dev/null
+++ b/Emily_scripts/DRM_shifter.py
@@ -0,0 +1,24 @@
+import pandas as pd
+with open('input_file.csv') as inputFile:
+    df = pd.read_csv(inputFile)
+
+# categorize rows by mature peptide; right=False makes the bins [start, end) so 489, 588 and 1148 open their own regions (protease 489-587, RT 588-1147, integrase 1148-1435)
+bins=[0,489,588,1148,1436]
+labels=['gag','protease','reverse transcriptase','integrase']
+df['labels']=pd.cut(df['AAPOS'], bins=bins, labels=labels, right=False)
+
+# change the value in CDS to reflect the mature peptides
+df.loc[df['labels']=='protease', 'CDS'] = 'NP_705926.1'
+df.loc[df['labels']=='reverse transcriptase', 'CDS'] = 'NP_705927.1'
+df.loc[df['labels']=='integrase', 'CDS'] = 'NP_705928.1'
+
+# Change AAPOS to the correct value mapped to the mature peptide (offset = last gag-pol residue before the region)
+df.loc[df['labels'] == 'protease','AAPOS'] -= 488
+df.loc[df['labels'] == 'reverse transcriptase','AAPOS'] -= 587
+df.loc[df['labels'] == 'integrase','AAPOS'] -= 1147
+
+# remove gag-pol rows that are not in the mature peptide CDS regions and save to a new CSV file labeled "outfile"
+slabels = ['protease','reverse transcriptase','integrase']
+df1 = df[df['labels'].isin(slabels)]
+df1 = df1.drop('labels', axis=1)
+df1.to_csv('outfile.csv', index=False)
\ No newline at end of file