-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathPCAPlot.py
More file actions
110 lines (78 loc) · 4.21 KB
/
PCAPlot.py
File metadata and controls
110 lines (78 loc) · 4.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 31 23:99:00 2024
@author: Kathryn Stepwise Regression
"""
import argparse
def readEigenvecFile(eigenvec, dictPA, numPCs):
file = open(f"{eigenvec}")
for line in file:
split = line.strip().split()
ID = split[1]
dictPCA[ID] = {}
for i in range(0, args.numPCA):
PC = f"PC{i+1}"
dictPCA[ID][PC] = split[i+2]
return dictPCA
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='PCA plot')
data = parser.add_argument_group("Arguments")
data.add_argument('-L', '--parentalList', help='List of parental population ID (Ex: AFR,EUR,EAS,SAS,NAT)', required=False)
data.add_argument('-F', '--parentalFolder', help='Folder with the files with the samples ID for each parental population (Ex: /home/my/folder/Parentals)', required=False)
data.add_argument('-C', '--parentalColor', help='Color for each parental population in hexdecimal (Ex: #1874CD,#EE2C2C,#FFD39B,#E066FF,#008B45)', required=False)
data.add_argument('-p', '--pcaRef', help='File with PCA for reference file', required=False)
data.add_argument('-P', '--pcaTarget', help='File with PCA for target file', required=False)
data.add_argument('-n', '--numPCA', help='Number of PCAs to be ploted (default = 2)', required=False, default = 2, type=int)
data.add_argument('-N', '--popName', help='Name of your population. (default = TARGET)', required=False, default = "TARGET")
data.add_argument('-o', '--outputName', help='Prefix name to output files (Ex: MyResultsToPlotBecauseIAmAnAwesomeAndVeryProductivePerson)', required=False, default = 2)
args = parser.parse_args()
dictSample = {}
dictColor = {}
dictPCA = {}
if args.parentalList:
parentalList = args.parentalList.split(",")
for pop in parentalList:
file = open(f"{args.parentalFolder}/{pop}.txt")
for line in file:
dictSample[line.strip()] = pop
if args.parentalColor:
parentalColor = args.parentalColor.split(",")
for i in range(len(parentalColor)):
dictColor[parentalList[i]] = parentalColor[i].replace("\\", "")
print(dictColor)
if args.pcaRef:
dictPCA = readEigenvecFile(args.pcaRef, dictPCA, args.numPCA)
if args.pcaTarget:
dictPCA = readEigenvecFile(args.pcaTarget, dictPCA, args.numPCA)
outputRScript = open(f"{args.outputName}.R", "w")
outputTable = open(f"{args.outputName}.tsv", "w")
outputTable.write("ID\tPOP")
for i in range(1, args.numPCA+1, 2):
if i <= args.numPCA and i+1 <= args.numPCA:
outputTable.write(f"\tPC{i}\tPC{i+1}")
outputTable.write("\n")
for ID in dictPCA:
outputTable.write(f"{ID}")
pop = args.popName
if ID in dictSample:
pop = dictSample[ID]
outputTable.write(f"\t{pop}")
for i in range(1, args.numPCA+1, 2):
if i <= args.numPCA and i+1 <= args.numPCA:
PCi = dictPCA[ID][f"PC{i}"]
PCiPlusOne = dictPCA[ID][f"PC{i+1}"]
outputTable.write(f"\t{PCi}\t{PCiPlusOne}")
outputTable.write(f"\n")
outputTable.close()
#Prepare to scale_color_manual
colorToScale = f'\"{args.popName}\"=\"black\"'
for pop in parentalList:
colorToScale = colorToScale+f', \"{pop}\"=\"{dictColor[pop]}\"'
outputRScript.write(f'library(ggplot2)\n'
f'data = read.table(\"{args.outputName}.tsv\", sep = \"\t\", header = T)\n'
f'\n')
for i in range(1, args.numPCA+1, 2):
if i <= args.numPCA and i+1 <= args.numPCA:
outputRScript.write(f'ggplot(data, aes(PC{i}, PC{i+1}))+geom_point(aes(color=POP))+scale_color_manual(values=c({colorToScale}))\n')
outputRScript.close()
print(f"Now open your RScript ({args.outputName}.R) on R Studio")