-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathplotSignalPeptideVenn.py
More file actions
138 lines (119 loc) · 5.85 KB
/
plotSignalPeptideVenn.py
File metadata and controls
138 lines (119 loc) · 5.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import sys,re,os,getopt
from scipy import stats
from scipy.stats import binom_test
import numpy as np
from Bio import SeqIO
import matplotlib as mpl
mpl.use('Agg') # enables saving graphics
import matplotlib.pyplot as plt
from matplotlib_venn import venn2,venn3
from collections import OrderedDict
###############
# SUBROUTINES #
###############
# retrieve list of gene IDs from qval set
# file1 format looks like this:
# geneID protein length log score p-val q-val
# ENST00000215832.11 MAPK1 5881 11.08212786285333 2.1972759969529324e-23 7.50809208158817e-20
def readQValueFile(qValueFile):
qValueDict = {}
f1 = open(qValueFile)
dataf1 = f1.readlines()
linePattern = re.compile('(\S*)') #\s(\S*)\s(\S*)\s(\S*)\s(\S*)\s(\S*)')
for line in dataf1:
if linePattern.search(line):
match = linePattern.search(line)
geneID = match.group(1)
qValueDict[geneID] = True
f1.close()
return qValueDict
# retrieve gene IDs and SignalP scores for all valid proteins from gencode human mRNA transcriptome set
# file2 format looks like this:
# geneID SignalP classificaiton signal peptide score "other" score
# ENST00000641515.2 OTHER 0.000304 0.999696
def readSignalPOutputFile(signalPOutputFile):
signalpScoreList = []
f2 = open(signalPOutputFile)
dataf2 = f2.readlines()
linePattern = re.compile('(\S*)\s(\S*)\s(\S*)\s(\S*)')
for line in dataf2:
if linePattern.search(line):
match = linePattern.search(line)
geneID = match.group(1)
signalpScore = float(match.group(3))
if signalpScore > 0.5:
signalpScoreList.append( ( geneID, float(signalpScore) ) )
f2.close()
return signalpScoreList
# retrieve gene IDs and Predisi scores for all valid proteins from gencode human mRNA transcriptome set
# file2 format looks like this:
# geneID Predisi classificaiton signal peptide score "other" score
# ENST00000641515.2 OTHER 0.000304 0.999696
def readPredisiOutputFile(predisiOutputFile):
predisiScoreList = []
f2 = open(predisiOutputFile)
dataf2 = f2.readlines()
linePattern = re.compile('(\S*)\s(\S*)\s(\S*)\s(\S*)')
for line in dataf2:
if linePattern.search(line):
match = linePattern.search(line)
geneID = match.group(1)
predisiScore = float(match.group(2))
if predisiScore > 0.5:
predisiScoreList.append( ( geneID, float(predisiScore) ) )
f2.close()
return predisiScoreList
####################
# GLOBAL VARIABLES #
####################
stops = ['UAA','UGA','UAG']
TICs = ['UAC','AAC','UUC','UAU','AAG','GAG','AAU','GAU','AUC','GAC','GUG']
codonMap = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
"UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
"UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*",
"UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W",
"CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
"CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
"CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
"CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
"AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
"ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
"AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
"AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
"GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
"GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
"GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
"GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",}
residueMap = {"F":("UUU","UUC"), "L":("UUA","UUG","CUU","CUC","CUA","CUG"), "S":("UCU","UCC","UCA","UCG","AGU","AGC"),
"Y":("UAU","UAC"), "*":("UAA","UGA","UAG"), "C":("UGU","UGC"), "W":("UGG",), "P":("CCU","CCC","CCA","CCG"),
"H":("CAU","CAC"), "Q":("CAA","CAG"), "R":("CGU","CGC","CGA","CGG","AGA","AGG"), "I":("AUU","AUC","AUA"),
"M":("AUG",), "T":("ACU","ACC","ACA","ACG"), "N":("AAU","AAC"), "K":("AAA","AAG"), "V":("GUU","GUC","GUA","GUG"),
"A":("GCU","GCC","GCA","GCG"), "D":("GAU","GAC"), "E":("GAA","GAG"), "G":("GGU","GGC","GGA","GGG")}
##########
## MAIN ##
##########
usage = "Usage: python " + sys.argv[0] + " <q-value file> <signalP output file> <prediSi output file>"
if len(sys.argv) != 4:
print usage
sys.exit()
qValueFile = sys.argv[1]
signalPOutputFile = sys.argv[2]
predisiOutputFile = sys.argv[3]
print('Reading q-value file.')
qValueDict = readQValueFile(qValueFile) # retrieve list of gene IDs from qval set
print('Reading signalP file.')
signalpScoreList = readSignalPOutputFile(signalPOutputFile) # retrieve gene IDs and SignalP scores for all valid proteins from gencode human mRNA transcriptome set
print('Reading prediSi file.')
predisiScoreList = readPredisiOutputFile(predisiOutputFile)
signalpSet = [entry[0] for entry in signalpScoreList] # this is SignalP geneIDs for ONLY the intersection of the gencode and qval sets
predisiSet = [entry[0] for entry in predisiScoreList] # this is PrediSi geneIDs for ONLY the intersection of the gencode and qval sets
qvalSet = []
for key in qValueDict:
qvalSet.append(key)
print('Plotting venn diagrams.')
plt.figure()
venn3([set(signalpSet), set(predisiSet), set(qvalSet)], set_labels = ('SignalP (all GENCODE)','PrediSi (all GENCODE)','significant PDCUB'))
plt.savefig("vennThreeway.pdf")
plt.savefig("vennThreeway.png")
plt.close()
print('Venn plotted.')