Spaces:
Running
Running
Commit
·
02bf24d
1
Parent(s):
fa18295
Create add_annotations.py
Browse files- code/add_annotations.py +211 -0
code/add_annotations.py
ADDED
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import ssl
|
2 |
+
import requests as r
|
3 |
+
from decimal import *
|
4 |
+
import numpy as np
|
5 |
+
import pandas as pd
|
6 |
+
import json
|
7 |
+
import ast
|
8 |
+
|
9 |
+
# Names of the per-feature annotation columns, in the order used throughout this module.
annotation_list = [
    'disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
    'activeSite', 'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
    'crosslink', 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding',
    'repeat', 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
    'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
    'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
]

# All annotation columns: the feature columns followed by one '<name>Binary'
# column per feature. The binary names are derived from the list above so the
# two halves can never drift apart.
UNIPROT_ANNOTATION_COLS = annotation_list + [name + 'Binary' for name in annotation_list]
|
30 |
+
|
31 |
+
def _parse_span(position):
    """Return ``(start, end)`` for a ``'12'`` or ``'12-45'`` token, or ``None`` if it is not numeric."""
    first, dash, rest = position.partition('-')
    try:
        start = int(first)
        end = int(rest.split('-')[0]) if dash else start
    except ValueError:
        # Uncertain or isoform-qualified locations ('?', '<1', 'P12345-2:5') cannot be enumerated.
        return None
    return start, end


def _collect_features(entry_text, feature_keys):
    """Map each wanted feature key in a UniProt text entry to its ``'a-b; c; '`` location string."""
    wanted = set(feature_keys)
    collected = {}
    for line in entry_text.split('\n'):
        # A feature header reads "FT   KEY   LOCATION". Qualifier and wrapped
        # note lines are indented further and must not be taken for features.
        if not line.startswith('FT   ') or line[5:6].strip() == '':
            continue
        fields = line.split()
        # The key is compared exactly: a substring test would let 'SITE' also
        # pick up every 'ACT_SITE' line and store its position twice.
        if len(fields) < 3 or fields[1] not in wanted:
            continue
        collected[fields[1]] = collected.get(fields[1], '') + fields[2].replace('..', '-') + '; '
    return collected


def _split_locations(annot, raw):
    """Turn the ``'; '``-joined location string of one annotation into its list of position tokens."""
    if annot == 'disulfide':
        # Each 'a-b' bond is stored as its two bonded residues rather than as a range.
        ends = [end.strip() for bond in raw.split(';') for end in bond.split('-')]
        return [end for end in ends if end]
    return [token for token in (part.strip() for part in raw.split(';')) if token]


def _binary_flag(spans, pos):
    """Return '1' if ``pos`` lies in any span, '0' if in none, NaN when there are no spans."""
    if not spans:
        return np.nan
    return '1' if any(start <= pos <= end for start, end in spans) else '0'


def add_annotations(dataframe, fetch_entry=None):
    """Add UniProt sequence-feature annotations to every row of ``dataframe``.

    The frame must provide the columns ``uniprotID``, ``pos``, ``domStart`` and
    ``domEnd``. For each distinct ``uniprotID`` the UniProt text entry is fetched
    once and its feature ("FT") lines are parsed. The returned frame gains:

    * one column per feature name holding ``str()`` of the list of position
      tokens (``"['5-12', '40']"``), or ``'nan'`` when the entry has no such
      feature. Disulfide bonds are stored as their individual end residues.
    * ``POSITIONS``: ``str()`` of the list of distinct residue numbers covered
      by any feature, plus the row's ``pos``, ``domEnd`` and ``domStart``.
    * one ``<feature>Binary`` column per feature: ``'1'`` when ``pos`` falls on
      the feature (range ends included), ``'0'`` when it does not, NaN when the
      feature is absent or has no numeric location.

    Location tokens that are not plain numbers or numeric ranges stay in the
    feature column but are ignored for ``POSITIONS`` and the binary flags.

    Args:
        dataframe: input frame; it is not modified, a re-indexed copy is returned.
        fetch_entry: optional callable ``protein_id -> entry text``. Defaults to
            downloading ``http://www.uniprot.org/uniprot/<id>.txt``.

    Returns:
        The annotated frame with a fresh 0..n-1 index.
    """
    print('Downloading UniProt sequence annotations...\n')
    # NOTE(review): this switches off HTTPS certificate verification for every
    # later urllib-based request in the process. Kept as-is because code outside
    # this function may rely on it; consider removing.
    ssl._create_default_https_context = ssl._create_unverified_context

    original_annot_name = ['DISULFID', 'INIT_MET', 'INTRAMEM', 'VARIANT', 'DNA_BIND', 'ACT_SITE', 'NP_BIND', 'LIPID',
                           'SITE', 'TRANSMEM', 'CROSSLNK', 'MUTAGEN', 'STRAND', 'HELIX', 'TURN', 'METAL', 'REPEAT',
                           'TOPO_DOM', 'CA_BIND', 'BINDING', 'REGION', 'SIGNAL', 'MOD_RES', 'ZN_FING', 'MOTIF',
                           'COILED', 'PEPTIDE', 'TRANSIT', 'CARBOHYD', 'PROPEP']

    annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                       'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
                       'strand', 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding',
                       'bindingSite', 'region', 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
                       'coiledCoil', 'peptide', 'transitPeptide', 'glycosylation', 'propeptide']

    if fetch_entry is None:
        def fetch_entry(protein):
            return r.get("http://www.uniprot.org/uniprot/" + protein + ".txt").text

    dataframe = dataframe.reset_index(drop=True)

    # Everything derived from the entry alone is computed once per protein, not once per row.
    per_protein = {}
    for protein in dataframe['uniprotID'].unique():
        print('Retieving annotations for ' + protein)
        found = _collect_features(fetch_entry(protein), original_annot_name)
        cells = {}
        spans = {}
        for key, annot in zip(original_annot_name, annotation_list):
            if key in found:
                tokens = _split_locations(annot, found[key])
                cells[annot] = str(tokens)
                spans[annot] = [span for span in map(_parse_span, tokens) if span is not None]
            else:
                cells[annot] = str(np.nan)
                spans[annot] = []
        covered = [residue
                   for annot in annotation_list
                   for start, end in spans[annot]
                   for residue in range(start, end + 1)]
        per_protein[protein] = {'cells': cells, 'spans': spans, 'covered': covered}

    proteins = dataframe['uniprotID'].tolist()

    # Fix annotation positions
    print('Processing positions...\n')
    # Whole columns are assigned at once so no cell is ever written into a
    # column of an incompatible dtype.
    for annot in annotation_list:
        dataframe[annot] = [per_protein[protein]['cells'][annot] for protein in proteins]
    dataframe['POSITIONS'] = [
        str(list(set(per_protein[protein]['covered'] + [int(pos), int(dom_end), int(dom_start)])))
        for protein, pos, dom_end, dom_start
        in zip(proteins, dataframe['pos'], dataframe['domEnd'], dataframe['domStart'])
    ]

    # Add binary annotations
    print('Adding binary annotations...\n')
    row_positions = [int(pos) for pos in dataframe['pos']]
    for annot in annotation_list:
        dataframe[annot + 'Binary'] = [
            _binary_flag(per_protein[protein]['spans'][annot], pos)
            for protein, pos in zip(proteins, row_positions)
        ]

    # Final corrections
    dataframe = dataframe.replace({'[\'?\']': np.nan})
    dataframe = dataframe.replace({'[]': np.nan})
    dataframe = dataframe.replace({'': np.nan})
    dataframe = dataframe.fillna(np.nan)
    return dataframe
|
131 |
+
|
132 |
+
def _pdb_match_dict(raw):
    """Return the residue mapping held in a ``MATCHDICT`` cell, or ``{}`` when it is unusable."""
    if isinstance(raw, dict):
        return raw
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        # NaN, 'nan' and malformed text all mean "no mapping for this row".
        return {}
    return parsed if isinstance(parsed, dict) else {}


def _pdb_tokens(cell):
    """Return the position tokens of an annotation cell (a stringified list or a real list)."""
    if isinstance(cell, str):
        # Drop the surrounding brackets, then the quotes and any padding around each item.
        return [token.strip().strip('\'').strip() for token in cell[1:-1].split(',')]
    if isinstance(cell, list):
        return [str(token).strip() for token in cell]
    return []


def _pdb_residue_keys(token):
    """Expand ``'12'`` or ``'12-45'`` into the string residue numbers used as mapping keys."""
    if '-' not in token:
        return [token]
    bounds = token.split('-')
    try:
        first, last = int(bounds[0]), int(bounds[1])
    except ValueError:
        # A range with a non-numeric end cannot be enumerated.
        return []
    return [str(residue) for residue in range(first, last + 1)]


def changeUPtoPDB(dataframe):
    """Rewrite every annotation column through each row's ``MATCHDICT`` mapping.

    For each column in the module-level ``annotation_list`` the cell's position
    tokens are expanded to single residues (ranges are inclusive) and each
    residue is looked up, as a string key, in the row's ``MATCHDICT``. Residues
    without an entry are dropped. The cell is replaced by ``str()`` of the list
    of looked-up values, so a cell with no mappable residue becomes ``'[]'``.

    ``MATCHDICT`` is parsed once per row. A missing column, a NaN cell or
    unparseable text is treated as an empty mapping.

    Args:
        dataframe: frame holding the annotation columns; modified in place.

    Returns:
        The same ``dataframe`` object.
    """
    if 'MATCHDICT' in dataframe.columns:
        mappings = [_pdb_match_dict(raw) for raw in dataframe['MATCHDICT']]
    else:
        mappings = [{} for _ in range(len(dataframe))]

    for col in annotation_list:
        converted = []
        for cell, mapping in zip(dataframe[col], mappings):
            mapped = [mapping[key]
                      for token in _pdb_tokens(cell)
                      for key in _pdb_residue_keys(token)
                      if key in mapping]
            converted.append(str(mapped))
        # Assign the whole column so strings never land cell-by-cell in a numeric column.
        dataframe[col] = converted
    return dataframe
|
163 |
+
|
164 |
+
|
165 |
+
def changeUPtoModels(dataframe):
    """Expand every annotation cell into an explicit list of residue numbers.

    For each column in the module-level ``annotation_list`` the cell is read as
    a list of position tokens: either a stringified list such as
    ``"['5-7', '12']"`` or a real list. ``'a-b'`` ranges are expanded
    inclusively, single positions are kept, and the cell is replaced by
    ``str()`` of the resulting list of strings (``"['5', '6', '7', '12']"``).
    NaN, ``'nan'`` and other non-list cells become ``'[]'``. Empty tokens and
    ranges with a non-numeric end are skipped.

    Args:
        dataframe: frame holding the annotation columns; modified in place.

    Returns:
        The same ``dataframe`` object.
    """
    dataframe.fillna(np.nan, inplace=True)
    for col in annotation_list:
        expanded_column = []
        for cell in dataframe[col]:
            if isinstance(cell, str) and cell != 'nan':
                # Drop the surrounding brackets, then the quotes and padding around each item.
                tokens = [token.strip().strip('\'').strip() for token in cell[1:-1].split(',')]
            elif isinstance(cell, list):
                tokens = [str(token).strip() for token in cell]
            else:
                tokens = []

            residues = []
            for token in tokens:
                if not token:
                    # "[]" splits into one empty token; it carries no position.
                    continue
                if '-' in token:
                    bounds = token.split('-')
                    try:
                        first, last = int(bounds[0]), int(bounds[1])
                    except ValueError:
                        # A range with a non-numeric end cannot be enumerated.
                        continue
                    residues.extend(str(residue) for residue in range(first, last + 1))
                else:
                    residues.append(token)
            expanded_column.append(str(residues))
        # Assign the whole column so strings never land cell-by-cell in a numeric column.
        dataframe[col] = expanded_column

    return dataframe
|
196 |
+
|
197 |
+
|
198 |
+
def _annotated_residues(cell):
    """Return the integer residue numbers stored in an annotation cell, or ``[]`` if it holds none."""
    if isinstance(cell, str):
        try:
            cell = ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            # 'nan', 'hit' and other non-literal text carry no positions.
            return []
    if not isinstance(cell, (list, tuple, set, dict)):
        return []
    residues = []
    for item in cell:
        text = str(item).strip()
        if text == 'null':
            continue
        try:
            residues.append(int(text))
        except ValueError:
            continue
    return residues


def isZeroDistance(data):
    """Mark annotation cells that contain the row's own position with ``'hit'``.

    For every row and each of the first 30 names in ``UNIPROT_ANNOTATION_COLS``,
    the cell is read as a collection of residue numbers: a stringified list or
    an already-parsed container, with ``'null'`` items ignored. If
    ``int(row['pos'])`` is among them the cell is overwritten with ``'hit'``;
    otherwise it is left unchanged. Cells that hold no parseable collection
    (NaN, ``'nan'``, an earlier ``'hit'``) are skipped.

    Args:
        data: frame holding ``pos`` and the annotation columns; modified in place.

    Returns:
        The same ``data`` object.
    """
    data.fillna(np.nan, inplace=True)
    for i in data.index:
        position = int(data.at[i, 'pos'])
        for col in UNIPROT_ANNOTATION_COLS[0:30]:
            if position in _annotated_residues(data.at[i, col]):
                data.at[i, col] = 'hit'
    return data
|