fatmacankara committed on
Commit
02bf24d
·
1 Parent(s): fa18295

Create add_annotations.py

Browse files
Files changed (1) hide show
  1. code/add_annotations.py +211 -0
code/add_annotations.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ssl
2
+ import requests as r
3
+ from decimal import *
4
+ import numpy as np
5
+ import pandas as pd
6
+ import json
7
+ import ast
8
+
9
# Names of the 30 UniProt feature columns handled by this module, in the order
# the rest of the file relies on.
annotation_list = [
    'disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
    'activeSite',
    'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
    'crosslink', 'mutagenesis', 'strand',
    'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain',
    'caBinding', 'bindingSite', 'region',
    'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
    'coiledCoil', 'peptide',
    'transitPeptide', 'glycosylation', 'propeptide',
]

# All 60 column names: the 30 annotation columns followed by the same names
# with a 'Binary' suffix, in the same order. The suffixed half is derived
# rather than typed out so the two halves can never get out of sync.
UNIPROT_ANNOTATION_COLS = annotation_list + [name + 'Binary' for name in annotation_list]

# Kept as a slice of the full list (as before) so it is an independent list
# object; mutating it does not touch UNIPROT_ANNOTATION_COLS.
annotation_list = UNIPROT_ANNOTATION_COLS[0:30]
30
+
31
def _download_uniprot_entry(protein):
    """Download the UniProt flat-file (.txt) entry for *protein* and return its text."""
    response = r.get("http://www.uniprot.org/uniprot/" + protein + ".txt")
    return response.text


def _parse_feature_locations(lines, feature_keys):
    """Return ``{feature_key: [location, ...]}`` for the FT lines in *lines*.

    Only lines whose second whitespace-separated field is exactly one of
    *feature_keys* are used (so ``SITE`` no longer also picks up ``ACT_SITE``
    lines), and the third field is taken as the location with ``..`` rewritten
    to ``-``. Locations are kept in file order.
    """
    wanted = set(feature_keys)
    locations = {}
    for line in lines:
        if not line.startswith('FT'):
            continue
        # Same exclusions as before: lines mentioning these words are skipped.
        if 'evidence' in line or 'ECO' in line or 'note' in line:
            continue
        fields = line.split()
        if len(fields) < 3 or fields[1] not in wanted:
            continue
        locations.setdefault(fields[1], []).append(fields[2].replace('..', '-'))
    return locations


def _location_bounds(token):
    """Return ``(start, end)`` for ``'57'`` or ``'23-45'``; ``None`` if not numeric."""
    parts = token.split('-')
    try:
        start = int(parts[0])
        end = int(parts[1]) if len(parts) > 1 else start
    except ValueError:
        return None
    return start, end


def _as_int(value):
    """Return ``int(value)``, or ``None`` when the value cannot be converted (e.g. NaN)."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return None


def add_annotations(dataframe, fetch_entry=None):
    """Add UniProt sequence-annotation columns to *dataframe*.

    For every distinct ``uniprotID`` the UniProt flat-file entry is fetched and
    its feature-table (``FT``) lines are parsed. The returned frame has, per row:

    * one column per name in the local ``annotation_list`` holding the string
      form of a list of positions (``"['30-40', '57']"``), or the string
      ``'nan'`` when the protein has no such feature (this matches the previous
      output and is what ``changeUPtoModels`` checks for);
    * ``POSITIONS``: string form of the sorted list of every residue number
      covered by any annotation plus the row's ``pos``, ``domEnd`` and
      ``domStart``;
    * ``<name>Binary``: ``'1'`` if the row's ``pos`` lies on/in any location of
      that feature (range end-points included), ``'0'`` if the feature exists
      but does not cover ``pos``, NaN if there is nothing usable to compare.

    Finally the cell values ``"['?']"``, ``'[]'`` and ``''`` are replaced by NaN
    across the whole frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``uniprotID``; rows need ``pos``, ``domStart``, ``domEnd``.
        The input index is discarded (the result has a fresh 0..n-1 index).
    fetch_entry : callable, optional
        ``fetch_entry(uniprot_id) -> str`` returning the flat-file text of one
        entry. Defaults to an HTTP download from uniprot.org; pass your own to
        use cached or local entries.

    Returns
    -------
    pandas.DataFrame
        A new frame; the argument is not modified.

    Notes
    -----
    Locations that are not plain integers (``?``, ``<1``, isoform-prefixed)
    stay in the annotation strings but are skipped for ``POSITIONS`` and the
    Binary flags instead of raising.
    """
    print('Downloading UniProt sequence annotations...\n')
    # NOTE(review): this disables HTTPS certificate verification for the whole
    # process (urllib-based clients). Kept because callers may rely on the side
    # effect, but it is a security risk and should be revisited.
    ssl._create_default_https_context = ssl._create_unverified_context

    original_annot_name = ['DISULFID', 'INIT_MET', 'INTRAMEM', 'VARIANT', 'DNA_BIND', 'ACT_SITE', 'NP_BIND', 'LIPID',
                           'SITE', 'TRANSMEM', 'CROSSLNK', 'MUTAGEN', 'STRAND', 'HELIX', 'TURN', 'METAL', 'REPEAT', 'TOPO_DOM',
                           'CA_BIND', 'BINDING', 'REGION', 'SIGNAL', 'MOD_RES', 'ZN_FING', 'MOTIF', 'COILED', 'PEPTIDE',
                           'TRANSIT', 'CARBOHYD', 'PROPEP']

    annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                       'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
                       'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
                       'region', 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
                       'transitPeptide', 'glycosylation', 'propeptide']

    if fetch_entry is None:
        fetch_entry = _download_uniprot_entry

    dataframe = dataframe.reset_index(drop=True)

    # --- 1. Download and parse each protein once -------------------------
    # tokens_by_protein[protein][column_name] -> list of position tokens
    tokens_by_protein = {}
    for protein in dataframe['uniprotID'].dropna().unique():
        print('Retieving annotations for ' + protein)
        features = _parse_feature_locations(fetch_entry(protein).split('\n'), original_annot_name)
        per_column = {}
        for key, name in zip(original_annot_name, annotation_list):
            if key not in features:
                continue
            if name == 'disulfide':
                # The two ends of a disulfide location are stored as two
                # separate positions, not as a range.
                pieces = [end for loc in features[key] for end in loc.split('-')]
            else:
                pieces = features[key]
            per_column[name] = [p.strip() for p in pieces if p.strip()]
        tokens_by_protein[protein] = per_column

    # Whole columns are collected in plain lists and attached at the end; this
    # avoids writing lists/strings cell-by-cell into float (all-NaN) columns.
    generated = {name: [] for name in annotation_list}
    generated['POSITIONS'] = []
    for name in annotation_list:
        generated[name + 'Binary'] = []

    # --- 2. Annotation strings and the POSITIONS column -------------------
    print('Processing positions...\n')
    for i in dataframe.index:
        row_tokens = tokens_by_protein.get(dataframe.at[i, 'uniprotID'], {})
        covered = set()
        for name in annotation_list:
            tokens = row_tokens.get(name)
            if tokens is None:
                generated[name].append('nan')  # historical marker for "no such feature"
                continue
            generated[name].append(str(tokens))
            for token in tokens:
                bounds = _location_bounds(token)
                if bounds is not None:
                    covered.update(range(bounds[0], bounds[1] + 1))
        for column in ('pos', 'domEnd', 'domStart'):
            value = _as_int(dataframe.at[i, column])
            if value is not None:
                covered.add(value)
        generated['POSITIONS'].append(str(sorted(covered)))

    # --- 3. Binary flags ---------------------------------------------------
    print('Adding binary annotations...\n')
    for i in dataframe.index:
        row_tokens = tokens_by_protein.get(dataframe.at[i, 'uniprotID'], {})
        pos = _as_int(dataframe.at[i, 'pos'])
        for name in annotation_list:
            flag = np.nan
            if pos is not None:
                for token in row_tokens.get(name, ()):
                    bounds = _location_bounds(token)
                    if bounds is None:
                        continue  # skip '?'-style tokens instead of aborting the cell
                    # Inclusive on both ends, consistent with POSITIONS above.
                    if bounds[0] <= pos <= bounds[1]:
                        flag = '1'
                        break
                    flag = '0'
            generated[name + 'Binary'].append(flag)

    new_columns = pd.DataFrame(generated, index=dataframe.index, dtype=object)
    # Overwrite (rather than duplicate) any same-named columns already present.
    dataframe = dataframe.drop(columns=[c for c in new_columns.columns if c in dataframe.columns])
    dataframe = pd.concat([dataframe, new_columns], axis=1)

    # Final corrections (np.nan: the np.NaN alias no longer exists in NumPy 2).
    dataframe = dataframe.replace({'[\'?\']': np.nan, '[]': np.nan, '': np.nan})
    return dataframe
131
+
132
def _pdb_position_tokens(value):
    """Split an annotation cell (list, or string form of a list) into clean tokens."""
    if isinstance(value, str):
        raw_items = value.strip().strip('[]').split(',')
    elif isinstance(value, list):
        raw_items = value
    else:
        return []  # NaN / anything else: no positions
    tokens = []
    for item in raw_items:
        # Strip blanks outside AND inside the quotes ("' 50'" -> "50").
        token = str(item).strip().strip('\'"').strip()
        if token:
            tokens.append(token)
    return tokens


def _pdb_load_matchdict(raw):
    """Return the MATCHDICT cell as a dict; ``{}`` if it is missing or unparsable."""
    if isinstance(raw, dict):
        return raw
    if not isinstance(raw, str):
        return {}
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return {}
    return parsed if isinstance(parsed, dict) else {}


def changeUPtoPDB(dataframe, columns=None):
    """Translate annotation positions through each row's ``MATCHDICT``.

    Every cell of the annotation columns is read as a list of position tokens
    (``'57'`` or an inclusive range ``'23-45'``). Ranges are expanded, each
    position is looked up as a *string* key in the row's ``MATCHDICT`` (a dict,
    or the string form of one), and the cell is replaced by ``str()`` of the
    list of values found. Positions without a mapping are dropped, so a cell
    with nothing mappable (including NaN) becomes ``'[]'``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Modified in place and also returned.
    columns : iterable of str, optional
        Columns to convert. Defaults to the module-level ``annotation_list``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    if columns is None:
        columns = annotation_list

    # Parse every MATCHDICT once per row instead of once per position.
    if 'MATCHDICT' in dataframe.columns:
        mappings = [_pdb_load_matchdict(raw) for raw in dataframe['MATCHDICT'].tolist()]
    else:
        mappings = [{}] * len(dataframe)

    for col in columns:
        converted = []
        for value, mapping in zip(dataframe[col].tolist(), mappings):
            mapped = []
            for token in _pdb_position_tokens(value):
                if '-' in token:
                    parts = token.split('-')
                    try:
                        first, last = int(parts[0]), int(parts[1])
                    except ValueError:
                        continue  # e.g. '?-45': nothing that can be looked up
                    keys = [str(n) for n in range(first, last + 1)]
                else:
                    keys = [token]
                mapped.extend(mapping[key] for key in keys if key in mapping)
            converted.append(str(mapped))
        dataframe[col] = converted
    return dataframe
163
+
164
+
165
def _models_position_tokens(value):
    """Split an annotation cell into clean position tokens.

    Accepts a list or the string form of a list. NaN, the string ``'nan'`` and
    any other type yield no tokens.
    """
    if isinstance(value, str):
        if value == 'nan':
            return []
        raw_items = value.strip().strip('[]').split(',')
    elif isinstance(value, list):
        raw_items = value
    else:
        return []
    tokens = []
    for item in raw_items:
        token = str(item).strip().strip('\'"').strip()
        if token:  # drops the empty token produced by '[]'
            tokens.append(token)
    return tokens


def changeUPtoModels(dataframe, columns=None):
    """Expand range annotations into individual positions.

    Every cell of the annotation columns is read as a list of position tokens.
    Inclusive ranges (``'2-4'``) are expanded to ``'2', '3', '4'``; single
    tokens are kept as they are. The cell is replaced by ``str()`` of the
    resulting list of strings; cells with no positions (NaN, ``'nan'``,
    ``'[]'``) become ``'[]'``. Ranges whose bounds are not integers are
    skipped rather than raising.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Modified in place and also returned.
    columns : iterable of str, optional
        Columns to convert. Defaults to the module-level ``annotation_list``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    if columns is None:
        columns = annotation_list

    # Kept from the original (normalises missing markers such as None to NaN);
    # np.nan because the np.NaN alias no longer exists in NumPy 2.
    dataframe.fillna(np.nan, inplace=True)

    for col in columns:
        converted = []
        for value in dataframe[col].tolist():
            expanded = []
            for token in _models_position_tokens(value):
                if '-' not in token:
                    expanded.append(token)
                    continue
                parts = token.split('-')
                try:
                    first, last = int(parts[0]), int(parts[1])
                except ValueError:
                    continue
                expanded.extend(str(n) for n in range(first, last + 1))
            converted.append(str(expanded))
        dataframe[col] = converted
    return dataframe
196
+
197
+
198
def isZeroDistance(data, columns=None):
    """Mark annotation cells that contain the row's own position with ``'hit'``.

    For each row and each annotation column the cell is read as a collection of
    positions (a list, or the string form of one, parsed with
    ``ast.literal_eval``). Items equal to ``'null'`` and items that are not
    integers are ignored. If ``int(row['pos'])`` is among the positions, the
    cell is overwritten with the string ``'hit'``; otherwise it is left as is.

    Cells that cannot be read as a collection (NaN, an earlier ``'hit'``, other
    text) are skipped, so the function can safely be applied twice.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a ``pos`` column. Modified in place and also returned.
    columns : iterable of str, optional
        Columns to check. Defaults to ``UNIPROT_ANNOTATION_COLS[0:30]``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    if columns is None:
        columns = UNIPROT_ANNOTATION_COLS[0:30]
    columns = list(columns)

    # Kept from the original; np.nan because np.NaN no longer exists in NumPy 2.
    data.fillna(np.nan, inplace=True)

    for i in data.index:
        try:
            pos = int(data.at[i, 'pos'])
        except (TypeError, ValueError):
            continue  # no usable position on this row

        for col in columns:
            cell = data.at[i, col]
            if isinstance(cell, str):
                try:
                    cell = ast.literal_eval(cell)
                except (ValueError, SyntaxError):
                    continue  # e.g. 'hit' or 'nan'
            if not isinstance(cell, (list, tuple, set, dict)):
                continue  # NaN or a scalar: nothing to search

            hit = False
            for item in cell:
                if item == 'null':
                    continue
                try:
                    if int(str(item).strip()) == pos:
                        hit = True
                        break
                except ValueError:
                    continue
            if hit:
                data.at[i, col] = 'hit'
    return data