TornikeO committed on
Commit
c73c4b9
·
1 Parent(s): ec6a83d

Fix the demo

Browse files
Files changed (1) hide show
  1. app.py +81 -60
app.py CHANGED
@@ -1,9 +1,15 @@
1
  import gradio as gr
2
- import torch
3
- import os
4
  from pathlib import Path
5
  from matchms import Spectrum
6
  from typing import List, Optional, Literal
 
 
 
 
 
 
 
 
7
  # os.system("nvidia-smi")
8
  # print("TORCH_CUDA", torch.cuda.is_available())
9
 
@@ -29,6 +35,7 @@ def preprocess_spectra(spectra: List[Spectrum]) -> Spectrum:
29
  return spectra
30
 
31
  def run(r_filepath:Path, q_filepath:Path,
 
32
  tolerance: float = 0.1,
33
  mz_power: float = 0.0,
34
  intensity_power: float = 1.0,
@@ -37,7 +44,9 @@ def run(r_filepath:Path, q_filepath:Path,
37
  n_max_peaks: int = 1024,
38
  match_limit: int = 2048,
39
  array_type: Literal['sparse','numpy'] = "numpy",
40
- sparse_threshold: float = .75):
 
 
41
  print('\n>>>>', r_filepath, q_filepath, array_type, '\n')
42
  # debug = os.getenv('CUDAMS_DEBUG') == '1'
43
  # if debug:
@@ -46,65 +55,63 @@ def run(r_filepath:Path, q_filepath:Path,
46
 
47
  assert r_filepath is not None, "Reference file is missing."
48
  assert q_filepath is not None, "Query file is missing."
49
- import tempfile
50
- import numpy as np
51
- from simms.similarity import CudaCosineGreedy
52
- from matchms.importing import load_from_mgf
53
- from matchms import calculate_scores
54
- import matplotlib.pyplot as plt
55
 
56
- refs = preprocess_spectra(list(load_from_mgf(str(r_filepath))))
57
- ques = preprocess_spectra(list(load_from_mgf(str(q_filepath))))
 
 
58
 
59
  # If we have small spectra, don't make a huge batch
60
  if batch_size > max(len(refs), len(ques)):
61
  batch_size = max(len(refs), len(ques))
62
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  scores_obj = calculate_scores(
64
  refs, ques,
65
- similarity_function=CudaCosineGreedy(
66
- tolerance=tolerance,
67
- mz_power=mz_power,
68
- intensity_power=intensity_power,
69
- shift=shift,
70
- batch_size=batch_size,
71
- n_max_peaks=n_max_peaks,
72
- match_limit=match_limit,
73
- sparse_threshold=sparse_threshold
74
- ),
75
  array_type=array_type
76
  )
77
 
78
  score_vis = tempfile.NamedTemporaryFile(suffix='.jpg', delete=False)
79
 
80
- fig, axs = plt.subplots(1, 2,
81
- figsize=(10, 5),
82
- dpi=150)
83
-
84
  scores = scores_obj.to_array()
85
- ax = axs[0]
86
- ax.imshow(scores['CudaCosineGreedy_score'])
87
-
88
- ax = axs[1]
89
- ax.imshow(scores['CudaCosineGreedy_matches'])
 
 
 
90
 
91
- plt.suptitle("Score and matches")
92
  plt.savefig(score_vis.name)
93
 
94
- score = tempfile.NamedTemporaryFile(suffix='.npz', delete=False)
95
  np.savez(score.name, scores=scores)
96
 
97
- import pickle
98
- pickle_ = tempfile.NamedTemporaryFile(suffix='.pickle', delete=False)
99
 
100
  Path(pickle_.name).write_bytes(pickle.dumps(scores_obj))
101
  return score.name, score_vis.name, pickle_.name
102
 
103
  with gr.Blocks() as demo:
104
  gr.Markdown("""
105
- # CudaMS - Faster mass spectrometry
106
- Calculate cosine greedy similarity matrix using CUDA. See [main repo](https://github.com/tornikeo/cudams) for this project.
107
- This approach is x100-x500 faster than [MatchMS](https://github.com/matchms/matchms/). Upload your MGF files below, or run the sample `pesticides.mgf` files against each other.
 
108
  """)
109
  with gr.Row():
110
  refs = gr.File(label="Upload REFERENCES.mgf",
@@ -113,28 +120,33 @@ with gr.Blocks() as demo:
113
  ques = gr.File(label="Upload QUERIES.mgf",
114
  interactive=True, value='pesticides.mgf')
115
  with gr.Row():
116
- tolerance = gr.Slider(minimum=0, maximum=1, value=0.1, label="Tolerance")
117
- mz_power = gr.Slider(minimum=0, maximum=2, value=0.0, label="mz_power")
118
- intensity_power = gr.Slider(minimum=0, maximum=2, value=1.0, label="Intensity Power")
119
- shift = gr.Slider(minimum=-10, maximum=10, value=0, label="Shift")
 
 
 
120
  with gr.Row():
121
- batch_size = gr.Number(value=2048, label="Batch Size", info='How many spectra to process pairwise, in one step. Limited by GPU size, default works well for the T4 GPU.')
122
- n_max_peaks = gr.Number(value=1024, label="Maximum Number of Peaks",
123
- info="Some spectra are too large to fit on GPU,"
124
- "so we have to trim them to only use the first "
125
- "n_max_peaks number of peaks.")
126
- match_limit = gr.Number(value=2048, label="Match Limit",
127
- info="Two very similar spectra of size N and M can have N * M matches, before filtering."
128
- "This doesn't fit on GPU, so we stop accumulating more matches once we have at most match_limit number of them."
129
- "In practice, a value of 2048 gives more than 99.99% accuracy on GNPS")
130
  with gr.Row():
131
- array_type = gr.Radio(['numpy', 'sparse'], value='numpy', type='value',
132
- label='How to handle outputs - if sparse, everything with score less than sparse_threshold will be discarded. If `numpy`, we disable sparse behaviour.')
133
- sparse_threshold = gr.Slider(minimum=0, maximum=1, value=0.75, label="Sparse Threshold",
134
- info="For very large results, when comparing, more than 10k x 10k, the output dense score matrix can grow too large for RAM."
135
- "While most of the scores aren't useful (near zero). This argument discards all scores less than sparse_threshold, and returns "
136
- "results as a SparseStack format."
137
- )
 
 
138
  with gr.Row():
139
  score_vis = gr.Image()
140
 
@@ -143,10 +155,19 @@ with gr.Blocks() as demo:
143
  interactive=False)
144
  out_pickle = gr.File(label="Download full `Scores` object (.pickle format)",
145
  interactive=False)
 
 
 
 
 
 
 
146
  btn = gr.Button("Run")
147
- btn.click(fn=run, inputs=[refs, ques, tolerance, mz_power, intensity_power, shift,
148
- batch_size, n_max_peaks, match_limit,
149
- array_type, sparse_threshold], outputs=[out_npz, score_vis, out_pickle])
 
 
150
 
151
  if __name__ == "__main__":
152
  demo.launch(debug=True)
 
1
  import gradio as gr
 
 
2
  from pathlib import Path
3
  from matchms import Spectrum
4
  from typing import List, Optional, Literal
5
+ import tempfile
6
+ import numpy as np
7
+ from simms.similarity import CudaCosineGreedy, CudaModifiedCosine
8
+ from matchms.importing import load_from_mgf
9
+ from matchms import calculate_scores
10
+ import matplotlib.pyplot as plt
11
+ import pickle
12
+
13
  # os.system("nvidia-smi")
14
  # print("TORCH_CUDA", torch.cuda.is_available())
15
 
 
35
  return spectra
36
 
37
  def run(r_filepath:Path, q_filepath:Path,
38
+ similarity_method: Literal['CosineGreedy','ModifiedCosine'] = 'CosineGreedy',
39
  tolerance: float = 0.1,
40
  mz_power: float = 0.0,
41
  intensity_power: float = 1.0,
 
44
  n_max_peaks: int = 1024,
45
  match_limit: int = 2048,
46
  array_type: Literal['sparse','numpy'] = "numpy",
47
+ sparse_threshold: float = .75,
48
+ do_preprocess: bool = False,
49
+ ):
50
  print('\n>>>>', r_filepath, q_filepath, array_type, '\n')
51
  # debug = os.getenv('CUDAMS_DEBUG') == '1'
52
  # if debug:
 
55
 
56
  assert r_filepath is not None, "Reference file is missing."
57
  assert q_filepath is not None, "Query file is missing."
 
 
 
 
 
 
58
 
59
+ refs, ques = list(load_from_mgf(str(r_filepath))), list(load_from_mgf(str(q_filepath)))
60
+ if do_preprocess:
61
+ refs = preprocess_spectra(refs)
62
+ ques = preprocess_spectra(ques)
63
 
64
  # If we have small spectra, don't make a huge batch
65
  if batch_size > max(len(refs), len(ques)):
66
  batch_size = max(len(refs), len(ques))
67
 
68
+
69
+ kwargs = dict(tolerance=tolerance, mz_power=mz_power, intensity_power=intensity_power, shift=shift, batch_size=batch_size,
70
+ n_max_peaks=n_max_peaks, match_limit=match_limit, sparse_threshold=sparse_threshold)
71
+
72
+ if similarity_method == 'ModifiedCosine' and shift != 0:
73
+ gr.Error("`ModifiedCosine` can not use shift - we will proceed as if shift is 0")
74
+
75
+ if similarity_method == 'ModifiedCosine':
76
+ kwargs.pop('shift')
77
+
78
+ similarity_class = CudaCosineGreedy if similarity_method == 'CosineGreedy' else CudaModifiedCosine
79
+
80
  scores_obj = calculate_scores(
81
  refs, ques,
82
+ similarity_function=similarity_class(**kwargs),
 
 
 
 
 
 
 
 
 
83
  array_type=array_type
84
  )
85
 
86
  score_vis = tempfile.NamedTemporaryFile(suffix='.jpg', delete=False)
87
 
 
 
 
 
88
  scores = scores_obj.to_array()
89
+
90
+ outputs = len(scores.dtype.names)
91
+
92
+ fig, axs = plt.subplots(1, outputs,
93
+ figsize=(5*outputs, 5))
94
+ for title, ax in zip(scores.dtype.names, axs):
95
+ ax.imshow(scores[title])
96
+ ax.set_title(title)
97
 
98
+ plt.suptitle("Output values")
99
  plt.savefig(score_vis.name)
100
 
101
+ score = tempfile.NamedTemporaryFile(prefix='scores-', suffix='.npz', delete=False)
102
  np.savez(score.name, scores=scores)
103
 
104
+ pickle_ = tempfile.NamedTemporaryFile(prefix='scores-', suffix='.pickle', delete=False)
 
105
 
106
  Path(pickle_.name).write_bytes(pickle.dumps(scores_obj))
107
  return score.name, score_vis.name, pickle_.name
108
 
109
  with gr.Blocks() as demo:
110
  gr.Markdown("""
111
+ # SimMS: A GPU-Accelerated Cosine Similarity implementation for Tandem Mass Spectrometry
112
+
113
+ Calculate cosine greedy similarity matrix using CUDA. See the [main repo](https://github.com/pangeai/simms) for this project.
114
+ This approach is x100-x500 faster than [MatchMS](https://github.com/matchms/matchms). Upload your MGF files below, or run the sample `pesticides.mgf` files against each other.
115
  """)
116
  with gr.Row():
117
  refs = gr.File(label="Upload REFERENCES.mgf",
 
120
  ques = gr.File(label="Upload QUERIES.mgf",
121
  interactive=True, value='pesticides.mgf')
122
  with gr.Row():
123
+ similarity_method = gr.Radio(['CosineGreedy', 'ModifiedCosine'], value='ModifiedCosine', type='value',
124
+ info="Choose one of the supported similarity methods. Need more? Let us know in github issues."
125
+ )
126
+ tolerance = gr.Number(value=0.1, label="tolerance")
127
+ mz_power = gr.Number(value=0.0, label="m/z power")
128
+ intensity_power = gr.Number(value=1.0, label="intensity power")
129
+ shift = gr.Number(value=0, label="mass shift")
130
  with gr.Row():
131
+ batch_size = gr.Number(value=2048, label="Batch Size",
132
+ info='Compare this many spectra to same amount of other spectra at each iteration.')
133
+ n_max_peaks = gr.Number(value=1024, label="Maximum Number of Peaks",
134
+ info="Consider this many m/z peaks at most, per spectrum.")
135
+ match_limit = gr.Number(value=2048, label="Match Limit",
136
+ info="Consider this many pairs of m/z before stopping. "
137
+ "In practice, a value of 2048 gives more than 99.99% accuracy on GNPS")
138
+ do_preprocess = gr.Checkbox(value=False, label="filter spectra",
139
+ info="If you want to filter spectra before processing, we can do that. Look at the code to see details.")
140
  with gr.Row():
141
+ array_type = gr.Radio(['numpy', 'sparse'],
142
+ value='numpy', type='value',
143
+ label='If `sparse`, everything with score less than `sparse_threshold` will be discarded.'
144
+ 'If `numpy`, we disable sparse behaviour.')
145
+ sparse_threshold = gr.Slider(minimum=0, maximum=1, value=0.75, label="Sparse Threshold",
146
+ info="For very large results, when comparing, more than 10k x 10k, the output dense score matrix can grow too large for RAM."
147
+ "While most of the scores aren't useful (near zero). This argument discards all scores less than sparse_threshold, and returns "
148
+ "results as a SparseStack format."
149
+ )
150
  with gr.Row():
151
  score_vis = gr.Image()
152
 
 
155
  interactive=False)
156
  out_pickle = gr.File(label="Download full `Scores` object (.pickle format)",
157
  interactive=False)
158
+ gr.Markdown("""
159
+ **NOTE** You can use this snippet to use the downloaded array:
160
+ ```py
161
+ import numpy as np
162
+ arr = np.load('scores-nr0hqp85.npz')['scores']
163
+ print(arr)
164
+ ```""")
165
  btn = gr.Button("Run")
166
+ btn.click(fn=run,
167
+ inputs=[refs, ques, similarity_method, tolerance, mz_power, intensity_power, shift,
168
+ batch_size, n_max_peaks, match_limit,
169
+ array_type, sparse_threshold, do_preprocess],
170
+ outputs=[out_npz, score_vis, out_pickle])
171
 
172
  if __name__ == "__main__":
173
  demo.launch(debug=True)