File size: 8,319 Bytes
b31ccfa
 
 
 
c73c4b9
 
 
 
 
 
 
 
b31ccfa
 
10b9217
b31ccfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c73c4b9
b31ccfa
 
 
 
 
 
 
 
c73c4b9
 
 
b31ccfa
 
 
 
 
 
 
 
 
c73c4b9
15498d2
c73c4b9
 
 
15498d2
 
 
 
b31ccfa
 
 
 
 
c73c4b9
 
 
 
 
 
 
 
 
 
15498d2
c73c4b9
 
b31ccfa
 
c73c4b9
b31ccfa
 
 
 
 
 
c73c4b9
 
 
 
 
 
 
 
b31ccfa
c73c4b9
b31ccfa
 
c73c4b9
b31ccfa
 
c73c4b9
b31ccfa
 
 
 
 
8ba2fce
c73c4b9
 
 
 
15498d2
 
8ba2fce
b31ccfa
 
 
4dfb914
b31ccfa
8ba2fce
b31ccfa
c73c4b9
 
 
 
 
 
 
b31ccfa
c73c4b9
 
 
 
 
 
 
15498d2
c73c4b9
b31ccfa
c73c4b9
 
 
 
 
 
 
 
 
b31ccfa
 
 
 
 
 
 
 
c73c4b9
 
 
 
 
 
 
b31ccfa
c73c4b9
 
 
 
 
b31ccfa
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import gradio as gr
from pathlib import Path
from matchms import Spectrum
from typing import List, Optional, Literal
import tempfile
import numpy as np
from simms.similarity import CudaCosineGreedy, CudaModifiedCosine
from matchms.importing import load_from_mgf
from matchms import calculate_scores
import matplotlib.pyplot as plt
import pickle

# os.system("nvidia-smi")
# print("TORCH_CUDA", torch.cuda.is_available())

def preprocess_spectra(spectra: List[Spectrum]) -> List[Spectrum]:
    """Apply the default matchms filtering pipeline to every spectrum.

    Each spectrum is restricted to the 10-1000 m/z range, intensity-normalized,
    stripped of peaks below 0.1% relative intensity, and truncated to at most
    1024 peaks. matchms filters return ``None`` for spectra that fail a
    criterion; those are dropped here so downstream similarity code never
    receives ``None`` entries.

    Args:
        spectra: Spectra to preprocess.

    Returns:
        A (possibly shorter) list of filtered spectra, ``None``-free.
    """
    # Imported lazily to keep module import time low, matching the original style.
    from matchms.filtering import (
        normalize_intensities,
        reduce_to_number_of_peaks,
        select_by_mz,
        select_by_relative_intensity,
    )

    def process_spectrum(spectrum: Spectrum) -> Optional[Spectrum]:
        """
        One of the many ways to preprocess the spectrum - we use this by default.
        """
        spectrum = select_by_mz(spectrum, mz_from=10.0, mz_to=1000.0)
        spectrum = normalize_intensities(spectrum)
        spectrum = select_by_relative_intensity(spectrum, intensity_from=0.001)
        spectrum = reduce_to_number_of_peaks(spectrum, n_max=1024)
        return spectrum

    # Drop spectra the filters rejected (returned as None) — the callers'
    # emptiness checks rely on rejected spectra actually being removed.
    return [s for s in (process_spectrum(s) for s in spectra) if s is not None]

def run(r_filepath:Path, q_filepath:Path,
        similarity_method: Literal['CosineGreedy','ModifiedCosine'] = 'CosineGreedy',
        tolerance: float = 0.1,
        mz_power: float = 0.0,
        intensity_power: float = 1.0,
        shift: float = 0,
        batch_size: int = 2048,
        n_max_peaks: int = 1024,
        match_limit: int = 2048,
        array_type: Literal['sparse','numpy'] = "numpy",
        sparse_threshold: float = .75,
        do_preprocess: bool = False,
        ):
    """Compute a GPU similarity matrix between two MGF files and export it.

    Loads reference and query spectra, optionally preprocesses them, runs the
    selected SimMS CUDA similarity kernel, and writes three artifacts to temp
    files: a .jpg visualisation of every output channel, a .npz of the raw
    score array, and a .pickle of the full matchms ``Scores`` object.

    Args:
        r_filepath / q_filepath: Paths to the reference / query MGF files.
        similarity_method: Which SimMS kernel to use.
        tolerance, mz_power, intensity_power, shift: Cosine-score parameters
            (``shift`` is ignored by ModifiedCosine, which computes it itself).
        batch_size: Spectra compared per GPU batch (clamped to input size).
        n_max_peaks: Max m/z peaks considered per spectrum.
        match_limit: Max m/z pairs considered before stopping.
        array_type: 'numpy' for dense output, 'sparse' to discard low scores.
        sparse_threshold: Scores below this are dropped when sparse.
        do_preprocess: Whether to run ``preprocess_spectra`` on both inputs.

    Returns:
        Tuple of (npz path, visualisation jpg path, pickle path).

    Raises:
        gr.Error: If an input file is missing or filtering empties an input.
    """
    print('\n>>>>', r_filepath, q_filepath, array_type, '\n')

    # Raised (not just constructed) so Gradio actually shows the message;
    # also survives `python -O`, unlike the previous asserts.
    if r_filepath is None:
        raise gr.Error("Reference file is missing.")
    if q_filepath is None:
        raise gr.Error("Query file is missing.")

    refs, ques = list(load_from_mgf(str(r_filepath))), list(load_from_mgf(str(q_filepath)))

    if do_preprocess:
        refs = preprocess_spectra(refs)
        ques = preprocess_spectra(ques)
        if not refs:
            raise gr.Error("References are empty after filtering")
        if not ques:
            raise gr.Error("Queries are empty after filtering")
    else:
        gr.Warning("Filtering is skipped. Malformed spectra can cause errors.")

    # If we have small spectra, don't make a huge batch
    if batch_size > max(len(refs), len(ques)):
        batch_size = max(len(refs), len(ques))

    kwargs = dict(tolerance=tolerance, mz_power=mz_power, intensity_power=intensity_power, shift=shift, batch_size=batch_size, 
              n_max_peaks=n_max_peaks, match_limit=match_limit, sparse_threshold=sparse_threshold)

    if similarity_method == 'ModifiedCosine':
        # ModifiedCosine derives the shift from precursor masses itself; warn
        # (don't abort) and drop the user-supplied value, as promised.
        if shift != 0:
            gr.Warning("`ModifiedCosine` can not use shift - we will proceed as if shift is 0")
        kwargs.pop('shift')

    similarity_class = CudaCosineGreedy if similarity_method == 'CosineGreedy' else CudaModifiedCosine

    scores_obj = calculate_scores(
        refs, ques, 
        similarity_function=similarity_class(**kwargs),
        array_type=array_type
    )

    score_vis = tempfile.NamedTemporaryFile(suffix='.jpg', delete=False)

    scores = scores_obj.to_array()

    outputs = len(scores.dtype.names)

    fig, axs = plt.subplots(1, outputs,
                            figsize=(5*outputs, 5))
    # plt.subplots returns a bare Axes (not an array) when outputs == 1;
    # atleast_1d makes the zip below safe in both cases.
    for title, ax in zip(scores.dtype.names, np.atleast_1d(axs)):
        ax.imshow(scores[title])
        ax.set_title(title)

    plt.suptitle("Output values")
    plt.savefig(score_vis.name)

    score = tempfile.NamedTemporaryFile(prefix='scores-', suffix='.npz', delete=False)
    np.savez(score.name, scores=scores)

    pickle_ = tempfile.NamedTemporaryFile(prefix='scores-', suffix='.pickle', delete=False)

    Path(pickle_.name).write_bytes(pickle.dumps(scores_obj))
    return score.name,  score_vis.name, pickle_.name

# Gradio UI: wires the form inputs below to `run` and renders its three outputs.
with gr.Blocks() as demo:
    gr.Markdown("""
    # SimMS: A GPU-Accelerated Cosine Similarity implementation for Tandem Mass Spectrometry
     
    Calculate cosine greedy similarity matrix using CUDA. See the [main repo](https://github.com/pangeai/simms) for this project. 
    This approach is x100-x500 faster than [MatchMS](https://github.com/matchms/matchms). Upload your MGF files below, or run the sample `pesticides.mgf` files against each other.
    
    **In case of errors, check the "logs" above - malformed spectra will cause errors**
    """)
    # Input files — both default to the bundled sample so "Run" works out of the box.
    with gr.Row():
        refs = gr.File(label="Upload REFERENCES.mgf",
                       interactive=True,
                               value='pesticides.mgf')
        ques = gr.File(label="Upload QUERIES.mgf",
                       interactive=True, value='pesticides.mgf')
    # Similarity-kernel parameters (passed through to CudaCosineGreedy /
    # CudaModifiedCosine). NOTE(review): the radio defaults to 'ModifiedCosine'
    # while `run`'s own default is 'CosineGreedy' — confirm this is intended.
    with gr.Row():
        similarity_method = gr.Radio(['CosineGreedy', 'ModifiedCosine'], value='ModifiedCosine', type='value',
                                     info="Choose one of the supported similarity methods. Need more? Let us know in github issues."
                                     )
        tolerance = gr.Number(value=0.1, label="tolerance")
        mz_power = gr.Number(value=0.0, label="m/z power")
        intensity_power = gr.Number(value=1.0, label="intensity power")
        shift = gr.Number(value=0, label="mass shift")
    # Performance / capacity knobs.
    with gr.Row():
        batch_size = gr.Number(value=2048, label="Batch Size", 
                                info='Compare this many spectra to same amount of other spectra at each iteration.')
        n_max_peaks = gr.Number(value=1024, label="Maximum Number of Peaks", 
                                info="Consider this many m/z peaks at most, per spectrum.")
        match_limit = gr.Number(value=2048, label="Match Limit", 
                                info="Consider this many pairs of m/z before stopping. "
                                    "In practice, a value of 2048 gives more than 99.99% accuracy on GNPS")
        do_preprocess = gr.Checkbox(value=True, label="filter spectra", 
                                    info="If you want to filter spectra before processing, we can do that. Look at the code to see details.")
    # Output-format options: dense numpy array vs. thresholded sparse output.
    with gr.Row():
        array_type = gr.Radio(['numpy', 'sparse'], 
                              value='numpy', type='value',
                              label='If `sparse`, everything with score less than `sparse_threshold` will be discarded.'
                                    'If `numpy`, we disable sparse behaviour.')
        sparse_threshold = gr.Slider(minimum=0, maximum=1, value=0.75, label="Sparse Threshold",
                                        info="For very large results, when comparing, more than 10k x 10k, the output dense score matrix can grow too large for RAM."
                                        "While most of the scores aren't useful (near zero). This argument discards all scores less than sparse_threshold, and returns "
                                        "results as a SparseStack format."
                                        )
    # Rendered visualisation of every output channel of the score array.
    with gr.Row():
        score_vis = gr.Image()

    # Downloadable artifacts produced by `run`.
    with gr.Row():
        out_npz = gr.File(label="Download similarity matrix (.npz format)", 
                      interactive=False)
        out_pickle = gr.File(label="Download full `Scores` object (.pickle format)", 
                      interactive=False)
    gr.Markdown("""
            **NOTE** You can use this snippet to use the downloaded array:
            ```py
            import numpy as np
            arr = np.load('scores-nr0hqp85.npz')['scores']
            print(arr)
            ```""")
    # Input order here must match `run`'s positional parameter order exactly.
    btn = gr.Button("Run")
    btn.click(fn=run, 
            inputs=[refs, ques, similarity_method, tolerance, mz_power, intensity_power, shift, 
                            batch_size, n_max_peaks, match_limit, 
                            array_type, sparse_threshold, do_preprocess], 
            outputs=[out_npz, score_vis, out_pickle])

if __name__ == "__main__":
    demo.launch(debug=True)