File size: 8,319 Bytes
b31ccfa
 
 
 
c73c4b9
 
 
 
 
 
 
 
b31ccfa
 
10b9217
b31ccfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c73c4b9
b31ccfa
 
 
 
 
 
 
 
c73c4b9
 
 
b31ccfa
 
 
 
 
 
 
 
 
c73c4b9
15498d2
c73c4b9
 
 
15498d2
 
 
 
b31ccfa
 
 
 
 
c73c4b9
 
 
 
 
 
 
 
 
 
15498d2
c73c4b9
 
b31ccfa
 
c73c4b9
b31ccfa
 
 
 
 
 
c73c4b9
 
 
 
 
 
 
 
b31ccfa
c73c4b9
b31ccfa
 
c73c4b9
b31ccfa
 
c73c4b9
b31ccfa
 
 
 
 
8ba2fce
c73c4b9
 
 
 
15498d2
 
8ba2fce
b31ccfa
 
 
4dfb914
b31ccfa
8ba2fce
b31ccfa
c73c4b9
 
 
 
 
 
 
b31ccfa
c73c4b9
 
 
 
 
 
 
15498d2
c73c4b9
b31ccfa
c73c4b9
 
 
 
 
 
 
 
 
b31ccfa
 
 
 
 
 
 
 
c73c4b9
 
 
 
 
 
 
b31ccfa
c73c4b9
 
 
 
 
b31ccfa
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import gradio as gr
from pathlib import Path
from matchms import Spectrum
from typing import List, Optional, Literal
import tempfile
import numpy as np
from simms.similarity import CudaCosineGreedy, CudaModifiedCosine
from matchms.importing import load_from_mgf
from matchms import calculate_scores
import matplotlib.pyplot as plt
import pickle

# os.system("nvidia-smi")
# print("TORCH_CUDA", torch.cuda.is_available())

def preprocess_spectra(spectra: List[Spectrum]) -> List[Spectrum]:
    """Apply the default matchms filtering pipeline to every spectrum.

    Each spectrum is restricted to the 10-1000 m/z range, intensity-normalized,
    stripped of peaks below 0.1% relative intensity, and truncated to at most
    1024 peaks. matchms filters return ``None`` for spectra that fail a
    criterion; those are dropped here so downstream similarity code never
    receives ``None`` entries.

    Args:
        spectra: Spectra to preprocess.

    Returns:
        A (possibly shorter) list of filtered spectra, ``None``-free.
    """
    # Imported lazily to keep module import time low, matching the original style.
    from matchms.filtering import (
        normalize_intensities,
        reduce_to_number_of_peaks,
        select_by_mz,
        select_by_relative_intensity,
    )

    def process_spectrum(spectrum: Spectrum) -> Optional[Spectrum]:
        """
        One of the many ways to preprocess the spectrum - we use this by default.
        """
        spectrum = select_by_mz(spectrum, mz_from=10.0, mz_to=1000.0)
        spectrum = normalize_intensities(spectrum)
        spectrum = select_by_relative_intensity(spectrum, intensity_from=0.001)
        spectrum = reduce_to_number_of_peaks(spectrum, n_max=1024)
        return spectrum

    # Drop spectra the filters rejected (returned as None) — the callers'
    # emptiness checks rely on rejected spectra actually being removed.
    return [s for s in (process_spectrum(s) for s in spectra) if s is not None]

def run(r_filepath:Path, q_filepath:Path,
        similarity_method: Literal['CosineGreedy','ModifiedCosine'] = 'CosineGreedy',
        tolerance: float = 0.1,
        mz_power: float = 0.0,
        intensity_power: float = 1.0,
        shift: float = 0,
        batch_size: int = 2048,
        n_max_peaks: int = 1024,
        match_limit: int = 2048,
        array_type: Literal['sparse','numpy'] = "numpy",
        sparse_threshold: float = .75,
        do_preprocess: bool = False,
        ):
    """Compute a GPU similarity matrix between two MGF files and export it.

    Loads reference and query spectra, optionally preprocesses them, runs the
    selected SimMS CUDA similarity kernel, and writes three artifacts to temp
    files: a .jpg visualisation of every output channel, a .npz of the raw
    score array, and a .pickle of the full matchms ``Scores`` object.

    Args:
        r_filepath / q_filepath: Paths to the reference / query MGF files.
        similarity_method: Which SimMS kernel to use.
        tolerance, mz_power, intensity_power, shift: Cosine-score parameters
            (``shift`` is ignored by ModifiedCosine, which computes it itself).
        batch_size: Spectra compared per GPU batch (clamped to input size).
        n_max_peaks: Max m/z peaks considered per spectrum.
        match_limit: Max m/z pairs considered before stopping.
        array_type: 'numpy' for dense output, 'sparse' to discard low scores.
        sparse_threshold: Scores below this are dropped when sparse.
        do_preprocess: Whether to run ``preprocess_spectra`` on both inputs.

    Returns:
        Tuple of (npz path, visualisation jpg path, pickle path).

    Raises:
        gr.Error: If an input file is missing or filtering empties an input.
    """
    print('\n>>>>', r_filepath, q_filepath, array_type, '\n')

    # Raised (not just constructed) so Gradio actually shows the message;
    # also survives `python -O`, unlike the previous asserts.
    if r_filepath is None:
        raise gr.Error("Reference file is missing.")
    if q_filepath is None:
        raise gr.Error("Query file is missing.")

    refs, ques = list(load_from_mgf(str(r_filepath))), list(load_from_mgf(str(q_filepath)))

    if do_preprocess:
        refs = preprocess_spectra(refs)
        ques = preprocess_spectra(ques)
        if not refs:
            raise gr.Error("References are empty after filtering")
        if not ques:
            raise gr.Error("Queries are empty after filtering")
    else:
        gr.Warning("Filtering is skipped. Malformed spectra can cause errors.")

    # If we have small spectra, don't make a huge batch
    if batch_size > max(len(refs), len(ques)):
        batch_size = max(len(refs), len(ques))

    kwargs = dict(tolerance=tolerance, mz_power=mz_power, intensity_power=intensity_power, shift=shift, batch_size=batch_size, 
              n_max_peaks=n_max_peaks, match_limit=match_limit, sparse_threshold=sparse_threshold)

    if similarity_method == 'ModifiedCosine':
        # ModifiedCosine derives the shift from precursor masses itself; warn
        # (don't abort) and drop the user-supplied value, as promised.
        if shift != 0:
            gr.Warning("`ModifiedCosine` can not use shift - we will proceed as if shift is 0")
        kwargs.pop('shift')

    similarity_class = CudaCosineGreedy if similarity_method == 'CosineGreedy' else CudaModifiedCosine

    scores_obj = calculate_scores(
        refs, ques, 
        similarity_function=similarity_class(**kwargs),
        array_type=array_type
    )

    score_vis = tempfile.NamedTemporaryFile(suffix='.jpg', delete=False)

    scores = scores_obj.to_array()

    outputs = len(scores.dtype.names)

    fig, axs = plt.subplots(1, outputs,
                            figsize=(5*outputs, 5))
    # plt.subplots returns a bare Axes (not an array) when outputs == 1;
    # atleast_1d makes the zip below safe in both cases.
    for title, ax in zip(scores.dtype.names, np.atleast_1d(axs)):
        ax.imshow(scores[title])
        ax.set_title(title)

    plt.suptitle("Output values")
    plt.savefig(score_vis.name)

    score = tempfile.NamedTemporaryFile(prefix='scores-', suffix='.npz', delete=False)
    np.savez(score.name, scores=scores)

    pickle_ = tempfile.NamedTemporaryFile(prefix='scores-', suffix='.pickle', delete=False)

    Path(pickle_.name).write_bytes(pickle.dumps(scores_obj))
    return score.name,  score_vis.name, pickle_.name

# Gradio UI: wires the form inputs below to `run` and renders its three outputs.
with gr.Blocks() as demo:
    gr.Markdown("""
    # SimMS: A GPU-Accelerated Cosine Similarity implementation for Tandem Mass Spectrometry
     
    Calculate cosine greedy similarity matrix using CUDA. See the [main repo](https://github.com/pangeai/simms) for this project. 
    This approach is x100-x500 faster than [MatchMS](https://github.com/matchms/matchms). Upload your MGF files below, or run the sample `pesticides.mgf` files against each other.
    
    **In case of errors, check the "logs" above - malformed spectra will cause errors**
    """)
    # Input files — both default to the bundled sample so "Run" works out of the box.
    with gr.Row():
        refs = gr.File(label="Upload REFERENCES.mgf",
                       interactive=True,
                               value='pesticides.mgf')
        ques = gr.File(label="Upload QUERIES.mgf",
                       interactive=True, value='pesticides.mgf')
    # Similarity-kernel parameters (passed through to CudaCosineGreedy /
    # CudaModifiedCosine). NOTE(review): the radio defaults to 'ModifiedCosine'
    # while `run`'s own default is 'CosineGreedy' — confirm this is intended.
    with gr.Row():
        similarity_method = gr.Radio(['CosineGreedy', 'ModifiedCosine'], value='ModifiedCosine', type='value',
                                     info="Choose one of the supported similarity methods. Need more? Let us know in github issues."
                                     )
        tolerance = gr.Number(value=0.1, label="tolerance")
        mz_power = gr.Number(value=0.0, label="m/z power")
        intensity_power = gr.Number(value=1.0, label="intensity power")
        shift = gr.Number(value=0, label="mass shift")
    # Performance / capacity knobs.
    with gr.Row():
        batch_size = gr.Number(value=2048, label="Batch Size", 
                                info='Compare this many spectra to same amount of other spectra at each iteration.')
        n_max_peaks = gr.Number(value=1024, label="Maximum Number of Peaks", 
                                info="Consider this many m/z peaks at most, per spectrum.")
        match_limit = gr.Number(value=2048, label="Match Limit", 
                                info="Consider this many pairs of m/z before stopping. "
                                    "In practice, a value of 2048 gives more than 99.99% accuracy on GNPS")
        do_preprocess = gr.Checkbox(value=True, label="filter spectra", 
                                    info="If you want to filter spectra before processing, we can do that. Look at the code to see details.")
    # Output-format options: dense numpy array vs. thresholded sparse output.
    with gr.Row():
        array_type = gr.Radio(['numpy', 'sparse'], 
                              value='numpy', type='value',
                              label='If `sparse`, everything with score less than `sparse_threshold` will be discarded.'
                                    'If `numpy`, we disable sparse behaviour.')
        sparse_threshold = gr.Slider(minimum=0, maximum=1, value=0.75, label="Sparse Threshold",
                                        info="For very large results, when comparing, more than 10k x 10k, the output dense score matrix can grow too large for RAM."
                                        "While most of the scores aren't useful (near zero). This argument discards all scores less than sparse_threshold, and returns "
                                        "results as a SparseStack format."
                                        )
    # Rendered visualisation of every output channel of the score array.
    with gr.Row():
        score_vis = gr.Image()

    # Downloadable artifacts produced by `run`.
    with gr.Row():
        out_npz = gr.File(label="Download similarity matrix (.npz format)", 
                      interactive=False)
        out_pickle = gr.File(label="Download full `Scores` object (.pickle format)", 
                      interactive=False)
    gr.Markdown("""
            **NOTE** You can use this snippet to use the downloaded array:
            ```py
            import numpy as np
            arr = np.load('scores-nr0hqp85.npz')['scores']
            print(arr)
            ```""")
    # Input order here must match `run`'s positional parameter order exactly.
    btn = gr.Button("Run")
    btn.click(fn=run, 
            inputs=[refs, ques, similarity_method, tolerance, mz_power, intensity_power, shift, 
                            batch_size, n_max_peaks, match_limit, 
                            array_type, sparse_threshold, do_preprocess], 
            outputs=[out_npz, score_vis, out_pickle])

if __name__ == "__main__":
    demo.launch(debug=True)