libpappsomspp
Library for mass spectrometry
semiglobalalignment.h
Go to the documentation of this file.
1/**
2 * \file pappsomspp/processing/specpeptidoms/semiglobalalignment.h
3 * \date 24/03/2025
4 * \author Aurélien Berthier
5 * \brief protein to spectrum alignment
6 *
7 * C++ implementation of the SpecPeptidOMS algorithm described in :
8 * (1) Benoist, É.; Jean, G.; Rogniaux, H.; Fertin, G.; Tessier, D. SpecPeptidOMS Directly and
9 * Rapidly Aligns Mass Spectra on Whole Proteomes and Identifies Peptides That Are Not Necessarily
10 * Tryptic: Implications for Peptidomics. J. Proteome Res. 2025.
11 * https://doi.org/10.1021/acs.jproteome.4c00870.
12 */
13
14/*
15 * Copyright (c) 2025 Aurélien Berthier
16 * <aurelien.berthier@ls2n.fr>
17 *
18 * This program is free software: you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation, either version 3 of the License, or
21 * (at your option) any later version.
22 *
23 * This program is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 * GNU General Public License for more details.
27 *
28 * You should have received a copy of the GNU General Public License
29 * along with this program. If not, see <http://www.gnu.org/licenses/>.
30 */
31
32#pragma once
33
34#include <boost/numeric/ublas/matrix.hpp>
35#include "spomsspectrum.h"
36#include "../../protein/protein.h"
37#include "scorevalues.h"
38#include "locationsaver.h"
39#include "scenario.h"
40
41namespace pappso
42{
43namespace specpeptidoms
44{
45
// Plain record describing one cell; SemiGlobalAlignment keeps these in m_interest_cells and
// m_updated_cells.
46struct KeyCell
47{
48 std::size_t n_row; // NOTE(review): presumably the row index of the cell — confirm
49 int score; // score associated with the cell
50 std::size_t beginning; // NOTE(review): presumably where the alignment through this cell starts — confirm
// NOTE(review): listing line 51 is absent from this extract; a member may be missing here.
52};
53
55{
56 std::vector<std::size_t> peaks;
58 int score;
60 end_shift; // begin_shift represents the shift at the beginning of the spectrum, i.e. the end
61 // of the peptide sequence (in the N->C reading direction), and vice-versa for
62 // end_shift.
63 std::vector<double> shifts;
64 std::size_t SPC, beginning, end;
65};
66
68{
69 public:
70 /**
71 * Constructor (not a default one: all three arguments are required).
 * @param score_values score values, received by non-const reference
 * @param precision_ptr precision, received as a pappso::PrecisionPtr
 * @param aaCode amino acid code, received by non-const reference
 * NOTE(review): whether the object keeps references to score_values / aaCode (and therefore
 * requires them to outlive it) is not visible in this header — confirm.
72 */
73 SemiGlobalAlignment(ScoreValues &score_values,
74 const pappso::PrecisionPtr precision_ptr,
75 AaCode &aaCode);
76
77 /**
78 * Destructor
79 */
81
82 /**
83 * @brief performs the first alignment search between a protein sequence and a spectrum. The member
84 * location heap is filled with the candidate locations.
85 * @param spectrum Spectrum to align
86 * @param protein_seq Sequence of the protein to align
 * @param protein_id Identifier of the protein to align
87 */
88 void fastAlign(const SpOMSSpectrumCsp &spectrum,
89 const QString &protein_seq,
90 const QString &protein_id);
91
92 /**
93 * @brief performs the second alignment search between a protein subsequence and a spectrum.
94 * Open question (translated from French): implement the origin matrix as a tree?
 * @param spectrum Spectrum to align
 * @param protein_seq Sequence of the protein
 * @param protein_id Identifier of the protein
 * @param beginning NOTE(review): presumably the start of the subsequence within protein_seq —
 * confirm
 * @param length NOTE(review): presumably the length of the subsequence — confirm
95 */
96 void preciseAlign(const SpOMSSpectrumCsp &spectrum,
97 const QString &protein_seq,
98 const QString &protein_id,
99 const std::size_t beginning,
100 const std::size_t length);
101
102 /**
103 * @brief performs the post-processing: generates corrected spectra and aligns them
 * @param spectrum Spectrum to align
 * @param protein_seq Sequence of the protein
 * @param protein_id Identifier of the protein
 * @param beginning NOTE(review): presumably the start of the subsequence within protein_seq —
 * confirm
 * @param length NOTE(review): presumably the length of the subsequence — confirm
104 * @param shifts List of potential precursor mass errors to test
105 */
106 void postProcessingAlign(const SpOMSSpectrumCsp &spectrum,
107 const QString &protein_seq,
108 const QString &protein_id,
109 std::size_t beginning,
110 std::size_t length,
111 const std::vector<double> &shifts);
112
/** @brief returns a Scenario by value. NOTE(review): presumably a copy of the m_scenario member
 * named in the saveBestAlignment documentation — confirm. */
114 Scenario getScenario() const;
/** @brief returns a const reference to an Alignment. NOTE(review): presumably the
 * m_best_alignment member named in the saveBestAlignment documentation; the role of the spectrum
 * argument is not visible in this header — confirm. */
115 const Alignment &getBestAlignment(const SpOMSSpectrumCsp &spectrum) const;
116
/** @brief static helper returning a list of doubles computed from an alignment and a protein
 * sequence. NOTE(review): judging by the name, presumably the potential precursor mass errors
 * that postProcessingAlign receives as its shifts argument — confirm. */
117 static std::vector<double> getPotentialMassErrors(const Alignment &alignment,
118 const QString &protein_seq);
119
120 private:
// NOTE(review): listing lines 123 and 125-129 are absent from this extract; further members
// (e.g. the m_scenario and m_best_alignment named in the saveBestAlignment documentation) are
// presumably declared there.
121 std::vector<KeyCell> m_interest_cells; // list of KeyCell records
122 std::vector<std::pair<std::size_t, KeyCell>> m_updated_cells; // (index, KeyCell) pairs; NOTE(review): meaning of the index not visible here
124 const int min_score = 15; // fixed score threshold; NOTE(review): how it is applied is not visible in this header
130
131 /**
132 * @brief Stores the best alignment from m_scenario in m_best_alignment
 * @param sequence sequence used for the alignment (received by value, i.e. copied)
 * @param spectrum Spectrum being aligned
 * @param offset NOTE(review): presumably the position of the aligned subsequence within the
 * whole protein — confirm
133 */
134 void
135 saveBestAlignment(const QString sequence, const SpOMSSpectrumCsp &spectrum, std::size_t offset);
136
/**
 * NOTE(review): undocumented in the original. Judging by the names, presumably redoes the
 * alignment after removing the listed peaks from the spectrum — confirm against the
 * implementation.
 * @param protein_seq Sequence of the protein
 * @param protein_id Identifier of the protein
 * @param spectrum Spectrum being aligned
 * @param peaks_to_remove list of peak indices (received by value, i.e. copied)
 * @param offset NOTE(review): presumably the position of the aligned subsequence within the
 * whole protein — confirm
 */
137 void correctAlign(const QString &protein_seq,
138 const QString &protein_id,
139 const SpOMSSpectrumCsp &spectrum,
140 std::vector<std::size_t> peaks_to_remove,
141 std::size_t offset);
142
143 /**
144 * @brief updates the scores of the alignment matrix for a given amino acid as well as the
145 * location heap/scenario.
146 * @param sequence Reversed sequence of the protein being aligned
147 * @param row_number number of the row to update (== index in sequence of the amino acid being
148 * aligned)
149 * @param aa_positions list of the AaPositions of the current amino acid (received by value)
150 * @param spectrum Spectrum being aligned
151 * @param fast_align Whether to use the fast version of the algorithm (for 1st alignment step)
 * @param protein NOTE(review): undocumented in the original; presumably the identifier or the
 * sequence of the protein — confirm
152 */
153 void updateAlignmentMatrix(const QString &sequence,
154 const std::size_t row_number,
155 const std::vector<AaPosition> aa_positions,
156 const SpOMSSpectrumCsp &spectrum,
157 const bool fast_align,
158 const QString &protein);
159
160 /**
161 * @brief indicates if a perfect shift is possible between the provided positions
162 * @param sequence Reversed sequence of the protein being aligned
163 * @param spectrum Spectrum being aligned
164 * @param origin_row beginning row of the aa gap to verify (== index of the first missing aa in
165 * sequence)
166 * @param current_row row being processed (== index of the current AaPosition in sequence)
167 * @param l_peak left peak index of the mz gap to verify
168 * @param r_peak right peak index of the mz gap to verify
 * @return true if a perfect shift is possible, false otherwise
169 */
170 bool perfectShiftPossible(const QString &sequence,
171 const SpOMSSpectrumCsp &spectrum,
172 const std::size_t origin_row,
173 const std::size_t current_row,
174 const std::size_t l_peak,
175 const std::size_t r_peak) const;
176
/**
 * Variant of perfectShiftPossible that takes no origin_row / l_peak and returns a std::size_t
 * instead of a bool.
 * NOTE(review): presumably checks a shift starting from row/peak 0; the meaning of the returned
 * value is not documented in this header — confirm against the implementation.
 */
177 std::size_t perfectShiftPossibleFrom0(const QString &sequence,
178 const SpOMSSpectrumCsp &spectrum,
179 const std::size_t current_row,
180 const std::size_t r_peak) const;
181
/**
 * Variant of perfectShiftPossible that takes an end row and an end peak and returns a
 * std::size_t instead of a bool.
 * NOTE(review): presumably checks a shift at the end of the alignment; the meaning of the
 * returned value is not documented in this header — confirm against the implementation.
 */
182 std::size_t perfectShiftPossibleEnd(const QString &sequence,
183 const SpOMSSpectrumCsp &spectrum,
184 std::size_t end_row,
185 std::size_t end_peak) const;
186};
187} // namespace specpeptidoms
188} // namespace pappso
collection of integer code for each amino acid 0 => null 1 to 20 => amino acid sorted by their mass (...
Definition: aacode.h:44
void preciseAlign(const SpOMSSpectrumCsp &spectrum, const QString &protein_seq, const QString &protein_id, const std::size_t beginning, const std::size_t length)
performs the second alignment search between a protein subsequence and a spectrum....
void updateAlignmentMatrix(const QString &sequence, const std::size_t row_number, const std::vector< AaPosition > aa_positions, const SpOMSSpectrumCsp &spectrum, const bool fast_align, const QString &protein)
updates the scores of the alignment matrix for a given amino acid as well as the location heap/scenar...
SemiGlobalAlignment(ScoreValues &score_values, const pappso::PrecisionPtr precision_ptr, AaCode &aaCode)
void fastAlign(const SpOMSSpectrumCsp &spectrum, const QString &protein_seq, const QString &protein_id)
perform the first alignment search between a protein sequence and a spectrum. The member location hea...
void postProcessingAlign(const SpOMSSpectrumCsp &spectrum, const QString &protein_seq, const QString &protein_id, std::size_t beginning, std::size_t length, const std::vector< double > &shifts)
performs the post-processing : generates corrected spectra and align them
const Alignment & getBestAlignment(const SpOMSSpectrumCsp &spectrum) const
void saveBestAlignment(const QString sequence, const SpOMSSpectrumCsp &spectrum, std::size_t offset)
Stores the best alignment from m_scenario in m_best_alignment.
bool perfectShiftPossible(const QString &sequence, const SpOMSSpectrumCsp &spectrum, const std::size_t origin_row, const std::size_t current_row, const std::size_t l_peak, const std::size_t r_peak) const
indicates if a perfect shift is possible between the provided positions
void correctAlign(const QString &protein_seq, const QString &protein_id, const SpOMSSpectrumCsp &spectrum, std::vector< std::size_t > peaks_to_remove, std::size_t offset)
std::vector< std::pair< std::size_t, KeyCell > > m_updated_cells
static std::vector< double > getPotentialMassErrors(const Alignment &alignment, const QString &protein_seq)
std::size_t perfectShiftPossibleFrom0(const QString &sequence, const SpOMSSpectrumCsp &spectrum, const std::size_t current_row, const std::size_t r_peak) const
std::size_t perfectShiftPossibleEnd(const QString &sequence, const SpOMSSpectrumCsp &spectrum, std::size_t end_row, std::size_t end_peak) const
save protein subsequences for alignment
std::shared_ptr< const SpOMSSpectrum > SpOMSSpectrumCsp
Definition: spomsspectrum.h:65
tries to keep as much as possible monoisotopes, removing any possible C13 peaks and changes multichar...
Definition: aa.cpp:39
backtracking of 2nd alignment
SpecPeptidOMS Spectrum.
std::vector< std::size_t > peaks