Source code for ms2pip.search_space

"""
Define and build the search space for in silico spectral library generation.

This module defines the search space for in silico spectral library generation as a
:py:class:`~ProteomeSearchSpace` object. Variable and fixed modifications can be configured
as :py:class:`~ModificationConfig` objects.

The peptide search space can be built from a protein FASTA file and a set of parameters, which can
then be converted to a :py:class:`psm_utils.PSMList` object for use in :py:mod:`ms2pip`. All
parameters are listed below at :py:class:`~ProteomeSearchSpace` and can be passed as a
dictionary, a JSON file, or as a :py:class:`~ProteomeSearchSpace` object. For example:

.. code-block:: json

   {
     "fasta_file": "test.fasta",
     "min_length": 8,
     "max_length": 3,
     "cleavage_rule": "trypsin",
     "missed_cleavages": 2,
     "semi_specific": false,
     "add_decoys": true,
     "modifications": [
       {
         "label": "UNIMOD:Oxidation",
         "amino_acid": "M"
       },
       {
         "label": "UNIMOD:Carbamidomethyl",
         "amino_acid": "C",
         "fixed": true
       }
     ],
     "max_variable_modifications": 3,
     "charges": [2, 3]
   }


For an unspecific protein digestion, the cleavage rule can be set to ``unspecific``. This will
result in a cleavage rule that allows cleavage after any amino acid with an unlimited number of
allowed missed cleavages.

To disable protein digestion when the FASTA file contains peptides, set the cleavage rule to
``-``. This will treat each line in the FASTA file as a separate peptide sequence, but still
allow for modifications and charges to be added.


Examples
--------
>>> from ms2pip.search_space import ProteomeSearchSpace, ModificationConfig
>>> search_space = ProteomeSearchSpace(
...     fasta_file="tests/data/test_proteins.fasta",
...     min_length=8,
...     max_length=30,
...     cleavage_rule="trypsin",
...     missed_cleavages=2,
...     semi_specific=False,
...     modifications=[
...         ModificationConfig(label="UNIMOD:Oxidation", amino_acid="M"),
...         ModificationConfig(label="UNIMOD:Carbamidomethyl", amino_acid="C", fixed=True),
...     ],
...     charges=[2, 3],
... )
>>> psm_list = search_space.into_psm_list()

>>> from ms2pip.search_space import ProteomeSearchSpace
>>> search_space = ProteomeSearchSpace.from_any("tests/data/test_search_space.json")
>>> psm_list = search_space.into_psm_list()

"""

from __future__ import annotations

import multiprocessing
import multiprocessing.dummy
from collections import defaultdict
from functools import partial
from itertools import chain, combinations, product
from logging import getLogger
from pathlib import Path
from typing import Any, Dict, Generator, List, Optional, Union

import numpy as np
import pyteomics.fasta
from psm_utils import PSM, PSMList
from pydantic import BaseModel, field_validator, model_validator
from pyteomics.parser import icleave
from rich.progress import track

logger = getLogger(__name__)


[docs]class ModificationConfig(BaseModel): """Configuration for a single modification in the search space.""" label: str amino_acid: Optional[str] = None peptide_n_term: Optional[bool] = False protein_n_term: Optional[bool] = False peptide_c_term: Optional[bool] = False protein_c_term: Optional[bool] = False fixed: Optional[bool] = False def __init__(self, **data: Any): """ Configuration for a single modification in the search space. Parameters ---------- label Label of the modification. This can be any valid ProForma 2.0 label. amino_acid Amino acid target of the modification. :py:obj:`None` if the modification is not specific to an amino acid. Default is None. peptide_n_term Whether the modification occurs only on the peptide N-terminus. Default is False. protein_n_term Whether the modification occurs only on the protein N-terminus. Default is False. peptide_c_term Whether the modification occurs only on the peptide C-terminus. Default is False. protein_c_term Whether the modification occurs only on the protein C-terminus. Default is False. fixed Whether the modification is fixed. Default is False. """ super().__init__(**data) @model_validator(mode="after") def _modification_must_have_target(self): target_fields = [ "amino_acid", "peptide_n_term", "protein_n_term", "peptide_c_term", "protein_c_term", ] if not any(getattr(self, t) for t in target_fields): raise ValueError("Modifications must have a target (amino acid or N/C-term).") return self
DEFAULT_MODIFICATIONS = [ ModificationConfig( label="UNIMOD:Oxidation", amino_acid="M", ), ModificationConfig( label="UNIMOD:Carbamidomethyl", amino_acid="C", fixed=True, ), ]
[docs]class ProteomeSearchSpace(BaseModel): """Search space for in silico spectral library generation.""" fasta_file: Path min_length: int = 8 max_length: int = 30 min_precursor_mz: Optional[float] = 0 max_precursor_mz: Optional[float] = np.Inf cleavage_rule: str = "trypsin" missed_cleavages: int = 2 semi_specific: bool = False add_decoys: bool = False modifications: List[ModificationConfig] = DEFAULT_MODIFICATIONS max_variable_modifications: int = 3 charges: List[int] = [2, 3] def __init__(self, **data: Any): """ Search space for in silico spectral library generation. Parameters ---------- fasta_file Path to FASTA file with protein sequences. min_length Minimum peptide length. Default is 8. max_length Maximum peptide length. Default is 30. min_precursor_mz Minimum precursor m/z for peptides. Default is 0. max_precursor_mz Maximum precursor m/z for peptides. Default is np.Inf. cleavage_rule Cleavage rule for peptide digestion. Default is "trypsin". missed_cleavages Maximum number of missed cleavages. Default is 2. semi_specific Allow semi-specific cleavage. Default is False. add_decoys Add decoy sequences to search space. Default is False. modifications List of modifications to consider. Default is oxidation of M and carbamidomethylation of C. max_variable_modifications Maximum number of variable modifications per peptide. Default is 3. charges List of charges to consider. Default is [2, 3]. """ super().__init__(**data) self._peptidoform_spaces: List[_PeptidoformSearchSpace] = [] @field_validator("modifications") @classmethod def _validate_modifications(cls, v): if all(isinstance(m, ModificationConfig) for m in v): return v elif all(isinstance(m, dict) for m in v): return [ModificationConfig(**modification) for modification in v] else: raise ValueError( "Modifications should be a list of dicts or ModificationConfig objects." ) @model_validator(mode="after") def _validate_unspecific_cleavage(self): """Validate and configure unspecific cleavage settings.""" # `unspecific` is not an option in pyteomics.parser.icleave, so we configure # the settings for unspecific cleavage manually. if self.cleavage_rule == "unspecific": self.missed_cleavages = self.max_length self.cleavage_rule = r"(?<=[A-Z])" return self def __len__(self): if not self._peptidoform_spaces: raise ValueError("Search space must be built before length can be determined.") return sum(len(pep_space) for pep_space in self._peptidoform_spaces)
[docs] @classmethod def from_any(cls, _input: Union[dict, str, Path, ProteomeSearchSpace]) -> ProteomeSearchSpace: """ Create ProteomeSearchSpace from various input types. Parameters ---------- _input Search space parameters as a dictionary, a path to a JSON file, an existing :py:class:`ProteomeSearchSpace` object. """ if isinstance(_input, ProteomeSearchSpace): return _input elif isinstance(_input, (str, Path)): with open(_input, "rt") as f: return cls.model_validate_json(f.read()) elif isinstance(_input, dict): return cls.model_validate(_input) else: raise ValueError("Search space must be a dict, str, Path, or ProteomeSearchSpace.")
[docs] def build(self, processes: int = 1): """ Build peptide search space from FASTA file. Parameters ---------- processes : int Number of processes to use for parallelization. """ processes = processes if processes else multiprocessing.cpu_count() self._digest_fasta(processes) self._remove_redundancy() self._add_modifications(processes) self._add_charges()
def __iter__(self) -> Generator[PSM, None, None]: """ Generate PSMs from search space. If :py:meth:`build` has not been called, the search space will first be built with the given parameters. Parameters ---------- processes : int Number of processes to use for parallelization. """ # Build search space if not already built if not self._peptidoform_spaces: raise ValueError("Search space must be built before PSMs can be generated.") spectrum_id = 0 for pep_space in self._peptidoform_spaces: for pep in pep_space: yield PSM( peptidoform=pep, spectrum_id=spectrum_id, protein_list=pep_space.proteins, ) spectrum_id += 1
[docs] def filter_psms_by_mz(self, psms: PSMList) -> PSMList: """Filter PSMs by precursor m/z range.""" return PSMList( psm_list=[ psm for psm in psms if self.min_precursor_mz <= psm.peptidoform.theoretical_mz <= self.max_precursor_mz ] )
def _digest_fasta(self, processes: int = 1): """Digest FASTA file to peptides and populate search space.""" # Convert to string to avoid issues with Path objects self.fasta_file = str(self.fasta_file) n_proteins = _count_fasta_entries(self.fasta_file) if self.add_decoys: fasta_db = pyteomics.fasta.decoy_db( self.fasta_file, mode="reverse", decoy_only=False, keep_nterm=True, ) n_proteins *= 2 else: fasta_db = pyteomics.fasta.FASTA(self.fasta_file) # Read proteins and digest to peptides with _get_pool(processes) as pool: partial_digest_protein = partial( _digest_single_protein, min_length=self.min_length, max_length=self.max_length, cleavage_rule=self.cleavage_rule, missed_cleavages=self.missed_cleavages, semi_specific=self.semi_specific, ) results = track( pool.imap(partial_digest_protein, fasta_db), total=n_proteins, description="Digesting proteins...", transient=True, ) self._peptidoform_spaces = list(chain.from_iterable(results)) def _remove_redundancy(self): """Remove redundancy in peptides and combine protein lists.""" peptide_dict = dict() for peptide in track( self._peptidoform_spaces, description="Removing peptide redundancy...", transient=True, ): if peptide.sequence in peptide_dict: peptide_dict[peptide.sequence].proteins.extend(peptide.proteins) else: peptide_dict[peptide.sequence] = peptide # Overwrite with non-redundant peptides self._peptidoform_spaces = list(peptide_dict.values()) def _add_modifications(self, processes: int = 1): """Add modifications to peptides in search space.""" modifications_by_target = _restructure_modifications_by_target(self.modifications) modification_options = [] with _get_pool(processes) as pool: partial_get_modification_versions = partial( _get_peptidoform_modification_versions, modifications=self.modifications, modifications_by_target=modifications_by_target, max_variable_modifications=self.max_variable_modifications, ) modification_options = pool.imap( partial_get_modification_versions, self._peptidoform_spaces ) for pep, mod_opt in track( zip(self._peptidoform_spaces, modification_options), description="Adding modifications...", total=len(self._peptidoform_spaces), transient=True, ): pep.modification_options = mod_opt def _add_charges(self): """Add charge permutations to peptides in search space.""" for peptide in track( self._peptidoform_spaces, description="Adding charge permutations...", transient=True, ): peptide.charge_options = self.charges
class _PeptidoformSearchSpace(BaseModel): """Search space for a given amino acid sequence.""" sequence: str proteins: List[str] is_n_term: Optional[bool] = None is_c_term: Optional[bool] = None modification_options: List[Dict[int, ModificationConfig]] = [] charge_options: List[int] = [] def __init__(self, **data: Any): """ Search space for a given amino acid sequence. Parameters ---------- sequence Amino acid sequence of the peptidoform. proteins List of protein IDs containing the peptidoform. is_n_term Whether the peptidoform is an N-terminal peptide. Default is None. is_c_term Whether the peptidoform is a C-terminal peptide. Default is None. modification_options List of dictionaries with modification positions and configurations. Default is []. charge_options List of charge states to consider. Default is []. """ super().__init__(**data) def __len__(self): return len(self.modification_options) * len(self.charge_options) def __iter__(self) -> Generator[str, None, None]: """Yield peptidoform strings with given charges and modifications.""" if not self.charge_options: raise ValueError("Peptide charge options not defined.") if not self.modification_options: raise ValueError("Peptide modification options not defined.") for modifications, charge in product(self.modification_options, self.charge_options): yield self._construct_peptidoform_string(self.sequence, modifications, charge) @staticmethod def _construct_peptidoform_string( sequence: str, modifications: Dict[int, ModificationConfig], charge: int ) -> str: if not modifications: return f"{sequence}/{charge}" modded_sequence = list(sequence) for position, mod in modifications.items(): if isinstance(position, int): aa = modded_sequence[position] if aa != mod.amino_acid: raise ValueError( f"Modification {mod.label} at position {position} does not match amino " f"acid {aa}." ) modded_sequence[position] = f"{aa}[{mod.label}]" elif position == "N": modded_sequence.insert(0, f"[{mod.label}]-") elif position == "C": modded_sequence.append(f"-[{mod.label}]") else: raise ValueError(f"Invalid position {position} for modification {mod.label}.") return f"{''.join(modded_sequence)}/{charge}" def _digest_single_protein( protein: pyteomics.fasta.Protein, min_length: int = 8, max_length: int = 30, cleavage_rule: str = "trypsin", missed_cleavages: int = 2, semi_specific: bool = False, ) -> List[_PeptidoformSearchSpace]: """Digest protein sequence and return a list of validated peptides.""" def valid_residues(sequence: str) -> bool: return not any(aa in sequence for aa in ["B", "J", "O", "U", "X", "Z"]) def parse_peptide( start_position: int, sequence: str, protein: pyteomics.fasta.Protein, ) -> _PeptidoformSearchSpace: """Parse result from parser.icleave into Peptide.""" return _PeptidoformSearchSpace( sequence=sequence, # Assumes protein ID is description until first space proteins=[protein.description.split(" ")[0]], is_n_term=start_position == 0, is_c_term=start_position + len(sequence) == len(protein.sequence), ) peptides = [ parse_peptide(start, seq, protein) for start, seq in icleave( protein.sequence, cleavage_rule, missed_cleavages=missed_cleavages, min_length=min_length, max_length=max_length, semi=semi_specific, ) if valid_residues(seq) ] return peptides def _count_fasta_entries(filename: Path) -> int: """Count the number of entries in a FASTA file.""" with open(filename, "rt") as f: count = 0 for line in f: if line[0] == ">": count += 1 return count def _restructure_modifications_by_target( modifications: List[ModificationConfig], ) -> Dict[str, Dict[str, List[ModificationConfig]]]: """Restructure variable modifications to options per side chain or terminus.""" modifications_by_target = { "sidechain": defaultdict(lambda: []), "peptide_n_term": defaultdict(lambda: []), "peptide_c_term": defaultdict(lambda: []), "protein_n_term": defaultdict(lambda: []), "protein_c_term": defaultdict(lambda: []), } def add_mod(mod, target, amino_acid): if amino_acid: modifications_by_target[target][amino_acid].append(mod) else: modifications_by_target[target]["any"].append(mod) for mod in modifications: if mod.fixed: continue if mod.peptide_n_term: add_mod(mod, "peptide_n_term", mod.amino_acid) elif mod.peptide_c_term: add_mod(mod, "peptide_c_term", mod.amino_acid) elif mod.protein_n_term: add_mod(mod, "protein_n_term", mod.amino_acid) elif mod.protein_c_term: add_mod(mod, "protein_c_term", mod.amino_acid) else: add_mod(mod, "sidechain", mod.amino_acid) return {k: dict(v) for k, v in modifications_by_target.items()} def _get_modification_possibilities_by_site( peptide: _PeptidoformSearchSpace, modifications_by_target: Dict[str, Dict[str, List[ModificationConfig]]], modifications: List[ModificationConfig], ) -> Dict[Union[str, int], List[ModificationConfig]]: """Get all possible modifications for each site in a peptide sequence.""" possibilities_by_site = defaultdict(list) # Generate dictionary of positions per amino acid position_dict = defaultdict(list) for pos, aa in enumerate(peptide.sequence): position_dict[aa].append(pos) # Map modifications to positions for aa in set(position_dict).intersection(set(modifications_by_target["sidechain"])): possibilities_by_site.update( {pos: modifications_by_target["sidechain"][aa] for pos in position_dict[aa]} ) # Assign possible modifications per terminus for terminus, position, site_name, specificity in [ ("peptide_n_term", 0, "N", None), ("peptide_c_term", -1, "C", None), ("protein_n_term", 0, "N", "is_n_term"), ("protein_c_term", -1, "C", "is_c_term"), ]: if specificity is None or getattr(peptide, specificity): for site, mods in modifications_by_target[terminus].items(): if site == "any" or peptide.sequence[position] == site: possibilities_by_site[site_name].extend(mods) # Override with fixed modifications for mod in modifications: aa = mod.amino_acid # Skip variable modifications if not mod.fixed: continue # Assign if specific aa matches or if no aa is specified for each terminus for terminus, position, site_name, specificity in [ ("peptide_n_term", 0, "N", None), ("peptide_c_term", -1, "C", None), ("protein_n_term", 0, "N", "is_n_term"), ("protein_c_term", -1, "C", "is_c_term"), ]: if getattr(mod, terminus): # Mod has this terminus if specificity is None or getattr(peptide, specificity): # Specificity matches if not aa or (aa and peptide.sequence[position] == aa): # AA matches possibilities_by_site[site_name] = [mod] # Override with fixed mod break # Allow `else: if amino_acid` if no terminus matches # Assign if fixed modification is not terminal and specific aa matches else: if aa: for pos in position_dict[aa]: possibilities_by_site[pos] = [mod] return possibilities_by_site def _get_peptidoform_modification_versions( peptide: _PeptidoformSearchSpace, modifications: List[ModificationConfig], modifications_by_target: Dict[str, Dict[str, List[ModificationConfig]]], max_variable_modifications: int = 3, ) -> List[Dict[Union[str, int], List[ModificationConfig]]]: """ Get all potential combinations of modifications for a peptide sequence. Examples -------- >>> peptide = PeptidoformSpace(sequence="PEPTIDE", proteins=["PROTEIN"]) >>> phospho = ModificationConfig(label="Phospho", amino_acid="T", fixed=False) >>> acetyl = ModificationConfig(label="Acetyl", peptide_n_term=True, fixed=False) >>> modifications = [phospho, acetyl] >>> modifications_by_target = { ... "sidechain": {"S": [modifications[0]]}, ... "peptide_n_term": {"any": [modifications[1]]}, ... "peptide_c_term": {"any": []}, ... "protein_n_term": {"any": []}, ... "protein_c_term": {"any": []}, ... } >>> _get_modification_versions(peptide, modifications, modifications_by_target) [{}, {3: phospho}, {0: acetyl}, {0: acetyl, 3: phospho}] """ # Get all possible modifications for each site in the peptide sequence possibilities_by_site = _get_modification_possibilities_by_site( peptide, modifications_by_target, modifications ) # Separate fixed and variable modification sites fixed_modifications = {} variable_sites = [] for site, mods in possibilities_by_site.items(): for mod in mods: if mod.fixed: fixed_modifications[site] = mod else: variable_sites.append((site, mod)) # Generate all combinations of variable modifications up to the maximum allowed modification_versions = [] for i in range(max_variable_modifications + 1): for comb in combinations(variable_sites, i): combination_dict = fixed_modifications.copy() for site, mod in comb: combination_dict[site] = mod modification_versions.append(combination_dict) return modification_versions def _get_pool(processes: int) -> Union[multiprocessing.Pool, multiprocessing.dummy.Pool]: """Get a multiprocessing pool with the given number of processes.""" # TODO: fix None default value for processes if processes > 1: return multiprocessing.Pool(processes) else: return multiprocessing.dummy.Pool(processes)