Source code for ms2pip.search_space

"""
Define and build the search space for in silico spectral library generation.

This module defines the search space for in silico spectral library generation as a
:py:class:`~ProteomeSearchSpace` object. Variable and fixed modifications can be configured
as :py:class:`~ModificationConfig` objects.

The peptide search space can be built from a protein FASTA file and a set of parameters, which can
then be converted to a :py:class:`psm_utils.PSMList` object for use in :py:mod:`ms2pip`. All
parameters are listed below at :py:class:`~ProteomeSearchSpace` and can be passed as a
dictionary, a JSON file, or as a :py:class:`~ProteomeSearchSpace` object. For example:

.. code-block:: json

   {
     "fasta_file": "test.fasta",
     "min_length": 8,
     "max_length": 3,
     "cleavage_rule": "trypsin",
     "missed_cleavages": 2,
     "semi_specific": false,
     "add_decoys": true,
     "modifications": [
       {
         "label": "UNIMOD:Oxidation",
         "amino_acid": "M"
       },
       {
         "label": "UNIMOD:Carbamidomethyl",
         "amino_acid": "C",
         "fixed": true
       }
     ],
     "max_variable_modifications": 3,
     "charges": [2, 3]
   }


For an unspecific protein digestion, the cleavage rule can be set to ``unspecific``. This will
result in a cleavage rule that allows cleavage after any amino acid with an unlimited number of
allowed missed cleavages.

To disable protein digestion when the FASTA file contains peptides, set the cleavage rule to
``-``. This will treat each line in the FASTA file as a separate peptide sequence, but still
allow for modifications and charges to be added.


Examples
--------
>>> from ms2pip.search_space import ProteomeSearchSpace, ModificationConfig
>>> search_space = ProteomeSearchSpace(
...     fasta_file="tests/data/test_proteins.fasta",
...     min_length=8,
...     max_length=30,
...     cleavage_rule="trypsin",
...     missed_cleavages=2,
...     semi_specific=False,
...     modifications=[
...         ModificationConfig(label="UNIMOD:Oxidation", amino_acid="M"),
...         ModificationConfig(label="UNIMOD:Carbamidomethyl", amino_acid="C", fixed=True),
...     ],
...     charges=[2, 3],
... )
>>> psm_list = search_space.into_psm_list()

>>> from ms2pip.search_space import ProteomeSearchSpace
>>> search_space = ProteomeSearchSpace.from_any("tests/data/test_search_space.json")
>>> psm_list = search_space.into_psm_list()

"""

from __future__ import annotations

import multiprocessing
import multiprocessing.dummy
from collections import defaultdict
from functools import partial
from itertools import chain, combinations, product
from logging import getLogger
from pathlib import Path
from typing import Any, Dict, Generator, List, Optional, Union

import numpy as np
import pyteomics.fasta
from psm_utils import PSM, PSMList
from pydantic import BaseModel, field_validator, model_validator
from pyteomics.parser import icleave
from rich.progress import track

logger = getLogger(__name__)


[docs]class ModificationConfig(BaseModel):
    """Configuration for a single modification in the search space."""

    label: str
    amino_acid: Optional[str] = None
    peptide_n_term: Optional[bool] = False
    protein_n_term: Optional[bool] = False
    peptide_c_term: Optional[bool] = False
    protein_c_term: Optional[bool] = False
    fixed: Optional[bool] = False

    def __init__(self, **data: Any):
        """
        Configuration for a single modification in the search space.

        Parameters
        ----------
        label
            Label of the modification. This can be any valid ProForma 2.0 label.
        amino_acid
            Amino acid target of the modification. :py:obj:`None` if the modification is not
            specific to an amino acid. Default is None.
        peptide_n_term
            Whether the modification occurs only on the peptide N-terminus. Default is False.
        protein_n_term
            Whether the modification occurs only on the protein N-terminus. Default is False.
        peptide_c_term
            Whether the modification occurs only on the peptide C-terminus. Default is False.
        protein_c_term
            Whether the modification occurs only on the protein C-terminus. Default is False.
        fixed
            Whether the modification is fixed. Default is False.

        """
        super().__init__(**data)

    @model_validator(mode="after")
    def _modification_must_have_target(self):
        target_fields = [
            "amino_acid",
            "peptide_n_term",
            "protein_n_term",
            "peptide_c_term",
            "protein_c_term",
        ]
        if not any(getattr(self, t) for t in target_fields):
            raise ValueError("Modifications must have a target (amino acid or N/C-term).")
        return self


DEFAULT_MODIFICATIONS = [
    ModificationConfig(
        label="UNIMOD:Oxidation",
        amino_acid="M",
    ),
    ModificationConfig(
        label="UNIMOD:Carbamidomethyl",
        amino_acid="C",
        fixed=True,
    ),
]


[docs]class ProteomeSearchSpace(BaseModel):
    """Search space for in silico spectral library generation."""

    fasta_file: Path
    min_length: int = 8
    max_length: int = 30
    min_precursor_mz: Optional[float] = 0
    max_precursor_mz: Optional[float] = np.Inf
    cleavage_rule: str = "trypsin"
    missed_cleavages: int = 2
    semi_specific: bool = False
    add_decoys: bool = False
    modifications: List[ModificationConfig] = DEFAULT_MODIFICATIONS
    max_variable_modifications: int = 3
    charges: List[int] = [2, 3]

    def __init__(self, **data: Any):
        """
        Search space for in silico spectral library generation.

        Parameters
        ----------
        fasta_file
            Path to FASTA file with protein sequences.
        min_length
            Minimum peptide length. Default is 8.
        max_length
            Maximum peptide length. Default is 30.
        min_precursor_mz
            Minimum precursor m/z for peptides. Default is 0.
        max_precursor_mz
            Maximum precursor m/z for peptides. Default is np.Inf.
        cleavage_rule
            Cleavage rule for peptide digestion. Default is "trypsin".
        missed_cleavages
            Maximum number of missed cleavages. Default is 2.
        semi_specific
            Allow semi-specific cleavage. Default is False.
        add_decoys
            Add decoy sequences to search space. Default is False.
        modifications
            List of modifications to consider. Default is oxidation of M and carbamidomethylation
            of C.
        max_variable_modifications
            Maximum number of variable modifications per peptide. Default is 3.
        charges
            List of charges to consider. Default is [2, 3].

        """

        super().__init__(**data)
        self._peptidoform_spaces: List[_PeptidoformSearchSpace] = []

    @field_validator("modifications")
    @classmethod
    def _validate_modifications(cls, v):
        if all(isinstance(m, ModificationConfig) for m in v):
            return v
        elif all(isinstance(m, dict) for m in v):
            return [ModificationConfig(**modification) for modification in v]
        else:
            raise ValueError(
                "Modifications should be a list of dicts or ModificationConfig objects."
            )

    @model_validator(mode="after")
    def _validate_unspecific_cleavage(self):
        """Validate and configure unspecific cleavage settings."""
        # `unspecific` is not an option in pyteomics.parser.icleave, so we configure
        # the settings for unspecific cleavage manually.
        if self.cleavage_rule == "unspecific":
            self.missed_cleavages = self.max_length
            self.cleavage_rule = r"(?<=[A-Z])"
        return self

    def __len__(self):
        if not self._peptidoform_spaces:
            raise ValueError("Search space must be built before length can be determined.")
        return sum(len(pep_space) for pep_space in self._peptidoform_spaces)

[docs]    @classmethod
    def from_any(cls, _input: Union[dict, str, Path, ProteomeSearchSpace]) -> ProteomeSearchSpace:
        """
        Create ProteomeSearchSpace from various input types.

        Parameters
        ----------
        _input
            Search space parameters as a dictionary, a path to a JSON file, an existing
            :py:class:`ProteomeSearchSpace` object.

        """
        if isinstance(_input, ProteomeSearchSpace):
            return _input
        elif isinstance(_input, (str, Path)):
            with open(_input, "rt") as f:
                return cls.model_validate_json(f.read())
        elif isinstance(_input, dict):
            return cls.model_validate(_input)
        else:
            raise ValueError("Search space must be a dict, str, Path, or ProteomeSearchSpace.")

[docs]    def build(self, processes: int = 1):
        """
        Build peptide search space from FASTA file.

        Parameters
        ----------
        processes : int
            Number of processes to use for parallelization.

        """
        processes = processes if processes else multiprocessing.cpu_count()
        self._digest_fasta(processes)
        self._remove_redundancy()
        self._add_modifications(processes)
        self._add_charges()

    def __iter__(self) -> Generator[PSM, None, None]:
        """
        Generate PSMs from search space.

        If :py:meth:`build` has not been called, the search space will first be built with the
        given parameters.

        Parameters
        ----------
        processes : int
            Number of processes to use for parallelization.

        """
        # Build search space if not already built
        if not self._peptidoform_spaces:
            raise ValueError("Search space must be built before PSMs can be generated.")

        spectrum_id = 0
        for pep_space in self._peptidoform_spaces:
            for pep in pep_space:
                yield PSM(
                    peptidoform=pep,
                    spectrum_id=spectrum_id,
                    protein_list=pep_space.proteins,
                )
                spectrum_id += 1

[docs]    def filter_psms_by_mz(self, psms: PSMList) -> PSMList:
        """Filter PSMs by precursor m/z range."""
        return PSMList(
            psm_list=[
                psm
                for psm in psms
                if self.min_precursor_mz <= psm.peptidoform.theoretical_mz <= self.max_precursor_mz
            ]
        )

    def _digest_fasta(self, processes: int = 1):
        """Digest FASTA file to peptides and populate search space."""
        # Convert to string to avoid issues with Path objects
        self.fasta_file = str(self.fasta_file)
        n_proteins = _count_fasta_entries(self.fasta_file)
        if self.add_decoys:
            fasta_db = pyteomics.fasta.decoy_db(
                self.fasta_file,
                mode="reverse",
                decoy_only=False,
                keep_nterm=True,
            )
            n_proteins *= 2
        else:
            fasta_db = pyteomics.fasta.FASTA(self.fasta_file)

        # Read proteins and digest to peptides
        with _get_pool(processes) as pool:
            partial_digest_protein = partial(
                _digest_single_protein,
                min_length=self.min_length,
                max_length=self.max_length,
                cleavage_rule=self.cleavage_rule,
                missed_cleavages=self.missed_cleavages,
                semi_specific=self.semi_specific,
            )
            results = track(
                pool.imap(partial_digest_protein, fasta_db),
                total=n_proteins,
                description="Digesting proteins...",
                transient=True,
            )
            self._peptidoform_spaces = list(chain.from_iterable(results))

    def _remove_redundancy(self):
        """Remove redundancy in peptides and combine protein lists."""
        peptide_dict = dict()
        for peptide in track(
            self._peptidoform_spaces,
            description="Removing peptide redundancy...",
            transient=True,
        ):
            if peptide.sequence in peptide_dict:
                peptide_dict[peptide.sequence].proteins.extend(peptide.proteins)
            else:
                peptide_dict[peptide.sequence] = peptide

        # Overwrite with non-redundant peptides
        self._peptidoform_spaces = list(peptide_dict.values())

    def _add_modifications(self, processes: int = 1):
        """Add modifications to peptides in search space."""
        modifications_by_target = _restructure_modifications_by_target(self.modifications)
        modification_options = []
        with _get_pool(processes) as pool:
            partial_get_modification_versions = partial(
                _get_peptidoform_modification_versions,
                modifications=self.modifications,
                modifications_by_target=modifications_by_target,
                max_variable_modifications=self.max_variable_modifications,
            )
            modification_options = pool.imap(
                partial_get_modification_versions, self._peptidoform_spaces
            )
            for pep, mod_opt in track(
                zip(self._peptidoform_spaces, modification_options),
                description="Adding modifications...",
                total=len(self._peptidoform_spaces),
                transient=True,
            ):
                pep.modification_options = mod_opt

    def _add_charges(self):
        """Add charge permutations to peptides in search space."""
        for peptide in track(
            self._peptidoform_spaces,
            description="Adding charge permutations...",
            transient=True,
        ):
            peptide.charge_options = self.charges


class _PeptidoformSearchSpace(BaseModel):
    """Search space for a given amino acid sequence."""

    sequence: str
    proteins: List[str]
    is_n_term: Optional[bool] = None
    is_c_term: Optional[bool] = None
    modification_options: List[Dict[int, ModificationConfig]] = []
    charge_options: List[int] = []

    def __init__(self, **data: Any):
        """
        Search space for a given amino acid sequence.

        Parameters
        ----------
        sequence
            Amino acid sequence of the peptidoform.
        proteins
            List of protein IDs containing the peptidoform.
        is_n_term
            Whether the peptidoform is an N-terminal peptide. Default is None.
        is_c_term
            Whether the peptidoform is a C-terminal peptide. Default is None.
        modification_options
            List of dictionaries with modification positions and configurations. Default is [].
        charge_options
            List of charge states to consider. Default is [].

        """
        super().__init__(**data)

    def __len__(self):
        return len(self.modification_options) * len(self.charge_options)

    def __iter__(self) -> Generator[str, None, None]:
        """Yield peptidoform strings with given charges and modifications."""
        if not self.charge_options:
            raise ValueError("Peptide charge options not defined.")
        if not self.modification_options:
            raise ValueError("Peptide modification options not defined.")

        for modifications, charge in product(self.modification_options, self.charge_options):
            yield self._construct_peptidoform_string(self.sequence, modifications, charge)

    @staticmethod
    def _construct_peptidoform_string(
        sequence: str, modifications: Dict[int, ModificationConfig], charge: int
    ) -> str:
        if not modifications:
            return f"{sequence}/{charge}"

        modded_sequence = list(sequence)
        for position, mod in modifications.items():
            if isinstance(position, int):
                aa = modded_sequence[position]
                if aa != mod.amino_acid:
                    raise ValueError(
                        f"Modification {mod.label} at position {position} does not match amino "
                        f"acid {aa}."
                    )
                modded_sequence[position] = f"{aa}[{mod.label}]"
            elif position == "N":
                modded_sequence.insert(0, f"[{mod.label}]-")
            elif position == "C":
                modded_sequence.append(f"-[{mod.label}]")
            else:
                raise ValueError(f"Invalid position {position} for modification {mod.label}.")

        return f"{''.join(modded_sequence)}/{charge}"


def _digest_single_protein(
    protein: pyteomics.fasta.Protein,
    min_length: int = 8,
    max_length: int = 30,
    cleavage_rule: str = "trypsin",
    missed_cleavages: int = 2,
    semi_specific: bool = False,
) -> List[_PeptidoformSearchSpace]:
    """Digest protein sequence and return a list of validated peptides."""

    def valid_residues(sequence: str) -> bool:
        return not any(aa in sequence for aa in ["B", "J", "O", "U", "X", "Z"])

    def parse_peptide(
        start_position: int,
        sequence: str,
        protein: pyteomics.fasta.Protein,
    ) -> _PeptidoformSearchSpace:
        """Parse result from parser.icleave into Peptide."""
        return _PeptidoformSearchSpace(
            sequence=sequence,
            # Assumes protein ID is description until first space
            proteins=[protein.description.split(" ")[0]],
            is_n_term=start_position == 0,
            is_c_term=start_position + len(sequence) == len(protein.sequence),
        )

    peptides = [
        parse_peptide(start, seq, protein)
        for start, seq in icleave(
            protein.sequence,
            cleavage_rule,
            missed_cleavages=missed_cleavages,
            min_length=min_length,
            max_length=max_length,
            semi=semi_specific,
        )
        if valid_residues(seq)
    ]

    return peptides


def _count_fasta_entries(filename: Path) -> int:
    """Count the number of entries in a FASTA file."""
    with open(filename, "rt") as f:
        count = 0
        for line in f:
            if line[0] == ">":
                count += 1
    return count


def _restructure_modifications_by_target(
    modifications: List[ModificationConfig],
) -> Dict[str, Dict[str, List[ModificationConfig]]]:
    """Restructure variable modifications to options per side chain or terminus."""
    modifications_by_target = {
        "sidechain": defaultdict(lambda: []),
        "peptide_n_term": defaultdict(lambda: []),
        "peptide_c_term": defaultdict(lambda: []),
        "protein_n_term": defaultdict(lambda: []),
        "protein_c_term": defaultdict(lambda: []),
    }

    def add_mod(mod, target, amino_acid):
        if amino_acid:
            modifications_by_target[target][amino_acid].append(mod)
        else:
            modifications_by_target[target]["any"].append(mod)

    for mod in modifications:
        if mod.fixed:
            continue
        if mod.peptide_n_term:
            add_mod(mod, "peptide_n_term", mod.amino_acid)
        elif mod.peptide_c_term:
            add_mod(mod, "peptide_c_term", mod.amino_acid)
        elif mod.protein_n_term:
            add_mod(mod, "protein_n_term", mod.amino_acid)
        elif mod.protein_c_term:
            add_mod(mod, "protein_c_term", mod.amino_acid)
        else:
            add_mod(mod, "sidechain", mod.amino_acid)

    return {k: dict(v) for k, v in modifications_by_target.items()}


def _get_modification_possibilities_by_site(
    peptide: _PeptidoformSearchSpace,
    modifications_by_target: Dict[str, Dict[str, List[ModificationConfig]]],
    modifications: List[ModificationConfig],
) -> Dict[Union[str, int], List[ModificationConfig]]:
    """Get all possible modifications for each site in a peptide sequence."""
    possibilities_by_site = defaultdict(list)

    # Generate dictionary of positions per amino acid
    position_dict = defaultdict(list)
    for pos, aa in enumerate(peptide.sequence):
        position_dict[aa].append(pos)
    # Map modifications to positions
    for aa in set(position_dict).intersection(set(modifications_by_target["sidechain"])):
        possibilities_by_site.update(
            {pos: modifications_by_target["sidechain"][aa] for pos in position_dict[aa]}
        )

    # Assign possible modifications per terminus
    for terminus, position, site_name, specificity in [
        ("peptide_n_term", 0, "N", None),
        ("peptide_c_term", -1, "C", None),
        ("protein_n_term", 0, "N", "is_n_term"),
        ("protein_c_term", -1, "C", "is_c_term"),
    ]:
        if specificity is None or getattr(peptide, specificity):
            for site, mods in modifications_by_target[terminus].items():
                if site == "any" or peptide.sequence[position] == site:
                    possibilities_by_site[site_name].extend(mods)

    # Override with fixed modifications
    for mod in modifications:
        aa = mod.amino_acid
        # Skip variable modifications
        if not mod.fixed:
            continue
        # Assign if specific aa matches or if no aa is specified for each terminus
        for terminus, position, site_name, specificity in [
            ("peptide_n_term", 0, "N", None),
            ("peptide_c_term", -1, "C", None),
            ("protein_n_term", 0, "N", "is_n_term"),
            ("protein_c_term", -1, "C", "is_c_term"),
        ]:
            if getattr(mod, terminus):  # Mod has this terminus
                if specificity is None or getattr(peptide, specificity):  # Specificity matches
                    if not aa or (aa and peptide.sequence[position] == aa):  # AA matches
                        possibilities_by_site[site_name] = [mod]  # Override with fixed mod
                break  # Allow `else: if amino_acid` if no terminus matches
        # Assign if fixed modification is not terminal and specific aa matches
        else:
            if aa:
                for pos in position_dict[aa]:
                    possibilities_by_site[pos] = [mod]

    return possibilities_by_site


def _get_peptidoform_modification_versions(
    peptide: _PeptidoformSearchSpace,
    modifications: List[ModificationConfig],
    modifications_by_target: Dict[str, Dict[str, List[ModificationConfig]]],
    max_variable_modifications: int = 3,
) -> List[Dict[Union[str, int], List[ModificationConfig]]]:
    """
    Get all potential combinations of modifications for a peptide sequence.

    Examples
    --------
    >>> peptide = PeptidoformSpace(sequence="PEPTIDE", proteins=["PROTEIN"])
    >>> phospho = ModificationConfig(label="Phospho", amino_acid="T", fixed=False)
    >>> acetyl = ModificationConfig(label="Acetyl", peptide_n_term=True, fixed=False)
    >>> modifications = [phospho, acetyl]
    >>> modifications_by_target = {
    ...     "sidechain": {"S": [modifications[0]]},
    ...     "peptide_n_term": {"any": [modifications[1]]},
    ...     "peptide_c_term": {"any": []},
    ...     "protein_n_term": {"any": []},
    ...     "protein_c_term": {"any": []},
    ... }
    >>> _get_modification_versions(peptide, modifications, modifications_by_target)
    [{}, {3: phospho}, {0: acetyl}, {0: acetyl, 3: phospho}]

    """
    # Get all possible modifications for each site in the peptide sequence
    possibilities_by_site = _get_modification_possibilities_by_site(
        peptide, modifications_by_target, modifications
    )

    # Separate fixed and variable modification sites
    fixed_modifications = {}
    variable_sites = []
    for site, mods in possibilities_by_site.items():
        for mod in mods:
            if mod.fixed:
                fixed_modifications[site] = mod
            else:
                variable_sites.append((site, mod))

    # Generate all combinations of variable modifications up to the maximum allowed
    modification_versions = []
    for i in range(max_variable_modifications + 1):
        for comb in combinations(variable_sites, i):
            combination_dict = fixed_modifications.copy()
            for site, mod in comb:
                combination_dict[site] = mod
            modification_versions.append(combination_dict)

    return modification_versions


def _get_pool(processes: int) -> Union[multiprocessing.Pool, multiprocessing.dummy.Pool]:
    """Get a multiprocessing pool with the given number of processes."""
    # TODO: fix None default value for processes
    if processes > 1:
        return multiprocessing.Pool(processes)
    else:
        return multiprocessing.dummy.Pool(processes)