Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to
[Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased
## [4.0.0-alpha.1]

### Changed

- Simplified the public package API by splitting up the single class-based API into core functions (`predict`, `finetune`, `train`, etc.)
- Switched deep learning framework from TensorFlow to PyTorch
- Sped up predictions by removing the ensemble method, in which the outputs of three models with differing kernel sizes were averaged into one prediction
- Separated calibration logic into a dedicated, reusable module with an sklearn-like API
- Improved computational efficiency of piece-wise linear calibration and set sensible default parameters
- Added built-in transfer learning functionality, replacing the external `deeplcretrainer` package
- Cleaned up package, removing legacy and unused code and files, and improving modularity
- Modernized CI workflows to use `uv`
Expand Down
2 changes: 0 additions & 2 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
include deeplc/models/*
include deeplc/package_data/**/*
include deeplc/baseline_performance/*
274 changes: 189 additions & 85 deletions deeplc/_features.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Feature extraction for DeepLC."""

# TODO: Consider ProForma fixed modifications (that are not applied yet) for feature extraction.

from __future__ import annotations

import logging
Expand All @@ -26,6 +28,112 @@
# fmt: on


def encode_peptidoform(
    peptidoform: Peptidoform | str,
    add_ccs_features: bool = False,
    padding_length: int = 60,
    positions: set[int] | None = None,
    positions_pos: set[int] | None = None,
    positions_neg: set[int] | None = None,
    dict_aa: dict[str, int] | None = None,
    dict_index_pos: dict[str, int] | None = None,
    dict_index: dict[str, int] | None = None,
) -> dict[str, np.ndarray]:
    """
    Encode one peptidoform into the feature arrays used by the model.

    Parameters
    ----------
    peptidoform
        Peptidoform to encode. A string is first parsed into a ``Peptidoform``.
    add_ccs_features
        If True, append residue-class fractions and the precursor charge to the
        global feature vector. Default is False.
    padding_length
        Length passed to the sequence truncation and matrix-filling helpers.
        Default is 60.
    positions
        Positions forwarded to the modification helpers. Falls back to
        DEFAULT_POSITIONS.
    positions_pos
        Positive positions forwarded to the positional matrix helper. Falls back to
        DEFAULT_POSITIONS_POS.
    positions_neg
        Negative positions forwarded to the positional matrix helper. Falls back to
        DEFAULT_POSITIONS_NEG.
    dict_aa
        Amino acid to index mapping for the one-hot matrix. Falls back to
        DEFAULT_DICT_AA.
    dict_index_pos
        Atom to index mapping for the positional matrix. Falls back to
        DEFAULT_DICT_INDEX_POS.
    dict_index
        Atom to index mapping for the standard matrix. Falls back to
        DEFAULT_DICT_INDEX.

    Returns
    -------
    dict[str, np.ndarray]
        Feature arrays under the keys ``"matrix"``, ``"matrix_sum"``,
        ``"matrix_global"`` and ``"matrix_hc"``.

    Raises
    ------
    ValueError
        If ``add_ccs_features`` is True and the peptidoform has no precursor charge.

    """
    # Any falsy argument (None, but also an empty container) selects the default.
    positions = positions if positions else DEFAULT_POSITIONS
    positions_pos = positions_pos if positions_pos else DEFAULT_POSITIONS_POS
    positions_neg = positions_neg if positions_neg else DEFAULT_POSITIONS_NEG
    dict_aa = dict_aa if dict_aa else DEFAULT_DICT_AA
    dict_index_pos = dict_index_pos if dict_index_pos else DEFAULT_DICT_INDEX_POS
    dict_index = dict_index if dict_index else DEFAULT_DICT_INDEX

    if isinstance(peptidoform, str):
        peptidoform = Peptidoform(peptidoform)
    precursor_charge = peptidoform.precursor_charge
    tokens = peptidoform.parsed_sequence
    seq, seq_len = _truncate_sequence(peptidoform.sequence, padding_length)

    std_matrix = _fill_standard_matrix(seq, padding_length, dict_index)
    onehot_matrix = _fill_onehot_matrix(tokens, padding_length, dict_aa)
    pos_matrix = _fill_pos_matrix(
        seq, seq_len, positions_pos, positions_neg, dict_index, dict_index_pos
    )

    # Both modification passes update std_matrix and pos_matrix in place.
    shared_args = (seq_len, dict_index, dict_index_pos, positions)
    _apply_modifications(std_matrix, pos_matrix, tokens, *shared_args)
    _apply_terminal_modifications(std_matrix, pos_matrix, peptidoform, *shared_args)

    # Scalars that follow the per-atom column sums in the global vector.
    scalars = [seq_len]
    if add_ccs_features:
        if not precursor_charge:
            raise ValueError(f"Peptidoform has no charge: {peptidoform}")
        scalars.extend(
            sum(seq.count(residue) for residue in group) / seq_len
            for group in ("H", "FWY", "DE", "KR")
        )
        scalars.append(precursor_charge)

    matrix_all = np.sum(std_matrix, axis=0)
    for scalar in scalars:
        matrix_all = np.append(matrix_all, scalar)

    # Sum adjacent pairs of rows, then keep every second pair.
    pairwise = _compute_rolling_sum(std_matrix.T, n=2)
    matrix_sum = pairwise[:, ::2].T

    matrix_global = np.concatenate([matrix_all, pos_matrix.flatten()])

    return {
        "matrix": std_matrix,
        "matrix_sum": matrix_sum,
        "matrix_global": matrix_global,
        "matrix_hc": onehot_matrix,
    }


def _truncate_sequence(seq: str, max_length: int) -> tuple[str, int]:
"""Truncate the sequence if it exceeds the max_length."""
if len(seq) > max_length:
Expand Down Expand Up @@ -98,6 +206,40 @@ def _fill_pos_matrix(
return pos_mat


def _apply_composition_to_matrices(
    mat: np.ndarray,
    pos_mat: np.ndarray,
    composition: mass.Composition,
    i: int,
    seq_len: int,
    dict_index: dict[str, int],
    dict_index_pos: dict[str, int],
    positions: set[int],
) -> None:
    """
    Apply a composition delta to the standard and positional matrices.

    Both matrices are modified in place. Each atom of ``composition`` is added to row
    ``i`` of ``mat``. If ``i`` or ``i - seq_len`` is in ``positions``, the same amount is
    added to that row of ``pos_mat``.

    An atom key that cannot be resolved as-is is retried with any ``[...]`` part
    removed. An atom that still cannot be resolved is skipped entirely, so a delta is
    never applied to ``mat`` without its positional counterpart and never applied twice.

    Parameters
    ----------
    mat
        Standard matrix, indexed as ``mat[residue, dict_index[atom]]``.
    pos_mat
        Positional matrix, indexed as ``pos_mat[position, dict_index_pos[atom]]``.
    composition
        Mapping of atom key to count change; only ``.items()`` is used.
    i
        Row of ``mat`` to update.
    seq_len
        Sequence length, used to derive the negative position ``i - seq_len``.
    dict_index
        Atom to column mapping for ``mat``.
    dict_index_pos
        Atom to column mapping for ``pos_mat``.
    positions
        Positions that have a row in ``pos_mat``.

    Warns
    -----
    UserWarning
        When an atom key is retried without its bracketed part, when an atom is
        ignored, or when a target index falls outside a matrix.

    """
    # Row of pos_mat that mirrors residue i, if any; the check on i takes precedence.
    if i in positions:
        pos_row = i
    elif (i - seq_len) in positions:
        pos_row = i - seq_len
    else:
        pos_row = None

    def _resolvable(atom: str) -> bool:
        """Return True if every column lookup needed for this atom will succeed."""
        return atom in dict_index and (pos_row is None or atom in dict_index_pos)

    for atom_comp, change in composition.items():
        atom = atom_comp
        if not _resolvable(atom):
            warnings.warn(f"Replacing pattern for atom: {atom_comp}", stacklevel=2)
            atom = sub(r"\[.*?\]", "", atom_comp)
            if not _resolvable(atom):
                warnings.warn(f"Ignoring atom {atom_comp} at pos {i}", stacklevel=2)
                continue
        # All keys are resolved before any write, so a failed lookup can no longer
        # leave mat partially updated and then updated again by a retry.
        try:
            mat[i, dict_index[atom]] += change
            if pos_row is not None:
                pos_mat[pos_row, dict_index_pos[atom]] += change
        except IndexError:
            warnings.warn(f"Index error for atom {atom_comp} at pos {i}", stacklevel=2)


def _apply_modifications(
mat: np.ndarray,
pos_mat: np.ndarray,
Expand All @@ -118,96 +260,58 @@ def _apply_modifications(
f"Skipping modification without known composition: {token[1]}", stacklevel=2
)
continue
for atom_comp, change in mod_comp.items():
_apply_composition_to_matrices(
mat,
pos_mat,
mod_comp,
i,
seq_len,
dict_index,
dict_index_pos,
positions,
)


def _apply_terminal_modifications(
    mat: np.ndarray,
    pos_mat: np.ndarray,
    peptidoform: Peptidoform,
    seq_len: int,
    dict_index: dict[str, int],
    dict_index_pos: dict[str, int],
    positions: set[int],
) -> None:
    """
    Apply N- and C-terminal modification changes to the matrices.

    N-terminal modifications are applied at row 0 and C-terminal modifications at row
    ``seq_len - 1``. Both matrices are modified in place through
    ``_apply_composition_to_matrices``.

    Parameters
    ----------
    mat
        Standard matrix to update.
    pos_mat
        Positional matrix to update.
    peptidoform
        Peptidoform whose ``properties`` mapping is read for the ``"n_term"`` and
        ``"c_term"`` entries.
    seq_len
        Sequence length; determines the row used for the C-terminus.
    dict_index
        Atom to column mapping for ``mat``.
    dict_index_pos
        Atom to column mapping for ``pos_mat``.
    positions
        Positions that have a row in ``pos_mat``.

    Warns
    -----
    UserWarning
        When reading the composition of a terminal modification fails; that
        modification is skipped.

    """
    terminal_mods = [
        (0, peptidoform.properties.get("n_term")),  # N-terminus at position 0
        (seq_len - 1, peptidoform.properties.get("c_term")),  # C-terminus at last position
    ]
    for i, mods in terminal_mods:
        # A missing or empty entry means the terminus is unmodified.
        if not mods:
            continue
        for tag in mods:
            # Best effort: a tag whose composition cannot be read is skipped with a
            # warning instead of aborting the whole encoding.
            try:
                mod_comp = tag.composition
            except Exception:
                warnings.warn(
                    f"Skipping terminal modification without known composition: {tag}",
                    stacklevel=2,
                )
                continue
            _apply_composition_to_matrices(
                mat,
                pos_mat,
                mod_comp,
                i,
                seq_len,
                dict_index,
                dict_index_pos,
                positions,
            )


def _compute_rolling_sum(matrix: np.ndarray, n: int = 2) -> np.ndarray:
    """Return the sum over each window of ``n`` consecutive columns, as float32."""
    totals = np.cumsum(matrix, axis=1, dtype=np.float32)
    # A window sum is the running total at its end minus the total before its start.
    window_sums = totals[:, n:] - totals[:, :-n]
    totals[:, n:] = window_sums
    # Column n - 1 already holds the first full window; earlier columns are partial.
    return totals[:, n - 1 :]


def encode_peptidoform(
    peptidoform: Peptidoform | str,
    add_ccs_features: bool = False,
    padding_length: int = 60,
    positions: set[int] | None = None,
    positions_pos: set[int] | None = None,
    positions_neg: set[int] | None = None,
    dict_aa: dict[str, int] | None = None,
    dict_index_pos: dict[str, int] | None = None,
    dict_index: dict[str, int] | None = None,
) -> dict[str, np.ndarray]:
    """
    Extract features from a single peptidoform.

    Parameters
    ----------
    peptidoform
        The peptidoform to encode, either as a Peptidoform object or a string.
    add_ccs_features
        Whether to include CCS features. Default is False.
    padding_length
        Length passed to the truncation and matrix-filling helpers. Default is 60.
    positions
        The positions to consider for feature extraction. Default is DEFAULT_POSITIONS.
    positions_pos
        The positive positions to consider for feature extraction. Default is
        DEFAULT_POSITIONS_POS.
    positions_neg
        The negative positions to consider for feature extraction. Default is
        DEFAULT_POSITIONS_NEG.
    dict_aa
        A dictionary mapping amino acids to indices. Default is DEFAULT_DICT_AA.
    dict_index_pos
        A dictionary mapping atoms to indices for the positional matrix. Default is
        DEFAULT_DICT_INDEX_POS.
    dict_index
        A dictionary mapping atoms to indices. Default is DEFAULT_DICT_INDEX.

    Returns
    -------
    dict[str, np.ndarray]
        A dictionary of Numpy arrays with the keys ``"matrix"``, ``"matrix_sum"``,
        ``"matrix_global"`` and ``"matrix_hc"``.

    Raises
    ------
    ValueError
        If ``add_ccs_features`` is True and the peptidoform has no precursor charge.

    """
    # Falsy arguments (None or empty containers) fall back to the module defaults.
    positions = positions or DEFAULT_POSITIONS
    positions_pos = positions_pos or DEFAULT_POSITIONS_POS
    positions_neg = positions_neg or DEFAULT_POSITIONS_NEG
    dict_aa = dict_aa or DEFAULT_DICT_AA
    dict_index_pos = dict_index_pos or DEFAULT_DICT_INDEX_POS
    dict_index = dict_index or DEFAULT_DICT_INDEX

    if isinstance(peptidoform, str):
        peptidoform = Peptidoform(peptidoform)
    seq = peptidoform.sequence
    charge = peptidoform.precursor_charge
    seq, seq_len = _truncate_sequence(seq, padding_length)

    std_matrix = _fill_standard_matrix(seq, padding_length, dict_index)
    onehot_matrix = _fill_onehot_matrix(peptidoform.parsed_sequence, padding_length, dict_aa)
    pos_matrix = _fill_pos_matrix(
        seq, seq_len, positions_pos, positions_neg, dict_index, dict_index_pos
    )
    # Residue modifications update std_matrix and pos_matrix in place.
    _apply_modifications(
        std_matrix,
        pos_matrix,
        peptidoform.parsed_sequence,
        seq_len,
        dict_index,
        dict_index_pos,
        positions,
    )
    # Terminal modifications are handled by a separate pass; without this call they
    # would not contribute to the features.
    _apply_terminal_modifications(
        std_matrix,
        pos_matrix,
        peptidoform,
        seq_len,
        dict_index,
        dict_index_pos,
        positions,
    )

    # Global vector: per-atom column sums, then the sequence length, then optional
    # CCS-related scalars.
    matrix_all = np.sum(std_matrix, axis=0)
    matrix_all = np.append(matrix_all, seq_len)
    if add_ccs_features:
        if not charge:
            raise ValueError(f"Peptidoform has no charge: {peptidoform}")
        matrix_all = np.append(matrix_all, (seq.count("H")) / seq_len)
        matrix_all = np.append(
            matrix_all, (seq.count("F") + seq.count("W") + seq.count("Y")) / seq_len
        )
        matrix_all = np.append(matrix_all, (seq.count("D") + seq.count("E")) / seq_len)
        matrix_all = np.append(matrix_all, (seq.count("K") + seq.count("R")) / seq_len)
        matrix_all = np.append(matrix_all, charge)

    # Sum adjacent pairs of rows of std_matrix, keeping every second pair.
    matrix_sum = _compute_rolling_sum(std_matrix.T, n=2)[:, ::2].T

    matrix_global = np.concatenate([matrix_all, pos_matrix.flatten()])

    return {
        "matrix": std_matrix,
        "matrix_sum": matrix_sum,
        "matrix_global": matrix_global,
        "matrix_hc": onehot_matrix,
    }
Loading
Loading