Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions pyprophet/cli/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
memray_profile,
)
from .._config import RunnerIOConfig
from ..io.util import get_num_runs
# Defer import of runner to avoid premature sklearn import before OMP_NUM_THREADS is set
# from ..scoring.runner import PyProphetLearner, PyProphetWeightApplier

Expand All @@ -37,7 +38,7 @@
default=1.0,
show_default=True,
type=float,
help="Subsampling ratio for large data. Use <1.0 to subsample precursors for semi-supervised learning, the learned weights will then be applied to the full data set.",
help="Subsampling ratio for large data. Use <1.0 to subsample precursors for semi-supervised learning, the learned weights will then be applied to the full data set. When set to 1.0 (default) and the input has >20 runs, auto-subsampling to 1/N is applied (N=number of runs). Set to -1.0 to disable auto-subsampling and use full data.",
)
# Semi-supervised learning
@click.option(
Expand Down Expand Up @@ -373,10 +374,29 @@ def score(
ctx.obj["LOG_HEADER"],
)

# Auto-subsample based on number of runs if applicable
if subsample_ratio == 1.0:
# Check if we should auto-subsample
num_runs = get_num_runs(infile, config.file_type)
if num_runs > 20:
config.subsample_ratio = 1.0 / num_runs
logger.info(
f"Auto-subsampling enabled: {num_runs} runs detected. "
f"Setting subsample_ratio to 1/{num_runs} = {config.subsample_ratio:.4f} "
f"for efficient semi-supervised learning. Use --subsample_ratio -1 to disable auto-subsampling."
)
elif subsample_ratio == -1.0:
# User explicitly disabled auto-subsampling
config.subsample_ratio = 1.0
logger.info(
"Auto-subsampling disabled (subsample_ratio set to -1.0). "
"Using full dataset for semi-supervised learning."
)

Comment thread
singjc marked this conversation as resolved.
Comment thread
singjc marked this conversation as resolved.
# Validate file type and subsample ratio. OSW, parquet, parquet_split, and parquet_split_multi all support subsampling
if (
config.file_type not in ["osw", "parquet", "parquet_split", "parquet_split_multi"]
and subsample_ratio < 1.0
and config.subsample_ratio < 1.0
):
logger.warning(
"Semi-supervised learning on a subset of the data, and then applying the weights to the full data is currently only supported for OSW, `parquet`, `parquet_split`, and `parquet_split_multi` files.\nFor TSV and other formats, you need to manually prepare a subsampled input file.\nSetting subsample_ratio to 1.0.",
Expand Down
36 changes: 25 additions & 11 deletions pyprophet/io/ipf/osw.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,12 +335,12 @@ def _fetch_alignment_features_duckdb(self, con):
WHERE fma.LABEL = 1
AND fma.REFERENCE_FEATURE_ID != fma.ALIGNED_FEATURE_ID
) AS merged
LEFT JOIN (
INNER JOIN (
SELECT
FEATURE_ID,
MIN(PEP) AS pep
FROM osw.SCORE_ALIGNMENT
WHERE PEP <= {pep_threshold}
WHERE PEP < {pep_threshold}
GROUP BY FEATURE_ID
) AS sa
ON merged.FEATURE_ID = sa.FEATURE_ID
Expand Down Expand Up @@ -545,18 +545,32 @@ def _fetch_alignment_features_sqlite(self, con):
query = f"""
SELECT
DENSE_RANK() OVER (ORDER BY PRECURSOR_ID, ALIGNMENT_ID) AS ALIGNMENT_GROUP_ID,
ALIGNED_FEATURE_ID AS FEATURE_ID
FEATURE_ID
FROM (
SELECT DISTINCT * FROM FEATURE_MS2_ALIGNMENT
) AS FEATURE_MS2_ALIGNMENT
SELECT DISTINCT
ALIGNMENT_ID,
PRECURSOR_ID,
REFERENCE_FEATURE_ID AS FEATURE_ID
FROM FEATURE_MS2_ALIGNMENT
WHERE LABEL = 1
AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID

UNION

SELECT DISTINCT
ALIGNMENT_ID,
PRECURSOR_ID,
ALIGNED_FEATURE_ID AS FEATURE_ID
FROM FEATURE_MS2_ALIGNMENT
WHERE LABEL = 1
AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID
) AS feature_list
INNER JOIN (
SELECT DISTINCT *, MIN(QVALUE)
SELECT DISTINCT FEATURE_ID
FROM SCORE_ALIGNMENT
GROUP BY FEATURE_ID
) AS SCORE_ALIGNMENT
ON SCORE_ALIGNMENT.FEATURE_ID = FEATURE_MS2_ALIGNMENT.ALIGNED_FEATURE_ID
WHERE LABEL = 1
AND SCORE_ALIGNMENT.PEP < {pep_threshold}
WHERE PEP < {pep_threshold}
) AS good_alignments
ON good_alignments.FEATURE_ID = feature_list.FEATURE_ID
ORDER BY ALIGNMENT_GROUP_ID
"""

Expand Down
77 changes: 77 additions & 0 deletions pyprophet/io/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,83 @@ def group_by_run(files):
click.echo(f" └── 📄 {os.path.basename(alignment)}")


def get_num_runs(infile, file_type):
    """
    Get the number of runs in the input file.

    Supports OSW (SQLite), Parquet, Parquet split, and Parquet split multi formats.

    Args:
        infile (str): Path to the input file or directory.
        file_type (str): Type of input file ('osw', 'parquet', 'parquet_split',
            'parquet_split_multi', 'tsv').

    Returns:
        int: Number of runs in the input file, or 0 if unable to determine.
    """
    try:
        if file_type == "osw":
            # Query the RUN table from the SQLite database.
            if not is_sqlite_file(infile):
                return 0

            con = sqlite3.connect(infile)
            try:
                cursor = con.cursor()
                cursor.execute("SELECT COUNT(*) FROM RUN")
                return cursor.fetchone()[0]
            except sqlite3.OperationalError:
                # RUN table doesn't exist.
                return 0
            finally:
                con.close()

        elif file_type == "parquet":
            # Single parquet file - use DuckDB for an efficient streaming count.
            if not is_parquet_file(infile):
                return 0

            con = duckdb.connect()
            try:
                # Bind the path as a parameter instead of f-string interpolation:
                # paths containing quotes would otherwise break the SQL statement.
                result = con.execute(
                    "SELECT COUNT(DISTINCT RUN_ID) AS num_runs FROM read_parquet(?)",
                    [infile],
                ).fetchone()
                return result[0] if result else 0
            except Exception:
                # Unreadable/malformed parquet: treat as "unknown run count".
                return 0
            finally:
                # Always release the in-memory DuckDB connection.
                con.close()

        elif file_type == "parquet_split":
            # Single-run split parquet directory - by definition there is only
            # one run (parquet_split is validated to contain one set of run files).
            precursor_path = os.path.join(infile, "precursors_features.parquet")
            if os.path.exists(precursor_path) and is_parquet_file(precursor_path):
                return 1
            return 0

        elif file_type == "parquet_split_multi":
            # Multi-run split parquet directory - each .oswpq subdirectory
            # represents one run.
            if not os.path.isdir(infile):
                return 0

            runs = [
                d
                for d in os.listdir(infile)
                if d.endswith(".oswpq") and os.path.isdir(os.path.join(infile, d))
            ]
            return len(runs)

        elif file_type == "tsv":
            # TSV inputs are treated as a single run.
            return 1

        # Unknown file type: run count cannot be determined.
        return 0

    except Exception as e:
        logger.warning(f"Error getting number of runs from {infile}: {e}")
        return 0


def unimod_to_codename(seq):
"""
Convert a sequence with unimod modifications to a codename.
Expand Down
Loading