From e8e197471c415cab6dfe3ea567e84887803c65ab Mon Sep 17 00:00:00 2001 From: singjc Date: Tue, 5 May 2026 20:05:15 -0400 Subject: [PATCH 1/3] Refactor SQL queries in OSWReader to improve alignment feature extraction and ensure correct PEP threshold comparison Co-authored-by: Copilot --- pyprophet/io/ipf/osw.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/pyprophet/io/ipf/osw.py b/pyprophet/io/ipf/osw.py index 41bae28a..cd8515c7 100644 --- a/pyprophet/io/ipf/osw.py +++ b/pyprophet/io/ipf/osw.py @@ -335,12 +335,12 @@ def _fetch_alignment_features_duckdb(self, con): WHERE fma.LABEL = 1 AND fma.REFERENCE_FEATURE_ID != fma.ALIGNED_FEATURE_ID ) AS merged - LEFT JOIN ( + INNER JOIN ( SELECT FEATURE_ID, MIN(PEP) AS pep FROM osw.SCORE_ALIGNMENT - WHERE PEP <= {pep_threshold} + WHERE PEP < {pep_threshold} GROUP BY FEATURE_ID ) AS sa ON merged.FEATURE_ID = sa.FEATURE_ID @@ -545,18 +545,32 @@ def _fetch_alignment_features_sqlite(self, con): query = f""" SELECT DENSE_RANK() OVER (ORDER BY PRECURSOR_ID, ALIGNMENT_ID) AS ALIGNMENT_GROUP_ID, - ALIGNED_FEATURE_ID AS FEATURE_ID + FEATURE_ID FROM ( - SELECT DISTINCT * FROM FEATURE_MS2_ALIGNMENT - ) AS FEATURE_MS2_ALIGNMENT + SELECT DISTINCT + ALIGNMENT_ID, + PRECURSOR_ID, + REFERENCE_FEATURE_ID AS FEATURE_ID + FROM FEATURE_MS2_ALIGNMENT + WHERE LABEL = 1 + AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID + + UNION + + SELECT DISTINCT + ALIGNMENT_ID, + PRECURSOR_ID, + ALIGNED_FEATURE_ID AS FEATURE_ID + FROM FEATURE_MS2_ALIGNMENT + WHERE LABEL = 1 + AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID + ) AS feature_list INNER JOIN ( - SELECT DISTINCT *, MIN(QVALUE) + SELECT DISTINCT FEATURE_ID FROM SCORE_ALIGNMENT - GROUP BY FEATURE_ID - ) AS SCORE_ALIGNMENT - ON SCORE_ALIGNMENT.FEATURE_ID = FEATURE_MS2_ALIGNMENT.ALIGNED_FEATURE_ID - WHERE LABEL = 1 - AND SCORE_ALIGNMENT.PEP < {pep_threshold} + WHERE PEP < {pep_threshold} + ) AS good_alignments + ON good_alignments.FEATURE_ID = feature_list.FEATURE_ID ORDER BY ALIGNMENT_GROUP_ID """ From 16915fb506c3c5c172d6179587014743e477c3b5 Mon Sep 17 00:00:00 2001 From: singjc Date: Tue, 5 May 2026 20:18:06 -0400 Subject: [PATCH 2/3] Add auto-subsampling feature based on number of runs and implement get_num_runs utility function Co-authored-by: Copilot --- pyprophet/cli/score.py | 22 ++++++++++++- pyprophet/io/util.py | 75 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 1 deletion(-) diff --git a/pyprophet/cli/score.py b/pyprophet/cli/score.py index aff5503a..c0759090 100644 --- a/pyprophet/cli/score.py +++ b/pyprophet/cli/score.py @@ -12,6 +12,7 @@ memray_profile, ) from .._config import RunnerIOConfig +from ..io.util import get_num_runs # Defer import of runner to avoid premature sklearn import before OMP_NUM_THREADS is set # from ..scoring.runner import PyProphetLearner, PyProphetWeightApplier @@ -37,7 +38,7 @@ default=1.0, show_default=True, type=float, - help="Subsampling ratio for large data. Use <1.0 to subsample precursors for semi-supervised learning, the learned weights will then be applied to the full data set.", + help="Subsampling ratio for large data. Use <1.0 to subsample precursors for semi-supervised learning, the learned weights will then be applied to the full data set. When set to 1.0 (default) and the input has >20 runs, auto-subsampling to 1/N is applied (N=number of runs). Set to -1.0 to disable auto-subsampling and use full data.", ) # Semi-supervised learning @click.option( @@ -373,6 +374,25 @@ def score( ctx.obj["LOG_HEADER"], ) + # Auto-subsample based on number of runs if applicable + if subsample_ratio == 1.0: + # Check if we should auto-subsample + num_runs = get_num_runs(infile, config.file_type) + if num_runs > 20: + config.subsample_ratio = 1.0 / num_runs + logger.info( + f"Auto-subsampling enabled: {num_runs} runs detected. " + f"Setting subsample_ratio to 1/{num_runs} = {config.subsample_ratio:.4f} " + f"for efficient semi-supervised learning. Use --subsample_ratio -1 to disable auto-subsampling." + ) + elif subsample_ratio == -1.0: + # User explicitly disabled auto-subsampling + config.subsample_ratio = 1.0 + logger.info( + "Auto-subsampling disabled (subsample_ratio set to -1.0). " + "Using full dataset for semi-supervised learning." + ) + # Validate file type and subsample ratio. OSW, parquet, parquet_split, and parquet_split_multi all support subsampling if ( config.file_type not in ["osw", "parquet", "parquet_split", "parquet_split_multi"] diff --git a/pyprophet/io/util.py b/pyprophet/io/util.py index 9cce1d95..265423fd 100644 --- a/pyprophet/io/util.py +++ b/pyprophet/io/util.py @@ -465,6 +465,81 @@ def group_by_run(files): click.echo(f" └── 📄 {os.path.basename(alignment)}") +def get_num_runs(infile, file_type): + """ + Get the number of runs in the input file. + + Supports OSW (SQLite), Parquet, Parquet split, and Parquet split multi formats. + + Args: + infile (str): Path to the input file or directory. + file_type (str): Type of input file ('osw', 'parquet', 'parquet_split', 'parquet_split_multi', 'tsv'). + + Returns: + int: Number of runs in the input file, or 0 if unable to determine. + """ + try: + if file_type == "osw": + # Query RUN table from SQLite database + if not is_sqlite_file(infile): + return 0 + + con = sqlite3.connect(infile) + try: + cursor = con.cursor() + cursor.execute("SELECT COUNT(*) FROM RUN") + num_runs = cursor.fetchone()[0] + return num_runs + except sqlite3.OperationalError: + # RUN table doesn't exist + return 0 + finally: + con.close() + + elif file_type == "parquet": + # Single parquet file - need to check RUN_ID column + if not is_parquet_file(infile): + return 0 + + try: + df = pd.read_parquet(infile, columns=["RUN_ID"]) + return df["RUN_ID"].nunique() + except Exception: + return 0 + + elif file_type == "parquet_split": + # Single-run split parquet directory + precursor_path = os.path.join(infile, "precursors_features.parquet") + if os.path.exists(precursor_path) and is_parquet_file(precursor_path): + try: + df = pd.read_parquet(precursor_path, columns=["RUN_ID"]) + return df["RUN_ID"].nunique() + except Exception: + return 0 + return 0 + + elif file_type == "parquet_split_multi": + # Multi-run split parquet directory - count .oswpq subdirectories + if not os.path.isdir(infile): + return 0 + + # Each .oswpq directory represents one run + runs = [ + d for d in os.listdir(infile) + if d.endswith(".oswpq") and os.path.isdir(os.path.join(infile, d)) + ] + return len(runs) + + elif file_type == "tsv": + return 1 + + return 0 + + except Exception as e: + logger.warning(f"Error getting number of runs from {infile}: {e}") + return 0 + + def unimod_to_codename(seq): """ Convert a sequence with unimod modifications to a codename. From 58e7a6bc9bf77113988dbc4bf376180f75066667 Mon Sep 17 00:00:00 2001 From: singjc Date: Tue, 5 May 2026 20:33:23 -0400 Subject: [PATCH 3/3] Fix subsample_ratio reference in score function and optimize get_num_runs for parquet files using DuckDB Co-authored-by: Copilot --- pyprophet/cli/score.py | 2 +- pyprophet/io/util.py | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pyprophet/cli/score.py b/pyprophet/cli/score.py index c0759090..f470e610 100644 --- a/pyprophet/cli/score.py +++ b/pyprophet/cli/score.py @@ -396,7 +396,7 @@ def score( # Validate file type and subsample ratio. OSW, parquet, parquet_split, and parquet_split_multi all support subsampling if ( config.file_type not in ["osw", "parquet", "parquet_split", "parquet_split_multi"] - and subsample_ratio < 1.0 + and config.subsample_ratio < 1.0 ): logger.warning( "Semi-supervised learning on a subset of the data, and then applying the weights to the full data is currently only supported for OSW, `parquet`, `parquet_split`, and `parquet_split_multi` files.\nFor TSV and other formats, you need to manually prepare a subsampled input file.\nSetting subsample_ratio to 1.0.", diff --git a/pyprophet/io/util.py b/pyprophet/io/util.py index 265423fd..d4f36b7a 100644 --- a/pyprophet/io/util.py +++ b/pyprophet/io/util.py @@ -497,25 +497,27 @@ def get_num_runs(infile, file_type): con.close() elif file_type == "parquet": - # Single parquet file - need to check RUN_ID column + # Single parquet file - use DuckDB for efficient streaming count if not is_parquet_file(infile): return 0 try: - df = pd.read_parquet(infile, columns=["RUN_ID"]) - return df["RUN_ID"].nunique() + con = duckdb.connect() + query = f""" + SELECT COUNT(DISTINCT RUN_ID) as num_runs + FROM read_parquet('{infile}') + """ + result = con.execute(query).fetchone() + return result[0] if result else 0 except Exception: return 0 elif file_type == "parquet_split": - # Single-run split parquet directory + # Single-run split parquet directory - by definition, there is only 1 run + # (parquet_split is validated to contain one set of run files) precursor_path = os.path.join(infile, "precursors_features.parquet") if os.path.exists(precursor_path) and is_parquet_file(precursor_path): - try: - df = pd.read_parquet(precursor_path, columns=["RUN_ID"]) - return df["RUN_ID"].nunique() - except Exception: - return 0 + return 1 return 0 elif file_type == "parquet_split_multi":