From e8e197471c415cab6dfe3ea567e84887803c65ab Mon Sep 17 00:00:00 2001
From: singjc <justincsing@gmail.com>
Date: Tue, 5 May 2026 20:05:15 -0400
Subject: [PATCH 1/3] Refactor SQL queries in OSWReader to improve alignment
 feature extraction and ensure correct PEP threshold comparison

Co-authored-by: Copilot <copilot@github.com>
---
 pyprophet/io/ipf/osw.py | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/pyprophet/io/ipf/osw.py b/pyprophet/io/ipf/osw.py
index 41bae28a..cd8515c7 100644
--- a/pyprophet/io/ipf/osw.py
+++ b/pyprophet/io/ipf/osw.py
@@ -335,12 +335,12 @@ def _fetch_alignment_features_duckdb(self, con):
                 WHERE fma.LABEL = 1
                 AND fma.REFERENCE_FEATURE_ID != fma.ALIGNED_FEATURE_ID
             ) AS merged
-            LEFT JOIN (
+            INNER JOIN (
                 SELECT 
                     FEATURE_ID,
                     MIN(PEP) AS pep
                 FROM osw.SCORE_ALIGNMENT
-                WHERE PEP <= {pep_threshold}
+                WHERE PEP < {pep_threshold}
                 GROUP BY FEATURE_ID
             ) AS sa
             ON merged.FEATURE_ID = sa.FEATURE_ID
@@ -545,18 +545,32 @@ def _fetch_alignment_features_sqlite(self, con):
         query = f"""
             SELECT  
                 DENSE_RANK() OVER (ORDER BY PRECURSOR_ID, ALIGNMENT_ID) AS ALIGNMENT_GROUP_ID,
-                ALIGNED_FEATURE_ID AS FEATURE_ID 
+                FEATURE_ID 
             FROM (
-                SELECT DISTINCT * FROM FEATURE_MS2_ALIGNMENT
-            ) AS FEATURE_MS2_ALIGNMENT
+                SELECT DISTINCT
+                    ALIGNMENT_ID,
+                    PRECURSOR_ID,
+                    REFERENCE_FEATURE_ID AS FEATURE_ID
+                FROM FEATURE_MS2_ALIGNMENT
+                WHERE LABEL = 1
+                AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID
+                
+                UNION
+                
+                SELECT DISTINCT
+                    ALIGNMENT_ID,
+                    PRECURSOR_ID,
+                    ALIGNED_FEATURE_ID AS FEATURE_ID
+                FROM FEATURE_MS2_ALIGNMENT
+                WHERE LABEL = 1
+                AND REFERENCE_FEATURE_ID != ALIGNED_FEATURE_ID
+            ) AS feature_list
             INNER JOIN (
-                SELECT DISTINCT *, MIN(QVALUE) 
+                SELECT DISTINCT FEATURE_ID
                 FROM SCORE_ALIGNMENT 
-                GROUP BY FEATURE_ID
-            ) AS SCORE_ALIGNMENT 
-            ON SCORE_ALIGNMENT.FEATURE_ID = FEATURE_MS2_ALIGNMENT.ALIGNED_FEATURE_ID
-            WHERE LABEL = 1
-            AND SCORE_ALIGNMENT.PEP < {pep_threshold}
+                WHERE PEP < {pep_threshold}
+            ) AS good_alignments 
+            ON good_alignments.FEATURE_ID = feature_list.FEATURE_ID
             ORDER BY ALIGNMENT_GROUP_ID
         """
 

From 16915fb506c3c5c172d6179587014743e477c3b5 Mon Sep 17 00:00:00 2001
From: singjc <justincsing@gmail.com>
Date: Tue, 5 May 2026 20:18:06 -0400
Subject: [PATCH 2/3] Add auto-subsampling feature based on number of runs and
 implement get_num_runs utility function

Co-authored-by: Copilot <copilot@github.com>
---
 pyprophet/cli/score.py | 22 ++++++++++++-
 pyprophet/io/util.py   | 75 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/pyprophet/cli/score.py b/pyprophet/cli/score.py
index aff5503a..c0759090 100644
--- a/pyprophet/cli/score.py
+++ b/pyprophet/cli/score.py
@@ -12,6 +12,7 @@
     memray_profile,
 )
 from .._config import RunnerIOConfig
+from ..io.util import get_num_runs
 # Defer import of runner to avoid premature sklearn import before OMP_NUM_THREADS is set
 # from ..scoring.runner import PyProphetLearner, PyProphetWeightApplier
 
@@ -37,7 +38,7 @@
     default=1.0,
     show_default=True,
     type=float,
-    help="Subsampling ratio for large data. Use <1.0 to subsample precursors for semi-supervised learning, the learned weights will then be applied to the full data set.",
+    help="Subsampling ratio for large data. Use <1.0 to subsample precursors for semi-supervised learning, the learned weights will then be applied to the full data set. When set to 1.0 (default) and the input has >20 runs, auto-subsampling to 1/N is applied (N=number of runs). Set to -1.0 to disable auto-subsampling and use full data.",
 )
 # Semi-supervised learning
 @click.option(
@@ -373,6 +374,25 @@ def score(
         ctx.obj["LOG_HEADER"],
     )
 
+    # Auto-subsample based on number of runs if applicable
+    if subsample_ratio == 1.0:
+        # Check if we should auto-subsample
+        num_runs = get_num_runs(infile, config.file_type)
+        if num_runs > 20:
+            config.subsample_ratio = 1.0 / num_runs
+            logger.info(
+                f"Auto-subsampling enabled: {num_runs} runs detected. "
+                f"Setting subsample_ratio to 1/{num_runs} = {config.subsample_ratio:.4f} "
+                f"for efficient semi-supervised learning. Use --subsample_ratio -1 to disable auto-subsampling."
+            )
+    elif subsample_ratio == -1.0:
+        # User explicitly disabled auto-subsampling
+        config.subsample_ratio = 1.0
+        logger.info(
+            "Auto-subsampling disabled (subsample_ratio set to -1.0). "
+            "Using full dataset for semi-supervised learning."
+        )
+
     # Validate file type and subsample ratio. OSW, parquet, parquet_split, and parquet_split_multi all support subsampling
     if (
         config.file_type not in ["osw", "parquet", "parquet_split", "parquet_split_multi"]
diff --git a/pyprophet/io/util.py b/pyprophet/io/util.py
index 9cce1d95..265423fd 100644
--- a/pyprophet/io/util.py
+++ b/pyprophet/io/util.py
@@ -465,6 +465,81 @@ def group_by_run(files):
         click.echo(f"    └── 📄 {os.path.basename(alignment)}")
 
 
+def get_num_runs(infile, file_type):
+    """
+    Get the number of runs in the input file.
+
+    Handles OSW (SQLite), Parquet, Parquet split, Parquet split multi, and TSV (always 1 run).
+
+    Args:
+        infile (str): Path to the input file or directory.
+        file_type (str): Type of input file ('osw', 'parquet', 'parquet_split', 'parquet_split_multi', 'tsv').
+
+    Returns:
+        int: Number of runs in the input file, or 0 if unable to determine.
+    """
+    try:
+        if file_type == "osw":
+            # Query RUN table from SQLite database
+            if not is_sqlite_file(infile):
+                return 0
+
+            con = sqlite3.connect(infile)
+            try:
+                cursor = con.cursor()
+                cursor.execute("SELECT COUNT(*) FROM RUN")
+                num_runs = cursor.fetchone()[0]
+                return num_runs
+            except sqlite3.OperationalError:
+                # RUN table doesn't exist
+                return 0
+            finally:
+                con.close()
+
+        elif file_type == "parquet":
+            # Single parquet file - need to check RUN_ID column
+            if not is_parquet_file(infile):
+                return 0
+
+            try:
+                df = pd.read_parquet(infile, columns=["RUN_ID"])
+                return df["RUN_ID"].nunique()
+            except Exception:
+                return 0
+
+        elif file_type == "parquet_split":
+            # Single-run split parquet directory
+            precursor_path = os.path.join(infile, "precursors_features.parquet")
+            if os.path.exists(precursor_path) and is_parquet_file(precursor_path):
+                try:
+                    df = pd.read_parquet(precursor_path, columns=["RUN_ID"])
+                    return df["RUN_ID"].nunique()
+                except Exception:
+                    return 0
+            return 0
+
+        elif file_type == "parquet_split_multi":
+            # Multi-run split parquet directory - count .oswpq subdirectories
+            if not os.path.isdir(infile):
+                return 0
+
+            # Each .oswpq directory represents one run
+            runs = [
+                d for d in os.listdir(infile)
+                if d.endswith(".oswpq") and os.path.isdir(os.path.join(infile, d))
+            ]
+            return len(runs)
+
+        elif file_type == "tsv":
+            return 1
+
+        return 0
+
+    except Exception as e:
+        logger.warning(f"Error getting number of runs from {infile}: {e}")
+        return 0
+
+
 def unimod_to_codename(seq):
     """
     Convert a sequence with unimod modifications to a codename.

From 58e7a6bc9bf77113988dbc4bf376180f75066667 Mon Sep 17 00:00:00 2001
From: singjc <justincsing@gmail.com>
Date: Tue, 5 May 2026 20:33:23 -0400
Subject: [PATCH 3/3] Fix subsample_ratio reference in score function and
 optimize get_num_runs for parquet files using DuckDB

Co-authored-by: Copilot <copilot@github.com>
---
 pyprophet/cli/score.py |  2 +-
 pyprophet/io/util.py   | 20 +++++++++++---------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/pyprophet/cli/score.py b/pyprophet/cli/score.py
index c0759090..f470e610 100644
--- a/pyprophet/cli/score.py
+++ b/pyprophet/cli/score.py
@@ -396,7 +396,7 @@ def score(
     # Validate file type and subsample ratio. OSW, parquet, parquet_split, and parquet_split_multi all support subsampling
     if (
         config.file_type not in ["osw", "parquet", "parquet_split", "parquet_split_multi"]
-        and subsample_ratio < 1.0
+        and config.subsample_ratio < 1.0
     ):
         logger.warning(
             "Semi-supervised learning on a subset of the data, and then applying the weights to the full data is currently only supported for OSW, `parquet`, `parquet_split`, and `parquet_split_multi` files.\nFor TSV and other formats, you need to manually prepare a subsampled input file.\nSetting subsample_ratio to 1.0.",
diff --git a/pyprophet/io/util.py b/pyprophet/io/util.py
index 265423fd..d4f36b7a 100644
--- a/pyprophet/io/util.py
+++ b/pyprophet/io/util.py
@@ -497,25 +497,27 @@ def get_num_runs(infile, file_type):
                 con.close()
 
         elif file_type == "parquet":
-            # Single parquet file - need to check RUN_ID column
+            # Single parquet file - use DuckDB for efficient streaming count
             if not is_parquet_file(infile):
                 return 0
 
             try:
-                df = pd.read_parquet(infile, columns=["RUN_ID"])
-                return df["RUN_ID"].nunique()
+                # Bind the path as a query parameter so that a quote in the file
+                # name cannot alter the SQL, and let the context manager close
+                # the in-memory DuckDB connection once the count is fetched.
+                sql = "SELECT COUNT(DISTINCT RUN_ID) AS num_runs FROM read_parquet(?)"
+                with duckdb.connect() as con:
+                    result = con.execute(sql, [str(infile)]).fetchone()
+                return result[0] if result else 0
             except Exception:
                 return 0
 
         elif file_type == "parquet_split":
-            # Single-run split parquet directory
+            # Single-run split parquet directory - by definition, there is only 1 run
+            # (parquet_split is validated to contain one set of run files)
             precursor_path = os.path.join(infile, "precursors_features.parquet")
             if os.path.exists(precursor_path) and is_parquet_file(precursor_path):
-                try:
-                    df = pd.read_parquet(precursor_path, columns=["RUN_ID"])
-                    return df["RUN_ID"].nunique()
-                except Exception:
-                    return 0
+                return 1
             return 0
 
         elif file_type == "parquet_split_multi":