Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ keywords = [
"proteomics",
"mass-spectrometry",
"data-analysis",
"big data"
"big data",
"sdrf",
"sample-metadata",
"proteomics-pipeline"
]
classifiers = [
"Intended Audience :: Science/Research",
Expand All @@ -34,10 +37,11 @@ dependencies = [
]

[project.urls]
Homepage = "https://quantms.org"
Documentation = "https://docs.quantms.org"
GitHub = "https://github.com/bigbio/quantms-utils"
PyPi = "https://pypi.org/project/quantms-utils/"
Quantms = "https://quantms.org"
LICENSE = "https://github.com/bigbio/quantms-utils/blob/main/LICENSE"
"Bug Tracker" = "https://github.com/bigbio/quantms-utils/issues"
PyPI = "https://pypi.org/project/quantms-utils/"

[project.scripts]
quantmsutilsc = "quantmsutils.quantmsutilsc:main"
Expand Down
15 changes: 12 additions & 3 deletions quantmsutils/diann/diann2msstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def _parse_unified_design(exp_design_file):

# Validate required columns
required_cols = {
"Filename", "Fraction", "Sample", "Condition", "BioReplicate", "Label", "LabelType"
"Filename", "Fraction", "Sample", "Condition", "BioReplicate"
}

missing = required_cols - set(df.columns)
Expand All @@ -179,16 +179,25 @@ def _parse_unified_design(exp_design_file):
df["run"] = df["Filename"].apply(_true_stem)

# Multiplexing
if df["Label"].nunique() > 1:
if "Label" in df.columns and df["Label"].nunique() > 1:
labels_lower = df["Label"].astype(str).str.lower()

if "silac" in df["LabelType"].values:
if labels_lower.str.contains("silac").any():
silac_dict = {
"SILAC light": "L",
"SILAC medium": "M",
"SILAC heavy": "H",
}
df["Label"] = df["Label"].replace(silac_dict)

if labels_lower.str.contains("mtraq").any():
mtraq_dict = {
"MTRAQ0": "0",
"MTRAQ4": "4",
"MTRAQ8": "8",
}
df["Label"] = df["Label"].replace(mtraq_dict)
Comment on lines +185 to +199
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Search for Label column values in test design files to confirm expected casing
rg -n --type tsv "mTRAQ\|MTRAQ\|SILAC\|silac" --glob "*.tsv"
# Also check any usage in Python source that constructs or references these label strings
rg -n "mTRAQ\|MTRAQ\|SILAC light\|silac" --type py

Repository: bigbio/quantms-utils

Length of output: 92


🏁 Script executed:

#!/bin/bash
# Search for actual label values in TSV files (design files)
find . -name "*.tsv" -type f | head -20

Repository: bigbio/quantms-utils

Length of output: 312


🏁 Script executed:

#!/bin/bash
# Search for Label column values in any data files
rg -i "mTRAQ|MTRAQ|SILAC" --glob "*.tsv"

Repository: bigbio/quantms-utils

Length of output: 46


🏁 Script executed:

#!/bin/bash
# Check Python source for label string construction/references
rg "mTRAQ|MTRAQ|SILAC|Label" -A 2 -B 2 --type py quantmsutils/diann/

Repository: bigbio/quantms-utils

Length of output: 4163


🏁 Script executed:

#!/bin/bash
# Check the actual code context around lines 185-199
cat -n quantmsutils/diann/diann2msstats.py | sed -n '175,210p'

Repository: bigbio/quantms-utils

Length of output: 1731


🏁 Script executed:

#!/bin/bash
# Check test design files for Label column content
echo "=== PXD026600_diann_design.tsv ===" 
head -5 ./tests/test_data/diann2msstats/PXD026600_diann_design.tsv | cat -v

echo -e "\n=== PXD026600.sdrf_openms_design.tsv ===" 
head -5 ./tests/test_data/diann2msstats/PXD026600.sdrf_openms_design.tsv | cat -v

echo -e "\n=== Check for any mTRAQ or MTRAQ in test files ===" 
rg -i "label" ./tests/test_data/diann2msstats/*.tsv | head -20

Repository: bigbio/quantms-utils

Length of output: 4533


🏁 Script executed:

#!/bin/bash
# Look at full test design file structure
wc -l ./tests/test_data/diann2msstats/*.tsv

Repository: bigbio/quantms-utils

Length of output: 279


🏁 Script executed:

#!/bin/bash
# Check convert-diann module to see what label values it produces
find . -name "*convert*diann*" -o -name "*diann*convert*" | grep -i py
rg "convert.*diann\|diann.*convert" -l --type py

Repository: bigbio/quantms-utils

Length of output: 46


🏁 Script executed:

#!/bin/bash
# Search for "convert" and "diann" in module names and imports
find . -type f -name "*.py" | xargs grep -l "convert.*diann\|quantmsdiann" | head -20

Repository: bigbio/quantms-utils

Length of output: 168


🏁 Script executed:

#!/bin/bash
# Search broadly for SILAC and mTRAQ in entire repo
rg -i "silac.*light|mtraq0|mtraq4|mtraq8" -B 2 -A 2

Repository: bigbio/quantms-utils

Length of output: 933


🏁 Script executed:

#!/bin/bash
# Check what modules exist under quantmsutils/
ls -la quantmsutils/

Repository: bigbio/quantms-utils

Length of output: 633


🏁 Script executed:

#!/bin/bash
# Look at the import statements and references in diann2msstats to understand data flow
head -50 quantmsutils/diann/diann2msstats.py

Repository: bigbio/quantms-utils

Length of output: 1577


🏁 Script executed:

#!/bin/bash
# Search for where Label column gets populated in convert-diann or related modules
rg "Label.*=.*mTRAQ|Label.*=.*SILAC|\"mTRAQ|\"MTRAQ|\"SILAC" --type py -B 3 -A 3

Repository: bigbio/quantms-utils

Length of output: 1640


🏁 Script executed:

#!/bin/bash
# Check if there's any test or documentation about multiplexed designs
find . -path ./node_modules -prune -o -name "*.md" -type f -exec grep -l "mTRAQ\|SILAC\|multiplex" {} \;

Repository: bigbio/quantms-utils

Length of output: 46


🏁 Script executed:

#!/bin/bash
# Search for actual usage or test code that calls diann2msstats
grep -r "diann2msstats\|exp_design" tests/ --include="*.py" | head -20

Repository: bigbio/quantms-utils

Length of output: 1636


🏁 Script executed:

#!/bin/bash
# Check dianncfg.py to see how it generates or handles Label values for multiplexed experiments
cat -n quantmsutils/diann/dianncfg.py | grep -A 10 -B 10 "mTRAQ"

Repository: bigbio/quantms-utils

Length of output: 871


🏁 Script executed:

#!/bin/bash
# Search for any conversion or generation of Label column values with mTRAQ/SILAC
rg "Channel.*mTRAQ|mTRAQ.*Channel|Label.*:" quantmsutils/diann/ --type py -B 2 -A 2

Repository: bigbio/quantms-utils

Length of output: 1521


🏁 Script executed:

#!/bin/bash
# Check if there's documentation or comments about expected label format
rg -i "label.*format|mtraq.*label|silac.*label" quantmsutils/ --type py -B 1 -A 1

Repository: bigbio/quantms-utils

Length of output: 46


🏁 Script executed:

#!/bin/bash
# Get more context around the dianncfg.py TODO to understand current multiplex support status
cat -n quantmsutils/diann/dianncfg.py | sed -n '95,125p'

Repository: bigbio/quantms-utils

Length of output: 1338


🏁 Script executed:

#!/bin/bash
# Check if there are any other scripts or workflows that might generate Label columns with mTRAQ/SILAC
find . -name "*.py" -type f -exec grep -l "mTRAQ0\|MTRAQ0\|SILAC light" {} \;

Repository: bigbio/quantms-utils

Length of output: 102


🏁 Script executed:

#!/bin/bash
# Examine the entire get_exp_design_dfs function to understand data flow
rg -A 80 "def get_exp_design_dfs" quantmsutils/diann/diann2msstats.py

Repository: bigbio/quantms-utils

Length of output: 3222


Case-sensitive replacement will silently no-op for mTRAQ labels once multiplex support is enabled.

The multiplexing code (lines 185-199) currently appears unreachable: dianncfg.py (line 106) marks multiplex support as a TODO and explicitly rejects multiplexed experiments (mTRAQ, TMT, iTRAQ, Dimethyl modifications) with an error message. However, the bug is real and will manifest once this feature is implemented.

When enabled, labels_lower holds the lowercased values (for detection), but both df["Label"].replace(silac_dict) and df["Label"].replace(mtraq_dict) match against the original casing of df["Label"].

  • SILAC: keys "SILAC light"/"SILAC medium"/"SILAC heavy" match the SDRF convention exactly, so the replacement works — but only when the upstream file uses that exact casing.
  • mTRAQ: keys "MTRAQ0"/"MTRAQ4"/"MTRAQ8" are all-caps, but the standard notation used throughout the codebase (e.g., dianncfg.py) is "mTRAQ" (lowercase m). Once multiplex support is added and design files contain "mTRAQ0", the replacement will silently no-op — detection triggers, replacement fails to match, df["Label"] retains "mTRAQ0", and the downstream merge against DIA-NN's Channel values ("0"/"4"/"8") produces all-NaN rows that are dropped, yielding an empty MSstats output.

Fix: run the replacement on labels_lower (already lowercase) using lowercase dict keys to ensure case-insensitive matching.

🐛 Proposed fix
     labels_lower = df["Label"].astype(str).str.lower()

     if labels_lower.str.contains("silac").any():
         silac_dict = {
-            "SILAC light": "L",
-            "SILAC medium": "M",
-            "SILAC heavy": "H",
+            "silac light": "L",
+            "silac medium": "M",
+            "silac heavy": "H",
         }
-        df["Label"] = df["Label"].replace(silac_dict)
+        df["Label"] = labels_lower.map(silac_dict).fillna(df["Label"])

     if labels_lower.str.contains("mtraq").any():
         mtraq_dict = {
-            "MTRAQ0": "0",
-            "MTRAQ4": "4",
-            "MTRAQ8": "8",
+            "mtraq0": "0",
+            "mtraq4": "4",
+            "mtraq8": "8",
         }
-        df["Label"] = df["Label"].replace(mtraq_dict)
+        df["Label"] = labels_lower.map(mtraq_dict).fillna(df["Label"])

Using labels_lower.map(dict).fillna(df["Label"]) performs case-insensitive lookup while preserving the original value for labels that do not match any key.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@quantmsutils/diann/diann2msstats.py` around lines 185 - 199, the label
replacement is case-sensitive: detection uses the lowercased labels_lower, but
df["Label"].replace(...) matches against the original casing of df["Label"].
For both silac_dict and mtraq_dict, change the dict keys to lowercase and
perform the lookup against labels_lower (which is already lowercased), then
write the mapped values back to df["Label"] while preserving the original value
wherever no key matches (e.g., use
df["Label"] = labels_lower.map(lowercased_dict).fillna(df["Label"]) or an
equivalent), so that the replacement is case-insensitive.


f_table = df[["Filename", "Fraction", "Sample", "run", "Label"]].copy()
else:
f_table = df[["Filename", "Fraction", "Sample", "run"]].copy()
Expand Down
6 changes: 3 additions & 3 deletions tests/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,9 +378,9 @@ def test_unified_format_validates_sample_consistency(self):
with tempfile.TemporaryDirectory() as tmpdir:
bad_file = os.path.join(tmpdir, "inconsistent_design.tsv")
with open(bad_file, "w") as f:
f.write("Filename\tSample\tFraction\tCondition\tBioReplicate\n")
f.write("file1.raw\t1\t1\tCondA\t1\n")
f.write("file2.raw\t1\t1\tCondB\t2\n") # Same Sample, different Condition
f.write("Filename\tSample\tFraction\tCondition\tBioReplicate\tLabel\tLabelType\n")
f.write("file1.raw\t1\t1\tCondA\t1\tlabel free sample\tlabel free\n")
f.write("file2.raw\t1\t1\tCondB\t2\tlabel free sample\tlabel free\n") # Same Sample, different Condition
with pytest.raises(ValueError, match="Inconsistent"):
get_exp_design_dfs(bad_file)

Expand Down
Loading