bbglab · St3451 · Feb 15, 2026
diff --git a/scripts/datasets/af_merge.py b/scripts/datasets/af_merge.py
@@ -182,7 +182,7 @@ def degronopedia_af_merge(struct_name, input_path, afold_version, output_path, z
 
 ## In-house scripts
 
-# Add SEQREF record to pdb file
+# Add SEQRES records to PDB file
 
 def get_res_from_chain(pdb_path):
     """
@@ -221,17 +221,17 @@ def get_pdb_seqres_records(lst_res):
     return records
 
 
-def add_refseq_record_to_pdb(path_structure):
+def add_seqres_records_to_pdb(path_structure):
     """
-    Add the SEQREF records to the pdb file.
+    Add SEQRES records to the PDB file.
     """
 
     # Open the PDB file and get SEQRES insert index
     with open(path_structure, 'r') as file:
         pdb_lines = file.readlines()
         insert_index = next(i for i, line in enumerate(pdb_lines) if line.startswith('MODEL'))
 
-    # Get seares records
+    # Get SEQRES records
     residues = get_res_from_chain(path_structure)
     seqres_records = get_pdb_seqres_records(residues)
 
@@ -324,12 +324,12 @@ def merge_af_fragments(input_dir, output_dir=None, af_version=4, gzip=False):
                     file = f"AF-{uni_id}-F{f}-model_v{af_version}.pdb{zip_ext}"
                     shutil.move(os.path.join(input_dir, file), path_original_frag)
 
-                # Rename merged structure and add refseq records to pdb
+                # Rename merged structure and add SEQRES records to PDB
                 if processed:
                     tmp_name = os.path.join(output_dir, f"AF-{uni_id}-FM-model_v{af_version}.pdb")
                     name = os.path.join(output_dir, f"AF-{uni_id}-F{max_f}M-model_v{af_version}.pdb")
                     os.rename(tmp_name, name)
-                    add_refseq_record_to_pdb(name)
+                    add_seqres_records_to_pdb(name)
 
             if len(not_processed) > 0:
                 logger.warning(f"Not processed: {not_processed}")

diff --git a/scripts/datasets/custom_pdb.py b/scripts/datasets/custom_pdb.py
@@ -84,21 +84,25 @@ def copy_and_parse_custom_pdbs(
     # Handle metadata
     if custom_mane_metadata_path and os.path.isfile(custom_mane_metadata_path):
         samplesheet_df = pd.read_csv(custom_mane_metadata_path)
+        missing_cols = {"sequence", "aa_sequence"} - set(samplesheet_df.columns)
+        if missing_cols:
+            raise ValueError(
+                f"Custom MANE metadata file {custom_mane_metadata_path!r} is missing required columns: "
+                f"{', '.join(sorted(missing_cols))}"
+            )
     else:
         if not custom_mane_metadata_path:
             logger.warning(
-                "No samplesheet path provided; skipping SEQRES insertion for accession %s",
-                accession
+                "No samplesheet path provided; skipping SEQRES insertion."
             )
         else:
             logger.warning(
-                "samplesheet not found at '%s'; skipping SEQRES insertion for accession %s",
-                custom_mane_metadata_path,
-                accession
+                "samplesheet not found at '%s'; skipping SEQRES insertion.",
+                custom_mane_metadata_path
             )
         samplesheet_df = None
 
-    # Copy and gzip pdb and optionally add REFSEQ
+    # Copy and gzip pdb and optionally add SEQRES
     for fname in os.listdir(src_dir):
         if not fname.endswith('.pdb'):
             continue
@@ -126,7 +130,7 @@ def copy_and_parse_custom_pdbs(
             if accession not in samplesheet_df["sequence"].values:
                 logger.warning("Accession %s not in samplesheet %s", accession, custom_mane_metadata_path)
                 continue
-            seq = samplesheet_df[samplesheet_df["sequence"] == accession].refseq.values[0]
+            seq = samplesheet_df[samplesheet_df["sequence"] == accession].aa_sequence.values[0]
 
             if not pd.isna(seq):
                 seq = [one_to_three_res_map[aa] for aa in seq]
@@ -141,4 +145,4 @@ def copy_and_parse_custom_pdbs(
                         logger.warning(f"SEQRES not found in samplesheet and its extraction from structure failed: {new_name}")
                 except Exception as e:
                     logger.warning(f"SEQRES not found in samplesheet and its extraction from structure failed: {new_name}")
-                    logger.warning(f"Exception captured: {e}")
+                    logger.warning(f"Exception captured: {e}")
diff --git a/tools/preprocessing/prepare_samplesheet.py b/tools/preprocessing/prepare_samplesheet.py
@@ -74,8 +74,8 @@ def _download_mane_fasta(self):
     def _parse_ncbi_mane_fasta(self) -> pd.DataFrame:
         """
         Parse the gzipped FASTA into a DataFrame with columns
-        ['sequence', 'refseq'] where 'sequence' is the Ensembl ID
-        (no version) and 'refseq' is the AA string.
+        ['sequence', 'aa_sequence'] where 'sequence' is the Ensembl ID
+        (no version) and 'aa_sequence' is the amino-acid string.
         """
         if not self.fasta_gz.exists():
             raise FileNotFoundError(self.fasta_gz)
@@ -99,7 +99,7 @@ def _parse_ncbi_mane_fasta(self) -> pd.DataFrame:
                 ids.append(header)
                 seqs.append("".join(seq_parts))
 
-        return pd.DataFrame({"sequence": ids, "refseq": seqs})
+        return pd.DataFrame({"sequence": ids, "aa_sequence": seqs})
 
 
     def _load_mane_af(self) -> pd.DataFrame:
@@ -141,15 +141,15 @@ def write_fastas_and_update_sheet(self, df: pd.DataFrame) -> pd.DataFrame:
         """
 
         df = df.copy()
-        df["length"] = df["refseq"].str.len()
+        df["length"] = df["aa_sequence"].str.len()
         self.fasta_dir.mkdir(exist_ok=True, parents=True)
 
         if self.no_fragments:
             df = df[df["length"] <= 2700].reset_index(drop=True)
 
         # Build fasta paths and write files
         fasta_paths = []
-        for seq_id, seq, length in zip(df["sequence"], df["refseq"], df["length"]):
+        for seq_id, seq, length in zip(df["sequence"], df["aa_sequence"], df["length"]):
             p = self.fasta_dir / f"{seq_id}.fasta"
             fasta_str = f">{seq_id} | {length} aa\n{seq}\n"
             p.write_text(fasta_str)

diff --git a/tools/preprocessing/update_samplesheet_and_structures.py b/tools/preprocessing/update_samplesheet_and_structures.py
@@ -259,7 +259,7 @@ def merge_structure_bundles(
         raise RuntimeError("No valid bundles provided for merging.")
     combined = pd.concat(merged, ignore_index=True).drop_duplicates(subset=["sequence"])
     combined = attach_metadata(combined, metadata_map)
-    combined = attach_refseq(combined, master_samplesheet)
+    combined = attach_aa_sequence(combined, master_samplesheet)
     combined.to_csv(output_dir / "samplesheet.csv", index=False)
     print(f"Merged {len(merged)} bundles → {len(combined)} unique ENSP entries")
     return combined
@@ -447,18 +447,18 @@ def attach_metadata(df: pd.DataFrame, metadata_map: Optional[pd.DataFrame]) -> p
     )
 
 
-def attach_refseq(df: pd.DataFrame, master_samplesheet: Optional[pd.DataFrame]) -> pd.DataFrame:
-    """Attach the amino-acid sequence column (`refseq`) from the master samplesheet when available."""
+def attach_aa_sequence(df: pd.DataFrame, master_samplesheet: Optional[pd.DataFrame]) -> pd.DataFrame:
+    """Attach the amino-acid sequence column (`aa_sequence`) from the master samplesheet when available."""
     if master_samplesheet is None or df.empty:
         return df
-    if "refseq" not in master_samplesheet.columns:
+    if "aa_sequence" not in master_samplesheet.columns:
         return df
-    refseq_map = (
-        master_samplesheet[["sequence", "refseq"]]
+    seq_map = (
+        master_samplesheet[["sequence", "aa_sequence"]]
         .dropna(subset=["sequence"])
         .drop_duplicates(subset=["sequence"])
     )
-    return df.drop(columns=["refseq"], errors="ignore").merge(refseq_map, on="sequence", how="left")
+    return df.drop(columns=["aa_sequence"], errors="ignore").merge(seq_map, on="sequence", how="left")
 
 
 def compute_fasta_lengths(fasta_paths: pd.Series) -> pd.Series: