diff --git a/scripts/datasets/af_merge.py b/scripts/datasets/af_merge.py index 90169ff..0aea722 100755 --- a/scripts/datasets/af_merge.py +++ b/scripts/datasets/af_merge.py @@ -182,7 +182,7 @@ def degronopedia_af_merge(struct_name, input_path, afold_version, output_path, z ## In-house scripts -# Add SEQREF record to pdb file +# Add SEQRES records to PDB file def get_res_from_chain(pdb_path): """ @@ -221,9 +221,9 @@ def get_pdb_seqres_records(lst_res): return records -def add_refseq_record_to_pdb(path_structure): +def add_seqres_records_to_pdb(path_structure): """ - Add the SEQREF records to the pdb file. + Add SEQRES records to the PDB file. """ # Open the PDB file and get SEQRES insert index @@ -231,7 +231,7 @@ def add_refseq_record_to_pdb(path_structure): pdb_lines = file.readlines() insert_index = next(i for i, line in enumerate(pdb_lines) if line.startswith('MODEL')) - # Get seares records + # Get SEQRES records residues = get_res_from_chain(path_structure) seqres_records = get_pdb_seqres_records(residues) @@ -324,12 +324,12 @@ def merge_af_fragments(input_dir, output_dir=None, af_version=4, gzip=False): file = f"AF-{uni_id}-F{f}-model_v{af_version}.pdb{zip_ext}" shutil.move(os.path.join(input_dir, file), path_original_frag) - # Rename merged structure and add refseq records to pdb + # Rename merged structure and add SEQRES records to PDB if processed: tmp_name = os.path.join(output_dir, f"AF-{uni_id}-FM-model_v{af_version}.pdb") name = os.path.join(output_dir, f"AF-{uni_id}-F{max_f}M-model_v{af_version}.pdb") os.rename(tmp_name, name) - add_refseq_record_to_pdb(name) + add_seqres_records_to_pdb(name) if len(not_processed) > 0: logger.warning(f"Not processed: {not_processed}") diff --git a/scripts/datasets/custom_pdb.py b/scripts/datasets/custom_pdb.py index 63f5a46..81dce62 100644 --- a/scripts/datasets/custom_pdb.py +++ b/scripts/datasets/custom_pdb.py @@ -84,21 +84,25 @@ def copy_and_parse_custom_pdbs( # Handle metadata if custom_mane_metadata_path and os.path.isfile(custom_mane_metadata_path): samplesheet_df = pd.read_csv(custom_mane_metadata_path) + missing_cols = {"sequence", "aa_sequence"} - set(samplesheet_df.columns) + if missing_cols: + raise ValueError( + f"Custom MANE metadata file {custom_mane_metadata_path!r} is missing required columns: " + f"{', '.join(sorted(missing_cols))}" + ) else: if not custom_mane_metadata_path: logger.warning( - "No samplesheet path provided; skipping SEQRES insertion for accession %s", - accession + "No samplesheet path provided; skipping SEQRES insertion." ) else: logger.warning( - "samplesheet not found at '%s'; skipping SEQRES insertion for accession %s", - custom_mane_metadata_path, - accession + "samplesheet not found at '%s'; skipping SEQRES insertion.", + custom_mane_metadata_path ) samplesheet_df = None - # Copy and gzip pdb and optionally add REFSEQ + # Copy and gzip pdb and optionally add SEQRES for fname in os.listdir(src_dir): if not fname.endswith('.pdb'): continue @@ -126,7 +130,7 @@ def copy_and_parse_custom_pdbs( if accession not in samplesheet_df["sequence"].values: logger.warning("Accession %s not in samplesheet %s", accession, custom_mane_metadata_path) continue - seq = samplesheet_df[samplesheet_df["sequence"] == accession].refseq.values[0] + seq = samplesheet_df[samplesheet_df["sequence"] == accession].aa_sequence.values[0] if not pd.isna(seq): seq = [one_to_three_res_map[aa] for aa in seq] @@ -141,4 +145,4 @@ def copy_and_parse_custom_pdbs( logger.warning(f"SEQRES not found in samplesheet and its extraction from structure failed: {new_name}") except Exception as e: logger.warning(f"SEQRES not found in samplesheet and its extraction from structure failed: {new_name}") - logger.warning(f"Exception captured: {e}") \ No newline at end of file + logger.warning(f"Exception captured: {e}") diff --git a/tools/preprocessing/prepare_samplesheet.py b/tools/preprocessing/prepare_samplesheet.py index cb3401e..fedc7a0 100644 --- a/tools/preprocessing/prepare_samplesheet.py +++ b/tools/preprocessing/prepare_samplesheet.py @@ -74,8 +74,8 @@ def _download_mane_fasta(self): def _parse_ncbi_mane_fasta(self) -> pd.DataFrame: """ Parse the gzipped FASTA into a DataFrame with columns - ['sequence', 'refseq'] where 'sequence' is the Ensembl ID - (no version) and 'refseq' is the AA string. + ['sequence', 'aa_sequence'] where 'sequence' is the Ensembl ID + (no version) and 'aa_sequence' is the amino-acid string. """ if not self.fasta_gz.exists(): raise FileNotFoundError(self.fasta_gz) @@ -99,7 +99,7 @@ def _parse_ncbi_mane_fasta(self) -> pd.DataFrame: ids.append(header) seqs.append("".join(seq_parts)) - return pd.DataFrame({"sequence": ids, "refseq": seqs}) + return pd.DataFrame({"sequence": ids, "aa_sequence": seqs}) def _load_mane_af(self) -> pd.DataFrame: @@ -141,7 +141,7 @@ def write_fastas_and_update_sheet(self, df: pd.DataFrame) -> pd.DataFrame: """ df = df.copy() - df["length"] = df["refseq"].str.len() + df["length"] = df["aa_sequence"].str.len() self.fasta_dir.mkdir(exist_ok=True, parents=True) if self.no_fragments: @@ -149,7 +149,7 @@ def write_fastas_and_update_sheet(self, df: pd.DataFrame) -> pd.DataFrame: # Build fasta paths and write files fasta_paths = [] - for seq_id, seq, length in zip(df["sequence"], df["refseq"], df["length"]): + for seq_id, seq, length in zip(df["sequence"], df["aa_sequence"], df["length"]): p = self.fasta_dir / f"{seq_id}.fasta" fasta_str = f">{seq_id} | {length} aa\n{seq}\n" p.write_text(fasta_str) diff --git a/tools/preprocessing/update_samplesheet_and_structures.py b/tools/preprocessing/update_samplesheet_and_structures.py index 06b6b3f..d13aba6 100644 --- a/tools/preprocessing/update_samplesheet_and_structures.py +++ b/tools/preprocessing/update_samplesheet_and_structures.py @@ -259,7 +259,7 @@ def merge_structure_bundles( raise RuntimeError("No valid bundles provided for merging.") combined = pd.concat(merged, ignore_index=True).drop_duplicates(subset=["sequence"]) combined = attach_metadata(combined, metadata_map) - combined = attach_refseq(combined, master_samplesheet) + combined = attach_aa_sequence(combined, master_samplesheet) combined.to_csv(output_dir / "samplesheet.csv", index=False) print(f"Merged {len(merged)} bundles → {len(combined)} unique ENSP entries") return combined @@ -447,18 +447,18 @@ def attach_metadata(df: pd.DataFrame, metadata_map: Optional[pd.DataFrame]) -> p ) -def attach_refseq(df: pd.DataFrame, master_samplesheet: Optional[pd.DataFrame]) -> pd.DataFrame: - """Attach the amino-acid sequence column (`refseq`) from the master samplesheet when available.""" +def attach_aa_sequence(df: pd.DataFrame, master_samplesheet: Optional[pd.DataFrame]) -> pd.DataFrame: + """Attach the amino-acid sequence column (`aa_sequence`) from the master samplesheet when available.""" if master_samplesheet is None or df.empty: return df - if "refseq" not in master_samplesheet.columns: + if "aa_sequence" not in master_samplesheet.columns: return df - refseq_map = ( - master_samplesheet[["sequence", "refseq"]] + seq_map = ( + master_samplesheet[["sequence", "aa_sequence"]] .dropna(subset=["sequence"]) .drop_duplicates(subset=["sequence"]) ) - return df.drop(columns=["refseq"], errors="ignore").merge(refseq_map, on="sequence", how="left") + return df.drop(columns=["aa_sequence"], errors="ignore").merge(seq_map, on="sequence", how="left") def compute_fasta_lengths(fasta_paths: pd.Series) -> pd.Series: