Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions scripts/datasets/af_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ def degronopedia_af_merge(struct_name, input_path, afold_version, output_path, z

## In-house scripts

# Add SEQREF record to pdb file
# Add SEQRES records to PDB file

def get_res_from_chain(pdb_path):
"""
Expand Down Expand Up @@ -221,17 +221,17 @@ def get_pdb_seqres_records(lst_res):
return records


def add_refseq_record_to_pdb(path_structure):
def add_seqres_records_to_pdb(path_structure):
"""
Add the SEQREF records to the pdb file.
Add SEQRES records to the PDB file.
"""

# Open the PDB file and get SEQRES insert index
with open(path_structure, 'r') as file:
pdb_lines = file.readlines()
insert_index = next(i for i, line in enumerate(pdb_lines) if line.startswith('MODEL'))

# Get seares records
# Get SEQRES records
residues = get_res_from_chain(path_structure)
seqres_records = get_pdb_seqres_records(residues)

Expand Down Expand Up @@ -324,12 +324,12 @@ def merge_af_fragments(input_dir, output_dir=None, af_version=4, gzip=False):
file = f"AF-{uni_id}-F{f}-model_v{af_version}.pdb{zip_ext}"
shutil.move(os.path.join(input_dir, file), path_original_frag)

# Rename merged structure and add refseq records to pdb
# Rename merged structure and add SEQRES records to PDB
if processed:
tmp_name = os.path.join(output_dir, f"AF-{uni_id}-FM-model_v{af_version}.pdb")
name = os.path.join(output_dir, f"AF-{uni_id}-F{max_f}M-model_v{af_version}.pdb")
os.rename(tmp_name, name)
add_refseq_record_to_pdb(name)
add_seqres_records_to_pdb(name)

if len(not_processed) > 0:
logger.warning(f"Not processed: {not_processed}")
Expand Down
20 changes: 12 additions & 8 deletions scripts/datasets/custom_pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,21 +84,25 @@ def copy_and_parse_custom_pdbs(
# Handle metadata
if custom_mane_metadata_path and os.path.isfile(custom_mane_metadata_path):
samplesheet_df = pd.read_csv(custom_mane_metadata_path)
missing_cols = {"sequence", "aa_sequence"} - set(samplesheet_df.columns)
if missing_cols:
raise ValueError(
f"Custom MANE metadata file {custom_mane_metadata_path!r} is missing required columns: "
f"{', '.join(sorted(missing_cols))}"
)
else:
if not custom_mane_metadata_path:
logger.warning(
"No samplesheet path provided; skipping SEQRES insertion for accession %s",
accession
"No samplesheet path provided; skipping SEQRES insertion."
)
else:
logger.warning(
"samplesheet not found at '%s'; skipping SEQRES insertion for accession %s",
custom_mane_metadata_path,
accession
"samplesheet not found at '%s'; skipping SEQRES insertion.",
custom_mane_metadata_path
)
samplesheet_df = None

# Copy and gzip pdb and optionally add REFSEQ
# Copy and gzip pdb and optionally add SEQRES
for fname in os.listdir(src_dir):
if not fname.endswith('.pdb'):
continue
Expand Down Expand Up @@ -126,7 +130,7 @@ def copy_and_parse_custom_pdbs(
if accession not in samplesheet_df["sequence"].values:
logger.warning("Accession %s not in samplesheet %s", accession, custom_mane_metadata_path)
continue
seq = samplesheet_df[samplesheet_df["sequence"] == accession].refseq.values[0]
seq = samplesheet_df[samplesheet_df["sequence"] == accession].aa_sequence.values[0]

if not pd.isna(seq):
seq = [one_to_three_res_map[aa] for aa in seq]
Expand All @@ -141,4 +145,4 @@ def copy_and_parse_custom_pdbs(
logger.warning(f"SEQRES not found in samplesheet and its extraction from structure failed: {new_name}")
except Exception as e:
logger.warning(f"SEQRES not found in samplesheet and its extraction from structure failed: {new_name}")
logger.warning(f"Exception captured: {e}")
logger.warning(f"Exception captured: {e}")
10 changes: 5 additions & 5 deletions tools/preprocessing/prepare_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ def _download_mane_fasta(self):
def _parse_ncbi_mane_fasta(self) -> pd.DataFrame:
"""
Parse the gzipped FASTA into a DataFrame with columns
['sequence', 'refseq'] where 'sequence' is the Ensembl ID
(no version) and 'refseq' is the AA string.
['sequence', 'aa_sequence'] where 'sequence' is the Ensembl ID
(no version) and 'aa_sequence' is the amino-acid string.
"""
if not self.fasta_gz.exists():
raise FileNotFoundError(self.fasta_gz)
Expand All @@ -99,7 +99,7 @@ def _parse_ncbi_mane_fasta(self) -> pd.DataFrame:
ids.append(header)
seqs.append("".join(seq_parts))

return pd.DataFrame({"sequence": ids, "refseq": seqs})
return pd.DataFrame({"sequence": ids, "aa_sequence": seqs})


def _load_mane_af(self) -> pd.DataFrame:
Expand Down Expand Up @@ -141,15 +141,15 @@ def write_fastas_and_update_sheet(self, df: pd.DataFrame) -> pd.DataFrame:
"""

df = df.copy()
df["length"] = df["refseq"].str.len()
df["length"] = df["aa_sequence"].str.len()
self.fasta_dir.mkdir(exist_ok=True, parents=True)

if self.no_fragments:
df = df[df["length"] <= 2700].reset_index(drop=True)

# Build fasta paths and write files
fasta_paths = []
for seq_id, seq, length in zip(df["sequence"], df["refseq"], df["length"]):
for seq_id, seq, length in zip(df["sequence"], df["aa_sequence"], df["length"]):
p = self.fasta_dir / f"{seq_id}.fasta"
fasta_str = f">{seq_id} | {length} aa\n{seq}\n"
p.write_text(fasta_str)
Expand Down
14 changes: 7 additions & 7 deletions tools/preprocessing/update_samplesheet_and_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def merge_structure_bundles(
raise RuntimeError("No valid bundles provided for merging.")
combined = pd.concat(merged, ignore_index=True).drop_duplicates(subset=["sequence"])
combined = attach_metadata(combined, metadata_map)
combined = attach_refseq(combined, master_samplesheet)
combined = attach_aa_sequence(combined, master_samplesheet)
combined.to_csv(output_dir / "samplesheet.csv", index=False)
print(f"Merged {len(merged)} bundles → {len(combined)} unique ENSP entries")
return combined
Expand Down Expand Up @@ -447,18 +447,18 @@ def attach_metadata(df: pd.DataFrame, metadata_map: Optional[pd.DataFrame]) -> p
)


def attach_refseq(df: pd.DataFrame, master_samplesheet: Optional[pd.DataFrame]) -> pd.DataFrame:
"""Attach the amino-acid sequence column (`refseq`) from the master samplesheet when available."""
def attach_aa_sequence(df: pd.DataFrame, master_samplesheet: Optional[pd.DataFrame]) -> pd.DataFrame:
"""Attach the amino-acid sequence column (`aa_sequence`) from the master samplesheet when available."""
if master_samplesheet is None or df.empty:
return df
if "refseq" not in master_samplesheet.columns:
if "aa_sequence" not in master_samplesheet.columns:
return df
refseq_map = (
master_samplesheet[["sequence", "refseq"]]
seq_map = (
master_samplesheet[["sequence", "aa_sequence"]]
.dropna(subset=["sequence"])
.drop_duplicates(subset=["sequence"])
)
return df.drop(columns=["refseq"], errors="ignore").merge(refseq_map, on="sequence", how="left")
return df.drop(columns=["aa_sequence"], errors="ignore").merge(seq_map, on="sequence", how="left")


def compute_fasta_lengths(fasta_paths: pd.Series) -> pd.Series:
Expand Down