Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
31bac06
fix: enhance download_biomart_metadata function with retry logic and …
St3451 Feb 16, 2026
b268d2a
feat: improve get_ref_dna_from_ensembl_mp function with error handlin…
St3451 Feb 16, 2026
86796c0
refactor: streamline get_ref_dna_from_ensembl function by removing un…
St3451 Feb 16, 2026
fea7d2f
logs: enhance get_ref_dna_from_ensembl function with improved error h…
St3451 Feb 16, 2026
a402cde
docs: update organism specification in README to include full scienti…
St3451 Feb 16, 2026
d961730
refactor: simplify multiprocessing in get_ref_dna_from_ensembl_mp fun…
St3451 Feb 16, 2026
8c2ce07
fix: enhance download_biomart_metadata function with improved error h…
St3451 Feb 16, 2026
dbf0cd2
logs: improve error handling and logging in download_biomart_metadata…
St3451 Feb 16, 2026
519d92c
fix: improve error handling by removing partial BioMart metadata file…
St3451 Feb 16, 2026
2db7336
feat: add batch retrieval for Ensembl CDS DNA sequences and improve r…
St3451 Feb 17, 2026
3753b2c
feat: handle rate limiting in get_ref_dna_from_ensembl_batch function
St3451 Feb 17, 2026
826c239
increase max_attempts in download_biomart_metadata to 5
St3451 Feb 17, 2026
2cf9865
fix: add logging import to seq_for_mut_prob.py
St3451 Feb 17, 2026
2c006e3
remove outdated download_biomart_metadata function
St3451 Feb 17, 2026
08eb1bb
fix: handle empty dataframes in process_seq_df and process_seq_df_mane
St3451 Feb 17, 2026
40b3432
logs: enhance logging for BioMart download failures and improve error…
St3451 Feb 17, 2026
62e8df5
logs: add warning log for exceeding max attempts in Ensembl CDS batch…
St3451 Feb 17, 2026
a29831d
fix: add SSL verification for download_single_file in download_biomar…
St3451 Feb 17, 2026
5c2a26f
logs: update headers for Ensembl REST API and add wget option to disa…
St3451 Feb 17, 2026
4276b47
feat: add SSL option to download_single_file for secure downloads
St3451 Feb 17, 2026
b1f5722
refactor: remove unused datasets_dir parameter from process_seq_df fu…
St3451 Feb 17, 2026
6c7d4a4
refactor: update process_seq_df_mane to only download biomart metadat…
St3451 Feb 17, 2026
2029a9b
logs: add debug logging for BioMart download attempts in download_bio…
St3451 Feb 17, 2026
c01dd5c
feat: initialize Tri_context with NaN and apply transformation only t…
St3451 Feb 17, 2026
cd65b60
refactor: return NaN instead of empty string for DNA sequences in Ens…
St3451 Feb 17, 2026
5839863
feat: add support for custom PAE directory and update README
St3451 Feb 17, 2026
b6f60cb
docs: update README and main.py to clarify AlphaFold DB versioning fo…
St3451 Feb 17, 2026
2812f00
docs: correct typos and improve clarity in README.md
St3451 Feb 17, 2026
dfc5243
feat: enhance dataset building process with improved logging and stru…
St3451 Feb 17, 2026
d79d679
fix: enable directory cleaning in dataset build process
St3451 Feb 17, 2026
26fc3d5
fix: improve logging for custom PAE directory handling in dataset bui…
St3451 Feb 17, 2026
6456014
lint: update main execution to enforce CLI usage for dataset building
St3451 Feb 17, 2026
5194584
fix: pass af_version to merge_af_fragments for improved dataset merging
St3451 Feb 18, 2026
4e447dd
logs: enhance logging for duplicate gene removal and Ensembl CDS fail…
St3451 Feb 18, 2026
cfc4e6b
fix: update custom MANE PDB directory option to require --mane_only f…
St3451 Feb 18, 2026
0d9412e
lint: simplify debug logging
St3451 Feb 18, 2026
44a8644
refactor: enforce CLI usage for seq_for_mut_prob module execution
St3451 Feb 18, 2026
19f3c0c
refactor: increase probe size and consecutive missing threshold for P…
St3451 Feb 18, 2026
94f8f2e
fix: remove existing PAE output directory before copying custom PAE d…
St3451 Feb 18, 2026
91e22f4
docs: update process_seq_df docstring to include canonical transcript…
St3451 Feb 18, 2026
eaa0b9e
feat: add function to load custom gene symbol mappings from samplesheet
St3451 Feb 18, 2026
f7853af
refactor: update samplesheet tool build_metadata_map() to accept a p…
St3451 Feb 18, 2026
d8ea655
logs: enhance PDB copying process with detailed logging and summary o…
St3451 Feb 18, 2026
3c3f055
fix: ensure REPO_ROOT is added to sys.path for module imports in prep…
St3451 Feb 18, 2026
50498ac
limit number of connections in download_single_file to a maximum of 10
St3451 Feb 19, 2026
0ed4e61
reduce maximum number of connections in download_single_file from 10 …
St3451 Feb 19, 2026
7967d1b
feat: add retry logic for missing entries; increase max attempts and…
St3451 Feb 19, 2026
3a348b5
cap Ensembl CDS batch workers to a maximum number of cores
St3451 Feb 19, 2026
2c5c109
feat: implement bounded parallelism for retrying missing Ensembl CDS …
St3451 Feb 19, 2026
ea63130
fix: handle consecutive missing PAE downloads correctly
St3451 Feb 19, 2026
cdf9f6a
logs: enhance logging for Ensembl CDS retrieval with sequence count
St3451 Feb 19, 2026
2209b0e
fix: prevent duplicate SEQRES records in PDB files and log skipped in…
St3451 Feb 19, 2026
d7bf403
fix: handle ENSP IDs in get_exons_coord function and return NaN for m…
St3451 Feb 19, 2026
6eeadf3
update symbol assignment to use pd.NA for missing values in build_sym…
St3451 Feb 19, 2026
9170592
fix: update add_seqres_to_pdb to return bool and skip insertion if SE…
St3451 Feb 19, 2026
3d8d4e9
fix: reduce batch size for Backtranseq API calls to improve performan…
St3451 Feb 19, 2026
44255a5
logs: enhance error handling and logging in backtranseq function for …
St3451 Feb 19, 2026
b836ccf
feat: enhance backtranseq function with retry logic and timeout handl…
St3451 Feb 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 18 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Additionally, you may need to install additional development tools. Depending on
- If you have sudo privileges:

```bash
sudo apt install built-essential
sudo apt install build-essential
```

- For HPC cluster environment, it is recommended to use [Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) (or [Mamba](https://mamba.readthedocs.io/en/latest/)):
Expand Down Expand Up @@ -57,19 +57,22 @@ Additionally, you may need to install additional development tools. Depending on

## Building Datasets

This step build the datasets necessary for Oncodrive3D to run the 3D clustering analysis. It is required once after installation or whenever you need to generate datasets for a different organism or apply a specific threshold to define amino acid contacts.
This step builds the datasets necessary for Oncodrive3D to run the 3D clustering analysis. It is required once after installation or whenever you need to generate datasets for a different organism or apply a specific threshold to define amino acid contacts.

> [!WARNING]
> This step is highly time- and resource-intensive, requiring a significant amount of free disk space and computational power. It will download and process a large amount of data. Ensure sufficient resources are available before proceeding, as insufficient capacity may result in extended runtimes or processing failures.
>
> Reliable internet access is required because AlphaFold structures, Ensembl annotations, Pfam files, and other resources are downloaded on demand during the build.

> [!NOTE]
> The first time that you run Oncodrive3D building dataset step with a given reference genome, it will download it from our servers. By default the downloaded datasets go to`~/.bgdata`. If you want to move these datasets to another folder you have to define the system environment variable `BGDATA_LOCAL` with an export command.
> The first time that you run Oncodrive3D building dataset step with a given reference genome, it will download it from our servers. By default the downloaded datasets go to `~/.bgdata`. If you want to move these datasets to another folder you have to define the system environment variable `BGDATA_LOCAL` with an export command.

> [!NOTE]
> Human datasets built with the default settings pull canonical transcript metadata from the January 2024 Ensembl archive (release 111 / GENCODE v45). For maximum compatibility, annotate your input variants with the same Ensembl/Gencode release or supply the unfiltered VEP output together with `--o3d_transcripts --use_input_symbols`.

> [!NOTE] Predicted Aligned Error (PAE) files for older AlphaFold DB versions (e.g., v4) are no longer hosted after 2025. If you need PAE for an older AF version, download and supply them locally via `--custom_pae_dir`.
> MANE structures are only available from the AlphaFold DB v4 release. Non‑MANE builds default to v6; MANE mode forces v4 for structures, so you should provide PAE files via `--custom_pae_dir`.

```
Usage: oncodrive3d build-datasets [OPTIONS]

Expand All @@ -83,13 +86,16 @@ Examples:
Options:
-o, --output_dir PATH Path to the directory where the output files will be saved.
Default: ./datasets/
-s, --organism PATH Specifies the organism (`human` or `mouse`).
Default: human
-s, --organism TEXT Specifies the organism (`human` or `mouse`; also accepts `Homo sapiens` / `Mus musculus`).
Default: Homo sapiens
-m, --mane Use structures predicted from MANE Select transcripts
(applicable to Homo sapiens only).
-M, --mane_only Use only structures predicted from MANE Select transcripts
(applicable to Homo sapiens only).
-C, --custom_mane_pdb_dir PATH Path to directory containing custom MANE PDB structures.
-C, --custom_mane_pdb_dir PATH Path to directory containing custom MANE PDB structures (requires --mane_only).
Default: None
--custom_pae_dir PATH Path to directory containing pre-downloaded PAE JSON files.
The directory will be copied into the build as `pae/`.
Default: None
-f, --custom_mane_metadata_path Path to a dataframe (typically a samplesheet.csv) including
Ensembl IDs and sequences of the custom pdbs.
Expand All @@ -98,8 +104,8 @@ Options:
Default: 10
-c, --cores INT Number of CPU cores for computation.
Default: All available CPU cores
--af_version INT Version of the AlphaFold Protein Structure Database release.
Default: 4
--af_version INT AlphaFold DB version for non-MANE builds (MANE uses v4).
Default: 6
-y, --yes Run without interactive prompts.
-v, --verbose Enables verbose output.
-h, --help Show this message and exit.
Expand All @@ -112,7 +118,7 @@ For more information on the output of this step, please refer to the [Building D
> To maximize structural coverage of **MANE Select transcripts**, you can [predict missing structures locally and integrate them into Oncodrive3D](tools/preprocessing/README.md) using:
>
> - `tools/preprocessing/prepare_samplesheet.py`: a standalone utility that:
> - Retrieve the full MANE entries from NCBI.
> - Retrieves the full MANE entries from NCBI.
> - Identifies proteins missing from the AlphaFold MANE dataset.
> - Generates:
> - A `samplesheet.csv` with Ensembl protein IDs, FASTA paths, and optional sequences.
Expand All @@ -133,7 +139,7 @@ For more information on the output of this step, please refer to the [Building D

## Running 3D clustering Analysis

For in depth information on how to obtain the required input data and for comprehensive information about the output, please refer to the [Input and Output Documentation](https://github.com/bbglab/oncodrive3d/tree/master/docs/run_input_output.md) of the 3D clustering analysis.
For in-depth information on how to obtain the required input data and for comprehensive information about the output, please refer to the [Input and Output Documentation](https://github.com/bbglab/oncodrive3d/tree/master/docs/run_input_output.md) of the 3D clustering analysis.

### Input

Expand Down Expand Up @@ -256,8 +262,6 @@ For more information, refer to the [Oncodrive3D Pipeline](https://github.com/bbg

### Usage

---

> [!WARNING]
> When using the Nextflow script, ensure that your input files are organized in the following directory structure (you only need either the `maf/` or `vep/` directory):
>
Expand Down Expand Up @@ -302,10 +306,10 @@ Options:
--vep_input BOOL Use `vep/` subdir as input and select transcripts matching
the Ensembl transcript IDs in Oncodrive3D built datasets.
Default: false
--mane BOOL Prioritize structures corresponding to MANE transcrips if
--mane BOOL Prioritize structures corresponding to MANE transcripts if
multiple structures are associated to the same gene.
Default: false
--seed INT: Seed value for reproducibility.
--seed INT Seed value for reproducibility.
Default: 128
```

Expand Down
21 changes: 19 additions & 2 deletions scripts/datasets/af_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,12 +224,17 @@ def get_pdb_seqres_records(lst_res):
def add_refseq_record_to_pdb(path_structure):
"""
Add the SEQREF records to the pdb file.
Copy link

Copilot AI Feb 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring says “Add the SEQREF records”, but the code is inserting SEQRES records. Update the docstring to match the PDB record type to avoid confusion.

Suggested change
Add the SEQREF records to the pdb file.
Add the SEQRES records to the pdb file.

Copilot uses AI. Check for mistakes.
Returns True if SEQRES was inserted, False if skipped because SEQRES already exists.
"""

# Open the PDB file and get SEQRES insert index
with open(path_structure, 'r') as file:
pdb_lines = file.readlines()
insert_index = next(i for i, line in enumerate(pdb_lines) if line.startswith('MODEL'))

if any(line.startswith('SEQRES') for line in pdb_lines):
return False

insert_index = next(i for i, line in enumerate(pdb_lines) if line.startswith('MODEL'))

# Get seqres records
residues = get_res_from_chain(path_structure)
Expand All @@ -243,6 +248,8 @@ def add_refseq_record_to_pdb(path_structure):
output_file.truncate()
output_file.writelines(pdb_lines)

return True


# Other functions

Expand Down Expand Up @@ -306,6 +313,8 @@ def merge_af_fragments(input_dir, output_dir=None, af_version=4, gzip=False):
else:
# Get list of fragmented Uniprot ID and max AF-F
not_processed = []
refseq_added = 0
refseq_skipped_existing = 0
for uni_id, max_f in tqdm(fragments, total=len(fragments), desc="Merging AF fragments"):

processed = False
Expand All @@ -329,7 +338,10 @@ def merge_af_fragments(input_dir, output_dir=None, af_version=4, gzip=False):
tmp_name = os.path.join(output_dir, f"AF-{uni_id}-FM-model_v{af_version}.pdb")
name = os.path.join(output_dir, f"AF-{uni_id}-F{max_f}M-model_v{af_version}.pdb")
os.rename(tmp_name, name)
add_refseq_record_to_pdb(name)
if add_refseq_record_to_pdb(name):
refseq_added += 1
else:
refseq_skipped_existing += 1

if len(not_processed) > 0:
logger.warning(f"Not processed: {not_processed}")
Expand All @@ -338,6 +350,11 @@ def merge_af_fragments(input_dir, output_dir=None, af_version=4, gzip=False):

save_unprocessed_ids(not_processed,
os.path.join(output_dir, "fragmented_pdbs", "ids_not_merged.txt"))
if refseq_skipped_existing:
logger.info(
"Skipped SEQRES insertion for %s merged structures (SEQRES already present).",
refseq_skipped_existing,
)
logger.info("Merge of structures completed!")

else:
Expand Down
71 changes: 49 additions & 22 deletions scripts/datasets/build_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@


import os
import shutil
import daiquiri

from scripts import __logger_name__
Expand All @@ -43,6 +44,7 @@ def build(output_datasets,
mane,
mane_only,
custom_pdb_dir,
custom_pae_dir,
custom_mane_metadata_path,
distance_threshold,
num_cores,
Expand All @@ -57,6 +59,13 @@ def build(output_datasets,

# Download PDB structures
species = get_species(organism)
if mane and str(af_version) != "4":
logger.warning(
"MANE structures are only available in AlphaFold DB v4. "
"Ignoring --af_version=%s and using v4 for this build.",
af_version,
)
af_version = 4
if not mane_only:
logger.info("Downloading AlphaFold (AF) predicted structures...")
get_structures(
Expand All @@ -69,7 +78,11 @@ def build(output_datasets,

# Merge fragmented structures
logger.info("Merging fragmented structures...")
merge_af_fragments(input_dir=os.path.join(output_datasets,"pdb_structures"), gzip=True)
merge_af_fragments(
input_dir=os.path.join(output_datasets,"pdb_structures"),
af_version=af_version,
gzip=True
)

# Download PDB MANE structures
if species == "Homo sapiens" and mane:
Expand All @@ -78,13 +91,21 @@ def build(output_datasets,
path=os.path.join(output_datasets,"pdb_structures_mane"),
species=species,
mane=True,
af_version=str(af_version),
threads=num_cores
)
mv_mane_pdb(output_datasets, "pdb_structures", "pdb_structures_mane")
logger.info("Download of MANE structures completed!")

# Copy custom PDB structures and optionally add SEQRES
if custom_pdb_dir is not None:
if not mane_only:
logger.error(
"custom_pdb_dir requires --mane_only. Use --mane_only when providing custom MANE structures."
)
raise ValueError(
"custom_pdb_dir requires --mane_only"
)
if custom_mane_metadata_path is None:
logger.error(
"custom_mane_metadata_path must be provided when custom_pdb_dir is specified"
Expand Down Expand Up @@ -112,6 +133,7 @@ def build(output_datasets,
output_seq_df=os.path.join(output_datasets, "seq_for_mut_prob.tsv"),
organism=species,
mane=mane,
mane_only=mane_only,
num_cores=num_cores,
mane_version=mane_version,
custom_mane_metadata_path=custom_mane_metadata_path
Expand All @@ -127,18 +149,32 @@ def build(output_datasets,
)

# Get PAE
logger.info("Downloading AF predicted aligned error (PAE)...")
get_pae(
input_dir=os.path.join(output_datasets,"pdb_structures"),
output_dir=os.path.join(output_datasets,"pae"),
num_cores=num_cores,
af_version=str(af_version),
custom_pdb_dir=custom_pdb_dir
)
pae_output_dir = os.path.join(output_datasets, "pae")
if custom_pae_dir is not None:
logger.info("Copying precomputed PAE directory...")
if os.path.exists(custom_pae_dir):
if os.path.exists(pae_output_dir):
shutil.rmtree(pae_output_dir)
shutil.copytree(custom_pae_dir, pae_output_dir)
else:
logger.warning(
"Custom PAE directory does not exist: %s. Skipping copy. "
"Contact maps will be computed without PAE (binary maps).",
custom_pae_dir,
)
else:
logger.info("Downloading AF predicted aligned error (PAE)...")
get_pae(
input_dir=os.path.join(output_datasets,"pdb_structures"),
output_dir=pae_output_dir,
num_cores=num_cores,
af_version=str(af_version),
custom_pdb_dir=custom_pdb_dir
)

# Parse PAE
logger.info("Parsing PAE...")
parse_pae(input=os.path.join(output_datasets, 'pae'))
parse_pae(input=pae_output_dir)
logger.info("Parsing PAE completed!")

# Get pCAMPs
Expand All @@ -159,15 +195,6 @@ def build(output_datasets,
logger.info("Datasets have been successfully built and are ready for analysis!")

if __name__ == "__main__":
build(
output_datasets="/data/bbg/nobackup/scratch/oncodrive3d/mane_missing/oncodrive3d/datasets/datasets-mane_only-mane_custom-250729",
organism="Homo sapiens",
mane=False,
mane_only=True,
custom_pdb_dir="/data/bbg/nobackup/scratch/oncodrive3d/mane_missing/data/250724-no_fragments/all_pdbs-pred_and_retrieved/pdbs",
custom_mane_metadata_path="/data/bbg/nobackup/scratch/oncodrive3d/mane_missing/data/250724-no_fragments/all_pdbs-pred_and_retrieved/samplesheet.csv",
distance_threshold=10,
num_cores=8,
af_version=4,
mane_version=1.4
)
raise SystemExit(
"This module is intended to be used via the CLI: `oncodrive3d build-datasets`."
)
38 changes: 34 additions & 4 deletions scripts/datasets/custom_pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def get_pdb_seqres_records(lst_res):
return records


def add_seqres_to_pdb(path_pdb: str, residues: list) -> None:
def add_seqres_to_pdb(path_pdb: str, residues: list) -> bool:
"""
Insert SEQRES records at the very top of a PDB file (supports gzipped and plain).

Expand All @@ -56,6 +56,9 @@ def add_seqres_to_pdb(path_pdb: str, residues: list) -> None:
with open_in(path_pdb, mode_in) as fh:
lines = fh.readlines()

if any(line.startswith("SEQRES") for line in lines):
return False

# Generate SEQRES lines
seqres = get_pdb_seqres_records(residues)

Expand All @@ -65,6 +68,8 @@ def add_seqres_to_pdb(path_pdb: str, residues: list) -> None:
# Write back
with open_out(path_pdb, mode_out) as fh:
fh.writelines(new_lines)

return True


def copy_and_parse_custom_pdbs(
Expand Down Expand Up @@ -99,13 +104,20 @@ def copy_and_parse_custom_pdbs(
samplesheet_df = None

# Copy and gzip pdb and optionally add REFSEQ
total_pdb_files = 0
copied = 0
skipped_format = 0
seqres_inserted = 0
seqres_skipped_existing = 0
for fname in os.listdir(src_dir):
if not fname.endswith('.pdb'):
continue
total_pdb_files += 1

parts = fname.split('.') # e.g. [ACCESSION, fragment_code, 'alphafold', 'pdb']
if len(parts) < 4:
logger.warning(f"Skipping unexpected filename format: {fname}")
skipped_format += 1
continue

accession = parts[0]
Expand All @@ -119,6 +131,7 @@ def copy_and_parse_custom_pdbs(
with open(src_path, 'rb') as fin, gzip.open(dst_path, 'wb') as fout:
shutil.copyfileobj(fin, fout)

copied += 1
logger.debug(f'Copied and gzipped: {fname} -> {new_name}')

# Optionally add SEQRES records
Expand All @@ -130,8 +143,11 @@ def copy_and_parse_custom_pdbs(

if not pd.isna(seq):
seq = [one_to_three_res_map[aa] for aa in seq]
add_seqres_to_pdb(path_pdb=dst_path, residues=seq)
logger.debug(f"Inserted SEQRES records into: {new_name}")
if add_seqres_to_pdb(path_pdb=dst_path, residues=seq):
logger.debug(f"Inserted SEQRES records into: {new_name}")
seqres_inserted += 1
else:
seqres_skipped_existing += 1
else:
try:
seq = "".join(list(get_seq_from_pdb(dst_path)))
Expand All @@ -141,4 +157,18 @@ def copy_and_parse_custom_pdbs(
logger.warning(f"SEQRES not found in samplesheet and its extraction from structure failed: {new_name}")
except Exception as e:
logger.warning(f"SEQRES not found in samplesheet and its extraction from structure failed: {new_name}")
logger.warning(f"Exception captured: {e}")
logger.warning(f"Exception captured: {e}")

logger.info(
"Custom PDB copy summary: %s/%s structures copied (skipped %s invalid filenames).",
copied,
total_pdb_files,
skipped_format,
)
if seqres_inserted:
logger.debug("Inserted SEQRES records into %s custom structures.", seqres_inserted)
if seqres_skipped_existing:
logger.info(
"Skipped SEQRES insertion for %s custom structures (SEQRES already present).",
seqres_skipped_existing,
)
Loading