From 31bac066fccd6d4a80ed2bd6994b45a69b8cc8b0 Mon Sep 17 00:00:00 2001 From: St3451 Date: Mon, 16 Feb 2026 12:45:01 +0100 Subject: [PATCH 01/60] fix: enhance download_biomart_metadata function with retry logic and fallback URL --- scripts/datasets/seq_for_mut_prob.py | 83 +++++++++++++++++++++++++--- 1 file changed, 76 insertions(+), 7 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 535fb33..5856259 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -16,7 +16,7 @@ import multiprocessing import os import re -import shlex +import shutil import subprocess import sys import time @@ -624,17 +624,86 @@ def get_mane_to_af_mapping( # time.sleep(5) -def download_biomart_metadata(path_to_file): +def download_biomart_metadata(path_to_file, max_attempts=2, wait_seconds=10): """ Query biomart to get the list of transcript corresponding to the downloaded structures (a few structures are missing) and other information. """ - command = f""" - wget -O {path_to_file} 'http://jan2024.archive.ensembl.org/biomart/martservice?query=' - """ + base_archive = "http://jan2024.archive.ensembl.org" + base_latest = "https://www.ensembl.org" + query = ( + '/biomart/martservice?query=' + '' + '' + '' + '' + '' + '' + ) + url = f"{base_archive}{query}" + fallback_url = f"{base_latest}{query}" + + if shutil.which("wget") is None: + logger.warning("wget not found; falling back to Python downloader for BioMart metadata.") + for attempt in range(1, max_attempts + 1): + try: + download_single_file(url, path_to_file, threads=4) + return + except Exception: + logger.warning( + "BioMart download failed (attempt %s/%s). Retrying in %ss...", + attempt, + max_attempts, + wait_seconds, + ) + time.sleep(wait_seconds) + logger.warning("Falling back to latest Ensembl BioMart URL after failure on %s.", base_archive) + download_single_file(fallback_url, path_to_file, threads=4) + return + + command = [ + "wget", + "--continue", + "--read-timeout=120", + "--timeout=120", + "--tries=1", + "-O", + path_to_file, + url, + ] + + for attempt in range(1, max_attempts + 1): + result = subprocess.run(command) + if result.returncode == 0: + return + logger.warning( + "BioMart download failed (attempt %s/%s). Retrying in %ss...", + attempt, + max_attempts, + wait_seconds, + ) + time.sleep(wait_seconds) + + logger.warning("Falling back to latest Ensembl BioMart URL after failure on %s.", base_archive) + command[-1] = fallback_url + for attempt in range(1, max_attempts + 1): + result = subprocess.run(command) + if result.returncode == 0: + return + logger.warning( + "Fallback BioMart download failed (attempt %s/%s). Retrying in %ss...", + attempt, + max_attempts, + wait_seconds, + ) + time.sleep(wait_seconds) - subprocess.run(shlex.split(command)) + raise RuntimeError( + f"Failed to download BioMart metadata after {max_attempts} attempts on archive and " + f"{max_attempts} attempts on latest." 
+ ) def get_biomart_metadata(datasets_dir, uniprot_ids): @@ -1028,4 +1097,4 @@ def get_seq_df(datasets_dir, num_cores=8, mane_version=1.4, custom_mane_metadata_path="/data/bbg/nobackup/scratch/oncodrive3d/mane_missing/data/250724-no_fragments/af_predictions/previously_pred/samplesheet.csv" - ) \ No newline at end of file + ) From b268d2a042f7acc13d6ae4f7d294da0d21f88648 Mon Sep 17 00:00:00 2001 From: St3451 Date: Mon, 16 Feb 2026 16:15:49 +0100 Subject: [PATCH 02/60] feat: improve get_ref_dna_from_ensembl_mp function with error handling and optimized processing --- scripts/datasets/seq_for_mut_prob.py | 30 ++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 5856259..50f2733 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -807,12 +807,34 @@ def get_ref_dna_from_ensembl_mp(seq_df, cores): https://rest.ensembl.org/documentation/info/sequence_id """ - pool = multiprocessing.Pool(processes=cores) seq_df = seq_df.copy() - seq_df["Seq_dna"] = pool.map(get_ref_dna_from_ensembl_wrapper, seq_df.Ens_Transcr_ID) - pool.close() - pool.join() + transcript_ids = seq_df.Ens_Transcr_ID.tolist() + total = len(transcript_ids) + if total == 0: + seq_df["Seq_dna"] = [] + return seq_df + + logger.debug("Retrieving CDS DNA from Ensembl for %s transcripts with %s cores.", total, cores) + + if cores <= 1: + seq_df["Seq_dna"] = [ + get_ref_dna_from_ensembl_wrapper(tid) + for tid in tqdm(transcript_ids, total=total, desc="Ensembl CDS") + ] + logger.debug("Completed Ensembl CDS retrieval.") + return seq_df + pool = multiprocessing.Pool(processes=cores) + try: + results_iter = pool.imap(get_ref_dna_from_ensembl_wrapper, transcript_ids) + seq_df["Seq_dna"] = [ + seq for seq in tqdm(results_iter, total=total, desc="Ensembl CDS") + ] + finally: + pool.close() + pool.join() + + logger.debug("Completed Ensembl CDS retrieval.") return seq_df From 86796c0d3dfc6a4daa18bfd6e5451065c1ff57c2 Mon Sep 17 00:00:00 2001 From: St3451 Date: Mon, 16 Feb 2026 16:18:46 +0100 Subject: [PATCH 03/60] refactor: streamline get_ref_dna_from_ensembl function by removing unnecessary wrapper --- scripts/datasets/seq_for_mut_prob.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 50f2733..ec4aae2 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -790,15 +790,6 @@ def get_ref_dna_from_ensembl(transcript_id): return seq_dna[:len(seq_dna)-3] -def get_ref_dna_from_ensembl_wrapper(ensembl_id): - """ - Wrapper for multiple processing function using - Ensembl Get sequence rest API. 
- """ - - return get_ref_dna_from_ensembl(ensembl_id) - - def get_ref_dna_from_ensembl_mp(seq_df, cores): """ Multiple processing function to use Ensembl GET sequence @@ -818,7 +809,7 @@ def get_ref_dna_from_ensembl_mp(seq_df, cores): if cores <= 1: seq_df["Seq_dna"] = [ - get_ref_dna_from_ensembl_wrapper(tid) + get_ref_dna_from_ensembl(tid) for tid in tqdm(transcript_ids, total=total, desc="Ensembl CDS") ] logger.debug("Completed Ensembl CDS retrieval.") @@ -826,7 +817,7 @@ def get_ref_dna_from_ensembl_mp(seq_df, cores): pool = multiprocessing.Pool(processes=cores) try: - results_iter = pool.imap(get_ref_dna_from_ensembl_wrapper, transcript_ids) + results_iter = pool.imap(get_ref_dna_from_ensembl, transcript_ids) seq_df["Seq_dna"] = [ seq for seq in tqdm(results_iter, total=total, desc="Ensembl CDS") ] From fea7d2f9bc03c59540b4c292788a8e185c52ee53 Mon Sep 17 00:00:00 2001 From: St3451 Date: Mon, 16 Feb 2026 17:35:20 +0100 Subject: [PATCH 04/60] logs: enhance get_ref_dna_from_ensembl function with improved error handling and logging --- scripts/datasets/seq_for_mut_prob.py | 56 +++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index ec4aae2..bcf6880 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -756,11 +756,22 @@ def get_ref_dna_from_ensembl(transcript_id): https://rest.ensembl.org/documentation/info/sequence_id """ + pid = os.getpid() + start_time = time.perf_counter() + failures = 0 + + if pd.isna(transcript_id): + logger.debug("Ensembl CDS start: (pid=%s) -> skipping", pid) + return np.nan + + transcript_id = str(transcript_id) + logger.debug("Ensembl CDS start: %s (pid=%s)", transcript_id, pid) + server = "https://rest.ensembl.org" ext = f"/sequence/id/{transcript_id}?type=cds" status = "INIT" - i = 0 + last_error = None while status != "FINISHED": try: @@ -774,19 +785,52 @@ def get_ref_dna_from_ensembl(transcript_id): status = "FINISHED" except requests.exceptions.RequestException as e: - i += 1 + failures += 1 + last_error = e status = "ERROR" - if i%10 == 0: - logger.debug(f"Failed to retrieve sequence for {transcript_id} {e}: Retrying..") + if failures % 10 == 0: + logger.debug( + "Failed to retrieve sequence for %s (pid=%s, failures=%s) %s: Retrying..", + transcript_id, + pid, + failures, + e, + ) time.sleep(5) - if i == 100: - logger.debug(f"Failed to retrieve sequence for {transcript_id} {e}: Skipping..") + if failures == 100: + elapsed = time.perf_counter() - start_time + logger.warning( + "Ensembl CDS failed: %s (pid=%s, elapsed=%.2fs, failures=%s). 
Last error: %s", + transcript_id, + pid, + elapsed, + failures, + last_error, + ) return np.nan time.sleep(1) seq_dna = "".join(r.text.strip().split("\n")[1:]) + elapsed = time.perf_counter() - start_time + if failures > 0: + logger.info( + "Ensembl CDS completed: %s (pid=%s, elapsed=%.2fs, failures=%s)", + transcript_id, + pid, + elapsed, + failures, + ) + else: + logger.debug( + "Ensembl CDS completed: %s (pid=%s, elapsed=%.2fs, failures=%s)", + transcript_id, + pid, + elapsed, + failures, + ) + return seq_dna[:len(seq_dna)-3] From a402cdebe6c621620dcc7e9a5482cf29800b915a Mon Sep 17 00:00:00 2001 From: St3451 Date: Mon, 16 Feb 2026 17:35:35 +0100 Subject: [PATCH 05/60] docs: update organism specification in README to include full scientific names --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d2db3df..b23f5de 100644 --- a/README.md +++ b/README.md @@ -83,8 +83,8 @@ Examples: Options: -o, --output_dir PATH Path to the directory where the output files will be saved. Default: ./datasets/ - -s, --organism PATH Specifies the organism (`human` or `mouse`). - Default: human + -s, --organism TEXT Specifies the organism (`human` or `mouse`; also accepts `Homo sapiens` / `Mus musculus`). + Default: Homo sapiens -m, --mane Use structures predicted from MANE Select transcripts (applicable to Homo sapiens only). -M, --mane_only Use only structures predicted from MANE Select transcripts From d9617305605b132c2f0c2fdfcf1a8d2f10edb73f Mon Sep 17 00:00:00 2001 From: St3451 Date: Mon, 16 Feb 2026 17:46:26 +0100 Subject: [PATCH 06/60] refactor: simplify multiprocessing in get_ref_dna_from_ensembl_mp function --- scripts/datasets/seq_for_mut_prob.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index bcf6880..c87a1cd 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -859,15 +859,9 @@ def get_ref_dna_from_ensembl_mp(seq_df, cores): logger.debug("Completed Ensembl CDS retrieval.") return seq_df - pool = multiprocessing.Pool(processes=cores) - try: + with multiprocessing.Pool(processes=cores) as pool: results_iter = pool.imap(get_ref_dna_from_ensembl, transcript_ids) - seq_df["Seq_dna"] = [ - seq for seq in tqdm(results_iter, total=total, desc="Ensembl CDS") - ] - finally: - pool.close() - pool.join() + seq_df["Seq_dna"] = [seq for seq in tqdm(results_iter, total=total, desc="Ensembl CDS")] logger.debug("Completed Ensembl CDS retrieval.") return seq_df From 8c2ce07f0a7dfb6de56dcd4137ab549e470c5f68 Mon Sep 17 00:00:00 2001 From: St3451 Date: Mon, 16 Feb 2026 17:48:48 +0100 Subject: [PATCH 07/60] fix: enhance download_biomart_metadata function with improved error handling and fallback mechanism --- scripts/datasets/seq_for_mut_prob.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index c87a1cd..726d28f 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -647,11 +647,13 @@ def download_biomart_metadata(path_to_file, max_attempts=2, wait_seconds=10): if shutil.which("wget") is None: logger.warning("wget not found; falling back to Python downloader for BioMart metadata.") + last_exc = None for attempt in range(1, max_attempts + 1): try: download_single_file(url, path_to_file, threads=4) return - except Exception: + except Exception as 
exc: + last_exc = exc logger.warning( "BioMart download failed (attempt %s/%s). Retrying in %ss...", attempt, @@ -659,9 +661,26 @@ def download_biomart_metadata(path_to_file, max_attempts=2, wait_seconds=10): wait_seconds, ) time.sleep(wait_seconds) + logger.warning("Falling back to latest Ensembl BioMart URL after failure on %s.", base_archive) - download_single_file(fallback_url, path_to_file, threads=4) - return + for attempt in range(1, max_attempts + 1): + try: + download_single_file(fallback_url, path_to_file, threads=4) + return + except Exception as exc: + last_exc = exc + logger.warning( + "Fallback BioMart download failed (attempt %s/%s). Retrying in %ss...", + attempt, + max_attempts, + wait_seconds, + ) + time.sleep(wait_seconds) + + raise RuntimeError( + f"Failed to download BioMart metadata after {max_attempts} attempts on archive and " + f"{max_attempts} attempts on latest." + ) from last_exc command = [ "wget", From dbf0cd2459b2f0a74f2c09de89c0e79714cb85c6 Mon Sep 17 00:00:00 2001 From: St3451 Date: Mon, 16 Feb 2026 18:07:44 +0100 Subject: [PATCH 08/60] logs: improve error handling and logging in download_biomart_metadata function --- scripts/datasets/seq_for_mut_prob.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 726d28f..0e64aee 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -655,11 +655,13 @@ def download_biomart_metadata(path_to_file, max_attempts=2, wait_seconds=10): except Exception as exc: last_exc = exc logger.warning( - "BioMart download failed (attempt %s/%s). Retrying in %ss...", + "BioMart download failed (attempt %s/%s). Retrying in %ss... Error: %s", attempt, max_attempts, wait_seconds, + exc, ) + logger.debug("BioMart download exception details:", exc_info=True) time.sleep(wait_seconds) logger.warning("Falling back to latest Ensembl BioMart URL after failure on %s.", base_archive) @@ -670,11 +672,13 @@ def download_biomart_metadata(path_to_file, max_attempts=2, wait_seconds=10): except Exception as exc: last_exc = exc logger.warning( - "Fallback BioMart download failed (attempt %s/%s). Retrying in %ss...", + "Fallback BioMart download failed (attempt %s/%s). Retrying in %ss... 
Error: %s", attempt, max_attempts, wait_seconds, + exc, ) + logger.debug("Fallback BioMart download exception details:", exc_info=True) time.sleep(wait_seconds) raise RuntimeError( From 519d92c2808bb9cfa1af96bb5b8a856d54570341 Mon Sep 17 00:00:00 2001 From: St3451 Date: Mon, 16 Feb 2026 18:23:02 +0100 Subject: [PATCH 09/60] fix: improve error handling by removing partial BioMart metadata files before fallback --- scripts/datasets/seq_for_mut_prob.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 0e64aee..4f8f02f 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -665,6 +665,15 @@ def download_biomart_metadata(path_to_file, max_attempts=2, wait_seconds=10): time.sleep(wait_seconds) logger.warning("Falling back to latest Ensembl BioMart URL after failure on %s.", base_archive) + if os.path.exists(path_to_file): + try: + os.remove(path_to_file) + except OSError as exc: + logger.warning( + "Failed to remove partial BioMart metadata file %s before fallback: %s", + path_to_file, + exc, + ) for attempt in range(1, max_attempts + 1): try: download_single_file(fallback_url, path_to_file, threads=4) @@ -710,6 +719,15 @@ def download_biomart_metadata(path_to_file, max_attempts=2, wait_seconds=10): time.sleep(wait_seconds) logger.warning("Falling back to latest Ensembl BioMart URL after failure on %s.", base_archive) + if os.path.exists(path_to_file): + try: + os.remove(path_to_file) + except OSError as exc: + logger.warning( + "Failed to remove partial BioMart metadata file %s before fallback: %s", + path_to_file, + exc, + ) command[-1] = fallback_url for attempt in range(1, max_attempts + 1): result = subprocess.run(command) From 2db73369c62e5e32fb9176540a7beb91827cd86e Mon Sep 17 00:00:00 2001 From: St3451 Date: Tue, 17 Feb 2026 01:37:59 +0100 Subject: [PATCH 10/60] feat: add batch retrieval for Ensembl CDS DNA sequences and improve request handling --- scripts/datasets/seq_for_mut_prob.py | 168 +++++++++++++++++++++++++-- 1 file changed, 158 insertions(+), 10 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 4f8f02f..52ffe4c 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -41,6 +41,11 @@ logger = daiquiri.getLogger(__logger_name__ + ".build.seq_for_mut_prob") +_ENSEMBL_REST_SERVER = "https://rest.ensembl.org" +_ENSEMBL_REST_TIMEOUT = (10, 160) # (connect, read) seconds +_ENSEMBL_REST_HEADERS = {"Content-Type": "text/plain"} + + #=========== # region Initialize #=========== @@ -789,6 +794,134 @@ def get_biomart_metadata(datasets_dir, uniprot_ids): return canonical_transcripts +def get_ref_dna_from_ensembl_batch(transcript_ids, max_attempts=3, wait_seconds=2): + """ + Retrieve Ensembl CDS DNA sequences for up to 50 stable IDs in a single request. + + Ensembl REST docs: POST /sequence/id (max POST size = 50). + """ + + pid = os.getpid() + start_time = time.perf_counter() + + if not transcript_ids: + return [] + + # Keep output aligned with the input order (including any NA values). 
+ results = [np.nan] * len(transcript_ids) + request_ids = [] + request_positions = [] + for pos, tid in enumerate(transcript_ids): + if pd.isna(tid): + continue + tid_str = str(tid) + request_ids.append(tid_str) + request_positions.append(pos) + + if not request_ids: + return results + + if len(request_ids) > 50: + raise ValueError(f"Ensembl POST /sequence/id supports max 50 IDs per request; got {len(request_ids)}.") + + url = f"{_ENSEMBL_REST_SERVER}/sequence/id" + headers = {"Content-Type": "application/json", "Accept": "application/json"} + payload = {"ids": request_ids} + params = {"type": "cds"} + + last_error = None + for attempt in range(1, max_attempts + 1): + try: + r = requests.post(url, headers=headers, json=payload, params=params, timeout=_ENSEMBL_REST_TIMEOUT) + r.raise_for_status() + decoded = r.json() + + seq_by_id = {} + errors_by_id = {} + if isinstance(decoded, dict): + decoded = [decoded] + if not isinstance(decoded, list): + raise ValueError(f"Unexpected Ensembl response type: {type(decoded).__name__}") + + for item in decoded: + if not isinstance(item, dict): + continue + + item_id = item.get("id") + item_query = item.get("query") + item_keys = [k for k in (item_query, item_id) if k] + if not item_keys: + continue + + if "seq" in item and item.get("seq") is not None: + seq_val = item.get("seq") + for key in item_keys: + seq_by_id[key] = seq_val + elif "error" in item: + err_val = item.get("error") + for key in item_keys: + errors_by_id[key] = err_val + + missing = 0 + for pos, tid in zip(request_positions, request_ids): + seq_dna = seq_by_id.get(tid) + if seq_dna is None and "." in tid: + seq_dna = seq_by_id.get(tid.split(".", 1)[0]) + if not seq_dna: + missing += 1 + continue + results[pos] = seq_dna[:-3] if len(seq_dna) >= 3 else "" + + elapsed = time.perf_counter() - start_time + if missing > 0 and logger.isEnabledFor(logging.DEBUG): + example_missing = [tid for tid in request_ids if tid not in seq_by_id][:5] + logger.debug( + "Ensembl CDS batch completed with missing IDs (pid=%s, elapsed=%.2fs, requested=%s, missing=%s). Example missing: %s", + pid, + elapsed, + len(request_ids), + missing, + example_missing, + ) + if errors_by_id: + example_errors = {k: errors_by_id[k] for k in list(errors_by_id)[:3]} + logger.debug("Ensembl CDS batch errors (pid=%s): %s", pid, example_errors) + else: + logger.debug( + "Ensembl CDS batch completed (pid=%s, elapsed=%.2fs, requested=%s)", + pid, + elapsed, + len(request_ids), + ) + + return results + + except (requests.exceptions.RequestException, ValueError, json.JSONDecodeError) as exc: + last_error = exc + if attempt < max_attempts: + logger.debug( + "Ensembl CDS batch request failed (pid=%s, attempt=%s/%s): %s. Retrying in %ss...", + pid, + attempt, + max_attempts, + exc, + wait_seconds, + ) + time.sleep(wait_seconds) + continue + + elapsed = time.perf_counter() - start_time + logger.warning( + "Ensembl CDS batch failed (pid=%s, elapsed=%.2fs, requested=%s). 
Last error: %s", + pid, + elapsed, + len(request_ids), + last_error, + ) + + return results + + def get_ref_dna_from_ensembl(transcript_id): """ Use Ensembl GET sequence rest API to obtain CDS DNA @@ -808,15 +941,14 @@ def get_ref_dna_from_ensembl(transcript_id): transcript_id = str(transcript_id) logger.debug("Ensembl CDS start: %s (pid=%s)", transcript_id, pid) - server = "https://rest.ensembl.org" - ext = f"/sequence/id/{transcript_id}?type=cds" + url = f"{_ENSEMBL_REST_SERVER}/sequence/id/{transcript_id}?type=cds" status = "INIT" last_error = None while status != "FINISHED": try: - r = requests.get(server+ext, headers={ "Content-Type" : "text/x-fasta"}, timeout=160) + r = requests.get(url, headers=_ENSEMBL_REST_HEADERS, timeout=_ENSEMBL_REST_TIMEOUT) if not r.ok: r.raise_for_status() @@ -852,7 +984,11 @@ def get_ref_dna_from_ensembl(transcript_id): time.sleep(1) - seq_dna = "".join(r.text.strip().split("\n")[1:]) + text = r.text.strip() + if text.startswith(">"): + seq_dna = "".join(text.splitlines()[1:]) + else: + seq_dna = "".join(text.splitlines()) elapsed = time.perf_counter() - start_time if failures > 0: @@ -893,16 +1029,28 @@ def get_ref_dna_from_ensembl_mp(seq_df, cores): logger.debug("Retrieving CDS DNA from Ensembl for %s transcripts with %s cores.", total, cores) if cores <= 1: - seq_df["Seq_dna"] = [ - get_ref_dna_from_ensembl(tid) - for tid in tqdm(transcript_ids, total=total, desc="Ensembl CDS") - ] + results = [] + batch_size = 50 + with tqdm(total=total, desc="Ensembl CDS") as pbar: + for i in range(0, total, batch_size): + batch_ids = transcript_ids[i : i + batch_size] + batch_results = get_ref_dna_from_ensembl_batch(batch_ids) + results.extend(batch_results) + pbar.update(len(batch_ids)) + seq_df["Seq_dna"] = results logger.debug("Completed Ensembl CDS retrieval.") return seq_df + batch_size = 50 + batches = [transcript_ids[i : i + batch_size] for i in range(0, total, batch_size)] + results = [] with multiprocessing.Pool(processes=cores) as pool: - results_iter = pool.imap(get_ref_dna_from_ensembl, transcript_ids) - seq_df["Seq_dna"] = [seq for seq in tqdm(results_iter, total=total, desc="Ensembl CDS")] + results_iter = pool.imap(get_ref_dna_from_ensembl_batch, batches) + with tqdm(total=total, desc="Ensembl CDS") as pbar: + for batch_ids, batch_results in zip(batches, results_iter): + results.extend(batch_results) + pbar.update(len(batch_ids)) + seq_df["Seq_dna"] = results logger.debug("Completed Ensembl CDS retrieval.") return seq_df From 3753b2cb113ef51029b3e8e754e3a04746170df0 Mon Sep 17 00:00:00 2001 From: St3451 Date: Tue, 17 Feb 2026 01:42:58 +0100 Subject: [PATCH 11/60] feat: handle rate limiting in get_ref_dna_from_ensembl_batch function --- scripts/datasets/seq_for_mut_prob.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 52ffe4c..ae729f7 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -833,6 +833,22 @@ def get_ref_dna_from_ensembl_batch(transcript_ids, max_attempts=3, wait_seconds= for attempt in range(1, max_attempts + 1): try: r = requests.post(url, headers=headers, json=payload, params=params, timeout=_ENSEMBL_REST_TIMEOUT) + if r.status_code == 429: + retry_after = r.headers.get("Retry-After") + try: + retry_after = float(retry_after) + except (TypeError, ValueError): + retry_after = wait_seconds + logger.warning( + "Ensembl CDS batch rate limited (pid=%s, attempt=%s/%s). 
Retrying after %ss.", + pid, + attempt, + max_attempts, + retry_after, + ) + time.sleep(retry_after) + continue + r.raise_for_status() decoded = r.json() From 826c2398471cc1fabe3909f05766c16f15d7e3a9 Mon Sep 17 00:00:00 2001 From: St3451 Date: Tue, 17 Feb 2026 01:45:45 +0100 Subject: [PATCH 12/60] increase max_attempts in download_biomart_metadata to 5 --- scripts/datasets/seq_for_mut_prob.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index ae729f7..9b83b15 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -629,7 +629,7 @@ def get_mane_to_af_mapping( # time.sleep(5) -def download_biomart_metadata(path_to_file, max_attempts=2, wait_seconds=10): +def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): """ Query biomart to get the list of transcript corresponding to the downloaded structures (a few structures are missing) and other information. From 2cf98652bfe9cdbe2f07571fd6100143fcc0bdfc Mon Sep 17 00:00:00 2001 From: St3451 Date: Tue, 17 Feb 2026 01:55:00 +0100 Subject: [PATCH 13/60] fix: add logging import to seq_for_mut_prob.py --- scripts/datasets/seq_for_mut_prob.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 9b83b15..8f88b45 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -13,6 +13,7 @@ import ast import json +import logging import multiprocessing import os import re From 2c006e38d60e77bc3ffa57ef3b7d7cf07736ac37 Mon Sep 17 00:00:00 2001 From: St3451 Date: Tue, 17 Feb 2026 01:55:45 +0100 Subject: [PATCH 14/60] remove outdated download_biomart_metadata function --- scripts/datasets/seq_for_mut_prob.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 8f88b45..39ac9e7 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -613,23 +613,6 @@ def get_mane_to_af_mapping( return mane_mapping -# def download_biomart_metadata(path_to_file, max_attempts=15, cores=8): -# """ -# Query biomart to get the list of transcript corresponding to the downloaded -# structures (a few structures are missing) and other information. -# """ - -# url = 'http://jan2024.archive.ensembl.org/biomart/martservice?query=' -# attempts = 0 - -# while not os.path.exists(path_to_file): -# download_single_file(url, path_to_file, threads=cores) -# attempts += 1 -# if attempts >= max_attempts: -# raise RuntimeError(f"Failed to download MANE summary file after {max_attempts} attempts. 
Exiting..") -# time.sleep(5) - - def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): """ Query biomart to get the list of transcript corresponding to the downloaded From 08eb1bbf9a4f1b25c5badbf59563050180f96a48 Mon Sep 17 00:00:00 2001 From: St3451 Date: Tue, 17 Feb 2026 11:00:38 +0100 Subject: [PATCH 15/60] fix: handle empty dataframes in process_seq_df and process_seq_df_mane --- scripts/datasets/seq_for_mut_prob.py | 102 +++++++++++++++++---------- 1 file changed, 64 insertions(+), 38 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 39ac9e7..a158830 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -450,6 +450,9 @@ def add_extra_genes_to_seq_df(seq_df, uniprot_to_gene_dict): lst_extra_genes_rows.append(row) lst_added_genes.append(gene) + if not lst_extra_genes_rows: + return seq_df + seq_df_extra_genes = pd.concat(lst_extra_genes_rows, axis=1).T # Remove rows with multiple symbols and drop duplicated ones @@ -1105,6 +1108,10 @@ def process_seq_df(seq_df, -1 : Not available transcripts, seq DNA retrieved from Backtranseq API """ + if seq_df.empty: + logger.error("No sequences to process in process_seq_df; this should not happen.") + raise RuntimeError("Empty sequence dataframe: no structures to process.") + # Process entries in Proteins API (Reference_info 1) #--------------------------------------------------- @@ -1130,13 +1137,14 @@ def process_seq_df(seq_df, # Process entries not in Proteins API (Reference_info -1) #------------------------------------------------------------ - # Add DNA seq from Backtranseq for any other entry - logger.debug(f"Retrieving CDS DNA seq for entries without available transcript ID (Backtranseq API): {len(seq_df_not_uniprot)} structures..") - seq_df_not_uniprot = batch_backtranseq(seq_df_not_uniprot, 500, organism=organism) + if len(seq_df_not_uniprot) > 0: + # Add DNA seq from Backtranseq for any other entry + logger.debug(f"Retrieving CDS DNA seq for entries without available transcript ID (Backtranseq API): {len(seq_df_not_uniprot)} structures..") + seq_df_not_uniprot = batch_backtranseq(seq_df_not_uniprot, 500, organism=organism) - # Get trinucleotide context - seq_df_not_uniprot["Tri_context"] = seq_df_not_uniprot["Seq_dna"].apply( - lambda x: ",".join(per_site_trinucleotide_context(x, no_flanks=True))) + # Get trinucleotide context + seq_df_not_uniprot["Tri_context"] = seq_df_not_uniprot["Seq_dna"].apply( + lambda x: ",".join(per_site_trinucleotide_context(x, no_flanks=True))) # Prepare final output @@ -1181,45 +1189,63 @@ def process_seq_df_mane(seq_df, seq_df_mane = seq_df_mane.drop(columns=["Gene"]).merge(mane_mapping, how="left", on="Uniprot_ID") seq_df_mane["Reference_info"] = 0 - # Add DNA seq from Ensembl for structures with available transcript ID - logger.debug(f"Retrieving CDS DNA seq from transcript ID (Ensembl API): {len(seq_df_mane)} structures..") - seq_df_mane = get_ref_dna_from_ensembl_mp(seq_df_mane, cores=num_cores) - - # Set failed and len-mismatching entries as no-transcripts entries - failed_ix = seq_df_mane.apply(lambda x: True if pd.isna(x.Seq_dna) else len(x.Seq_dna) / 3 != len(x.Seq), axis=1) - if sum(failed_ix) > 0: - seq_df_mane_failed = seq_df_mane[failed_ix] - seq_df_mane = seq_df_mane[~failed_ix] - seq_df_mane_failed = seq_df_mane_failed.drop(columns=[ - "Ens_Gene_ID", - "Ens_Transcr_ID", - "Reverse_strand", - "Chr", - "Refseq_prot", - "Reference_info", - "Seq_dna" - ]) - seq_df_nomane = 
pd.concat((seq_df_nomane, seq_df_mane_failed)) + if seq_df_mane.empty: + logger.warning("No MANE sequences to process; skipping Ensembl CDS retrieval.") + if "Seq_dna" not in seq_df_mane.columns: + seq_df_mane["Seq_dna"] = pd.Series(dtype=object) + else: + # Add DNA seq from Ensembl for structures with available transcript ID + logger.debug(f"Retrieving CDS DNA seq from transcript ID (Ensembl API): {len(seq_df_mane)} structures..") + seq_df_mane = get_ref_dna_from_ensembl_mp(seq_df_mane, cores=num_cores) + + # Set failed and len-mismatching entries as no-transcripts entries + failed_ix = seq_df_mane.apply(lambda x: True if pd.isna(x.Seq_dna) else len(x.Seq_dna) / 3 != len(x.Seq), axis=1) + if sum(failed_ix) > 0: + seq_df_mane_failed = seq_df_mane[failed_ix] + seq_df_mane = seq_df_mane[~failed_ix] + seq_df_mane_failed = seq_df_mane_failed.drop(columns=[ + "Ens_Gene_ID", + "Ens_Transcr_ID", + "Reverse_strand", + "Chr", + "Refseq_prot", + "Reference_info", + "Seq_dna" + ]) + seq_df_nomane = pd.concat((seq_df_nomane, seq_df_mane_failed)) # Seq df not MANE # --------------- - seq_df_nomane = add_extra_genes_to_seq_df(seq_df_nomane, uniprot_to_gene_dict) # Filter out genes with NA - seq_df_nomane = seq_df_nomane[seq_df_nomane.Gene.isin(mane_mapping_not_af.Gene)] # Filter out genes that are not in MANE list - - # Retrieve seq from coordinates - logger.debug(f"Retrieving CDS DNA seq from reference genome (Proteins API): {len(seq_df_nomane['Uniprot_ID'].unique())} structures..") - coord_df = get_exons_coord(seq_df_nomane["Uniprot_ID"].unique(), ens_canonical_transcripts_lst) - seq_df_nomane = seq_df_nomane.merge(coord_df, on=["Seq", "Uniprot_ID"], how="left").reset_index(drop=True) # Discard entries whose Seq obtained by Proteins API don't exactly match the one in structure - seq_df_nomane = add_ref_dna_and_context(seq_df_nomane, hg38) - seq_df_nomane_tr = seq_df_nomane[seq_df_nomane["Reference_info"] == 1] - seq_df_nomane_notr = seq_df_nomane[seq_df_nomane["Reference_info"] == -1] + if seq_df_nomane.empty: + logger.debug("No non-MANE sequences to process; skipping Proteins/Backtranseq retrieval.") + seq_df_nomane_tr = seq_df_nomane.copy() + seq_df_nomane_notr = seq_df_nomane.copy() + else: + seq_df_nomane = add_extra_genes_to_seq_df(seq_df_nomane, uniprot_to_gene_dict) # Filter out genes with NA + seq_df_nomane = seq_df_nomane[seq_df_nomane.Gene.isin(mane_mapping_not_af.Gene)] # Filter out genes that are not in MANE list - # Add DNA seq from Backtranseq for any other entry - logger.debug(f"Retrieving CDS DNA seq for genes without available transcript ID (Backtranseq API): {len(seq_df_nomane_notr)} structures..") - seq_df_nomane_notr = batch_backtranseq(seq_df_nomane_notr, 500, organism="Homo sapiens") + if seq_df_nomane.empty: + logger.debug("No non-MANE sequences after filtering; skipping Proteins/Backtranseq retrieval.") + seq_df_nomane_tr = seq_df_nomane.copy() + seq_df_nomane_notr = seq_df_nomane.copy() + else: + # Retrieve seq from coordinates + logger.debug(f"Retrieving CDS DNA seq from reference genome (Proteins API): {len(seq_df_nomane['Uniprot_ID'].unique())} structures..") + coord_df = get_exons_coord(seq_df_nomane["Uniprot_ID"].unique(), ens_canonical_transcripts_lst) + seq_df_nomane = seq_df_nomane.merge(coord_df, on=["Seq", "Uniprot_ID"], how="left").reset_index(drop=True) # Discard entries whose Seq obtained by Proteins API don't exactly match the one in structure + seq_df_nomane = add_ref_dna_and_context(seq_df_nomane, hg38) + seq_df_nomane_tr = 
seq_df_nomane[seq_df_nomane["Reference_info"] == 1] + seq_df_nomane_notr = seq_df_nomane[seq_df_nomane["Reference_info"] == -1] + + # Add DNA seq from Backtranseq for any other entry + if len(seq_df_nomane_notr) > 0: + logger.debug(f"Retrieving CDS DNA seq for genes without available transcript ID (Backtranseq API): {len(seq_df_nomane_notr)} structures..") + seq_df_nomane_notr = batch_backtranseq(seq_df_nomane_notr, 500, organism="Homo sapiens") # Get trinucleotide context seq_df_not_uniprot = pd.concat((seq_df_mane, seq_df_nomane_notr)) + if "Seq_dna" not in seq_df_not_uniprot.columns: + seq_df_not_uniprot["Seq_dna"] = pd.Series(dtype=object) seq_df_not_uniprot["Tri_context"] = seq_df_not_uniprot["Seq_dna"].apply( lambda x: ",".join(per_site_trinucleotide_context(x, no_flanks=True))) From 40b3432bf17152b7d17722714378f4cefa72485a Mon Sep 17 00:00:00 2001 From: St3451 Date: Tue, 17 Feb 2026 11:51:58 +0100 Subject: [PATCH 16/60] logs: enhance logging for BioMart download failures and improve error handling --- scripts/datasets/seq_for_mut_prob.py | 61 +++++++++++++++++++++------- 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index a158830..b473ab0 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -699,15 +699,28 @@ def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): ] for attempt in range(1, max_attempts + 1): - result = subprocess.run(command) + result = subprocess.run(command, capture_output=True, text=True) if result.returncode == 0: return - logger.warning( - "BioMart download failed (attempt %s/%s). Retrying in %ss...", - attempt, - max_attempts, - wait_seconds, - ) + stderr = (result.stderr or "").strip() + if stderr: + logger.warning( + "BioMart download failed (attempt %s/%s, return code %s). stderr: %s", + attempt, + max_attempts, + result.returncode, + stderr, + ) + else: + logger.warning( + "BioMart download failed (attempt %s/%s, return code %s). Retrying in %ss...", + attempt, + max_attempts, + result.returncode, + wait_seconds, + ) + if logger.isEnabledFor(logging.DEBUG) and result.stdout: + logger.debug("BioMart wget stdout (attempt %s/%s): %s", attempt, max_attempts, result.stdout.strip()) time.sleep(wait_seconds) logger.warning("Falling back to latest Ensembl BioMart URL after failure on %s.", base_archive) @@ -722,15 +735,33 @@ def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): ) command[-1] = fallback_url for attempt in range(1, max_attempts + 1): - result = subprocess.run(command) + result = subprocess.run(command, capture_output=True, text=True) if result.returncode == 0: return - logger.warning( - "Fallback BioMart download failed (attempt %s/%s). Retrying in %ss...", - attempt, - max_attempts, - wait_seconds, - ) + stderr = (result.stderr or "").strip() + if stderr: + logger.warning( + "Fallback BioMart download failed (attempt %s/%s, return code %s). stderr: %s", + attempt, + max_attempts, + result.returncode, + stderr, + ) + else: + logger.warning( + "Fallback BioMart download failed (attempt %s/%s, return code %s). 
Retrying in %ss...", + attempt, + max_attempts, + result.returncode, + wait_seconds, + ) + if logger.isEnabledFor(logging.DEBUG) and result.stdout: + logger.debug( + "Fallback BioMart wget stdout (attempt %s/%s): %s", + attempt, + max_attempts, + result.stdout.strip(), + ) time.sleep(wait_seconds) raise RuntimeError( @@ -1011,7 +1042,7 @@ def get_ref_dna_from_ensembl(transcript_id): failures, ) - return seq_dna[:len(seq_dna)-3] + return seq_dna[:-3] def get_ref_dna_from_ensembl_mp(seq_df, cores): From 62e8df5995f906b3ef56052c5600b5da207ad26f Mon Sep 17 00:00:00 2001 From: St3451 Date: Tue, 17 Feb 2026 11:52:28 +0100 Subject: [PATCH 17/60] logs: add warning log for exceeding max attempts in Ensembl CDS batch requests --- scripts/datasets/seq_for_mut_prob.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index b473ab0..1bdc50b 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -857,6 +857,15 @@ def get_ref_dna_from_ensembl_batch(transcript_ids, max_attempts=3, wait_seconds= retry_after = float(retry_after) except (TypeError, ValueError): retry_after = wait_seconds + if attempt >= max_attempts: + logger.warning( + "Ensembl CDS batch rate limited (pid=%s, attempt=%s/%s). Giving up after %ss.", + pid, + attempt, + max_attempts, + retry_after, + ) + return results logger.warning( "Ensembl CDS batch rate limited (pid=%s, attempt=%s/%s). Retrying after %ss.", pid, From a29831d0c55ee50d5841fbece910b10c454b2606 Mon Sep 17 00:00:00 2001 From: St3451 Date: Tue, 17 Feb 2026 12:18:00 +0100 Subject: [PATCH 18/60] fix: add SSL verification for download_single_file in download_biomart_metadata --- scripts/datasets/seq_for_mut_prob.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 1bdc50b..6e239d7 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -640,9 +640,10 @@ def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): if shutil.which("wget") is None: logger.warning("wget not found; falling back to Python downloader for BioMart metadata.") last_exc = None + ssl_verify_archive = url.startswith("https://") for attempt in range(1, max_attempts + 1): try: - download_single_file(url, path_to_file, threads=4) + download_single_file(url, path_to_file, threads=4, ssl=ssl_verify_archive) return except Exception as exc: last_exc = exc @@ -666,9 +667,10 @@ def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): path_to_file, exc, ) + ssl_verify_fallback = fallback_url.startswith("https://") for attempt in range(1, max_attempts + 1): try: - download_single_file(fallback_url, path_to_file, threads=4) + download_single_file(fallback_url, path_to_file, threads=4, ssl=ssl_verify_fallback) return except Exception as exc: last_exc = exc From 5c2a26fcf23bf2cf3a1155edadd7fe86578fb9cb Mon Sep 17 00:00:00 2001 From: St3451 Date: Tue, 17 Feb 2026 12:21:50 +0100 Subject: [PATCH 19/60] logs: update headers for Ensembl REST API and add wget option to disable HSTS --- scripts/datasets/seq_for_mut_prob.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 6e239d7..2b5edae 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -44,7 +44,7 @@ _ENSEMBL_REST_SERVER = 
"https://rest.ensembl.org" _ENSEMBL_REST_TIMEOUT = (10, 160) # (connect, read) seconds -_ENSEMBL_REST_HEADERS = {"Content-Type": "text/plain"} +_ENSEMBL_REST_HEADERS = {"Accept": "text/plain"} #=========== @@ -691,6 +691,7 @@ def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): command = [ "wget", + "--no-hsts", "--continue", "--read-timeout=120", "--timeout=120", From 4276b47b71897f391335ab562452c550cb719ee1 Mon Sep 17 00:00:00 2001 From: St3451 Date: Tue, 17 Feb 2026 12:22:42 +0100 Subject: [PATCH 20/60] feat: add SSL option to download_single_file for secure downloads --- scripts/datasets/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/datasets/utils.py b/scripts/datasets/utils.py index 70612b2..91447f2 100644 --- a/scripts/datasets/utils.py +++ b/scripts/datasets/utils.py @@ -115,7 +115,7 @@ def calculate_hash(filepath: str, hash_func=hashlib.sha256) -> str: return hash_obj.hexdigest() -def download_single_file(url: str, destination: str, threads: int, proteome=None) -> None: +def download_single_file(url: str, destination: str, threads: int, proteome=None, ssl=False) -> None: """ Downloads a file from a URL and saves it to the specified destination. @@ -138,7 +138,7 @@ def download_single_file(url: str, destination: str, threads: int, proteome=None logger.debug(f'Downloading {url}') logger.debug(f"Downloading to {destination}") - dl = Downloader(timeout=aiohttp.ClientTimeout(sock_read=400), ssl=False) + dl = Downloader(timeout=aiohttp.ClientTimeout(sock_read=400), ssl=ssl) dl.start(url, destination, segments=num_connections, display=True, retries=10, clear_terminal=False) logger.debug('Download complete') @@ -392,4 +392,4 @@ def uniprot_to_hugo(uniprot_ids, hugo_as_keys=False, batch_size=5000): # if hugo_as_keys: # result_dict = convert_dict_hugo_to_uniprot(result_dict) -# return result_dict \ No newline at end of file +# return result_dict From b1f5722e22d96ca3846fe997e773a0f8e4f976d4 Mon Sep 17 00:00:00 2001 From: St3451 Date: Tue, 17 Feb 2026 12:40:17 +0100 Subject: [PATCH 21/60] refactor: remove unused datasets_dir parameter from process_seq_df functions --- scripts/datasets/seq_for_mut_prob.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 2b5edae..b78f429 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -1135,11 +1135,9 @@ def mane_uniprot_to_hugo(uniprot_ids, mane_mapping): def process_seq_df(seq_df, - datasets_dir, organism, uniprot_to_gene_dict, - ens_canonical_transcripts_lst, - num_cores=1): + ens_canonical_transcripts_lst): """ Retrieve DNA sequence and tri-nucleotide context for each structure in the initialized dataframe @@ -1205,14 +1203,11 @@ def process_seq_df(seq_df, def process_seq_df_mane(seq_df, - datasets_dir, uniprot_to_gene_dict, mane_mapping, mane_mapping_not_af, ens_canonical_transcripts_lst, - custom_mane_metadata_path=None, - num_cores=1, - mane_version=1.4): + num_cores=1): """ Retrieve DNA sequence and tri-nucleotide context for each structure in the initialized dataframe @@ -1368,21 +1363,16 @@ def get_seq_df(datasets_dir, if mane: seq_df = process_seq_df_mane(seq_df, - datasets_dir, uniprot_to_gene_dict, mane_mapping, mane_mapping_not_af, ens_canonical_transcripts_lst, - custom_mane_metadata_path, - num_cores, - mane_version=mane_version) + num_cores) else: seq_df = process_seq_df(seq_df, - datasets_dir, organism, 
uniprot_to_gene_dict, - ens_canonical_transcripts_lst, - num_cores) + ens_canonical_transcripts_lst) # Save seq_df_cols = ['Gene', 'HGNC_ID', 'Ens_Gene_ID', From 6c7d4a4575aad29bc7adc1639ddcfdb321ede3e9 Mon Sep 17 00:00:00 2001 From: St3451 Date: Tue, 17 Feb 2026 12:46:30 +0100 Subject: [PATCH 22/60] refactor: update process_seq_df_mane to only download biomart metadata if there are no-mane transcripts to process with Proteins API --- scripts/datasets/seq_for_mut_prob.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index b78f429..cc7392c 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -636,6 +636,7 @@ def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): ) url = f"{base_archive}{query}" fallback_url = f"{base_latest}{query}" + logger.debug("Starting BioMart metadata download to %s (archive: %s, latest: %s).", path_to_file, base_archive, base_latest) if shutil.which("wget") is None: logger.warning("wget not found; falling back to Python downloader for BioMart metadata.") @@ -1135,9 +1136,9 @@ def mane_uniprot_to_hugo(uniprot_ids, mane_mapping): def process_seq_df(seq_df, + datasets_dir, organism, - uniprot_to_gene_dict, - ens_canonical_transcripts_lst): + uniprot_to_gene_dict): """ Retrieve DNA sequence and tri-nucleotide context for each structure in the initialized dataframe @@ -1153,6 +1154,8 @@ def process_seq_df(seq_df, logger.error("No sequences to process in process_seq_df; this should not happen.") raise RuntimeError("Empty sequence dataframe: no structures to process.") + ens_canonical_transcripts_lst = get_biomart_metadata(datasets_dir, seq_df["Uniprot_ID"].unique()) + # Process entries in Proteins API (Reference_info 1) #--------------------------------------------------- @@ -1203,10 +1206,10 @@ def process_seq_df(seq_df, def process_seq_df_mane(seq_df, + datasets_dir, uniprot_to_gene_dict, mane_mapping, mane_mapping_not_af, - ens_canonical_transcripts_lst, num_cores=1): """ Retrieve DNA sequence and tri-nucleotide context @@ -1267,6 +1270,7 @@ def process_seq_df_mane(seq_df, seq_df_nomane_tr = seq_df_nomane.copy() seq_df_nomane_notr = seq_df_nomane.copy() else: + ens_canonical_transcripts_lst = get_biomart_metadata(datasets_dir, seq_df_nomane["Uniprot_ID"].unique()) # Retrieve seq from coordinates logger.debug(f"Retrieving CDS DNA seq from reference genome (Proteins API): {len(seq_df_nomane['Uniprot_ID'].unique())} structures..") coord_df = get_exons_coord(seq_df_nomane["Uniprot_ID"].unique(), ens_canonical_transcripts_lst) @@ -1354,25 +1358,22 @@ def get_seq_df(datasets_dir, # uniprot_to_gene_dict = uniprot_to_hugo_pressed(uniprot_ids) # --- - # Get biomart metadata and canonical transcript IDs - ens_canonical_transcripts_lst = get_biomart_metadata(datasets_dir, uniprot_ids) - # Create a dataframe with protein sequences logger.debug("Initializing sequence df..") seq_df = initialize_seq_df(pdb_dir, uniprot_to_gene_dict) if mane: seq_df = process_seq_df_mane(seq_df, + datasets_dir, uniprot_to_gene_dict, mane_mapping, mane_mapping_not_af, - ens_canonical_transcripts_lst, num_cores) else: seq_df = process_seq_df(seq_df, + datasets_dir, organism, - uniprot_to_gene_dict, - ens_canonical_transcripts_lst) + uniprot_to_gene_dict) # Save seq_df_cols = ['Gene', 'HGNC_ID', 'Ens_Gene_ID', From 2029a9b55c96fd7b910a59fa8ee0469dd43a122e Mon Sep 17 00:00:00 2001 From: St3451 Date: Tue, 17 Feb 2026 12:50:49 
+0100 Subject: [PATCH 23/60] logs: add debug logging for BioMart download attempts in download_biomart_metadata function --- scripts/datasets/seq_for_mut_prob.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index cc7392c..6b69b0c 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -643,6 +643,7 @@ def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): last_exc = None ssl_verify_archive = url.startswith("https://") for attempt in range(1, max_attempts + 1): + logger.debug("Starting BioMart download attempt %s/%s (archive).", attempt, max_attempts) try: download_single_file(url, path_to_file, threads=4, ssl=ssl_verify_archive) return @@ -670,6 +671,7 @@ def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): ) ssl_verify_fallback = fallback_url.startswith("https://") for attempt in range(1, max_attempts + 1): + logger.debug("Starting BioMart download attempt %s/%s (latest).", attempt, max_attempts) try: download_single_file(fallback_url, path_to_file, threads=4, ssl=ssl_verify_fallback) return @@ -703,6 +705,7 @@ def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): ] for attempt in range(1, max_attempts + 1): + logger.debug("Starting BioMart wget attempt %s/%s (archive).", attempt, max_attempts) result = subprocess.run(command, capture_output=True, text=True) if result.returncode == 0: return @@ -739,6 +742,7 @@ def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): ) command[-1] = fallback_url for attempt in range(1, max_attempts + 1): + logger.debug("Starting BioMart wget attempt %s/%s (latest).", attempt, max_attempts) result = subprocess.run(command, capture_output=True, text=True) if result.returncode == 0: return From c01dd5c202bcf96a1791c5a1606c34bb9fdd67d3 Mon Sep 17 00:00:00 2001 From: St3451 Date: Tue, 17 Feb 2026 12:54:22 +0100 Subject: [PATCH 24/60] feat: initialize Tri_context with NaN and apply transformation only to valid sequences --- scripts/datasets/seq_for_mut_prob.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 6b69b0c..961dbd6 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -1191,7 +1191,9 @@ def process_seq_df(seq_df, seq_df_not_uniprot = batch_backtranseq(seq_df_not_uniprot, 500, organism=organism) # Get trinucleotide context - seq_df_not_uniprot["Tri_context"] = seq_df_not_uniprot["Seq_dna"].apply( + seq_df_not_uniprot["Tri_context"] = np.nan + valid_seq_mask = seq_df_not_uniprot["Seq_dna"].notna() + seq_df_not_uniprot.loc[valid_seq_mask, "Tri_context"] = seq_df_not_uniprot.loc[valid_seq_mask, "Seq_dna"].apply( lambda x: ",".join(per_site_trinucleotide_context(x, no_flanks=True))) @@ -1292,7 +1294,9 @@ def process_seq_df_mane(seq_df, seq_df_not_uniprot = pd.concat((seq_df_mane, seq_df_nomane_notr)) if "Seq_dna" not in seq_df_not_uniprot.columns: seq_df_not_uniprot["Seq_dna"] = pd.Series(dtype=object) - seq_df_not_uniprot["Tri_context"] = seq_df_not_uniprot["Seq_dna"].apply( + seq_df_not_uniprot["Tri_context"] = np.nan + valid_seq_mask = seq_df_not_uniprot["Seq_dna"].notna() + seq_df_not_uniprot.loc[valid_seq_mask, "Tri_context"] = seq_df_not_uniprot.loc[valid_seq_mask, "Seq_dna"].apply( lambda x: ",".join(per_site_trinucleotide_context(x, no_flanks=True))) # Prepare final output From 
cd65b6010d541714ec639fcfe5d8a96652b22c4a Mon Sep 17 00:00:00 2001 From: St3451 Date: Tue, 17 Feb 2026 12:56:23 +0100 Subject: [PATCH 25/60] refactor: return NaN instead of empty string for DNA sequences in Ensembl retrieval functions --- scripts/datasets/seq_for_mut_prob.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 961dbd6..44f4dba 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -921,7 +921,7 @@ def get_ref_dna_from_ensembl_batch(transcript_ids, max_attempts=3, wait_seconds= if not seq_dna: missing += 1 continue - results[pos] = seq_dna[:-3] if len(seq_dna) >= 3 else "" + results[pos] = seq_dna[:-3] if len(seq_dna) >= 3 else np.nan elapsed = time.perf_counter() - start_time if missing > 0 and logger.isEnabledFor(logging.DEBUG): @@ -1059,7 +1059,7 @@ def get_ref_dna_from_ensembl(transcript_id): failures, ) - return seq_dna[:-3] + return seq_dna[:-3] if len(seq_dna) >= 3 else np.nan def get_ref_dna_from_ensembl_mp(seq_df, cores): From 58398630600e3f9b3e2d61622aa62c59b017f32e Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 00:12:11 +0100 Subject: [PATCH 26/60] feat: add support for custom PAE directory and update README --- README.md | 6 + scripts/datasets/build_datasets.py | 185 ++++++++++++++++------------- scripts/datasets/get_pae.py | 54 +++++++-- scripts/main.py | 5 + 4 files changed, 160 insertions(+), 90 deletions(-) diff --git a/README.md b/README.md index b23f5de..8a28f2e 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,9 @@ This step build the datasets necessary for Oncodrive3D to run the 3D clustering > [!NOTE] > Human datasets built with the default settings pull canonical transcript metadata from the January 2024 Ensembl archive (release 111 / GENCODE v45). For maximum compatibility, annotate your input variants with the same Ensembl/Gencode release or supply the unfiltered VEP output together with `--o3d_transcripts --use_input_symbols`. +> [!NOTE] Predicted Aligned Error (PAE) files for older AlphaFold DB versions (e.g., v4) are no longer hosted after 2025. If you need PAE for an older AF version, download and supply them locally via `--custom_pae_dir`. +> MANE structures are only available for download from the AlphaFold DB v4 release; for MANE mode, you should provide PAE files via `--custom_pae_dir`. + ``` Usage: oncodrive3d build-datasets [OPTIONS] @@ -91,6 +94,9 @@ Options: (applicable to Homo sapiens only). -C, --custom_mane_pdb_dir PATH Path to directory containing custom MANE PDB structures. Default: None + --custom_pae_dir PATH Path to directory containing pre-downloaded PAE JSON files. + The directory will be copied into the build as `pae/`. + Default: None -f, --custom_mane_metadata_path Path to a dataframe (typically a samplesheet.csv) including Ensembl IDs and sequences of the custom pdbs. Default: None diff --git a/scripts/datasets/build_datasets.py b/scripts/datasets/build_datasets.py index 04a4f45..d0e961f 100644 --- a/scripts/datasets/build_datasets.py +++ b/scripts/datasets/build_datasets.py @@ -21,6 +21,7 @@ import os +import shutil import daiquiri from scripts import __logger_name__ @@ -43,6 +44,7 @@ def build(output_datasets, mane, mane_only, custom_pdb_dir, + custom_pae_dir, custom_mane_metadata_path, distance_threshold, num_cores, @@ -52,93 +54,109 @@ def build(output_datasets, Build datasets necessary to run Oncodrive3D. 
""" - # Empty directory - clean_dir(output_datasets, 'd', txt_file=True) + # # Empty directory + # clean_dir(output_datasets, 'd', txt_file=True) # Download PDB structures species = get_species(organism) - if not mane_only: - logger.info("Downloading AlphaFold (AF) predicted structures...") - get_structures( - path=os.path.join(output_datasets,"pdb_structures"), - species=species, - af_version=str(af_version), - threads=num_cores - ) - logger.info("Download of structures completed!") - - # Merge fragmented structures - logger.info("Merging fragmented structures...") - merge_af_fragments(input_dir=os.path.join(output_datasets,"pdb_structures"), gzip=True) - - # Download PDB MANE structures - if species == "Homo sapiens" and mane: - logger.info("Downloading AlphaFold (AF) predicted structures overlap with MANE...") - get_structures( - path=os.path.join(output_datasets,"pdb_structures_mane"), - species=species, - mane=True, - threads=num_cores - ) - mv_mane_pdb(output_datasets, "pdb_structures", "pdb_structures_mane") - logger.info("Download of MANE structures completed!") + # if not mane_only: + # logger.info("Downloading AlphaFold (AF) predicted structures...") + # get_structures( + # path=os.path.join(output_datasets,"pdb_structures"), + # species=species, + # af_version=str(af_version), + # threads=num_cores + # ) + # logger.info("Download of structures completed!") + + # # Merge fragmented structures + # logger.info("Merging fragmented structures...") + # merge_af_fragments(input_dir=os.path.join(output_datasets,"pdb_structures"), gzip=True) + + # # Download PDB MANE structures + # if species == "Homo sapiens" and mane: + # if str(af_version) != "4": + # logger.warning( + # "MANE structures are only available in AlphaFold DB v4. " + # "Ignoring --af_version=%s and using v4 for MANE downloads.", + # af_version, + # ) + # logger.info("Downloading AlphaFold (AF) predicted structures overlap with MANE...") + # get_structures( + # path=os.path.join(output_datasets,"pdb_structures_mane"), + # species=species, + # mane=True, + # af_version="4", + # threads=num_cores + # ) + # mv_mane_pdb(output_datasets, "pdb_structures", "pdb_structures_mane") + # logger.info("Download of MANE structures completed!") - # Copy custom PDB structures and optinally add SEQRES - if custom_pdb_dir is not None: - if custom_mane_metadata_path is None: - logger.error( - "custom_mane_metadata_path must be provided when custom_pdb_dir is specified" - ) - raise ValueError( - "Both custom_pdb_dir and custom_mane_metadata_path must be provided together" - ) + # # Copy custom PDB structures and optinally add SEQRES + # if custom_pdb_dir is not None: + # if custom_mane_metadata_path is None: + # logger.error( + # "custom_mane_metadata_path must be provided when custom_pdb_dir is specified" + # ) + # raise ValueError( + # "Both custom_pdb_dir and custom_mane_metadata_path must be provided together" + # ) - logger.info("Copying custom PDB structures...") - if os.path.exists(custom_pdb_dir): - copy_and_parse_custom_pdbs( - src_dir=custom_pdb_dir, - dst_dir=os.path.join(output_datasets,"pdb_structures"), - af_version=int(af_version), - custom_mane_metadata_path=custom_mane_metadata_path - ) - else: - logger.error(f"Custom PDB directory does not exist: {custom_pdb_dir}") - raise FileNotFoundError(f"Custom PDB directory not found: {custom_pdb_dir}") + # logger.info("Copying custom PDB structures...") + # if os.path.exists(custom_pdb_dir): + # copy_and_parse_custom_pdbs( + # src_dir=custom_pdb_dir, + # 
dst_dir=os.path.join(output_datasets,"pdb_structures"), + # af_version=int(af_version), + # custom_mane_metadata_path=custom_mane_metadata_path + # ) + # else: + # logger.error(f"Custom PDB directory does not exist: {custom_pdb_dir}") + # raise FileNotFoundError(f"Custom PDB directory not found: {custom_pdb_dir}") - # Create df including genes and proteins sequences & Hugo to Uniprot_ID mapping - logger.info("Generating dataframe for genes and proteins sequences...") - seq_df = get_seq_df( - datasets_dir=output_datasets, - output_seq_df=os.path.join(output_datasets, "seq_for_mut_prob.tsv"), - organism=species, - mane=mane, - num_cores=num_cores, - mane_version=mane_version, - custom_mane_metadata_path=custom_mane_metadata_path - ) - logger.info("Generation of sequences dataframe completed!") - - # Get model confidence - logger.info("Extracting AF model confidence...") - get_confidence( - input=os.path.join(output_datasets, "pdb_structures"), - output_dir=os.path.join(output_datasets), - seq_df=seq_df - ) - - # Get PAE - logger.info("Downloading AF predicted aligned error (PAE)...") - get_pae( - input_dir=os.path.join(output_datasets,"pdb_structures"), - output_dir=os.path.join(output_datasets,"pae"), - num_cores=num_cores, - af_version=str(af_version), - custom_pdb_dir=custom_pdb_dir - ) + # # Create df including genes and proteins sequences & Hugo to Uniprot_ID mapping + # logger.info("Generating dataframe for genes and proteins sequences...") + # seq_df = get_seq_df( + # datasets_dir=output_datasets, + # output_seq_df=os.path.join(output_datasets, "seq_for_mut_prob.tsv"), + # organism=species, + # mane=mane, + # num_cores=num_cores, + # mane_version=mane_version, + # custom_mane_metadata_path=custom_mane_metadata_path + # ) + # logger.info("Generation of sequences dataframe completed!") + + # # Get model confidence + # logger.info("Extracting AF model confidence...") + # get_confidence( + # input=os.path.join(output_datasets, "pdb_structures"), + # output_dir=os.path.join(output_datasets), + # seq_df=seq_df + # ) + + pae_output_dir = os.path.join(output_datasets, "pae") + if custom_pae_dir is not None: + logger.info("Copying precomputed PAE directory...") + if os.path.exists(custom_pae_dir): + shutil.copytree(custom_pae_dir, pae_output_dir) + else: + logger.error(f"Custom PAE directory does not exist: {custom_pae_dir}") + raise FileNotFoundError(f"Custom PAE directory not found: {custom_pae_dir}") + else: + # Get PAE + logger.info("Downloading AF predicted aligned error (PAE)...") + get_pae( + input_dir=os.path.join(output_datasets,"pdb_structures"), + output_dir=pae_output_dir, + num_cores=num_cores, + af_version=str(af_version), + custom_pdb_dir=custom_pdb_dir + ) # Parse PAE logger.info("Parsing PAE...") - parse_pae(input=os.path.join(output_datasets, 'pae')) + parse_pae(input=pae_output_dir) logger.info("Parsing PAE completed!") # Get pCAMPs @@ -160,14 +178,15 @@ def build(output_datasets, if __name__ == "__main__": build( - output_datasets="/data/bbg/nobackup/scratch/oncodrive3d/mane_missing/oncodrive3d/datasets/datasets-mane_only-mane_custom-250729", + output_datasets="/home/spellegrini/Coding/Data/oncodrive3d/datasets-mane_only-mane_custom-260215-debugging", organism="Homo sapiens", - mane=False, + mane=True, mane_only=True, - custom_pdb_dir="/data/bbg/nobackup/scratch/oncodrive3d/mane_missing/data/250724-no_fragments/all_pdbs-pred_and_retrieved/pdbs", - 
custom_mane_metadata_path="/data/bbg/nobackup/scratch/oncodrive3d/mane_missing/data/250724-no_fragments/all_pdbs-pred_and_retrieved/samplesheet.csv", + custom_pdb_dir="/home/spellegrini/Coding/Data/oncodrive3d/mane_missing/20260215/final_bundle/pdbs", + custom_pae_dir=None, + custom_mane_metadata_path="/home/spellegrini/Coding/Data/oncodrive3d/mane_missing/20260215/final_bundle/samplesheet.csv", distance_threshold=10, - num_cores=8, + num_cores=1, af_version=4, mane_version=1.4 ) diff --git a/scripts/datasets/get_pae.py b/scripts/datasets/get_pae.py index fe60087..e1c1030 100644 --- a/scripts/datasets/get_pae.py +++ b/scripts/datasets/get_pae.py @@ -17,7 +17,7 @@ def download_pae( af_version: int, output_dir: str, max_retries: int = 100 - ) -> None: + ) -> str: """ Download Predicted Aligned Error (PAE) file from AlphaFold DB. @@ -26,6 +26,9 @@ def download_pae( af_version: AlphaFold 2 version. output_dir: Output directory where to download the PAE files. max_retries: Break the loop if the download fails too many times. + + Returns: + "ok" if downloaded, "missing" if 404/410, "failed" otherwise. """ file_path = os.path.join(output_dir, f"{uniprot_id}-F1-predicted_aligned_error.json") @@ -42,6 +45,17 @@ def download_pae( time.sleep(30) try: response = requests.get(download_url, timeout=30) + if response.status_code in (404, 410): + logger.warning( + "PAE not available for %s (AF v%s). Skipping.", + uniprot_id, + af_version, + ) + return "missing" + if not response.ok: + raise requests.exceptions.HTTPError( + f"PAE download failed with status {response.status_code}" + ) content = response.content if content.endswith(b'}]') and not content.endswith(b''): with open(file_path, 'wb') as output_file: @@ -51,6 +65,7 @@ def download_pae( status = "ERROR" if i % 10 == 0: logger.debug(f"Request failed {e}: Retrying") + return "ok" if status == "FINISHED" else "failed" def get_pae( @@ -88,13 +103,38 @@ def get_pae( custom_uniprot_ids = [fname.split('.')[0] for fname in os.listdir(custom_pdb_dir) if fname.endswith('.pdb')] uniprot_ids = [uni_id for uni_id in uniprot_ids if uni_id not in custom_uniprot_ids] - with concurrent.futures.ThreadPoolExecutor(max_workers=num_cores) as executor: - tasks = [executor.submit(download_pae, uniprot_id, af_version, output_dir) for uniprot_id in uniprot_ids] - - for _ in tqdm(concurrent.futures.as_completed(tasks), total=len(tasks), desc="Downloading PAE"): - pass + consecutive_missing = 0 + if uniprot_ids: + probe_ids = uniprot_ids[:5] + remaining_ids = uniprot_ids[5:] + for uniprot_id in tqdm(probe_ids, desc="Downloading PAE"): + result = download_pae(uniprot_id, af_version, output_dir) + if result == "missing": + consecutive_missing += 1 + else: + consecutive_missing = 0 + + if consecutive_missing >= 5: + logger.warning( + "Detected %s consecutive missing PAE downloads (HTTP 404/410). " + "PAE files for AlphaFold DB v%s are likely unavailable. " + "Skipping remaining PAE downloads. Contact maps will be computed without PAE (binary maps). 
" + "Re-run build-datasets with --custom_pae_dir to use precomputed PAE.", + consecutive_missing, + af_version, + ) + with open(checkpoint, "w") as f: + f.write('') + return + + if remaining_ids: + with concurrent.futures.ThreadPoolExecutor(max_workers=num_cores) as executor: + tasks = [executor.submit(download_pae, uniprot_id, af_version, output_dir) for uniprot_id in remaining_ids] + + for _ in tqdm(concurrent.futures.as_completed(tasks), total=len(tasks), desc="Downloading PAE"): + pass with open(checkpoint, "w") as f: f.write('') - logger.info('Download of PAE completed!') \ No newline at end of file + logger.info('Download of PAE completed!') diff --git a/scripts/main.py b/scripts/main.py index 8fd1af3..5fbb767 100755 --- a/scripts/main.py +++ b/scripts/main.py @@ -44,6 +44,8 @@ def oncodrive3D(): help="Use only structures predicted from MANE Select transcripts", is_flag=True) @click.option("-C", "--custom_mane_pdb_dir", help="Directory where to load custom MANE PDB structures (overwriting existing ones)") +@click.option("--custom_pae_dir", + help="Directory containing pre-downloaded PAE JSON files to copy into the build (renamed to 'pae')") @click.option("-f", "--custom_mane_metadata_path", help="Path to a dataframe including the Ensembl Protein ID and the amino acid sequence of the custom MANE PDB structures") @click.option("-j", "--mane_version", default=1.4, @@ -64,6 +66,7 @@ def build_datasets(output_dir, mane, mane_only, custom_mane_pdb_dir, + custom_pae_dir, custom_mane_metadata_path, distance_threshold, cores, @@ -85,6 +88,7 @@ def build_datasets(output_dir, logger.info(f"MANE Select: {mane}") logger.info(f"MANE Select only: {mane_only}") logger.info(f"Custom MANE PDB directory: {custom_mane_pdb_dir}") + logger.info(f"Custom PAE directory: {custom_pae_dir}") logger.info(f"Custom MANE PDB metadata path: {custom_mane_metadata_path}") logger.info(f"Distance threshold: {distance_threshold}Å") logger.info(f"CPU cores: {cores}") @@ -100,6 +104,7 @@ def build_datasets(output_dir, mane, mane_only, custom_mane_pdb_dir, + custom_pae_dir, custom_mane_metadata_path, distance_threshold, cores, From b6f60cbc5c2dd7cb5f56095f714b17a45dcdb9cb Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 00:18:31 +0100 Subject: [PATCH 27/60] docs: update README and main.py to clarify AlphaFold DB versioning for MANE and non-MANE builds --- README.md | 6 +++--- scripts/main.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 8a28f2e..e62baf6 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ This step build the datasets necessary for Oncodrive3D to run the 3D clustering > Human datasets built with the default settings pull canonical transcript metadata from the January 2024 Ensembl archive (release 111 / GENCODE v45). For maximum compatibility, annotate your input variants with the same Ensembl/Gencode release or supply the unfiltered VEP output together with `--o3d_transcripts --use_input_symbols`. > [!NOTE] Predicted Aligned Error (PAE) files for older AlphaFold DB versions (e.g., v4) are no longer hosted after 2025. If you need PAE for an older AF version, download and supply them locally via `--custom_pae_dir`. -> MANE structures are only available for download from the AlphaFold DB v4 release; for MANE mode, you should provide PAE files via `--custom_pae_dir`. +> MANE structures are only available from the AlphaFold DB v4 release. 
Non‑MANE builds default to v6; MANE mode forces v4 for structures, so you should provide PAE files via `--custom_pae_dir`. ``` Usage: oncodrive3d build-datasets [OPTIONS] @@ -104,8 +104,8 @@ Options: Default: 10 -c, --cores INT Number of CPU cores for computation. Default: All available CPU cores - --af_version INT Version of the AlphaFold Protein Structure Database release. - Default: 4 + --af_version INT AlphaFold DB version for non-MANE builds (MANE uses v4). + Default: 6 -y, --yes Run without interactive prompts. -v, --verbose Enables verbose output. -h, --help Show this message and exit. diff --git a/scripts/main.py b/scripts/main.py index 5fbb767..2e167b0 100755 --- a/scripts/main.py +++ b/scripts/main.py @@ -54,8 +54,8 @@ def oncodrive3D(): help="Distance threshold (Å) to define contact between amino acids") @click.option("-c", "--cores", type=click.IntRange(min=1, max=len(os.sched_getaffinity(0)), clamp=False), default=len(os.sched_getaffinity(0)), help="Number of cores to use in the computation") -@click.option("--af_version", type=click.IntRange(min=1, clamp=False), default=4, - help="Version of AlphaFold 2 predictions") +@click.option("--af_version", type=click.IntRange(min=1, clamp=False), default=6, + help="AlphaFold DB version for non-MANE downloads (MANE uses v4)") @click.option("-y", "--yes", help="No interaction", is_flag=True) @click.option("-v", "--verbose", From 2812f00ac1bc39231d20af3ab8cf6e7c2c49cc12 Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 00:22:41 +0100 Subject: [PATCH 28/60] docs: correct typos and improve clarity in README.md --- README.md | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index e62baf6..3e4cc86 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Additionally, you may need to install additional development tools. Depending on - If you have sudo privileges: ```bash - sudo apt install built-essential + sudo apt install build-essential ``` - For HPC cluster environment, it is recommended to use [Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) (or [Mamba](https://mamba.readthedocs.io/en/latest/)): @@ -57,7 +57,7 @@ Additionally, you may need to install additional development tools. Depending on ## Building Datasets -This step build the datasets necessary for Oncodrive3D to run the 3D clustering analysis. It is required once after installation or whenever you need to generate datasets for a different organism or apply a specific threshold to define amino acid contacts. +This step builds the datasets necessary for Oncodrive3D to run the 3D clustering analysis. It is required once after installation or whenever you need to generate datasets for a different organism or apply a specific threshold to define amino acid contacts. > [!WARNING] > This step is highly time- and resource-intensive, requiring a significant amount of free disk space and computational power. It will download and process a large amount of data. Ensure sufficient resources are available before proceeding, as insufficient capacity may result in extended runtimes or processing failures. @@ -65,7 +65,7 @@ This step build the datasets necessary for Oncodrive3D to run the 3D clustering > Reliable internet access is required because AlphaFold structures, Ensembl annotations, Pfam files, and other resources are downloaded on demand during the build. 
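
If the PAE JSON files were fetched ahead of time (for instance from a mirror or a previous build), they can be staged in a local folder and passed to the build step instead of being downloaded again. The snippet below is only an illustrative sketch: the folder names and the UniProt accession are placeholders, and it assumes the staged files keep the `<UNIPROT_ID>-F1-predicted_aligned_error.json` naming that the downloader itself produces.

```bash
# Stage pre-downloaded PAE files (naming assumed to follow the AlphaFold DB convention)
mkdir -p predownloaded_pae
cp /path/to/af_pae/P04637-F1-predicted_aligned_error.json predownloaded_pae/

# Point the dataset build at the staged folder (remaining options as documented below)
oncodrive3d build-datasets --mane_only --custom_pae_dir predownloaded_pae
```
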
> [!NOTE] -> The first time that you run Oncodrive3D building dataset step with a given reference genome, it will download it from our servers. By default the downloaded datasets go to`~/.bgdata`. If you want to move these datasets to another folder you have to define the system environment variable `BGDATA_LOCAL` with an export command. +> The first time that you run Oncodrive3D building dataset step with a given reference genome, it will download it from our servers. By default the downloaded datasets go to `~/.bgdata`. If you want to move these datasets to another folder you have to define the system environment variable `BGDATA_LOCAL` with an export command. > [!NOTE] > Human datasets built with the default settings pull canonical transcript metadata from the January 2024 Ensembl archive (release 111 / GENCODE v45). For maximum compatibility, annotate your input variants with the same Ensembl/Gencode release or supply the unfiltered VEP output together with `--o3d_transcripts --use_input_symbols`. @@ -118,7 +118,7 @@ For more information on the output of this step, please refer to the [Building D > To maximize structural coverage of **MANE Select transcripts**, you can [predict missing structures locally and integrate them into Oncodrive3D](tools/preprocessing/README.md) using: > > - `tools/preprocessing/prepare_samplesheet.py`: a standalone utility that: -> - Retrieve the full MANE entries from NCBI. +> - Retrieves the full MANE entries from NCBI. > - Identifies proteins missing from the AlphaFold MANE dataset. > - Generates: > - A `samplesheet.csv` with Ensembl protein IDs, FASTA paths, and optional sequences. @@ -139,7 +139,7 @@ For more information on the output of this step, please refer to the [Building D ## Running 3D clustering Analysis -For in depth information on how to obtain the required input data and for comprehensive information about the output, please refer to the [Input and Output Documentation](https://github.com/bbglab/oncodrive3d/tree/master/docs/run_input_output.md) of the 3D clustering analysis. +For in-depth information on how to obtain the required input data and for comprehensive information about the output, please refer to the [Input and Output Documentation](https://github.com/bbglab/oncodrive3d/tree/master/docs/run_input_output.md) of the 3D clustering analysis. ### Input @@ -262,8 +262,6 @@ For more information, refer to the [Oncodrive3D Pipeline](https://github.com/bbg ### Usage ---- - > [!WARNING] > When using the Nextflow script, ensure that your input files are organized in the following directory structure (you only need either the `maf/` or `vep/` directory): > @@ -308,10 +306,10 @@ Options: --vep_input BOOL Use `vep/` subdir as input and select transcripts matching the Ensembl transcript IDs in Oncodrive3D built datasets. Default: false - --mane BOOL Prioritize structures corresponding to MANE transcrips if + --mane BOOL Prioritize structures corresponding to MANE transcripts if multiple structures are associated to the same gene. Default: false - --seed INT: Seed value for reproducibility. + --seed INT Seed value for reproducibility. 
Default: 128 ``` From dfc524389477ecf59e1a6e4b8f901ecfbf34fa4f Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 00:32:24 +0100 Subject: [PATCH 29/60] feat: enhance dataset building process with improved logging and structure downloads --- scripts/datasets/build_datasets.py | 145 +++++++++++++++-------------- 1 file changed, 73 insertions(+), 72 deletions(-) diff --git a/scripts/datasets/build_datasets.py b/scripts/datasets/build_datasets.py index d0e961f..c3385a6 100644 --- a/scripts/datasets/build_datasets.py +++ b/scripts/datasets/build_datasets.py @@ -59,81 +59,82 @@ def build(output_datasets, # Download PDB structures species = get_species(organism) - # if not mane_only: - # logger.info("Downloading AlphaFold (AF) predicted structures...") - # get_structures( - # path=os.path.join(output_datasets,"pdb_structures"), - # species=species, - # af_version=str(af_version), - # threads=num_cores - # ) - # logger.info("Download of structures completed!") - - # # Merge fragmented structures - # logger.info("Merging fragmented structures...") - # merge_af_fragments(input_dir=os.path.join(output_datasets,"pdb_structures"), gzip=True) - - # # Download PDB MANE structures - # if species == "Homo sapiens" and mane: - # if str(af_version) != "4": - # logger.warning( - # "MANE structures are only available in AlphaFold DB v4. " - # "Ignoring --af_version=%s and using v4 for MANE downloads.", - # af_version, - # ) - # logger.info("Downloading AlphaFold (AF) predicted structures overlap with MANE...") - # get_structures( - # path=os.path.join(output_datasets,"pdb_structures_mane"), - # species=species, - # mane=True, - # af_version="4", - # threads=num_cores - # ) - # mv_mane_pdb(output_datasets, "pdb_structures", "pdb_structures_mane") - # logger.info("Download of MANE structures completed!") + if mane and str(af_version) != "4": + logger.warning( + "MANE structures are only available in AlphaFold DB v4. 
" + "Ignoring --af_version=%s and using v4 for this build.", + af_version, + ) + af_version = 4 + if not mane_only: + logger.info("Downloading AlphaFold (AF) predicted structures...") + get_structures( + path=os.path.join(output_datasets,"pdb_structures"), + species=species, + af_version=str(af_version), + threads=num_cores + ) + logger.info("Download of structures completed!") + + # Merge fragmented structures + logger.info("Merging fragmented structures...") + merge_af_fragments(input_dir=os.path.join(output_datasets,"pdb_structures"), gzip=True) + + # Download PDB MANE structures + if species == "Homo sapiens" and mane: + logger.info("Downloading AlphaFold (AF) predicted structures overlap with MANE...") + get_structures( + path=os.path.join(output_datasets,"pdb_structures_mane"), + species=species, + mane=True, + af_version=str(af_version), + threads=num_cores + ) + mv_mane_pdb(output_datasets, "pdb_structures", "pdb_structures_mane") + logger.info("Download of MANE structures completed!") - # # Copy custom PDB structures and optinally add SEQRES - # if custom_pdb_dir is not None: - # if custom_mane_metadata_path is None: - # logger.error( - # "custom_mane_metadata_path must be provided when custom_pdb_dir is specified" - # ) - # raise ValueError( - # "Both custom_pdb_dir and custom_mane_metadata_path must be provided together" - # ) + # Copy custom PDB structures and optinally add SEQRES + if custom_pdb_dir is not None: + if custom_mane_metadata_path is None: + logger.error( + "custom_mane_metadata_path must be provided when custom_pdb_dir is specified" + ) + raise ValueError( + "Both custom_pdb_dir and custom_mane_metadata_path must be provided together" + ) - # logger.info("Copying custom PDB structures...") - # if os.path.exists(custom_pdb_dir): - # copy_and_parse_custom_pdbs( - # src_dir=custom_pdb_dir, - # dst_dir=os.path.join(output_datasets,"pdb_structures"), - # af_version=int(af_version), - # custom_mane_metadata_path=custom_mane_metadata_path - # ) - # else: - # logger.error(f"Custom PDB directory does not exist: {custom_pdb_dir}") - # raise FileNotFoundError(f"Custom PDB directory not found: {custom_pdb_dir}") + logger.info("Copying custom PDB structures...") + if os.path.exists(custom_pdb_dir): + copy_and_parse_custom_pdbs( + src_dir=custom_pdb_dir, + dst_dir=os.path.join(output_datasets,"pdb_structures"), + af_version=int(af_version), + custom_mane_metadata_path=custom_mane_metadata_path + ) + else: + logger.error(f"Custom PDB directory does not exist: {custom_pdb_dir}") + raise FileNotFoundError(f"Custom PDB directory not found: {custom_pdb_dir}") - # # Create df including genes and proteins sequences & Hugo to Uniprot_ID mapping - # logger.info("Generating dataframe for genes and proteins sequences...") - # seq_df = get_seq_df( - # datasets_dir=output_datasets, - # output_seq_df=os.path.join(output_datasets, "seq_for_mut_prob.tsv"), - # organism=species, - # mane=mane, - # num_cores=num_cores, - # mane_version=mane_version, - # custom_mane_metadata_path=custom_mane_metadata_path - # ) - # logger.info("Generation of sequences dataframe completed!") - - # # Get model confidence - # logger.info("Extracting AF model confidence...") - # get_confidence( - # input=os.path.join(output_datasets, "pdb_structures"), - # output_dir=os.path.join(output_datasets), - # seq_df=seq_df - # ) + # Create df including genes and proteins sequences & Hugo to Uniprot_ID mapping + logger.info("Generating dataframe for genes and proteins sequences...") + seq_df = get_seq_df( + 
datasets_dir=output_datasets, + output_seq_df=os.path.join(output_datasets, "seq_for_mut_prob.tsv"), + organism=species, + mane=mane, + num_cores=num_cores, + mane_version=mane_version, + custom_mane_metadata_path=custom_mane_metadata_path + ) + logger.info("Generation of sequences dataframe completed!") + + # Get model confidence + logger.info("Extracting AF model confidence...") + get_confidence( + input=os.path.join(output_datasets, "pdb_structures"), + output_dir=os.path.join(output_datasets), + seq_df=seq_df + ) pae_output_dir = os.path.join(output_datasets, "pae") if custom_pae_dir is not None: From d79d6793f8d3d168536247e65255df9a85821cba Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 00:32:54 +0100 Subject: [PATCH 30/60] fix: enable directory cleaning in dataset build process --- scripts/datasets/build_datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/datasets/build_datasets.py b/scripts/datasets/build_datasets.py index c3385a6..2674ea0 100644 --- a/scripts/datasets/build_datasets.py +++ b/scripts/datasets/build_datasets.py @@ -54,8 +54,8 @@ def build(output_datasets, Build datasets necessary to run Oncodrive3D. """ - # # Empty directory - # clean_dir(output_datasets, 'd', txt_file=True) + # Empty directory + clean_dir(output_datasets, 'd', txt_file=True) # Download PDB structures species = get_species(organism) From 26fc3d51f6377332c04ba18eb7a2618b9d8eb531 Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 00:35:51 +0100 Subject: [PATCH 31/60] fix: improve logging for custom PAE directory handling in dataset build process --- scripts/datasets/build_datasets.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/datasets/build_datasets.py b/scripts/datasets/build_datasets.py index 2674ea0..3bdbff2 100644 --- a/scripts/datasets/build_datasets.py +++ b/scripts/datasets/build_datasets.py @@ -136,16 +136,19 @@ def build(output_datasets, seq_df=seq_df ) + # Get PAE pae_output_dir = os.path.join(output_datasets, "pae") if custom_pae_dir is not None: logger.info("Copying precomputed PAE directory...") if os.path.exists(custom_pae_dir): shutil.copytree(custom_pae_dir, pae_output_dir) else: - logger.error(f"Custom PAE directory does not exist: {custom_pae_dir}") - raise FileNotFoundError(f"Custom PAE directory not found: {custom_pae_dir}") + logger.warning( + "Custom PAE directory does not exist: %s. Skipping copy. 
" + "Contact maps will be computed without PAE (binary maps).", + custom_pae_dir, + ) else: - # Get PAE logger.info("Downloading AF predicted aligned error (PAE)...") get_pae( input_dir=os.path.join(output_datasets,"pdb_structures"), From 6456014ec80626443fd6484ec25b61eafbd6e03d Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 00:54:15 +0100 Subject: [PATCH 32/60] lint: update main execution to enforce CLI usage for dataset building --- scripts/datasets/build_datasets.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/scripts/datasets/build_datasets.py b/scripts/datasets/build_datasets.py index 3bdbff2..a5e9fa0 100644 --- a/scripts/datasets/build_datasets.py +++ b/scripts/datasets/build_datasets.py @@ -181,16 +181,6 @@ def build(output_datasets, logger.info("Datasets have been successfully built and are ready for analysis!") if __name__ == "__main__": - build( - output_datasets="/home/spellegrini/Coding/Data/oncodrive3d/datasets-mane_only-mane_custom-260215-debugging", - organism="Homo sapiens", - mane=True, - mane_only=True, - custom_pdb_dir="/home/spellegrini/Coding/Data/oncodrive3d/mane_missing/20260215/final_bundle/pdbs", - custom_pae_dir=None, - custom_mane_metadata_path="/home/spellegrini/Coding/Data/oncodrive3d/mane_missing/20260215/final_bundle/samplesheet.csv", - distance_threshold=10, - num_cores=1, - af_version=4, - mane_version=1.4 - ) + raise SystemExit( + "This module is intended to be used via the CLI: `oncodrive3d build-datasets`." + ) From 519458417304e44261807ec12710780e6fd3c480 Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 01:10:28 +0100 Subject: [PATCH 33/60] fix: pass af_version to merge_af_fragments for improved dataset merging --- scripts/datasets/build_datasets.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/datasets/build_datasets.py b/scripts/datasets/build_datasets.py index a5e9fa0..1ac13e3 100644 --- a/scripts/datasets/build_datasets.py +++ b/scripts/datasets/build_datasets.py @@ -78,7 +78,11 @@ def build(output_datasets, # Merge fragmented structures logger.info("Merging fragmented structures...") - merge_af_fragments(input_dir=os.path.join(output_datasets,"pdb_structures"), gzip=True) + merge_af_fragments( + input_dir=os.path.join(output_datasets,"pdb_structures"), + af_version=af_version, + gzip=True + ) # Download PDB MANE structures if species == "Homo sapiens" and mane: From 4e447dd217bbf4164313274ac9034c43711c67dd Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 11:37:31 +0100 Subject: [PATCH 34/60] logs: enhance logging for duplicate gene removal and Ensembl CDS failures in sequence dataframe processing --- scripts/datasets/seq_for_mut_prob.py | 43 +++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 44f4dba..c090cd2 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -1206,7 +1206,15 @@ def process_seq_df(seq_df, seq_df.Reference_info.value_counts().values)]) logger.info(f"Built of sequence dataframe completed. 
Retrieved {len(seq_df)} structures ({logger_report})") seq_df = add_extra_genes_to_seq_df(seq_df, uniprot_to_gene_dict) + pre_drop = len(seq_df) seq_df = drop_gene_duplicates(seq_df) + if logger.isEnabledFor(logging.DEBUG): + logger.debug( + "Duplicate gene removal: %s removed (from %s to %s).", + pre_drop - len(seq_df), + pre_drop, + len(seq_df), + ) return seq_df @@ -1246,7 +1254,18 @@ def process_seq_df_mane(seq_df, seq_df_mane = get_ref_dna_from_ensembl_mp(seq_df_mane, cores=num_cores) # Set failed and len-mismatching entries as no-transcripts entries - failed_ix = seq_df_mane.apply(lambda x: True if pd.isna(x.Seq_dna) else len(x.Seq_dna) / 3 != len(x.Seq), axis=1) + seq_len = seq_df_mane["Seq"].str.len() + dna_len = seq_df_mane["Seq_dna"].str.len() + failed_nan = seq_df_mane["Seq_dna"].isna() + failed_mismatch = (~failed_nan) & (dna_len / 3 != seq_len) + failed_ix = failed_nan | failed_mismatch + if logger.isEnabledFor(logging.DEBUG): + logger.debug( + "Ensembl CDS failures: total=%s (missing=%s, length_mismatch=%s).", + int(failed_ix.sum()), + int(failed_nan.sum()), + int(failed_mismatch.sum()), + ) if sum(failed_ix) > 0: seq_df_mane_failed = seq_df_mane[failed_ix] seq_df_mane = seq_df_mane[~failed_ix] @@ -1260,6 +1279,11 @@ def process_seq_df_mane(seq_df, "Seq_dna" ]) seq_df_nomane = pd.concat((seq_df_nomane, seq_df_mane_failed)) + if logger.isEnabledFor(logging.DEBUG): + logger.debug( + "Moved %s failed MANE entries to non-MANE pool.", + len(seq_df_mane_failed), + ) # Seq df not MANE # --------------- @@ -1268,8 +1292,17 @@ def process_seq_df_mane(seq_df, seq_df_nomane_tr = seq_df_nomane.copy() seq_df_nomane_notr = seq_df_nomane.copy() else: + before_nomane = len(seq_df_nomane) seq_df_nomane = add_extra_genes_to_seq_df(seq_df_nomane, uniprot_to_gene_dict) # Filter out genes with NA + after_extra = len(seq_df_nomane) seq_df_nomane = seq_df_nomane[seq_df_nomane.Gene.isin(mane_mapping_not_af.Gene)] # Filter out genes that are not in MANE list + if logger.isEnabledFor(logging.DEBUG): + logger.debug( + "Non-MANE pool sizes: initial=%s, after_extra_genes=%s, after_mane_filter=%s.", + before_nomane, + after_extra, + len(seq_df_nomane), + ) if seq_df_nomane.empty: logger.debug("No non-MANE sequences after filtering; skipping Proteins/Backtranseq retrieval.") @@ -1301,7 +1334,15 @@ def process_seq_df_mane(seq_df, # Prepare final output seq_df = pd.concat((seq_df_not_uniprot, seq_df_nomane_tr)).reset_index(drop=True) + pre_drop = len(seq_df) seq_df = drop_gene_duplicates(seq_df) + if logger.isEnabledFor(logging.DEBUG): + logger.debug( + "Duplicate gene removal: %s removed (from %s to %s).", + pre_drop - len(seq_df), + pre_drop, + len(seq_df), + ) report_df = seq_df.Reference_info.value_counts().reset_index() report_df = report_df.rename(columns={"index" : "Source"}) report_df.Source = report_df.Source.map({1 : "Proteins API", From cfc4e6b0f6ffb373bbdd3c0ff35d26395c5c1ddf Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 11:56:06 +0100 Subject: [PATCH 35/60] fix: update custom MANE PDB directory option to require --mane_only flag and guard the filter for non_mane structures to be used only when mane_only=False --- README.md | 2 +- scripts/datasets/build_datasets.py | 8 ++++++++ scripts/datasets/seq_for_mut_prob.py | 30 ++++++++++++++++++++-------- scripts/main.py | 2 +- 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 3e4cc86..289abaf 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ Options: (applicable to Homo sapiens only). 
-M, --mane_only Use only structures predicted from MANE Select transcripts (applicable to Homo sapiens only). - -C, --custom_mane_pdb_dir PATH Path to directory containing custom MANE PDB structures. + -C, --custom_mane_pdb_dir PATH Path to directory containing custom MANE PDB structures (requires --mane_only). Default: None --custom_pae_dir PATH Path to directory containing pre-downloaded PAE JSON files. The directory will be copied into the build as `pae/`. diff --git a/scripts/datasets/build_datasets.py b/scripts/datasets/build_datasets.py index 1ac13e3..a65ec38 100644 --- a/scripts/datasets/build_datasets.py +++ b/scripts/datasets/build_datasets.py @@ -99,6 +99,13 @@ def build(output_datasets, # Copy custom PDB structures and optinally add SEQRES if custom_pdb_dir is not None: + if not mane_only: + logger.error( + "custom_pdb_dir requires --mane_only. Use --mane_only when providing custom MANE structures." + ) + raise ValueError( + "custom_pdb_dir requires --mane_only" + ) if custom_mane_metadata_path is None: logger.error( "custom_mane_metadata_path must be provided when custom_pdb_dir is specified" @@ -126,6 +133,7 @@ def build(output_datasets, output_seq_df=os.path.join(output_datasets, "seq_for_mut_prob.tsv"), organism=species, mane=mane, + mane_only=mane_only, num_cores=num_cores, mane_version=mane_version, custom_mane_metadata_path=custom_mane_metadata_path diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index c090cd2..4fe62d7 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -1224,6 +1224,7 @@ def process_seq_df_mane(seq_df, uniprot_to_gene_dict, mane_mapping, mane_mapping_not_af, + mane_only=False, num_cores=1): """ Retrieve DNA sequence and tri-nucleotide context @@ -1295,14 +1296,24 @@ def process_seq_df_mane(seq_df, before_nomane = len(seq_df_nomane) seq_df_nomane = add_extra_genes_to_seq_df(seq_df_nomane, uniprot_to_gene_dict) # Filter out genes with NA after_extra = len(seq_df_nomane) - seq_df_nomane = seq_df_nomane[seq_df_nomane.Gene.isin(mane_mapping_not_af.Gene)] # Filter out genes that are not in MANE list - if logger.isEnabledFor(logging.DEBUG): - logger.debug( - "Non-MANE pool sizes: initial=%s, after_extra_genes=%s, after_mane_filter=%s.", - before_nomane, - after_extra, - len(seq_df_nomane), - ) + if not mane_only: + if logger.isEnabledFor(logging.DEBUG): + logger.debug("Filtering non-MANE entries using mane_mapping_not_af (gene whitelist).") + seq_df_nomane = seq_df_nomane[seq_df_nomane.Gene.isin(mane_mapping_not_af.Gene)] # Filter out genes that are not in MANE list + if logger.isEnabledFor(logging.DEBUG): + logger.debug( + "Non-MANE pool sizes: initial=%s, after_extra_genes=%s, after_mane_filter=%s.", + before_nomane, + after_extra, + len(seq_df_nomane), + ) + else: + if logger.isEnabledFor(logging.DEBUG): + logger.debug( + "Non-MANE pool sizes: initial=%s, after_extra_genes=%s (mane_only, no MANE filter applied).", + before_nomane, + after_extra, + ) if seq_df_nomane.empty: logger.debug("No non-MANE sequences after filtering; skipping Proteins/Backtranseq retrieval.") @@ -1359,6 +1370,7 @@ def get_seq_df(datasets_dir, output_seq_df, organism = "Homo sapiens", mane=False, + mane_only=False, custom_mane_metadata_path=None, num_cores=1, mane_version=1.4): @@ -1417,6 +1429,7 @@ def get_seq_df(datasets_dir, uniprot_to_gene_dict, mane_mapping, mane_mapping_not_af, + mane_only, num_cores) else: seq_df = process_seq_df(seq_df, @@ -1443,6 +1456,7 @@ def get_seq_df(datasets_dir, 
output_seq_df=os.path.join(output_datasets, "seq_for_mut_prob.tsv"), organism='Homo sapiens', mane=True, + mane_only=False, num_cores=8, mane_version=1.4, custom_mane_metadata_path="/data/bbg/nobackup/scratch/oncodrive3d/mane_missing/data/250724-no_fragments/af_predictions/previously_pred/samplesheet.csv" diff --git a/scripts/main.py b/scripts/main.py index 2e167b0..42ed143 100755 --- a/scripts/main.py +++ b/scripts/main.py @@ -43,7 +43,7 @@ def oncodrive3D(): @click.option("-M", "--mane_only", help="Use only structures predicted from MANE Select transcripts", is_flag=True) @click.option("-C", "--custom_mane_pdb_dir", - help="Directory where to load custom MANE PDB structures (overwriting existing ones)") + help="Directory where to load custom MANE PDB structures (overwriting existing ones; requires --mane_only)") @click.option("--custom_pae_dir", help="Directory containing pre-downloaded PAE JSON files to copy into the build (renamed to 'pae')") @click.option("-f", "--custom_mane_metadata_path", From 0d9412e1042673149eb9ec604bd67ad6b836a599 Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 11:56:43 +0100 Subject: [PATCH 36/60] lint: simplify debug logging --- scripts/datasets/seq_for_mut_prob.py | 79 +++++++++++++--------------- 1 file changed, 36 insertions(+), 43 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 4fe62d7..7183e55 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -726,7 +726,7 @@ def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): result.returncode, wait_seconds, ) - if logger.isEnabledFor(logging.DEBUG) and result.stdout: + if result.stdout: logger.debug("BioMart wget stdout (attempt %s/%s): %s", attempt, max_attempts, result.stdout.strip()) time.sleep(wait_seconds) @@ -763,7 +763,7 @@ def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): result.returncode, wait_seconds, ) - if logger.isEnabledFor(logging.DEBUG) and result.stdout: + if result.stdout: logger.debug( "Fallback BioMart wget stdout (attempt %s/%s): %s", attempt, @@ -1208,13 +1208,12 @@ def process_seq_df(seq_df, seq_df = add_extra_genes_to_seq_df(seq_df, uniprot_to_gene_dict) pre_drop = len(seq_df) seq_df = drop_gene_duplicates(seq_df) - if logger.isEnabledFor(logging.DEBUG): - logger.debug( - "Duplicate gene removal: %s removed (from %s to %s).", - pre_drop - len(seq_df), - pre_drop, - len(seq_df), - ) + logger.debug( + "Duplicate gene removal: %s removed (from %s to %s).", + pre_drop - len(seq_df), + pre_drop, + len(seq_df), + ) return seq_df @@ -1260,13 +1259,12 @@ def process_seq_df_mane(seq_df, failed_nan = seq_df_mane["Seq_dna"].isna() failed_mismatch = (~failed_nan) & (dna_len / 3 != seq_len) failed_ix = failed_nan | failed_mismatch - if logger.isEnabledFor(logging.DEBUG): - logger.debug( - "Ensembl CDS failures: total=%s (missing=%s, length_mismatch=%s).", - int(failed_ix.sum()), - int(failed_nan.sum()), - int(failed_mismatch.sum()), - ) + logger.debug( + "Ensembl CDS failures: total=%s (missing=%s, length_mismatch=%s).", + int(failed_ix.sum()), + int(failed_nan.sum()), + int(failed_mismatch.sum()), + ) if sum(failed_ix) > 0: seq_df_mane_failed = seq_df_mane[failed_ix] seq_df_mane = seq_df_mane[~failed_ix] @@ -1280,11 +1278,10 @@ def process_seq_df_mane(seq_df, "Seq_dna" ]) seq_df_nomane = pd.concat((seq_df_nomane, seq_df_mane_failed)) - if logger.isEnabledFor(logging.DEBUG): - logger.debug( - "Moved %s failed MANE entries to non-MANE 
pool.", - len(seq_df_mane_failed), - ) + logger.debug( + "Moved %s failed MANE entries to non-MANE pool.", + len(seq_df_mane_failed), + ) # Seq df not MANE # --------------- @@ -1297,23 +1294,20 @@ def process_seq_df_mane(seq_df, seq_df_nomane = add_extra_genes_to_seq_df(seq_df_nomane, uniprot_to_gene_dict) # Filter out genes with NA after_extra = len(seq_df_nomane) if not mane_only: - if logger.isEnabledFor(logging.DEBUG): - logger.debug("Filtering non-MANE entries using mane_mapping_not_af (gene whitelist).") + logger.debug("Filtering non-MANE entries using mane_mapping_not_af (gene whitelist).") seq_df_nomane = seq_df_nomane[seq_df_nomane.Gene.isin(mane_mapping_not_af.Gene)] # Filter out genes that are not in MANE list - if logger.isEnabledFor(logging.DEBUG): - logger.debug( - "Non-MANE pool sizes: initial=%s, after_extra_genes=%s, after_mane_filter=%s.", - before_nomane, - after_extra, - len(seq_df_nomane), - ) + logger.debug( + "Non-MANE pool sizes: initial=%s, after_extra_genes=%s, after_mane_filter=%s.", + before_nomane, + after_extra, + len(seq_df_nomane), + ) else: - if logger.isEnabledFor(logging.DEBUG): - logger.debug( - "Non-MANE pool sizes: initial=%s, after_extra_genes=%s (mane_only, no MANE filter applied).", - before_nomane, - after_extra, - ) + logger.debug( + "Non-MANE pool sizes: initial=%s, after_extra_genes=%s (mane_only, no MANE filter applied).", + before_nomane, + after_extra, + ) if seq_df_nomane.empty: logger.debug("No non-MANE sequences after filtering; skipping Proteins/Backtranseq retrieval.") @@ -1347,13 +1341,12 @@ def process_seq_df_mane(seq_df, seq_df = pd.concat((seq_df_not_uniprot, seq_df_nomane_tr)).reset_index(drop=True) pre_drop = len(seq_df) seq_df = drop_gene_duplicates(seq_df) - if logger.isEnabledFor(logging.DEBUG): - logger.debug( - "Duplicate gene removal: %s removed (from %s to %s).", - pre_drop - len(seq_df), - pre_drop, - len(seq_df), - ) + logger.debug( + "Duplicate gene removal: %s removed (from %s to %s).", + pre_drop - len(seq_df), + pre_drop, + len(seq_df), + ) report_df = seq_df.Reference_info.value_counts().reset_index() report_df = report_df.rename(columns={"index" : "Source"}) report_df.Source = report_df.Source.map({1 : "Proteins API", From 44a86442e3106f3aded75a46f472e709354ee26b Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 11:59:32 +0100 Subject: [PATCH 37/60] refactor: enforce CLI usage for seq_for_mut_prob module execution --- scripts/datasets/seq_for_mut_prob.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 7183e55..1694b35 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -1443,14 +1443,6 @@ def get_seq_df(datasets_dir, if __name__ == "__main__": - output_datasets = '/data/bbg/nobackup/scratch/oncodrive3d/tests/datasets_mane_240725_mane_missing_dev' - get_seq_df( - datasets_dir=output_datasets, - output_seq_df=os.path.join(output_datasets, "seq_for_mut_prob.tsv"), - organism='Homo sapiens', - mane=True, - mane_only=False, - num_cores=8, - mane_version=1.4, - custom_mane_metadata_path="/data/bbg/nobackup/scratch/oncodrive3d/mane_missing/data/250724-no_fragments/af_predictions/previously_pred/samplesheet.csv" + raise SystemExit( + "This module is intended to be used via the CLI: `oncodrive3d build-datasets`." 
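        # The previous __main__ block ran ad-hoc builds with hard-coded local paths;
        # it was removed in favour of the CLI, which wires all options explicitly.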
) From 19f3c0c1f6e4f015fe1f24b2e47eae90ab5835bc Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 12:00:27 +0100 Subject: [PATCH 38/60] refactor: increase probe size and consecutive missing threshold for PAE downloads --- scripts/datasets/get_pae.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/datasets/get_pae.py b/scripts/datasets/get_pae.py index e1c1030..a66cea6 100644 --- a/scripts/datasets/get_pae.py +++ b/scripts/datasets/get_pae.py @@ -105,8 +105,8 @@ def get_pae( consecutive_missing = 0 if uniprot_ids: - probe_ids = uniprot_ids[:5] - remaining_ids = uniprot_ids[5:] + probe_ids = uniprot_ids[:10] + remaining_ids = uniprot_ids[10:] for uniprot_id in tqdm(probe_ids, desc="Downloading PAE"): result = download_pae(uniprot_id, af_version, output_dir) if result == "missing": @@ -114,7 +114,7 @@ def get_pae( else: consecutive_missing = 0 - if consecutive_missing >= 5: + if consecutive_missing >= 10: logger.warning( "Detected %s consecutive missing PAE downloads (HTTP 404/410). " "PAE files for AlphaFold DB v%s are likely unavailable. " From 94f8f2e5442ea644a0ae6f3cdc6202c0a84c9255 Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 12:01:35 +0100 Subject: [PATCH 39/60] fix: remove existing PAE output directory before copying custom PAE directory --- scripts/datasets/build_datasets.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/datasets/build_datasets.py b/scripts/datasets/build_datasets.py index a65ec38..9c57141 100644 --- a/scripts/datasets/build_datasets.py +++ b/scripts/datasets/build_datasets.py @@ -153,6 +153,8 @@ def build(output_datasets, if custom_pae_dir is not None: logger.info("Copying precomputed PAE directory...") if os.path.exists(custom_pae_dir): + if os.path.exists(pae_output_dir): + shutil.rmtree(pae_output_dir) shutil.copytree(custom_pae_dir, pae_output_dir) else: logger.warning( From 91e22f47fddcef3b0bc20f09d6e3615fa5ab4f5d Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 12:14:04 +0100 Subject: [PATCH 40/60] docs: update process_seq_df docstring to include canonical transcript metadata retrieval --- scripts/datasets/seq_for_mut_prob.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 1694b35..d5e89e9 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -1149,6 +1149,9 @@ def process_seq_df(seq_df, prioritizing structures obtained from transcripts whose exon coordinates are available in the Proteins API. + Canonical transcript metadata is retrieved internally from BioMart + for the provided dataset directory and Uniprot IDs. 
+ Reference_info labels: 1 : Transcript ID, exons coord, seq DNA obtained from Proteins API -1 : Not available transcripts, seq DNA retrieved from Backtranseq API From eaa0b9e01ccac480b78f8950dcd91a9cd3e1f758 Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 12:27:16 +0100 Subject: [PATCH 41/60] feat: add function to load custom gene symbol mappings from samplesheet --- scripts/datasets/seq_for_mut_prob.py | 48 ++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index d5e89e9..bc2e3b0 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -528,6 +528,41 @@ def load_custom_ens_prot_ids(path): return ids +def load_custom_symbol_map(path): + """ + Read a samplesheet and return a mapping from ENSP (sequence) to gene symbol. + Falls back to 'gene' column if 'symbol' is not present. + """ + if not os.path.isfile(path): + logger.error(f"Custom MANE metadata path does not exist: {path!r}") + raise FileNotFoundError(f"Custom MANE metadata not found: {path!r}") + df = pd.read_csv(path) + if "sequence" not in df.columns: + logger.debug("Custom MANE metadata missing 'sequence' column; skipping symbol mapping.") + return {} + + symbol_col = None + if "symbol" in df.columns: + symbol_col = "symbol" + elif "gene" in df.columns: + symbol_col = "gene" + else: + logger.debug("Custom MANE metadata missing 'symbol'/'gene' column; skipping symbol mapping.") + return {} + + seq = df["sequence"].astype(str).str.split(".", n=1).str[0] + sym = df[symbol_col] + mapping = {} + for k, v in zip(seq, sym): + if pd.isna(v): + continue + v = str(v).strip() + if not v: + continue + mapping[k] = v + return mapping + + def get_mane_to_af_mapping( datasets_dir, uniprot_ids, @@ -1402,6 +1437,19 @@ def get_seq_df(datasets_dir, ) uniprot_to_gene_dict = dict(zip(mane_mapping["Uniprot_ID"], mane_mapping["Gene"])) + if custom_mane_metadata_path is not None: + custom_symbol_map = load_custom_symbol_map(custom_mane_metadata_path) + if custom_symbol_map: + filled = 0 + for ens_id, symbol in custom_symbol_map.items(): + if ens_id in uniprot_ids and (ens_id not in uniprot_to_gene_dict or pd.isna(uniprot_to_gene_dict[ens_id])): + uniprot_to_gene_dict[ens_id] = symbol + filled += 1 + if filled > 0: + logger.debug( + "Filled %s gene symbols from custom MANE samplesheet for ENSP-only entries.", + filled, + ) missing_uni_ids = list(set(uniprot_ids) - set(mane_mapping.Uniprot_ID)) uniprot_to_gene_dict = uniprot_to_gene_dict | uniprot_to_hugo(missing_uni_ids) else: From f7853afa9a25e8a50e57d7c555393991ddefbdfb Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 13:01:55 +0100 Subject: [PATCH 42/60] frefactor: update samplesheet tool build_metadata_map() to accept a precomputed symbol map and only add CGC + length on top --- tools/preprocessing/README.md | 2 +- .../update_samplesheet_and_structures.py | 139 ++++++++++++++---- 2 files changed, 110 insertions(+), 31 deletions(-) diff --git a/tools/preprocessing/README.md b/tools/preprocessing/README.md index 6e35e46..79706aa 100644 --- a/tools/preprocessing/README.md +++ b/tools/preprocessing/README.md @@ -99,7 +99,7 @@ Arguments: - `--max-workers`: Parallel workers for canonical indexing (default = all cores). - `--filter-long-sequences/--no-filter-long-sequences`: Whether to drop long proteins from the nf-core input (default enabled). - `--max-sequence-length`: Length cutoff applied when filtering (default `2700` residues). 
-- `--include-metadata/--no-include-metadata`: Add `symbol`, `CGC`, and `length` columns to every emitted `samplesheet.csv` (default disabled). +- `--include-metadata/--no-include-metadata`: Add `CGC` and `length` columns to every emitted `samplesheet.csv`. - `--config-path`: YAML with path templates describing where to place predicted/missing/retrieved bundles relative to `--samplesheet-folder` (default `config.yaml`). > [!NOTE] diff --git a/tools/preprocessing/update_samplesheet_and_structures.py b/tools/preprocessing/update_samplesheet_and_structures.py index 06b6b3f..dee13fd 100644 --- a/tools/preprocessing/update_samplesheet_and_structures.py +++ b/tools/preprocessing/update_samplesheet_and_structures.py @@ -381,6 +381,38 @@ def prepare_annotation_maps( return seq_map[["symbol", "CGC"]], refseq_map[["symbol", "CGC"]], cgc_symbols +def prepare_symbol_maps(mane_summary_path: Path) -> tuple[pd.DataFrame, pd.DataFrame]: + """Build lookup tables that map ENSP/RefSeq identifiers to symbols.""" + mane_summary = pd.read_table(mane_summary_path) + + column_aliases = { + "ensembl_prot": {"Ensembl_prot", "ensembl_prot", "Ens_Prot_ID"}, + "refseq_prot": {"RefSeq_prot", "refseq_prot"}, + "symbol": {"symbol", "Gene Symbol", "gene_symbol"}, + } + + rename_map = {} + for target, candidates in column_aliases.items(): + for candidate in candidates: + if candidate in mane_summary.columns: + rename_map[candidate] = target + break + required = {"ensembl_prot", "refseq_prot", "symbol"} + if not required.issubset(rename_map.values()): + missing = required - set(rename_map.values()) + raise KeyError(f"Missing columns in MANE summary: {missing}") + + mane_summary = mane_summary.rename(columns=rename_map) + annotations = mane_summary[["ensembl_prot", "refseq_prot", "symbol"]].copy() + annotations["ensembl_prot"] = strip_version_suffix(annotations["ensembl_prot"]) + annotations["refseq_prot"] = strip_version_suffix(annotations["refseq_prot"]) + annotations = annotations.drop_duplicates() + + seq_map = annotations.dropna(subset=["ensembl_prot"]).set_index("ensembl_prot") + refseq_map = annotations.dropna(subset=["refseq_prot"]).set_index("refseq_prot") + return seq_map[["symbol"]], refseq_map[["symbol"]] + + def attach_symbol_and_cgc( df: pd.DataFrame, seq_map: pd.DataFrame, @@ -389,12 +421,13 @@ def attach_symbol_and_cgc( """Annotate `df` with symbol/CGC columns using the provided lookup tables.""" annotated = df.copy() seq_keys = strip_version_suffix(annotated["sequence"]) + if "symbol" not in annotated.columns: + annotated["symbol"] = pd.NA + if "CGC" not in annotated.columns: + annotated["CGC"] = pd.NA if not seq_map.empty: - annotated["symbol"] = seq_keys.map(seq_map["symbol"]) - annotated["CGC"] = seq_keys.map(seq_map["CGC"]) - else: - annotated["symbol"] = pd.Series("", index=annotated.index) - annotated["CGC"] = pd.Series(0, index=annotated.index, dtype="Int64") + annotated["symbol"] = annotated["symbol"].fillna(seq_keys.map(seq_map["symbol"])) + annotated["CGC"] = annotated["CGC"].fillna(seq_keys.map(seq_map["CGC"])) if "refseq_prot" in annotated.columns and not refseq_map.empty: refseq_keys = strip_version_suffix(annotated["refseq_prot"]) @@ -406,17 +439,12 @@ def attach_symbol_and_cgc( return annotated -def build_metadata_map( - samplesheet: pd.DataFrame, - fasta_dir: Path, - mane_summary_path: Path, - cgc_path: Optional[Path], -) -> pd.DataFrame: - """Return a dataframe with one row per sequence containing symbol/CGC/length metadata.""" +def build_symbol_map(samplesheet: pd.DataFrame, 
mane_summary_path: Path) -> pd.DataFrame: + """Return a dataframe with one row per sequence containing symbol metadata.""" if samplesheet.empty: - return pd.DataFrame(columns=["sequence", "symbol", "CGC", "length"]) + return pd.DataFrame(columns=["sequence", "symbol"]) - seq_map, refseq_map, _ = prepare_annotation_maps(mane_summary_path, cgc_path) + seq_map, refseq_map = prepare_symbol_maps(mane_summary_path) metadata = samplesheet[["sequence"]].drop_duplicates().copy() if "refseq_prot" in samplesheet.columns: @@ -427,11 +455,44 @@ def build_metadata_map( ) metadata["refseq_prot"] = metadata["sequence"].map(refseq_lookup) - metadata = attach_symbol_and_cgc(metadata, seq_map, refseq_map) + seq_keys = strip_version_suffix(metadata["sequence"]) + if not seq_map.empty: + metadata["symbol"] = seq_keys.map(seq_map["symbol"]) + else: + metadata["symbol"] = pd.Series("", index=metadata.index) + + if "refseq_prot" in metadata.columns and not refseq_map.empty: + refseq_keys = strip_version_suffix(metadata["refseq_prot"]) + metadata["symbol"] = metadata["symbol"].fillna(refseq_keys.map(refseq_map["symbol"])) + + metadata["symbol"] = metadata["symbol"].fillna("") + return metadata.drop(columns=["refseq_prot"], errors="ignore") + + +def build_metadata_map( + samplesheet: pd.DataFrame, + fasta_dir: Path, + mane_summary_path: Path, + cgc_path: Optional[Path], + symbol_map: Optional[pd.DataFrame] = None, +) -> pd.DataFrame: + """Return a dataframe with one row per sequence containing symbol/CGC/length metadata.""" + if samplesheet.empty: + return pd.DataFrame(columns=["sequence", "symbol", "CGC", "length"]) + + metadata = samplesheet[["sequence"]].drop_duplicates().copy() + if symbol_map is None: + symbol_map = build_symbol_map(samplesheet, mane_summary_path) + metadata = attach_metadata(metadata, symbol_map) + metadata["symbol"] = metadata.get("symbol", pd.Series("", index=metadata.index)).fillna("") + cgc_symbols = load_cgc_symbols(cgc_path) + if cgc_symbols: + metadata["CGC"] = metadata["symbol"].isin(cgc_symbols).astype(int) + else: + metadata["CGC"] = 0 fasta_paths = metadata["sequence"].map(lambda seq: (fasta_dir / f"{seq}.fasta").as_posix()) metadata["length"] = compute_fasta_lengths(pd.Series(fasta_paths.values, index=metadata.index)) - metadata = metadata.drop(columns=["refseq_prot"], errors="ignore") return metadata @@ -439,12 +500,23 @@ def attach_metadata(df: pd.DataFrame, metadata_map: Optional[pd.DataFrame]) -> p """Merge symbol/CGC/length metadata into df when available.""" if metadata_map is None or df.empty: return df - columns = ["sequence", "symbol", "CGC", "length"] - metadata = metadata_map[columns].drop_duplicates(subset=["sequence"]) - return ( - df.drop(columns=["symbol", "CGC", "length"], errors="ignore") - .merge(metadata, on="sequence", how="left") - ) + available = [col for col in ["symbol", "CGC", "length"] if col in metadata_map.columns] + if not available: + return df + metadata = metadata_map[["sequence"] + available].drop_duplicates(subset=["sequence"]) + merged = df.merge(metadata, on="sequence", how="left", suffixes=("", "_meta")) + for col in available: + meta_col = f"{col}_meta" + if meta_col in merged.columns: + if col in merged.columns: + merged[col] = merged[meta_col].combine_first(merged[col]) + else: + merged[col] = merged[meta_col] + merged = merged.drop(columns=[meta_col]) + for col in ("CGC", "length"): + if col not in available and col in merged.columns: + merged = merged.drop(columns=[col]) + return merged def attach_refseq(df: pd.DataFrame, 
master_samplesheet: Optional[pd.DataFrame]) -> pd.DataFrame: @@ -555,7 +627,7 @@ def filter_long_sequences( if include_metadata: removed_clean = removed.copy() else: - removed_clean = removed.drop(columns=["symbol", "CGC", "length"], errors="ignore") + removed_clean = removed.drop(columns=["CGC", "length"], errors="ignore") removed_path = missing_dir / "samplesheet_removed_long.csv" removed_clean.to_csv(removed_path, index=False) @@ -690,6 +762,7 @@ def run_pipeline( ) -> None: """Execute the MANE maintenance workflow end-to-end.""" samplesheet = pd.read_csv(paths.samplesheet_path) + symbol_map = build_symbol_map(samplesheet, paths.mane_summary_path) metadata_map = None if settings.include_metadata: metadata_map = build_metadata_map( @@ -697,16 +770,22 @@ def run_pipeline( paths.fasta_dir, paths.mane_summary_path, paths.cgc_list_path, + symbol_map=symbol_map, ) - samplesheet = attach_metadata(samplesheet, metadata_map) - samplesheet.to_csv(paths.samplesheet_path, index=False) + + metadata_for_outputs = metadata_map or symbol_map + samplesheet = attach_metadata(samplesheet, metadata_for_outputs) + samplesheet.to_csv(paths.samplesheet_path, index=False) + if settings.include_metadata: print(f"Annotated master samplesheet with metadata at {paths.samplesheet_path}") + else: + print(f"Annotated master samplesheet with symbols at {paths.samplesheet_path}") print(f"Loaded {len(samplesheet):,} samples from {paths.samplesheet_path}") if predicted_raw_dir: if not predicted_raw_dir.exists(): raise FileNotFoundError(f"--predicted-dir not found: {predicted_raw_dir}") - sync_predicted_bundle(predicted_raw_dir, paths.predicted_bundle_dir, metadata_map) + sync_predicted_bundle(predicted_raw_dir, paths.predicted_bundle_dir, metadata_for_outputs) else: print("No --predicted-dir supplied; skipping nf-core sync and using existing predicted bundle.") @@ -727,7 +806,7 @@ def run_pipeline( samplesheet_missing = samplesheet_missing.iloc[0:0].copy() if settings.enable_canonical_reuse: - retrieved_df = reuse_canonical_structures(samplesheet_missing, paths, settings, metadata_map) + retrieved_df = reuse_canonical_structures(samplesheet_missing, paths, settings, metadata_for_outputs) if retrieved_df.empty: print("Skipping canonical reuse because no PDBs were harvested.") else: @@ -752,7 +831,7 @@ def run_pipeline( if settings.include_metadata: annotated_missing_df.to_csv(paths.missing_samplesheet_path, index=False) else: - clean_missing_df = annotated_missing_df.drop(columns=["symbol", "CGC", "length"], errors="ignore") + clean_missing_df = annotated_missing_df.drop(columns=["CGC", "length"], errors="ignore") clean_missing_df.to_csv(paths.missing_samplesheet_path, index=False) print(f"Updated missing samplesheet saved to {paths.missing_samplesheet_path}") @@ -772,7 +851,7 @@ def run_pipeline( merge_structure_bundles( bundles_to_merge, paths.final_bundle_dir, - metadata_map, + metadata_for_outputs, master_samplesheet=samplesheet, ) print(f"Final bundle written to {paths.final_bundle_dir}") @@ -834,7 +913,7 @@ def run_pipeline( "--include-metadata/--no-include-metadata", default=False, show_default=True, - help="Attach symbol/CGC/length columns to every emitted samplesheet.", + help="Attach CGC/length columns (symbol is always included) to every emitted samplesheet.", ) def cli( samplesheet_folder: str, From d8ea6553ad9c2e497e992af113e0337522ce2d8d Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 13:19:56 +0100 Subject: [PATCH 43/60] logs: enhance PDB copying process with detailed logging and summary of 
operations --- scripts/datasets/custom_pdb.py | 19 ++++++++++++++++++- scripts/datasets/seq_for_mut_prob.py | 12 ++++++++++++ .../update_samplesheet_and_structures.py | 7 ++----- 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/scripts/datasets/custom_pdb.py b/scripts/datasets/custom_pdb.py index 63f5a46..c8c0672 100644 --- a/scripts/datasets/custom_pdb.py +++ b/scripts/datasets/custom_pdb.py @@ -99,13 +99,19 @@ def copy_and_parse_custom_pdbs( samplesheet_df = None # Copy and gzip pdb and optionally add REFSEQ + total_pdb_files = 0 + copied = 0 + skipped_format = 0 + seqres_inserted = 0 for fname in os.listdir(src_dir): if not fname.endswith('.pdb'): continue + total_pdb_files += 1 parts = fname.split('.') # e.g. [ACCESSION, fragment_code, 'alphafold', 'pdb'] if len(parts) < 4: logger.warning(f"Skipping unexpected filename format: {fname}") + skipped_format += 1 continue accession = parts[0] @@ -119,6 +125,7 @@ def copy_and_parse_custom_pdbs( with open(src_path, 'rb') as fin, gzip.open(dst_path, 'wb') as fout: shutil.copyfileobj(fin, fout) + copied += 1 logger.debug(f'Copied and gzipped: {fname} -> {new_name}') # Optionally add SEQRES records @@ -132,6 +139,7 @@ def copy_and_parse_custom_pdbs( seq = [one_to_three_res_map[aa] for aa in seq] add_seqres_to_pdb(path_pdb=dst_path, residues=seq) logger.debug(f"Inserted SEQRES records into: {new_name}") + seqres_inserted += 1 else: try: seq = "".join(list(get_seq_from_pdb(dst_path))) @@ -141,4 +149,13 @@ def copy_and_parse_custom_pdbs( logger.warning(f"SEQRES not found in samplesheet and its extraction from structure failed: {new_name}") except Exception as e: logger.warning(f"SEQRES not found in samplesheet and its extraction from structure failed: {new_name}") - logger.warning(f"Exception captured: {e}") \ No newline at end of file + logger.warning(f"Exception captured: {e}") + + logger.info( + "Custom PDB copy summary: %s/%s structures copied (skipped %s invalid filenames).", + copied, + total_pdb_files, + skipped_format, + ) + if seqres_inserted: + logger.debug("Inserted SEQRES records into %s custom structures.", seqres_inserted) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index bc2e3b0..9caf08e 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -636,6 +636,18 @@ def get_mane_to_af_mapping( base_ens = mane_mapping["Ens_Prot_ID"].str.split(".", n=1).str[0] mask = base_ens.isin(custom_ids) mane_mapping.loc[mask, "Uniprot_ID"] = base_ens[mask] + if logger.isEnabledFor(logging.DEBUG): + summary_ens = set(mane_summary["Ens_Prot_ID"].astype(str).str.split(".", n=1).str[0]) + missing_custom = sorted(set(custom_ids) - summary_ens) + if missing_custom: + preview = ", ".join(missing_custom[:10]) + suffix = "..." 
if len(missing_custom) > 10 else "" + logger.debug( + "Custom MANE ENSP IDs not found in MANE summary (%s): %s%s", + len(missing_custom), + preview, + suffix, + ) # Select available Uniprot ID, fist one if multiple are present mane_mapping = mane_mapping.dropna(subset=["Uniprot_ID"]).reset_index(drop=True) diff --git a/tools/preprocessing/update_samplesheet_and_structures.py b/tools/preprocessing/update_samplesheet_and_structures.py index dee13fd..ca83800 100644 --- a/tools/preprocessing/update_samplesheet_and_structures.py +++ b/tools/preprocessing/update_samplesheet_and_structures.py @@ -828,11 +828,8 @@ def run_pipeline( include_metadata=settings.include_metadata, ) - if settings.include_metadata: - annotated_missing_df.to_csv(paths.missing_samplesheet_path, index=False) - else: - clean_missing_df = annotated_missing_df.drop(columns=["CGC", "length"], errors="ignore") - clean_missing_df.to_csv(paths.missing_samplesheet_path, index=False) + clean_missing_df = annotated_missing_df.drop(columns=["symbol", "CGC", "length"], errors="ignore") + clean_missing_df.to_csv(paths.missing_samplesheet_path, index=False) print(f"Updated missing samplesheet saved to {paths.missing_samplesheet_path}") if settings.filter_long_sequences and not removed_long_df.empty: From 3c3f05592f38a0b1d59e76201a58a73a65a9d2c5 Mon Sep 17 00:00:00 2001 From: St3451 Date: Wed, 18 Feb 2026 13:43:49 +0100 Subject: [PATCH 44/60] fix: ensure REPO_ROOT is added to sys.path for module imports in prepare_samplesheet.py --- tools/preprocessing/prepare_samplesheet.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/preprocessing/prepare_samplesheet.py b/tools/preprocessing/prepare_samplesheet.py index cb3401e..a05db2e 100644 --- a/tools/preprocessing/prepare_samplesheet.py +++ b/tools/preprocessing/prepare_samplesheet.py @@ -13,11 +13,16 @@ import os +import sys +from pathlib import Path import click import gzip import time import pandas as pd -from pathlib import Path +REPO_ROOT = Path(__file__).resolve().parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + from scripts.datasets.utils import download_single_file # import logging From 50498ac5d057328e1b8cde436d9b0d8e0cc86820 Mon Sep 17 00:00:00 2001 From: St3451 Date: Thu, 19 Feb 2026 08:44:17 +0100 Subject: [PATCH 45/60] limit number of connections in download_single_file to a maximum of 10 --- scripts/datasets/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/datasets/utils.py b/scripts/datasets/utils.py index 91447f2..4a34077 100644 --- a/scripts/datasets/utils.py +++ b/scripts/datasets/utils.py @@ -124,7 +124,7 @@ def download_single_file(url: str, destination: str, threads: int, proteome=None destination (str): The local path where the file will be saved. 
""" - num_connections = 15 if threads > 40 else threads + num_connections = min(threads, 10) if os.path.exists(destination): logger.debug(f"File {destination} already exists..") From 0ed4e61b9aaed91d5c3ae459d6e9f5f6ee7a2e12 Mon Sep 17 00:00:00 2001 From: St3451 Date: Thu, 19 Feb 2026 10:09:45 +0100 Subject: [PATCH 46/60] reduce maximum number of connections in download_single_file from 10 to 8 --- scripts/datasets/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/datasets/utils.py b/scripts/datasets/utils.py index 4a34077..58abaef 100644 --- a/scripts/datasets/utils.py +++ b/scripts/datasets/utils.py @@ -124,7 +124,7 @@ def download_single_file(url: str, destination: str, threads: int, proteome=None destination (str): The local path where the file will be saved. """ - num_connections = min(threads, 10) + num_connections = min(threads, 8) if os.path.exists(destination): logger.debug(f"File {destination} already exists..") From 7967d1bc9e32736a8fe826fa6e43249456d9b5b2 Mon Sep 17 00:00:00 2001 From: St3451 Date: Thu, 19 Feb 2026 10:30:49 +0100 Subject: [PATCH 47/60] feat: add retry logic for missing entries; increase max attempts and wait time in get_ref_dna_from_ensembl_batch; --- scripts/datasets/seq_for_mut_prob.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 9caf08e..b0bfa37 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -867,7 +867,7 @@ def get_biomart_metadata(datasets_dir, uniprot_ids): return canonical_transcripts -def get_ref_dna_from_ensembl_batch(transcript_ids, max_attempts=3, wait_seconds=2): +def get_ref_dna_from_ensembl_batch(transcript_ids, max_attempts=8, wait_seconds=3): """ Retrieve Ensembl CDS DNA sequences for up to 50 stable IDs in a single request. 
@@ -1303,6 +1303,22 @@ def process_seq_df_mane(seq_df, logger.debug(f"Retrieving CDS DNA seq from transcript ID (Ensembl API): {len(seq_df_mane)} structures..") seq_df_mane = get_ref_dna_from_ensembl_mp(seq_df_mane, cores=num_cores) + # Retry missing entries one-by-one using single-request API + missing_mask = seq_df_mane["Seq_dna"].isna() + if missing_mask.any(): + logger.debug( + "Retrying %s missing Ensembl CDS entries one by one.", + int(missing_mask.sum()), + ) + recovered = 0 + for idx in seq_df_mane[missing_mask].index: + seq_dna = get_ref_dna_from_ensembl(seq_df_mane.at[idx, "Ens_Transcr_ID"]) + if pd.notna(seq_dna): + recovered += 1 + seq_df_mane.at[idx, "Seq_dna"] = seq_dna + if recovered > 0: + logger.debug("Recovered %s missing Ensembl CDS entries after single-request retry.", recovered) + # Set failed and len-mismatching entries as no-transcripts entries seq_len = seq_df_mane["Seq"].str.len() dna_len = seq_df_mane["Seq_dna"].str.len() From 3a348b5121fbf197e4d809b0b475d04c568fac44 Mon Sep 17 00:00:00 2001 From: St3451 Date: Thu, 19 Feb 2026 10:33:41 +0100 Subject: [PATCH 48/60] cap Ensembl CDS batch workers to a maximum number of cores --- scripts/datasets/seq_for_mut_prob.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index b0bfa37..0586a20 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -45,6 +45,7 @@ _ENSEMBL_REST_SERVER = "https://rest.ensembl.org" _ENSEMBL_REST_TIMEOUT = (10, 160) # (connect, read) seconds _ENSEMBL_REST_HEADERS = {"Accept": "text/plain"} +_ENSEMBL_CDS_MAX_CORES = 8 #=========== @@ -1124,6 +1125,14 @@ def get_ref_dna_from_ensembl_mp(seq_df, cores): seq_df["Seq_dna"] = [] return seq_df + if cores > _ENSEMBL_CDS_MAX_CORES: + logger.info( + "Capping Ensembl CDS batch workers from %s to %s.", + cores, + _ENSEMBL_CDS_MAX_CORES, + ) + cores = _ENSEMBL_CDS_MAX_CORES + logger.debug("Retrieving CDS DNA from Ensembl for %s transcripts with %s cores.", total, cores) if cores <= 1: From 2c5c109386c8dd118d23d6fc484017d0d9f884cc Mon Sep 17 00:00:00 2001 From: St3451 Date: Thu, 19 Feb 2026 10:34:27 +0100 Subject: [PATCH 49/60] feat: implement bounded parallelism for retrying missing Ensembl CDS entries --- scripts/datasets/seq_for_mut_prob.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 0586a20..cc377e8 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -1312,21 +1312,28 @@ def process_seq_df_mane(seq_df, logger.debug(f"Retrieving CDS DNA seq from transcript ID (Ensembl API): {len(seq_df_mane)} structures..") seq_df_mane = get_ref_dna_from_ensembl_mp(seq_df_mane, cores=num_cores) - # Retry missing entries one-by-one using single-request API + # Retry missing entries using single-request API (bounded parallelism) missing_mask = seq_df_mane["Seq_dna"].isna() if missing_mask.any(): + missing_ids = seq_df_mane.loc[missing_mask, "Ens_Transcr_ID"].tolist() logger.debug( - "Retrying %s missing Ensembl CDS entries one by one.", - int(missing_mask.sum()), + "Retrying %s missing Ensembl CDS entries with %s workers.", + len(missing_ids), + min(num_cores, _ENSEMBL_CDS_MAX_CORES), ) - recovered = 0 - for idx in seq_df_mane[missing_mask].index: - seq_dna = get_ref_dna_from_ensembl(seq_df_mane.at[idx, "Ens_Transcr_ID"]) - if pd.notna(seq_dna): - recovered += 1 - 
seq_df_mane.at[idx, "Seq_dna"] = seq_dna + retry_workers = min(num_cores, _ENSEMBL_CDS_MAX_CORES) + if retry_workers <= 1: + retry_results = [get_ref_dna_from_ensembl(tid) for tid in missing_ids] + else: + with multiprocessing.Pool(processes=retry_workers) as pool: + retry_results = pool.map(get_ref_dna_from_ensembl, missing_ids) + recovered = sum(pd.notna(val) for val in retry_results) + seq_df_mane.loc[missing_mask, "Seq_dna"] = retry_results if recovered > 0: - logger.debug("Recovered %s missing Ensembl CDS entries after single-request retry.", recovered) + logger.debug( + "Recovered %s missing Ensembl CDS entries after single-request retry.", + recovered, + ) # Set failed and len-mismatching entries as no-transcripts entries seq_len = seq_df_mane["Seq"].str.len() From ea631301d5cd03b4f762a22417fac5e79f5f7c47 Mon Sep 17 00:00:00 2001 From: St3451 Date: Thu, 19 Feb 2026 10:48:42 +0100 Subject: [PATCH 50/60] fix: handle consecutive missing PAE downloads correctly --- scripts/datasets/get_pae.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/datasets/get_pae.py b/scripts/datasets/get_pae.py index a66cea6..4af4a61 100644 --- a/scripts/datasets/get_pae.py +++ b/scripts/datasets/get_pae.py @@ -111,7 +111,7 @@ def get_pae( result = download_pae(uniprot_id, af_version, output_dir) if result == "missing": consecutive_missing += 1 - else: + elif result == "ok": consecutive_missing = 0 if consecutive_missing >= 10: From cdf9f6a45f56eae43a60115a9d430c264e4f1d08 Mon Sep 17 00:00:00 2001 From: St3451 Date: Thu, 19 Feb 2026 17:32:23 +0100 Subject: [PATCH 51/60] logs: enhance logging for Ensembl CDS retrieval with sequence count --- scripts/datasets/seq_for_mut_prob.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index cc377e8..c9304dd 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -1145,7 +1145,12 @@ def get_ref_dna_from_ensembl_mp(seq_df, cores): results.extend(batch_results) pbar.update(len(batch_ids)) seq_df["Seq_dna"] = results - logger.debug("Completed Ensembl CDS retrieval.") + retrieved = int(pd.Series(results).notna().sum()) + logger.debug( + "Completed Ensembl CDS retrieval: %s/%s sequences retrieved.", + retrieved, + total, + ) return seq_df batch_size = 50 @@ -1158,8 +1163,12 @@ def get_ref_dna_from_ensembl_mp(seq_df, cores): results.extend(batch_results) pbar.update(len(batch_ids)) seq_df["Seq_dna"] = results - - logger.debug("Completed Ensembl CDS retrieval.") + retrieved = int(pd.Series(results).notna().sum()) + logger.debug( + "Completed Ensembl CDS retrieval: %s/%s sequences retrieved.", + retrieved, + total, + ) return seq_df From 2209b0e1671b677be4ad11901db15ef4d45d23e5 Mon Sep 17 00:00:00 2001 From: St3451 Date: Thu, 19 Feb 2026 17:33:38 +0100 Subject: [PATCH 52/60] fix: prevent duplicate SEQRES records in PDB files and log skipped insertions --- scripts/datasets/af_merge.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/scripts/datasets/af_merge.py b/scripts/datasets/af_merge.py index 90169ff..a8a81ad 100755 --- a/scripts/datasets/af_merge.py +++ b/scripts/datasets/af_merge.py @@ -224,12 +224,17 @@ def get_pdb_seqres_records(lst_res): def add_refseq_record_to_pdb(path_structure): """ Add the SEQREF records to the pdb file. + Returns True if SEQRES was inserted, False if skipped because SEQRES already exists. 
""" # Open the PDB file and get SEQRES insert index with open(path_structure, 'r') as file: pdb_lines = file.readlines() - insert_index = next(i for i, line in enumerate(pdb_lines) if line.startswith('MODEL')) + + if any(line.startswith('SEQRES') for line in pdb_lines): + return False + + insert_index = next(i for i, line in enumerate(pdb_lines) if line.startswith('MODEL')) # Get seares records residues = get_res_from_chain(path_structure) @@ -243,6 +248,8 @@ def add_refseq_record_to_pdb(path_structure): output_file.truncate() output_file.writelines(pdb_lines) + return True + # Other functions @@ -306,6 +313,8 @@ def merge_af_fragments(input_dir, output_dir=None, af_version=4, gzip=False): else: # Get list of fragmented Uniprot ID and max AF-F not_processed = [] + refseq_added = 0 + refseq_skipped_existing = 0 for uni_id, max_f in tqdm(fragments, total=len(fragments), desc="Merging AF fragments"): processed = False @@ -329,7 +338,10 @@ def merge_af_fragments(input_dir, output_dir=None, af_version=4, gzip=False): tmp_name = os.path.join(output_dir, f"AF-{uni_id}-FM-model_v{af_version}.pdb") name = os.path.join(output_dir, f"AF-{uni_id}-F{max_f}M-model_v{af_version}.pdb") os.rename(tmp_name, name) - add_refseq_record_to_pdb(name) + if add_refseq_record_to_pdb(name): + refseq_added += 1 + else: + refseq_skipped_existing += 1 if len(not_processed) > 0: logger.warning(f"Not processed: {not_processed}") @@ -338,6 +350,11 @@ def merge_af_fragments(input_dir, output_dir=None, af_version=4, gzip=False): save_unprocessed_ids(not_processed, os.path.join(output_dir, "fragmented_pdbs", "ids_not_merged.txt")) + if refseq_skipped_existing: + logger.info( + "Skipped SEQRES insertion for %s merged structures (SEQRES already present).", + refseq_skipped_existing, + ) logger.info("Merge of structures completed!") else: From d7bf40301e07c10ca3221c33c202345a7fc6f991 Mon Sep 17 00:00:00 2001 From: St3451 Date: Thu, 19 Feb 2026 17:42:46 +0100 Subject: [PATCH 53/60] fix: handle ENSP IDs in get_exons_coord function and return NaN for missing data --- scripts/datasets/seq_for_mut_prob.py | 37 +++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index c9304dd..b8c6b52 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -314,8 +314,30 @@ def get_exons_coord(ids, ens_canonical_transcripts_lst, batch_size=100): https://doi.org/10.1093/nar/gkx237 """ + ids = [str(i) for i in ids] + ens_prot_ids = [i for i in ids if i.upper().startswith("ENSP")] + proteins_api_ids = [i for i in ids if not i.upper().startswith("ENSP")] + + if ens_prot_ids: + logger.debug( + "Skipping %s ENSP IDs for Proteins API (will fall back to Backtranseq).", + len(ens_prot_ids), + ) + lst_df = [] - batches_ids = [ids[i:i+batch_size] for i in range(0, len(ids), batch_size)] + if not proteins_api_ids: + nan = np.repeat(np.nan, len(ids)) + return pd.DataFrame({ + "Uniprot_ID": ids, + "Ens_Gene_ID": nan, + "Ens_Transcr_ID": nan, + "Seq": nan, + "Chr": nan, + "Reverse_strand": nan, + "Exons_coord": nan, + }) + + batches_ids = [proteins_api_ids[i:i+batch_size] for i in range(0, len(proteins_api_ids), batch_size)] for batch_ids in tqdm(batches_ids, total=len(batches_ids), desc="Adding exons coordinate"): @@ -335,6 +357,19 @@ def get_exons_coord(ids, ens_canonical_transcripts_lst, batch_size=100): batch_df = pd.concat([batch_df, nan_rows], ignore_index=True) lst_df.append(batch_df) + if ens_prot_ids: + 
nan = np.repeat(np.nan, len(ens_prot_ids)) + nan_rows = pd.DataFrame({ + "Uniprot_ID": ens_prot_ids, + "Ens_Gene_ID": nan, + "Ens_Transcr_ID": nan, + "Seq": nan, + "Chr": nan, + "Reverse_strand": nan, + "Exons_coord": nan, + }) + lst_df.append(nan_rows) + return pd.concat(lst_df).reset_index(drop=True) From 6eeadf3965a7c0dcd14abacec16f5021d655b97b Mon Sep 17 00:00:00 2001 From: St3451 Date: Thu, 19 Feb 2026 18:27:05 +0100 Subject: [PATCH 54/60] update symbol assignment to use pd.NA for missing values in build_symbol_map function --- tools/preprocessing/update_samplesheet_and_structures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/preprocessing/update_samplesheet_and_structures.py b/tools/preprocessing/update_samplesheet_and_structures.py index ca83800..a7827a7 100644 --- a/tools/preprocessing/update_samplesheet_and_structures.py +++ b/tools/preprocessing/update_samplesheet_and_structures.py @@ -459,7 +459,7 @@ def build_symbol_map(samplesheet: pd.DataFrame, mane_summary_path: Path) -> pd.D if not seq_map.empty: metadata["symbol"] = seq_keys.map(seq_map["symbol"]) else: - metadata["symbol"] = pd.Series("", index=metadata.index) + metadata["symbol"] = pd.Series(pd.NA, index=metadata.index) if "refseq_prot" in metadata.columns and not refseq_map.empty: refseq_keys = strip_version_suffix(metadata["refseq_prot"]) From 9170592765a9bb1f35c545fc2bda4c482ae24c35 Mon Sep 17 00:00:00 2001 From: St3451 Date: Thu, 19 Feb 2026 23:01:52 +0100 Subject: [PATCH 55/60] fix: update add_seqres_to_pdb to return bool and skip insertion if SEQRES already exists --- scripts/datasets/custom_pdb.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/scripts/datasets/custom_pdb.py b/scripts/datasets/custom_pdb.py index c8c0672..212582f 100644 --- a/scripts/datasets/custom_pdb.py +++ b/scripts/datasets/custom_pdb.py @@ -39,7 +39,7 @@ def get_pdb_seqres_records(lst_res): return records -def add_seqres_to_pdb(path_pdb: str, residues: list) -> None: +def add_seqres_to_pdb(path_pdb: str, residues: list) -> bool: """ Insert SEQRES records at the very top of a PDB file (supports gzipped and plain). 
@@ -56,6 +56,9 @@ def add_seqres_to_pdb(path_pdb: str, residues: list) -> None: with open_in(path_pdb, mode_in) as fh: lines = fh.readlines() + if any(line.startswith("SEQRES") for line in lines): + return False + # Generate SEQRES lines seqres = get_pdb_seqres_records(residues) @@ -65,6 +68,8 @@ def add_seqres_to_pdb(path_pdb: str, residues: list) -> None: # Write back with open_out(path_pdb, mode_out) as fh: fh.writelines(new_lines) + + return True def copy_and_parse_custom_pdbs( @@ -103,6 +108,7 @@ def copy_and_parse_custom_pdbs( copied = 0 skipped_format = 0 seqres_inserted = 0 + seqres_skipped_existing = 0 for fname in os.listdir(src_dir): if not fname.endswith('.pdb'): continue @@ -137,9 +143,11 @@ def copy_and_parse_custom_pdbs( if not pd.isna(seq): seq = [one_to_three_res_map[aa] for aa in seq] - add_seqres_to_pdb(path_pdb=dst_path, residues=seq) - logger.debug(f"Inserted SEQRES records into: {new_name}") - seqres_inserted += 1 + if add_seqres_to_pdb(path_pdb=dst_path, residues=seq): + logger.debug(f"Inserted SEQRES records into: {new_name}") + seqres_inserted += 1 + else: + seqres_skipped_existing += 1 else: try: seq = "".join(list(get_seq_from_pdb(dst_path))) @@ -159,3 +167,8 @@ def copy_and_parse_custom_pdbs( ) if seqres_inserted: logger.debug("Inserted SEQRES records into %s custom structures.", seqres_inserted) + if seqres_skipped_existing: + logger.info( + "Skipped SEQRES insertion for %s custom structures (SEQRES already present).", + seqres_skipped_existing, + ) From 3d8d4e95a4dfabff158beed9718e2b5de161b125 Mon Sep 17 00:00:00 2001 From: St3451 Date: Thu, 19 Feb 2026 23:21:42 +0100 Subject: [PATCH 56/60] fix: reduce batch size for Backtranseq API calls to improve performance and reliability --- scripts/datasets/seq_for_mut_prob.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index b8c6b52..fd95723 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -1291,7 +1291,7 @@ def process_seq_df(seq_df, if len(seq_df_not_uniprot) > 0: # Add DNA seq from Backtranseq for any other entry logger.debug(f"Retrieving CDS DNA seq for entries without available transcript ID (Backtranseq API): {len(seq_df_not_uniprot)} structures..") - seq_df_not_uniprot = batch_backtranseq(seq_df_not_uniprot, 500, organism=organism) + seq_df_not_uniprot = batch_backtranseq(seq_df_not_uniprot, 100, organism=organism) # Get trinucleotide context seq_df_not_uniprot["Tri_context"] = np.nan @@ -1452,7 +1452,7 @@ def process_seq_df_mane(seq_df, # Add DNA seq from Backtranseq for any other entry if len(seq_df_nomane_notr) > 0: logger.debug(f"Retrieving CDS DNA seq for genes without available transcript ID (Backtranseq API): {len(seq_df_nomane_notr)} structures..") - seq_df_nomane_notr = batch_backtranseq(seq_df_nomane_notr, 500, organism="Homo sapiens") + seq_df_nomane_notr = batch_backtranseq(seq_df_nomane_notr, 100, organism="Homo sapiens") # Get trinucleotide context seq_df_not_uniprot = pd.concat((seq_df_mane, seq_df_nomane_notr)) From 44255a5f3f347a19569e4fc9165dc43e14222446 Mon Sep 17 00:00:00 2001 From: St3451 Date: Thu, 19 Feb 2026 23:25:38 +0100 Subject: [PATCH 57/60] logs: enhance error handling and logging in backtranseq function for API responses --- scripts/datasets/seq_for_mut_prob.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 
fd95723..0aef949 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -115,6 +115,12 @@ def backtranseq(protein_seqs, organism = "Homo sapiens"): time.sleep(10) try: response = requests.post(run_url, data=params, timeout=160) + if not response.ok: + logger.debug( + "Backtranseq submit returned HTTP %s: %s", + response.status_code, + response.text[:200], + ) except requests.exceptions.RequestException as e: response = "ERROR" logger.debug(f"Request failed: {e}") @@ -127,7 +133,15 @@ def backtranseq(protein_seqs, organism = "Homo sapiens"): time.sleep(20) try: result = requests.get(status_url + job_id, timeout=160) - status = result.text.strip() + if result.ok: + status = result.text.strip() + else: + status = "ERROR" + logger.debug( + "Backtranseq status returned HTTP %s: %s", + result.status_code, + result.text[:200], + ) except requests.exceptions.RequestException as e: status = "ERROR" logger.debug(f"Request failed {e}: Retrying..") @@ -137,7 +151,15 @@ def backtranseq(protein_seqs, organism = "Homo sapiens"): while status != "FINISHED": try: result = requests.get(result_url + job_id + "/out", timeout=160) - status = "FINISHED" + if result.ok: + status = "FINISHED" + else: + status = "ERROR" + logger.debug( + "Backtranseq result returned HTTP %s: %s", + result.status_code, + result.text[:200], + ) except requests.exceptions.RequestException as e: status = "ERROR" logger.debug(f"Request failed {e}: Retrying..") From b836ccf8ddd4f9c286f2cd613df50179ca2d03c2 Mon Sep 17 00:00:00 2001 From: St3451 Date: Thu, 19 Feb 2026 23:30:21 +0100 Subject: [PATCH 58/60] feat: enhance backtranseq function with retry logic and timeout handling for improved reliability --- scripts/datasets/seq_for_mut_prob.py | 137 ++++++++++++++++++--------- 1 file changed, 91 insertions(+), 46 deletions(-) diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index 0aef949..f414ef6 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -91,7 +91,7 @@ def initialize_seq_df(input_path, uniprot_to_gene_dict): # (not 100% reliable but available fo most seq) #============================================== -def backtranseq(protein_seqs, organism = "Homo sapiens"): +def backtranseq(protein_seqs, organism = "Homo sapiens", max_attempts=5, total_timeout=45 * 60): """ Perform backtranslation from proteins to DNA sequences using EMBOS backtranseq. 
""" @@ -108,66 +108,101 @@ def backtranseq(protein_seqs, organism = "Homo sapiens"): "molecule": "dna", "organism": organism} - # Submit the job request and retrieve the job ID - response = "INIT" - while str(response) != "": - if response != "INIT": - time.sleep(10) - try: - response = requests.post(run_url, data=params, timeout=160) - if not response.ok: + for attempt in range(1, max_attempts + 1): + attempt_start = time.monotonic() + + # Submit the job request and retrieve the job ID + job_id = None + while job_id is None: + if time.monotonic() - attempt_start > total_timeout: + logger.warning( + "Backtranseq submit timed out after %ss (attempt %s/%s).", + total_timeout, + attempt, + max_attempts, + ) + break + try: + response = requests.post(run_url, data=params, timeout=160) + if response.ok: + job_id = response.text.strip() + break logger.debug( "Backtranseq submit returned HTTP %s: %s", response.status_code, response.text[:200], ) - except requests.exceptions.RequestException as e: - response = "ERROR" - logger.debug(f"Request failed: {e}") + except requests.exceptions.RequestException as e: + logger.debug(f"Request failed: {e}") + time.sleep(10) - job_id = response.text.strip() + if job_id is None: + continue - # Wait for the job to complete - status = "INIT" - while status != "FINISHED": - time.sleep(20) - try: - result = requests.get(status_url + job_id, timeout=160) - if result.ok: - status = result.text.strip() - else: - status = "ERROR" - logger.debug( - "Backtranseq status returned HTTP %s: %s", - result.status_code, - result.text[:200], + # Wait for the job to complete + status = "INIT" + while True: + if time.monotonic() - attempt_start > total_timeout: + logger.warning( + "Backtranseq status polling timed out after %ss (attempt %s/%s).", + total_timeout, + attempt, + max_attempts, ) - except requests.exceptions.RequestException as e: - status = "ERROR" - logger.debug(f"Request failed {e}: Retrying..") + status = "TIMEOUT" + break + time.sleep(20) + try: + result = requests.get(status_url + job_id, timeout=160) + if not result.ok: + logger.debug( + "Backtranseq status returned HTTP %s: %s", + result.status_code, + result.text[:200], + ) + continue + status = result.text.strip() + if status == "FINISHED": + break + if status in {"ERROR", "FAILED"}: + logger.warning( + "Backtranseq returned terminal status '%s' (attempt %s/%s).", + status, + attempt, + max_attempts, + ) + break + except requests.exceptions.RequestException as e: + logger.debug(f"Request failed {e}: Retrying..") - # Retrieve the results of the job - status = "INIT" - while status != "FINISHED": - try: - result = requests.get(result_url + job_id + "/out", timeout=160) - if result.ok: - status = "FINISHED" - else: - status = "ERROR" + if status != "FINISHED": + continue + + # Retrieve the results of the job + while True: + if time.monotonic() - attempt_start > total_timeout: + logger.warning( + "Backtranseq result fetch timed out after %ss (attempt %s/%s).", + total_timeout, + attempt, + max_attempts, + ) + break + try: + result = requests.get(result_url + job_id + "/out", timeout=160) + if result.ok: + return result.text.strip() logger.debug( "Backtranseq result returned HTTP %s: %s", result.status_code, result.text[:200], ) - except requests.exceptions.RequestException as e: - status = "ERROR" - logger.debug(f"Request failed {e}: Retrying..") - time.sleep(10) - - dna_seq = result.text.strip() + except requests.exceptions.RequestException as e: + logger.debug(f"Request failed {e}: Retrying..") + time.sleep(10) 
- return dna_seq + logger.warning("Backtranseq failed after %s attempts; returning empty result.", max_attempts) + return None def batch_backtranseq(df, batch_size, organism = "Homo sapiens"): @@ -195,6 +230,16 @@ def batch_backtranseq(df, batch_size, organism = "Homo sapiens"): # Run backtranseq batch_dna = backtranseq(batch_seq, organism = organism) + if not batch_dna: + logger.warning( + "Backtranseq failed for batch %s/%s; setting Seq_dna to NaN.", + i + 1, + len(batches), + ) + batch["Seq_dna"] = np.nan + lst_batches.append(batch) + continue + # Parse output batch_dna = re.split(">EMBOSS_\d+", batch_dna.replace("\n", ""))[1:] From 19e9ff244b7b32a97abce56a3907cfd234147baf Mon Sep 17 00:00:00 2001 From: St3451 Date: Fri, 20 Feb 2026 11:22:58 +0100 Subject: [PATCH 59/60] docs: correct docstring in add_refseq_record_to_pdb function and update metadata_for_outputs assignment logic --- scripts/datasets/af_merge.py | 2 +- tools/preprocessing/update_samplesheet_and_structures.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/datasets/af_merge.py b/scripts/datasets/af_merge.py index a8a81ad..c7c88b5 100755 --- a/scripts/datasets/af_merge.py +++ b/scripts/datasets/af_merge.py @@ -223,7 +223,7 @@ def get_pdb_seqres_records(lst_res): def add_refseq_record_to_pdb(path_structure): """ - Add the SEQREF records to the pdb file. + Add the SEQRES records to the pdb file. Returns True if SEQRES was inserted, False if skipped because SEQRES already exists. """ diff --git a/tools/preprocessing/update_samplesheet_and_structures.py b/tools/preprocessing/update_samplesheet_and_structures.py index a7827a7..0edc9e1 100644 --- a/tools/preprocessing/update_samplesheet_and_structures.py +++ b/tools/preprocessing/update_samplesheet_and_structures.py @@ -773,7 +773,7 @@ def run_pipeline( symbol_map=symbol_map, ) - metadata_for_outputs = metadata_map or symbol_map + metadata_for_outputs = metadata_map if metadata_map is not None else symbol_map samplesheet = attach_metadata(samplesheet, metadata_for_outputs) samplesheet.to_csv(paths.samplesheet_path, index=False) if settings.include_metadata: From cd05fe7c2b9c5992d62def92000be69333526e86 Mon Sep 17 00:00:00 2001 From: St3451 Date: Fri, 20 Feb 2026 13:40:23 +0100 Subject: [PATCH 60/60] fix: add af_version parameter to build function and update download_biomart_metadata to support archive usage --- scripts/datasets/build_datasets.py | 3 +- scripts/datasets/seq_for_mut_prob.py | 183 ++++++++++++++++----------- 2 files changed, 114 insertions(+), 72 deletions(-) diff --git a/scripts/datasets/build_datasets.py b/scripts/datasets/build_datasets.py index 9c57141..adc683c 100644 --- a/scripts/datasets/build_datasets.py +++ b/scripts/datasets/build_datasets.py @@ -136,7 +136,8 @@ def build(output_datasets, mane_only=mane_only, num_cores=num_cores, mane_version=mane_version, - custom_mane_metadata_path=custom_mane_metadata_path + custom_mane_metadata_path=custom_mane_metadata_path, + af_version=af_version, ) logger.info("Generation of sequences dataframe completed!") diff --git a/scripts/datasets/seq_for_mut_prob.py b/scripts/datasets/seq_for_mut_prob.py index f414ef6..d213337 100644 --- a/scripts/datasets/seq_for_mut_prob.py +++ b/scripts/datasets/seq_for_mut_prob.py @@ -766,7 +766,7 @@ def get_mane_to_af_mapping( return mane_mapping -def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): +def download_biomart_metadata(path_to_file, max_attempts=3, wait_seconds=10, use_archive=True): """ Query biomart to get the list of 
transcript corresponding to the downloaded structures (a few structures are missing) and other information. @@ -791,25 +791,29 @@ def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): if shutil.which("wget") is None: logger.warning("wget not found; falling back to Python downloader for BioMart metadata.") last_exc = None - ssl_verify_archive = url.startswith("https://") - for attempt in range(1, max_attempts + 1): - logger.debug("Starting BioMart download attempt %s/%s (archive).", attempt, max_attempts) - try: - download_single_file(url, path_to_file, threads=4, ssl=ssl_verify_archive) - return - except Exception as exc: - last_exc = exc - logger.warning( - "BioMart download failed (attempt %s/%s). Retrying in %ss... Error: %s", - attempt, - max_attempts, - wait_seconds, - exc, - ) - logger.debug("BioMart download exception details:", exc_info=True) - time.sleep(wait_seconds) + if use_archive: + ssl_verify_archive = url.startswith("https://") + for attempt in range(1, max_attempts + 1): + logger.debug("Starting BioMart download attempt %s/%s (archive).", attempt, max_attempts) + try: + download_single_file(url, path_to_file, threads=4, ssl=ssl_verify_archive) + return + except Exception as exc: + last_exc = exc + logger.warning( + "BioMart download failed (attempt %s/%s). Retrying in %ss... Error: %s", + attempt, + max_attempts, + wait_seconds, + exc, + ) + logger.debug("BioMart download exception details:", exc_info=True) + time.sleep(wait_seconds) + + logger.warning("Falling back to latest Ensembl BioMart URL after failure on %s.", base_archive) + else: + logger.debug("Skipping archive BioMart URL; using latest only.") - logger.warning("Falling back to latest Ensembl BioMart URL after failure on %s.", base_archive) if os.path.exists(path_to_file): try: os.remove(path_to_file) @@ -837,10 +841,14 @@ def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): logger.debug("Fallback BioMart download exception details:", exc_info=True) time.sleep(wait_seconds) - raise RuntimeError( - f"Failed to download BioMart metadata after {max_attempts} attempts on archive and " - f"{max_attempts} attempts on latest." - ) from last_exc + if use_archive: + message = ( + f"Failed to download BioMart metadata after {max_attempts} attempts on archive and " + f"{max_attempts} attempts on latest." + ) + else: + message = f"Failed to download BioMart metadata after {max_attempts} attempts on latest." + raise RuntimeError(message) from last_exc command = [ "wget", @@ -854,33 +862,37 @@ def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): url, ] - for attempt in range(1, max_attempts + 1): - logger.debug("Starting BioMart wget attempt %s/%s (archive).", attempt, max_attempts) - result = subprocess.run(command, capture_output=True, text=True) - if result.returncode == 0: - return - stderr = (result.stderr or "").strip() - if stderr: - logger.warning( - "BioMart download failed (attempt %s/%s, return code %s). stderr: %s", - attempt, - max_attempts, - result.returncode, - stderr, - ) - else: - logger.warning( - "BioMart download failed (attempt %s/%s, return code %s). 
Retrying in %ss...", - attempt, - max_attempts, - result.returncode, - wait_seconds, - ) - if result.stdout: - logger.debug("BioMart wget stdout (attempt %s/%s): %s", attempt, max_attempts, result.stdout.strip()) - time.sleep(wait_seconds) + if use_archive: + for attempt in range(1, max_attempts + 1): + logger.debug("Starting BioMart wget attempt %s/%s (archive).", attempt, max_attempts) + result = subprocess.run(command, capture_output=True, text=True) + if result.returncode == 0: + return + stderr = (result.stderr or "").strip() + if stderr: + logger.warning( + "BioMart download failed (attempt %s/%s, return code %s). stderr: %s", + attempt, + max_attempts, + result.returncode, + stderr, + ) + else: + logger.warning( + "BioMart download failed (attempt %s/%s, return code %s). Retrying in %ss...", + attempt, + max_attempts, + result.returncode, + wait_seconds, + ) + if result.stdout: + logger.debug("BioMart wget stdout (attempt %s/%s): %s", attempt, max_attempts, result.stdout.strip()) + time.sleep(wait_seconds) + + logger.warning("Falling back to latest Ensembl BioMart URL after failure on %s.", base_archive) + else: + logger.debug("Skipping archive BioMart URL; using latest only.") - logger.warning("Falling back to latest Ensembl BioMart URL after failure on %s.", base_archive) if os.path.exists(path_to_file): try: os.remove(path_to_file) @@ -922,13 +934,17 @@ def download_biomart_metadata(path_to_file, max_attempts=5, wait_seconds=10): ) time.sleep(wait_seconds) - raise RuntimeError( - f"Failed to download BioMart metadata after {max_attempts} attempts on archive and " - f"{max_attempts} attempts on latest." - ) + if use_archive: + message = ( + f"Failed to download BioMart metadata after {max_attempts} attempts on archive and " + f"{max_attempts} attempts on latest." + ) + else: + message = f"Failed to download BioMart metadata after {max_attempts} attempts on latest." + raise RuntimeError(message) -def get_biomart_metadata(datasets_dir, uniprot_ids): +def get_biomart_metadata(datasets_dir, uniprot_ids, use_archive=True): """ Download a dataframe including ensembl canonical transcript IDs, HGNC IDs, Uniprot IDs, and other useful information. 
@@ -937,7 +953,7 @@ def get_biomart_metadata(datasets_dir, uniprot_ids): try: path_biomart_metadata = os.path.join(datasets_dir, "biomart_metadata.tsv") if not os.path.exists(path_biomart_metadata): - download_biomart_metadata(path_biomart_metadata) + download_biomart_metadata(path_biomart_metadata, use_archive=use_archive) # Parse biomart_df = pd.read_csv(path_biomart_metadata, sep="\t", header=None, low_memory=False) @@ -1309,7 +1325,8 @@ def mane_uniprot_to_hugo(uniprot_ids, mane_mapping): def process_seq_df(seq_df, datasets_dir, organism, - uniprot_to_gene_dict): + uniprot_to_gene_dict, + use_archive_biomart=True): """ Retrieve DNA sequence and tri-nucleotide context for each structure in the initialized dataframe @@ -1328,7 +1345,11 @@ def process_seq_df(seq_df, logger.error("No sequences to process in process_seq_df; this should not happen.") raise RuntimeError("Empty sequence dataframe: no structures to process.") - ens_canonical_transcripts_lst = get_biomart_metadata(datasets_dir, seq_df["Uniprot_ID"].unique()) + ens_canonical_transcripts_lst = get_biomart_metadata( + datasets_dir, + seq_df["Uniprot_ID"].unique(), + use_archive=use_archive_biomart, + ) # Process entries in Proteins API (Reference_info 1) #--------------------------------------------------- @@ -1394,7 +1415,8 @@ def process_seq_df_mane(seq_df, mane_mapping, mane_mapping_not_af, mane_only=False, - num_cores=1): + num_cores=1, + use_archive_biomart=True): """ Retrieve DNA sequence and tri-nucleotide context for each structure in the initialized dataframe @@ -1507,7 +1529,11 @@ def process_seq_df_mane(seq_df, seq_df_nomane_tr = seq_df_nomane.copy() seq_df_nomane_notr = seq_df_nomane.copy() else: - ens_canonical_transcripts_lst = get_biomart_metadata(datasets_dir, seq_df_nomane["Uniprot_ID"].unique()) + ens_canonical_transcripts_lst = get_biomart_metadata( + datasets_dir, + seq_df_nomane["Uniprot_ID"].unique(), + use_archive=use_archive_biomart, + ) # Retrieve seq from coordinates logger.debug(f"Retrieving CDS DNA seq from reference genome (Proteins API): {len(seq_df_nomane['Uniprot_ID'].unique())} structures..") coord_df = get_exons_coord(seq_df_nomane["Uniprot_ID"].unique(), ens_canonical_transcripts_lst) @@ -1559,7 +1585,8 @@ def get_seq_df(datasets_dir, mane_only=False, custom_mane_metadata_path=None, num_cores=1, - mane_version=1.4): + mane_version=1.4, + af_version=None): """ Generate a dataframe including IDs mapping information, the protein sequence, the DNA sequence and its tri-nucleotide context, which is @@ -1618,23 +1645,37 @@ def get_seq_df(datasets_dir, # uniprot_to_gene_dict = uniprot_to_hugo_pressed(uniprot_ids) # --- + use_archive_biomart = True + if af_version is not None and str(af_version) != "4": + use_archive_biomart = False + logger.debug( + "Using latest BioMart URL only because af_version=%s (archive disabled).", + af_version, + ) + # Create a dataframe with protein sequences logger.debug("Initializing sequence df..") seq_df = initialize_seq_df(pdb_dir, uniprot_to_gene_dict) if mane: - seq_df = process_seq_df_mane(seq_df, - datasets_dir, - uniprot_to_gene_dict, - mane_mapping, - mane_mapping_not_af, - mane_only, - num_cores) + seq_df = process_seq_df_mane( + seq_df, + datasets_dir, + uniprot_to_gene_dict, + mane_mapping, + mane_mapping_not_af, + mane_only, + num_cores, + use_archive_biomart=use_archive_biomart, + ) else: - seq_df = process_seq_df(seq_df, - datasets_dir, - organism, - uniprot_to_gene_dict) + seq_df = process_seq_df( + seq_df, + datasets_dir, + organism, + uniprot_to_gene_dict, + 
use_archive_biomart=use_archive_biomart, + ) # Save seq_df_cols = ['Gene', 'HGNC_ID', 'Ens_Gene_ID',