From 281a5157a8f36ee90b6afea033b5f4a329ca48d4 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Mon, 11 May 2026 20:07:04 +1000 Subject: [PATCH 1/2] fixing issues with wasm reporting --- rust/bioscript-formats/src/inspect.rs | 5 +- rust/bioscript-formats/src/inspect/sex.rs | 16 +- .../src/inspect/sex/alignment_depth.rs | 76 +++++++- rust/bioscript-formats/src/lib.rs | 3 +- rust/bioscript-wasm/src/lookup_api.rs | 2 + rust/bioscript-wasm/src/report_api.rs | 71 ++++++- .../src/report_input_inspection.rs | 177 ++++++++++++++++++ .../src/report_workspace/analysis.rs | 2 +- 8 files changed, 328 insertions(+), 24 deletions(-) create mode 100644 rust/bioscript-wasm/src/report_input_inspection.rs diff --git a/rust/bioscript-formats/src/inspect.rs b/rust/bioscript-formats/src/inspect.rs index b19ef17..f282fe0 100644 --- a/rust/bioscript-formats/src/inspect.rs +++ b/rust/bioscript-formats/src/inspect.rs @@ -40,7 +40,10 @@ pub(crate) use heuristics::*; pub(crate) use io::*; #[cfg(test)] pub(crate) use render::*; -pub use sex::{InferredSex, SexDetectionConfidence, SexInference}; +pub use sex::{ + InferredSex, SexDetectionConfidence, SexInference, infer_sex_from_alignment_reader, + infer_sex_from_text_lines, +}; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum FileContainer { diff --git a/rust/bioscript-formats/src/inspect/sex.rs b/rust/bioscript-formats/src/inspect/sex.rs index 9073355..4a26f50 100644 --- a/rust/bioscript-formats/src/inspect/sex.rs +++ b/rust/bioscript-formats/src/inspect/sex.rs @@ -14,6 +14,8 @@ use super::{DetectedKind, InspectOptions}; mod alignment_depth; mod classify; +pub use alignment_depth::infer_sex_from_alignment_reader; + pub(crate) use alignment_depth::infer_sex_from_alignment_path; use classify::{classify_stats, supports_sex_detection, unsupported_sex_inference}; @@ -166,7 +168,7 @@ pub(crate) fn infer_sex_from_zip_bytes( infer_sex_from_bytes(selected_entry, &entry_bytes, kind) } -pub(crate) fn infer_sex_from_text_lines( +pub fn 
infer_sex_from_text_lines( lines: &[String], kind: DetectedKind, ) -> Result { @@ -194,11 +196,13 @@ fn infer_sex_from_reader( let mut stats = SexStats::default(); let mut probe_lines = Vec::new(); let mut line = String::new(); + // Treat any I/O error mid-stream (e.g. truncated bgzf head when the + // wasm caller only loaded the first N MiB) as end-of-data: classify + // whatever we got rather than failing the whole inspection. The CLI + // path streams the full file so this is effectively unchanged for it. for _ in 0..64 { line.clear(); - let bytes = reader - .read_line(&mut line) - .map_err(|err| RuntimeError::Io(format!("failed to scan sex markers: {err}")))?; + let bytes = reader.read_line(&mut line).unwrap_or_default(); if bytes == 0 { let delimiter = detect_delimiter(&probe_lines); let mut column_indexes = None; @@ -232,9 +236,7 @@ fn infer_sex_from_reader( } for _ in probe_lines.len()..MAX_SEX_DETECTION_LINES { line.clear(); - let bytes = reader - .read_line(&mut line) - .map_err(|err| RuntimeError::Io(format!("failed to scan sex markers: {err}")))?; + let bytes = reader.read_line(&mut line).unwrap_or_default(); if bytes == 0 { break; } diff --git a/rust/bioscript-formats/src/inspect/sex/alignment_depth.rs b/rust/bioscript-formats/src/inspect/sex/alignment_depth.rs index d90d49d..74954ad 100644 --- a/rust/bioscript-formats/src/inspect/sex/alignment_depth.rs +++ b/rust/bioscript-formats/src/inspect/sex/alignment_depth.rs @@ -70,7 +70,31 @@ pub(crate) fn infer_sex_from_alignment_path( ..GenotypeLoadOptions::default() }; - let stats = sample_alignment_sex_windows(path, &load_options, reference_file)?; + let stats = sample_alignment_sex_windows_from_path(path, &load_options, reference_file)?; + let mut inference = classify_alignment_stats(&stats); + if options.input_index.is_none() { + inference + .evidence + .push("CRAM sex detection ran without explicit --input-index".to_owned()); + } + Ok(inference) +} + +/// Reader-based equivalent of 
`infer_sex_from_alignment_path`. Wasm callers +/// build the `IndexedReader` from JS-supplied `readAt` callbacks; this lets +/// them invoke the same Y/X-vs-autosome coverage analysis the CLI uses +/// without ever touching `std::fs`. +pub fn infer_sex_from_alignment_reader( + reader: &mut noodles::cram::io::indexed_reader::IndexedReader, + label: &str, + allow_reference_md5_mismatch: bool, +) -> Result { + let stats = + sample_alignment_sex_windows_with_reader(reader, label, allow_reference_md5_mismatch)?; + Ok(classify_alignment_stats(&stats)) +} + +fn classify_alignment_stats(stats: &AlignmentSexStats) -> SexInference { let autosome_mean = mean_records(stats.autosome_records, stats.autosome_windows); let x_mean = mean_records(stats.x_records, stats.x_windows); let y_mean = mean_records(stats.y_records, stats.y_windows); @@ -91,7 +115,7 @@ pub(crate) fn infer_sex_from_alignment_path( (InferredSex::Unknown, SexDetectionConfidence::Low) }; - let mut evidence = vec![ + let evidence = vec![ format!("autosome_windows={}", stats.autosome_windows), format!("autosome_records={}", stats.autosome_records), format!("x_windows={}", stats.x_windows), @@ -104,19 +128,55 @@ pub(crate) fn infer_sex_from_alignment_path( format!("x_to_autosome_ratio={x_ratio:.3}"), format!("y_to_autosome_ratio={y_ratio:.3}"), ]; - if options.input_index.is_none() { - evidence.push("CRAM sex detection ran without explicit --input-index".to_owned()); - } - Ok(SexInference { + SexInference { sex, confidence, method: "alignment_autosome_x_y_depth_ratio".to_owned(), evidence, - }) + } +} + +fn sample_alignment_sex_windows_with_reader( + reader: &mut noodles::cram::io::indexed_reader::IndexedReader, + label: &str, + allow_reference_md5_mismatch: bool, +) -> Result { + let mut stats = AlignmentSexStats::default(); + for (chrom, center) in ALIGNMENT_AUTOSOME_WINDOWS { + stats.autosome_records += count_alignment_records_in_window( + reader, + label, + chrom, + *center, + allow_reference_md5_mismatch, + )?; 
+ stats.autosome_windows += 1; + } + for center in ALIGNMENT_X_NON_PAR_WINDOWS { + stats.x_records += count_alignment_records_in_window( + reader, + label, + "X", + *center, + allow_reference_md5_mismatch, + )?; + stats.x_windows += 1; + } + for center in ALIGNMENT_Y_WINDOWS { + stats.y_records += count_alignment_records_in_window( + reader, + label, + "Y", + *center, + allow_reference_md5_mismatch, + )?; + stats.y_windows += 1; + } + Ok(stats) } -fn sample_alignment_sex_windows( +fn sample_alignment_sex_windows_from_path( path: &Path, options: &GenotypeLoadOptions, reference_file: &Path, diff --git a/rust/bioscript-formats/src/lib.rs b/rust/bioscript-formats/src/lib.rs index b01d3b7..b349f43 100644 --- a/rust/bioscript-formats/src/lib.rs +++ b/rust/bioscript-formats/src/lib.rs @@ -20,6 +20,7 @@ pub use genotype::{ }; pub use inspect::{ DetectedKind, DetectionConfidence, FileContainer, FileInspection, InferredSex, InspectOptions, - SexDetectionConfidence, SexInference, SourceMetadata, inspect_bytes, inspect_file, + SexDetectionConfidence, SexInference, SourceMetadata, infer_sex_from_alignment_reader, + infer_sex_from_text_lines, inspect_bytes, inspect_file, }; pub use prepare::{PrepareRequest, PreparedPaths, prepare_indexes, shell_flags}; diff --git a/rust/bioscript-wasm/src/lookup_api.rs b/rust/bioscript-wasm/src/lookup_api.rs index a52297b..1aa133c 100644 --- a/rust/bioscript-wasm/src/lookup_api.rs +++ b/rust/bioscript-wasm/src/lookup_api.rs @@ -285,6 +285,8 @@ fn variant_input_to_spec(variant: &VariantInput) -> Result } else { None }, + grch37_assembly_ref: None, + grch38_assembly_ref: None, reference: Some(variant.ref_base.clone()), alternate: Some(variant.alt_base.clone()), kind, diff --git a/rust/bioscript-wasm/src/report_api.rs b/rust/bioscript-wasm/src/report_api.rs index 2c33a32..b6aaad1 100644 --- a/rust/bioscript-wasm/src/report_api.rs +++ b/rust/bioscript-wasm/src/report_api.rs @@ -10,8 +10,7 @@ use bioscript_core::{ VariantSpec, }; use 
bioscript_formats::{ - GenotypeLoadOptions, GenotypeStore, InferredSex, InspectOptions, SexDetectionConfidence, - SexInference, inspect_bytes as inspect_bytes_rs, + GenotypeLoadOptions, GenotypeStore, InspectOptions, inspect_bytes as inspect_bytes_rs, }; use bioscript_runtime::{BioscriptRuntime, RuntimeConfig}; use bioscript_schema::{ @@ -24,6 +23,8 @@ use wasm_bindgen::prelude::*; #[path = "report_helpers.rs"] mod report_helpers; +#[path = "report_input_inspection.rs"] +mod report_input_inspection; #[path = "report_lookup.rs"] mod report_lookup; #[path = "report_render.rs"] @@ -32,6 +33,10 @@ mod report_render; mod report_workspace; use report_helpers::*; +use report_input_inspection::{ + decompress_vcf_head_lines, explicit_sex_from_options, inspect_head_via_js_reader, + vcf_sex_via_tabix, +}; use report_lookup::{CramReportLookup, VcfReportLookup}; use report_render::{ AppReportJsonInput, app_report_json, match_app_findings, render_app_html_document, @@ -48,7 +53,7 @@ include!("../../bioscript-cli/src/report_html_helpers.rs"); #[derive(Deserialize)] #[serde(rename_all = "camelCase")] -struct PackageFileInput { +pub(super) struct PackageFileInput { path: String, contents: String, #[serde(default)] @@ -57,13 +62,18 @@ struct PackageFileInput { #[derive(Default, Deserialize)] #[serde(rename_all = "camelCase")] -struct ReportOptionsInput { +pub(super) struct ReportOptionsInput { #[serde(default = "default_analysis_max_duration_ms")] analysis_max_duration_ms: u64, #[serde(default)] detect_sex: bool, #[serde(default)] filters: Vec, + /// Optional explicit sample sex (mirrors the CLI's `--sample-sex` flag). + /// When set, takes precedence over inference: the report carries + /// `method=explicit_sample_sex` like the CLI. 
+ #[serde(default)] + sample_sex: Option, } #[derive(Serialize)] @@ -219,6 +229,12 @@ pub fn run_package_report_from_cram( let manifest_metadata = workspace.report_manifest_metadata(manifest_path)?; let findings = workspace.load_manifest_findings(manifest_path)?; let provenance = workspace.load_manifest_provenance_links(manifest_path)?; + let mut head_inspection = inspect_head_via_js_reader( + &cram_read_at, + cram_len as u64, + input_name, + false, // sex detection runs separately below via the indexed reader + ); let crai_index = bioscript_formats::alignment::parse_crai_bytes(crai_bytes) .map_err(|err| JsError::new(&format!("parse crai: {err:?}")))?; @@ -242,6 +258,26 @@ pub fn run_package_report_from_cram( label: input_name.to_owned(), }; + // CRAM sex detection: explicit override wins, otherwise alignment Y/X + // coverage analysis through the same reader the variant lookup will use. + if let Some(explicit) = explicit_sex_from_options(&options) { + head_inspection.inferred_sex = Some(explicit); + } else if options.detect_sex { + let mut reader_borrow = lookup.reader.borrow_mut(); + match bioscript_formats::infer_sex_from_alignment_reader( + &mut reader_borrow, + &lookup.label, + true, + ) { + Ok(inference) => head_inspection.inferred_sex = Some(inference), + Err(err) => { + head_inspection + .evidence + .push(format!("alignment sex detection failed: {err:?}")); + } + } + } + let mut loader = GenotypeLoadOptions::default(); loader.format = Some(bioscript_formats::GenotypeSourceFormat::Cram); loader.allow_reference_md5_mismatch = true; @@ -276,7 +312,7 @@ pub fn run_package_report_from_cram( analyses: &analyses, findings: &matched_findings, provenance: &provenance, - input_inspection: None, + input_inspection: Some(&head_inspection), manifest_metadata: &manifest_metadata, })]; let observations_tsv = render_app_observations_tsv(&observations)?; @@ -329,6 +365,20 @@ pub fn run_package_report_from_vcf( let manifest_metadata = 
workspace.report_manifest_metadata(manifest_path)?; let findings = workspace.load_manifest_findings(manifest_path)?; let provenance = workspace.load_manifest_provenance_links(manifest_path)?; + // Inspect format/source/assembly from the head, but skip the byte-stream + // sex detection — we'll do that via tabix-targeted X non-PAR queries + // below, which works on indexed VCFs of any size. + let mut head_inspection = inspect_head_via_js_reader( + &vcf_read_at, + vcf_len as u64, + input_name, + false, + ); + // Decompress the head once to grab the VCF header lines (## meta + #CHROM + // column header) — these are needed by `infer_sex_from_text_lines` to + // figure out delimiter / column indexes for the data lines we'll pull + // via tabix below. + let head_lines = decompress_vcf_head_lines(&vcf_read_at, vcf_len as u64); let tabix_index = bioscript_formats::alignment::parse_tbi_bytes(tbi_bytes) .map_err(|err| JsError::new(&format!("parse tbi: {err:?}")))?; @@ -340,6 +390,15 @@ pub fn run_package_report_from_vcf( label: input_name.to_owned(), }; + if let Some(explicit) = explicit_sex_from_options(&options) { + head_inspection.inferred_sex = Some(explicit); + } else if options.detect_sex { + let mut reader_borrow = lookup.reader.borrow_mut(); + if let Some(inference) = vcf_sex_via_tabix(&mut reader_borrow, &head_lines) { + head_inspection.inferred_sex = Some(inference); + } + } + let mut loader = GenotypeLoadOptions::default(); loader.format = Some(bioscript_formats::GenotypeSourceFormat::Vcf); let manifest_output = @@ -372,7 +431,7 @@ pub fn run_package_report_from_vcf( analyses: &analyses, findings: &matched_findings, provenance: &provenance, - input_inspection: None, + input_inspection: Some(&head_inspection), manifest_metadata: &manifest_metadata, })]; let observations_tsv = render_app_observations_tsv(&observations)?; diff --git a/rust/bioscript-wasm/src/report_input_inspection.rs b/rust/bioscript-wasm/src/report_input_inspection.rs new file mode 100644 index 
0000000..b9656a3 --- /dev/null +++ b/rust/bioscript-wasm/src/report_input_inspection.rs @@ -0,0 +1,177 @@ +use super::ReportOptionsInput; + +/// Pull a head sample from the JS-backed reader and run it through the same +/// `inspect_bytes_rs` path the text/zip flow uses. This populates the +/// "Input" metadata block in the rust HTML report (Format / Source / +/// Assembly / Inferred sex / Evidence) for CRAM and VCF inputs, which +/// otherwise had no inspection data. +pub(super) fn inspect_head_via_js_reader( + read_at: &js_sys::Function, + total_len: u64, + input_name: &str, + detect_sex: bool, +) -> bioscript_formats::FileInspection { + use crate::js_reader::JsReader; + use std::io::Read; + + // For VCF/CRAM, sex detection and full assembly inference require + // scanning many records. 8 MiB is enough for hundreds of decompressed + // VCF records and several CRAM containers. + let head_len = total_len.min(8 * 1024 * 1024); + let mut reader = JsReader::new(read_at.clone(), total_len, "inspect"); + let mut buf = vec![0u8; head_len as usize]; + let mut filled = 0usize; + while filled < buf.len() { + match reader.read(&mut buf[filled..]) { + Ok(0) => break, + Ok(n) => filled += n, + Err(_) => break, + } + } + buf.truncate(filled); + let opts = bioscript_formats::InspectOptions { + input_index: None, + reference_file: None, + reference_index: None, + detect_sex, + }; + match bioscript_formats::inspect_bytes(input_name, &buf, &opts) { + Ok(inspection) => inspection, + Err(err) => bioscript_formats::FileInspection { + path: std::path::PathBuf::from(input_name), + container: bioscript_formats::FileContainer::Plain, + detected_kind: bioscript_formats::DetectedKind::Unknown, + confidence: bioscript_formats::DetectionConfidence::Unknown, + source: None, + assembly: None, + phased: None, + selected_entry: None, + has_index: None, + index_path: None, + reference_matches: None, + inferred_sex: None, + evidence: vec![format!("inspect_bytes failed: {err:?}")], + warnings: 
Vec::new(), + duration_ms: 0, + }, + } +} + +/// Build the `SexInference` the CLI produces when `--sample-sex` is passed, +/// without dragging in the bioscript-cli crate. Mirrors +/// `bioscript_cli::report_options::explicit_sample_sex_inference`. +pub(super) fn explicit_sex_from_options( + options: &ReportOptionsInput, +) -> Option { + let raw = options.sample_sex.as_deref()?.trim().to_ascii_lowercase(); + let sex = match raw.as_str() { + "male" | "m" => bioscript_formats::InferredSex::Male, + "female" | "f" => bioscript_formats::InferredSex::Female, + "unknown" | "u" | "" => bioscript_formats::InferredSex::Unknown, + _ => return None, + }; + Some(bioscript_formats::SexInference { + sex, + confidence: bioscript_formats::SexDetectionConfidence::High, + method: "explicit_sample_sex".to_owned(), + evidence: vec!["source=sample_sex_option".to_owned()], + }) +} + +const VCF_X_NON_PAR_WINDOWS_GRCH38: &[(i64, i64)] = &[ + (10_000_000, 11_000_000), + (40_000_000, 41_000_000), + (70_000_000, 71_000_000), + (100_000_000, 101_000_000), + (130_000_000, 131_000_000), +]; +const VCF_Y_WINDOWS_GRCH38: &[(i64, i64)] = &[ + (3_500_000, 4_500_000), + (10_000_000, 11_000_000), + (15_000_000, 16_000_000), +]; + +/// Sex inference for indexed VCFs that streams only X non-PAR + Y windows +/// instead of scanning the whole file. Reuses the shared +/// `infer_sex_from_text_lines` so classification rules match the CLI. 
+pub(super) fn vcf_sex_via_tabix( + reader: &mut noodles::csi::io::IndexedReader, noodles::tabix::Index>, + head_lines: &[String], +) -> Option { + let mut lines = head_lines.to_vec(); + for chrom_label in ["X", "chrX"] { + for (start, end) in VCF_X_NON_PAR_WINDOWS_GRCH38 { + let Some(region) = build_region(chrom_label, *start, *end) else { + continue; + }; + if let Ok(query) = reader.query(&region) { + for record_result in query { + let Ok(record) = record_result else { + continue; + }; + let line: &str = record.as_ref(); + lines.push(line.to_owned()); + } + } + } + for (start, end) in VCF_Y_WINDOWS_GRCH38 { + let y_label = if chrom_label == "X" { "Y" } else { "chrY" }; + let Some(region) = build_region(y_label, *start, *end) else { + continue; + }; + if let Ok(query) = reader.query(&region) { + for record_result in query { + let Ok(record) = record_result else { + continue; + }; + let line: &str = record.as_ref(); + lines.push(line.to_owned()); + } + } + } + } + bioscript_formats::infer_sex_from_text_lines(&lines, bioscript_formats::DetectedKind::Vcf).ok() +} + +fn build_region(chrom: &str, start: i64, end: i64) -> Option { + use noodles::core::{Position, Region}; + let s = Position::try_from(usize::try_from(start.max(1)).ok()?).ok()?; + let e = Position::try_from(usize::try_from(end.max(start)).ok()?).ok()?; + Some(Region::new(chrom, s..=e)) +} + +/// Pull the VCF header from the bgzf head so `infer_sex_from_text_lines` can +/// resolve delimiter and column indexes for the X/Y records added via tabix. 
+pub(super) fn decompress_vcf_head_lines( + read_at: &js_sys::Function, + total_len: u64, +) -> Vec { + use crate::js_reader::JsReader; + use std::io::{BufRead, BufReader, Read}; + + let head_len = total_len.min(2 * 1024 * 1024); + let mut reader = JsReader::new(read_at.clone(), total_len, "vcf-head"); + let mut buf = vec![0u8; head_len as usize]; + let mut filled = 0usize; + while filled < buf.len() { + match reader.read(&mut buf[filled..]) { + Ok(0) => break, + Ok(n) => filled += n, + Err(_) => break, + } + } + buf.truncate(filled); + let cursor = std::io::Cursor::new(buf); + let mut bgzf_reader = BufReader::new(noodles::bgzf::io::Reader::new(cursor)); + let mut lines = Vec::new(); + let mut line = String::new(); + for _ in 0..1024 { + line.clear(); + match bgzf_reader.read_line(&mut line) { + Ok(0) => break, + Ok(_) => lines.push(line.trim_end_matches(['\n', '\r']).to_owned()), + Err(_) => break, + } + } + lines +} diff --git a/rust/bioscript-wasm/src/report_workspace/analysis.rs b/rust/bioscript-wasm/src/report_workspace/analysis.rs index c522419..1d7cdfa 100644 --- a/rust/bioscript-wasm/src/report_workspace/analysis.rs +++ b/rust/bioscript-wasm/src/report_workspace/analysis.rs @@ -1,7 +1,7 @@ use super::*; impl PackageWorkspace { - pub(super) fn run_manifest_analyses( + pub(crate) fn run_manifest_analyses( &self, manifest_path: &str, input_name: &str, From e66040a049f0b39e77f8811e79fa879608325465 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Mon, 11 May 2026 21:48:41 +1000 Subject: [PATCH 2/2] fixing issues with wasm reporting versus rust --- rust/bioscript-formats/src/genotype.rs | 124 ++++---------- .../src/genotype/backends.rs | 9 +- .../src/genotype/delimited.rs | 2 +- rust/bioscript-formats/src/genotype/io.rs | 20 +-- .../bioscript-formats/src/genotype/loaders.rs | 160 ++++++++++++++++++ rust/bioscript-formats/src/genotype/types.rs | 5 + rust/bioscript-formats/src/genotype/vcf.rs | 6 +- .../src/genotype/vcf/matching.rs | 2 +- 
rust/bioscript-formats/src/lib.rs | 5 +- rust/bioscript-wasm/Cargo.toml | 7 +- rust/bioscript-wasm/src/report_api.rs | 11 +- rust/bioscript-wasm/src/report_helpers.rs | 47 ++++- rust/bioscript-wasm/src/report_lookup.rs | 39 +++-- rust/bioscript-wasm/src/report_workspace.rs | 8 +- 14 files changed, 300 insertions(+), 145 deletions(-) create mode 100644 rust/bioscript-formats/src/genotype/loaders.rs diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index 805c805..050f8cf 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -1,7 +1,7 @@ use std::{ collections::HashMap, fs::File, - io::{BufReader, Cursor}, + io::{BufRead, BufReader, Cursor}, path::Path, }; @@ -16,6 +16,7 @@ mod common; mod cram_backend; mod delimited; mod io; +mod loaders; mod types; mod vcf; mod vcf_tokens; @@ -32,7 +33,7 @@ use cram_backend::{ }; pub use cram_backend::{observe_cram_indel_with_reader, observe_cram_snp_with_reader}; pub(crate) use delimited::{ - DelimitedColumnIndexes, Delimiter, detect_delimiter, parse_streaming_row, + COMMENT_PREFIXES, DelimitedColumnIndexes, Delimiter, detect_delimiter, parse_streaming_row, }; #[cfg(test)] use delimited::{GENOTYPE_ALIASES, split_csv_line, strip_bom, strip_inline_comment}; @@ -44,31 +45,27 @@ use delimited::{ }; #[cfg(test)] use io::looks_like_vcf_lines; -use io::{ - detect_source_format, is_bgzf_path, read_lines_from_reader, read_zip_entry_limited, - select_zip_entry, -}; +use io::{detect_source_format, is_bgzf_path, read_lines_from_reader, select_zip_entry}; pub use types::{ BackendCapabilities, GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, QueryKind, }; use types::{CramBackend, DelimitedBackend, QueryBackend, RsidMapBackend, VcfBackend}; +pub use vcf::{ + choose_variant_locus_for_assembly, imputed_reference_observation, observe_vcf_snp_with_reader, + observe_vcf_variant_with_reader, +}; #[cfg(test)] use vcf::{ - choose_variant_locus_for_assembly, 
detect_vcf_assembly, extract_vcf_sample_genotype, - normalize_chromosome_name, parse_vcf_record, vcf_row_matches_variant, -}; -pub use vcf::{ - imputed_reference_observation, observe_vcf_snp_with_reader, observe_vcf_variant_with_reader, + detect_vcf_assembly, extract_vcf_sample_genotype, normalize_chromosome_name, parse_vcf_record, + vcf_row_matches_variant, }; use vcf::{lookup_indexed_vcf_variants, scan_vcf_variants}; -use vcf_tokens::genotype_from_vcf_gt; +pub(crate) use vcf_tokens::genotype_from_vcf_gt; #[cfg(test)] use vcf_tokens::{ is_symbolic_vcf_alt, normalize_sequence_token, vcf_alt_token, vcf_reference_token, }; -const MAX_ZIP_ENTRY_BYTES: u64 = 128 * 1024 * 1024; - impl GenotypeStore { pub fn from_file(path: &Path) -> Result { Self::from_file_with_options(path, &GenotypeLoadOptions::default()) @@ -102,6 +99,7 @@ impl GenotypeStore { backend: QueryBackend::RsidMap(RsidMapBackend { format: GenotypeSourceFormat::Text, values: HashMap::new(), + source_lines: HashMap::new(), }), } } @@ -131,13 +129,11 @@ impl GenotypeStore { if lower.ends_with(".zip") { return Self::from_zip_bytes(name, bytes); } + let reader = BufReader::new(Cursor::new(bytes)); if lower.ends_with(".vcf") { - let lines = - read_lines_from_reader(BufReader::new(Cursor::new(bytes)), Path::new(name))?; - return Self::from_vcf_lines(lines); + return Self::from_vcf_reader(reader, name); } - let lines = read_lines_from_reader(BufReader::new(Cursor::new(bytes)), Path::new(name))?; - Self::from_delimited_lines(GenotypeSourceFormat::Text, lines) + Self::from_delimited_reader(GenotypeSourceFormat::Text, reader, name) } fn from_zip_bytes(name: &str, bytes: &[u8]) -> Result { @@ -168,22 +164,22 @@ impl GenotypeStore { "zip archive {name} does not contain a supported genotype file" )) })?; - let mut entry = archive.by_name(&selected).map_err(|err| { + let entry = archive.by_name(&selected).map_err(|err| { RuntimeError::Io(format!( "failed to open genotype entry {selected} in {name}: {err}" )) })?; - let 
contents = read_zip_entry_limited( - &mut entry, - MAX_ZIP_ENTRY_BYTES, - &format!("genotype entry {selected} in {name}"), - )?; - let lines = - read_lines_from_reader(BufReader::new(Cursor::new(contents)), Path::new(&selected))?; + let label = format!("genotype entry {selected} in {name}"); + // Stream-decompress directly off the zip reader so we never have to + // materialize the entire decompressed entry in memory. GenesForGood + // exports decompress to >128MB which used to trip the old + // `read_zip_entry_limited` cap; the cap is gone because the streaming + // parser keeps memory bounded to the rsid map itself. + let reader = BufReader::new(entry); if selected.to_ascii_lowercase().ends_with(".vcf") { - return Self::from_vcf_lines(lines); + return Self::from_vcf_reader(reader, &label); } - Self::from_delimited_lines(GenotypeSourceFormat::Zip, lines) + Self::from_delimited_reader(GenotypeSourceFormat::Zip, reader, &label) } fn from_vcf_file(path: &Path, options: &GenotypeLoadOptions) -> Self { @@ -236,63 +232,20 @@ impl GenotypeStore { }) } - fn from_vcf_lines(lines: Vec) -> Result { - let mut values = HashMap::new(); - - for line in lines { - let trimmed = line.trim(); - if trimmed.is_empty() || trimmed.starts_with("##") || trimmed.starts_with("#CHROM") { - continue; - } - - let fields: Vec<&str> = trimmed.split('\t').collect(); - if fields.len() < 10 { - continue; - } - - let rsid = fields[2].trim(); - if rsid.is_empty() || rsid == "." 
{ - continue; - } - - let reference = fields[3].trim(); - let alternates: Vec<&str> = fields[4] - .split(',') - .map(str::trim) - .filter(|alt| !alt.is_empty() && *alt != ".") - .collect(); - if reference.is_empty() || alternates.is_empty() { - continue; - } - - let sample_gt = fields[9].split(':').next().unwrap_or("."); - if let Some(genotype) = genotype_from_vcf_gt(sample_gt, reference, &alternates) { - values.insert(rsid.to_owned(), genotype); - } - } - - Ok(Self::from_rsid_map(GenotypeSourceFormat::Vcf, values)) + fn from_vcf_reader(reader: R, label: &str) -> Result { + loaders::from_vcf_reader(reader, label) } - fn from_delimited_lines( + fn from_delimited_reader( format: GenotypeSourceFormat, - lines: Vec, + reader: R, + label: &str, ) -> Result { - let delimiter = detect_delimiter(&lines); - let mut parser = RowParser::new(delimiter); - let mut values = HashMap::new(); - for line in lines { - if let Some((rsid, genotype)) = parser.consume_line(&line)? { - values.insert(rsid, genotype); - } - } - Ok(Self::from_rsid_map(format, values)) + loaders::from_delimited_reader(format, reader, label) } - fn from_rsid_map(format: GenotypeSourceFormat, values: HashMap) -> Self { - Self { - backend: QueryBackend::RsidMap(RsidMapBackend { format, values }), - } + fn from_vcf_lines(lines: Vec) -> Result { + loaders::from_vcf_lines(lines) } fn from_delimited_file( @@ -1683,15 +1636,4 @@ mod tests { .unwrap_err(); assert!(err.to_string().contains("invalid VCF position")); } - - #[test] - fn zip_entry_limited_reader_rejects_oversized_output() { - let mut reader = std::io::Cursor::new(b"abcdef".to_vec()); - let err = read_zip_entry_limited(&mut reader, 5, "test zip entry").unwrap_err(); - assert!( - err.to_string() - .contains("test zip entry exceeds decompressed limit of 5 bytes"), - "{err}" - ); - } } diff --git a/rust/bioscript-formats/src/genotype/backends.rs b/rust/bioscript-formats/src/genotype/backends.rs index b2e70cd..32ca5a8 100644 --- 
a/rust/bioscript-formats/src/genotype/backends.rs +++ b/rust/bioscript-formats/src/genotype/backends.rs @@ -22,11 +22,18 @@ impl RsidMapBackend { ) -> Result { for rsid in &variant.rsids { if let Some(value) = self.values.get(rsid) { + let mut evidence = vec![format!("resolved by rsid {rsid}")]; + // Mirror DelimitedBackend's `| source line: …` evidence so + // wasm-side from_bytes loads produce byte-identical reports + // to the CLI's path-backed DelimitedBackend. + if let Some(source) = self.source_lines.get(rsid) { + evidence.push(format!("source line: {source}")); + } return Ok(VariantObservation { backend: self.backend_name().to_owned(), matched_rsid: Some(rsid.clone()), genotype: Some(value.clone()), - evidence: vec![format!("resolved by rsid {rsid}")], + evidence, ..VariantObservation::default() }); } diff --git a/rust/bioscript-formats/src/genotype/delimited.rs b/rust/bioscript-formats/src/genotype/delimited.rs index a4240a8..02b1da3 100644 --- a/rust/bioscript-formats/src/genotype/delimited.rs +++ b/rust/bioscript-formats/src/genotype/delimited.rs @@ -8,7 +8,7 @@ mod scan; pub(crate) use scan::scan_delimited_variants; -const COMMENT_PREFIXES: [&str; 2] = ["#", "//"]; +pub(crate) const COMMENT_PREFIXES: [&str; 2] = ["#", "//"]; const RSID_ALIASES: &[&str] = &["rsid", "name", "snp", "marker", "id", "snpid"]; const CHROM_ALIASES: &[&str] = &["chromosome", "chr", "chrom"]; const POSITION_ALIASES: &[&str] = &[ diff --git a/rust/bioscript-formats/src/genotype/io.rs b/rust/bioscript-formats/src/genotype/io.rs index c70622d..8d69e55 100644 --- a/rust/bioscript-formats/src/genotype/io.rs +++ b/rust/bioscript-formats/src/genotype/io.rs @@ -1,6 +1,6 @@ use std::{ fs::File, - io::{BufRead, BufReader, Read}, + io::{BufRead, BufReader}, path::Path, }; @@ -96,24 +96,6 @@ pub(crate) fn read_lines_from_reader( Ok(lines) } -pub(crate) fn read_zip_entry_limited( - reader: &mut R, - max_bytes: u64, - label: &str, -) -> Result, RuntimeError> { - let mut contents = Vec::new(); 
- reader - .take(max_bytes.saturating_add(1)) - .read_to_end(&mut contents) - .map_err(|err| RuntimeError::Io(format!("failed to read {label}: {err}")))?; - if u64::try_from(contents.len()).unwrap_or(u64::MAX) > max_bytes { - return Err(RuntimeError::InvalidArguments(format!( - "{label} exceeds decompressed limit of {max_bytes} bytes" - ))); - } - Ok(contents) -} - pub(crate) fn detect_source_format( path: &Path, forced: Option, diff --git a/rust/bioscript-formats/src/genotype/loaders.rs b/rust/bioscript-formats/src/genotype/loaders.rs new file mode 100644 index 0000000..27079cf --- /dev/null +++ b/rust/bioscript-formats/src/genotype/loaders.rs @@ -0,0 +1,160 @@ +use std::{collections::HashMap, io::BufRead}; + +use bioscript_core::RuntimeError; + +use super::{ + COMMENT_PREFIXES, GenotypeSourceFormat, GenotypeStore, QueryBackend, RowParser, RsidMapBackend, + detect_delimiter, vcf_tokens::genotype_from_vcf_gt, +}; + +pub(crate) fn from_vcf_reader( + mut reader: R, + label: &str, +) -> Result { + let mut values = HashMap::new(); + let mut buf = String::new(); + loop { + buf.clear(); + let bytes = reader + .read_line(&mut buf) + .map_err(|err| RuntimeError::Io(format!("failed to read {label}: {err}")))?; + if bytes == 0 { + break; + } + read_vcf_rsid_line(buf.trim_end_matches(['\n', '\r']), &mut values); + } + + Ok(from_rsid_map( + GenotypeSourceFormat::Vcf, + values, + HashMap::new(), + )) +} + +pub(crate) fn from_delimited_reader( + format: GenotypeSourceFormat, + mut reader: R, + label: &str, +) -> Result { + // Buffer lines up to the first non-empty/non-comment line so delimiter + // detection sees representative input, then stream the rest directly. 
+ let mut prelude: Vec = Vec::new(); + let mut buf = String::new(); + let mut delimiter = None; + loop { + buf.clear(); + let bytes = reader + .read_line(&mut buf) + .map_err(|err| RuntimeError::Io(format!("failed to read {label}: {err}")))?; + if bytes == 0 { + break; + } + let line = buf.trim_end_matches(['\n', '\r']).to_owned(); + let trimmed = line.trim(); + let is_data = !trimmed.is_empty() + && !COMMENT_PREFIXES + .iter() + .any(|prefix| trimmed.starts_with(prefix)); + prelude.push(line); + if is_data { + delimiter = Some(detect_delimiter(&prelude)); + break; + } + } + + let mut parser = RowParser::new(delimiter.unwrap_or(super::Delimiter::Tab)); + let mut values = HashMap::new(); + let mut source_lines = HashMap::new(); + for line in prelude { + consume_delimited_line(&mut parser, &line, &mut values, &mut source_lines)?; + } + loop { + buf.clear(); + let bytes = reader + .read_line(&mut buf) + .map_err(|err| RuntimeError::Io(format!("failed to read {label}: {err}")))?; + if bytes == 0 { + break; + } + consume_delimited_line( + &mut parser, + buf.trim_end_matches(['\n', '\r']), + &mut values, + &mut source_lines, + )?; + } + + Ok(from_rsid_map(format, values, source_lines)) +} + +pub(crate) fn from_vcf_lines(lines: Vec) -> Result { + let mut values = HashMap::new(); + for line in lines { + read_vcf_rsid_line(line.trim(), &mut values); + } + Ok(from_rsid_map( + GenotypeSourceFormat::Vcf, + values, + HashMap::new(), + )) +} + +fn read_vcf_rsid_line(line: &str, values: &mut HashMap) { + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.starts_with("##") || trimmed.starts_with("#CHROM") { + return; + } + + let fields: Vec<&str> = trimmed.split('\t').collect(); + if fields.len() < 10 { + return; + } + + let rsid = fields[2].trim(); + if rsid.is_empty() || rsid == "." 
{ + return; + } + + let reference = fields[3].trim(); + let alternates: Vec<&str> = fields[4] + .split(',') + .map(str::trim) + .filter(|alt| !alt.is_empty() && *alt != ".") + .collect(); + if reference.is_empty() || alternates.is_empty() { + return; + } + + let sample_gt = fields[9].split(':').next().unwrap_or("."); + if let Some(genotype) = genotype_from_vcf_gt(sample_gt, reference, &alternates) { + values.insert(rsid.to_owned(), genotype); + } +} + +fn consume_delimited_line( + parser: &mut RowParser, + line: &str, + values: &mut HashMap, + source_lines: &mut HashMap, +) -> Result<(), RuntimeError> { + let trimmed = line.trim().to_owned(); + if let Some((rsid, genotype)) = parser.consume_line(line)? { + values.insert(rsid.clone(), genotype); + source_lines.insert(rsid, trimmed); + } + Ok(()) +} + +fn from_rsid_map( + format: GenotypeSourceFormat, + values: HashMap, + source_lines: HashMap, +) -> GenotypeStore { + GenotypeStore { + backend: QueryBackend::RsidMap(RsidMapBackend { + format, + values, + source_lines, + }), + } +} diff --git a/rust/bioscript-formats/src/genotype/types.rs b/rust/bioscript-formats/src/genotype/types.rs index b65d47b..3572faf 100644 --- a/rust/bioscript-formats/src/genotype/types.rs +++ b/rust/bioscript-formats/src/genotype/types.rs @@ -33,6 +33,11 @@ pub(crate) enum QueryBackend { pub(crate) struct RsidMapBackend { pub(crate) format: GenotypeSourceFormat, pub(crate) values: HashMap, + /// Original input line per rsid, retained so wasm-side `from_bytes` loads + /// can emit the same `| source line: …` evidence that the CLI's + /// path-backed `DelimitedBackend` does on every lookup. Empty for + /// in-memory maps that don't have a line representation. 
+ pub(crate) source_lines: HashMap, } #[derive(Debug, Clone)] diff --git a/rust/bioscript-formats/src/genotype/vcf.rs b/rust/bioscript-formats/src/genotype/vcf.rs index cf8fa57..1440891 100644 --- a/rust/bioscript-formats/src/genotype/vcf.rs +++ b/rust/bioscript-formats/src/genotype/vcf.rs @@ -20,10 +20,8 @@ use super::{ mod matching; mod reader; -pub use matching::imputed_reference_observation; -pub(crate) use matching::{ - choose_variant_locus_for_assembly, normalize_chromosome_name, vcf_row_matches_variant, -}; +pub use matching::{choose_variant_locus_for_assembly, imputed_reference_observation}; +pub(crate) use matching::{normalize_chromosome_name, vcf_row_matches_variant}; pub use reader::{observe_vcf_snp_with_reader, observe_vcf_variant_with_reader}; #[derive(Debug, Clone)] diff --git a/rust/bioscript-formats/src/genotype/vcf/matching.rs b/rust/bioscript-formats/src/genotype/vcf/matching.rs index afecd10..a0bbddc 100644 --- a/rust/bioscript-formats/src/genotype/vcf/matching.rs +++ b/rust/bioscript-formats/src/genotype/vcf/matching.rs @@ -4,7 +4,7 @@ use crate::inspect::InferredSex; use super::ParsedVcfRow; -pub(crate) fn choose_variant_locus_for_assembly( +pub fn choose_variant_locus_for_assembly( variant: &VariantSpec, assembly: Option, ) -> Option { diff --git a/rust/bioscript-formats/src/lib.rs b/rust/bioscript-formats/src/lib.rs index b349f43..38241bf 100644 --- a/rust/bioscript-formats/src/lib.rs +++ b/rust/bioscript-formats/src/lib.rs @@ -15,8 +15,9 @@ mod prepare; pub use genotype::{ BackendCapabilities, GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, QueryKind, - imputed_reference_observation, observe_cram_indel_with_reader, observe_cram_snp_with_reader, - observe_vcf_snp_with_reader, observe_vcf_variant_with_reader, + choose_variant_locus_for_assembly, imputed_reference_observation, + observe_cram_indel_with_reader, observe_cram_snp_with_reader, observe_vcf_snp_with_reader, + observe_vcf_variant_with_reader, }; pub use inspect::{ 
DetectedKind, DetectionConfidence, FileContainer, FileInspection, InferredSex, InspectOptions, diff --git a/rust/bioscript-wasm/Cargo.toml b/rust/bioscript-wasm/Cargo.toml index 3dcee73..fc744af 100644 --- a/rust/bioscript-wasm/Cargo.toml +++ b/rust/bioscript-wasm/Cargo.toml @@ -17,7 +17,12 @@ noodles = { version = "0.109.0", features = ["bgzf", "cram", "csi", "fasta", "ta wasm-bindgen = "0.2" js-sys = "0.3" serde = { version = "1", features = ["derive"] } -serde_json = "1" +# `preserve_order` matches `bioscript-cli`'s feature set so the wasm-produced +# JSON dumps key fields in the same insertion order the CLI does. Without +# this, BTreeMap-backed alphabetical sorting moves `assay_id` after +# `analyses` in the raw `index.html` JSON dump and the byte-diff vs the CLI +# blows up. +serde_json = { version = "1", features = ["preserve_order"] } serde_yaml = "0.9" sha2 = "0.10" zip = { version = "2.2.0", default-features = false, features = ["deflate"] } diff --git a/rust/bioscript-wasm/src/report_api.rs b/rust/bioscript-wasm/src/report_api.rs index b6aaad1..33d6231 100644 --- a/rust/bioscript-wasm/src/report_api.rs +++ b/rust/bioscript-wasm/src/report_api.rs @@ -10,7 +10,9 @@ use bioscript_core::{ VariantSpec, }; use bioscript_formats::{ - GenotypeLoadOptions, GenotypeStore, InspectOptions, inspect_bytes as inspect_bytes_rs, + DetectedKind, DetectionConfidence, FileContainer, FileInspection, GenotypeLoadOptions, + GenotypeStore, InferredSex, InspectOptions, SexDetectionConfidence, SexInference, + inspect_bytes as inspect_bytes_rs, }; use bioscript_runtime::{BioscriptRuntime, RuntimeConfig}; use bioscript_schema::{ @@ -111,7 +113,7 @@ pub fn run_package_report_bytes( }; let workspace = PackageWorkspace::new(package_files)?; let participant_id = participant_id_from_name(input_name); - let assay_id = app_assay_id(Path::new(manifest_path))?; + let assay_id = app_assay_id_from_workspace(&workspace, manifest_path)?; let manifest_metadata = 
workspace.report_manifest_metadata(manifest_path)?; let findings = workspace.load_manifest_findings(manifest_path)?; let provenance = workspace.load_manifest_provenance_links(manifest_path)?; @@ -225,7 +227,7 @@ pub fn run_package_report_from_cram( }; let workspace = PackageWorkspace::new(package_files)?; let participant_id = participant_id_from_name(input_name); - let assay_id = app_assay_id(Path::new(manifest_path))?; + let assay_id = app_assay_id_from_workspace(&workspace, manifest_path)?; let manifest_metadata = workspace.report_manifest_metadata(manifest_path)?; let findings = workspace.load_manifest_findings(manifest_path)?; let provenance = workspace.load_manifest_provenance_links(manifest_path)?; @@ -361,7 +363,7 @@ pub fn run_package_report_from_vcf( }; let workspace = PackageWorkspace::new(package_files)?; let participant_id = participant_id_from_name(input_name); - let assay_id = app_assay_id(Path::new(manifest_path))?; + let assay_id = app_assay_id_from_workspace(&workspace, manifest_path)?; let manifest_metadata = workspace.report_manifest_metadata(manifest_path)?; let findings = workspace.load_manifest_findings(manifest_path)?; let provenance = workspace.load_manifest_provenance_links(manifest_path)?; @@ -388,6 +390,7 @@ pub fn run_package_report_from_vcf( let lookup = VcfReportLookup { reader: std::cell::RefCell::new(indexed), label: input_name.to_owned(), + detected_assembly: head_inspection.assembly, }; if let Some(explicit) = explicit_sex_from_options(&options) { diff --git a/rust/bioscript-wasm/src/report_helpers.rs b/rust/bioscript-wasm/src/report_helpers.rs index ca7e284..e3a12ad 100644 --- a/rust/bioscript-wasm/src/report_helpers.rs +++ b/rust/bioscript-wasm/src/report_helpers.rs @@ -122,6 +122,32 @@ pub(super) fn participant_id_from_name(path: &str) -> String { .replace([' ', '\t', '\n'], "_") } +/// Derive the assay id from a manifest path — matches the CLI's +/// `bioscript-cli::report_execution::app_assay_id`, which loads the manifest +/// 
and returns its `name:` field (panels / assays / variants all carry one). +/// This function operates on a `PackageWorkspace` so it can find files in the +/// in-memory map without touching disk. +/// +/// Previously the wasm derived the id from the manifest filename stem (e.g. +/// `manifest.yaml` -> `manifest`), which diverged from the CLI's `pgx-1` +/// (panel `name:` field) and cascaded into the HTML report's +/// `participant_id × assay_id` keys. +pub(super) fn app_assay_id_from_workspace( + workspace: &PackageWorkspace, + manifest_path: &str, +) -> Result { + match workspace.schema(manifest_path)?.as_str() { + "bioscript:panel:1.0" => Ok(workspace.load_panel(manifest_path)?.name), + "bioscript:assay:1.0" => Ok(workspace.load_assay(manifest_path)?.name), + "bioscript:variant:1.0" | "bioscript:variant" => { + Ok(workspace.load_variant(manifest_path)?.name) + } + other => Err(JsError::new(&format!( + "unsupported manifest schema '{other}'" + ))), + } +} + pub(super) fn app_assay_id(path: &Path) -> Result { path.file_stem() .and_then(|value| value.to_str()) @@ -262,20 +288,20 @@ pub(super) fn input_inspection_json( bioscript_formats::DetectedKind::ReferenceFasta => "reference_fasta", bioscript_formats::DetectedKind::Unknown => "unknown", }, - "format_confidence": match inspection.confidence { - bioscript_formats::DetectionConfidence::Authoritative => "authoritative", - bioscript_formats::DetectionConfidence::StrongHeuristic => "strong_heuristic", - bioscript_formats::DetectionConfidence::WeakHeuristic => "weak_heuristic", - bioscript_formats::DetectionConfidence::Unknown => "unknown", - }, + "format_confidence": detection_confidence_name(inspection.confidence), "assembly": inspection.assembly.map(|assembly| match assembly { Assembly::Grch37 => "grch37", Assembly::Grch38 => "grch38", }), + "phased": inspection.phased, "selected_entry": inspection.selected_entry, + "has_index": inspection.has_index, + "index_path": inspection.index_path.as_ref().map(|path| 
path.display().to_string()), + "reference_matches": inspection.reference_matches, "source": inspection.source.as_ref().map(|source| serde_json::json!({ "vendor": source.vendor, "platform_version": source.platform_version, + "confidence": detection_confidence_name(source.confidence), "evidence": source.evidence, })), "inferred_sex": inspection.inferred_sex.as_ref().map(|sex| serde_json::json!({ @@ -290,6 +316,15 @@ pub(super) fn input_inspection_json( }) } +fn detection_confidence_name(value: bioscript_formats::DetectionConfidence) -> &'static str { + match value { + bioscript_formats::DetectionConfidence::Authoritative => "authoritative", + bioscript_formats::DetectionConfidence::StrongHeuristic => "strong_heuristic", + bioscript_formats::DetectionConfidence::WeakHeuristic => "weak_heuristic", + bioscript_formats::DetectionConfidence::Unknown => "unknown", + } +} + pub(super) fn yaml_string(value: &serde_yaml::Value, key: &str) -> Option { value .get(key) diff --git a/rust/bioscript-wasm/src/report_lookup.rs b/rust/bioscript-wasm/src/report_lookup.rs index dee827b..ff0ffcd 100644 --- a/rust/bioscript-wasm/src/report_lookup.rs +++ b/rust/bioscript-wasm/src/report_lookup.rs @@ -182,12 +182,19 @@ pub(super) struct VcfReportLookup { noodles::csi::io::IndexedReader, noodles::tabix::Index>, >, pub(super) label: String, + /// Assembly resolved from the VCF header (via `inspect_head_via_js_reader`). + /// Matches the CLI's `lookup_indexed_vcf_variants` flow which calls + /// `detect_vcf_assembly_from_path`. Without this the wasm picks GRCh38 + /// over GRCh37 for any panel variant that declares both loci, then + /// misses the variant in a GRCh37-coded VCF (NA06985.clean.vcf.gz etc.) + /// and falls through to "imputed reference". 
+ pub(super) detected_assembly: Option, } impl report_workspace::VariantLookup for VcfReportLookup { fn lookup_variant(&self, spec: &VariantSpec) -> Result { let mut reader = self.reader.borrow_mut(); - observe_vcf_variant(&mut reader, &self.label, spec) + observe_vcf_variant(&mut reader, &self.label, spec, self.detected_assembly) } fn lookup_variants( @@ -197,7 +204,12 @@ impl report_workspace::VariantLookup for VcfRe let mut reader = self.reader.borrow_mut(); let mut out = Vec::with_capacity(specs.len()); for spec in specs { - out.push(observe_vcf_variant(&mut reader, &self.label, spec)?); + out.push(observe_vcf_variant( + &mut reader, + &self.label, + spec, + self.detected_assembly, + )?); } Ok(out) } @@ -210,16 +222,12 @@ fn observe_vcf_variant( >, label: &str, variant: &VariantSpec, + detected_assembly: Option, ) -> Result { - let assembly = variant - .grch38 - .as_ref() - .map(|_| Assembly::Grch38) - .or_else(|| variant.grch37.as_ref().map(|_| Assembly::Grch37)); - let raw_locus = variant - .grch38 - .as_ref() - .or(variant.grch37.as_ref()) + // Use the existing CLI helper so wasm picks the same locus the path-based + // path does: detected GRCh37 → grch37 first; detected GRCh38 → grch38 + // first; None → grch37 first (CLI default for variant-only VCFs). 
+ let raw_locus = bioscript_formats::choose_variant_locus_for_assembly(variant, detected_assembly) .ok_or_else(|| { RuntimeError::Io(format!( "variant {} has no GRCh37/GRCh38 locus", @@ -230,6 +238,15 @@ fn observe_vcf_variant( .unwrap_or("variant") )) })?; + let assembly = detected_assembly.or_else(|| { + if variant.grch37.as_ref().is_some_and(|l| l == &raw_locus) { + Some(Assembly::Grch37) + } else if variant.grch38.as_ref().is_some_and(|l| l == &raw_locus) { + Some(Assembly::Grch38) + } else { + None + } + }); let locus = GenomicLocus { chrom: raw_locus.chrom.clone(), start: raw_locus.start, diff --git a/rust/bioscript-wasm/src/report_workspace.rs b/rust/bioscript-wasm/src/report_workspace.rs index e075abd..6efd6ca 100644 --- a/rust/bioscript-wasm/src/report_workspace.rs +++ b/rust/bioscript-wasm/src/report_workspace.rs @@ -64,7 +64,7 @@ impl PackageWorkspace { .map_err(|err| JsError::new(&format!("failed to parse YAML {path}: {err}"))) } - fn schema(&self, path: &str) -> Result { + pub(super) fn schema(&self, path: &str) -> Result { self.yaml(path)? .get("schema") .and_then(serde_yaml::Value::as_str) @@ -77,17 +77,17 @@ impl PackageWorkspace { normalize_package_path(&base.join(relative).display().to_string()) } - fn load_variant(&self, path: &str) -> Result { + pub(super) fn load_variant(&self, path: &str) -> Result { load_variant_manifest_text(path, self.text(path)?) .map_err(|err| JsError::new(&format!("load variant {path}: {err}"))) } - fn load_panel(&self, path: &str) -> Result { + pub(super) fn load_panel(&self, path: &str) -> Result { load_panel_manifest_text(path, self.text(path)?) .map_err(|err| JsError::new(&format!("load panel {path}: {err}"))) } - fn load_assay(&self, path: &str) -> Result { + pub(super) fn load_assay(&self, path: &str) -> Result { load_assay_manifest_text(path, self.text(path)?) .map_err(|err| JsError::new(&format!("load assay {path}: {err}"))) }