diff --git a/noodles b/noodles index bff28f9..51b904d 160000 --- a/noodles +++ b/noodles @@ -1 +1 @@ -Subproject commit bff28f9699ab25c1cb6d3661f07ff1381c869f90 +Subproject commit 51b904dd4ec4c462755ee14266f65f554a4e6d64 diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 9041e4f..7e35e9b 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -106,6 +106,7 @@ version = "0.2.0" dependencies = [ "bioscript-core", "bioscript-formats", + "bioscript-reporting", "bioscript-runtime", "bioscript-schema", "monty", @@ -143,6 +144,17 @@ dependencies = [ "zip", ] +[[package]] +name = "bioscript-reporting" +version = "0.2.0" +dependencies = [ + "bioscript-core", + "bioscript-formats", + "bioscript-schema", + "serde_json", + "serde_yaml", +] + [[package]] name = "bioscript-runtime" version = "0.2.0" @@ -170,6 +182,7 @@ version = "0.2.0" dependencies = [ "bioscript-core", "bioscript-formats", + "bioscript-reporting", "bioscript-runtime", "bioscript-schema", "console_error_panic_hook", @@ -501,9 +514,9 @@ dependencies = [ [[package]] name = "digest" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4850db49bf08e663084f7fb5c87d202ef91a3907271aff24a94eb97ff039153c" +checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" dependencies = [ "block-buffer 0.12.0", "const-oid", @@ -710,9 +723,9 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "hybrid-array" -version = "0.4.10" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3944cf8cf766b40e2a1a333ee5e9b563f854d5fa49d6a8ca2764e97c6eddb214" +checksum = "9155a582abd142abc056962c29e3ce5ff2ad5469f4246b537ed42c5deba857da" dependencies = [ "typenum", ] @@ -1093,7 +1106,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69b6441f590336821bb897fb28fc622898ccceb1d6cea3fde5ea86b090c4de98" dependencies = [ "cfg-if", - "digest 0.11.2", + "digest 0.11.3", ] [[package]] @@ -1140,9 +1153,9 @@ dependencies = [ [[package]] name = "noodles" -version = "0.109.0" +version = "0.110.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4876a9caa25647b9ddc09883263b61fd6500ec7a4394f31daa5428026d5eb415" +checksum = "34e7ed524a472dbd0dc69cc7b63408f37552e48460028af44a3024f2eb77f036" dependencies = [ "noodles-bam", "noodles-bgzf", @@ -1157,7 +1170,7 @@ dependencies = [ [[package]] name = "noodles-bam" -version = "0.88.0" +version = "0.89.0" dependencies = [ "bstr", "indexmap", @@ -1170,23 +1183,23 @@ dependencies = [ [[package]] name = "noodles-bgzf" -version = "0.46.0" +version = "0.47.0" dependencies = [ "bytes", "crossbeam-channel", - "flate2", + "zlib-rs", ] [[package]] name = "noodles-core" -version = "0.19.0" +version = "0.20.0" dependencies = [ "bstr", ] [[package]] name = "noodles-cram" -version = "0.92.0" +version = "0.93.0" dependencies = [ "bitflags", "bstr", @@ -1204,7 +1217,7 @@ dependencies = [ [[package]] name = "noodles-csi" -version = "0.55.0" +version = "0.56.0" dependencies = [ "bit-vec 0.9.1", "bstr", @@ -1215,7 +1228,7 @@ dependencies = [ [[package]] name = "noodles-fasta" -version = "0.60.0" +version = "0.61.0" dependencies = [ "bstr", "memchr", @@ -1225,7 +1238,7 @@ dependencies = [ [[package]] name = "noodles-sam" -version = "0.84.0" +version = "0.85.0" dependencies = [ "bitflags", "bstr", @@ -1239,7 +1252,7 @@ dependencies = [ [[package]] name = "noodles-tabix" -version = "0.61.0" +version = "0.62.0" dependencies = [ "bstr", "indexmap", @@ -1250,7 +1263,7 @@ dependencies = [ [[package]] name = "noodles-vcf" -version = "0.87.0" +version = "0.88.0" dependencies = [ "indexmap", "memchr", @@ -1919,9 +1932,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "typenum" -version = "1.19.0" +version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" +checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" [[package]] name = "unicode-ident" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 10619b1..79214a0 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -5,6 +5,7 @@ members = [ "bioscript-core", "bioscript-ffi", "bioscript-formats", + "bioscript-reporting", "bioscript-runtime", "bioscript-schema", "bioscript-wasm", diff --git a/rust/bioscript-cli/Cargo.toml b/rust/bioscript-cli/Cargo.toml index 158a105..781604e 100644 --- a/rust/bioscript-cli/Cargo.toml +++ b/rust/bioscript-cli/Cargo.toml @@ -10,6 +10,7 @@ path = "src/main.rs" [dependencies] bioscript-core = { path = "../bioscript-core" } bioscript-formats = { path = "../bioscript-formats" } +bioscript-reporting = { path = "../bioscript-reporting" } bioscript-runtime = { path = "../bioscript-runtime" } bioscript-schema = { path = "../bioscript-schema" } monty = { path = "../../monty/crates/monty" } diff --git a/rust/bioscript-cli/src/main.rs b/rust/bioscript-cli/src/main.rs index c84d927..8f7a9b9 100644 --- a/rust/bioscript-cli/src/main.rs +++ b/rust/bioscript-cli/src/main.rs @@ -8,9 +8,5 @@ include!("package.rs"); include!("report_review.rs"); include!("report_execution.rs"); include!("report_observations.rs"); -include!("report_observation_json.rs"); -include!("report_findings.rs"); -include!("report_matching.rs"); include!("report_output.rs"); -include!("report_html.rs"); include!("manifest_runner.rs"); diff --git a/rust/bioscript-cli/src/manifest.rs b/rust/bioscript-cli/src/manifest.rs deleted file mode 100644 index fe6e300..0000000 --- a/rust/bioscript-cli/src/manifest.rs +++ /dev/null @@ -1,340 +0,0 @@ -use std::{collections::BTreeMap, fmt::Write as _, fs, path::Path}; - -use bioscript_formats::{GenotypeLoadOptions, GenotypeStore}; -use bioscript_schema::{ - PanelManifest, VariantManifest, load_panel_manifest, load_variant_manifest, -}; - -use crate::paths::{resolve_cli_path, resolve_cli_path_buf}; - -pub(crate) struct ManifestRunOptions<'a> { - pub(crate) input_file: Option<&'a str>, - pub(crate) output_file: Option<&'a str>, - pub(crate) participant_id: Option<&'a str>, - pub(crate) trace_report: Option<&'a Path>, - pub(crate) loader: &'a GenotypeLoadOptions, - pub(crate) filters: &'a [String], -} - -pub(crate) fn is_yaml_manifest(path: &Path) -> bool { - path.extension() - .and_then(|ext| ext.to_str()) - .is_some_and(|ext| matches!(ext, "yaml" | "yml")) -} - -pub(crate) fn run_manifest( - runtime_root: &Path, - manifest_path: &Path, - options: &ManifestRunOptions<'_>, -) -> Result<(), String> { - let schema = manifest_schema(manifest_path)?; - let resolved_input = options - .input_file - .map(|value| resolve_cli_path(runtime_root, value)); - let resolved_output = options - .output_file - .map(|value| resolve_cli_path_buf(runtime_root, Path::new(value))); - let resolved_trace = options - .trace_report - .map(|value| resolve_cli_path_buf(runtime_root, value)); - match schema.as_str() { - "bioscript:variant:1.0" | "bioscript:variant" => { - let manifest = load_variant_manifest(manifest_path)?; - let row = run_variant_manifest( - runtime_root, - &manifest, - resolved_input.as_deref(), - options.participant_id, - options.loader, - )?; - write_manifest_outputs( - std::slice::from_ref(&row), - resolved_output.as_deref(), - resolved_trace.as_deref(), - )?; - Ok(()) - } - "bioscript:panel:1.0" => { - let manifest = load_panel_manifest(manifest_path)?; - let rows = run_panel_manifest( - runtime_root, - &manifest, - resolved_input.as_deref(), - options.participant_id, - options.loader, - options.filters, - )?; - write_manifest_outputs(&rows, resolved_output.as_deref(), resolved_trace.as_deref())?; - Ok(()) - } - other => Err(format!("unsupported manifest schema '{other}'")), - } -} - -fn run_variant_manifest( - runtime_root: &Path, - manifest: &VariantManifest, - input_file: Option<&str>, - participant_id: Option<&str>, - loader: &GenotypeLoadOptions, -) -> Result, String> { - let input_file = input_file.ok_or("manifest execution requires --input-file")?; - let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) - .map_err(|err| err.to_string())?; - let observation = store - .lookup_variant(&manifest.spec) - .map_err(|err| err.to_string())?; - Ok(variant_row( - runtime_root, - &manifest.path, - &manifest.name, - &manifest.tags, - &observation, - participant_id, - )) -} - -fn run_panel_manifest( - runtime_root: &Path, - panel: &PanelManifest, - input_file: Option<&str>, - participant_id: Option<&str>, - loader: &GenotypeLoadOptions, - filters: &[String], -) -> Result>, String> { - let input_file = input_file.ok_or("manifest execution requires --input-file")?; - let store = GenotypeStore::from_file_with_options(Path::new(input_file), loader) - .map_err(|err| err.to_string())?; - let mut rows = Vec::new(); - - for member in &panel.members { - if member.kind != "variant" { - return Err(format!( - "panel member kind '{}' is not executable yet; panel execution is currently variant-only", - member.kind - )); - } - let Some(path) = &member.path else { - return Err("remote panel members are not executable yet".to_owned()); - }; - let resolved = resolve_manifest_path(runtime_root, &panel.path, path)?; - let manifest = load_variant_manifest(&resolved)?; - if !matches_filters(&manifest, &resolved, filters) { - continue; - } - let observation = store - .lookup_variant(&manifest.spec) - .map_err(|err| err.to_string())?; - rows.push(variant_row( - runtime_root, - &resolved, - &manifest.name, - &manifest.tags, - &observation, - participant_id, - )); - } - - Ok(rows) -} - -pub(crate) fn variant_row( - runtime_root: &Path, - path: &Path, - name: &str, - tags: &[String], - observation: &bioscript_core::VariantObservation, - participant_id: Option<&str>, -) -> BTreeMap { - let mut row = BTreeMap::new(); - row.insert("kind".to_owned(), "variant".to_owned()); - row.insert("name".to_owned(), name.to_owned()); - row.insert( - "path".to_owned(), - path.strip_prefix(runtime_root) - .unwrap_or(path) - .display() - .to_string(), - ); - row.insert("tags".to_owned(), tags.join(",")); - row.insert("backend".to_owned(), observation.backend.clone()); - row.insert( - "participant_id".to_owned(), - participant_id.unwrap_or_default().to_owned(), - ); - row.insert( - "matched_rsid".to_owned(), - observation.matched_rsid.clone().unwrap_or_default(), - ); - row.insert( - "assembly".to_owned(), - observation - .assembly - .map(|value| match value { - bioscript_core::Assembly::Grch37 => "grch37".to_owned(), - bioscript_core::Assembly::Grch38 => "grch38".to_owned(), - }) - .unwrap_or_default(), - ); - row.insert( - "genotype".to_owned(), - observation.genotype.clone().unwrap_or_default(), - ); - row.insert( - "ref_count".to_owned(), - observation - .ref_count - .map_or_else(String::new, |value| value.to_string()), - ); - row.insert( - "alt_count".to_owned(), - observation - .alt_count - .map_or_else(String::new, |value| value.to_string()), - ); - row.insert( - "depth".to_owned(), - observation - .depth - .map_or_else(String::new, |value| value.to_string()), - ); - row.insert( - "raw_counts".to_owned(), - serde_json::to_string(&observation.raw_counts).unwrap_or_default(), - ); - row.insert("evidence".to_owned(), observation.evidence.join(" | ")); - row -} - -fn write_manifest_outputs( - rows: &[BTreeMap], - output_file: Option<&Path>, - trace_report: Option<&Path>, -) -> Result<(), String> { - let text = render_rows_as_tsv(rows); - if let Some(output_file) = output_file { - if let Some(parent) = output_file.parent() { - fs::create_dir_all(parent).map_err(|err| { - format!("failed to create output dir {}: {err}", parent.display()) - })?; - } - fs::write(output_file, &text) - .map_err(|err| format!("failed to write output {}: {err}", output_file.display()))?; - } else { - print!("{text}"); - } - - if let Some(trace_report) = trace_report { - if let Some(parent) = trace_report.parent() { - fs::create_dir_all(parent) - .map_err(|err| format!("failed to create trace dir {}: {err}", parent.display()))?; - } - let mut trace = String::from("step\tline\tcode\n"); - for (idx, row) in rows.iter().enumerate() { - let _ = writeln!( - trace, - "{}\t{}\t{}", - idx + 1, - idx + 1, - row.get("path").cloned().unwrap_or_default() - ); - } - fs::write(trace_report, trace) - .map_err(|err| format!("failed to write trace {}: {err}", trace_report.display()))?; - } - - Ok(()) -} - -pub(crate) fn render_rows_as_tsv(rows: &[BTreeMap]) -> String { - let headers = [ - "kind", - "name", - "path", - "tags", - "participant_id", - "backend", - "matched_rsid", - "assembly", - "genotype", - "ref_count", - "alt_count", - "depth", - "evidence", - ]; - let mut out = headers.join("\t"); - out.push('\n'); - for row in rows { - let line = headers - .iter() - .map(|header| { - row.get(*header) - .cloned() - .unwrap_or_default() - .replace('\t', " ") - }) - .collect::>() - .join("\t"); - out.push_str(&line); - out.push('\n'); - } - out -} - -pub(crate) fn matches_filters(manifest: &VariantManifest, path: &Path, filters: &[String]) -> bool { - filters.iter().all(|filter| match filter.split_once('=') { - Some(("kind", value)) => value == "variant", - Some(("name", value)) => manifest.name.contains(value), - Some(("path", value)) => path.display().to_string().contains(value), - Some(("tag", value)) => manifest.tags.iter().any(|tag| tag == value), - Some(_) | None => false, - }) -} - -pub(crate) fn resolve_manifest_path( - runtime_root: &Path, - manifest_path: &Path, - relative: &str, -) -> Result { - let base_dir = manifest_path - .parent() - .ok_or_else(|| format!("manifest has no parent: {}", manifest_path.display()))?; - let joined = base_dir.join(relative); - let canonical_root = runtime_root - .canonicalize() - .map_err(|err| format!("failed to resolve root {}: {err}", runtime_root.display()))?; - let canonical_base = base_dir.canonicalize().map_err(|err| { - format!( - "failed to resolve manifest dir {}: {err}", - base_dir.display() - ) - })?; - let canonical_joined = joined - .canonicalize() - .map_err(|err| format!("failed to resolve {}: {err}", joined.display()))?; - let boundary = if canonical_base.starts_with(&canonical_root) { - &canonical_root - } else { - &canonical_base - }; - if !canonical_joined.starts_with(boundary) { - return Err(format!( - "manifest member path escapes bioscript root: {}", - canonical_joined.display() - )); - } - Ok(canonical_joined) -} - -pub(crate) fn manifest_schema(path: &Path) -> Result { - let text = fs::read_to_string(path) - .map_err(|err| format!("failed to read {}: {err}", path.display()))?; - let value: serde_yaml::Value = serde_yaml::from_str(&text) - .map_err(|err| format!("failed to parse YAML {}: {err}", path.display()))?; - value - .as_mapping() - .and_then(|mapping| mapping.get(serde_yaml::Value::String("schema".to_owned()))) - .and_then(serde_yaml::Value::as_str) - .map(ToOwned::to_owned) - .ok_or_else(|| format!("{} is missing schema", path.display())) -} diff --git a/rust/bioscript-cli/src/manifest_runner.rs b/rust/bioscript-cli/src/manifest_runner.rs index e691d08..eddac35 100644 --- a/rust/bioscript-cli/src/manifest_runner.rs +++ b/rust/bioscript-cli/src/manifest_runner.rs @@ -12,7 +12,11 @@ fn run_manifest( manifest_path: &Path, options: &ManifestRunOptions<'_>, ) -> Result<(), String> { - let schema = manifest_schema(manifest_path)?; + let workspace = bioscript_reporting::FilesystemManifestWorkspace::new(runtime_root); + let schema = bioscript_reporting::report_manifest_schema( + &workspace, + &manifest_path.display().to_string(), + )?; let resolved_input = options .input_file .map(|value| resolve_cli_path(runtime_root, value)); @@ -22,8 +26,8 @@ fn run_manifest( let resolved_trace = options .trace_report .map(|value| resolve_cli_path_buf(runtime_root, value)); - match schema.as_str() { - "bioscript:variant:1.0" | "bioscript:variant" => { + match bioscript_reporting::report_manifest_kind(&schema)? { + bioscript_reporting::ReportManifestKind::Variant => { let manifest = load_variant_manifest(manifest_path)?; let row = run_variant_manifest( runtime_root, @@ -39,7 +43,7 @@ fn run_manifest( )?; Ok(()) } - "bioscript:panel:1.0" => { + bioscript_reporting::ReportManifestKind::Panel => { let manifest = load_panel_manifest(manifest_path)?; let rows = run_panel_manifest( runtime_root, @@ -52,7 +56,7 @@ fn run_manifest( write_manifest_outputs(&rows, resolved_output.as_deref(), resolved_trace.as_deref())?; Ok(()) } - "bioscript:assay:1.0" => { + bioscript_reporting::ReportManifestKind::Assay => { let manifest = load_assay_manifest(manifest_path)?; let rows = run_assay_manifest( runtime_root, @@ -65,7 +69,6 @@ fn run_manifest( write_manifest_outputs(&rows, resolved_output.as_deref(), resolved_trace.as_deref())?; Ok(()) } - other => Err(format!("unsupported manifest schema '{other}'")), } } @@ -126,30 +129,26 @@ fn run_panel_manifest_with_store( let mut variant_entries = Vec::new(); for (member_index, member) in panel.members.iter().enumerate() { - let Some(path) = &member.path else { - return Err("remote panel members are not executable yet".to_owned()); - }; - let resolved = resolve_manifest_path(runtime_root, &panel.path, path)?; - if member.kind == "variant" { - let manifest = load_variant_manifest(&resolved)?; - if !matches_filters(&manifest, &resolved, filters) { - continue; + match bioscript_reporting::panel_executable_member(&member.kind, member.path.as_deref())? { + bioscript_reporting::ExecutablePanelMember::Variant(path) => { + let resolved = resolve_manifest_path(runtime_root, &panel.path, path)?; + let manifest = load_variant_manifest(&resolved)?; + if !matches_filters(&manifest, &resolved, filters) { + continue; + } + variant_entries.push((member_index, resolved, manifest)); + } + bioscript_reporting::ExecutablePanelMember::Assay(path) => { + let resolved = resolve_manifest_path(runtime_root, &panel.path, path)?; + let assay = load_assay_manifest(&resolved)?; + rows_by_member[member_index] = run_assay_manifest_with_store( + runtime_root, + &assay, + store, + participant_id, + filters, + )?; } - variant_entries.push((member_index, resolved, manifest)); - } else if member.kind == "assay" { - let assay = load_assay_manifest(&resolved)?; - rows_by_member[member_index] = run_assay_manifest_with_store( - runtime_root, - &assay, - store, - participant_id, - filters, - )?; - } else { - return Err(format!( - "panel member kind '{}' is not executable", - member.kind - )); } } @@ -203,21 +202,16 @@ fn run_assay_manifest_with_store( let mut entries = Vec::new(); for member in &assay.members { - if member.kind != "variant" { - return Err(format!( - "assay member kind '{}' is not executable", - member.kind - )); - } - let Some(path) = &member.path else { - return Err("remote assay members are not executable yet".to_owned()); - }; - let resolved = resolve_manifest_path(runtime_root, &assay.path, path)?; - let manifest = load_variant_manifest(&resolved)?; - if !matches_filters(&manifest, &resolved, filters) { - continue; + match bioscript_reporting::assay_executable_member(&member.kind, member.path.as_deref())? { + bioscript_reporting::ExecutableAssayMember::Variant(path) => { + let resolved = resolve_manifest_path(runtime_root, &assay.path, path)?; + let manifest = load_variant_manifest(&resolved)?; + if !matches_filters(&manifest, &resolved, filters) { + continue; + } + entries.push((resolved, manifest)); + } } - entries.push((resolved, manifest)); } let observations = store @@ -255,64 +249,18 @@ fn variant_row( observation: &bioscript_core::VariantObservation, participant_id: Option<&str>, ) -> BTreeMap { - let mut row = BTreeMap::new(); - row.insert("kind".to_owned(), "variant".to_owned()); - row.insert("name".to_owned(), name.to_owned()); - row.insert( - "path".to_owned(), - path.strip_prefix(runtime_root) - .unwrap_or(path) - .display() - .to_string(), - ); - row.insert("tags".to_owned(), tags.join(",")); - row.insert("backend".to_owned(), observation.backend.clone()); - row.insert( - "participant_id".to_owned(), - participant_id.unwrap_or_default().to_owned(), - ); - row.insert( - "matched_rsid".to_owned(), - observation.matched_rsid.clone().unwrap_or_default(), - ); - row.insert( - "assembly".to_owned(), - observation - .assembly - .map(|value| match value { - bioscript_core::Assembly::Grch37 => "grch37".to_owned(), - bioscript_core::Assembly::Grch38 => "grch38".to_owned(), - }) - .unwrap_or_default(), - ); - row.insert( - "genotype".to_owned(), - observation.genotype.clone().unwrap_or_default(), - ); - row.insert( - "ref_count".to_owned(), - observation - .ref_count - .map_or_else(String::new, |value| value.to_string()), - ); - row.insert( - "alt_count".to_owned(), - observation - .alt_count - .map_or_else(String::new, |value| value.to_string()), - ); - row.insert( - "depth".to_owned(), - observation - .depth - .map_or_else(String::new, |value| value.to_string()), - ); - row.insert( - "raw_counts".to_owned(), - serde_json::to_string(&observation.raw_counts).unwrap_or_default(), - ); - row.insert("evidence".to_owned(), observation.evidence.join(" | ")); - row + let row_path = path + .strip_prefix(runtime_root) + .unwrap_or(path) + .display() + .to_string(); + bioscript_reporting::variant_row( + &row_path, + name, + tags, + observation, + participant_id.unwrap_or_default(), + ) } fn write_manifest_outputs( @@ -320,7 +268,7 @@ fn write_manifest_outputs( output_file: Option<&Path>, trace_report: Option<&Path>, ) -> Result<(), String> { - let text = render_rows_as_tsv(rows); + let text = bioscript_reporting::render_manifest_rows_tsv(rows); if let Some(output_file) = output_file { if let Some(parent) = output_file.parent() { fs::create_dir_all(parent).map_err(|err| { @@ -338,16 +286,7 @@ fn write_manifest_outputs( fs::create_dir_all(parent) .map_err(|err| format!("failed to create trace dir {}: {err}", parent.display()))?; } - let mut trace = String::from("step\tline\tcode\n"); - for (idx, row) in rows.iter().enumerate() { - let _ = writeln!( - trace, - "{}\t{}\t{}", - idx + 1, - idx + 1, - row.get("path").cloned().unwrap_or_default() - ); - } + let trace = bioscript_reporting::render_manifest_trace_tsv(rows); fs::write(trace_report, trace) .map_err(|err| format!("failed to write trace {}: {err}", trace_report.display()))?; } @@ -369,49 +308,12 @@ fn resolve_cli_path_buf(root: &Path, value: &Path) -> PathBuf { } } -fn render_rows_as_tsv(rows: &[BTreeMap]) -> String { - let headers = [ - "kind", - "name", - "path", - "tags", - "participant_id", - "backend", - "matched_rsid", - "assembly", - "genotype", - "ref_count", - "alt_count", - "depth", - "evidence", - ]; - let mut out = headers.join("\t"); - out.push('\n'); - for row in rows { - let line = headers - .iter() - .map(|header| { - row.get(*header) - .cloned() - .unwrap_or_default() - .replace('\t', " ") - }) - .collect::>() - .join("\t"); - out.push_str(&line); - out.push('\n'); - } - out -} - fn matches_filters(manifest: &VariantManifest, path: &Path, filters: &[String]) -> bool { - filters.iter().all(|filter| match filter.split_once('=') { - Some(("kind", value)) => value == "variant", - Some(("name", value)) => manifest.name.contains(value), - Some(("path", value)) => path.display().to_string().contains(value), - Some(("tag", value)) => manifest.tags.iter().any(|tag| tag == value), - Some(_) | None => false, - }) + bioscript_reporting::matches_variant_manifest_filters( + manifest, + &path.display().to_string(), + filters, + ) } fn resolve_manifest_path( @@ -419,47 +321,13 @@ fn resolve_manifest_path( manifest_path: &Path, relative: &str, ) -> Result { - let base_dir = manifest_path - .parent() - .ok_or_else(|| format!("manifest has no parent: {}", manifest_path.display()))?; - let joined = base_dir.join(relative); - let canonical_root = runtime_root - .canonicalize() - .map_err(|err| format!("failed to resolve root {}: {err}", runtime_root.display()))?; - let canonical_base = base_dir.canonicalize().map_err(|err| { - format!( - "failed to resolve manifest dir {}: {err}", - base_dir.display() - ) - })?; - let canonical_joined = joined - .canonicalize() - .map_err(|err| format!("failed to resolve {}: {err}", joined.display()))?; - let boundary = if canonical_base.starts_with(&canonical_root) { - &canonical_root - } else { - &canonical_base - }; - if !canonical_joined.starts_with(boundary) { - return Err(format!( - "manifest member path escapes bioscript root: {}", - canonical_joined.display() - )); - } - Ok(canonical_joined) + bioscript_reporting::resolve_filesystem_manifest_path(runtime_root, manifest_path, relative) } fn manifest_schema(path: &Path) -> Result { - let text = fs::read_to_string(path) - .map_err(|err| format!("failed to read {}: {err}", path.display()))?; - let value: serde_yaml::Value = serde_yaml::from_str(&text) - .map_err(|err| format!("failed to parse YAML {}: {err}", path.display()))?; - value - .as_mapping() - .and_then(|mapping| mapping.get(serde_yaml::Value::String("schema".to_owned()))) - .and_then(serde_yaml::Value::as_str) - .map(ToOwned::to_owned) - .ok_or_else(|| format!("{} is missing schema", path.display())) + let root = path.parent().unwrap_or_else(|| Path::new(".")); + let workspace = bioscript_reporting::FilesystemManifestWorkspace::new(root); + bioscript_reporting::report_manifest_schema(&workspace, &path.display().to_string()) } fn normalize_loader_paths(root: &Path, loader: &mut GenotypeLoadOptions) { diff --git a/rust/bioscript-cli/src/report_execution.rs b/rust/bioscript-cli/src/report_execution.rs index b40578f..227b257 100644 --- a/rust/bioscript-cli/src/report_execution.rs +++ b/rust/bioscript-cli/src/report_execution.rs @@ -7,41 +7,35 @@ fn run_manifest_rows_for_report( filters: &[String], ) -> Result>, String> { let input_text = input_file.display().to_string(); - match manifest_schema(manifest_path)?.as_str() { - "bioscript:variant:1.0" | "bioscript:variant" => { - let manifest = load_variant_manifest(manifest_path)?; - Ok(vec![run_variant_manifest( - runtime_root, - &manifest, - Some(&input_text), - Some(participant_id), - loader, - )?]) - } - "bioscript:panel:1.0" => { - let manifest = load_panel_manifest(manifest_path)?; - run_panel_manifest( - runtime_root, - &manifest, - Some(&input_text), - Some(participant_id), - loader, - filters, - ) - } - "bioscript:assay:1.0" => { - let manifest = load_assay_manifest(manifest_path)?; - run_assay_manifest( + let store = GenotypeStore::from_file_with_options(Path::new(&input_text), loader) + .map_err(|err| err.to_string())?; + let workspace = bioscript_reporting::FilesystemManifestWorkspace::new(runtime_root); + let manifest_path_text = manifest_path.display().to_string(); + let tasks = + bioscript_reporting::collect_variant_manifest_tasks(&workspace, &manifest_path_text, filters)?; + let observations = store + .lookup_variants( + &tasks + .iter() + .map(|task| task.manifest.spec.clone()) + .collect::>(), + ) + .map_err(|err| err.to_string())?; + Ok(tasks + .into_iter() + .zip(observations) + .map(|(task, observation)| { + let resolved = Path::new(&task.manifest_path); + variant_row( runtime_root, - &manifest, - Some(&input_text), + resolved, + &task.manifest.name, + &task.manifest.tags, + &observation, Some(participant_id), - loader, - filters, ) - } - other => Err(format!("unsupported manifest schema '{other}'")), - } + }) + .collect()) } struct ReportAnalysisOptions<'a> { @@ -50,6 +44,7 @@ struct ReportAnalysisOptions<'a> { participant_id: &'a str, loader: &'a GenotypeLoadOptions, output_dir: &'a Path, + observation_rows: &'a [BTreeMap], filters: &'a [String], max_duration_ms: u64, } @@ -58,53 +53,20 @@ fn run_manifest_analyses_for_report( manifest_path: &Path, options: &ReportAnalysisOptions<'_>, ) -> Result, String> { - match manifest_schema(manifest_path)?.as_str() { - "bioscript:panel:1.0" => { - let manifest = load_panel_manifest(manifest_path)?; - let mut analyses = Vec::new(); - if options.filters.is_empty() { - analyses.extend(run_interpretations_for_report( - &manifest.path, - &manifest.name, - &manifest.interpretations, - options, - )?); - } - for member in &manifest.members { - if member.kind != "assay" { - continue; - } - let Some(path) = &member.path else { - continue; - }; - let resolved = - resolve_manifest_path(options.runtime_root, &manifest.path, path)?; - if !analysis_path_matches_filters(&resolved, options.filters) { - continue; - } - analyses.extend(run_manifest_analyses_for_report(&resolved, options)?); - } - Ok(analyses) - } - "bioscript:assay:1.0" => { - let manifest = load_assay_manifest(manifest_path)?; - run_interpretations_for_report( - &manifest.path, - &manifest.name, - &manifest.interpretations, - options, - ) - } - "bioscript:variant:1.0" | "bioscript:variant" => Ok(Vec::new()), - other => Err(format!("unsupported manifest schema '{other}'")), + let workspace = bioscript_reporting::FilesystemManifestWorkspace::new(options.runtime_root); + let manifest_path_text = manifest_path.display().to_string(); + let mut analyses = Vec::new(); + for task in + bioscript_reporting::collect_analysis_manifest_tasks(&workspace, &manifest_path_text, options.filters)? + { + analyses.extend(run_interpretations_for_report( + Path::new(&task.manifest_path), + &task.manifest_name, + &task.interpretations, + options, + )?); } -} - -fn analysis_path_matches_filters(path: &Path, filters: &[String]) -> bool { - filters.iter().all(|filter| match filter.split_once('=') { - Some(("path", value)) => path.display().to_string().contains(value), - _ => false, - }) + Ok(analyses) } fn run_interpretations_for_report( @@ -115,19 +77,11 @@ fn run_interpretations_for_report( ) -> Result, String> { let mut outputs = Vec::new(); for interpretation in interpretations { - if interpretation.kind != "bioscript" { - return Err(format!( - "analysis '{}' uses unsupported kind '{}'", - interpretation.id, interpretation.kind - )); - } + bioscript_reporting::validate_bioscript_interpretation(interpretation)?; let script_path = resolve_manifest_path(options.runtime_root, manifest_path, &interpretation.path)?; - let format = interpretation - .output_format - .as_deref() - .unwrap_or("json") - .to_ascii_lowercase(); + let analysis_format = + bioscript_reporting::analysis_output_format(interpretation.output_format.as_deref())?; let analysis_dir = options.output_dir.join("analysis").join(options.participant_id); fs::create_dir_all(&analysis_dir).map_err(|err| { format!( @@ -135,96 +89,134 @@ fn run_interpretations_for_report( analysis_dir.display() ) })?; - let extension = match format.as_str() { - "tsv" => "tsv", - "json" => "json", - "jsonl" => "jsonl", - other => return Err(format!("unsupported analysis output_format '{other}'")), - }; - let output_file = analysis_dir.join(format!("{}.{}", interpretation.id, extension)); - run_bioscript_analysis_script( - options.runtime_root, - &script_path, - options.input_file, - &output_file, - options.participant_id, - options.loader, - options.max_duration_ms, - )?; - let (rows, row_headers) = parse_analysis_output(&output_file, &format)?; - outputs.push(serde_json::json!({ - "schema": "bioscript:analysis-output:1.0", - "version": "1.0", - "participant_id": options.participant_id, - "assay_id": manifest_name, - "analysis_id": interpretation.id, - "analysis_label": interpretation.label.clone(), - "kind": interpretation.kind, - "output_format": format, - "manifest_path": manifest_path.strip_prefix(options.runtime_root).unwrap_or(manifest_path).display().to_string(), - "script_path": script_path.strip_prefix(options.runtime_root).unwrap_or(&script_path).display().to_string(), - "output_file": output_file.strip_prefix(options.runtime_root).unwrap_or(&output_file).display().to_string(), - "derived_from": interpretation.derived_from.clone(), - "emits": interpretation.emits.iter().map(|emit| serde_json::json!({ - "key": emit.key.clone(), - "label": emit.label.clone(), - "value_type": emit.value_type.clone(), - "format": emit.format.clone(), - })).collect::>(), - "logic": interpretation.logic.as_ref().map(|logic| serde_json::json!({ - "description": logic.description.clone(), - "source": logic.source.as_ref().map(|source| serde_json::json!({ - "name": source.name.clone(), - "url": source.url.clone(), - })), - })), - "row_headers": row_headers, - "rows": rows, - })); + let output_file = options.output_dir.join( + bioscript_reporting::analysis_output_relative_file( + options.participant_id, + &interpretation.id, + analysis_format.extension, + ), + ); + let observations_file = options.output_dir.join( + bioscript_reporting::analysis_observations_relative_file( + options.participant_id, + &interpretation.id, + ), + ); + fs::write( + &observations_file, + bioscript_reporting::render_manifest_rows_tsv(options.observation_rows), + ) + .map_err(|err| { + format!( + "failed to write analysis observations {}: {err}", + observations_file.display() + ) + })?; + run_bioscript_analysis_script(&BioscriptAnalysisScriptInput { + runtime_root: options.runtime_root, + script_path: &script_path, + input_file: options.input_file, + output_file: &output_file, + observations_file: &observations_file, + participant_id: options.participant_id, + loader: options.loader, + analysis_max_duration_ms: options.max_duration_ms, + })?; + let (rows, row_headers) = parse_analysis_output(&output_file, analysis_format.format)?; + let manifest_path_text = manifest_path + .strip_prefix(options.runtime_root) + .unwrap_or(manifest_path) + .display() + .to_string(); + let script_path_text = script_path + .strip_prefix(options.runtime_root) + .unwrap_or(&script_path) + .display() + .to_string(); + let output_file_text = output_file + .strip_prefix(options.runtime_root) + .unwrap_or(&output_file) + .display() + .to_string(); + let observations_file_text = observations_file + .strip_prefix(options.runtime_root) + .unwrap_or(&observations_file) + .display() + .to_string(); + outputs.push(bioscript_reporting::analysis_output_json( + bioscript_reporting::AnalysisOutputJsonInput { + participant_id: options.participant_id, + assay_id: manifest_name, + interpretation, + output_format: analysis_format.format, + manifest_path: &manifest_path_text, + script_path: &script_path_text, + output_file: &output_file_text, + observations_file: Some(&observations_file_text), + row_headers, + rows, + }, + )); } Ok(outputs) } -fn run_bioscript_analysis_script( - runtime_root: &Path, - script_path: &Path, - input_file: &Path, - output_file: &Path, - participant_id: &str, - loader: &GenotypeLoadOptions, +struct BioscriptAnalysisScriptInput<'a> { + runtime_root: &'a Path, + script_path: &'a Path, + input_file: &'a Path, + output_file: &'a Path, + observations_file: &'a Path, + participant_id: &'a str, + loader: &'a GenotypeLoadOptions, analysis_max_duration_ms: u64, -) -> Result<(), String> { +} + +fn run_bioscript_analysis_script(input: &BioscriptAnalysisScriptInput<'_>) -> Result<(), String> { let limits = ResourceLimits::new() - .max_duration(Duration::from_millis(analysis_max_duration_ms)) + .max_duration(Duration::from_millis(input.analysis_max_duration_ms)) .max_memory(16 * 1024 * 1024) .max_allocations(400_000) .gc_interval(1000) .max_recursion_depth(Some(200)); let runtime = BioscriptRuntime::with_config( - runtime_root.to_path_buf(), + input.runtime_root.to_path_buf(), RuntimeConfig { limits, - loader: loader.clone(), + loader: input.loader.clone(), ..RuntimeConfig::default() }, ) .map_err(|err| err.to_string())?; runtime .run_file( - script_path, + input.script_path, None, vec![ ( "input_file", - monty::MontyObject::String(runtime_path_string(runtime_root, input_file)), + monty::MontyObject::String(runtime_path_string( + input.runtime_root, + input.input_file, + )), ), ( "output_file", - monty::MontyObject::String(runtime_path_string(runtime_root, output_file)), + monty::MontyObject::String(runtime_path_string( + input.runtime_root, + input.output_file, + )), + ), + ( + "observations_file", + monty::MontyObject::String(runtime_path_string( + input.runtime_root, + input.observations_file, + )), ), ( "participant_id", - monty::MontyObject::String(participant_id.to_owned()), + monty::MontyObject::String(input.participant_id.to_owned()), ), ], ) @@ -245,93 +237,10 @@ fn parse_analysis_output( ) -> Result<(Vec, Vec), String> { let text = fs::read_to_string(path) .map_err(|err| format!("failed to read analysis output {}: {err}", path.display()))?; - match format { - "tsv" => Ok(parse_analysis_tsv(&text)), - "json" => { - let value: serde_json::Value = serde_json::from_str(&text).map_err(|err| { - format!("failed to parse analysis JSON {}: {err}", path.display()) - })?; - let rows = match value { - serde_json::Value::Array(rows) => rows, - serde_json::Value::Object(mut object) => object - .remove("rows") - .and_then(|rows| rows.as_array().cloned()) - .unwrap_or_else(|| vec![serde_json::Value::Object(object)]), - other => vec![other], - }; - let headers = analysis_headers_from_rows(&rows); - Ok((rows, headers)) - } - "jsonl" => { - let rows = text - .lines() - .filter(|line| !line.trim().is_empty()) - .map(|line| serde_json::from_str(line).map_err(|err| err.to_string())) - .collect::, _>>()?; - let headers = analysis_headers_from_rows(&rows); - Ok((rows, headers)) - } - other => Err(format!("unsupported analysis output_format '{other}'")), - } -} - -fn parse_analysis_tsv(text: &str) -> (Vec, Vec) { - let mut lines = text.lines().filter(|line| !line.trim().is_empty()); - let Some(header_line) = lines.next() else { - return (Vec::new(), Vec::new()); - }; - let headers: Vec<&str> = header_line.split('\t').collect(); - let mut rows = Vec::new(); - for line in lines { - let values: Vec<&str> = line.split('\t').collect(); - let mut object = serde_json::Map::new(); - for (idx, header) in headers.iter().enumerate() { - object.insert( - (*header).to_owned(), - serde_json::Value::String(values.get(idx).copied().unwrap_or_default().to_owned()), - ); - } - rows.push(serde_json::Value::Object(object)); - } - (rows, headers.iter().map(|header| (*header).to_owned()).collect()) -} - -fn analysis_headers_from_rows(rows: &[serde_json::Value]) -> Vec { - let mut headers = Vec::new(); - for row in rows { - let Some(object) = row.as_object() else { - continue; - }; - for key in object.keys() { - if !headers.contains(key) { - headers.push(key.clone()); - } - } - } - headers -} - -fn app_assay_id(path: &Path) -> Result { - match manifest_schema(path)?.as_str() { - "bioscript:panel:1.0" => Ok(load_panel_manifest(path)?.name), - "bioscript:assay:1.0" => Ok(load_assay_manifest(path)?.name), - "bioscript:variant:1.0" | "bioscript:variant" => Ok(load_variant_manifest(path)?.name), - other => Err(format!("unsupported manifest schema '{other}'")), - } + bioscript_reporting::parse_analysis_output_text(&text, format) + .map_err(|err| format!("failed to parse analysis output {}: {err}", path.display())) } fn participant_id_from_path(path: &Path) -> String { - let file_name = path - .file_name() - .and_then(|value| value.to_str()) - .unwrap_or("participant"); - file_name - .trim_end_matches(".txt.zip") - .trim_end_matches(".csv.zip") - .trim_end_matches(".vcf.gz") - .trim_end_matches(".cram") - .trim_end_matches(".zip") - .trim_end_matches(".txt") - .trim_end_matches(".csv") - .to_owned() + bioscript_reporting::participant_id_from_path(path) } diff --git a/rust/bioscript-cli/src/report_findings.rs b/rust/bioscript-cli/src/report_findings.rs deleted file mode 100644 index c54e835..0000000 --- a/rust/bioscript-cli/src/report_findings.rs +++ /dev/null @@ -1,361 +0,0 @@ -fn load_manifest_findings( - root: &Path, - manifest_path: &Path, -) -> Result, String> { - let value = load_yaml_value(manifest_path)?; - let schema = value - .get("schema") - .and_then(serde_yaml::Value::as_str) - .unwrap_or_default(); - let mut findings = Vec::new(); - - if matches!( - schema, - "bioscript:variant:1.0" - | "bioscript:variant" - | "bioscript:assay:1.0" - | "bioscript:panel:1.0" - | "bioscript:pgx-findings:1.0" - ) && let Some(items) = value - .get("findings") - .and_then(serde_yaml::Value::as_sequence) - { - for item in items { - let json_item = yaml_to_json(item.clone())?; - let include = json_item - .get("include") - .and_then(serde_json::Value::as_str) - .map(str::to_owned); - if let Some(include) = include { - let include_path = resolve_manifest_path(root, manifest_path, &include)?; - let mut included = load_manifest_findings(root, &include_path)?; - let inherited_binding = json_item.get("binding").cloned(); - for included_item in &mut included { - if inherited_binding.is_some() - && included_item.get("binding").is_none() - && included_item.get("effects").is_none() - && let Some(object) = included_item.as_object_mut() - { - object.insert( - "binding".to_owned(), - inherited_binding.clone().unwrap_or(serde_json::Value::Null), - ); - } - } - findings.extend(included); - continue; - } - if json_item.get("include").is_none() { - findings.push(json_item); - } - } - } - - if matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") - && let Some(items) = value - .get("members") - .and_then(serde_yaml::Value::as_sequence) - { - for member in items { - let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { - continue; - }; - if !matches!(kind, "variant" | "assay") { - continue; - } - let Some(path) = member.get("path").and_then(serde_yaml::Value::as_str) else { - continue; - }; - let member_path = resolve_manifest_path(root, manifest_path, path)?; - findings.extend(load_manifest_findings(root, &member_path)?); - } - } - - Ok(findings) -} - -fn load_yaml_value(path: &Path) -> Result { - let text = fs::read_to_string(path) - .map_err(|err| format!("failed to read YAML {}: {err}", path.display()))?; - serde_yaml::from_str(&text) - .map_err(|err| format!("failed to parse YAML {}: {err}", path.display())) -} - -fn yaml_to_json(value: serde_yaml::Value) -> Result { - serde_json::to_value(value).map_err(|err| format!("failed to convert YAML to JSON: {err}")) -} - -fn load_manifest_provenance_links( - root: &Path, - manifest_path: &Path, -) -> Result, String> { - let value = load_yaml_value(manifest_path)?; - let schema = value - .get("schema") - .and_then(serde_yaml::Value::as_str) - .unwrap_or_default(); - let mut links = BTreeMap::::new(); - collect_manifest_provenance_entries(&value, &mut links)?; - - if matches!( - schema, - "bioscript:variant:1.0" - | "bioscript:variant" - | "bioscript:assay:1.0" - | "bioscript:panel:1.0" - | "bioscript:pgx-findings:1.0" - ) && let Some(items) = value - .get("findings") - .and_then(serde_yaml::Value::as_sequence) - { - for item in items { - let json_item = yaml_to_json(item.clone())?; - let Some(include) = json_item.get("include").and_then(serde_json::Value::as_str) else { - continue; - }; - let include_path = resolve_manifest_path(root, manifest_path, include)?; - for item in load_manifest_provenance_links(root, &include_path)? { - if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) { - links.entry(url.to_owned()).or_insert(item); - } - } - } - } - - if matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") - && let Some(items) = value - .get("members") - .and_then(serde_yaml::Value::as_sequence) - { - for member in items { - let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { - continue; - }; - if !matches!(kind, "variant" | "assay") { - continue; - } - let Some(path) = member.get("path").and_then(serde_yaml::Value::as_str) else { - continue; - }; - let member_path = resolve_manifest_path(root, manifest_path, path)?; - for item in load_manifest_provenance_links(root, &member_path)? { - if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) { - links.entry(url.to_owned()).or_insert(item); - } - } - } - } - - Ok(links.into_values().collect()) -} - -fn collect_manifest_provenance_entries( - value: &serde_yaml::Value, - links: &mut BTreeMap, -) -> Result<(), String> { - if let Some(sources) = value - .get("provenance") - .and_then(|provenance| provenance.get("sources")) - .and_then(serde_yaml::Value::as_sequence) - { - for source in sources { - let json = yaml_to_json(source.clone())?; - if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) { - links.entry(url.to_owned()).or_insert(json); - } - } - } - if let Some(source) = value.get("source") { - let json = yaml_to_json(source.clone())?; - if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) { - links.entry(url.to_owned()).or_insert(json); - } - } - Ok(()) -} - -fn match_app_findings( - findings: &[serde_json::Value], - observations: &[serde_json::Value], - analyses: &[serde_json::Value], -) -> Vec { - let mut matched = Vec::new(); - let mut seen = std::collections::BTreeSet::new(); - for finding in findings { - if let Some(effects) = finding.get("effects").and_then(serde_json::Value::as_array) { - for effect in effects { - if let Some(observation) = app_finding_match_observation(effect, observations) { - let mut item = finding.clone(); - if let Some(object) = item.as_object_mut() { - object.remove("effects"); - object.insert("matched".to_owned(), serde_json::Value::Bool(true)); - object.insert("matched_effect".to_owned(), effect.clone()); - object.insert( - "matched_observation".to_owned(), - app_finding_observation_context(observation), - ); - } - let key = app_finding_dedupe_key(&item); - if seen.insert(key) { - matched.push(item); - } - } else if let Some(analysis) = app_finding_match_analysis(effect, analyses) { - let mut item = finding.clone(); - if let Some(object) = item.as_object_mut() { - object.remove("effects"); - object.insert("matched".to_owned(), serde_json::Value::Bool(true)); - object.insert("matched_effect".to_owned(), effect.clone()); - object.insert("matched_analysis".to_owned(), analysis); - } - let key = app_finding_dedupe_key(&item); - if seen.insert(key) { - matched.push(item); - } - } - } - } else if let Some(observation) = app_finding_match_observation(finding, observations) { - let mut item = finding.clone(); - if let Some(object) = item.as_object_mut() { - object.insert("matched".to_owned(), serde_json::Value::Bool(true)); - object.insert( - "matched_observation".to_owned(), - app_finding_observation_context(observation), - ); - } - let key = app_finding_dedupe_key(&item); - if seen.insert(key) { - matched.push(item); - } - } else if let Some(analysis) = app_finding_match_analysis(finding, analyses) { - let mut item = finding.clone(); - if let Some(object) = item.as_object_mut() { - object.insert("matched".to_owned(), serde_json::Value::Bool(true)); - object.insert("matched_analysis".to_owned(), analysis); - } - let key = app_finding_dedupe_key(&item); - if seen.insert(key) { - matched.push(item); - } - } - } - matched -} - -#[cfg(test)] -mod report_observations_tests { - use super::*; - - #[test] - fn raw_counts_can_fill_display_for_homozygous_and_heterozygous_observations() { - assert_eq!( - genotype_display_from_raw_counts(r#"{"T": 24}"#).as_deref(), - Some("TT") - ); - assert_eq!( - genotype_display_from_raw_counts(r#"{"C": 12, "T": 10}"#).as_deref(), - Some("CT") - ); - } - - #[test] - fn non_reportable_alleles_are_classified_as_observed_or_unknown() { - assert_eq!( - classify_non_reportable_alleles("TT", "C", "G", &["T".to_owned()]), - Some("observed_alt") - ); - assert_eq!( - classify_non_reportable_alleles("AT", "C", "G", &["T".to_owned()]), - Some("unknown_alt") - ); - assert_eq!( - classify_non_reportable_alleles("CG", "C", "G", &["T".to_owned()]), - None - ); - } - - #[test] - fn deletion_copy_number_calls_are_normalized_from_insertion_deletion_tokens() { - assert_eq!( - normalize_app_genotype( - "DI", - "TTATAA", - "", - Some(bioscript_core::VariantKind::Deletion), - "22", - None - ), - ("0/1".to_owned(), "het".to_owned()) - ); - } - - #[test] - fn cram_long_deletion_copy_number_calls_are_displayed_as_insertion_deletion_tokens() { - let mut row = BTreeMap::new(); - row.insert("backend".to_owned(), "cram".to_owned()); - let manifest = bioscript_schema::VariantManifest { - path: std::path::PathBuf::new(), - name: "apol1_g2".to_owned(), - tags: Vec::new(), - spec: bioscript_core::VariantSpec { - reference: Some("TTATAA".to_owned()), - alternate: Some("".to_owned()), - kind: Some(bioscript_core::VariantKind::Deletion), - ..bioscript_core::VariantSpec::default() - }, - }; - - assert_eq!( - deletion_copy_number_display(&row, &manifest, Some(53), Some(0)).as_deref(), - Some("II") - ); - assert_eq!( - normalize_app_genotype( - "II", - "TTATAA", - "", - Some(bioscript_core::VariantKind::Deletion), - "22", - None - ), - ("0/0".to_owned(), "hom_ref".to_owned()) - ); - } - - #[test] - fn single_allele_sex_chromosome_calls_are_treated_as_hemizygous() { - assert_eq!( - normalize_app_genotype("G", "C", "G", None, "X", None), - ("1".to_owned(), "hem_alt".to_owned()) - ); - assert_eq!( - normalize_app_genotype("C", "C", "G", None, "chrX", None), - ("0".to_owned(), "hem_ref".to_owned()) - ); - assert_eq!( - normalize_app_genotype("G", "C", "G", None, "1", None), - ("G".to_owned(), "unknown".to_owned()) - ); - assert_eq!( - normalize_app_genotype("GG", "C", "G", None, "X", None), - ("1/1".to_owned(), "hom_alt".to_owned()) - ); - } - - #[test] - fn confident_male_sex_chromosome_duplicate_calls_are_hemizygous() { - let inferred_sex = SexInference { - sex: InferredSex::Male, - confidence: SexDetectionConfidence::High, - method: "vcf_non_par_x_gt".to_owned(), - evidence: vec!["called_y_snps=1200".to_owned()], - }; - assert_eq!( - normalize_app_genotype("GG", "C", "G", None, "X", Some(&inferred_sex)), - ("1".to_owned(), "hem_alt".to_owned()) - ); - assert_eq!( - normalize_app_genotype("CC", "C", "G", None, "chrX", Some(&inferred_sex)), - ("0".to_owned(), "hem_ref".to_owned()) - ); - } -} diff --git a/rust/bioscript-cli/src/report_html.rs b/rust/bioscript-cli/src/report_html.rs deleted file mode 100644 index 4013c20..0000000 --- a/rust/bioscript-cli/src/report_html.rs +++ /dev/null @@ -1,6 +0,0 @@ -include!("report_html_sections.rs"); -include!("report_html_analysis.rs"); -include!("report_html_provenance.rs"); -include!("report_html_observations.rs"); -include!("report_html_pgx.rs"); -include!("report_html_helpers.rs"); diff --git a/rust/bioscript-cli/src/report_observation_json.rs b/rust/bioscript-cli/src/report_observation_json.rs deleted file mode 100644 index 0c36bc3..0000000 --- a/rust/bioscript-cli/src/report_observation_json.rs +++ /dev/null @@ -1,142 +0,0 @@ -struct AppObservationJson { - allele_balance: Option, - alt_count: Option, - assay_id: String, - assembly: String, - call: ObservationCallValues, - chrom: String, - depth: Option, - evidence_raw: String, - gene: String, - genotype: String, - genotype_display: String, - kind: String, - locus: Option, - manifest: bioscript_schema::VariantManifest, - non_reportable_status: Option<&'static str>, - observed_alt_alleles: Vec, - ref_allele: String, - ref_count: Option, - reportable_alt: String, - row: BTreeMap, - row_path: String, - source: serde_json::Value, - weak_indel_match: bool, - zygosity: String, -} - -struct ObservationCallValues { - outcome: &'static str, - status: &'static str, - reported_genotype_display: String, -} - -fn observation_call_values( - depth: Option, - non_reportable_status: Option<&'static str>, - genotype: &str, - zygosity: &str, - genotype_display: &str, -) -> ObservationCallValues { - let outcome = if depth == Some(0) { - "not_covered" - } else if non_reportable_status == Some("observed_alt") { - "observed_alt" - } else if non_reportable_status == Some("unknown_alt") { - "unknown_alt" - } else if genotype == "./." { - "no_call" - } else if zygosity == "hom_ref" || zygosity == "hem_ref" { - "reference" - } else if zygosity == "het" || zygosity == "hom_alt" || zygosity == "hem_alt" { - "variant" - } else { - "unknown" - }; - let status = if matches!(outcome, "observed_alt" | "unknown_alt") { - outcome - } else if genotype == "./." { - "no_call" - } else { - "called" - }; - let reported_genotype_display = if matches!(zygosity, "hem_ref" | "hem_alt") { - hemizygous_display_genotype(genotype_display) - } else if genotype_display.is_empty() && matches!(outcome, "no_call" | "not_covered") { - "??".to_owned() - } else { - genotype_display.to_owned() - }; - ObservationCallValues { - outcome, - status, - reported_genotype_display, - } -} - -fn render_app_observation_json(input: AppObservationJson) -> serde_json::Value { - let AppObservationJson { - allele_balance, - alt_count, - assay_id, - assembly, - call, - chrom, - depth, - evidence_raw, - gene, - genotype, - genotype_display, - kind, - locus, - manifest, - non_reportable_status, - observed_alt_alleles, - ref_allele, - ref_count, - reportable_alt, - row, - row_path, - source, - weak_indel_match, - zygosity, - } = input; - serde_json::json!({ - "participant_id": row.get("participant_id").cloned().unwrap_or_default(), - "assay_id": assay_id, - "assay_version": "1.0", - "variant_key": manifest.name, - "variant_path": row_path, - "rsid": row.get("matched_rsid").filter(|value| !value.is_empty()).cloned().or_else(|| manifest.spec.rsids.first().cloned()), - "gene": gene, - "assembly": if assembly.is_empty() { serde_json::Value::Null } else { serde_json::Value::String(assembly.to_uppercase()) }, - "chrom": chrom, - "pos_start": locus.as_ref().map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.start)), - "pos_end": locus.as_ref().map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.end)), - "ref": ref_allele, - "alt": reportable_alt, - "kind": kind, - "match_status": if row.get("matched_rsid").is_some_and(|value| !value.is_empty()) || !genotype_display.is_empty() { "found" } else { "not_found" }, - "coverage_status": depth.map_or("covered", |depth| if depth > 0 { "covered" } else { "not_covered" }), - "call_status": call.status, - "genotype": genotype, - "genotype_display": call.reported_genotype_display, - "zygosity": zygosity, - "ref_count": ref_count, - "alt_count": alt_count, - "depth": depth, - "genotype_quality": serde_json::Value::Null, - "allele_balance": allele_balance, - "outcome": call.outcome, - "evidence_type": if row.get("backend").is_some_and(|value| value == "cram") { "mpileup" } else { "genotype_file" }, - "evidence_raw": evidence_raw, - "source": source, - "match_quality": if weak_indel_match { serde_json::Value::String("weak".to_owned()) } else { serde_json::Value::Null }, - "match_notes": if weak_indel_match { - serde_json::Value::String("consumer genotype file reported an insertion/deletion token at the marker, not sequence-resolved evidence for the exact deletion allele".to_owned()) - } else { - serde_json::Value::Null - }, - "facets": observation_facets(non_reportable_status, &observed_alt_alleles), - }) -} diff --git a/rust/bioscript-cli/src/report_observations.rs b/rust/bioscript-cli/src/report_observations.rs index 6e92bc5..261dada 100644 --- a/rust/bioscript-cli/src/report_observations.rs +++ b/rust/bioscript-cli/src/report_observations.rs @@ -13,147 +13,34 @@ fn app_observation_from_manifest_row( }; let manifest = load_variant_manifest(&manifest_path)?; let gene = variant_manifest_gene(&manifest_path)?; - let ref_allele = manifest.spec.reference.clone().unwrap_or_default(); - let reportable_alt = manifest.spec.alternate.clone().unwrap_or_default(); let observed_alt_alleles = variant_observed_alt_alleles(&manifest_path)?; - let mut genotype_display = row - .get("genotype") - .filter(|value| !value.is_empty()) - .cloned() - .or_else(|| genotype_display_from_raw_counts(row.get("raw_counts")?)) - .unwrap_or_default(); - let depth = parse_optional_u32(row.get("depth")); - let ref_count = parse_optional_u32(row.get("ref_count")); - let alt_count = parse_optional_u32(row.get("alt_count")); - if let Some(normalized_display) = - deletion_copy_number_display(row, &manifest, depth, alt_count) - { - genotype_display = normalized_display; - } - let weak_indel_match = is_weak_delimited_indel_match(row, &manifest, &genotype_display); - let allele_balance = match (alt_count, depth) { - (Some(alt_count), Some(depth)) if depth > 0 => { - Some(f64::from(alt_count) / f64::from(depth)) - } - _ => None, - }; - let assembly = row - .get("assembly") - .filter(|value| !value.is_empty()) - .cloned() - .or_else(|| fallback_assembly.map(assembly_row_value)) - .unwrap_or_default(); - let locus = if assembly.eq_ignore_ascii_case("grch37") { - manifest.spec.grch37.as_ref() - } else { - manifest - .spec - .grch38 - .as_ref() - .or(manifest.spec.grch37.as_ref()) - }; - let chrom = locus.map_or(String::new(), |locus| locus.chrom.clone()); - let (genotype, zygosity) = normalize_app_genotype( - &genotype_display, - &ref_allele, - &reportable_alt, - manifest.spec.kind, - &chrom, - inferred_sex, - ); - let non_reportable_status = - classify_non_reportable_alleles(&genotype_display, &ref_allele, &reportable_alt, &observed_alt_alleles); - let call = observation_call_values( - depth, - non_reportable_status, - &genotype, - &zygosity, - &genotype_display, - ); - let evidence_raw = observation_evidence_raw(row, &chrom, inferred_sex); let source = variant_primary_source(&manifest_path)?; - let kind = manifest - .spec - .kind - .map_or("unknown".to_owned(), |kind| format!("{kind:?}").to_lowercase()); - Ok(render_app_observation_json(AppObservationJson { - allele_balance, - alt_count, - assay_id: assay_id.to_owned(), - assembly, - call, - chrom, - depth, - evidence_raw, - gene, - genotype, - genotype_display, - kind, - locus: locus.cloned(), - manifest, - non_reportable_status, - observed_alt_alleles, - ref_allele, - ref_count, - reportable_alt, - row: row.clone(), - row_path, - source, - weak_indel_match, - zygosity, - })) -} - -fn assembly_row_value(assembly: bioscript_core::Assembly) -> String { - match assembly { - bioscript_core::Assembly::Grch37 => "grch37".to_owned(), - bioscript_core::Assembly::Grch38 => "grch38".to_owned(), - } -} - -fn hemizygous_display_genotype(display: &str) -> String { - display - .chars() - .find(char::is_ascii_alphabetic) - .map_or_else(|| display.to_owned(), |allele| allele.to_string()) -} - -fn deletion_copy_number_display( - row: &BTreeMap, - manifest: &bioscript_schema::VariantManifest, - depth: Option, - alt_count: Option, -) -> Option { - if !matches!(manifest.spec.kind, Some(bioscript_core::VariantKind::Deletion)) { - return None; - } - if !matches!( - row.get("backend").map(String::as_str), - Some("cram" | "bam") - ) { - return None; - } - if manifest.spec.reference.as_deref().unwrap_or_default().len() <= 1 { - return None; - } - let depth = depth?; - if depth == 0 { - return None; - } - let alt_fraction = f64::from(alt_count.unwrap_or(0)) / f64::from(depth); - if alt_fraction >= 0.8 { - Some("DD".to_owned()) - } else if alt_fraction <= 0.2 { - Some("II".to_owned()) - } else { - Some("DI".to_owned()) - } + Ok(bioscript_reporting::app_observation_from_manifest_row( + bioscript_reporting::AppObservationInput { + row, + row_path: &row_path, + assay_id, + manifest, + gene, + source, + observed_alt_alleles, + inferred_sex, + fallback_assembly, + }, + )) +} + +fn load_yaml_value(path: &Path) -> Result { + let text = fs::read_to_string(path) + .map_err(|err| format!("failed to read YAML {}: {err}", path.display()))?; + serde_yaml::from_str(&text) + .map_err(|err| format!("failed to parse YAML {}: {err}", path.display())) } fn variant_primary_source(path: &Path) -> Result { let value = load_yaml_value(path)?; let mut links = BTreeMap::::new(); - collect_manifest_provenance_entries(&value, &mut links)?; + bioscript_reporting::collect_manifest_provenance_entries(&value, &mut links)?; if let Some(source) = links .values() .find(|source| source_url_contains(source, "ncbi.nlm.nih.gov/snp/rs")) @@ -219,238 +106,3 @@ fn variant_observed_alt_alleles(path: &Path) -> Result, String> { .map(ToOwned::to_owned) .collect()) } - -fn normalize_app_genotype( - display: &str, - ref_allele: &str, - alt_allele: &str, - kind: Option, - chrom: &str, - inferred_sex: Option<&SexInference>, -) -> (String, String) { - if display.is_empty() { - return ("./.".to_owned(), "unknown".to_owned()); - } - if matches!(kind, Some(bioscript_core::VariantKind::Deletion)) - && ref_allele.len() != 1 - && display - .chars() - .filter(char::is_ascii_alphabetic) - .all(|allele| matches!(allele.to_ascii_uppercase(), 'I' | 'D')) - { - return normalize_app_genotype(display, "I", "D", None, chrom, inferred_sex); - } - let alleles: Vec = display.chars().filter(char::is_ascii_alphabetic).collect(); - if ref_allele.len() != 1 || alt_allele.len() != 1 { - return (display.to_owned(), "unknown".to_owned()); - } - let ref_ch = ref_allele.chars().next().unwrap_or_default(); - let alt_ch = alt_allele.chars().next().unwrap_or_default(); - if alleles.len() == 1 && is_haploid_sex_chromosome(chrom) { - let allele = alleles[0]; - if allele == ref_ch { - return ("0".to_owned(), "hem_ref".to_owned()); - } - if allele == alt_ch { - return ("1".to_owned(), "hem_alt".to_owned()); - } - return (display.to_owned(), "unknown".to_owned()); - } - if alleles.len() != 2 { - return (display.to_owned(), "unknown".to_owned()); - } - if is_confident_male_sex_chromosome(chrom, inferred_sex) && alleles[0] == alleles[1] { - let allele = alleles[0]; - if allele == ref_ch { - return ("0".to_owned(), "hem_ref".to_owned()); - } - if allele == alt_ch { - return ("1".to_owned(), "hem_alt".to_owned()); - } - return (display.to_owned(), "unknown".to_owned()); - } - let alt_count = alleles.iter().filter(|allele| **allele == alt_ch).count(); - let ref_count = alleles.iter().filter(|allele| **allele == ref_ch).count(); - match (ref_count, alt_count) { - (2, 0) => ("0/0".to_owned(), "hom_ref".to_owned()), - (1, 1) => ("0/1".to_owned(), "het".to_owned()), - (0, 2) => ("1/1".to_owned(), "hom_alt".to_owned()), - _ => (display.to_owned(), "unknown".to_owned()), - } -} - -fn is_confident_male_sex_chromosome(chrom: &str, inferred_sex: Option<&SexInference>) -> bool { - is_haploid_sex_chromosome(chrom) - && inferred_sex.is_some_and(|sex| { - sex.sex == InferredSex::Male - && matches!( - sex.confidence, - SexDetectionConfidence::High | SexDetectionConfidence::Medium - ) - }) -} - -fn is_haploid_sex_chromosome(chrom: &str) -> bool { - matches!( - chrom - .trim() - .trim_start_matches("chr") - .trim_start_matches("CHR") - .to_ascii_uppercase() - .as_str(), - "X" | "Y" | "23" | "24" - ) -} - -fn observation_evidence_raw( - row: &BTreeMap, - chrom: &str, - inferred_sex: Option<&SexInference>, -) -> String { - let mut evidence_raw = row.get("evidence").cloned().unwrap_or_default(); - if !is_haploid_sex_chromosome(chrom) { - return evidence_raw; - } - let Some(inferred_sex) = inferred_sex else { - return evidence_raw; - }; - let sex_evidence = sex_inference_evidence_raw(inferred_sex); - if sex_evidence.is_empty() { - return evidence_raw; - } - if evidence_raw.is_empty() { - evidence_raw = sex_evidence; - } else { - evidence_raw.push_str(" | "); - evidence_raw.push_str(&sex_evidence); - } - evidence_raw -} - -fn sex_inference_evidence_raw(inferred_sex: &SexInference) -> String { - let sex = match inferred_sex.sex { - InferredSex::Male => "male", - InferredSex::Female => "female", - InferredSex::Unknown => "unknown", - }; - let confidence = match inferred_sex.confidence { - SexDetectionConfidence::High => "high", - SexDetectionConfidence::Medium => "medium", - SexDetectionConfidence::Low => "low", - }; - let mut fields = vec![ - format!("detected_sex={sex}"), - format!("sex_confidence={confidence}"), - format!("sex_method={}", inferred_sex.method), - ]; - fields.extend( - inferred_sex - .evidence - .iter() - .map(|item| format!("sex_{item}")), - ); - fields.join(" ") -} - -fn genotype_display_from_raw_counts(raw_counts: &str) -> Option { - let counts: serde_json::Map = serde_json::from_str(raw_counts).ok()?; - let mut items = counts - .into_iter() - .filter_map(|(base, count)| { - let base = base.chars().next()?.to_ascii_uppercase(); - let count = count.as_u64()?; - if matches!(base, 'A' | 'C' | 'G' | 'T') && count > 0 { - Some((base, count)) - } else { - None - } - }) - .collect::>(); - if items.is_empty() { - return None; - } - items.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0))); - let total = items.iter().map(|(_, count)| *count).sum::(); - let (top_base, top_count) = items[0]; - if total == 0 || items.len() == 1 || top_count.saturating_mul(10) >= total.saturating_mul(8) { - return Some(format!("{top_base}{top_base}")); - } - Some(format!("{}{}", top_base, items[1].0)) -} - -fn classify_non_reportable_alleles( - display: &str, - ref_allele: &str, - reportable_alt: &str, - observed_alts: &[String], -) -> Option<&'static str> { - if display.is_empty() || ref_allele.len() != 1 || reportable_alt.len() != 1 { - return None; - } - let ref_ch = ref_allele.chars().next()?.to_ascii_uppercase(); - let alt_ch = reportable_alt.chars().next()?.to_ascii_uppercase(); - let non_reportable = display - .chars() - .filter(char::is_ascii_alphabetic) - .map(|ch| ch.to_ascii_uppercase()) - .filter(|ch| *ch != ref_ch && *ch != alt_ch) - .collect::>(); - if non_reportable.is_empty() { - return None; - } - if non_reportable.iter().all(|ch| { - observed_alts.iter().any(|alt| { - alt.len() == 1 - && alt - .chars() - .next() - .is_some_and(|alt_ch| alt_ch.to_ascii_uppercase() == *ch) - }) - }) { - Some("observed_alt") - } else { - Some("unknown_alt") - } -} - -fn is_weak_delimited_indel_match( - row: &BTreeMap, - manifest: &bioscript_schema::VariantManifest, - genotype_display: &str, -) -> bool { - if !matches!(manifest.spec.kind, Some(bioscript_core::VariantKind::Deletion)) { - return false; - } - if !matches!(row.get("backend").map(String::as_str), Some("text" | "zip")) { - return false; - } - if manifest.spec.reference.as_deref().unwrap_or_default().len() <= 1 { - return false; - } - genotype_display - .chars() - .filter(char::is_ascii_alphabetic) - .all(|allele| matches!(allele.to_ascii_uppercase(), 'I' | 'D')) -} - -fn observation_facets( - non_reportable_status: Option<&str>, - observed_alts: &[String], -) -> serde_json::Value { - let mut facets = Vec::new(); - if let Some(status) = non_reportable_status { - facets.push(status.to_owned()); - if status == "observed_alt" && !observed_alts.is_empty() { - facets.push(format!("known_observed_alts={}", observed_alts.join(","))); - } - } - if facets.is_empty() { - serde_json::Value::Null - } else { - serde_json::Value::String(facets.join(";")) - } -} - -fn parse_optional_u32(value: Option<&String>) -> Option { - value.and_then(|value| value.parse::().ok()) -} diff --git a/rust/bioscript-cli/src/report_options.rs b/rust/bioscript-cli/src/report_options.rs index 1a4bc99..dde0487 100644 --- a/rust/bioscript-cli/src/report_options.rs +++ b/rust/bioscript-cli/src/report_options.rs @@ -235,10 +235,10 @@ fn generate_app_report(options: &AppReportOptions) -> Result<(), String> { ) })?; - let assay_id = app_assay_id(&options.manifest_path)?; - let manifest_metadata = report_manifest_metadata(&options.manifest_path)?; - let findings = load_manifest_findings(&options.root, &options.manifest_path)?; - let provenance = load_manifest_provenance_links(&options.root, &options.manifest_path)?; + let manifest_workspace = bioscript_reporting::FilesystemManifestWorkspace::new(&options.root); + let manifest_path = options.manifest_path.display().to_string(); + let manifest_context = + bioscript_reporting::load_report_manifest_context(&manifest_workspace, &manifest_path)?; let mut observations = Vec::new(); let mut analyses = Vec::new(); let mut reports = Vec::new(); @@ -271,7 +271,7 @@ fn generate_app_report(options: &AppReportOptions) -> Result<(), String> { app_observation_from_manifest_row( &options.root, row, - &assay_id, + &manifest_context.assay_id, input_inspection.inferred_sex.as_ref(), input_inspection.assembly, ) @@ -284,24 +284,32 @@ fn generate_app_report(options: &AppReportOptions) -> Result<(), String> { participant_id: &participant_id, loader: &input_loader, output_dir: &options.output_dir, + observation_rows: &rows, filters: &options.filters, max_duration_ms: options.analysis_max_duration_ms, }; let input_analyses = run_manifest_analyses_for_report(&options.manifest_path, &analysis_options)?; analyses.extend(input_analyses.clone()); - let matched_findings = match_app_findings(&findings, &input_observations, &input_analyses); - reports.push(app_report_json(AppReportJsonInput { - assay_id: &assay_id, + let input_file_name = input_file + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or_default(); + let input_file_path = input_file.display().to_string(); + reports.push(bioscript_reporting::app_input_report_json( + bioscript_reporting::AppInputReportInput { + assay_id: &manifest_context.assay_id, participant_id: &participant_id, - input_file, + input_file_name, + input_file_path: &input_file_path, observations: &input_observations, analyses: &input_analyses, - findings: &matched_findings, - provenance: &provenance, + findings: &manifest_context.findings, + provenance: &manifest_context.provenance, input_inspection: Some(&input_inspection), - manifest_metadata: &manifest_metadata, - })); + manifest_metadata: &manifest_context.manifest_metadata, + }, + )); } write_app_observations( diff --git a/rust/bioscript-cli/src/report_output.rs b/rust/bioscript-cli/src/report_output.rs index 2b74dc2..be6e86a 100644 --- a/rust/bioscript-cli/src/report_output.rs +++ b/rust/bioscript-cli/src/report_output.rs @@ -1,233 +1,10 @@ -#[derive(Clone, Copy)] -struct AppReportJsonInput<'a> { - assay_id: &'a str, - participant_id: &'a str, - input_file: &'a Path, - observations: &'a [serde_json::Value], - analyses: &'a [serde_json::Value], - findings: &'a [serde_json::Value], - provenance: &'a [serde_json::Value], - input_inspection: Option<&'a bioscript_formats::FileInspection>, - manifest_metadata: &'a serde_json::Value, -} - -fn app_report_json(input: AppReportJsonInput<'_>) -> serde_json::Value { - let called = input - .observations - .iter() - .filter(|item| { - item.get("call_status").and_then(serde_json::Value::as_str) == Some("called") - }) - .count(); - let input_debug = input - .input_inspection - .map(|inspection| { - let mut value = input_inspection_json(inspection); - if observations_have_imputed_vcf_references(input.observations) - && let Some(object) = value.as_object_mut() - { - object.insert( - "vcf_missing_reference_imputation".to_owned(), - serde_json::Value::Bool(true), - ); - } - value - }); - serde_json::json!({ - "schema": "bioscript:report:1.0", - "version": "1.0", - "participant_id": input.participant_id, - "assay_id": input.assay_id, - "assay_version": "1.0", - "manifest": input.manifest_metadata, - "input": { - "file_name": input.input_file.file_name().and_then(|value| value.to_str()).unwrap_or_default(), - "file_path": input.input_file.display().to_string(), - "debug": input_debug, - }, - "report_status": if called == input.observations.len() { "complete" } else { "partial" }, - "derived_from": input.observations.iter().filter_map(|item| item.get("variant_key").cloned()).collect::>(), - "analyses": input.analyses, - "findings": input.findings, - "provenance": input.provenance, - "metrics": { - "n_sites_tested": input.observations.len(), - "n_sites_called": called, - "n_sites_missing": input.observations.len().saturating_sub(called), - "n_analyses": input.analyses.len(), - "n_findings_matched": input.findings.len(), - } - }) -} - -fn observations_have_imputed_vcf_references(observations: &[serde_json::Value]) -> bool { - observations.iter().any(|observation| { - observation - .get("evidence_raw") - .and_then(serde_json::Value::as_str) - .is_some_and(|evidence| { - evidence.contains("imputed reference genotype from absent variant-only VCF record") - }) - }) -} - -fn report_manifest_metadata(path: &Path) -> Result { - let text = fs::read_to_string(path) - .map_err(|err| format!("failed to read manifest metadata {}: {err}", path.display()))?; - let value: serde_yaml::Value = serde_yaml::from_str(&text) - .map_err(|err| format!("failed to parse manifest metadata {}: {err}", path.display()))?; - let schema = yaml_string_at(&value, "schema"); - let version = yaml_string_at(&value, "version"); - let name = yaml_string_at(&value, "name"); - let label = yaml_string_at(&value, "label").or_else(|| name.clone()); - let tags = value - .get("tags") - .and_then(serde_yaml::Value::as_sequence) - .map(|items| { - items - .iter() - .filter_map(serde_yaml::Value::as_str) - .map(serde_json::Value::from) - .collect::>() - }) - .unwrap_or_default(); - let members = value - .get("members") - .and_then(serde_yaml::Value::as_sequence) - .map(|items| { - items - .iter() - .filter_map(serde_yaml::Value::as_mapping) - .map(|mapping| { - serde_json::json!({ - "kind": yaml_mapping_string(mapping, "kind"), - "path": yaml_mapping_string(mapping, "path"), - "version": yaml_mapping_string(mapping, "version"), - }) - }) - .collect::>() - }) - .unwrap_or_default(); - Ok(serde_json::json!({ - "schema": schema, - "version": version, - "name": name, - "label": label, - "tags": tags, - "members": members, - })) -} - -fn yaml_string_at(value: &serde_yaml::Value, key: &str) -> Option { - value - .get(key) - .and_then(serde_yaml::Value::as_str) - .map(ToOwned::to_owned) -} - -fn yaml_mapping_string(mapping: &serde_yaml::Mapping, key: &str) -> Option { - mapping - .get(serde_yaml::Value::String(key.to_owned())) - .and_then(serde_yaml::Value::as_str) - .map(ToOwned::to_owned) -} - -fn input_inspection_json(inspection: &bioscript_formats::FileInspection) -> serde_json::Value { - serde_json::json!({ - "container": file_container_name(inspection.container), - "format": detected_kind_name(inspection.detected_kind), - "format_confidence": detection_confidence_name(inspection.confidence), - "assembly": inspection.assembly.map(assembly_name), - "phased": inspection.phased, - "selected_entry": inspection.selected_entry, - "has_index": inspection.has_index, - "index_path": inspection.index_path.as_ref().map(|path| path.display().to_string()), - "reference_matches": inspection.reference_matches, - "source": inspection.source.as_ref().map(|source| serde_json::json!({ - "vendor": source.vendor, - "platform_version": source.platform_version, - "confidence": detection_confidence_name(source.confidence), - "evidence": source.evidence, - })), - "inferred_sex": inspection.inferred_sex.as_ref().map(|sex| serde_json::json!({ - "sex": inferred_sex_name(sex.sex), - "confidence": sex_detection_confidence_name(sex.confidence), - "method": sex.method, - "evidence": sex.evidence, - })), - "evidence": inspection.evidence, - "warnings": inspection.warnings, - "duration_ms": inspection.duration_ms, - }) -} - -fn file_container_name(value: bioscript_formats::FileContainer) -> &'static str { - match value { - bioscript_formats::FileContainer::Plain => "plain", - bioscript_formats::FileContainer::Zip => "zip", - } -} - -fn detected_kind_name(value: bioscript_formats::DetectedKind) -> &'static str { - match value { - bioscript_formats::DetectedKind::GenotypeText => "genotype_text", - bioscript_formats::DetectedKind::Vcf => "vcf", - bioscript_formats::DetectedKind::AlignmentCram => "alignment_cram", - bioscript_formats::DetectedKind::AlignmentBam => "alignment_bam", - bioscript_formats::DetectedKind::ReferenceFasta => "reference_fasta", - bioscript_formats::DetectedKind::Unknown => "unknown", - } -} - -fn detection_confidence_name(value: bioscript_formats::DetectionConfidence) -> &'static str { - match value { - bioscript_formats::DetectionConfidence::Authoritative => "authoritative", - bioscript_formats::DetectionConfidence::StrongHeuristic => "strong_heuristic", - bioscript_formats::DetectionConfidence::WeakHeuristic => "weak_heuristic", - bioscript_formats::DetectionConfidence::Unknown => "unknown", - } -} - -fn assembly_name(value: bioscript_core::Assembly) -> &'static str { - match value { - bioscript_core::Assembly::Grch37 => "grch37", - bioscript_core::Assembly::Grch38 => "grch38", - } -} - -fn inferred_sex_name(value: InferredSex) -> &'static str { - match value { - InferredSex::Male => "male", - InferredSex::Female => "female", - InferredSex::Unknown => "unknown", - } -} - -fn sex_detection_confidence_name(value: SexDetectionConfidence) -> &'static str { - match value { - SexDetectionConfidence::High => "high", - SexDetectionConfidence::Medium => "medium", - SexDetectionConfidence::Low => "low", - } -} - fn write_app_observations( output_dir: &Path, observations: &[serde_json::Value], format: AppOutputFormat, ) -> Result<(), String> { if matches!(format, AppOutputFormat::Tsv | AppOutputFormat::Both) { - let mut out = bioscript_core::OBSERVATION_TSV_HEADERS.join("\t"); - out.push('\n'); - for observation in observations { - let line = bioscript_core::OBSERVATION_TSV_HEADERS - .iter() - .map(|header| json_field_as_tsv(observation.get(*header))) - .collect::>() - .join("\t"); - out.push_str(&line); - out.push('\n'); - } + let out = bioscript_reporting::render_observations_tsv(observations); fs::write(output_dir.join("observations.tsv"), out) .map_err(|err| format!("failed to write observations.tsv: {err}"))?; } @@ -269,12 +46,7 @@ fn write_app_reports( } fn write_jsonl(path: &Path, rows: &[serde_json::Value]) -> Result<(), String> { - let mut out = String::new(); - for row in rows { - let line = serde_json::to_string(row).map_err(|err| err.to_string())?; - out.push_str(&line); - out.push('\n'); - } + let out = bioscript_reporting::render_jsonl(rows)?; fs::write(path, out).map_err(|err| format!("failed to write {}: {err}", path.display())) } @@ -283,73 +55,12 @@ fn write_json_pretty(path: &Path, value: &serde_json::Value) -> Result<(), Strin fs::write(path, text).map_err(|err| format!("failed to write {}: {err}", path.display())) } -fn json_field_as_tsv(value: Option<&serde_json::Value>) -> String { - match value { - Some(serde_json::Value::Null) | None => String::new(), - Some(serde_json::Value::String(value)) => value.replace(['\t', '\n'], " "), - Some(value) => value.to_string().replace(['\t', '\n'], " "), - } -} - fn write_app_html( output_dir: &Path, observations: &[serde_json::Value], reports: &[serde_json::Value], ) -> Result<(), String> { - let mut out = String::from( - r##"BioScript report
"##, - ); - let label_findings = collect_report_findings(reports, "bioscript:pgx-label:1.0"); - let summary_findings = collect_report_findings(reports, "bioscript:pgx-summary:1.0"); - let has_pgx_findings = !label_findings.is_empty() || !summary_findings.is_empty(); - let analysis_outputs = collect_report_analyses(reports); - let participants = collect_report_participants(reports); - render_report_manifest_header(&mut out, reports); - let _ = write!( - out, - "
{} observation(s), {} analysis output(s), {} PGx label finding(s), {} PGx summary finding(s)
", - observations.len(), - analysis_outputs.len(), - label_findings.len(), - summary_findings.len() - ); - render_participant_filter(&mut out, &participants); - out.push_str(""); - out.push_str("

Input

"); - render_input_debug(&mut out, reports, participants.len() > 1); - out.push_str("
"); - out.push_str("

Observations

"); - render_observation_table(&mut out, observations, participants.len() > 1); - out.push_str("
"); - out.push_str("

Analysis

"); - render_analysis_tables( - &mut out, - &analysis_outputs, - observations, - participants.len() > 1, - ); - out.push_str("
"); - if has_pgx_findings { - out.push_str("

PGx

"); - render_pgx_table(&mut out, &label_findings, &summary_findings); - out.push_str("
"); - } - out.push_str("

Provenance

"); - render_provenance_links(&mut out, reports); - out.push_str("
"); - out.push_str("

Source

"); - render_report_source_section(&mut out, reports); - out.push_str("
"); - out.push_str("

Raw Reports JSON

Show raw report JSON"); - for report in reports { - let text = serde_json::to_string_pretty(report).map_err(|err| err.to_string())?; - let _ = write!(out, "
{}
", html_escape(&text)); - } - out.push_str("
"); + let out = bioscript_reporting::render_app_html_document(observations, reports)?; fs::write(output_dir.join("index.html"), out) .map_err(|err| format!("failed to write index.html: {err}")) } diff --git a/rust/bioscript-cli/src/report_review.rs b/rust/bioscript-cli/src/report_review.rs index ae05dbf..d10932c 100644 --- a/rust/bioscript-cli/src/report_review.rs +++ b/rust/bioscript-cli/src/report_review.rs @@ -81,10 +81,10 @@ fn generate_review_report(options: &ReviewReportOptions) -> Result<(), String> { ) })?; - let assay_id = app_assay_id(&options.manifest_path)?; - let manifest_metadata = report_manifest_metadata(&options.manifest_path)?; - let findings = load_manifest_findings(&options.root, &options.manifest_path)?; - let provenance = load_manifest_provenance_links(&options.root, &options.manifest_path)?; + let manifest_workspace = bioscript_reporting::FilesystemManifestWorkspace::new(&options.root); + let manifest_path = options.manifest_path.display().to_string(); + let manifest_context = + bioscript_reporting::load_report_manifest_context(&manifest_workspace, &manifest_path)?; let cases = load_review_cases(&options.cases_path)?; let mut observations = Vec::new(); let mut analyses = Vec::new(); @@ -102,25 +102,40 @@ fn generate_review_report(options: &ReviewReportOptions) -> Result<(), String> { &options.filters, )? .iter() - .map(|row| app_observation_from_manifest_row(&options.root, row, &assay_id, None, None)) + .map(|row| { + app_observation_from_manifest_row( + &options.root, + row, + &manifest_context.assay_id, + None, + None, + ) + }) .collect::, _>>()?; observations.extend(input_observations.clone()); let input_analyses = run_review_analyses(options, &case, &input_bytes)?; analyses.extend(input_analyses.clone()); - let matched_findings = match_app_findings(&findings, &input_observations, &input_analyses); let synthetic_input = PathBuf::from(format!("review://{}", case.id)); - let mut report = app_report_json(AppReportJsonInput { - assay_id: &assay_id, + let synthetic_input_name = synthetic_input + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or_default(); + let synthetic_input_path = synthetic_input.display().to_string(); + let mut report = bioscript_reporting::app_input_report_json( + bioscript_reporting::AppInputReportInput { + assay_id: &manifest_context.assay_id, participant_id: &case.id, - input_file: &synthetic_input, + input_file_name: synthetic_input_name, + input_file_path: &synthetic_input_path, observations: &input_observations, analyses: &input_analyses, - findings: &matched_findings, - provenance: &provenance, + findings: &manifest_context.findings, + provenance: &manifest_context.provenance, input_inspection: None, - manifest_metadata: &manifest_metadata, - }); + manifest_metadata: &manifest_context.manifest_metadata, + }, + ); if let Some(object) = report.as_object_mut() { object.insert( "review_case".to_owned(), @@ -206,12 +221,14 @@ fn run_review_analyses( format: Some(GenotypeSourceFormat::Text), ..GenotypeLoadOptions::default() }; + let observation_rows = Vec::new(); let analysis_options = ReportAnalysisOptions { runtime_root: &options.root, input_file: &temp_path, participant_id: &case.id, loader: &loader, output_dir: &options.output_dir, + observation_rows: &observation_rows, filters: &options.filters, max_duration_ms: 1_000, }; diff --git a/rust/bioscript-ffi/src/variant_yaml.rs b/rust/bioscript-ffi/src/variant_yaml.rs index 69c2547..9877043 100644 --- a/rust/bioscript-ffi/src/variant_yaml.rs +++ b/rust/bioscript-ffi/src/variant_yaml.rs @@ -8,14 +8,20 @@ use crate::types::{ NamedVariantSpec, RunVariantYamlRequest, RunVariantYamlResult, observation_result, }; -/// Runs a BioScript variant YAML assay against a supported genome file. +/// Runs a `BioScript` variant YAML assay against a supported genome file. /// /// The native desktop/mobile path uses this instead of the web WASM exports. /// It intentionally mirrors the web variant YAML flow: compile YAML through /// `bioscript-schema`, choose the preferred assembly-specific variant, and /// execute lookup through `bioscript-formats`. +/// +/// # Errors +/// +/// Returns an error when the YAML assay cannot be read or compiled, when the +/// input options are invalid, or when the genome file cannot be loaded or +/// queried. pub fn run_variant_yaml_request( - request: RunVariantYamlRequest, + request: &RunVariantYamlRequest, ) -> Result { let yaml_path = PathBuf::from(&request.yaml_path); let yaml_text = fs::read_to_string(&yaml_path) @@ -28,7 +34,7 @@ pub fn run_variant_yaml_request( &yaml_text, )?; let selected = select_preferred_assembly_variants(&request.genome_path, variants); - let loader = variant_loader(&request)?; + let loader = variant_loader(request)?; let genome_path = PathBuf::from(&request.genome_path); let store = GenotypeStore::from_file_with_options(&genome_path, &loader) diff --git a/rust/bioscript-formats/Cargo.toml b/rust/bioscript-formats/Cargo.toml index e91d8fe..52db7fc 100644 --- a/rust/bioscript-formats/Cargo.toml +++ b/rust/bioscript-formats/Cargo.toml @@ -9,7 +9,7 @@ crate-type = ["rlib"] [dependencies] bioscript-core = { path = "../bioscript-core" } flate2 = "1.1.9" -noodles = { version = "0.109.0", features = ["bam", "bgzf", "core", "cram", "csi", "fasta", "sam", "tabix", "vcf"] } +noodles = { version = "0.110.0", features = ["bam", "bgzf", "core", "cram", "csi", "fasta", "sam", "tabix", "vcf"] } zip = { version = "2.2.0", default-features = false, features = ["deflate"] } [lints.clippy] diff --git a/rust/bioscript-formats/src/alignment.rs b/rust/bioscript-formats/src/alignment.rs index ec134f7..382626f 100644 --- a/rust/bioscript-formats/src/alignment.rs +++ b/rust/bioscript-formats/src/alignment.rs @@ -14,11 +14,10 @@ mod readers; pub use readers::{ build_bam_indexed_reader_from_reader, build_cram_indexed_reader_from_reader, - build_reference_repository_from_readers, - generate_bam_bai_bytes, generate_bam_bai_reader, generate_cram_crai_bytes, - generate_cram_crai_reader, generate_fasta_fai_bytes, generate_fasta_fai_reader, - generate_vcf_tbi_bytes, - parse_bai_bytes, parse_crai_bytes, parse_fai_bytes, parse_tbi_bytes, + build_reference_repository_from_readers, generate_bam_bai_bytes, generate_bam_bai_reader, + generate_cram_crai_bytes, generate_cram_crai_reader, generate_fasta_fai_bytes, + generate_fasta_fai_reader, generate_vcf_tbi_bytes, parse_bai_bytes, parse_crai_bytes, + parse_fai_bytes, parse_tbi_bytes, }; pub(crate) use cram_stream::for_each_raw_cram_record_with_reader_inner; diff --git a/rust/bioscript-formats/src/alignment/readers.rs b/rust/bioscript-formats/src/alignment/readers.rs index 9eb9078..fa953d6 100644 --- a/rust/bioscript-formats/src/alignment/readers.rs +++ b/rust/bioscript-formats/src/alignment/readers.rs @@ -5,23 +5,20 @@ use std::{ path::Path, }; +use noodles::core::Position; +use noodles::sam::alignment::Record as _; +use noodles::sam::header::record::value::map::header::{sort_order::COORDINATE, tag::SORT_ORDER}; use noodles::{ - bam, - bgzf, - csi::{self, binning_index::{Indexer, index::reference_sequence::bin::Chunk}}, - cram::{ + bam, bgzf, + cram::{self, container::ReferenceSequenceContext, crai, io::reader::Container}, + csi::{ self, - container::ReferenceSequenceContext, - crai, - io::reader::Container, + binning_index::{Indexer, index::reference_sequence::bin::Chunk}, }, fasta::{self, repository::adapters::IndexedReader as FastaIndexedReader}, tabix, vcf, vcf::variant::Record as _, }; -use noodles::core::Position; -use noodles::sam::alignment::Record as _; -use noodles::sam::header::record::value::map::header::{sort_order::COORDINATE, tag::SORT_ORDER}; use bioscript_core::RuntimeError; @@ -65,35 +62,35 @@ where /// callers that receive the small index inline while the big CRAM stays on a /// JS-backed reader. pub fn parse_crai_bytes(bytes: &[u8]) -> Result { - crai::io::Reader::new(std::io::Cursor::new(bytes)) - .read_index() - .map_err(|err| RuntimeError::Io(format!("failed to parse CRAM index bytes: {err}"))) + crai::io::Reader::new(std::io::Cursor::new(bytes)) + .read_index() + .map_err(|err| RuntimeError::Io(format!("failed to parse CRAM index bytes: {err}"))) } /// Parse a BAM index (`.bai`) from an in-memory byte buffer. pub fn parse_bai_bytes(bytes: &[u8]) -> Result { - bam::bai::io::Reader::new(std::io::Cursor::new(bytes)) - .read_index() - .map_err(|err| RuntimeError::Io(format!("failed to parse BAM index bytes: {err}"))) + bam::bai::io::Reader::new(std::io::Cursor::new(bytes)) + .read_index() + .map_err(|err| RuntimeError::Io(format!("failed to parse BAM index bytes: {err}"))) } /// Build a BAM `IndexedReader` over any `Read` source given a parsed BAI index. pub fn build_bam_indexed_reader_from_reader( - reader: R, - bai_index: bam::bai::Index, + reader: R, + bai_index: bam::bai::Index, ) -> Result>, RuntimeError> where - R: Read, + R: Read, { - bam::io::indexed_reader::Builder::default() - .set_index(bai_index) - .build_from_reader(reader) - .map_err(|err| RuntimeError::Io(format!("failed to build indexed BAM reader: {err}"))) + bam::io::indexed_reader::Builder::default() + .set_index(bai_index) + .build_from_reader(reader) + .map_err(|err| RuntimeError::Io(format!("failed to build indexed BAM reader: {err}"))) } /// Parse a FASTA index (`.fai`) from an in-memory byte buffer. pub fn parse_fai_bytes(bytes: &[u8]) -> Result { - fasta::fai::io::Reader::new(std::io::Cursor::new(bytes)) + fasta::fai::io::Reader::new(std::io::Cursor::new(bytes)) .read_index() .map_err(|err| RuntimeError::Io(format!("failed to parse FASTA index bytes: {err}"))) } @@ -138,7 +135,9 @@ pub fn generate_vcf_tbi_bytes(bytes: &[u8]) -> Result, RuntimeError> { indexer .add_record(reference_sequence_name, start, end, chunk) - .map_err(|err| RuntimeError::Io(format!("failed to add VCF record to tabix index: {err}")))?; + .map_err(|err| { + RuntimeError::Io(format!("failed to add VCF record to tabix index: {err}")) + })?; start_position = end_position; } @@ -179,14 +178,15 @@ where break; } - let compression_header = container - .compression_header() - .map_err(|err| RuntimeError::Io(format!("failed to read CRAM compression header: {err}")))?; + let compression_header = container.compression_header().map_err(|err| { + RuntimeError::Io(format!("failed to read CRAM compression header: {err}")) + })?; let landmarks = container.header().landmarks(); let slice_count = landmarks.len(); for (i, result) in container.slices().enumerate() { - let slice = result.map_err(|err| RuntimeError::Io(format!("failed to read CRAM slice: {err}")))?; + let slice = result + .map_err(|err| RuntimeError::Io(format!("failed to read CRAM slice: {err}")))?; let landmark = landmarks[i]; let slice_length = if i < slice_count - 1 { landmarks[i + 1] - landmark @@ -199,9 +199,9 @@ where SliceReferenceSequenceAlignmentRangeInclusive, > = HashMap::new(); - let (core_data_src, external_data_srcs) = slice - .decode_blocks() - .map_err(|err| RuntimeError::Io(format!("failed to decode CRAM slice blocks: {err}")))?; + let (core_data_src, external_data_srcs) = slice.decode_blocks().map_err(|err| { + RuntimeError::Io(format!("failed to decode CRAM slice blocks: {err}")) + })?; for record in slice .records( @@ -211,16 +211,24 @@ where &core_data_src, &external_data_srcs, ) - .map_err(|err| RuntimeError::Io(format!("failed to decode CRAM slice records: {err}")))? + .map_err(|err| { + RuntimeError::Io(format!("failed to decode CRAM slice records: {err}")) + })? { let range = reference_sequence_ids - .entry(record.reference_sequence_id(&header).transpose().map_err(|err| { - RuntimeError::Io(format!("failed to read CRAM record reference id: {err}")) - })?) + .entry(record.reference_sequence_id(&header).transpose().map_err( + |err| { + RuntimeError::Io(format!( + "failed to read CRAM record reference id: {err}" + )) + }, + )?) .or_default(); let alignment_start = record.alignment_start().transpose().map_err(|err| { - RuntimeError::Io(format!("failed to read CRAM record alignment start: {err}")) + RuntimeError::Io(format!( + "failed to read CRAM record alignment start: {err}" + )) })?; let alignment_end = record.alignment_end().transpose().map_err(|err| { RuntimeError::Io(format!("failed to read CRAM record alignment end: {err}")) @@ -307,11 +315,10 @@ where let header = reader .read_header() .map_err(|err| RuntimeError::Io(format!("failed to read BAM header: {err}")))?; - if !header + if header .header() .and_then(|hdr| hdr.other_fields().get(&SORT_ORDER)) - .map(|sort_order| sort_order == COORDINATE) - .unwrap_or_default() + .is_none_or(|sort_order| sort_order != COORDINATE) { return Err(RuntimeError::Io( "BAM must be coordinate-sorted (SO:coordinate) before indexing".to_owned(), @@ -330,18 +337,15 @@ where let end_position = reader.get_ref().virtual_position(); let chunk = Chunk::new(start_position, end_position); let alignment_context = match ( - record - .reference_sequence_id() - .transpose() - .map_err(|err| RuntimeError::Io(format!("failed to read BAM reference id: {err}")))?, - record - .alignment_start() - .transpose() - .map_err(|err| RuntimeError::Io(format!("failed to read BAM alignment start: {err}")))?, - record - .alignment_end() - .transpose() - .map_err(|err| RuntimeError::Io(format!("failed to read BAM alignment end: {err}")))?, + record.reference_sequence_id().transpose().map_err(|err| { + RuntimeError::Io(format!("failed to read BAM reference id: {err}")) + })?, + record.alignment_start().transpose().map_err(|err| { + RuntimeError::Io(format!("failed to read BAM alignment start: {err}")) + })?, + record.alignment_end().transpose().map_err(|err| { + RuntimeError::Io(format!("failed to read BAM alignment end: {err}")) + })?, ) { (Some(id), Some(start), Some(end)) => { let flags = record.flags(); @@ -352,7 +356,9 @@ where builder .add_record(alignment_context, chunk) - .map_err(|err| RuntimeError::Io(format!("failed to add BAM record to BAI index: {err}")))?; + .map_err(|err| { + RuntimeError::Io(format!("failed to add BAM record to BAI index: {err}")) + })?; start_position = end_position; } diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index 26898a4..ff58d03 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -7,8 +7,6 @@ use std::{ use zip::ZipArchive; -#[cfg(test)] -use bioscript_core::{Assembly, GenomicLocus, VariantKind}; use bioscript_core::{RuntimeError, VariantObservation, VariantSpec}; mod backends; @@ -21,33 +19,14 @@ mod types; mod vcf; mod vcf_tokens; -#[cfg(test)] -use common::chrom_sort_key; pub(crate) use common::{describe_query, normalize_genotype, variant_sort_key}; -#[cfg(test)] -use cram_backend::{ - SnpPileupCounts, anchor_window, choose_variant_locus, classify_expected_indel, - describe_copy_number_decision_rule, describe_locus, describe_snp_decision_rule, - detect_reference_assembly, first_base, indel_at_anchor, infer_copy_number_genotype, - infer_snp_genotype, len_as_i64, normalize_pileup_base, record_overlaps_locus, spans_position, -}; pub use cram_backend::{ - observe_cram_deletion_with_reader, observe_cram_indel_with_reader, - observe_cram_snp_with_reader, + observe_cram_deletion_with_reader, observe_cram_indel_with_reader, observe_cram_snp_with_reader, }; pub(crate) use delimited::{ COMMENT_PREFIXES, DelimitedColumnIndexes, Delimiter, detect_delimiter, parse_streaming_row, }; -#[cfg(test)] -use delimited::{GENOTYPE_ALIASES, split_csv_line, strip_bom, strip_inline_comment}; use delimited::{RowParser, scan_delimited_variants}; -#[cfg(test)] -use delimited::{ - build_column_indexes, default_column_indexes, find_header_index, looks_like_header_fields, - normalize_name, -}; -#[cfg(test)] -use io::looks_like_vcf_lines; use io::{detect_source_format, is_bgzf_path, read_lines_from_reader, select_zip_entry}; pub use types::{ BackendCapabilities, GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, QueryKind, @@ -57,17 +36,8 @@ pub use vcf::{ choose_variant_locus_for_assembly, imputed_reference_observation, observe_vcf_snp_with_reader, observe_vcf_variant_with_reader, }; -#[cfg(test)] -use vcf::{ - detect_vcf_assembly, extract_vcf_sample_genotype, normalize_chromosome_name, parse_vcf_record, - vcf_row_matches_variant, -}; use vcf::{lookup_indexed_vcf_variants, scan_vcf_variants}; pub(crate) use vcf_tokens::genotype_from_vcf_gt; -#[cfg(test)] -use vcf_tokens::{ - is_symbolic_vcf_alt, normalize_sequence_token, vcf_alt_token, vcf_reference_token, -}; impl GenotypeStore { pub fn from_file(path: &Path) -> Result { @@ -525,12 +495,35 @@ mod tests { time::{SystemTime, UNIX_EPOCH}, }; + use bioscript_core::{Assembly, GenomicLocus, VariantKind}; use noodles::bgzf; use noodles::csi; use zip::write::SimpleFileOptions; use crate::alignment::{AlignmentOp, AlignmentOpKind, AlignmentRecord}; - use crate::genotype::{io::read_plain_lines, vcf::detect_vcf_assembly_from_path}; + use crate::genotype::{ + common::chrom_sort_key, + cram_backend::{ + SnpPileupCounts, anchor_window, choose_variant_locus, classify_expected_indel, + describe_copy_number_decision_rule, describe_locus, describe_snp_decision_rule, + detect_reference_assembly, first_base, indel_at_anchor, infer_copy_number_genotype, + infer_snp_genotype, len_as_i64, normalize_pileup_base, record_overlaps_locus, + spans_position, + }, + delimited::{ + GENOTYPE_ALIASES, build_column_indexes, default_column_indexes, find_header_index, + looks_like_header_fields, normalize_name, split_csv_line, strip_bom, + strip_inline_comment, + }, + io::{looks_like_vcf_lines, read_plain_lines}, + vcf::{ + detect_vcf_assembly, detect_vcf_assembly_from_path, extract_vcf_sample_genotype, + normalize_chromosome_name, parse_vcf_record, vcf_row_matches_variant, + }, + vcf_tokens::{ + is_symbolic_vcf_alt, normalize_sequence_token, vcf_alt_token, vcf_reference_token, + }, + }; fn temp_dir(label: &str) -> PathBuf { let nanos = SystemTime::now() @@ -680,9 +673,7 @@ mod tests { matched_rsid: None, assembly: Some(Assembly::Grch38), genotype: Some("TT".to_owned()), - evidence: vec![ - "observed SNP pileup at 22:36265988-36265988 ref=T alt=G".to_owned(), - ], + evidence: vec!["observed SNP pileup at 22:36265988-36265988 ref=T alt=G".to_owned()], ..VariantObservation::default() }]; diff --git a/rust/bioscript-formats/src/genotype/backends.rs b/rust/bioscript-formats/src/genotype/backends.rs index d762875..659328d 100644 --- a/rust/bioscript-formats/src/genotype/backends.rs +++ b/rust/bioscript-formats/src/genotype/backends.rs @@ -57,12 +57,17 @@ impl RsidMapBackend { }); } - Ok(VariantObservation { - backend: self.backend_name().to_owned(), - evidence: vec![format!( + let evidence = if variant.has_coordinates() { + format!( "no matching rsid or locus found for {}", describe_query(variant) - )], + ) + } else { + "no matching rsid found".to_owned() + }; + Ok(VariantObservation { + backend: self.backend_name().to_owned(), + evidence: vec![evidence], ..VariantObservation::default() }) } diff --git a/rust/bioscript-formats/src/genotype/cram_backend.rs b/rust/bioscript-formats/src/genotype/cram_backend.rs index 1d9f082..228926d 100644 --- a/rust/bioscript-formats/src/genotype/cram_backend.rs +++ b/rust/bioscript-formats/src/genotype/cram_backend.rs @@ -25,8 +25,7 @@ pub(crate) use indel::{ classify_expected_indel, indel_at_anchor, record_overlaps_locus, spans_position, }; pub use reader::{ - observe_cram_deletion_with_reader, observe_cram_indel_with_reader, - observe_cram_snp_with_reader, + observe_cram_deletion_with_reader, observe_cram_indel_with_reader, observe_cram_snp_with_reader, }; const DEFAULT_MPILEUP_MIN_BASE_QUALITY: u8 = 13; diff --git a/rust/bioscript-formats/src/genotype/loaders.rs b/rust/bioscript-formats/src/genotype/loaders.rs index 2d9d000..079a069 100644 --- a/rust/bioscript-formats/src/genotype/loaders.rs +++ b/rust/bioscript-formats/src/genotype/loaders.rs @@ -68,7 +68,13 @@ pub(crate) fn from_delimited_reader( let mut locus_values = HashMap::new(); let mut source_lines = HashMap::new(); for line in prelude { - consume_delimited_line(&mut parser, &line, &mut values, &mut locus_values, &mut source_lines)?; + consume_delimited_line( + &mut parser, + &line, + &mut values, + &mut locus_values, + &mut source_lines, + )?; } loop { buf.clear(); @@ -146,7 +152,10 @@ fn consume_delimited_line( let source_line = sanitize_evidence_line(line); if let (Some(chrom), Some(position)) = (row.chrom.as_ref(), row.position) { locus_values.insert( - (chrom.trim_start_matches("chr").to_ascii_lowercase(), position), + ( + chrom.trim_start_matches("chr").to_ascii_lowercase(), + position, + ), (row.genotype.clone(), row.rsid.clone(), source_line.clone()), ); } diff --git a/rust/bioscript-formats/src/genotype/vcf.rs b/rust/bioscript-formats/src/genotype/vcf.rs index a3fab85..599b1d9 100644 --- a/rust/bioscript-formats/src/genotype/vcf.rs +++ b/rust/bioscript-formats/src/genotype/vcf.rs @@ -238,8 +238,7 @@ pub(crate) fn lookup_indexed_vcf_variants( && !observation.evidence.iter().any(|line| { line.contains("tabix index has no contig") || line.contains("has no GRCh37/GRCh38 locus") - }) - { + }) { imputed_reference_observation( backend.backend_name(), &backend.path.display().to_string(), diff --git a/rust/bioscript-reporting/Cargo.toml b/rust/bioscript-reporting/Cargo.toml new file mode 100644 index 0000000..cea0ab4 --- /dev/null +++ b/rust/bioscript-reporting/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "bioscript-reporting" +version = "0.2.0" +edition = "2024" + +[dependencies] +bioscript-core = { path = "../bioscript-core" } +bioscript-formats = { path = "../bioscript-formats" } +bioscript-schema = { path = "../bioscript-schema" } +serde_json = { version = "1", features = ["preserve_order"] } +serde_yaml = "0.9" + +[lints.clippy] +pedantic = { level = "warn", priority = -1 } diff --git a/rust/bioscript-reporting/src/analysis.rs b/rust/bioscript-reporting/src/analysis.rs new file mode 100644 index 0000000..664e449 --- /dev/null +++ b/rust/bioscript-reporting/src/analysis.rs @@ -0,0 +1,409 @@ +use std::path::Path; + +use bioscript_core::{Assembly, VariantObservation}; +use bioscript_schema::PanelInterpretation; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct AnalysisOutputFormat { + pub format: &'static str, + pub extension: &'static str, +} + +pub fn analysis_output_format(value: Option<&str>) -> Result { + match value.unwrap_or("json").to_ascii_lowercase().as_str() { + "tsv" => Ok(AnalysisOutputFormat { + format: "tsv", + extension: "tsv", + }), + "json" => Ok(AnalysisOutputFormat { + format: "json", + extension: "json", + }), + "jsonl" => Ok(AnalysisOutputFormat { + format: "jsonl", + extension: "jsonl", + }), + other => Err(format!("unsupported analysis output_format '{other}'")), + } +} + +pub fn parse_analysis_output_text( + text: &str, + format: &str, +) -> Result<(Vec, Vec), String> { + match format { + "tsv" => Ok(parse_analysis_tsv(text)), + "json" => { + let value: serde_json::Value = serde_json::from_str(text) + .map_err(|err| format!("failed to parse analysis JSON: {err}"))?; + let rows = match value { + serde_json::Value::Array(rows) => rows, + serde_json::Value::Object(mut object) => object + .remove("rows") + .and_then(|rows| rows.as_array().cloned()) + .unwrap_or_else(|| vec![serde_json::Value::Object(object)]), + other => vec![other], + }; + let headers = analysis_headers_from_rows(&rows); + Ok((rows, headers)) + } + "jsonl" => { + let rows = text + .lines() + .filter(|line| !line.trim().is_empty()) + .map(|line| serde_json::from_str(line).map_err(|err| err.to_string())) + .collect::, _>>()?; + let headers = analysis_headers_from_rows(&rows); + Ok((rows, headers)) + } + other => Err(format!("unsupported analysis output_format '{other}'")), + } +} + +pub fn validate_bioscript_interpretation( + interpretation: &PanelInterpretation, +) -> Result<(), String> { + if interpretation.kind == "bioscript" { + return Ok(()); + } + Err(format!( + "analysis '{}' uses unsupported kind '{}'", + interpretation.id, interpretation.kind + )) +} + +pub fn analysis_output_relative_file( + participant_id: &str, + interpretation_id: &str, + extension: &str, +) -> String { + format!("analysis/{participant_id}/{interpretation_id}.{extension}") +} + +pub fn analysis_observations_relative_file( + participant_id: &str, + interpretation_id: &str, +) -> String { + format!("analysis/{participant_id}/{interpretation_id}.observations.tsv") +} + +pub fn render_analysis_observations_tsv(observations: &[VariantObservation]) -> String { + let headers = [ + "matched_rsid", + "assembly", + "genotype", + "ref_count", + "alt_count", + "depth", + "evidence", + ]; + let mut out = headers.join("\t"); + out.push('\n'); + for observation in observations { + let assembly = observation + .assembly + .map(|value| match value { + Assembly::Grch37 => "grch37", + Assembly::Grch38 => "grch38", + }) + .unwrap_or_default(); + let values = [ + observation.matched_rsid.clone().unwrap_or_default(), + assembly.to_owned(), + observation.genotype.clone().unwrap_or_default(), + observation + .ref_count + .map_or_else(String::new, |value| value.to_string()), + observation + .alt_count + .map_or_else(String::new, |value| value.to_string()), + observation + .depth + .map_or_else(String::new, |value| value.to_string()), + observation.evidence.join(" | "), + ]; + out.push_str(&values.join("\t")); + out.push('\n'); + } + out +} + +pub struct AnalysisOutputJsonInput<'a> { + pub participant_id: &'a str, + pub assay_id: &'a str, + pub interpretation: &'a PanelInterpretation, + pub output_format: &'a str, + pub manifest_path: &'a str, + pub script_path: &'a str, + pub output_file: &'a str, + pub observations_file: Option<&'a str>, + pub row_headers: Vec, + pub rows: Vec, +} + +pub fn analysis_output_json(input: AnalysisOutputJsonInput<'_>) -> serde_json::Value { + let AnalysisOutputJsonInput { + participant_id, + assay_id, + interpretation, + output_format, + manifest_path, + script_path, + output_file, + observations_file, + row_headers, + rows, + } = input; + + serde_json::json!({ + "schema": "bioscript:analysis-output:1.0", + "version": "1.0", + "participant_id": participant_id, + "assay_id": assay_id, + "analysis_id": interpretation.id, + "analysis_label": interpretation.label, + "kind": interpretation.kind, + "output_format": output_format, + "manifest_path": manifest_path, + "script_path": script_path, + "output_file": output_file, + "observations_file": observations_file, + "derived_from": interpretation.derived_from, + "assets": [], + "emits": interpretation.emits.iter().map(|emit| serde_json::json!({ + "key": emit.key, + "label": emit.label, + "value_type": emit.value_type, + "format": emit.format, + })).collect::>(), + "logic": interpretation.logic.as_ref().map(|logic| serde_json::json!({ + "description": logic.description, + "source": logic.source.as_ref().map(|source| serde_json::json!({ + "name": source.name, + "url": source.url, + })), + })), + "row_headers": row_headers, + "rows": rows, + }) +} + +pub fn participant_id_from_path(path: &Path) -> String { + let file_name = path + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or("participant"); + participant_id_from_name(file_name) +} + +pub fn participant_id_from_name(file_name: &str) -> String { + file_name + .trim_end_matches(".txt.zip") + .trim_end_matches(".csv.zip") + .trim_end_matches(".vcf.gz") + .trim_end_matches(".cram") + .trim_end_matches(".zip") + .trim_end_matches(".txt") + .trim_end_matches(".csv") + .to_owned() +} + +fn parse_analysis_tsv(text: &str) -> (Vec, Vec) { + let mut lines = text.lines().filter(|line| !line.trim().is_empty()); + let Some(header_line) = lines.next() else { + return (Vec::new(), Vec::new()); + }; + let headers: Vec<&str> = header_line.split('\t').collect(); + let mut rows = Vec::new(); + for line in lines { + let values: Vec<&str> = line.split('\t').collect(); + let mut object = serde_json::Map::new(); + for (idx, header) in headers.iter().enumerate() { + object.insert( + (*header).to_owned(), + serde_json::Value::String(values.get(idx).copied().unwrap_or_default().to_owned()), + ); + } + rows.push(serde_json::Value::Object(object)); + } + ( + rows, + headers.iter().map(|header| (*header).to_owned()).collect(), + ) +} + +fn analysis_headers_from_rows(rows: &[serde_json::Value]) -> Vec { + let mut headers = Vec::new(); + for row in rows { + let Some(object) = row.as_object() else { + continue; + }; + for key in object.keys() { + if !headers.contains(key) { + headers.push(key.clone()); + } + } + } + headers +} + +#[cfg(test)] +mod tests { + use super::{ + AnalysisOutputFormat, AnalysisOutputJsonInput, analysis_observations_relative_file, + analysis_output_format, analysis_output_json, analysis_output_relative_file, + parse_analysis_output_text, participant_id_from_name, render_analysis_observations_tsv, + validate_bioscript_interpretation, + }; + use bioscript_core::{Assembly, VariantObservation}; + use bioscript_schema::{ + PanelInterpretation, PanelInterpretationLogic, PanelInterpretationLogicSource, + }; + + #[test] + fn analysis_output_format_defaults_lowercases_and_rejects_unknown() { + assert_eq!( + analysis_output_format(None).unwrap(), + AnalysisOutputFormat { + format: "json", + extension: "json" + } + ); + assert_eq!( + analysis_output_format(Some("TSV")).unwrap(), + AnalysisOutputFormat { + format: "tsv", + extension: "tsv" + } + ); + assert_eq!( + analysis_output_format(Some("xml")).unwrap_err(), + "unsupported analysis output_format 'xml'" + ); + } + + #[test] + fn participant_id_suffix_stripping_matches_cli_report_path() { + assert_eq!( + participant_id_from_name("NA06985.clean.vcf.gz"), + "NA06985.clean" + ); + assert_eq!( + participant_id_from_name("genome_hu50B3F5_v5_Full.zip"), + "genome_hu50B3F5_v5_Full" + ); + assert_eq!(participant_id_from_name("sample.cram"), "sample"); + assert_eq!(participant_id_from_name("sample name.txt"), "sample name"); + } + + #[test] + fn parses_tsv_analysis_output_with_missing_cells() { + let (rows, headers) = + parse_analysis_output_text("status\tnote\nnormal\nvariant\tflag\n", "tsv").unwrap(); + assert_eq!(headers, vec!["status", "note"]); + assert_eq!(rows[0]["status"], "normal"); + assert_eq!(rows[0]["note"], ""); + assert_eq!(rows[1]["status"], "variant"); + assert_eq!(rows[1]["note"], "flag"); + } + + #[test] + fn json_headers_include_keys_from_all_rows() { + let (rows, headers) = + parse_analysis_output_text(r#"[{"a":1},{"b":2,"a":3}]"#, "json").unwrap(); + assert_eq!(rows.len(), 2); + assert_eq!(headers, vec!["a", "b"]); + } + + #[test] + fn interpretation_validation_and_output_file_naming_are_shared() { + let mut interpretation = PanelInterpretation { + id: "apoe_epsilon".to_owned(), + label: None, + kind: "bioscript".to_owned(), + path: "apoe.py".to_owned(), + output_format: None, + derived_from: Vec::new(), + emits: Vec::new(), + logic: None, + }; + + validate_bioscript_interpretation(&interpretation).unwrap(); + assert_eq!( + analysis_output_relative_file("sample", &interpretation.id, "tsv"), + "analysis/sample/apoe_epsilon.tsv" + ); + assert_eq!( + analysis_observations_relative_file("sample", &interpretation.id), + "analysis/sample/apoe_epsilon.observations.tsv" + ); + + interpretation.kind = "shell".to_owned(); + assert_eq!( + validate_bioscript_interpretation(&interpretation).unwrap_err(), + "analysis 'apoe_epsilon' uses unsupported kind 'shell'" + ); + } + + #[test] + fn analysis_output_json_uses_shared_report_shape() { + let interpretation = PanelInterpretation { + id: "apoe_epsilon".to_owned(), + label: Some("APOE epsilon".to_owned()), + kind: "bioscript".to_owned(), + path: "apoe.py".to_owned(), + output_format: Some("tsv".to_owned()), + derived_from: vec!["rs429358.yaml".to_owned()], + emits: Vec::new(), + logic: Some(PanelInterpretationLogic { + description: Some("APOE logic".to_owned()), + source: Some(PanelInterpretationLogicSource { + name: Some("ClinPGx".to_owned()), + url: Some("https://example.test".to_owned()), + }), + }), + }; + let value = analysis_output_json(AnalysisOutputJsonInput { + participant_id: "sample", + assay_id: "pgx", + interpretation: &interpretation, + output_format: "tsv", + manifest_path: "assets/APOE/assay.yaml", + script_path: "assets/APOE/apoe.py", + output_file: "analysis/sample/apoe_epsilon.tsv", + observations_file: Some("analysis/sample/apoe_epsilon.observations.tsv"), + row_headers: vec!["apoe_status".to_owned()], + rows: vec![serde_json::json!({"apoe_status": "e3/e3"})], + }); + + assert_eq!(value["schema"], "bioscript:analysis-output:1.0"); + assert_eq!(value["analysis_id"], "apoe_epsilon"); + assert_eq!(value["output_format"], "tsv"); + assert_eq!( + value["observations_file"], + "analysis/sample/apoe_epsilon.observations.tsv" + ); + assert_eq!(value["derived_from"][0], "rs429358.yaml"); + assert_eq!(value["assets"].as_array().unwrap().len(), 0); + assert_eq!(value["emits"].as_array().unwrap().len(), 0); + assert_eq!(value["logic"]["source"]["name"], "ClinPGx"); + assert_eq!(value["rows"][0]["apoe_status"], "e3/e3"); + } + + #[test] + fn renders_analysis_observations_for_runtime_scripts() { + let text = render_analysis_observations_tsv(&[VariantObservation { + matched_rsid: Some("rs1".to_owned()), + assembly: Some(Assembly::Grch38), + genotype: Some("AG".to_owned()), + ref_count: Some(8), + alt_count: Some(4), + depth: Some(12), + evidence: vec!["source".to_owned()], + ..VariantObservation::default() + }]); + + assert!(text.starts_with("matched_rsid\tassembly\tgenotype\t")); + assert!(text.contains("rs1\tgrch38\tAG\t8\t4\t12\tsource\n")); + } +} diff --git a/rust/bioscript-reporting/src/artifacts.rs b/rust/bioscript-reporting/src/artifacts.rs new file mode 100644 index 0000000..1590cba --- /dev/null +++ b/rust/bioscript-reporting/src/artifacts.rs @@ -0,0 +1,65 @@ +pub struct ReportArtifactTexts { + pub observations_tsv: String, + pub analysis_jsonl: String, + pub reports_jsonl: String, + pub html: String, + pub text_output: String, +} + +pub fn render_input_report_artifact_texts( + input: crate::AppInputReportInput<'_>, +) -> Result { + let report = crate::app_input_report_json(input); + render_report_artifact_texts(input.observations, input.analyses, &[report]) +} + +pub fn render_report_artifact_texts( + observations: &[serde_json::Value], + analyses: &[serde_json::Value], + reports: &[serde_json::Value], +) -> Result { + Ok(ReportArtifactTexts { + observations_tsv: render_observations_tsv(observations), + analysis_jsonl: render_jsonl(analyses)?, + reports_jsonl: render_jsonl(reports)?, + html: crate::render_app_html_document(observations, reports)?, + text_output: standard_text_output(), + }) +} + +pub fn standard_text_output() -> String { + "observations: observations.tsv\nanalysis: analysis.jsonl\nreports: reports.jsonl\nhtml: index.html\n" + .to_owned() +} + +pub fn render_observations_tsv(observations: &[serde_json::Value]) -> String { + let mut out = bioscript_core::OBSERVATION_TSV_HEADERS.join("\t"); + out.push('\n'); + for observation in observations { + let line = bioscript_core::OBSERVATION_TSV_HEADERS + .iter() + .map(|header| json_field_as_tsv(observation.get(*header))) + .collect::>() + .join("\t"); + out.push_str(&line); + out.push('\n'); + } + out +} + +pub fn render_jsonl(rows: &[serde_json::Value]) -> Result { + let mut out = String::new(); + for row in rows { + out.push_str(&serde_json::to_string(row).map_err(|err| err.to_string())?); + out.push('\n'); + } + Ok(out) +} + +pub fn json_field_as_tsv(value: Option<&serde_json::Value>) -> String { + match value { + Some(serde_json::Value::Null) | None => String::new(), + Some(serde_json::Value::String(value)) => value.replace(['\t', '\n'], " "), + Some(value) => value.to_string().replace(['\t', '\n'], " "), + } +} diff --git a/rust/bioscript-wasm/src/report_render.rs b/rust/bioscript-reporting/src/html.rs similarity index 61% rename from rust/bioscript-wasm/src/report_render.rs rename to rust/bioscript-reporting/src/html.rs index 9d98988..608f1b3 100644 --- a/rust/bioscript-wasm/src/report_render.rs +++ b/rust/bioscript-reporting/src/html.rs @@ -1,125 +1,33 @@ -use super::*; +use std::fmt::Write as _; -#[derive(Clone, Copy)] -pub(super) struct AppReportJsonInput<'a> { - pub(super) assay_id: &'a str, - pub(super) participant_id: &'a str, - pub(super) input_file_name: &'a str, - pub(super) observations: &'a [serde_json::Value], - pub(super) analyses: &'a [serde_json::Value], - pub(super) findings: &'a [serde_json::Value], - pub(super) provenance: &'a [serde_json::Value], - pub(super) input_inspection: Option<&'a bioscript_formats::FileInspection>, - pub(super) manifest_metadata: &'a serde_json::Value, -} - -pub(super) fn app_report_json(input: AppReportJsonInput<'_>) -> serde_json::Value { - let called = input - .observations - .iter() - .filter(|item| { - item.get("call_status").and_then(serde_json::Value::as_str) == Some("called") - }) - .count(); - serde_json::json!({ - "schema": "bioscript:report:1.0", - "version": "1.0", - "participant_id": input.participant_id, - "assay_id": input.assay_id, - "assay_version": "1.0", - "manifest": input.manifest_metadata, - "input": { - "file_name": input.input_file_name, - "file_path": input.input_file_name, - "debug": input.input_inspection.map(input_inspection_json), - }, - "report_status": if called == input.observations.len() { "complete" } else { "partial" }, - "derived_from": input.observations.iter().filter_map(|item| item.get("variant_key").cloned()).collect::>(), - "analyses": input.analyses, - "findings": input.findings, - "provenance": input.provenance, - "metrics": { - "n_sites_tested": input.observations.len(), - "n_sites_called": called, - "n_sites_missing": input.observations.len().saturating_sub(called), - "n_analyses": input.analyses.len(), - "n_findings_matched": input.findings.len(), - } - }) -} +mod analysis; +mod helpers; +mod observations; +mod pgx; +mod provenance; +mod sections; -pub(super) fn match_app_findings( - findings: &[serde_json::Value], - observations: &[serde_json::Value], - analyses: &[serde_json::Value], -) -> Vec { - let mut matched = Vec::new(); - let mut seen = std::collections::BTreeSet::new(); - for finding in findings { - if let Some(effects) = finding.get("effects").and_then(serde_json::Value::as_array) { - for effect in effects { - if let Some(observation) = app_finding_match_observation(effect, observations) { - let mut item = finding.clone(); - if let Some(object) = item.as_object_mut() { - object.remove("effects"); - object.insert("matched".to_owned(), serde_json::Value::Bool(true)); - object.insert("matched_effect".to_owned(), effect.clone()); - object.insert( - "matched_observation".to_owned(), - app_finding_observation_context(observation), - ); - } - if seen.insert(app_finding_dedupe_key(&item)) { - matched.push(item); - } - } else if let Some(analysis) = app_finding_match_analysis(effect, analyses) { - let mut item = finding.clone(); - if let Some(object) = item.as_object_mut() { - object.remove("effects"); - object.insert("matched".to_owned(), serde_json::Value::Bool(true)); - object.insert("matched_effect".to_owned(), effect.clone()); - object.insert("matched_analysis".to_owned(), analysis); - } - if seen.insert(app_finding_dedupe_key(&item)) { - matched.push(item); - } - } - } - } else if let Some(observation) = app_finding_match_observation(finding, observations) { - let mut item = finding.clone(); - if let Some(object) = item.as_object_mut() { - object.insert("matched".to_owned(), serde_json::Value::Bool(true)); - object.insert( - "matched_observation".to_owned(), - app_finding_observation_context(observation), - ); - } - if seen.insert(app_finding_dedupe_key(&item)) { - matched.push(item); - } - } else if let Some(analysis) = app_finding_match_analysis(finding, analyses) { - let mut item = finding.clone(); - if let Some(object) = item.as_object_mut() { - object.insert("matched".to_owned(), serde_json::Value::Bool(true)); - object.insert("matched_analysis".to_owned(), analysis); - } - if seen.insert(app_finding_dedupe_key(&item)) { - matched.push(item); - } - } - } - matched -} +use analysis::render_analysis_tables; +use helpers::html_escape; +use observations::render_observation_table; +use pgx::render_pgx_table; +use provenance::render_provenance_links; +use sections::{ + collect_report_analyses, collect_report_findings, collect_report_participants, + render_input_debug, render_participant_filter, render_report_manifest_header, + render_report_source_section, +}; -pub(super) fn render_app_html_document( +pub fn render_app_html_document( observations: &[serde_json::Value], reports: &[serde_json::Value], -) -> Result { +) -> Result { let mut out = String::from( r##"BioScript report
"##, ); let label_findings = collect_report_findings(reports, "bioscript:pgx-label:1.0"); let summary_findings = collect_report_findings(reports, "bioscript:pgx-summary:1.0"); + let has_pgx_findings = !label_findings.is_empty() || !summary_findings.is_empty(); let analysis_outputs = collect_report_analyses(reports); let participants = collect_report_participants(reports); render_report_manifest_header(&mut out, reports); @@ -132,28 +40,39 @@ pub(super) fn render_app_html_document( summary_findings.len() ); render_participant_filter(&mut out, &participants); - out.push_str(""); + out.push_str(""); out.push_str("

Input

"); render_input_debug(&mut out, reports, participants.len() > 1); - out.push_str("

Observations

"); + out.push_str("
"); + out.push_str("

Observations

"); render_observation_table(&mut out, observations, participants.len() > 1); - out.push_str("

Analysis

"); + out.push_str("
"); + out.push_str("

Analysis

"); render_analysis_tables( &mut out, &analysis_outputs, observations, participants.len() > 1, ); - out.push_str("

PGx

"); - render_pgx_table(&mut out, &label_findings, &summary_findings); - out.push_str("

Provenance

"); + out.push_str("
"); + if has_pgx_findings { + out.push_str("

PGx

"); + render_pgx_table(&mut out, &label_findings, &summary_findings); + out.push_str("
"); + } + out.push_str("

Provenance

"); render_provenance_links(&mut out, reports); - out.push_str("

Source

"); + out.push_str("
"); + out.push_str("

Source

"); render_report_source_section(&mut out, reports); - out.push_str("

Raw Reports JSON

Show raw report JSON"); + out.push_str("
"); + out.push_str("

Raw Reports JSON

Show raw report JSON"); for report in reports { - let text = - serde_json::to_string_pretty(report).map_err(|err| JsError::new(&err.to_string()))?; + let text = serde_json::to_string_pretty(report).map_err(|err| err.to_string())?; let _ = write!(out, "
{}
", html_escape(&text)); } out.push_str("
"); diff --git a/rust/bioscript-cli/src/report_html_analysis.rs b/rust/bioscript-reporting/src/html/analysis.rs similarity index 88% rename from rust/bioscript-cli/src/report_html_analysis.rs rename to rust/bioscript-reporting/src/html/analysis.rs index 573b77b..171dcca 100644 --- a/rust/bioscript-cli/src/report_html_analysis.rs +++ b/rust/bioscript-reporting/src/html/analysis.rs @@ -1,4 +1,9 @@ -fn render_analysis_tables( +use super::helpers::{ + html_escape, json_field_as_tsv, render_table_end, render_table_start, table_cell, + table_header_label, value_str, +}; +use std::fmt::Write as _; +pub(super) fn render_analysis_tables( out: &mut String, analyses: &[serde_json::Value], observations: &[serde_json::Value], @@ -65,7 +70,7 @@ fn render_analysis_tables( } } -fn analysis_title(analysis: &serde_json::Value) -> String { +pub(super) fn analysis_title(analysis: &serde_json::Value) -> String { let label = value_str(analysis, "analysis_label"); if label.is_empty() { value_str(analysis, "analysis_id").to_owned() @@ -74,7 +79,7 @@ fn analysis_title(analysis: &serde_json::Value) -> String { } } -fn analysis_row_headers( +pub(super) fn analysis_row_headers( analysis: &serde_json::Value, rows: &[serde_json::Value], show_participant_id: bool, @@ -122,11 +127,11 @@ fn analysis_row_headers( headers } -fn should_show_analysis_header(key: &str, show_participant_id: bool) -> bool { +pub(super) fn should_show_analysis_header(key: &str, show_participant_id: bool) -> bool { (show_participant_id || key != "participant_id") && key != "notes" && key != "report_notes" } -fn analysis_notes(rows: &[serde_json::Value]) -> Vec { +pub(super) fn analysis_notes(rows: &[serde_json::Value]) -> Vec { let mut notes = Vec::new(); for row in rows { if let Some(note) = row @@ -142,7 +147,7 @@ fn analysis_notes(rows: &[serde_json::Value]) -> Vec { notes } -fn render_analysis_key_values( +pub(super) fn render_analysis_key_values( out: &mut String, analysis: &serde_json::Value, row: &serde_json::Value, @@ -161,7 +166,7 @@ fn render_analysis_key_values( out.push_str(""); } -fn analysis_header_label(analysis: &serde_json::Value, key: &str) -> String { +pub(super) fn analysis_header_label(analysis: &serde_json::Value, key: &str) -> String { analysis .get("emits") .and_then(serde_json::Value::as_array) @@ -179,7 +184,7 @@ fn analysis_header_label(analysis: &serde_json::Value, key: &str) -> String { .unwrap_or_else(|| table_header_label(key)) } -fn render_analysis_value(key: &str, value: &str) -> String { +pub(super) fn render_analysis_value(key: &str, value: &str) -> String { if value.starts_with("http://") || value.starts_with("https://") { return format!( "Source", @@ -197,11 +202,11 @@ fn render_analysis_value(key: &str, value: &str) -> String { } } -fn is_analysis_badge_key(key: &str) -> bool { +pub(super) fn is_analysis_badge_key(key: &str) -> bool { key.ends_with("_status") || key.ends_with("_outcome") } -fn analysis_badge_class(value: &str) -> &'static str { +pub(super) fn analysis_badge_class(value: &str) -> &'static str { match value { "normal" | "reference" => "analysis-badge-normal", "variant" => "analysis-badge-variant", @@ -210,7 +215,7 @@ fn analysis_badge_class(value: &str) -> &'static str { } } -fn render_analysis_notes(out: &mut String, notes: &[String]) { +pub(super) fn render_analysis_notes(out: &mut String, notes: &[String]) { if notes.is_empty() { return; } @@ -222,14 +227,14 @@ fn render_analysis_notes(out: &mut String, notes: &[String]) { out.push_str(""); } -fn render_weak_indel_analysis_note(out: &mut String, weak_indel_dependency: bool) { +pub(super) fn render_weak_indel_analysis_note(out: &mut String, weak_indel_dependency: bool) { if !weak_indel_dependency { return; } out.push_str("

Notes

* Result depends on a weak indel match from a consumer genotype file.

"); } -fn analysis_depends_on_weak_observation( +pub(super) fn analysis_depends_on_weak_observation( analysis: &serde_json::Value, observations: &[serde_json::Value], ) -> bool { @@ -258,14 +263,14 @@ fn analysis_depends_on_weak_observation( }) } -fn analysis_observation_is_weak_indel_match(observation: &serde_json::Value) -> bool { +pub(super) fn analysis_observation_is_weak_indel_match(observation: &serde_json::Value) -> bool { observation .get("match_quality") .and_then(serde_json::Value::as_str) == Some("weak") } -fn render_analysis_logic(out: &mut String, analysis: &serde_json::Value) { +pub(super) fn render_analysis_logic(out: &mut String, analysis: &serde_json::Value) { let Some(logic) = analysis.get("logic") else { return; }; diff --git a/rust/bioscript-cli/src/report_html_helpers.rs b/rust/bioscript-reporting/src/html/helpers.rs similarity index 78% rename from rust/bioscript-cli/src/report_html_helpers.rs rename to rust/bioscript-reporting/src/html/helpers.rs index 57bc8df..109dc88 100644 --- a/rust/bioscript-cli/src/report_html_helpers.rs +++ b/rust/bioscript-reporting/src/html/helpers.rs @@ -1,4 +1,5 @@ -fn render_table_start(out: &mut String, table_id: &str, headers: &[&str]) { +use std::fmt::Write as _; +pub(super) fn render_table_start(out: &mut String, table_id: &str, headers: &[&str]) { let escaped_id = html_escape(table_id); let _ = write!( out, @@ -17,7 +18,7 @@ fn render_table_start(out: &mut String, table_id: &str, headers: &[&str]) { out.push_str(""); } -fn table_column_class(header: &str) -> &'static str { +pub(super) fn table_column_class(header: &str) -> &'static str { if is_debug_column(header) { "debug-col" } else { @@ -25,7 +26,7 @@ fn table_column_class(header: &str) -> &'static str { } } -fn is_debug_column(header: &str) -> bool { +pub(super) fn is_debug_column(header: &str) -> bool { matches!( header, "allele_balance" @@ -43,7 +44,7 @@ fn is_debug_column(header: &str) -> bool { ) } -fn table_header_label(header: &str) -> String { +pub(super) fn table_header_label(header: &str) -> String { match header { "participant_id" => "Participant ID".to_owned(), "rsid" => "RSID".to_owned(), @@ -81,15 +82,15 @@ fn table_header_label(header: &str) -> String { } } -fn render_table_end(out: &mut String) { +pub(super) fn render_table_end(out: &mut String) { out.push_str(""); } -fn table_cell(out: &mut String, value: &str) { +pub(super) fn table_cell(out: &mut String, value: &str) { class_cell(out, value, ""); } -fn class_cell(out: &mut String, value: &str, class_name: &str) { +pub(super) fn class_cell(out: &mut String, value: &str, class_name: &str) { if class_name.is_empty() { let _ = write!(out, "{}", html_escape(value)); } else { @@ -102,7 +103,7 @@ fn class_cell(out: &mut String, value: &str, class_name: &str) { } } -fn link_cell(out: &mut String, url: &str) { +pub(super) fn link_cell(out: &mut String, url: &str) { if url.is_empty() { out.push_str(""); } else { @@ -114,14 +115,14 @@ fn link_cell(out: &mut String, url: &str) { } } -fn value_str<'a>(value: &'a serde_json::Value, key: &str) -> &'a str { +pub(super) fn value_str<'a>(value: &'a serde_json::Value, key: &str) -> &'a str { value .get(key) .and_then(serde_json::Value::as_str) .unwrap_or_default() } -fn join_string_array(value: Option<&serde_json::Value>) -> String { +pub(super) fn join_string_array(value: Option<&serde_json::Value>) -> String { value .and_then(serde_json::Value::as_array) .map(|items| { @@ -134,7 +135,7 @@ fn join_string_array(value: Option<&serde_json::Value>) -> String { .unwrap_or_default() } -fn join_drugs(finding: &serde_json::Value) -> String { +pub(super) fn join_drugs(finding: &serde_json::Value) -> String { finding .get("drugs") .and_then(serde_json::Value::as_array) @@ -148,7 +149,15 @@ fn join_drugs(finding: &serde_json::Value) -> String { .unwrap_or_default() } -fn html_escape(value: &str) -> String { +pub(super) fn json_field_as_tsv(value: Option<&serde_json::Value>) -> String { + match value { + Some(serde_json::Value::Null) | None => String::new(), + Some(serde_json::Value::String(value)) => value.replace(['\t', '\n'], " "), + Some(value) => value.to_string().replace(['\t', '\n'], " "), + } +} + +pub(super) fn html_escape(value: &str) -> String { value .replace('&', "&") .replace('<', "<") diff --git a/rust/bioscript-cli/src/report_html_observations.rs b/rust/bioscript-reporting/src/html/observations.rs similarity index 84% rename from rust/bioscript-cli/src/report_html_observations.rs rename to rust/bioscript-reporting/src/html/observations.rs index 500e376..a5cab20 100644 --- a/rust/bioscript-cli/src/report_html_observations.rs +++ b/rust/bioscript-reporting/src/html/observations.rs @@ -1,4 +1,8 @@ -fn render_observation_table( +use super::helpers::{ + class_cell, html_escape, json_field_as_tsv, render_table_start, table_column_class, value_str, +}; +use std::fmt::Write as _; +pub(super) fn render_observation_table( out: &mut String, observations: &[serde_json::Value], show_participant_id: bool, @@ -47,19 +51,23 @@ fn render_observation_table( !json_field_as_tsv(observation.get("match_quality")).is_empty() || !json_field_as_tsv(observation.get("match_notes")).is_empty() }); - let show_imputed_reference_note = observations.iter().any(observation_is_imputed_vcf_reference); + let show_imputed_reference_note = observations + .iter() + .any(observation_is_imputed_vcf_reference); let show_weak_indel_note = observations.iter().any(observation_is_weak_indel_match); let headers = all_headers .iter() .copied() .filter(|header| show_participant_id || *header != "participant_id") .filter(|header| { - show_counts || !matches!(*header, "ref_count" | "alt_count" | "depth" | "allele_balance") + show_counts + || !matches!( + *header, + "ref_count" | "alt_count" | "depth" | "allele_balance" + ) }) .filter(|header| show_genotype_quality || *header != "genotype_quality") - .filter(|header| { - show_match_quality || !matches!(*header, "match_quality" | "match_notes") - }) + .filter(|header| show_match_quality || !matches!(*header, "match_quality" | "match_notes")) .filter(|header| show_facets || *header != "facets") .collect::>(); render_table_start(out, "observations-table", &headers); @@ -85,7 +93,7 @@ fn render_observation_table( } } -fn observation_filter_group(observation: &serde_json::Value) -> &'static str { +pub(super) fn observation_filter_group(observation: &serde_json::Value) -> &'static str { match observation_row_class(observation) { "row-reference" => "reference", "row-missing" => "missing", @@ -93,7 +101,7 @@ fn observation_filter_group(observation: &serde_json::Value) -> &'static str { } } -fn render_observation_filters(out: &mut String) { +pub(super) fn render_observation_filters(out: &mut String) { out.push_str("
Observations:"); for (outcome, label) in [ ("variant", "Show variants"), @@ -110,13 +118,13 @@ fn render_observation_filters(out: &mut String) { out.push_str("
"); } -fn observation_has_quantitative_depth(observation: &serde_json::Value) -> bool { +pub(super) fn observation_has_quantitative_depth(observation: &serde_json::Value) -> bool { ["ref_count", "alt_count", "depth", "allele_balance"] .iter() .any(|key| !json_field_as_tsv(observation.get(*key)).is_empty()) } -fn observation_row_class(observation: &serde_json::Value) -> &'static str { +pub(super) fn observation_row_class(observation: &serde_json::Value) -> &'static str { let outcome = observation .get("outcome") .and_then(serde_json::Value::as_str) @@ -140,7 +148,11 @@ fn observation_row_class(observation: &serde_json::Value) -> &'static str { } } -fn render_observation_cell(out: &mut String, observation: &serde_json::Value, header: &str) { +pub(super) fn render_observation_cell( + out: &mut String, + observation: &serde_json::Value, + header: &str, +) { let cell_class = table_column_class(header); if header == "outcome" { let mut value = json_field_as_tsv(observation.get(header)); @@ -149,7 +161,12 @@ fn render_observation_cell(out: &mut String, observation: &serde_json::Value, he { value.push('*'); } - let _ = write!(out, "{}", cell_class, html_escape(&value)); + let _ = write!( + out, + "{}", + cell_class, + html_escape(&value) + ); return; } if header == "ref_alt" { @@ -166,7 +183,9 @@ fn render_observation_cell(out: &mut String, observation: &serde_json::Value, he return; } if header == "source" { - let source = observation.get("source").unwrap_or(&serde_json::Value::Null); + let source = observation + .get("source") + .unwrap_or(&serde_json::Value::Null); let url = source .get("url") .and_then(serde_json::Value::as_str) @@ -211,7 +230,7 @@ fn render_observation_cell(out: &mut String, observation: &serde_json::Value, he ); } -fn observation_is_imputed_vcf_reference(observation: &serde_json::Value) -> bool { +pub(super) fn observation_is_imputed_vcf_reference(observation: &serde_json::Value) -> bool { observation .get("evidence_raw") .and_then(serde_json::Value::as_str) @@ -220,14 +239,14 @@ fn observation_is_imputed_vcf_reference(observation: &serde_json::Value) -> bool }) } -fn observation_is_weak_indel_match(observation: &serde_json::Value) -> bool { +pub(super) fn observation_is_weak_indel_match(observation: &serde_json::Value) -> bool { observation .get("match_quality") .and_then(serde_json::Value::as_str) == Some("weak") } -fn observation_ref_alt(observation: &serde_json::Value) -> String { +pub(super) fn observation_ref_alt(observation: &serde_json::Value) -> String { let ref_allele = observation .get("ref") .and_then(serde_json::Value::as_str) @@ -243,7 +262,7 @@ fn observation_ref_alt(observation: &serde_json::Value) -> String { } } -fn highlight_allele(value: &str, allele: &str) -> String { +pub(super) fn highlight_allele(value: &str, allele: &str) -> String { if value.is_empty() || allele.is_empty() { return html_escape(value); } diff --git a/rust/bioscript-cli/src/report_html_pgx.rs b/rust/bioscript-reporting/src/html/pgx.rs similarity index 87% rename from rust/bioscript-cli/src/report_html_pgx.rs rename to rust/bioscript-reporting/src/html/pgx.rs index b205973..e23618d 100644 --- a/rust/bioscript-cli/src/report_html_pgx.rs +++ b/rust/bioscript-reporting/src/html/pgx.rs @@ -1,4 +1,10 @@ -fn render_pgx_table( +use super::helpers::{ + class_cell, html_escape, join_drugs, join_string_array, link_cell, render_table_end, + render_table_start, table_cell, value_str, +}; +use super::observations::highlight_allele; +use std::fmt::Write as _; +pub(super) fn render_pgx_table( out: &mut String, label_findings: &[serde_json::Value], summary_findings: &[serde_json::Value], @@ -68,7 +74,7 @@ fn render_pgx_table( out.push_str(""); } -fn render_pgx_row(out: &mut String, finding: &serde_json::Value, show_drug: bool) { +pub(super) fn render_pgx_row(out: &mut String, finding: &serde_json::Value, show_drug: bool) { let source_type = pgx_source_type(finding); let level = pgx_level_value(finding); let level_slug = pgx_level_filter_slug(finding); @@ -97,14 +103,14 @@ fn render_pgx_row(out: &mut String, finding: &serde_json::Value, show_drug: bool out.push_str(""); } -fn pgx_source_type(finding: &serde_json::Value) -> &str { +pub(super) fn pgx_source_type(finding: &serde_json::Value) -> &str { match value_str(finding, "schema") { "bioscript:pgx-label:1.0" => "Drug Label", _ => "Summary", } } -fn pgx_level_value(finding: &serde_json::Value) -> &str { +pub(super) fn pgx_level_value(finding: &serde_json::Value) -> &str { if pgx_source_type(finding) == "Drug Label" { value_str(finding, "pgx_action_level") } else { @@ -112,7 +118,7 @@ fn pgx_level_value(finding: &serde_json::Value) -> &str { } } -fn pgx_level_filter_slug(finding: &serde_json::Value) -> String { +pub(super) fn pgx_level_filter_slug(finding: &serde_json::Value) -> String { if pgx_source_type(finding) == "Drug Label" { format!("drug-{}", pgx_level_slug(pgx_level_value(finding))) } else { @@ -120,7 +126,7 @@ fn pgx_level_filter_slug(finding: &serde_json::Value) -> String { } } -fn pgx_category(finding: &serde_json::Value) -> String { +pub(super) fn pgx_category(finding: &serde_json::Value) -> String { if pgx_source_type(finding) == "Drug Label" { let actions = join_string_array(finding.get("prescribing_actions")); let sources = join_string_array(finding.get("regulatory_sources")); @@ -136,7 +142,7 @@ fn pgx_category(finding: &serde_json::Value) -> String { } } -fn pgx_finding_text(finding: &serde_json::Value) -> String { +pub(super) fn pgx_finding_text(finding: &serde_json::Value) -> String { if pgx_source_type(finding) == "Drug Label" { for key in ["prescribing_information", "summary", "notes", "label"] { let value = value_str(finding, key); @@ -156,7 +162,7 @@ fn pgx_finding_text(finding: &serde_json::Value) -> String { value_str(finding, "notes").to_owned() } -fn pgx_evidence_url(finding: &serde_json::Value) -> &str { +pub(super) fn pgx_evidence_url(finding: &serde_json::Value) -> &str { finding .get("evidence") .and_then(|value| value.get("url")) @@ -164,7 +170,7 @@ fn pgx_evidence_url(finding: &serde_json::Value) -> &str { .unwrap_or_default() } -fn pgx_drug_names(findings: &[&serde_json::Value]) -> Vec { +pub(super) fn pgx_drug_names(findings: &[&serde_json::Value]) -> Vec { let mut drugs = Vec::new(); for finding in findings { for drug in finding_drug_names(finding) { @@ -177,7 +183,7 @@ fn pgx_drug_names(findings: &[&serde_json::Value]) -> Vec { drugs } -fn finding_drug_names(finding: &serde_json::Value) -> Vec { +pub(super) fn finding_drug_names(finding: &serde_json::Value) -> Vec { finding .get("drugs") .and_then(serde_json::Value::as_array) @@ -191,7 +197,7 @@ fn finding_drug_names(finding: &serde_json::Value) -> Vec { .unwrap_or_default() } -fn render_pgx_filters(out: &mut String) { +pub(super) fn render_pgx_filters(out: &mut String) { out.push_str("
Drug Label PGx Level i
"); for (level, label) in [ ("required", "Testing Required"), @@ -241,7 +247,7 @@ fn render_pgx_filters(out: &mut String) { out.push_str("
"); } -fn pgx_any_level_cell(out: &mut String, finding: &serde_json::Value) { +pub(super) fn pgx_any_level_cell(out: &mut String, finding: &serde_json::Value) { let level = pgx_level_value(finding); if pgx_source_type(finding) == "Drug Label" { pgx_level_cell(out, level); @@ -250,7 +256,7 @@ fn pgx_any_level_cell(out: &mut String, finding: &serde_json::Value) { } } -fn pgx_genotype_cell(out: &mut String, finding: &serde_json::Value) { +pub(super) fn pgx_genotype_cell(out: &mut String, finding: &serde_json::Value) { let value = finding .get("matched_observation") .and_then(|observation| observation.get("genotype_display")) @@ -278,7 +284,7 @@ fn pgx_genotype_cell(out: &mut String, finding: &serde_json::Value) { } } -fn finding_participant(finding: &serde_json::Value) -> String { +pub(super) fn finding_participant(finding: &serde_json::Value) -> String { finding .get("matched_observation") .or_else(|| finding.get("matched_analysis")) @@ -288,7 +294,7 @@ fn finding_participant(finding: &serde_json::Value) -> String { .to_owned() } -fn pgx_outcome_filter_slug(finding: &serde_json::Value) -> &'static str { +pub(super) fn pgx_outcome_filter_slug(finding: &serde_json::Value) -> &'static str { match finding .get("matched_observation") .and_then(|observation| observation.get("outcome")) @@ -301,7 +307,7 @@ fn pgx_outcome_filter_slug(finding: &serde_json::Value) -> &'static str { } } -fn finding_rsid(finding: &serde_json::Value) -> String { +pub(super) fn finding_rsid(finding: &serde_json::Value) -> String { finding .get("matched_observation") .and_then(|observation| observation.get("rsid")) @@ -311,7 +317,7 @@ fn finding_rsid(finding: &serde_json::Value) -> String { .to_owned() } -fn finding_gene(finding: &serde_json::Value) -> String { +pub(super) fn finding_gene(finding: &serde_json::Value) -> String { finding .get("matched_observation") .and_then(|observation| observation.get("gene")) @@ -319,16 +325,12 @@ fn finding_gene(finding: &serde_json::Value) -> String { .map(ToOwned::to_owned) .or_else(|| { let genes = join_string_array(finding.get("genes")); - if genes.is_empty() { - None - } else { - Some(genes) - } + if genes.is_empty() { None } else { Some(genes) } }) .unwrap_or_default() } -fn matched_ref_alt(finding: &serde_json::Value) -> String { +pub(super) fn matched_ref_alt(finding: &serde_json::Value) -> String { let Some(observation) = finding.get("matched_observation") else { return String::new(); }; @@ -348,7 +350,7 @@ fn matched_ref_alt(finding: &serde_json::Value) -> String { } } -fn evidence_level_group(level: &str) -> String { +pub(super) fn evidence_level_group(level: &str) -> String { let normalized = level.trim().to_ascii_lowercase(); if normalized.starts_with("1a") { "1a".to_owned() @@ -371,14 +373,14 @@ fn evidence_level_group(level: &str) -> String { } } -fn evidence_level_color_group(level: &str) -> String { +pub(super) fn evidence_level_color_group(level: &str) -> String { level .chars() .find(char::is_ascii_digit) .map_or_else(|| "unknown".to_owned(), |ch| ch.to_string()) } -fn evidence_level_cell(out: &mut String, level: &str) { +pub(super) fn evidence_level_cell(out: &mut String, level: &str) { let display = if level.is_empty() { "Unknown" } else { level }; let group = evidence_level_color_group(display); let _ = write!( @@ -390,7 +392,7 @@ fn evidence_level_cell(out: &mut String, level: &str) { ); } -fn pgx_level_slug(level: &str) -> String { +pub(super) fn pgx_level_slug(level: &str) -> String { let normalized = level.to_ascii_lowercase(); if normalized.contains("required") { "required".to_owned() @@ -409,7 +411,7 @@ fn pgx_level_slug(level: &str) -> String { } } -fn pgx_level_cell(out: &mut String, level: &str) { +pub(super) fn pgx_level_cell(out: &mut String, level: &str) { let display = if level.is_empty() { "Unknown" } else { level }; let slug = pgx_level_slug(display); let _ = write!( @@ -421,7 +423,7 @@ fn pgx_level_cell(out: &mut String, level: &str) { ); } -fn pgx_level_sort_rank(level: &str) -> u8 { +pub(super) fn pgx_level_sort_rank(level: &str) -> u8 { match pgx_level_slug(level).as_str() { "required" => 1, "recommended" => 2, @@ -433,7 +435,7 @@ fn pgx_level_sort_rank(level: &str) -> u8 { } } -fn evidence_level_sort_rank(level: &str) -> u8 { +pub(super) fn evidence_level_sort_rank(level: &str) -> u8 { match evidence_level_group(level).as_str() { "1a" => 11, "1b" => 12, diff --git a/rust/bioscript-cli/src/report_html_provenance.rs b/rust/bioscript-reporting/src/html/provenance.rs similarity index 78% rename from rust/bioscript-cli/src/report_html_provenance.rs rename to rust/bioscript-reporting/src/html/provenance.rs index 36a664d..6c56b09 100644 --- a/rust/bioscript-cli/src/report_html_provenance.rs +++ b/rust/bioscript-reporting/src/html/provenance.rs @@ -1,4 +1,7 @@ -fn render_provenance_links(out: &mut String, reports: &[serde_json::Value]) { +use super::helpers::html_escape; +use std::collections::BTreeMap; +use std::fmt::Write as _; +pub(super) fn render_provenance_links(out: &mut String, reports: &[serde_json::Value]) { let mut links = BTreeMap::::new(); for report in reports { collect_provenance_links_from_value(report, &mut links); @@ -30,7 +33,7 @@ fn render_provenance_links(out: &mut String, reports: &[serde_json::Value]) { out.push_str(""); } -fn group_provenance_links_by_domain( +pub(super) fn group_provenance_links_by_domain( links: BTreeMap, ) -> BTreeMap> { let mut grouped = BTreeMap::>::new(); @@ -43,7 +46,7 @@ fn group_provenance_links_by_domain( grouped } -fn domain_from_url(url: &str) -> Option { +pub(super) fn domain_from_url(url: &str) -> Option { let without_scheme = url.split_once("://")?.1; let host = without_scheme.split(['/', '?', '#']).next()?.trim(); if host.is_empty() { @@ -52,7 +55,17 @@ fn domain_from_url(url: &str) -> Option { Some(host.to_ascii_lowercase()) } } -fn collect_provenance_links_from_value( + +pub(super) fn provenance_value_as_string(value: &serde_json::Value) -> Option { + match value { + serde_json::Value::String(value) => Some(value.clone()), + serde_json::Value::Number(value) => Some(value.to_string()), + serde_json::Value::Bool(value) => Some(value.to_string()), + _ => None, + } +} + +pub(super) fn collect_provenance_links_from_value( value: &serde_json::Value, links: &mut BTreeMap, ) { @@ -65,7 +78,7 @@ fn collect_provenance_links_from_value( .get("name") .or_else(|| object.get("label")) .or_else(|| object.get("source")) - .and_then(value_as_string) + .and_then(provenance_value_as_string) .unwrap_or_default(); links.entry(url.to_owned()).or_insert(label); } @@ -81,4 +94,3 @@ fn collect_provenance_links_from_value( _ => {} } } - diff --git a/rust/bioscript-cli/src/report_html_sections.rs b/rust/bioscript-reporting/src/html/sections.rs similarity index 82% rename from rust/bioscript-cli/src/report_html_sections.rs rename to rust/bioscript-reporting/src/html/sections.rs index 5fd80eb..8d265de 100644 --- a/rust/bioscript-cli/src/report_html_sections.rs +++ b/rust/bioscript-reporting/src/html/sections.rs @@ -1,4 +1,6 @@ -fn collect_report_analyses(reports: &[serde_json::Value]) -> Vec { +use super::helpers::{html_escape, table_cell, value_str}; +use std::fmt::Write as _; +pub(super) fn collect_report_analyses(reports: &[serde_json::Value]) -> Vec { reports .iter() .filter_map(|report| report.get("analyses").and_then(serde_json::Value::as_array)) @@ -7,7 +9,10 @@ fn collect_report_analyses(reports: &[serde_json::Value]) -> Vec Vec { +pub(super) fn collect_report_findings( + reports: &[serde_json::Value], + schema: &str, +) -> Vec { reports .iter() .filter_map(|report| report.get("findings").and_then(serde_json::Value::as_array)) @@ -17,7 +22,7 @@ fn collect_report_findings(reports: &[serde_json::Value], schema: &str) -> Vec Vec { +pub(super) fn collect_report_participants(reports: &[serde_json::Value]) -> Vec { let mut participants = Vec::new(); for report in reports { let participant = value_str(report, "participant_id"); @@ -28,7 +33,7 @@ fn collect_report_participants(reports: &[serde_json::Value]) -> Vec { participants } -fn render_report_manifest_header(out: &mut String, reports: &[serde_json::Value]) { +pub(super) fn render_report_manifest_header(out: &mut String, reports: &[serde_json::Value]) { let manifest = reports .first() .and_then(|report| report.get("manifest")) @@ -45,7 +50,7 @@ fn render_report_manifest_header(out: &mut String, reports: &[serde_json::Value] out.push_str("

Disclaimer: This is not medical or clinical advice, only for research purposes. Always consult a licensed professional to interpret medical information.

This report was generated offline on your system.

For more information see https://app.biovault.net

"); } -fn render_report_source_section(out: &mut String, reports: &[serde_json::Value]) { +pub(super) fn render_report_source_section(out: &mut String, reports: &[serde_json::Value]) { let manifest = reports .first() .and_then(|report| report.get("manifest")) @@ -61,7 +66,7 @@ fn render_report_source_section(out: &mut String, reports: &[serde_json::Value]) render_manifest_members(out, manifest); } -fn report_manifest_kv(out: &mut String, key: &str, value: &str) { +pub(super) fn report_manifest_kv(out: &mut String, key: &str, value: &str) { if value.is_empty() { return; } @@ -73,7 +78,7 @@ fn report_manifest_kv(out: &mut String, key: &str, value: &str) { ); } -fn manifest_tags(manifest: &serde_json::Value) -> String { +pub(super) fn manifest_tags(manifest: &serde_json::Value) -> String { manifest .get("tags") .and_then(serde_json::Value::as_array) @@ -87,8 +92,11 @@ fn manifest_tags(manifest: &serde_json::Value) -> String { .unwrap_or_default() } -fn manifest_member_summary(manifest: &serde_json::Value) -> String { - let Some(members) = manifest.get("members").and_then(serde_json::Value::as_array) else { +pub(super) fn manifest_member_summary(manifest: &serde_json::Value) -> String { + let Some(members) = manifest + .get("members") + .and_then(serde_json::Value::as_array) + else { return String::new(); }; let preview = members @@ -114,8 +122,11 @@ fn manifest_member_summary(manifest: &serde_json::Value) -> String { } } -fn render_manifest_members(out: &mut String, manifest: &serde_json::Value) { - let Some(members) = manifest.get("members").and_then(serde_json::Value::as_array) else { +pub(super) fn render_manifest_members(out: &mut String, manifest: &serde_json::Value) { + let Some(members) = manifest + .get("members") + .and_then(serde_json::Value::as_array) + else { return; }; if members.is_empty() { @@ -132,7 +143,7 @@ fn render_manifest_members(out: &mut String, manifest: &serde_json::Value) { out.push_str(""); } -fn render_participant_filter(out: &mut String, participants: &[String]) { +pub(super) fn render_participant_filter(out: &mut String, participants: &[String]) { if participants.len() <= 1 { return; } @@ -148,7 +159,11 @@ fn render_participant_filter(out: &mut String, participants: &[String]) { out.push_str(""); } -fn render_input_debug(out: &mut String, reports: &[serde_json::Value], show_participant_id: bool) { +pub(super) fn render_input_debug( + out: &mut String, + reports: &[serde_json::Value], + show_participant_id: bool, +) { if reports.is_empty() { out.push_str("

No input metadata.

"); return; @@ -185,7 +200,9 @@ fn render_input_debug(out: &mut String, reports: &[serde_json::Value], show_part let input = report.get("input").unwrap_or(&serde_json::Value::Null); let debug = input.get("debug").unwrap_or(&serde_json::Value::Null); let source = debug.get("source").unwrap_or(&serde_json::Value::Null); - let sex = debug.get("inferred_sex").unwrap_or(&serde_json::Value::Null); + let sex = debug + .get("inferred_sex") + .unwrap_or(&serde_json::Value::Null); let _ = write!( out, "", @@ -225,11 +242,13 @@ fn render_input_debug(out: &mut String, reports: &[serde_json::Value], show_part out.push_str(""); } -fn render_input_debug_key_values(out: &mut String, report: &serde_json::Value) { +pub(super) fn render_input_debug_key_values(out: &mut String, report: &serde_json::Value) { let input = report.get("input").unwrap_or(&serde_json::Value::Null); let debug = input.get("debug").unwrap_or(&serde_json::Value::Null); let source = debug.get("source").unwrap_or(&serde_json::Value::Null); - let sex = debug.get("inferred_sex").unwrap_or(&serde_json::Value::Null); + let sex = debug + .get("inferred_sex") + .unwrap_or(&serde_json::Value::Null); out.push_str("
"); input_debug_kv(out, "File", value_str(input, "file_name")); input_debug_kv( @@ -263,7 +282,7 @@ fn render_input_debug_key_values(out: &mut String, report: &serde_json::Value) { out.push_str("
"); } -fn input_debug_kv(out: &mut String, key: &str, value: &str) { +pub(super) fn input_debug_kv(out: &mut String, key: &str, value: &str) { let _ = write!( out, "
{}
{}
", @@ -272,7 +291,7 @@ fn input_debug_kv(out: &mut String, key: &str, value: &str) { ); } -fn compact_join(values: &[&str]) -> String { +pub(super) fn compact_join(values: &[&str]) -> String { values .iter() .filter(|value| !value.is_empty()) @@ -281,7 +300,7 @@ fn compact_join(values: &[&str]) -> String { .join(" / ") } -fn input_debug_vcf_imputation(debug: &serde_json::Value) -> &'static str { +pub(super) fn input_debug_vcf_imputation(debug: &serde_json::Value) -> &'static str { if debug .get("vcf_missing_reference_imputation") .and_then(serde_json::Value::as_bool) @@ -293,7 +312,7 @@ fn input_debug_vcf_imputation(debug: &serde_json::Value) -> &'static str { } } -fn input_debug_evidence(debug: &serde_json::Value) -> String { +pub(super) fn input_debug_evidence(debug: &serde_json::Value) -> String { let mut evidence = Vec::new(); collect_string_array(debug.get("evidence"), &mut evidence); collect_string_array(debug.get("warnings"), &mut evidence); @@ -306,7 +325,7 @@ fn input_debug_evidence(debug: &serde_json::Value) -> String { evidence.join(" | ") } -fn collect_string_array(value: Option<&serde_json::Value>, out: &mut Vec) { +pub(super) fn collect_string_array(value: Option<&serde_json::Value>, out: &mut Vec) { if let Some(items) = value.and_then(serde_json::Value::as_array) { for item in items.iter().filter_map(serde_json::Value::as_str) { if !item.is_empty() && !out.iter().any(|existing| existing == item) { diff --git a/rust/bioscript-reporting/src/lib.rs b/rust/bioscript-reporting/src/lib.rs new file mode 100644 index 0000000..45d2f47 --- /dev/null +++ b/rust/bioscript-reporting/src/lib.rs @@ -0,0 +1,40 @@ +#![allow(clippy::missing_errors_doc, clippy::must_use_candidate)] + +mod analysis; +mod artifacts; +mod html; +mod manifest; +mod matching; +mod observation; +mod report_json; +mod rows; + +pub use analysis::{ + AnalysisOutputFormat, AnalysisOutputJsonInput, analysis_observations_relative_file, + analysis_output_format, analysis_output_json, analysis_output_relative_file, + parse_analysis_output_text, participant_id_from_name, participant_id_from_path, + render_analysis_observations_tsv, validate_bioscript_interpretation, +}; +pub use artifacts::{ + ReportArtifactTexts, json_field_as_tsv, render_input_report_artifact_texts, render_jsonl, + render_observations_tsv, render_report_artifact_texts, standard_text_output, +}; +pub use html::render_app_html_document; +pub use manifest::{ + AnalysisManifestTask, ExecutableAssayMember, ExecutablePanelMember, + FilesystemManifestWorkspace, ManifestWorkspace, ReportManifestContext, ReportManifestKind, + VariantManifestTask, assay_executable_member, assay_executable_member_path, + collect_analysis_manifest_tasks, collect_manifest_provenance_entries, + collect_variant_manifest_tasks, load_manifest_findings, load_manifest_provenance_links, + load_report_manifest_context, matches_analysis_path_filters, matches_variant_manifest_filters, + panel_executable_member, panel_executable_member_path, report_assay_id, report_manifest_kind, + report_manifest_metadata, report_manifest_schema, resolve_filesystem_manifest_path, +}; +pub use matching::match_app_findings; +pub use observation::{AppObservationInput, app_observation_from_manifest_row}; +pub use report_json::{ + AppInputReportInput, AppReportJsonInput, app_input_report_json, app_report_json, +}; +pub use rows::{ + MANIFEST_ROW_TSV_HEADERS, render_manifest_rows_tsv, render_manifest_trace_tsv, variant_row, +}; diff --git a/rust/bioscript-reporting/src/manifest.rs b/rust/bioscript-reporting/src/manifest.rs new file mode 100644 index 0000000..87baadc --- /dev/null +++ b/rust/bioscript-reporting/src/manifest.rs @@ -0,0 +1,941 @@ +use std::{ + fs, + path::{Path, PathBuf}, +}; + +use bioscript_schema::{ + PanelInterpretation, VariantManifest, load_assay_manifest_text, load_panel_manifest_text, + load_variant_manifest_text, +}; + +mod provenance; + +pub use provenance::{collect_manifest_provenance_entries, load_manifest_provenance_links}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ReportManifestKind { + Variant, + Panel, + Assay, +} + +pub fn report_manifest_kind(schema: &str) -> Result { + match schema { + "bioscript:variant:1.0" | "bioscript:variant" => Ok(ReportManifestKind::Variant), + "bioscript:panel:1.0" => Ok(ReportManifestKind::Panel), + "bioscript:assay:1.0" => Ok(ReportManifestKind::Assay), + other => Err(format!("unsupported manifest schema '{other}'")), + } +} + +pub fn report_manifest_schema( + workspace: &impl ManifestWorkspace, + path: &str, +) -> Result { + let value = workspace.load_yaml(path)?; + yaml_string(&value, "schema").ok_or_else(|| format!("{path} is missing schema")) +} + +pub trait ManifestWorkspace { + fn load_text(&self, path: &str) -> Result; + fn load_yaml(&self, path: &str) -> Result; + fn resolve(&self, base: &str, relative: &str) -> Result; +} + +pub struct FilesystemManifestWorkspace { + root: PathBuf, +} + +impl FilesystemManifestWorkspace { + #[must_use] + pub fn new(root: impl Into) -> Self { + Self { root: root.into() } + } +} + +impl ManifestWorkspace for FilesystemManifestWorkspace { + fn load_text(&self, path: &str) -> Result { + let path = Path::new(path); + fs::read_to_string(path) + .map_err(|err| format!("failed to read YAML {}: {err}", path.display())) + } + + fn load_yaml(&self, path: &str) -> Result { + let text = self.load_text(path)?; + serde_yaml::from_str(&text).map_err(|err| format!("failed to parse YAML {path}: {err}")) + } + + fn resolve(&self, base: &str, relative: &str) -> Result { + resolve_filesystem_manifest_path(&self.root, Path::new(base), relative) + .map(|path| path.display().to_string()) + } +} + +pub fn resolve_filesystem_manifest_path( + root: &Path, + manifest_path: &Path, + relative: &str, +) -> Result { + let base_dir = manifest_path + .parent() + .ok_or_else(|| format!("manifest has no parent: {}", manifest_path.display()))?; + let joined = base_dir.join(relative); + let canonical_root = root + .canonicalize() + .map_err(|err| format!("failed to resolve root {}: {err}", root.display()))?; + let canonical_base = base_dir.canonicalize().map_err(|err| { + format!( + "failed to resolve manifest dir {}: {err}", + base_dir.display() + ) + })?; + let canonical_joined = joined + .canonicalize() + .map_err(|err| format!("failed to resolve {}: {err}", joined.display()))?; + let boundary = if canonical_base.starts_with(&canonical_root) { + &canonical_root + } else { + &canonical_base + }; + if !canonical_joined.starts_with(boundary) { + return Err(format!( + "manifest member path escapes bioscript root: {}", + canonical_joined.display() + )); + } + Ok(canonical_joined) +} + +pub fn report_manifest_metadata( + workspace: &impl ManifestWorkspace, + path: &str, +) -> Result { + let value = workspace.load_yaml(path)?; + let members = value + .get("members") + .and_then(serde_yaml::Value::as_sequence) + .map(|items| { + items + .iter() + .filter_map(serde_yaml::Value::as_mapping) + .map(|mapping| { + serde_json::json!({ + "kind": yaml_mapping_string(mapping, "kind"), + "path": yaml_mapping_string(mapping, "path"), + "version": yaml_mapping_string(mapping, "version"), + }) + }) + .collect::>() + }) + .unwrap_or_default(); + Ok(serde_json::json!({ + "schema": yaml_string(&value, "schema"), + "version": yaml_string(&value, "version"), + "name": yaml_string(&value, "name"), + "label": yaml_string(&value, "label").or_else(|| yaml_string(&value, "name")), + "tags": yaml_string_sequence(&value, "tags"), + "members": members, + })) +} + +pub fn report_assay_id(workspace: &impl ManifestWorkspace, path: &str) -> Result { + let value = workspace.load_yaml(path)?; + let schema = + yaml_string(&value, "schema").ok_or_else(|| format!("{path} is missing schema"))?; + report_manifest_kind(&schema)?; + yaml_string(&value, "name").ok_or_else(|| format!("manifest is missing required name: {path}")) +} + +#[derive(Clone, Debug)] +pub struct ReportManifestContext { + pub assay_id: String, + pub manifest_metadata: serde_json::Value, + pub findings: Vec, + pub provenance: Vec, +} + +pub fn load_report_manifest_context( + workspace: &impl ManifestWorkspace, + path: &str, +) -> Result { + Ok(ReportManifestContext { + assay_id: report_assay_id(workspace, path)?, + manifest_metadata: report_manifest_metadata(workspace, path)?, + findings: load_manifest_findings(workspace, path)?, + provenance: load_manifest_provenance_links(workspace, path)?, + }) +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct AnalysisManifestTask { + pub manifest_path: String, + pub manifest_name: String, + pub interpretations: Vec, +} + +pub fn collect_analysis_manifest_tasks( + workspace: &impl ManifestWorkspace, + path: &str, + filters: &[String], +) -> Result, String> { + match report_manifest_kind(&report_manifest_schema(workspace, path)?)? { + ReportManifestKind::Panel => { + let text = workspace.load_text(path)?; + let manifest = load_panel_manifest_text(path, &text)?; + let mut tasks = Vec::new(); + if filters.is_empty() && !manifest.interpretations.is_empty() { + tasks.push(AnalysisManifestTask { + manifest_path: path.to_owned(), + manifest_name: manifest.name.clone(), + interpretations: manifest.interpretations.clone(), + }); + } + for member in &manifest.members { + if member.kind != "assay" { + continue; + } + let Some(member_path) = &member.path else { + continue; + }; + let resolved = workspace.resolve(path, member_path)?; + if !matches_analysis_path_filters(&resolved, filters) { + continue; + } + tasks.extend(collect_analysis_manifest_tasks( + workspace, &resolved, filters, + )?); + } + Ok(tasks) + } + ReportManifestKind::Assay => { + let text = workspace.load_text(path)?; + let manifest = load_assay_manifest_text(path, &text)?; + if manifest.interpretations.is_empty() { + return Ok(Vec::new()); + } + Ok(vec![AnalysisManifestTask { + manifest_path: path.to_owned(), + manifest_name: manifest.name, + interpretations: manifest.interpretations, + }]) + } + ReportManifestKind::Variant => Ok(Vec::new()), + } +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct VariantManifestTask { + pub manifest_path: String, + pub manifest: VariantManifest, +} + +pub fn collect_variant_manifest_tasks( + workspace: &impl ManifestWorkspace, + path: &str, + filters: &[String], +) -> Result, String> { + match report_manifest_kind(&report_manifest_schema(workspace, path)?)? { + ReportManifestKind::Variant => { + let text = workspace.load_text(path)?; + let manifest = load_variant_manifest_text(path, &text)?; + Ok(vec![VariantManifestTask { + manifest_path: path.to_owned(), + manifest, + }]) + } + ReportManifestKind::Panel => { + let text = workspace.load_text(path)?; + let manifest = load_panel_manifest_text(path, &text)?; + let mut tasks = Vec::new(); + for member in &manifest.members { + match panel_executable_member(&member.kind, member.path.as_deref())? { + ExecutablePanelMember::Variant(member_path) => { + let resolved = workspace.resolve(path, member_path)?; + let variant = load_variant_task(workspace, &resolved)?; + if matches_variant_manifest_filters(&variant.manifest, &resolved, filters) { + tasks.push(variant); + } + } + ExecutablePanelMember::Assay(member_path) => { + let resolved = workspace.resolve(path, member_path)?; + tasks.extend(collect_variant_manifest_tasks( + workspace, &resolved, filters, + )?); + } + } + } + Ok(tasks) + } + ReportManifestKind::Assay => { + let text = workspace.load_text(path)?; + let manifest = load_assay_manifest_text(path, &text)?; + let mut tasks = Vec::new(); + for member in &manifest.members { + match assay_executable_member(&member.kind, member.path.as_deref())? { + ExecutableAssayMember::Variant(member_path) => { + let resolved = workspace.resolve(path, member_path)?; + let variant = load_variant_task(workspace, &resolved)?; + if matches_variant_manifest_filters(&variant.manifest, &resolved, filters) { + tasks.push(variant); + } + } + } + } + Ok(tasks) + } + } +} + +fn load_variant_task( + workspace: &impl ManifestWorkspace, + path: &str, +) -> Result { + let text = workspace.load_text(path)?; + let manifest = load_variant_manifest_text(path, &text)?; + Ok(VariantManifestTask { + manifest_path: path.to_owned(), + manifest, + }) +} + +pub fn load_manifest_findings( + workspace: &impl ManifestWorkspace, + path: &str, +) -> Result, String> { + let value = workspace.load_yaml(path)?; + let schema = yaml_string(&value, "schema").unwrap_or_default(); + let mut findings = Vec::new(); + + if manifest_supports_findings(&schema) + && let Some(items) = value + .get("findings") + .and_then(serde_yaml::Value::as_sequence) + { + for item in items { + let json_item = yaml_to_json(item.clone())?; + let include = json_item + .get("include") + .and_then(serde_json::Value::as_str) + .map(str::to_owned); + if let Some(include) = include { + let include_path = workspace.resolve(path, &include)?; + let mut included = load_manifest_findings(workspace, &include_path)?; + let inherited_binding = json_item.get("binding").cloned(); + for included_item in &mut included { + if inherited_binding.is_some() + && included_item.get("binding").is_none() + && included_item.get("effects").is_none() + && let Some(object) = included_item.as_object_mut() + { + object.insert( + "binding".to_owned(), + inherited_binding.clone().unwrap_or(serde_json::Value::Null), + ); + } + } + findings.extend(included); + continue; + } + if json_item.get("include").is_none() { + findings.push(json_item); + } + } + } + + for member_path in traversable_manifest_member_paths(&schema, &value) { + let resolved = workspace.resolve(path, member_path)?; + findings.extend(load_manifest_findings(workspace, &resolved)?); + } + + Ok(findings) +} + +pub fn matches_variant_manifest_filters( + manifest: &VariantManifest, + path: &str, + filters: &[String], +) -> bool { + filters.iter().all(|filter| match filter.split_once('=') { + Some(("kind", value)) => value == "variant", + Some(("name", value)) => manifest.name.contains(value), + Some(("path", value)) => path.contains(value), + Some(("tag", value)) => manifest.tags.iter().any(|tag| tag == value), + Some(_) | None => false, + }) +} + +pub fn matches_analysis_path_filters(path: &str, filters: &[String]) -> bool { + filters.iter().all(|filter| match filter.split_once('=') { + Some(("path", value)) => path.contains(value), + _ => false, + }) +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ExecutablePanelMember<'a> { + Variant(&'a str), + Assay(&'a str), +} + +pub fn panel_executable_member<'a>( + kind: &str, + path: Option<&'a str>, +) -> Result, String> { + let path = panel_executable_member_path(kind, path)?; + match kind { + "variant" => Ok(ExecutablePanelMember::Variant(path)), + "assay" => Ok(ExecutablePanelMember::Assay(path)), + _ => unreachable!("panel_executable_member_path validates member kind"), + } +} + +pub fn panel_executable_member_path<'a>( + kind: &str, + path: Option<&'a str>, +) -> Result<&'a str, String> { + let Some(path) = path else { + return Err("remote panel members are not executable yet".to_owned()); + }; + if !matches!(kind, "variant" | "assay") { + return Err(format!("panel member kind '{kind}' is not executable")); + } + Ok(path) +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ExecutableAssayMember<'a> { + Variant(&'a str), +} +pub fn assay_executable_member<'a>( + kind: &str, + path: Option<&'a str>, +) -> Result, String> { + let path = assay_executable_member_path(kind, path)?; + Ok(ExecutableAssayMember::Variant(path)) +} +pub fn assay_executable_member_path<'a>( + kind: &str, + path: Option<&'a str>, +) -> Result<&'a str, String> { + if kind != "variant" { + return Err(format!("assay member kind '{kind}' is not executable")); + } + let Some(path) = path else { + return Err("remote assay members are not executable yet".to_owned()); + }; + Ok(path) +} + +pub(super) fn manifest_supports_findings(schema: &str) -> bool { + matches!( + schema, + "bioscript:variant:1.0" + | "bioscript:variant" + | "bioscript:assay:1.0" + | "bioscript:panel:1.0" + | "bioscript:pgx-findings:1.0" + ) +} + +fn manifest_has_traversable_members(schema: &str) -> bool { + matches!(schema, "bioscript:assay:1.0" | "bioscript:panel:1.0") +} + +pub(super) fn traversable_manifest_member_paths<'a>( + schema: &str, + value: &'a serde_yaml::Value, +) -> Vec<&'a str> { + if !manifest_has_traversable_members(schema) { + return Vec::new(); + } + value + .get("members") + .and_then(serde_yaml::Value::as_sequence) + .into_iter() + .flatten() + .filter_map(traversable_manifest_member_path) + .collect() +} + +fn traversable_manifest_member_path(member: &serde_yaml::Value) -> Option<&str> { + let kind = member.get("kind").and_then(serde_yaml::Value::as_str)?; + if !matches!(kind, "variant" | "assay") { + return None; + } + member.get("path").and_then(serde_yaml::Value::as_str) +} + +pub(super) fn yaml_to_json(value: serde_yaml::Value) -> Result { + serde_json::to_value(value).map_err(|err| format!("failed to convert YAML to JSON: {err}")) +} + +pub(super) fn yaml_string(value: &serde_yaml::Value, key: &str) -> Option { + value + .get(key) + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) +} + +fn yaml_string_sequence(value: &serde_yaml::Value, key: &str) -> Vec { + value + .get(key) + .and_then(serde_yaml::Value::as_sequence) + .map(|items| { + items + .iter() + .filter_map(serde_yaml::Value::as_str) + .map(serde_json::Value::from) + .collect() + }) + .unwrap_or_default() +} + +fn yaml_mapping_string(mapping: &serde_yaml::Mapping, key: &str) -> Option { + mapping + .get(serde_yaml::Value::String(key.to_owned())) + .and_then(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) +} + +#[cfg(test)] +mod tests { + use std::{ + collections::BTreeMap, + fs, + path::PathBuf, + time::{SystemTime, UNIX_EPOCH}, + }; + + use bioscript_core::VariantSpec; + use bioscript_schema::VariantManifest; + + use super::{ + ExecutableAssayMember, ExecutablePanelMember, ManifestWorkspace, ReportManifestKind, + assay_executable_member, assay_executable_member_path, collect_analysis_manifest_tasks, + collect_variant_manifest_tasks, matches_analysis_path_filters, + matches_variant_manifest_filters, panel_executable_member, panel_executable_member_path, + report_assay_id, report_manifest_kind, report_manifest_schema, + resolve_filesystem_manifest_path, traversable_manifest_member_paths, + }; + + struct InlineWorkspace { + yaml: &'static str, + } + + impl ManifestWorkspace for InlineWorkspace { + fn load_text(&self, _path: &str) -> Result { + Ok(self.yaml.to_owned()) + } + + fn load_yaml(&self, _path: &str) -> Result { + serde_yaml::from_str(self.yaml).map_err(|err| err.to_string()) + } + + fn resolve(&self, _base: &str, relative: &str) -> Result { + Ok(relative.to_owned()) + } + } + + struct MapWorkspace { + files: BTreeMap, + } + + impl ManifestWorkspace for MapWorkspace { + fn load_text(&self, path: &str) -> Result { + self.files + .get(path) + .cloned() + .ok_or_else(|| format!("missing file: {path}")) + } + + fn load_yaml(&self, path: &str) -> Result { + serde_yaml::from_str(&self.load_text(path)?).map_err(|err| err.to_string()) + } + + fn resolve(&self, base: &str, relative: &str) -> Result { + let base = std::path::Path::new(base) + .parent() + .unwrap_or_else(|| std::path::Path::new("")); + Ok(base.join(relative).display().to_string()) + } + } + + const NESTED_PANEL_YAML: &str = r#" +schema: bioscript:panel:1.0 +version: "1.0" +name: panel +members: + - kind: variant + path: rs1.yaml + - kind: assay + path: assets/APOE/assay.yaml + - kind: variant + path: rs3.yaml +"#; + + const NESTED_ASSAY_YAML: &str = r#" +schema: bioscript:assay:1.0 +version: "1.0" +name: apoe +members: + - kind: variant + path: rs2.yaml +"#; + + fn variant_yaml(name: &str, pos: u32, tag: &str) -> String { + format!( + r#" +schema: bioscript:variant:1.0 +version: "1.0" +name: {name} +tags: [{tag}] +identifiers: + rsids: + - {name} +coordinates: + grch38: + chrom: "1" + pos: {pos} +alleles: + kind: snv + ref: A + alts: + - G +"# + ) + } + + fn nested_variant_workspace() -> MapWorkspace { + MapWorkspace { + files: BTreeMap::from([ + ("panel.yaml".to_owned(), NESTED_PANEL_YAML.to_owned()), + ("rs1.yaml".to_owned(), variant_yaml("rs1", 1, "keep")), + ( + "assets/APOE/assay.yaml".to_owned(), + NESTED_ASSAY_YAML.to_owned(), + ), + ( + "assets/APOE/rs2.yaml".to_owned(), + variant_yaml("rs2", 2, "keep"), + ), + ("rs3.yaml".to_owned(), variant_yaml("rs3", 3, "skip")), + ]), + } + } + + #[test] + fn report_assay_id_uses_manifest_name_for_supported_schemas() { + let workspace = InlineWorkspace { + yaml: "schema: bioscript:panel:1.0\nname: pgx-1\n", + }; + assert_eq!( + report_assay_id(&workspace, "manifest.yaml").unwrap(), + "pgx-1" + ); + } + + #[test] + fn report_manifest_schema_reads_workspace_yaml() { + let workspace = InlineWorkspace { + yaml: "schema: bioscript:assay:1.0\nname: apoe\n", + }; + assert_eq!( + report_manifest_schema(&workspace, "assay.yaml").unwrap(), + "bioscript:assay:1.0" + ); + + let workspace = InlineWorkspace { + yaml: "name: apoe\n", + }; + assert_eq!( + report_manifest_schema(&workspace, "assay.yaml").unwrap_err(), + "assay.yaml is missing schema" + ); + } + + #[test] + fn report_assay_id_rejects_unsupported_schema() { + let workspace = InlineWorkspace { + yaml: "schema: other\nname: nope\n", + }; + let err = report_assay_id(&workspace, "manifest.yaml").unwrap_err(); + assert!(err.contains("unsupported manifest schema")); + } + + #[test] + fn report_manifest_kind_centralizes_schema_dispatch() { + assert_eq!( + report_manifest_kind("bioscript:variant").unwrap(), + ReportManifestKind::Variant + ); + assert_eq!( + report_manifest_kind("bioscript:variant:1.0").unwrap(), + ReportManifestKind::Variant + ); + assert_eq!( + report_manifest_kind("bioscript:panel:1.0").unwrap(), + ReportManifestKind::Panel + ); + assert_eq!( + report_manifest_kind("bioscript:assay:1.0").unwrap(), + ReportManifestKind::Assay + ); + assert_eq!( + report_manifest_kind("other").unwrap_err(), + "unsupported manifest schema 'other'" + ); + } + + #[test] + fn variant_manifest_filters_match_kind_name_path_and_tag() { + let manifest = VariantManifest { + path: PathBuf::from("rs1.yaml"), + name: "APOE_rs429358".to_owned(), + tags: vec!["gene:APOE".to_owned(), "pgx".to_owned()], + spec: VariantSpec::default(), + }; + let filters = vec![ + "kind=variant".to_owned(), + "name=APOE".to_owned(), + "path=assets/APOE".to_owned(), + "tag=pgx".to_owned(), + ]; + assert!(matches_variant_manifest_filters( + &manifest, + "assets/APOE/rs429358.yaml", + &filters + )); + assert!(!matches_variant_manifest_filters( + &manifest, + "assets/MTHFR/rs429358.yaml", + &filters + )); + } + + #[test] + fn analysis_path_filters_only_accept_path_filters() { + assert!(matches_analysis_path_filters( + "assets/APOE/assay.yaml", + &["path=APOE".to_owned()] + )); + assert!(!matches_analysis_path_filters( + "assets/MTHFR/assay.yaml", + &["path=APOE".to_owned()] + )); + assert!(!matches_analysis_path_filters( + "assets/APOE/assay.yaml", + &["tag=pgx".to_owned()] + )); + assert!(matches_analysis_path_filters("assets/APOE/assay.yaml", &[])); + } + + #[test] + fn collect_analysis_manifest_tasks_matches_cli_panel_filter_semantics() { + let workspace = MapWorkspace { + files: BTreeMap::from([ + ( + "panel.yaml".to_owned(), + r#" +schema: bioscript:panel:1.0 +version: "1.0" +name: panel +analyses: + - id: panel_analysis + kind: bioscript + path: panel.py + derived_from: + - rs1.yaml +members: + - kind: assay + path: assets/APOE/assay.yaml +"# + .to_owned(), + ), + ( + "assets/APOE/assay.yaml".to_owned(), + r#" +schema: bioscript:assay:1.0 +version: "1.0" +name: apoe +members: + - kind: variant + path: rs2.yaml +analyses: + - id: apoe_analysis + kind: bioscript + path: apoe.py + derived_from: + - rs2.yaml +"# + .to_owned(), + ), + ]), + }; + + let tasks = collect_analysis_manifest_tasks(&workspace, "panel.yaml", &[]).unwrap(); + assert_eq!(tasks.len(), 2); + assert_eq!(tasks[0].manifest_name, "panel"); + assert_eq!(tasks[0].interpretations[0].id, "panel_analysis"); + assert_eq!(tasks[1].manifest_path, "assets/APOE/assay.yaml"); + assert_eq!(tasks[1].interpretations[0].id, "apoe_analysis"); + + let filtered = collect_analysis_manifest_tasks( + &workspace, + "panel.yaml", + &["path=assets/APOE/assay.yaml".to_owned()], + ) + .unwrap(); + assert_eq!(filtered.len(), 1); + assert_eq!(filtered[0].manifest_name, "apoe"); + } + + #[test] + fn collect_variant_manifest_tasks_preserves_nested_member_order_and_filters() { + let workspace = nested_variant_workspace(); + + let tasks = collect_variant_manifest_tasks(&workspace, "panel.yaml", &[]).unwrap(); + assert_eq!( + tasks + .iter() + .map(|task| task.manifest.name.as_str()) + .collect::>(), + vec!["rs1", "rs2", "rs3"] + ); + + let filtered = + collect_variant_manifest_tasks(&workspace, "panel.yaml", &["tag=keep".to_owned()]) + .unwrap(); + assert_eq!( + filtered + .iter() + .map(|task| task.manifest_path.as_str()) + .collect::>(), + vec!["rs1.yaml", "assets/APOE/rs2.yaml"] + ); + } + + #[test] + fn executable_member_validation_matches_cli_error_contract() { + assert_eq!( + panel_executable_member("variant", Some("rs.yaml")).unwrap(), + ExecutablePanelMember::Variant("rs.yaml") + ); + assert_eq!( + panel_executable_member("assay", Some("assay.yaml")).unwrap(), + ExecutablePanelMember::Assay("assay.yaml") + ); + assert_eq!( + panel_executable_member_path("variant", Some("rs.yaml")).unwrap(), + "rs.yaml" + ); + assert_eq!( + panel_executable_member_path("assay", Some("assay.yaml")).unwrap(), + "assay.yaml" + ); + assert_eq!( + panel_executable_member_path("download", Some("remote.yaml")).unwrap_err(), + "panel member kind 'download' is not executable" + ); + assert_eq!( + panel_executable_member_path("variant", None).unwrap_err(), + "remote panel members are not executable yet" + ); + + assert_eq!( + assay_executable_member("variant", Some("rs.yaml")).unwrap(), + ExecutableAssayMember::Variant("rs.yaml") + ); + assert_eq!( + assay_executable_member_path("variant", Some("rs.yaml")).unwrap(), + "rs.yaml" + ); + assert_eq!( + assay_executable_member_path("assay", Some("nested.yaml")).unwrap_err(), + "assay member kind 'assay' is not executable" + ); + assert_eq!( + assay_executable_member_path("variant", None).unwrap_err(), + "remote assay members are not executable yet" + ); + } + + #[test] + fn traversable_manifest_members_keep_metadata_traversal_semantics() { + let value: serde_yaml::Value = serde_yaml::from_str( + r" +schema: bioscript:panel:1.0 +members: + - kind: variant + path: assets/A/rs1.yaml + - kind: assay + path: assets/B/assay.yaml + - kind: download + path: remote.yaml + - kind: variant + - path: missing-kind.yaml +", + ) + .unwrap(); + + assert_eq!( + traversable_manifest_member_paths("bioscript:panel:1.0", &value), + vec!["assets/A/rs1.yaml", "assets/B/assay.yaml"] + ); + assert!(traversable_manifest_member_paths("bioscript:variant:1.0", &value).is_empty()); + } + + #[test] + fn filesystem_manifest_path_resolution_enforces_root_boundary() { + let unique = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + let root = std::env::temp_dir().join(format!( + "bioscript-reporting-resolve-{}-{unique}", + std::process::id() + )); + let assay_dir = root.join("assay"); + let outside = root + .parent() + .unwrap_or(root.as_path()) + .join(format!("bioscript-reporting-outside-{unique}")); + fs::create_dir_all(&assay_dir).unwrap(); + fs::create_dir_all(&outside).unwrap(); + fs::write( + assay_dir.join("manifest.yaml"), + "schema: bioscript:assay:1.0\n", + ) + .unwrap(); + fs::write( + assay_dir.join("variant.yaml"), + "schema: bioscript:variant:1.0\n", + ) + .unwrap(); + fs::write( + outside.join("variant.yaml"), + "schema: bioscript:variant:1.0\n", + ) + .unwrap(); + + let manifest = assay_dir.join("manifest.yaml"); + assert_eq!( + resolve_filesystem_manifest_path(&root, &manifest, "variant.yaml").unwrap(), + assay_dir.join("variant.yaml").canonicalize().unwrap() + ); + assert!( + resolve_filesystem_manifest_path( + &root, + &manifest, + &format!( + "../../{}/variant.yaml", + outside.file_name().unwrap().to_string_lossy() + ) + ) + .unwrap_err() + .contains("manifest member path escapes bioscript root") + ); + + let _ = fs::remove_dir_all(&root); + let _ = fs::remove_dir_all(&outside); + } +} diff --git a/rust/bioscript-reporting/src/matching.rs b/rust/bioscript-reporting/src/matching.rs new file mode 100644 index 0000000..bb66823 --- /dev/null +++ b/rust/bioscript-reporting/src/matching.rs @@ -0,0 +1,529 @@ +use std::path::Path; + +pub fn match_app_findings( + findings: &[serde_json::Value], + observations: &[serde_json::Value], + analyses: &[serde_json::Value], +) -> Vec { + let mut matched = Vec::new(); + let mut seen = std::collections::BTreeSet::new(); + for finding in findings { + if let Some(effects) = finding.get("effects").and_then(serde_json::Value::as_array) { + for effect in effects { + if let Some(observation) = app_finding_match_observation(effect, observations) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.remove("effects"); + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert("matched_effect".to_owned(), effect.clone()); + object.insert( + "matched_observation".to_owned(), + app_finding_observation_context(observation), + ); + } + if seen.insert(app_finding_dedupe_key(&item)) { + matched.push(item); + } + } else if let Some(analysis) = app_finding_match_analysis(effect, analyses) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.remove("effects"); + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert("matched_effect".to_owned(), effect.clone()); + object.insert("matched_analysis".to_owned(), analysis); + } + if seen.insert(app_finding_dedupe_key(&item)) { + matched.push(item); + } + } + } + } else if let Some(observation) = app_finding_match_observation(finding, observations) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert( + "matched_observation".to_owned(), + app_finding_observation_context(observation), + ); + } + if seen.insert(app_finding_dedupe_key(&item)) { + matched.push(item); + } + } else if let Some(analysis) = app_finding_match_analysis(finding, analyses) { + let mut item = finding.clone(); + if let Some(object) = item.as_object_mut() { + object.insert("matched".to_owned(), serde_json::Value::Bool(true)); + object.insert("matched_analysis".to_owned(), analysis); + } + if seen.insert(app_finding_dedupe_key(&item)) { + matched.push(item); + } + } + } + matched +} + +fn app_finding_match_observation<'a>( + finding: &serde_json::Value, + observations: &'a [serde_json::Value], +) -> Option<&'a serde_json::Value> { + let binding = finding.get("binding")?; + match binding.get("source").and_then(serde_json::Value::as_str) { + Some("variant") => app_variant_binding_match_observation(binding, observations), + _ => None, + } +} + +fn app_finding_match_analysis( + finding: &serde_json::Value, + analyses: &[serde_json::Value], +) -> Option { + let binding = finding.get("binding")?; + if binding.get("source").and_then(serde_json::Value::as_str) != Some("analysis") { + return None; + } + let analysis_id = binding + .get("analysis_id") + .or_else(|| binding.get("analysis")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let key = binding.get("key").and_then(serde_json::Value::as_str)?; + for analysis in analyses { + if !analysis_id.is_empty() + && analysis + .get("analysis_id") + .and_then(serde_json::Value::as_str) + != Some(analysis_id) + { + continue; + } + let Some(rows) = analysis.get("rows").and_then(serde_json::Value::as_array) else { + continue; + }; + for row in rows { + if app_binding_matches_value(row.get(key), binding) { + return Some(serde_json::json!({ + "participant_id": analysis.get("participant_id").cloned().unwrap_or(serde_json::Value::Null), + "assay_id": analysis.get("assay_id").cloned().unwrap_or(serde_json::Value::Null), + "analysis_id": analysis.get("analysis_id").cloned().unwrap_or(serde_json::Value::Null), + "key": key, + "value": row.get(key).cloned().unwrap_or(serde_json::Value::Null), + "row": row, + })); + } + } + } + None +} + +fn app_variant_binding_match_observation<'a>( + binding: &serde_json::Value, + observations: &'a [serde_json::Value], +) -> Option<&'a serde_json::Value> { + let operator = binding + .get("operator") + .and_then(serde_json::Value::as_str) + .unwrap_or("equals"); + if matches!(operator, "dosage_equals" | "dosage_in") { + let allele = binding + .get("allele") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + return observations + .iter() + .filter(|observation| !app_variant_ref_mismatch(binding, observation)) + .filter(|observation| app_binding_chromosome_count_matches(binding, observation)) + .find(|observation| { + let dosage = app_observation_allele_dosage(observation, allele); + app_binding_matches_dosage(dosage, binding) + }); + } + + let key = binding + .get("key") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if key.is_empty() { + return None; + } + if key == "alt" { + let expected_alleles = app_binding_expected_values(binding); + return observations + .iter() + .filter(|observation| !app_variant_ref_mismatch(binding, observation)) + .filter(|observation| app_binding_chromosome_count_matches(binding, observation)) + .find(|observation| { + app_binding_matches_value(observation.get(key), binding) + && expected_alleles.iter().any(|expected| { + app_observation_allele_dosage(observation, expected) + .is_some_and(|dosage| dosage > 0) + }) + }); + } + observations + .iter() + .filter(|observation| !app_variant_ref_mismatch(binding, observation)) + .filter(|observation| app_binding_chromosome_count_matches(binding, observation)) + .find(|observation| app_binding_matches_value(observation.get(key), binding)) +} + +fn app_finding_observation_context(observation: &serde_json::Value) -> serde_json::Value { + serde_json::json!({ + "participant_id": observation.get("participant_id").cloned().unwrap_or(serde_json::Value::Null), + "rsid": observation.get("rsid").cloned().unwrap_or(serde_json::Value::Null), + "gene": observation.get("gene").cloned().unwrap_or(serde_json::Value::Null), + "ref": observation.get("ref").cloned().unwrap_or(serde_json::Value::Null), + "alt": observation.get("alt").cloned().unwrap_or(serde_json::Value::Null), + "genotype_display": observation.get("genotype_display").cloned().unwrap_or(serde_json::Value::Null), + "outcome": observation.get("outcome").cloned().unwrap_or(serde_json::Value::Null), + }) +} + +fn app_variant_ref_mismatch(binding: &serde_json::Value, observation: &serde_json::Value) -> bool { + let variant_ref = binding + .get("variant") + .or_else(|| binding.get("path")) + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if variant_ref.is_empty() { + return false; + } + let basename = Path::new(variant_ref) + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or(variant_ref); + let candidates = [ + observation + .get("variant_key") + .and_then(serde_json::Value::as_str), + observation + .get("variant_path") + .and_then(serde_json::Value::as_str), + observation.get("rsid").and_then(serde_json::Value::as_str), + ]; + !candidates.into_iter().flatten().any(|candidate| { + candidate == variant_ref + || Path::new(candidate) + .file_name() + .and_then(|value| value.to_str()) + .is_some_and(|value| value == basename) + }) +} + +fn app_observation_allele_dosage(observation: &serde_json::Value, allele: &str) -> Option { + if allele.is_empty() { + return None; + } + let ref_allele = observation + .get("ref") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let alt_allele = observation + .get("alt") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + let zygosity = observation + .get("zygosity") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if allele == ref_allele { + return match zygosity { + "hom_ref" => Some(2), + "hem_ref" | "het" => Some(1), + "hom_alt" | "hem_alt" => Some(0), + _ => None, + }; + } + if allele == alt_allele { + return match zygosity { + "hom_ref" | "hem_ref" => Some(0), + "het" | "hem_alt" => Some(1), + "hom_alt" => Some(2), + _ => None, + }; + } + let display = observation + .get("genotype_display") + .and_then(serde_json::Value::as_str) + .unwrap_or_default(); + if allele.len() == 1 { + let allele_ch = allele.chars().next()?.to_ascii_uppercase(); + return display + .chars() + .filter(|ch| ch.to_ascii_uppercase() == allele_ch) + .count() + .try_into() + .ok(); + } + None +} + +fn app_binding_chromosome_count_matches( + binding: &serde_json::Value, + observation: &serde_json::Value, +) -> bool { + let Some(expected) = binding + .get("chromosome_count") + .and_then(serde_json::Value::as_i64) + else { + return true; + }; + app_observation_chromosome_count(observation).is_some_and(|actual| actual == expected) +} + +fn app_observation_chromosome_count(observation: &serde_json::Value) -> Option { + match observation + .get("zygosity") + .and_then(serde_json::Value::as_str) + .unwrap_or_default() + { + "hem_ref" | "hem_alt" => Some(1), + "hom_ref" | "het" | "hom_alt" => Some(2), + _ => None, + } +} + +fn app_binding_matches_value( + actual: Option<&serde_json::Value>, + binding: &serde_json::Value, +) -> bool { + let actual = actual.and_then(value_as_string).unwrap_or_default(); + match binding + .get("operator") + .and_then(serde_json::Value::as_str) + .unwrap_or("equals") + { + "equals" => binding + .get("value") + .and_then(value_as_string) + .is_some_and(|value| value == actual), + "in" => binding + .get("values") + .and_then(serde_json::Value::as_array) + .is_some_and(|values| { + values + .iter() + .filter_map(value_as_string) + .any(|value| value == actual) + }), + _ => false, + } +} + +fn app_binding_expected_values(binding: &serde_json::Value) -> Vec { + let mut values = Vec::new(); + if let Some(value) = binding.get("value").and_then(value_as_string) { + values.push(value.clone()); + } + if let Some(array) = binding.get("values").and_then(serde_json::Value::as_array) { + values.extend(array.iter().filter_map(value_as_string)); + } + values +} + +fn app_binding_matches_dosage(dosage: Option, binding: &serde_json::Value) -> bool { + let Some(dosage) = dosage else { + return false; + }; + match binding + .get("operator") + .and_then(serde_json::Value::as_str) + .unwrap_or_default() + { + "dosage_equals" => binding + .get("value") + .and_then(serde_json::Value::as_i64) + .is_some_and(|value| value == dosage), + "dosage_in" => binding + .get("values") + .and_then(serde_json::Value::as_array) + .is_some_and(|values| { + values + .iter() + .filter_map(serde_json::Value::as_i64) + .any(|value| value == dosage) + }), + _ => false, + } +} + +fn value_as_string(value: &serde_json::Value) -> Option { + match value { + serde_json::Value::String(value) => Some(value.clone()), + serde_json::Value::Number(value) => Some(value.to_string()), + serde_json::Value::Bool(value) => Some(value.to_string()), + _ => None, + } +} + +fn app_finding_dedupe_key(finding: &serde_json::Value) -> String { + let effect_key = finding + .get("matched_effect") + .and_then(|effect| { + effect + .get("id") + .or_else(|| effect.get("label")) + .or_else(|| effect.get("text")) + }) + .and_then(value_as_string) + .unwrap_or_default(); + if let Some(evidence) = finding.get("evidence") { + let source = evidence + .get("source") + .and_then(value_as_string) + .unwrap_or_default(); + let kind = evidence + .get("kind") + .and_then(value_as_string) + .unwrap_or_default(); + let id = evidence + .get("id") + .and_then(value_as_string) + .unwrap_or_default(); + if !source.is_empty() || !kind.is_empty() || !id.is_empty() { + return format!("evidence|{source}|{kind}|{id}|{effect_key}"); + } + if let Some(url) = evidence.get("url").and_then(value_as_string) { + return format!("evidence_url|{url}|{effect_key}"); + } + } + if let Some(id) = finding.get("id").and_then(value_as_string) { + return format!("id|{id}|{effect_key}"); + } + format!( + "content|{}|{}|{}|{}", + finding + .get("schema") + .and_then(value_as_string) + .unwrap_or_default(), + finding + .get("label") + .and_then(value_as_string) + .unwrap_or_default(), + finding + .get("notes") + .and_then(value_as_string) + .unwrap_or_default(), + effect_key + ) +} + +#[cfg(test)] +mod report_matching_tests { + use super::*; + + #[test] + fn alt_binding_requires_observed_allele_dosage() { + let binding = serde_json::json!({ + "source": "variant", + "key": "alt", + "value": "G" + }); + let observations = vec![ + serde_json::json!({ + "variant_path": "rs1.yaml", + "ref": "A", + "alt": "G", + "genotype_display": "AA", + "zygosity": "hom_ref" + }), + serde_json::json!({ + "variant_path": "rs2.yaml", + "ref": "A", + "alt": "G", + "genotype_display": "AG", + "zygosity": "het" + }), + ]; + + let matched = app_variant_binding_match_observation(&binding, &observations) + .expect("het alt observation should match"); + assert_eq!( + matched + .get("genotype_display") + .and_then(serde_json::Value::as_str), + Some("AG") + ); + } + + #[test] + fn alt_in_binding_requires_observed_allele_dosage() { + let binding = serde_json::json!({ + "source": "variant", + "key": "alt", + "operator": "in", + "values": ["G", "T"] + }); + let observations = vec![serde_json::json!({ + "variant_path": "rs1.yaml", + "ref": "A", + "alt": "T", + "genotype_display": "AT", + "zygosity": "het" + })]; + + assert!(app_variant_binding_match_observation(&binding, &observations).is_some()); + } + + #[test] + fn hemizygous_observations_count_as_single_allele_dosage() { + let observations = vec![serde_json::json!({ + "variant_path": "rs3813929.yaml", + "ref": "C", + "alt": "T", + "genotype": "1", + "genotype_display": "T", + "zygosity": "hem_alt" + })]; + + let include_binding = serde_json::json!({ + "source": "variant", + "variant": "rs3813929.yaml", + "key": "alt", + "value": "T" + }); + assert!(app_variant_binding_match_observation(&include_binding, &observations).is_some()); + + let effect_binding = serde_json::json!({ + "source": "variant", + "variant": "rs3813929.yaml", + "allele": "T", + "operator": "dosage_equals", + "value": 1, + "chromosome_count": 1 + }); + assert!(app_variant_binding_match_observation(&effect_binding, &observations).is_some()); + } + + #[test] + fn chromosome_count_binding_separates_one_x_and_two_x_rows() { + let observations = vec![serde_json::json!({ + "variant_path": "rs3813929.yaml", + "ref": "C", + "alt": "T", + "genotype": "0/1", + "genotype_display": "CT", + "zygosity": "het" + })]; + + let one_x_binding = serde_json::json!({ + "source": "variant", + "variant": "rs3813929.yaml", + "allele": "T", + "operator": "dosage_equals", + "value": 1, + "chromosome_count": 1 + }); + assert!(app_variant_binding_match_observation(&one_x_binding, &observations).is_none()); + + let two_x_binding = serde_json::json!({ + "source": "variant", + "variant": "rs3813929.yaml", + "allele": "T", + "operator": "dosage_equals", + "value": 1, + "chromosome_count": 2 + }); + assert!(app_variant_binding_match_observation(&two_x_binding, &observations).is_some()); + } +} diff --git a/rust/bioscript-reporting/src/observation.rs b/rust/bioscript-reporting/src/observation.rs new file mode 100644 index 0000000..38e4ced --- /dev/null +++ b/rust/bioscript-reporting/src/observation.rs @@ -0,0 +1,601 @@ +use std::collections::BTreeMap; + +use bioscript_core::{Assembly, GenomicLocus, VariantKind}; +use bioscript_formats::{InferredSex, SexDetectionConfidence, SexInference}; +use bioscript_schema::VariantManifest; + +mod facets; + +use facets::{ + classify_non_reportable_alleles, is_weak_delimited_indel_match, observation_facets, + parse_optional_u32, +}; + +pub struct AppObservationInput<'a> { + pub row: &'a BTreeMap, + pub row_path: &'a str, + pub assay_id: &'a str, + pub manifest: VariantManifest, + pub gene: String, + pub source: serde_json::Value, + pub observed_alt_alleles: Vec, + pub inferred_sex: Option<&'a SexInference>, + pub fallback_assembly: Option, +} + +struct AppObservationJson { + allele_balance: Option, + alt_count: Option, + assay_id: String, + assembly: String, + call: ObservationCallValues, + chrom: String, + depth: Option, + evidence_raw: String, + gene: String, + genotype: String, + genotype_display: String, + kind: String, + locus: Option, + manifest: VariantManifest, + non_reportable_status: Option<&'static str>, + observed_alt_alleles: Vec, + ref_allele: String, + ref_count: Option, + reportable_alt: String, + row: BTreeMap, + row_path: String, + source: serde_json::Value, + weak_indel_match: bool, + zygosity: String, +} + +struct ObservationCallValues { + outcome: &'static str, + status: &'static str, + reported_genotype_display: String, +} + +pub fn app_observation_from_manifest_row(input: AppObservationInput<'_>) -> serde_json::Value { + let AppObservationInput { + row, + row_path, + assay_id, + manifest, + gene, + source, + observed_alt_alleles, + inferred_sex, + fallback_assembly, + } = input; + let ref_allele = manifest.spec.reference.clone().unwrap_or_default(); + let reportable_alt = manifest.spec.alternate.clone().unwrap_or_default(); + let mut genotype_display = row + .get("genotype") + .filter(|value| !value.is_empty()) + .cloned() + .or_else(|| genotype_display_from_raw_counts(row.get("raw_counts")?)) + .unwrap_or_default(); + let depth = parse_optional_u32(row.get("depth")); + let ref_count = parse_optional_u32(row.get("ref_count")); + let alt_count = parse_optional_u32(row.get("alt_count")); + if let Some(normalized_display) = deletion_copy_number_display(row, &manifest, depth, alt_count) + { + genotype_display = normalized_display; + } + let weak_indel_match = is_weak_delimited_indel_match(row, &manifest, &genotype_display); + let allele_balance = match (alt_count, depth) { + (Some(alt_count), Some(depth)) if depth > 0 => { + Some(f64::from(alt_count) / f64::from(depth)) + } + _ => None, + }; + let assembly = row + .get("assembly") + .filter(|value| !value.is_empty()) + .cloned() + .or_else(|| fallback_assembly.map(assembly_row_value)) + .unwrap_or_default(); + let locus = if assembly.eq_ignore_ascii_case("grch37") { + manifest.spec.grch37.as_ref() + } else { + manifest + .spec + .grch38 + .as_ref() + .or(manifest.spec.grch37.as_ref()) + }; + let chrom = locus.map_or(String::new(), |locus| locus.chrom.clone()); + let (genotype, zygosity) = normalize_app_genotype( + &genotype_display, + &ref_allele, + &reportable_alt, + manifest.spec.kind, + &chrom, + inferred_sex, + ); + let non_reportable_status = classify_non_reportable_alleles( + &genotype_display, + &ref_allele, + &reportable_alt, + &observed_alt_alleles, + ); + let call = observation_call_values( + depth, + non_reportable_status, + &genotype, + &zygosity, + &genotype_display, + ); + let evidence_raw = observation_evidence_raw(row, &chrom, inferred_sex); + let kind = manifest.spec.kind.map_or("unknown".to_owned(), |kind| { + format!("{kind:?}").to_lowercase() + }); + render_app_observation_json(AppObservationJson { + allele_balance, + alt_count, + assay_id: assay_id.to_owned(), + assembly, + call, + chrom, + depth, + evidence_raw, + gene, + genotype, + genotype_display, + kind, + locus: locus.cloned(), + manifest, + non_reportable_status, + observed_alt_alleles, + ref_allele, + ref_count, + reportable_alt, + row: row.clone(), + row_path: row_path.to_owned(), + source, + weak_indel_match, + zygosity, + }) +} + +fn observation_call_values( + depth: Option, + non_reportable_status: Option<&'static str>, + genotype: &str, + zygosity: &str, + genotype_display: &str, +) -> ObservationCallValues { + let outcome = if depth == Some(0) { + "not_covered" + } else if non_reportable_status == Some("observed_alt") { + "observed_alt" + } else if non_reportable_status == Some("unknown_alt") { + "unknown_alt" + } else if genotype == "./." { + "no_call" + } else if zygosity == "hom_ref" || zygosity == "hem_ref" { + "reference" + } else if zygosity == "het" || zygosity == "hom_alt" || zygosity == "hem_alt" { + "variant" + } else { + "unknown" + }; + let status = if matches!(outcome, "observed_alt" | "unknown_alt") { + outcome + } else if genotype == "./." { + "no_call" + } else { + "called" + }; + let reported_genotype_display = if matches!(zygosity, "hem_ref" | "hem_alt") { + hemizygous_display_genotype(genotype_display) + } else if genotype_display.is_empty() && matches!(outcome, "no_call" | "not_covered") { + "??".to_owned() + } else { + genotype_display.to_owned() + }; + ObservationCallValues { + outcome, + status, + reported_genotype_display, + } +} + +fn render_app_observation_json(input: AppObservationJson) -> serde_json::Value { + let AppObservationJson { + allele_balance, + alt_count, + assay_id, + assembly, + call, + chrom, + depth, + evidence_raw, + gene, + genotype, + genotype_display, + kind, + locus, + manifest, + non_reportable_status, + observed_alt_alleles, + ref_allele, + ref_count, + reportable_alt, + row, + row_path, + source, + weak_indel_match, + zygosity, + } = input; + serde_json::json!({ + "participant_id": row.get("participant_id").cloned().unwrap_or_default(), + "assay_id": assay_id, + "assay_version": "1.0", + "variant_key": manifest.name, + "variant_path": row_path, + "rsid": row.get("matched_rsid").filter(|value| !value.is_empty()).cloned().or_else(|| manifest.spec.rsids.first().cloned()), + "gene": gene, + "assembly": if assembly.is_empty() { serde_json::Value::Null } else { serde_json::Value::String(assembly.to_uppercase()) }, + "chrom": chrom, + "pos_start": locus.as_ref().map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.start)), + "pos_end": locus.as_ref().map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.end)), + "ref": ref_allele, + "alt": reportable_alt, + "kind": kind, + "match_status": if row.get("matched_rsid").is_some_and(|value| !value.is_empty()) || !genotype_display.is_empty() { "found" } else { "not_found" }, + "coverage_status": depth.map_or("covered", |depth| if depth > 0 { "covered" } else { "not_covered" }), + "call_status": call.status, + "genotype": genotype, + "genotype_display": call.reported_genotype_display, + "zygosity": zygosity, + "ref_count": ref_count, + "alt_count": alt_count, + "depth": depth, + "genotype_quality": serde_json::Value::Null, + "allele_balance": allele_balance, + "outcome": call.outcome, + "evidence_type": if row.get("backend").is_some_and(|value| value == "cram") { "mpileup" } else { "genotype_file" }, + "evidence_raw": evidence_raw, + "source": source, + "match_quality": if weak_indel_match { serde_json::Value::String("weak".to_owned()) } else { serde_json::Value::Null }, + "match_notes": if weak_indel_match { + serde_json::Value::String("consumer genotype file reported an insertion/deletion token at the marker, not sequence-resolved evidence for the exact deletion allele".to_owned()) + } else { + serde_json::Value::Null + }, + "facets": observation_facets(non_reportable_status, &observed_alt_alleles), + }) +} + +fn assembly_row_value(assembly: Assembly) -> String { + match assembly { + Assembly::Grch37 => "grch37".to_owned(), + Assembly::Grch38 => "grch38".to_owned(), + } +} + +fn hemizygous_display_genotype(display: &str) -> String { + display + .chars() + .find(char::is_ascii_alphabetic) + .map_or_else(|| display.to_owned(), |allele| allele.to_string()) +} + +fn deletion_copy_number_display( + row: &BTreeMap, + manifest: &VariantManifest, + depth: Option, + alt_count: Option, +) -> Option { + if !matches!(manifest.spec.kind, Some(VariantKind::Deletion)) { + return None; + } + if !matches!(row.get("backend").map(String::as_str), Some("cram" | "bam")) { + return None; + } + if manifest.spec.reference.as_deref().unwrap_or_default().len() <= 1 { + return None; + } + let depth = depth?; + if depth == 0 { + return None; + } + let alt_fraction = f64::from(alt_count.unwrap_or(0)) / f64::from(depth); + if alt_fraction >= 0.8 { + Some("DD".to_owned()) + } else if alt_fraction <= 0.2 { + Some("II".to_owned()) + } else { + Some("DI".to_owned()) + } +} + +fn normalize_app_genotype( + display: &str, + ref_allele: &str, + alt_allele: &str, + kind: Option, + chrom: &str, + inferred_sex: Option<&SexInference>, +) -> (String, String) { + if display.is_empty() { + return ("./.".to_owned(), "unknown".to_owned()); + } + if matches!(kind, Some(VariantKind::Deletion)) + && ref_allele.len() != 1 + && display + .chars() + .filter(char::is_ascii_alphabetic) + .all(|allele| matches!(allele.to_ascii_uppercase(), 'I' | 'D')) + { + return normalize_app_genotype(display, "I", "D", None, chrom, inferred_sex); + } + let alleles: Vec = display.chars().filter(char::is_ascii_alphabetic).collect(); + if ref_allele.len() != 1 || alt_allele.len() != 1 { + return (display.to_owned(), "unknown".to_owned()); + } + let ref_ch = ref_allele.chars().next().unwrap_or_default(); + let alt_ch = alt_allele.chars().next().unwrap_or_default(); + if alleles.len() == 1 && is_haploid_sex_chromosome(chrom) { + let allele = alleles[0]; + if allele == ref_ch { + return ("0".to_owned(), "hem_ref".to_owned()); + } + if allele == alt_ch { + return ("1".to_owned(), "hem_alt".to_owned()); + } + return (display.to_owned(), "unknown".to_owned()); + } + if alleles.len() != 2 { + return (display.to_owned(), "unknown".to_owned()); + } + if is_confident_male_sex_chromosome(chrom, inferred_sex) && alleles[0] == alleles[1] { + let allele = alleles[0]; + if allele == ref_ch { + return ("0".to_owned(), "hem_ref".to_owned()); + } + if allele == alt_ch { + return ("1".to_owned(), "hem_alt".to_owned()); + } + return (display.to_owned(), "unknown".to_owned()); + } + let alt_count = alleles.iter().filter(|allele| **allele == alt_ch).count(); + let ref_count = alleles.iter().filter(|allele| **allele == ref_ch).count(); + match (ref_count, alt_count) { + (2, 0) => ("0/0".to_owned(), "hom_ref".to_owned()), + (1, 1) => ("0/1".to_owned(), "het".to_owned()), + (0, 2) => ("1/1".to_owned(), "hom_alt".to_owned()), + _ => (display.to_owned(), "unknown".to_owned()), + } +} + +fn is_confident_male_sex_chromosome(chrom: &str, inferred_sex: Option<&SexInference>) -> bool { + is_haploid_sex_chromosome(chrom) + && inferred_sex.is_some_and(|sex| { + sex.sex == InferredSex::Male + && matches!( + sex.confidence, + SexDetectionConfidence::High | SexDetectionConfidence::Medium + ) + }) +} + +fn is_haploid_sex_chromosome(chrom: &str) -> bool { + matches!( + chrom + .trim() + .trim_start_matches("chr") + .trim_start_matches("CHR") + .to_ascii_uppercase() + .as_str(), + "X" | "Y" | "23" | "24" + ) +} + +fn observation_evidence_raw( + row: &BTreeMap, + chrom: &str, + inferred_sex: Option<&SexInference>, +) -> String { + let mut evidence_raw = row.get("evidence").cloned().unwrap_or_default(); + if !is_haploid_sex_chromosome(chrom) { + return evidence_raw; + } + let Some(inferred_sex) = inferred_sex else { + return evidence_raw; + }; + let sex_evidence = sex_inference_evidence_raw(inferred_sex); + if sex_evidence.is_empty() { + return evidence_raw; + } + if evidence_raw.is_empty() { + evidence_raw = sex_evidence; + } else { + evidence_raw.push_str(" | "); + evidence_raw.push_str(&sex_evidence); + } + evidence_raw +} + +fn sex_inference_evidence_raw(inferred_sex: &SexInference) -> String { + let sex = match inferred_sex.sex { + InferredSex::Male => "male", + InferredSex::Female => "female", + InferredSex::Unknown => "unknown", + }; + let confidence = match inferred_sex.confidence { + SexDetectionConfidence::High => "high", + SexDetectionConfidence::Medium => "medium", + SexDetectionConfidence::Low => "low", + }; + let mut fields = vec![ + format!("detected_sex={sex}"), + format!("sex_confidence={confidence}"), + format!("sex_method={}", inferred_sex.method), + ]; + fields.extend( + inferred_sex + .evidence + .iter() + .map(|item| format!("sex_{item}")), + ); + fields.join(" ") +} + +fn genotype_display_from_raw_counts(raw_counts: &str) -> Option { + let counts: serde_json::Map = + serde_json::from_str(raw_counts).ok()?; + let mut items = counts + .into_iter() + .filter_map(|(base, count)| { + let base = base.chars().next()?.to_ascii_uppercase(); + let count = count.as_u64()?; + if matches!(base, 'A' | 'C' | 'G' | 'T') && count > 0 { + Some((base, count)) + } else { + None + } + }) + .collect::>(); + if items.is_empty() { + return None; + } + items.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0))); + let total = items.iter().map(|(_, count)| *count).sum::(); + let (top_base, top_count) = items[0]; + if total == 0 || items.len() == 1 || top_count.saturating_mul(10) >= total.saturating_mul(8) { + return Some(format!("{top_base}{top_base}")); + } + Some(format!("{}{}", top_base, items[1].0)) +} + +#[cfg(test)] +mod tests { + use super::*; + use bioscript_core::VariantSpec; + use std::path::PathBuf; + + #[test] + fn normalizes_long_deletion_reference_tokens_as_insertion_deletion_copy_number() { + assert_eq!( + normalize_app_genotype( + "II", + "TTATAA", + "", + Some(VariantKind::Deletion), + "22", + None, + ), + ("0/0".to_owned(), "hom_ref".to_owned()) + ); + assert_eq!( + normalize_app_genotype( + "ID", + "TTATAA", + "", + Some(VariantKind::Deletion), + "22", + None, + ), + ("0/1".to_owned(), "het".to_owned()) + ); + } + + #[test] + fn displays_cram_long_deletion_copy_number_as_insertion_deletion_tokens() { + let manifest = VariantManifest { + path: PathBuf::from("rs71785313.yaml"), + name: "APOL1_G2".to_owned(), + tags: Vec::new(), + spec: VariantSpec { + reference: Some("TTATAA".to_owned()), + alternate: Some("".to_owned()), + kind: Some(VariantKind::Deletion), + ..VariantSpec::default() + }, + }; + let mut row = BTreeMap::new(); + row.insert("backend".to_owned(), "cram".to_owned()); + + assert_eq!( + deletion_copy_number_display(&row, &manifest, Some(39), Some(0)).as_deref(), + Some("II") + ); + assert_eq!( + deletion_copy_number_display(&row, &manifest, Some(39), Some(39)).as_deref(), + Some("DD") + ); + assert_eq!( + deletion_copy_number_display(&row, &manifest, Some(40), Some(20)).as_deref(), + Some("DI") + ); + } + + #[test] + fn raw_counts_can_fill_display_for_homozygous_and_heterozygous_observations() { + assert_eq!( + genotype_display_from_raw_counts(r#"{"T": 24}"#).as_deref(), + Some("TT") + ); + assert_eq!( + genotype_display_from_raw_counts(r#"{"C": 12, "T": 10}"#).as_deref(), + Some("CT") + ); + } + + #[test] + fn non_reportable_alleles_are_classified_as_observed_or_unknown() { + assert_eq!( + classify_non_reportable_alleles("TT", "C", "G", &["T".to_owned()]), + Some("observed_alt") + ); + assert_eq!( + classify_non_reportable_alleles("AT", "C", "G", &["T".to_owned()]), + Some("unknown_alt") + ); + assert_eq!( + classify_non_reportable_alleles("CG", "C", "G", &["T".to_owned()]), + None + ); + } + + #[test] + fn single_allele_sex_chromosome_calls_are_treated_as_hemizygous() { + assert_eq!( + normalize_app_genotype("G", "C", "G", None, "X", None), + ("1".to_owned(), "hem_alt".to_owned()) + ); + assert_eq!( + normalize_app_genotype("C", "C", "G", None, "chrX", None), + ("0".to_owned(), "hem_ref".to_owned()) + ); + assert_eq!( + normalize_app_genotype("G", "C", "G", None, "1", None), + ("G".to_owned(), "unknown".to_owned()) + ); + assert_eq!( + normalize_app_genotype("GG", "C", "G", None, "X", None), + ("1/1".to_owned(), "hom_alt".to_owned()) + ); + } + + #[test] + fn confident_male_sex_chromosome_duplicate_calls_are_hemizygous() { + let inferred_sex = SexInference { + sex: InferredSex::Male, + confidence: SexDetectionConfidence::High, + method: "vcf_non_par_x_gt".to_owned(), + evidence: vec!["called_y_snps=1200".to_owned()], + }; + assert_eq!( + normalize_app_genotype("GG", "C", "G", None, "X", Some(&inferred_sex)), + ("1".to_owned(), "hem_alt".to_owned()) + ); + assert_eq!( + normalize_app_genotype("CC", "C", "G", None, "chrX", Some(&inferred_sex)), + ("0".to_owned(), "hem_ref".to_owned()) + ); + } +} diff --git a/rust/bioscript-reporting/src/observation/facets.rs b/rust/bioscript-reporting/src/observation/facets.rs new file mode 100644 index 0000000..40a8951 --- /dev/null +++ b/rust/bioscript-reporting/src/observation/facets.rs @@ -0,0 +1,81 @@ +use std::collections::BTreeMap; + +use bioscript_core::VariantKind; +use bioscript_schema::VariantManifest; + +pub(super) fn classify_non_reportable_alleles( + display: &str, + ref_allele: &str, + reportable_alt: &str, + observed_alts: &[String], +) -> Option<&'static str> { + if display.is_empty() || ref_allele.len() != 1 || reportable_alt.len() != 1 { + return None; + } + let ref_ch = ref_allele.chars().next()?.to_ascii_uppercase(); + let alt_ch = reportable_alt.chars().next()?.to_ascii_uppercase(); + let non_reportable = display + .chars() + .filter(char::is_ascii_alphabetic) + .map(|ch| ch.to_ascii_uppercase()) + .filter(|ch| *ch != ref_ch && *ch != alt_ch) + .collect::>(); + if non_reportable.is_empty() { + return None; + } + if non_reportable.iter().all(|ch| { + observed_alts.iter().any(|alt| { + alt.len() == 1 + && alt + .chars() + .next() + .is_some_and(|alt_ch| alt_ch.to_ascii_uppercase() == *ch) + }) + }) { + Some("observed_alt") + } else { + Some("unknown_alt") + } +} + +pub(super) fn is_weak_delimited_indel_match( + row: &BTreeMap, + manifest: &VariantManifest, + genotype_display: &str, +) -> bool { + if !matches!(manifest.spec.kind, Some(VariantKind::Deletion)) { + return false; + } + if !matches!(row.get("backend").map(String::as_str), Some("text" | "zip")) { + return false; + } + if manifest.spec.reference.as_deref().unwrap_or_default().len() <= 1 { + return false; + } + genotype_display + .chars() + .filter(char::is_ascii_alphabetic) + .all(|allele| matches!(allele.to_ascii_uppercase(), 'I' | 'D')) +} + +pub(super) fn observation_facets( + non_reportable_status: Option<&str>, + observed_alts: &[String], +) -> serde_json::Value { + let mut facets = Vec::new(); + if let Some(status) = non_reportable_status { + facets.push(status.to_owned()); + if status == "observed_alt" && !observed_alts.is_empty() { + facets.push(format!("known_observed_alts={}", observed_alts.join(","))); + } + } + if facets.is_empty() { + serde_json::Value::Null + } else { + serde_json::Value::String(facets.join(";")) + } +} + +pub(super) fn parse_optional_u32(value: Option<&String>) -> Option { + value.and_then(|value| value.parse::().ok()) +} diff --git a/rust/bioscript-reporting/src/report_json.rs b/rust/bioscript-reporting/src/report_json.rs new file mode 100644 index 0000000..e3c5e44 --- /dev/null +++ b/rust/bioscript-reporting/src/report_json.rs @@ -0,0 +1,181 @@ +#[derive(Clone, Copy)] +pub struct AppReportJsonInput<'a> { + pub assay_id: &'a str, + pub participant_id: &'a str, + pub input_file_name: &'a str, + pub input_file_path: &'a str, + pub observations: &'a [serde_json::Value], + pub analyses: &'a [serde_json::Value], + pub findings: &'a [serde_json::Value], + pub provenance: &'a [serde_json::Value], + pub input_inspection: Option<&'a bioscript_formats::FileInspection>, + pub manifest_metadata: &'a serde_json::Value, +} + +#[derive(Clone, Copy)] +pub struct AppInputReportInput<'a> { + pub assay_id: &'a str, + pub participant_id: &'a str, + pub input_file_name: &'a str, + pub input_file_path: &'a str, + pub observations: &'a [serde_json::Value], + pub analyses: &'a [serde_json::Value], + pub findings: &'a [serde_json::Value], + pub provenance: &'a [serde_json::Value], + pub input_inspection: Option<&'a bioscript_formats::FileInspection>, + pub manifest_metadata: &'a serde_json::Value, +} + +pub fn app_input_report_json(input: AppInputReportInput<'_>) -> serde_json::Value { + let matched_findings = + crate::match_app_findings(input.findings, input.observations, input.analyses); + app_report_json(AppReportJsonInput { + assay_id: input.assay_id, + participant_id: input.participant_id, + input_file_name: input.input_file_name, + input_file_path: input.input_file_path, + observations: input.observations, + analyses: input.analyses, + findings: &matched_findings, + provenance: input.provenance, + input_inspection: input.input_inspection, + manifest_metadata: input.manifest_metadata, + }) +} + +pub fn app_report_json(input: AppReportJsonInput<'_>) -> serde_json::Value { + let called = input + .observations + .iter() + .filter(|item| { + item.get("call_status").and_then(serde_json::Value::as_str) == Some("called") + }) + .count(); + let input_debug = input.input_inspection.map(|inspection| { + let mut value = input_inspection_json(inspection); + if observations_have_imputed_vcf_references(input.observations) + && let Some(object) = value.as_object_mut() + { + object.insert( + "vcf_missing_reference_imputation".to_owned(), + serde_json::Value::Bool(true), + ); + } + value + }); + serde_json::json!({ + "schema": "bioscript:report:1.0", + "version": "1.0", + "participant_id": input.participant_id, + "assay_id": input.assay_id, + "assay_version": "1.0", + "manifest": input.manifest_metadata, + "input": { + "file_name": input.input_file_name, + "file_path": input.input_file_path, + "debug": input_debug, + }, + "report_status": if called == input.observations.len() { "complete" } else { "partial" }, + "derived_from": input.observations.iter().filter_map(|item| item.get("variant_key").cloned()).collect::>(), + "analyses": input.analyses, + "findings": input.findings, + "provenance": input.provenance, + "metrics": { + "n_sites_tested": input.observations.len(), + "n_sites_called": called, + "n_sites_missing": input.observations.len().saturating_sub(called), + "n_analyses": input.analyses.len(), + "n_findings_matched": input.findings.len(), + } + }) +} + +fn observations_have_imputed_vcf_references(observations: &[serde_json::Value]) -> bool { + observations.iter().any(|observation| { + observation + .get("evidence_raw") + .and_then(serde_json::Value::as_str) + .is_some_and(|evidence| { + evidence.contains("imputed reference genotype from absent variant-only VCF record") + }) + }) +} + +fn input_inspection_json(inspection: &bioscript_formats::FileInspection) -> serde_json::Value { + serde_json::json!({ + "container": file_container_name(inspection.container), + "format": detected_kind_name(inspection.detected_kind), + "format_confidence": detection_confidence_name(inspection.confidence), + "assembly": inspection.assembly.map(assembly_name), + "phased": inspection.phased, + "selected_entry": inspection.selected_entry, + "has_index": inspection.has_index, + "index_path": inspection.index_path.as_ref().map(|path| path.display().to_string()), + "reference_matches": inspection.reference_matches, + "source": inspection.source.as_ref().map(|source| serde_json::json!({ + "vendor": source.vendor, + "platform_version": source.platform_version, + "confidence": detection_confidence_name(source.confidence), + "evidence": source.evidence, + })), + "inferred_sex": inspection.inferred_sex.as_ref().map(|sex| serde_json::json!({ + "sex": inferred_sex_name(sex.sex), + "confidence": sex_detection_confidence_name(sex.confidence), + "method": sex.method, + "evidence": sex.evidence, + })), + "evidence": inspection.evidence, + "warnings": inspection.warnings, + "duration_ms": inspection.duration_ms, + }) +} + +fn file_container_name(value: bioscript_formats::FileContainer) -> &'static str { + match value { + bioscript_formats::FileContainer::Plain => "plain", + bioscript_formats::FileContainer::Zip => "zip", + } +} + +fn detected_kind_name(value: bioscript_formats::DetectedKind) -> &'static str { + match value { + bioscript_formats::DetectedKind::GenotypeText => "genotype_text", + bioscript_formats::DetectedKind::Vcf => "vcf", + bioscript_formats::DetectedKind::AlignmentCram => "alignment_cram", + bioscript_formats::DetectedKind::AlignmentBam => "alignment_bam", + bioscript_formats::DetectedKind::ReferenceFasta => "reference_fasta", + bioscript_formats::DetectedKind::Unknown => "unknown", + } +} + +fn detection_confidence_name(value: bioscript_formats::DetectionConfidence) -> &'static str { + match value { + bioscript_formats::DetectionConfidence::Authoritative => "authoritative", + bioscript_formats::DetectionConfidence::StrongHeuristic => "strong_heuristic", + bioscript_formats::DetectionConfidence::WeakHeuristic => "weak_heuristic", + bioscript_formats::DetectionConfidence::Unknown => "unknown", + } +} + +fn assembly_name(value: bioscript_core::Assembly) -> &'static str { + match value { + bioscript_core::Assembly::Grch37 => "grch37", + bioscript_core::Assembly::Grch38 => "grch38", + } +} + +fn inferred_sex_name(value: bioscript_formats::InferredSex) -> &'static str { + match value { + bioscript_formats::InferredSex::Male => "male", + bioscript_formats::InferredSex::Female => "female", + bioscript_formats::InferredSex::Unknown => "unknown", + } +} + +fn sex_detection_confidence_name(value: bioscript_formats::SexDetectionConfidence) -> &'static str { + match value { + bioscript_formats::SexDetectionConfidence::High => "high", + bioscript_formats::SexDetectionConfidence::Medium => "medium", + bioscript_formats::SexDetectionConfidence::Low => "low", + } +} diff --git a/rust/bioscript-reporting/src/rows.rs b/rust/bioscript-reporting/src/rows.rs new file mode 100644 index 0000000..7f8d823 --- /dev/null +++ b/rust/bioscript-reporting/src/rows.rs @@ -0,0 +1,149 @@ +use std::collections::BTreeMap; + +use bioscript_core::{Assembly, VariantObservation}; + +pub const MANIFEST_ROW_TSV_HEADERS: [&str; 13] = [ + "kind", + "name", + "path", + "tags", + "participant_id", + "backend", + "matched_rsid", + "assembly", + "genotype", + "ref_count", + "alt_count", + "depth", + "evidence", +]; + +pub fn variant_row( + path: &str, + name: &str, + tags: &[String], + observation: &VariantObservation, + participant_id: &str, +) -> BTreeMap { + let mut row = BTreeMap::new(); + row.insert("kind".to_owned(), "variant".to_owned()); + row.insert("name".to_owned(), name.to_owned()); + row.insert("path".to_owned(), path.to_owned()); + row.insert("tags".to_owned(), tags.join(",")); + row.insert("backend".to_owned(), observation.backend.clone()); + row.insert("participant_id".to_owned(), participant_id.to_owned()); + row.insert( + "matched_rsid".to_owned(), + observation.matched_rsid.clone().unwrap_or_default(), + ); + row.insert( + "assembly".to_owned(), + observation + .assembly + .map(assembly_row_value) + .unwrap_or_default(), + ); + row.insert( + "genotype".to_owned(), + observation.genotype.clone().unwrap_or_default(), + ); + row.insert( + "ref_count".to_owned(), + observation + .ref_count + .map_or_else(String::new, |value| value.to_string()), + ); + row.insert( + "alt_count".to_owned(), + observation + .alt_count + .map_or_else(String::new, |value| value.to_string()), + ); + row.insert( + "depth".to_owned(), + observation + .depth + .map_or_else(String::new, |value| value.to_string()), + ); + row.insert( + "raw_counts".to_owned(), + serde_json::to_string(&observation.raw_counts).unwrap_or_default(), + ); + row.insert("evidence".to_owned(), observation.evidence.join(" | ")); + row +} + +pub fn render_manifest_rows_tsv(rows: &[BTreeMap]) -> String { + let mut out = MANIFEST_ROW_TSV_HEADERS.join("\t"); + out.push('\n'); + for row in rows { + let line = MANIFEST_ROW_TSV_HEADERS + .iter() + .map(|header| { + row.get(*header) + .cloned() + .unwrap_or_default() + .replace('\t', " ") + }) + .collect::>() + .join("\t"); + out.push_str(&line); + out.push('\n'); + } + out +} + +pub fn render_manifest_trace_tsv(rows: &[BTreeMap]) -> String { + let mut trace = String::from("step\tline\tcode\n"); + for (idx, row) in rows.iter().enumerate() { + trace.push_str(&(idx + 1).to_string()); + trace.push('\t'); + trace.push_str(&(idx + 1).to_string()); + trace.push('\t'); + trace.push_str(&row.get("path").cloned().unwrap_or_default()); + trace.push('\n'); + } + trace +} + +fn assembly_row_value(assembly: Assembly) -> String { + match assembly { + Assembly::Grch37 => "grch37".to_owned(), + Assembly::Grch38 => "grch38".to_owned(), + } +} + +#[cfg(test)] +mod tests { + use super::{MANIFEST_ROW_TSV_HEADERS, render_manifest_rows_tsv, render_manifest_trace_tsv}; + + #[test] + fn renders_manifest_rows_with_cli_header_order() { + let mut row = std::collections::BTreeMap::new(); + row.insert("kind".to_owned(), "variant".to_owned()); + row.insert("name".to_owned(), "APOE\trs429358".to_owned()); + row.insert("path".to_owned(), "assets/APOE/rs429358.yaml".to_owned()); + row.insert("genotype".to_owned(), "TT".to_owned()); + + let text = render_manifest_rows_tsv(&[row]); + let mut lines = text.lines(); + assert_eq!(lines.next().unwrap(), MANIFEST_ROW_TSV_HEADERS.join("\t")); + assert_eq!( + lines.next().unwrap(), + "variant\tAPOE rs429358\tassets/APOE/rs429358.yaml\t\t\t\t\t\tTT\t\t\t\t" + ); + } + + #[test] + fn renders_manifest_trace_with_cli_format() { + let mut first = std::collections::BTreeMap::new(); + first.insert("path".to_owned(), "assets/APOE/rs429358.yaml".to_owned()); + let mut second = std::collections::BTreeMap::new(); + second.insert("path".to_owned(), "assets/APOE/rs7412.yaml".to_owned()); + + assert_eq!( + render_manifest_trace_tsv(&[first, second]), + "step\tline\tcode\n1\t1\tassets/APOE/rs429358.yaml\n2\t2\tassets/APOE/rs7412.yaml\n" + ); + } +} diff --git a/rust/bioscript-wasm/Cargo.toml b/rust/bioscript-wasm/Cargo.toml index 7ef98ae..b54f449 100644 --- a/rust/bioscript-wasm/Cargo.toml +++ b/rust/bioscript-wasm/Cargo.toml @@ -9,11 +9,12 @@ crate-type = ["cdylib", "rlib"] [dependencies] bioscript-core = { path = "../bioscript-core" } bioscript-formats = { path = "../bioscript-formats" } +bioscript-reporting = { path = "../bioscript-reporting" } bioscript-runtime = { path = "../bioscript-runtime" } bioscript-schema = { path = "../bioscript-schema" } getrandom = { version = "0.3", features = ["wasm_js"] } monty = { path = "../../monty/crates/monty" } -noodles = { version = "0.109.0", features = ["bam", "bgzf", "cram", "csi", "fasta", "tabix"] } +noodles = { version = "0.110.0", features = ["bam", "bgzf", "core", "cram", "csi", "fasta", "sam", "tabix", "vcf"] } wasm-bindgen = "0.2" js-sys = "0.3" serde = { version = "1", features = ["derive"] } diff --git a/rust/bioscript-wasm/src/lib.rs b/rust/bioscript-wasm/src/lib.rs index f345c8b..f057ac9 100644 --- a/rust/bioscript-wasm/src/lib.rs +++ b/rust/bioscript-wasm/src/lib.rs @@ -14,20 +14,20 @@ //! - Index-less fallback (linear scan or on-the-fly index build). //! - Indel / deletion observations on CRAM. -mod inspect_api; mod index_api; +mod inspect_api; mod js_reader; mod lookup_api; mod package_api; mod report_api; mod variant_yaml; -pub use inspect_api::{inspect_bytes, resolve_remote_resource_text}; pub use index_api::{ generate_bam_bai, generate_bam_bai_from_reader, generate_cram_crai, generate_cram_crai_from_reader, generate_fasta_fai, generate_fasta_fai_from_reader, generate_vcf_tbi, }; +pub use inspect_api::{inspect_bytes, resolve_remote_resource_text}; pub use lookup_api::{ lookup_cram_variants, lookup_genotype_bytes_rsids, lookup_genotype_bytes_variants, lookup_vcf_variants, diff --git a/rust/bioscript-wasm/src/report_api.rs b/rust/bioscript-wasm/src/report_api.rs index d77d155..459c621 100644 --- a/rust/bioscript-wasm/src/report_api.rs +++ b/rust/bioscript-wasm/src/report_api.rs @@ -1,58 +1,42 @@ use std::{ collections::BTreeMap, - fmt::Write as _, path::{Path, PathBuf}, time::Duration, }; use bioscript_core::{ - Assembly, GenomicLocus, OBSERVATION_TSV_HEADERS, RuntimeError, VariantKind, VariantObservation, - VariantSpec, + Assembly, GenomicLocus, RuntimeError, VariantKind, VariantObservation, VariantSpec, }; use bioscript_formats::{ - DetectedKind, DetectionConfidence, FileContainer, FileInspection, GenotypeLoadOptions, - GenotypeStore, InferredSex, InspectOptions, SexDetectionConfidence, SexInference, + GenotypeLoadOptions, GenotypeStore, InspectOptions, SexInference, inspect_bytes as inspect_bytes_rs, }; use bioscript_runtime::{BioscriptRuntime, RuntimeConfig}; -use bioscript_schema::{ - AssayManifest, PanelInterpretation, PanelManifest, VariantManifest, load_assay_manifest_text, - load_panel_manifest_text, load_variant_manifest_text, -}; +use bioscript_schema::{PanelInterpretation, VariantManifest, load_variant_manifest_text}; use monty::{MontyObject, ResourceLimits}; use serde::{Deserialize, Serialize}; use wasm_bindgen::prelude::*; +#[path = "report_api/analysis_cache.rs"] +mod analysis_cache; #[path = "report_helpers.rs"] mod report_helpers; #[path = "report_input_inspection.rs"] mod report_input_inspection; #[path = "report_lookup.rs"] mod report_lookup; -#[path = "report_render.rs"] -mod report_render; #[path = "report_workspace.rs"] mod report_workspace; +use analysis_cache::analysis_cache_observations; use report_helpers::*; use report_input_inspection::{ decompress_vcf_head_lines, explicit_sex_from_options, inspect_head_via_js_reader, vcf_sex_via_tabix, }; use report_lookup::{BamReportLookup, CramReportLookup, VcfReportLookup}; -use report_render::{ - AppReportJsonInput, app_report_json, match_app_findings, render_app_html_document, -}; use report_workspace::PackageWorkspace; -include!("../../bioscript-cli/src/report_matching.rs"); -include!("../../bioscript-cli/src/report_html_sections.rs"); -include!("../../bioscript-cli/src/report_html_analysis.rs"); -include!("../../bioscript-cli/src/report_html_provenance.rs"); -include!("../../bioscript-cli/src/report_html_observations.rs"); -include!("../../bioscript-cli/src/report_html_pgx.rs"); -include!("../../bioscript-cli/src/report_html_helpers.rs"); - #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub(super) struct PackageFileInput { @@ -80,40 +64,6 @@ pub(super) struct ReportOptionsInput { sample_sex: Option, } -fn analysis_cache_observations( - manifest_observations: &[VariantObservation], - app_observations: &[serde_json::Value], -) -> Vec { - manifest_observations - .iter() - .map(|observation| { - let mut observation = observation.clone(); - if let Some(app_observation) = matching_app_observation(&observation, app_observations) - && let Some(genotype_display) = app_observation - .get("genotype_display") - .and_then(serde_json::Value::as_str) - .filter(|value| !value.is_empty() && *value != "??") - { - observation.genotype = Some(genotype_display.to_owned()); - } - observation - }) - .collect() -} - -fn matching_app_observation<'a>( - observation: &VariantObservation, - app_observations: &'a [serde_json::Value], -) -> Option<&'a serde_json::Value> { - let matched_rsid = observation.matched_rsid.as_deref()?; - app_observations.iter().find(|app_observation| { - app_observation - .get("rsid") - .and_then(serde_json::Value::as_str) - == Some(matched_rsid) - }) -} - #[derive(Serialize)] #[serde(rename_all = "camelCase")] struct ReportArtifactOutput { @@ -149,10 +99,7 @@ pub fn run_package_report_bytes( }; let workspace = PackageWorkspace::new(package_files)?; let participant_id = participant_id_from_name(input_name); - let assay_id = app_assay_id_from_workspace(&workspace, manifest_path)?; - let manifest_metadata = workspace.report_manifest_metadata(manifest_path)?; - let findings = workspace.load_manifest_findings(manifest_path)?; - let provenance = workspace.load_manifest_provenance_links(manifest_path)?; + let manifest_context = workspace.report_manifest_context(manifest_path)?; let inspect_options = InspectOptions { input_index: None, reference_file: None, @@ -161,12 +108,14 @@ pub fn run_package_report_bytes( }; let input_inspection = inspect_bytes_rs(input_name, input_bytes, &inspect_options) .map_err(|err| JsError::new(&format!("inspect input failed: {err:?}")))?; - let mut loader = GenotypeLoadOptions::default(); - loader.assembly = input_inspection.assembly; - loader.inferred_sex = input_inspection - .inferred_sex - .as_ref() - .map(|inference| inference.sex); + let loader = GenotypeLoadOptions { + assembly: input_inspection.assembly, + inferred_sex: input_inspection + .inferred_sex + .as_ref() + .map(|inference| inference.sex), + ..Default::default() + }; let store = GenotypeStore::from_bytes(input_name, input_bytes) .map_err(|err| JsError::new(&format!("load genotypes failed: {err:?}")))?; let manifest_output = @@ -177,7 +126,7 @@ pub fn run_package_report_bytes( .map(|row| { workspace.app_observation_from_manifest_row( row, - &assay_id, + &manifest_context.assay_id, input_inspection.inferred_sex.as_ref(), input_inspection.assembly, ) @@ -192,40 +141,22 @@ pub fn run_package_report_bytes( &loader, &options, )?; - let matched_findings = match_app_findings(&findings, &observations, &analyses); - let reports = vec![app_report_json(AppReportJsonInput { - assay_id: &assay_id, - participant_id: &participant_id, - input_file_name: input_name, - observations: &observations, - analyses: &analyses, - findings: &matched_findings, - provenance: &provenance, - input_inspection: Some(&input_inspection), - manifest_metadata: &manifest_metadata, - })]; - let observations_tsv = render_app_observations_tsv(&observations)?; - let analysis_jsonl = render_jsonl(&analyses)?; - let reports_jsonl = render_jsonl(&reports)?; - let html = render_app_html_document(&observations, &reports)?; - let text_output = format!( - "observations: observations.tsv\nanalysis: analysis.jsonl\nreports: reports.jsonl\nhtml: index.html\n" - ); - serde_json::to_string(&ReportRunOutput { - artifacts: vec![ - artifact( - "observations.tsv", - "text/tab-separated-values", - observations_tsv, - ), - artifact("analysis.jsonl", "application/jsonl", analysis_jsonl), - artifact("reports.jsonl", "application/jsonl", reports_jsonl), - artifact("index.html", "text/html", html), - ], - duration_ms: (js_sys::Date::now() - started_ms).max(0.0) as u128, - text_output, - }) - .map_err(|err| JsError::new(&format!("failed to encode report output: {err}"))) + let artifacts = bioscript_reporting::render_input_report_artifact_texts( + bioscript_reporting::AppInputReportInput { + assay_id: &manifest_context.assay_id, + participant_id: &participant_id, + input_file_name: input_name, + input_file_path: input_name, + observations: &observations, + analyses: &analyses, + findings: &manifest_context.findings, + provenance: &manifest_context.provenance, + input_inspection: Some(&input_inspection), + manifest_metadata: &manifest_context.manifest_metadata, + }, + ) + .map_err(|err| JsError::new(&err))?; + encode_report_run_output(started_ms, artifacts) } /// Mirrors `runPackageReportBytes` but for CRAM input. The CRAM body and @@ -263,10 +194,7 @@ pub fn run_package_report_from_cram( }; let workspace = PackageWorkspace::new(package_files)?; let participant_id = participant_id_from_name(input_name); - let assay_id = app_assay_id_from_workspace(&workspace, manifest_path)?; - let manifest_metadata = workspace.report_manifest_metadata(manifest_path)?; - let findings = workspace.load_manifest_findings(manifest_path)?; - let provenance = workspace.load_manifest_provenance_links(manifest_path)?; + let manifest_context = workspace.report_manifest_context(manifest_path)?; let mut head_inspection = inspect_head_via_js_reader( &cram_read_at, cram_len as u64, @@ -316,15 +244,19 @@ pub fn run_package_report_from_cram( } } - let mut loader = GenotypeLoadOptions::default(); - loader.format = Some(bioscript_formats::GenotypeSourceFormat::Cram); - loader.allow_reference_md5_mismatch = true; + let loader = GenotypeLoadOptions { + format: Some(bioscript_formats::GenotypeSourceFormat::Cram), + allow_reference_md5_mismatch: true, + ..Default::default() + }; let manifest_output = workspace.run_manifest_rows(manifest_path, &lookup, &participant_id, &options.filters)?; let observations = manifest_output .rows .iter() - .map(|row| workspace.app_observation_from_manifest_row(row, &assay_id, None, None)) + .map(|row| { + workspace.app_observation_from_manifest_row(row, &manifest_context.assay_id, None, None) + }) .collect::, _>>()?; let analysis_observations = analysis_cache_observations(&manifest_output.observations, &observations); @@ -343,38 +275,22 @@ pub fn run_package_report_from_cram( &loader, &options, )?; - let matched_findings = match_app_findings(&findings, &observations, &analyses); - let reports = vec![app_report_json(AppReportJsonInput { - assay_id: &assay_id, - participant_id: &participant_id, - input_file_name: input_name, - observations: &observations, - analyses: &analyses, - findings: &matched_findings, - provenance: &provenance, - input_inspection: Some(&head_inspection), - manifest_metadata: &manifest_metadata, - })]; - let observations_tsv = render_app_observations_tsv(&observations)?; - let analysis_jsonl = render_jsonl(&analyses)?; - let reports_jsonl = render_jsonl(&reports)?; - let html = render_app_html_document(&observations, &reports)?; - let text_output = "observations: observations.tsv\nanalysis: analysis.jsonl\nreports: reports.jsonl\nhtml: index.html\n".to_owned(); - serde_json::to_string(&ReportRunOutput { - artifacts: vec![ - artifact( - "observations.tsv", - "text/tab-separated-values", - observations_tsv, - ), - artifact("analysis.jsonl", "application/jsonl", analysis_jsonl), - artifact("reports.jsonl", "application/jsonl", reports_jsonl), - artifact("index.html", "text/html", html), - ], - duration_ms: (js_sys::Date::now() - started_ms).max(0.0) as u128, - text_output, - }) - .map_err(|err| JsError::new(&format!("failed to encode report output: {err}"))) + let artifacts = bioscript_reporting::render_input_report_artifact_texts( + bioscript_reporting::AppInputReportInput { + assay_id: &manifest_context.assay_id, + participant_id: &participant_id, + input_file_name: input_name, + input_file_path: input_name, + observations: &observations, + analyses: &analyses, + findings: &manifest_context.findings, + provenance: &manifest_context.provenance, + input_inspection: Some(&head_inspection), + manifest_metadata: &manifest_context.manifest_metadata, + }, + ) + .map_err(|err| JsError::new(&err))?; + encode_report_run_output(started_ms, artifacts) } /// Mirrors `runPackageReportBytes` but for BAM input. The BAM body is streamed @@ -401,21 +317,16 @@ pub fn run_package_report_from_bam( }; let workspace = PackageWorkspace::new(package_files)?; let participant_id = participant_id_from_name(input_name); - let assay_id = app_assay_id_from_workspace(&workspace, manifest_path)?; - let manifest_metadata = workspace.report_manifest_metadata(manifest_path)?; - let findings = workspace.load_manifest_findings(manifest_path)?; - let provenance = workspace.load_manifest_provenance_links(manifest_path)?; + let manifest_context = workspace.report_manifest_context(manifest_path)?; let mut head_inspection = inspect_head_via_js_reader(&bam_read_at, bam_len as u64, input_name, false); let bai_index = bioscript_formats::alignment::parse_bai_bytes(bai_bytes) .map_err(|err| JsError::new(&format!("parse bai: {err:?}")))?; let bam_reader = JsReader::new(bam_read_at, bam_len as u64, "bam"); - let indexed = bioscript_formats::alignment::build_bam_indexed_reader_from_reader( - bam_reader, - bai_index, - ) - .map_err(|err| JsError::new(&format!("build bam reader: {err:?}")))?; + let indexed = + bioscript_formats::alignment::build_bam_indexed_reader_from_reader(bam_reader, bai_index) + .map_err(|err| JsError::new(&format!("build bam reader: {err:?}")))?; let lookup = BamReportLookup { reader: std::cell::RefCell::new(indexed), @@ -426,14 +337,18 @@ pub fn run_package_report_from_bam( head_inspection.inferred_sex = Some(explicit); } - let mut loader = GenotypeLoadOptions::default(); - loader.format = Some(bioscript_formats::GenotypeSourceFormat::Bam); + let loader = GenotypeLoadOptions { + format: Some(bioscript_formats::GenotypeSourceFormat::Bam), + ..Default::default() + }; let manifest_output = workspace.run_manifest_rows(manifest_path, &lookup, &participant_id, &options.filters)?; let observations = manifest_output .rows .iter() - .map(|row| workspace.app_observation_from_manifest_row(row, &assay_id, None, None)) + .map(|row| { + workspace.app_observation_from_manifest_row(row, &manifest_context.assay_id, None, None) + }) .collect::, _>>()?; let analysis_observations = analysis_cache_observations(&manifest_output.observations, &observations); @@ -446,38 +361,22 @@ pub fn run_package_report_from_bam( &loader, &options, )?; - let matched_findings = match_app_findings(&findings, &observations, &analyses); - let reports = vec![app_report_json(AppReportJsonInput { - assay_id: &assay_id, - participant_id: &participant_id, - input_file_name: input_name, - observations: &observations, - analyses: &analyses, - findings: &matched_findings, - provenance: &provenance, - input_inspection: Some(&head_inspection), - manifest_metadata: &manifest_metadata, - })]; - let observations_tsv = render_app_observations_tsv(&observations)?; - let analysis_jsonl = render_jsonl(&analyses)?; - let reports_jsonl = render_jsonl(&reports)?; - let html = render_app_html_document(&observations, &reports)?; - let text_output = "observations: observations.tsv\nanalysis: analysis.jsonl\nreports: reports.jsonl\nhtml: index.html\n".to_owned(); - serde_json::to_string(&ReportRunOutput { - artifacts: vec![ - artifact( - "observations.tsv", - "text/tab-separated-values", - observations_tsv, - ), - artifact("analysis.jsonl", "application/jsonl", analysis_jsonl), - artifact("reports.jsonl", "application/jsonl", reports_jsonl), - artifact("index.html", "text/html", html), - ], - duration_ms: (js_sys::Date::now() - started_ms).max(0.0) as u128, - text_output, - }) - .map_err(|err| JsError::new(&format!("failed to encode report output: {err}"))) + let artifacts = bioscript_reporting::render_input_report_artifact_texts( + bioscript_reporting::AppInputReportInput { + assay_id: &manifest_context.assay_id, + participant_id: &participant_id, + input_file_name: input_name, + input_file_path: input_name, + observations: &observations, + analyses: &analyses, + findings: &manifest_context.findings, + provenance: &manifest_context.provenance, + input_inspection: Some(&head_inspection), + manifest_metadata: &manifest_context.manifest_metadata, + }, + ) + .map_err(|err| JsError::new(&err))?; + encode_report_run_output(started_ms, artifacts) } /// Mirrors `runPackageReportBytes` but for a bgzipped, tabix-indexed VCF @@ -504,10 +403,7 @@ pub fn run_package_report_from_vcf( }; let workspace = PackageWorkspace::new(package_files)?; let participant_id = participant_id_from_name(input_name); - let assay_id = app_assay_id_from_workspace(&workspace, manifest_path)?; - let manifest_metadata = workspace.report_manifest_metadata(manifest_path)?; - let findings = workspace.load_manifest_findings(manifest_path)?; - let provenance = workspace.load_manifest_provenance_links(manifest_path)?; + let manifest_context = workspace.report_manifest_context(manifest_path)?; // Inspect format/source/assembly from the head, but skip the byte-stream // sex detection — we'll do that via tabix-targeted X non-PAR queries // below, which works on indexed VCFs of any size. @@ -539,14 +435,18 @@ pub fn run_package_report_from_vcf( } } - let mut loader = GenotypeLoadOptions::default(); - loader.format = Some(bioscript_formats::GenotypeSourceFormat::Vcf); + let loader = GenotypeLoadOptions { + format: Some(bioscript_formats::GenotypeSourceFormat::Vcf), + ..Default::default() + }; let manifest_output = workspace.run_manifest_rows(manifest_path, &lookup, &participant_id, &options.filters)?; let observations = manifest_output .rows .iter() - .map(|row| workspace.app_observation_from_manifest_row(row, &assay_id, None, None)) + .map(|row| { + workspace.app_observation_from_manifest_row(row, &manifest_context.assay_id, None, None) + }) .collect::, _>>()?; let analysis_observations = analysis_cache_observations(&manifest_output.observations, &observations); @@ -564,36 +464,20 @@ pub fn run_package_report_from_vcf( &loader, &options, )?; - let matched_findings = match_app_findings(&findings, &observations, &analyses); - let reports = vec![app_report_json(AppReportJsonInput { - assay_id: &assay_id, - participant_id: &participant_id, - input_file_name: input_name, - observations: &observations, - analyses: &analyses, - findings: &matched_findings, - provenance: &provenance, - input_inspection: Some(&head_inspection), - manifest_metadata: &manifest_metadata, - })]; - let observations_tsv = render_app_observations_tsv(&observations)?; - let analysis_jsonl = render_jsonl(&analyses)?; - let reports_jsonl = render_jsonl(&reports)?; - let html = render_app_html_document(&observations, &reports)?; - let text_output = "observations: observations.tsv\nanalysis: analysis.jsonl\nreports: reports.jsonl\nhtml: index.html\n".to_owned(); - serde_json::to_string(&ReportRunOutput { - artifacts: vec![ - artifact( - "observations.tsv", - "text/tab-separated-values", - observations_tsv, - ), - artifact("analysis.jsonl", "application/jsonl", analysis_jsonl), - artifact("reports.jsonl", "application/jsonl", reports_jsonl), - artifact("index.html", "text/html", html), - ], - duration_ms: (js_sys::Date::now() - started_ms).max(0.0) as u128, - text_output, - }) - .map_err(|err| JsError::new(&format!("failed to encode report output: {err}"))) + let artifacts = bioscript_reporting::render_input_report_artifact_texts( + bioscript_reporting::AppInputReportInput { + assay_id: &manifest_context.assay_id, + participant_id: &participant_id, + input_file_name: input_name, + input_file_path: input_name, + observations: &observations, + analyses: &analyses, + findings: &manifest_context.findings, + provenance: &manifest_context.provenance, + input_inspection: Some(&head_inspection), + manifest_metadata: &manifest_context.manifest_metadata, + }, + ) + .map_err(|err| JsError::new(&err))?; + encode_report_run_output(started_ms, artifacts) } diff --git a/rust/bioscript-wasm/src/report_api/analysis_cache.rs b/rust/bioscript-wasm/src/report_api/analysis_cache.rs new file mode 100644 index 0000000..64d94e9 --- /dev/null +++ b/rust/bioscript-wasm/src/report_api/analysis_cache.rs @@ -0,0 +1,35 @@ +use bioscript_core::VariantObservation; + +pub(super) fn analysis_cache_observations( + manifest_observations: &[VariantObservation], + app_observations: &[serde_json::Value], +) -> Vec { + manifest_observations + .iter() + .map(|observation| { + let mut observation = observation.clone(); + if let Some(app_observation) = matching_app_observation(&observation, app_observations) + && let Some(genotype_display) = app_observation + .get("genotype_display") + .and_then(serde_json::Value::as_str) + .filter(|value| !value.is_empty() && *value != "??") + { + observation.genotype = Some(genotype_display.to_owned()); + } + observation + }) + .collect() +} + +fn matching_app_observation<'a>( + observation: &VariantObservation, + app_observations: &'a [serde_json::Value], +) -> Option<&'a serde_json::Value> { + let matched_rsid = observation.matched_rsid.as_deref()?; + app_observations.iter().find(|app_observation| { + app_observation + .get("rsid") + .and_then(serde_json::Value::as_str) + == Some(matched_rsid) + }) +} diff --git a/rust/bioscript-wasm/src/report_helpers.rs b/rust/bioscript-wasm/src/report_helpers.rs index b325b6a..e3fd5a1 100644 --- a/rust/bioscript-wasm/src/report_helpers.rs +++ b/rust/bioscript-wasm/src/report_helpers.rs @@ -9,6 +9,35 @@ pub(super) fn artifact(name: &str, mime_type: &str, text: String) -> ReportArtif } } +pub(super) fn encode_report_run_output( + started_ms: f64, + artifacts: bioscript_reporting::ReportArtifactTexts, +) -> Result { + serde_json::to_string(&ReportRunOutput { + artifacts: vec![ + artifact( + "observations.tsv", + "text/tab-separated-values", + artifacts.observations_tsv, + ), + artifact( + "analysis.jsonl", + "application/jsonl", + artifacts.analysis_jsonl, + ), + artifact( + "reports.jsonl", + "application/jsonl", + artifacts.reports_jsonl, + ), + artifact("index.html", "text/html", artifacts.html), + ], + duration_ms: (js_sys::Date::now() - started_ms).max(0.0) as u128, + text_output: artifacts.text_output, + }) + .map_err(|err| JsError::new(&format!("failed to encode report output: {err}"))) +} + pub(super) fn variant_row( path: &str, name: &str, @@ -16,86 +45,7 @@ pub(super) fn variant_row( observation: &VariantObservation, participant_id: &str, ) -> BTreeMap { - let mut row = BTreeMap::new(); - row.insert("kind".to_owned(), "variant".to_owned()); - row.insert("name".to_owned(), name.to_owned()); - row.insert("path".to_owned(), path.to_owned()); - row.insert("tags".to_owned(), tags.join(",")); - row.insert("backend".to_owned(), observation.backend.clone()); - row.insert("participant_id".to_owned(), participant_id.to_owned()); - row.insert( - "matched_rsid".to_owned(), - observation.matched_rsid.clone().unwrap_or_default(), - ); - row.insert( - "assembly".to_owned(), - observation - .assembly - .map(assembly_row_value) - .unwrap_or_default(), - ); - row.insert( - "genotype".to_owned(), - observation.genotype.clone().unwrap_or_default(), - ); - row.insert( - "ref_count".to_owned(), - observation - .ref_count - .map_or_else(String::new, |value| value.to_string()), - ); - row.insert( - "alt_count".to_owned(), - observation - .alt_count - .map_or_else(String::new, |value| value.to_string()), - ); - row.insert( - "depth".to_owned(), - observation - .depth - .map_or_else(String::new, |value| value.to_string()), - ); - row.insert( - "raw_counts".to_owned(), - serde_json::to_string(&observation.raw_counts).unwrap_or_default(), - ); - row.insert("evidence".to_owned(), observation.evidence.join(" | ")); - row -} - -pub(super) fn render_app_observations_tsv( - observations: &[serde_json::Value], -) -> Result { - let mut out = OBSERVATION_TSV_HEADERS.join("\t"); - out.push('\n'); - for observation in observations { - let line = OBSERVATION_TSV_HEADERS - .iter() - .map(|header| json_field_as_tsv(observation.get(*header))) - .collect::>() - .join("\t"); - out.push_str(&line); - out.push('\n'); - } - Ok(out) -} - -pub(super) fn render_jsonl(rows: &[serde_json::Value]) -> Result { - let mut out = String::new(); - for row in rows { - out.push_str(&serde_json::to_string(row).map_err(|err| JsError::new(&err.to_string()))?); - out.push('\n'); - } - Ok(out) -} - -pub(super) fn json_field_as_tsv(value: Option<&serde_json::Value>) -> String { - match value { - Some(serde_json::Value::Null) | None => String::new(), - Some(serde_json::Value::String(value)) => value.replace(['\t', '\n'], " "), - Some(value) => value.to_string().replace(['\t', '\n'], " "), - } + bioscript_reporting::variant_row(path, name, tags, observation, participant_id) } pub(super) fn normalize_package_path(path: &str) -> Result { @@ -115,214 +65,14 @@ pub(super) fn default_analysis_max_duration_ms() -> u64 { } pub(super) fn participant_id_from_name(path: &str) -> String { - Path::new(path) - .file_stem() - .and_then(|value| value.to_str()) - .unwrap_or(path) - .replace([' ', '\t', '\n'], "_") -} - -/// Derive the assay id from a manifest path — matches the CLI's -/// `bioscript-cli::report_execution::app_assay_id`, which loads the manifest -/// and returns its `name:` field (panels / assays / variants all carry one). -/// This function operates on a `PackageWorkspace` so it can find files in the -/// in-memory map without touching disk. -/// -/// Previously the wasm derived the id from the manifest filename stem (e.g. -/// `manifest.yaml` -> `manifest`), which diverged from the CLI's `pgx-1` -/// (panel `name:` field) and cascaded into the HTML report's -/// `participant_id × assay_id` keys. -pub(super) fn app_assay_id_from_workspace( - workspace: &PackageWorkspace, - manifest_path: &str, -) -> Result { - match workspace.schema(manifest_path)?.as_str() { - "bioscript:panel:1.0" => Ok(workspace.load_panel(manifest_path)?.name), - "bioscript:assay:1.0" => Ok(workspace.load_assay(manifest_path)?.name), - "bioscript:variant:1.0" | "bioscript:variant" => { - Ok(workspace.load_variant(manifest_path)?.name) - } - other => Err(JsError::new(&format!( - "unsupported manifest schema '{other}'" - ))), - } -} - -pub(super) fn app_assay_id(path: &Path) -> Result { - path.file_stem() - .and_then(|value| value.to_str()) - .map(ToOwned::to_owned) - .ok_or_else(|| { - JsError::new(&format!( - "failed to derive assay id from {}", - path.display() - )) - }) -} - -pub(super) fn matches_filters(manifest: &VariantManifest, path: &str, filters: &[String]) -> bool { - filters.iter().all(|filter| match filter.split_once('=') { - Some(("kind", value)) => value == "variant", - Some(("name", value)) => manifest.name.contains(value), - Some(("path", value)) => path.contains(value), - Some(("tag", value)) => manifest.tags.iter().any(|tag| tag == value), - Some(_) | None => false, - }) + bioscript_reporting::participant_id_from_path(Path::new(path)) } pub(super) fn parse_analysis_output_text( text: &str, format: &str, ) -> Result<(Vec, Vec), JsError> { - match format { - "tsv" => Ok(parse_analysis_tsv(text)), - "json" => { - let value: serde_json::Value = serde_json::from_str(text) - .map_err(|err| JsError::new(&format!("failed to parse analysis JSON: {err}")))?; - let rows = match value { - serde_json::Value::Array(rows) => rows, - serde_json::Value::Object(mut object) => object - .remove("rows") - .and_then(|rows| rows.as_array().cloned()) - .unwrap_or_else(|| vec![serde_json::Value::Object(object)]), - other => vec![other], - }; - let row_headers = rows - .iter() - .find_map(|row| row.as_object()) - .map(|object| object.keys().cloned().collect()) - .unwrap_or_default(); - Ok((rows, row_headers)) - } - "jsonl" => { - let mut rows: Vec = Vec::new(); - for line in text.lines().filter(|line| !line.trim().is_empty()) { - rows.push(serde_json::from_str(line).map_err(|err| { - JsError::new(&format!("failed to parse analysis JSONL: {err}")) - })?); - } - let row_headers = rows - .iter() - .find_map(|row| row.as_object()) - .map(|object| object.keys().cloned().collect()) - .unwrap_or_default(); - Ok((rows, row_headers)) - } - other => Err(JsError::new(&format!( - "unsupported analysis output_format '{other}'" - ))), - } -} - -fn parse_analysis_tsv(text: &str) -> (Vec, Vec) { - let mut lines = text.lines(); - let headers = lines - .next() - .map(|line| line.split('\t').map(ToOwned::to_owned).collect::>()) - .unwrap_or_default(); - let rows = lines - .filter(|line| !line.trim().is_empty()) - .map(|line| { - let fields = line.split('\t').collect::>(); - let object = headers - .iter() - .enumerate() - .map(|(index, header)| { - ( - header.clone(), - serde_json::Value::String( - fields.get(index).copied().unwrap_or_default().to_owned(), - ), - ) - }) - .collect(); - serde_json::Value::Object(object) - }) - .collect(); - (rows, headers) -} - -pub(super) fn yaml_to_json(value: serde_yaml::Value) -> Result { - serde_json::to_value(value) - .map_err(|err| JsError::new(&format!("failed to convert YAML to JSON: {err}"))) -} - -pub(super) fn collect_manifest_provenance_entries( - value: &serde_yaml::Value, - links: &mut BTreeMap, -) -> Result<(), JsError> { - if let Some(sources) = value - .get("provenance") - .and_then(|provenance| provenance.get("sources")) - .and_then(serde_yaml::Value::as_sequence) - { - for source in sources { - let json = yaml_to_json(source.clone())?; - if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) { - links.entry(url.to_owned()).or_insert(json); - } - } - } - if let Some(source) = value.get("source") { - let json = yaml_to_json(source.clone())?; - if let Some(url) = json.get("url").and_then(serde_json::Value::as_str) { - links.entry(url.to_owned()).or_insert(json); - } - } - Ok(()) -} - -pub(super) fn input_inspection_json( - inspection: &bioscript_formats::FileInspection, -) -> serde_json::Value { - serde_json::json!({ - "container": match inspection.container { - bioscript_formats::FileContainer::Plain => "plain", - bioscript_formats::FileContainer::Zip => "zip", - }, - "format": match inspection.detected_kind { - bioscript_formats::DetectedKind::GenotypeText => "genotype_text", - bioscript_formats::DetectedKind::Vcf => "vcf", - bioscript_formats::DetectedKind::AlignmentCram => "alignment_cram", - bioscript_formats::DetectedKind::AlignmentBam => "alignment_bam", - bioscript_formats::DetectedKind::ReferenceFasta => "reference_fasta", - bioscript_formats::DetectedKind::Unknown => "unknown", - }, - "format_confidence": detection_confidence_name(inspection.confidence), - "assembly": inspection.assembly.map(|assembly| match assembly { - Assembly::Grch37 => "grch37", - Assembly::Grch38 => "grch38", - }), - "phased": inspection.phased, - "selected_entry": inspection.selected_entry, - "has_index": inspection.has_index, - "index_path": inspection.index_path.as_ref().map(|path| path.display().to_string()), - "reference_matches": inspection.reference_matches, - "source": inspection.source.as_ref().map(|source| serde_json::json!({ - "vendor": source.vendor, - "platform_version": source.platform_version, - "confidence": detection_confidence_name(source.confidence), - "evidence": source.evidence, - })), - "inferred_sex": inspection.inferred_sex.as_ref().map(|sex| serde_json::json!({ - "sex": inferred_sex_name(sex.sex), - "confidence": sex_detection_confidence_name(sex.confidence), - "method": sex.method, - "evidence": sex.evidence, - })), - "evidence": inspection.evidence, - "warnings": inspection.warnings, - "duration_ms": inspection.duration_ms, - }) -} - -fn detection_confidence_name(value: bioscript_formats::DetectionConfidence) -> &'static str { - match value { - bioscript_formats::DetectionConfidence::Authoritative => "authoritative", - bioscript_formats::DetectionConfidence::StrongHeuristic => "strong_heuristic", - bioscript_formats::DetectionConfidence::WeakHeuristic => "weak_heuristic", - bioscript_formats::DetectionConfidence::Unknown => "unknown", - } + bioscript_reporting::parse_analysis_output_text(text, format).map_err(|err| JsError::new(&err)) } pub(super) fn yaml_string(value: &serde_yaml::Value, key: &str) -> Option { @@ -332,30 +82,15 @@ pub(super) fn yaml_string(value: &serde_yaml::Value, key: &str) -> Option Vec { - value - .get(key) - .and_then(serde_yaml::Value::as_sequence) - .map(|items| { - items - .iter() - .filter_map(serde_yaml::Value::as_str) - .map(serde_json::Value::from) - .collect() - }) - .unwrap_or_default() -} - -pub(super) fn yaml_mapping_string(mapping: &serde_yaml::Mapping, key: &str) -> Option { - mapping - .get(serde_yaml::Value::String(key.to_owned())) - .and_then(serde_yaml::Value::as_str) - .map(ToOwned::to_owned) -} - pub(super) fn variant_primary_source_from_yaml(value: &serde_yaml::Value) -> serde_json::Value { let mut links = BTreeMap::::new(); - let _ = collect_manifest_provenance_entries(value, &mut links); + let _ = bioscript_reporting::collect_manifest_provenance_entries(value, &mut links); + if let Some(source) = links + .values() + .find(|source| source_url_contains(source, "ncbi.nlm.nih.gov/snp/rs")) + { + return source.clone(); + } if let Some(rsid) = value .get("identifiers") .and_then(|identifiers| identifiers.get("rsids")) @@ -375,190 +110,45 @@ pub(super) fn variant_primary_source_from_yaml(value: &serde_yaml::Value) -> ser .unwrap_or(serde_json::Value::Null) } -pub(super) fn normalize_app_genotype( - display: &str, - ref_allele: &str, - alt_allele: &str, - kind: Option, - chrom: &str, - inferred_sex: Option<&SexInference>, -) -> (String, String) { - if display.is_empty() { - return ("./.".to_owned(), "unknown".to_owned()); - } - if matches!(kind, Some(VariantKind::Deletion)) - && ref_allele.len() != 1 - && display - .chars() - .filter(char::is_ascii_alphabetic) - .all(|allele| matches!(allele.to_ascii_uppercase(), 'I' | 'D')) - { - return normalize_app_genotype(display, "I", "D", None, chrom, inferred_sex); - } - let alleles: Vec = display.chars().filter(char::is_ascii_alphabetic).collect(); - if ref_allele.len() != 1 || alt_allele.len() != 1 { - return (display.to_owned(), "unknown".to_owned()); - } - let ref_ch = ref_allele.chars().next().unwrap_or_default(); - let alt_ch = alt_allele.chars().next().unwrap_or_default(); - if is_confident_male_sex_chromosome(chrom, inferred_sex) - && alleles.len() == 2 - && alleles[0] == alleles[1] - { - let allele = alleles[0]; - if allele == ref_ch { - return ("0".to_owned(), "hem_ref".to_owned()); - } - if allele == alt_ch { - return ("1".to_owned(), "hem_alt".to_owned()); - } - } - if alleles.len() != 2 { - return (display.to_owned(), "unknown".to_owned()); - } - let alt_count = alleles.iter().filter(|allele| **allele == alt_ch).count(); - let ref_count = alleles.iter().filter(|allele| **allele == ref_ch).count(); - match (ref_count, alt_count) { - (2, 0) => ("0/0".to_owned(), "hom_ref".to_owned()), - (1, 1) => ("0/1".to_owned(), "het".to_owned()), - (0, 2) => ("1/1".to_owned(), "hom_alt".to_owned()), - _ => (display.to_owned(), "unknown".to_owned()), - } -} - -pub(super) fn deletion_copy_number_display( - row: &BTreeMap, - manifest: &VariantManifest, - depth: Option, - alt_count: Option, -) -> Option { - if !matches!(manifest.spec.kind, Some(VariantKind::Deletion)) { - return None; - } - if !matches!( - row.get("backend").map(String::as_str), - Some("cram" | "bam") - ) { - return None; - } - if manifest.spec.reference.as_deref().unwrap_or_default().len() <= 1 { - return None; - } - let depth = depth?; - if depth == 0 { - return None; - } - let alt_fraction = f64::from(alt_count.unwrap_or(0)) / f64::from(depth); - if alt_fraction >= 0.8 { - Some("DD".to_owned()) - } else if alt_fraction <= 0.2 { - Some("II".to_owned()) - } else { - Some("DI".to_owned()) - } -} - -fn is_confident_male_sex_chromosome(chrom: &str, inferred_sex: Option<&SexInference>) -> bool { - matches!( - chrom - .trim() - .trim_start_matches("chr") - .to_ascii_uppercase() - .as_str(), - "X" | "Y" | "23" | "24" - ) && inferred_sex.is_some_and(|sex| { - sex.sex == InferredSex::Male - && matches!( - sex.confidence, - SexDetectionConfidence::High | SexDetectionConfidence::Medium - ) - }) +fn source_url_contains(source: &serde_json::Value, needle: &str) -> bool { + source + .get("url") + .and_then(serde_json::Value::as_str) + .is_some_and(|url| url.contains(needle)) } -pub(super) fn assembly_row_value(assembly: Assembly) -> String { - match assembly { - Assembly::Grch37 => "grch37".to_owned(), - Assembly::Grch38 => "grch38".to_owned(), - } -} - -fn inferred_sex_name(value: InferredSex) -> &'static str { - match value { - InferredSex::Male => "male", - InferredSex::Female => "female", - InferredSex::Unknown => "unknown", - } -} - -fn sex_detection_confidence_name(value: SexDetectionConfidence) -> &'static str { - match value { - SexDetectionConfidence::High => "high", - SexDetectionConfidence::Medium => "medium", - SexDetectionConfidence::Low => "low", - } -} - -pub(super) fn parse_optional_u32(value: Option<&String>) -> Option { - value.and_then(|value| value.parse::().ok()) +pub(super) fn variant_observed_alt_alleles_from_yaml(value: &serde_yaml::Value) -> Vec { + value + .as_mapping() + .and_then(|mapping| mapping.get(serde_yaml::Value::String("alleles".to_owned()))) + .and_then(serde_yaml::Value::as_mapping) + .and_then(|mapping| mapping.get(serde_yaml::Value::String("observed_alts".to_owned()))) + .and_then(serde_yaml::Value::as_sequence) + .map(|items| { + items + .iter() + .filter_map(serde_yaml::Value::as_str) + .map(ToOwned::to_owned) + .collect() + }) + .unwrap_or_default() } #[cfg(test)] mod tests { - use super::*; + use super::participant_id_from_name; #[test] - fn normalizes_long_deletion_reference_tokens_as_insertion_deletion_copy_number() { - assert_eq!( - normalize_app_genotype( - "II", - "TTATAA", - "", - Some(VariantKind::Deletion), - "22", - None, - ), - ("0/0".to_owned(), "hom_ref".to_owned()) - ); - assert_eq!( - normalize_app_genotype( - "ID", - "TTATAA", - "", - Some(VariantKind::Deletion), - "22", - None, - ), - ("0/1".to_owned(), "het".to_owned()) - ); - } - - #[test] - fn displays_cram_long_deletion_copy_number_as_insertion_deletion_tokens() { - let manifest = VariantManifest { - path: PathBuf::from("rs71785313.yaml"), - name: "APOL1_G2".to_owned(), - tags: Vec::new(), - spec: VariantSpec { - reference: Some("TTATAA".to_owned()), - alternate: Some("".to_owned()), - kind: Some(VariantKind::Deletion), - ..VariantSpec::default() - }, - }; - let mut row = BTreeMap::new(); - row.insert("backend".to_owned(), "cram".to_owned()); - - assert_eq!( - deletion_copy_number_display(&row, &manifest, Some(39), Some(0)).as_deref(), - Some("II") - ); + fn participant_id_suffix_stripping_matches_cli_report_path() { assert_eq!( - deletion_copy_number_display(&row, &manifest, Some(39), Some(39)).as_deref(), - Some("DD") + participant_id_from_name("NA06985.clean.vcf.gz"), + "NA06985.clean" ); assert_eq!( - deletion_copy_number_display(&row, &manifest, Some(40), Some(20)).as_deref(), - Some("DI") + participant_id_from_name("/tmp/genome_hu50B3F5_v5_Full.zip"), + "genome_hu50B3F5_v5_Full" ); + assert_eq!(participant_id_from_name("sample.cram"), "sample"); + assert_eq!(participant_id_from_name("sample name.txt"), "sample name"); } } diff --git a/rust/bioscript-wasm/src/report_lookup.rs b/rust/bioscript-wasm/src/report_lookup.rs index 2d59efb..9eab873 100644 --- a/rust/bioscript-wasm/src/report_lookup.rs +++ b/rust/bioscript-wasm/src/report_lookup.rs @@ -1,1023 +1,9 @@ use super::*; -/// Per-variant CRAM lookup that satisfies the workspace's `VariantLookup` -/// trait. Holds the IndexedReader in a `RefCell` so &self lookup methods can -/// mutably read while still being object-safe. -pub(super) struct CramReportLookup { - pub(super) reader: std::cell::RefCell>, - pub(super) label: String, -} - -pub(super) struct BamReportLookup { - pub(super) reader: - std::cell::RefCell>>, - pub(super) label: String, -} - -impl report_workspace::VariantLookup for BamReportLookup { - fn lookup_variant(&self, spec: &VariantSpec) -> Result { - let mut reader = self.reader.borrow_mut(); - observe_bam_variant(&mut reader, &self.label, spec) - } - - fn lookup_variants( - &self, - specs: &[VariantSpec], - ) -> Result, RuntimeError> { - let mut reader = self.reader.borrow_mut(); - let mut out = Vec::with_capacity(specs.len()); - for spec in specs { - out.push(observe_bam_variant(&mut reader, &self.label, spec)?); - } - Ok(out) - } -} - -impl report_workspace::VariantLookup for CramReportLookup { - fn lookup_variant(&self, spec: &VariantSpec) -> Result { - let mut reader = self.reader.borrow_mut(); - observe_cram_variant(&mut reader, &self.label, spec) - } - - fn lookup_variants( - &self, - specs: &[VariantSpec], - ) -> Result, RuntimeError> { - let mut reader = self.reader.borrow_mut(); - let mut out = Vec::with_capacity(specs.len()); - for spec in specs { - out.push(observe_cram_variant(&mut reader, &self.label, spec)?); - } - Ok(out) - } -} - -/// Build a minimal 23andMe-style text from the observations we already -/// computed. Format: `rsid\tchrom\tpos\tgenotype` per line. The runtime's -/// delimited-text loader reads this back as a `RsidMap`/`Delimited` backend -/// so analysis scripts can call `bioscript.load_genotypes(input_file)` and -/// have rsid lookups answered from the cached table. -#[allow(dead_code)] -fn synthesize_genotype_text_from_observations(observations: &[serde_json::Value]) -> String { - let mut out = String::from("# rsid\tchromosome\tposition\tgenotype\n"); - for observation in observations { - let rsid = observation - .get("rsid") - .and_then(serde_json::Value::as_str) - .unwrap_or(""); - if rsid.is_empty() { - continue; - } - let chrom = observation - .get("chrom") - .and_then(serde_json::Value::as_str) - .unwrap_or(""); - let pos = observation - .get("pos_start") - .and_then(|v| { - v.as_i64() - .or_else(|| v.as_str().and_then(|s| s.parse::().ok())) - }) - .unwrap_or(0); - let genotype = observation - .get("genotype_display") - .and_then(serde_json::Value::as_str) - .filter(|s| !s.is_empty() && *s != "??") - .unwrap_or("--"); - out.push_str(&format!("{rsid}\t{chrom}\t{pos}\t{genotype}\n")); - } - out -} - -fn observe_bam_variant( - reader: &mut noodles::bam::io::indexed_reader::IndexedReader>, - label: &str, - variant: &VariantSpec, -) -> Result { - let assembly = variant - .grch38 - .as_ref() - .map(|_| Assembly::Grch38) - .or_else(|| variant.grch37.as_ref().map(|_| Assembly::Grch37)); - let locus = variant - .grch38 - .as_ref() - .or(variant.grch37.as_ref()) - .ok_or_else(|| { - RuntimeError::Io(format!( - "variant {} has no GRCh37/GRCh38 locus", - variant - .rsids - .first() - .map(|s| s.as_str()) - .unwrap_or("variant") - )) - })?; - let locus = GenomicLocus { - chrom: locus.chrom.clone(), - start: locus.start, - end: locus.end, - }; - match variant.kind.unwrap_or(VariantKind::Snp) { - VariantKind::Snp => { - let ref_char = variant - .reference - .as_deref() - .and_then(|s| s.chars().next()) - .ok_or_else(|| { - RuntimeError::Io(format!( - "variant {} missing reference allele", - variant - .rsids - .first() - .map(|s| s.as_str()) - .unwrap_or("variant") - )) - })?; - let alt_char = variant - .alternate - .as_deref() - .and_then(|s| s.chars().next()) - .ok_or_else(|| { - RuntimeError::Io(format!( - "variant {} missing alternate allele", - variant - .rsids - .first() - .map(|s| s.as_str()) - .unwrap_or("variant") - )) - })?; - observe_bam_snp_with_reader( - reader, - label, - &locus, - ref_char, - alt_char, - variant.rsids.first().cloned(), - assembly, - ) - } - VariantKind::Deletion => { - observe_bam_deletion_with_reader( - reader, - label, - &locus, - variant, - assembly, - ) - } - VariantKind::Insertion | VariantKind::Indel => { - let reference = variant.reference.as_deref().ok_or_else(|| { - RuntimeError::Io(format!( - "variant {} missing reference allele", - variant - .rsids - .first() - .map(|s| s.as_str()) - .unwrap_or("variant") - )) - })?; - let alternate = variant.alternate.as_deref().ok_or_else(|| { - RuntimeError::Io(format!( - "variant {} missing alternate allele", - variant - .rsids - .first() - .map(|s| s.as_str()) - .unwrap_or("variant") - )) - })?; - observe_bam_indel_with_reader( - reader, - label, - &locus, - reference, - alternate, - variant.rsids.first().cloned(), - assembly, - ) - } - other => Err(RuntimeError::Io(format!( - "variant {} kind {:?} not supported on BAM via wasm", - variant - .rsids - .first() - .map(|s| s.as_str()) - .unwrap_or("variant"), - other - ))), - } -} - -fn observe_bam_snp_with_reader( - reader: &mut noodles::bam::io::indexed_reader::IndexedReader>, - label: &str, - locus: &GenomicLocus, - reference: char, - alternate: char, - matched_rsid: Option, - assembly: Option, -) -> Result { - use noodles::core::Position; - let mut counts = BamSnpPileupCounts::default(); - let header = read_bam_header(reader, label)?; - let region = bam_region(&header, locus)?; - let target_position = Position::try_from(usize::try_from(locus.start).map_err(|_| { - RuntimeError::InvalidArguments("SNP locus start is out of range".to_owned()) - })?) - .map_err(|_| RuntimeError::InvalidArguments("SNP locus start is out of range".to_owned()))?; - - let query = reader - .query(&header, ®ion) - .map_err(|err| RuntimeError::Io(format!("failed to query BAM {label}: {err}")))?; - for result in query.records() { - let record = result - .map_err(|err| RuntimeError::Io(format!("failed to read BAM record {label}: {err}")))?; - let flags = record.flags(); - if flags.is_unmapped() { - counts.filtered_unmapped += 1; - continue; - } - if flags.is_secondary() { - counts.filtered_secondary += 1; - continue; - } - if flags.is_qc_fail() { - counts.filtered_qc_fail += 1; - continue; - } - if flags.is_duplicate() { - counts.filtered_duplicate += 1; - continue; - } - if flags.is_segmented() && !flags.is_properly_segmented() { - counts.filtered_improper_pair += 1; - continue; - } - - let Some((base, base_quality)) = - bam_base_quality_at_reference_position(&record, target_position)? - else { - continue; - }; - let normalized_base = normalize_pileup_base(base); - let is_reverse = flags.is_reverse_complemented(); - if let Some(base) = normalized_base { - counts.raw_depth += 1; - *counts.raw_base_counts.entry(base.to_string()).or_insert(0) += 1; - let strand_counts = if is_reverse { - &mut counts.raw_reverse_counts - } else { - &mut counts.raw_forward_counts - }; - *strand_counts.entry(base.to_string()).or_insert(0) += 1; - if base == reference { - counts.raw_ref_count += 1; - } else if base == alternate { - counts.raw_alt_count += 1; - } - } - - if base_quality < 13 { - counts.filtered_low_base_quality += 1; - continue; - } - - let Some(base) = normalized_base else { - counts.filtered_non_acgt += 1; - continue; - }; - - counts.filtered_depth += 1; - *counts - .filtered_base_counts - .entry(base.to_string()) - .or_insert(0) += 1; - if base == reference { - counts.filtered_ref_count += 1; - } else if base == alternate { - counts.filtered_alt_count += 1; - } - } - - let ref_count = counts.filtered_ref_count; - let alt_count = counts.filtered_alt_count; - let depth = counts.filtered_depth; - let evidence = counts.evidence_lines(&format!("{}:{}-{}", locus.chrom, locus.start, locus.end), locus.start); - Ok(VariantObservation { - backend: "bam".to_owned(), - matched_rsid, - assembly, - genotype: infer_snp_genotype(reference, alternate, ref_count, alt_count, depth), - ref_count: Some(ref_count), - alt_count: Some(alt_count), - depth: Some(depth), - raw_counts: counts.raw_base_counts, - decision: Some(describe_snp_decision_rule( - reference, alternate, ref_count, alt_count, depth, - )), - evidence, - }) -} - -fn observe_bam_deletion_with_reader( - reader: &mut noodles::bam::io::indexed_reader::IndexedReader>, - label: &str, - locus: &GenomicLocus, - variant: &VariantSpec, - assembly: Option, -) -> Result { - let deletion_length = variant.deletion_length.ok_or_else(|| { - RuntimeError::InvalidArguments("deletion variant requires deletion_length".to_owned()) - })?; - let reference = variant.reference.clone().unwrap_or_else(|| "I".to_owned()); - let alternate = variant.alternate.clone().unwrap_or_else(|| "D".to_owned()); - let anchor_pos = locus.start.saturating_sub(1); - let anchor_locus = GenomicLocus { - chrom: locus.chrom.clone(), - start: anchor_pos, - end: anchor_pos, - }; - - let mut alt_count = 0u32; - let mut ref_count = 0u32; - let mut depth = 0u32; - - let header = read_bam_header(reader, label)?; - let region = bam_region(&header, &anchor_locus)?; - let query = reader - .query(&header, ®ion) - .map_err(|err| RuntimeError::Io(format!("failed to query BAM {label}: {err}")))?; - for result in query.records() { - let record = result - .map_err(|err| RuntimeError::Io(format!("failed to read BAM record {label}: {err}")))?; - let alignment_record = bam_alignment_record(label, &record)?; - if alignment_record.is_unmapped || !spans_position(&alignment_record, anchor_pos) { - continue; - } - depth += 1; - match indel_at_anchor(&alignment_record, anchor_pos) { - Some((bioscript_formats::alignment::AlignmentOpKind::Deletion, len)) - if len == deletion_length => - { - alt_count += 1; - } - _ => ref_count += 1, - } - } - - Ok(VariantObservation { - backend: "bam".to_owned(), - matched_rsid: variant.rsids.first().cloned(), - assembly, - genotype: infer_copy_number_genotype(&reference, &alternate, ref_count, alt_count, depth), - ref_count: Some(ref_count), - alt_count: Some(alt_count), - depth: Some(depth), - raw_counts: BTreeMap::new(), - decision: Some(describe_copy_number_decision_rule( - &reference, &alternate, ref_count, alt_count, depth, - )), - evidence: vec![format!( - "observed BAM deletion anchor {}:{} len={} depth={} ref_count={} alt_count={}", - locus.chrom, anchor_pos, deletion_length, depth, ref_count, alt_count - )], - }) -} - -fn observe_bam_indel_with_reader( - reader: &mut noodles::bam::io::indexed_reader::IndexedReader>, - label: &str, - locus: &GenomicLocus, - reference: &str, - alternate: &str, - matched_rsid: Option, - assembly: Option, -) -> Result { - let mut alt_count = 0u32; - let mut ref_count = 0u32; - let mut depth = 0u32; - let mut matching_alt_lengths = std::collections::BTreeSet::new(); - - let header = read_bam_header(reader, label)?; - let region = bam_region(&header, locus)?; - let query = reader - .query(&header, ®ion) - .map_err(|err| RuntimeError::Io(format!("failed to query BAM {label}: {err}")))?; - for result in query.records() { - let record = result - .map_err(|err| RuntimeError::Io(format!("failed to read BAM record {label}: {err}")))?; - let alignment_record = bam_alignment_record(label, &record)?; - if alignment_record.is_unmapped || !record_overlaps_locus(&alignment_record, locus) { - continue; - } - let classification = - classify_expected_indel(&alignment_record, locus, reference.len(), alternate)?; - if !classification.covering { - continue; - } - depth += 1; - if classification.matches_alt { - alt_count += 1; - matching_alt_lengths.insert(classification.observed_len); - } else if classification.reference_like { - ref_count += 1; - } - } - - let evidence_label = if matching_alt_lengths.is_empty() { - "none".to_owned() - } else { - matching_alt_lengths - .into_iter() - .map(|len| len.to_string()) - .collect::>() - .join(",") - }; - - Ok(VariantObservation { - backend: "bam".to_owned(), - matched_rsid, - assembly, - genotype: infer_copy_number_genotype(reference, alternate, ref_count, alt_count, depth), - ref_count: Some(ref_count), - alt_count: Some(alt_count), - depth: Some(depth), - raw_counts: BTreeMap::new(), - decision: Some(describe_copy_number_decision_rule( - reference, alternate, ref_count, alt_count, depth, - )), - evidence: vec![format!( - "observed BAM indel at {}:{}-{} depth={} ref_count={} alt_count={} matching_alt_lengths={}", - locus.chrom, - locus.start, - locus.end, - depth, - ref_count, - alt_count, - evidence_label - )], - }) -} - -fn read_bam_header( - reader: &mut noodles::bam::io::indexed_reader::IndexedReader>, - label: &str, -) -> Result { - reader - .get_mut() - .seek(noodles::bgzf::VirtualPosition::MIN) - .map_err(|err| RuntimeError::Io(format!("failed to rewind BAM {label}: {err}")))?; - reader - .read_header() - .map_err(|err| RuntimeError::Io(format!("failed to read BAM header {label}: {err}"))) -} - -fn bam_region( - header: &noodles::sam::Header, - locus: &GenomicLocus, -) -> Result { - let chrom = resolve_bam_reference_name(header, &locus.chrom).ok_or_else(|| { - RuntimeError::Unsupported(format!( - "indexed BAM does not contain contig {} for {}:{}-{}", - locus.chrom, locus.chrom, locus.start, locus.end - )) - })?; - format!("{chrom}:{}-{}", locus.start, locus.end) - .parse() - .map_err(|err| RuntimeError::Io(format!("invalid BAM query region: {err}"))) -} +#[path = "report_lookup/alignment.rs"] +mod alignment; -fn resolve_bam_reference_name(header: &noodles::sam::Header, chrom: &str) -> Option { - let candidates = [ - chrom.to_owned(), - format!("chr{chrom}"), - chrom.trim_start_matches("chr").to_owned(), - ]; - candidates.into_iter().find(|candidate| { - header.reference_sequences().iter().any(|(name, _)| { - let name_bytes: &[u8] = name.as_ref(); - name_bytes == candidate.as_bytes() - }) - }) -} - -fn bam_base_quality_at_reference_position( - record: &noodles::bam::Record, - target_position: noodles::core::Position, -) -> Result, RuntimeError> { - use noodles::sam::alignment::record::cigar::op::Kind; - - let Some(alignment_start) = record - .alignment_start() - .transpose() - .map_err(|err| RuntimeError::Io(format!("failed to read BAM alignment start: {err}")))? - else { - return Ok(None); - }; - let mut reference_position = usize::from(alignment_start); - let target = usize::from(target_position); - let mut read_position = 0usize; - let sequence = record.sequence(); - let qualities = record.quality_scores(); - - for result in record.cigar().iter() { - let op = result.map_err(|err| RuntimeError::Io(format!("failed to read BAM CIGAR: {err}")))?; - let len = op.len(); - match op.kind() { - Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => { - if target >= reference_position && target < reference_position + len { - let offset = target - reference_position; - let read_index = read_position + offset; - let Some(base) = sequence.get(read_index) else { - return Ok(None); - }; - let quality = qualities.as_ref().get(read_index).copied().unwrap_or(u8::MAX); - return Ok(Some((base, quality))); - } - reference_position += len; - read_position += len; - } - Kind::Insertion | Kind::SoftClip => { - read_position += len; - } - Kind::Deletion | Kind::Skip => { - if target >= reference_position && target < reference_position + len { - return Ok(None); - } - reference_position += len; - } - Kind::HardClip | Kind::Pad => {} - } - } - - Ok(None) -} - -fn bam_alignment_record( - label: &str, - record: &noodles::bam::Record, -) -> Result { - use noodles::sam::alignment::Record as _; - - let flags = record.flags(); - let is_unmapped = flags.is_unmapped(); - let start = record - .alignment_start() - .transpose() - .map_err(|err| RuntimeError::Io(format!("failed to read BAM alignment start from {label}: {err}")))? - .map(|pos| i64::try_from(usize::from(pos))) - .transpose() - .map_err(|_| RuntimeError::Unsupported(format!("record alignment start exceeds i64 range in {label}")))? - .unwrap_or(0); - let end = record - .alignment_end() - .transpose() - .map_err(|err| RuntimeError::Io(format!("failed to read BAM alignment end from {label}: {err}")))? - .map(|pos| i64::try_from(usize::from(pos))) - .transpose() - .map_err(|_| RuntimeError::Unsupported(format!("record alignment end exceeds i64 range in {label}")))? - .unwrap_or(start); - let cigar = record - .cigar() - .iter() - .map(|result| { - result - .map(map_bam_op) - .map_err(|err| RuntimeError::Io(format!("failed to read BAM CIGAR from {label}: {err}"))) - }) - .collect::, _>>()?; - - Ok(bioscript_formats::alignment::AlignmentRecord { - start, - end, - is_unmapped, - cigar, - }) -} - -fn map_bam_op( - op: noodles::sam::alignment::record::cigar::Op, -) -> bioscript_formats::alignment::AlignmentOp { - use bioscript_formats::alignment::{AlignmentOp, AlignmentOpKind}; - use noodles::sam::alignment::record::cigar::op::Kind; - - let kind = match op.kind() { - Kind::Match => AlignmentOpKind::Match, - Kind::Insertion => AlignmentOpKind::Insertion, - Kind::Deletion => AlignmentOpKind::Deletion, - Kind::Skip => AlignmentOpKind::Skip, - Kind::SoftClip => AlignmentOpKind::SoftClip, - Kind::HardClip => AlignmentOpKind::HardClip, - Kind::Pad => AlignmentOpKind::Pad, - Kind::SequenceMatch => AlignmentOpKind::SequenceMatch, - Kind::SequenceMismatch => AlignmentOpKind::SequenceMismatch, - }; - - AlignmentOp { kind, len: op.len() } -} - -fn normalize_pileup_base(base: u8) -> Option { - match (base as char).to_ascii_uppercase() { - 'A' | 'C' | 'G' | 'T' => Some((base as char).to_ascii_uppercase()), - _ => None, - } -} - -struct IndelClassification { - covering: bool, - reference_like: bool, - matches_alt: bool, - observed_len: usize, -} - -fn record_overlaps_locus( - record: &bioscript_formats::alignment::AlignmentRecord, - locus: &GenomicLocus, -) -> bool { - record.end >= locus.start && record.start <= locus.end -} - -fn spans_position(record: &bioscript_formats::alignment::AlignmentRecord, pos: i64) -> bool { - pos >= record.start.saturating_sub(1) && pos <= record.end -} - -fn indel_at_anchor( - record: &bioscript_formats::alignment::AlignmentRecord, - anchor_pos: i64, -) -> Option<(bioscript_formats::alignment::AlignmentOpKind, usize)> { - let mut ref_pos = record.start; - - for op in &record.cigar { - match op.kind { - bioscript_formats::alignment::AlignmentOpKind::Match - | bioscript_formats::alignment::AlignmentOpKind::SequenceMatch - | bioscript_formats::alignment::AlignmentOpKind::SequenceMismatch - | bioscript_formats::alignment::AlignmentOpKind::Skip => { - ref_pos += i64::try_from(op.len).ok()?; - } - bioscript_formats::alignment::AlignmentOpKind::Insertion => { - let anchor = ref_pos.saturating_sub(1); - if anchor == anchor_pos { - return Some((bioscript_formats::alignment::AlignmentOpKind::Insertion, op.len)); - } - } - bioscript_formats::alignment::AlignmentOpKind::Deletion => { - let anchor = ref_pos.saturating_sub(1); - if anchor == anchor_pos { - return Some((bioscript_formats::alignment::AlignmentOpKind::Deletion, op.len)); - } - ref_pos += i64::try_from(op.len).ok()?; - } - bioscript_formats::alignment::AlignmentOpKind::SoftClip - | bioscript_formats::alignment::AlignmentOpKind::HardClip - | bioscript_formats::alignment::AlignmentOpKind::Pad => {} - } - } - - None -} - -fn classify_expected_indel( - record: &bioscript_formats::alignment::AlignmentRecord, - locus: &GenomicLocus, - reference_len: usize, - alternate: &str, -) -> Result { - let alt_len = alternate.len(); - let anchor_start = locus.start.saturating_sub(1); - let anchor_end = locus.end; - - let covering = record.start <= locus.start && record.end >= locus.end; - if !covering { - return Ok(IndelClassification { - covering: false, - reference_like: false, - matches_alt: false, - observed_len: reference_len, - }); - } - - let mut observed_len = reference_len; - - for anchor in anchor_start..=anchor_end { - if let Some((kind, len)) = indel_at_anchor(record, anchor) { - observed_len = match kind { - bioscript_formats::alignment::AlignmentOpKind::Insertion => reference_len + len, - bioscript_formats::alignment::AlignmentOpKind::Deletion => reference_len.saturating_sub(len), - _ => reference_len, - }; - - return Ok(IndelClassification { - covering: true, - reference_like: false, - matches_alt: observed_len == alt_len, - observed_len, - }); - } - } - - Ok(IndelClassification { - covering: true, - reference_like: true, - matches_alt: false, - observed_len, - }) -} - -#[derive(Default)] -struct BamSnpPileupCounts { - filtered_depth: u32, - filtered_ref_count: u32, - filtered_alt_count: u32, - filtered_base_counts: BTreeMap, - raw_depth: u32, - raw_ref_count: u32, - raw_alt_count: u32, - raw_base_counts: BTreeMap, - filtered_low_base_quality: u32, - filtered_low_mapping_quality: u32, - filtered_non_acgt: u32, - filtered_unmapped: u32, - filtered_secondary: u32, - filtered_qc_fail: u32, - filtered_duplicate: u32, - filtered_improper_pair: u32, - raw_forward_counts: BTreeMap, - raw_reverse_counts: BTreeMap, -} - -impl BamSnpPileupCounts { - fn evidence_lines(&self, locus: &str, target_pos: i64) -> Vec { - vec![ - format!( - "observed BAM SNP pileup at {locus} target_pos={target_pos} filtered_depth={} ref_count={} alt_count={}", - self.filtered_depth, self.filtered_ref_count, self.filtered_alt_count - ), - format!( - "raw pileup depth={} ref_count={} alt_count={} raw_counts={:?}", - self.raw_depth, self.raw_ref_count, self.raw_alt_count, self.raw_base_counts - ), - format!( - "raw strand counts: forward={:?} reverse={:?}", - self.raw_forward_counts, self.raw_reverse_counts - ), - format!( - "filters applied: min_base_quality=13 min_mapping_quality=0 filtered_low_base_quality={} filtered_low_mapping_quality={} filtered_non_acgt={} filtered_unmapped={} filtered_secondary={} filtered_qc_fail={} filtered_duplicate={} filtered_improper_pair={}", - self.filtered_low_base_quality, - self.filtered_low_mapping_quality, - self.filtered_non_acgt, - self.filtered_unmapped, - self.filtered_secondary, - self.filtered_qc_fail, - self.filtered_duplicate, - self.filtered_improper_pair - ), - ] - } -} - -fn infer_snp_genotype( - reference: char, - alternate: char, - ref_count: u32, - alt_count: u32, - depth: u32, -) -> Option { - if depth == 0 || ref_count + alt_count == 0 { - return None; - } - let alt_fraction = f64::from(alt_count) / f64::from(depth); - if alt_fraction >= 0.8 { - Some(format!("{alternate}{alternate}")) - } else if alt_fraction <= 0.2 { - Some(format!("{reference}{reference}")) - } else { - let mut alleles = [reference.to_ascii_uppercase(), alternate.to_ascii_uppercase()]; - alleles.sort_by_key(|allele| match allele { - 'A' => 0, - 'C' => 1, - 'G' => 2, - 'T' => 3, - _ => 99, - }); - Some(alleles.iter().collect()) - } -} - -fn describe_snp_decision_rule( - reference: char, - alternate: char, - ref_count: u32, - alt_count: u32, - depth: u32, -) -> String { - if depth == 0 { - return format!( - "no covering reads for SNP; genotype unresolved (ref={reference}, alt={alternate})" - ); - } - if ref_count + alt_count == 0 { - return format!( - "no reads matched the declared SNP alleles; genotype unresolved; counts ref={ref_count} alt={alt_count} depth={depth} for {reference}>{alternate}" - ); - } - - let alt_fraction = f64::from(alt_count) / f64::from(depth); - format!( - "SNP genotype rule: alt_fraction={alt_fraction:.3} with thresholds ref<=0.200, het=(0.200,0.800), alt>=0.800; counts ref={ref_count} alt={alt_count} depth={depth} for {reference}>{alternate}" - ) -} - -fn infer_copy_number_genotype( - reference: &str, - alternate: &str, - _ref_count: u32, - alt_count: u32, - depth: u32, -) -> Option { - if depth == 0 { - return None; - } - let alt_fraction = f64::from(alt_count) / f64::from(depth); - if alt_fraction >= 0.8 { - Some(format!("{alternate}{alternate}")) - } else if alt_fraction <= 0.2 { - Some(format!("{reference}{reference}")) - } else { - let mut alleles = [ - reference.to_ascii_uppercase(), - alternate.to_ascii_uppercase(), - ]; - alleles.sort_by_key(|allele| allele.chars().next().map_or(u8::MAX, |ch| match ch { - 'A' => 0, - 'C' => 1, - 'G' => 2, - 'T' => 3, - 'I' => 4, - 'D' => 5, - _ => 99, - })); - Some(alleles.concat()) - } -} - -fn describe_copy_number_decision_rule( - reference: &str, - alternate: &str, - _ref_count: u32, - alt_count: u32, - depth: u32, -) -> String { - if depth == 0 { - return format!( - "no covering reads for copy-number style variant; genotype unresolved (ref={reference}, alt={alternate})" - ); - } - - let alt_fraction = f64::from(alt_count) / f64::from(depth); - format!( - "copy-number genotype rule: alt_fraction={alt_fraction:.3} with thresholds ref<=0.200, het=(0.200,0.800), alt>=0.800; counts alt={alt_count} depth={depth} for {reference}->{alternate}" - ) -} - -fn observe_cram_variant( - reader: &mut noodles::cram::io::indexed_reader::IndexedReader, - label: &str, - variant: &VariantSpec, -) -> Result { - let assembly = variant - .grch38 - .as_ref() - .map(|_| Assembly::Grch38) - .or_else(|| variant.grch37.as_ref().map(|_| Assembly::Grch37)); - let locus = variant - .grch38 - .as_ref() - .or(variant.grch37.as_ref()) - .ok_or_else(|| { - RuntimeError::Io(format!( - "variant {} has no GRCh37/GRCh38 locus", - variant - .rsids - .first() - .map(|s| s.as_str()) - .unwrap_or("variant") - )) - })?; - let locus = GenomicLocus { - chrom: locus.chrom.clone(), - start: locus.start, - end: locus.end, - }; - let kind = variant.kind.unwrap_or(VariantKind::Snp); - match kind { - VariantKind::Snp => { - let ref_char = variant - .reference - .as_deref() - .and_then(|s| s.chars().next()) - .ok_or_else(|| { - RuntimeError::Io(format!( - "variant {} missing reference allele", - variant - .rsids - .first() - .map(|s| s.as_str()) - .unwrap_or("variant") - )) - })?; - let alt_char = variant - .alternate - .as_deref() - .and_then(|s| s.chars().next()) - .ok_or_else(|| { - RuntimeError::Io(format!( - "variant {} missing alternate allele", - variant - .rsids - .first() - .map(|s| s.as_str()) - .unwrap_or("variant") - )) - })?; - bioscript_formats::observe_cram_snp_with_reader( - reader, - label, - &locus, - ref_char, - alt_char, - variant.rsids.first().cloned(), - assembly, - ) - } - VariantKind::Deletion => { - let deletion_length = variant.deletion_length.ok_or_else(|| { - RuntimeError::Io(format!( - "variant {} missing deletion_length", - variant - .rsids - .first() - .map(|s| s.as_str()) - .unwrap_or("variant") - )) - })?; - let reference = variant.reference.as_deref().unwrap_or("I"); - let alternate = variant.alternate.as_deref().unwrap_or("D"); - bioscript_formats::observe_cram_deletion_with_reader( - reader, - label, - &locus, - deletion_length, - reference, - alternate, - variant.rsids.first().cloned(), - assembly, - ) - } - VariantKind::Insertion | VariantKind::Indel => { - let reference = variant.reference.as_deref().ok_or_else(|| { - RuntimeError::Io(format!( - "variant {} missing reference allele", - variant - .rsids - .first() - .map(|s| s.as_str()) - .unwrap_or("variant") - )) - })?; - let alternate = variant.alternate.as_deref().ok_or_else(|| { - RuntimeError::Io(format!( - "variant {} missing alternate allele", - variant - .rsids - .first() - .map(|s| s.as_str()) - .unwrap_or("variant") - )) - })?; - bioscript_formats::observe_cram_indel_with_reader( - reader, - label, - &locus, - reference, - alternate, - variant.rsids.first().cloned(), - assembly, - ) - } - other => Err(RuntimeError::Io(format!( - "variant {} kind {:?} not supported on CRAM via wasm", - variant - .rsids - .first() - .map(|s| s.as_str()) - .unwrap_or("variant"), - other - ))), - } -} +pub(crate) use alignment::{BamReportLookup, CramReportLookup}; pub(super) struct VcfReportLookup { pub(super) reader: std::cell::RefCell< @@ -1034,11 +20,6 @@ pub(super) struct VcfReportLookup { } impl report_workspace::VariantLookup for VcfReportLookup { - fn lookup_variant(&self, spec: &VariantSpec) -> Result { - let mut reader = self.reader.borrow_mut(); - observe_vcf_variant(&mut reader, &self.label, spec, self.detected_assembly) - } - fn lookup_variants( &self, specs: &[VariantSpec], @@ -1112,8 +93,7 @@ fn observe_vcf_variant( line.contains("tabix index has no contig") || line.contains("has no GRCh37/GRCh38 locus") }) - { - if let Some(imputed) = bioscript_formats::imputed_reference_observation( + && let Some(imputed) = bioscript_formats::imputed_reference_observation( "vcf", label, variant, @@ -1121,9 +101,9 @@ fn observe_vcf_variant( assembly, None, &observation.evidence.join(" | "), - ) { - return Ok(imputed); - } + ) + { + return Ok(imputed); } Ok(observation) } diff --git a/rust/bioscript-wasm/src/report_lookup/alignment.rs b/rust/bioscript-wasm/src/report_lookup/alignment.rs new file mode 100644 index 0000000..4a3cc88 --- /dev/null +++ b/rust/bioscript-wasm/src/report_lookup/alignment.rs @@ -0,0 +1,165 @@ +use super::*; + +/// Per-variant CRAM lookup that satisfies the workspace's `VariantLookup` +/// trait. Holds the IndexedReader in a `RefCell` so &self lookup methods can +/// mutably read while still being object-safe. +pub(crate) struct CramReportLookup { + pub(crate) reader: std::cell::RefCell>, + pub(crate) label: String, +} + +#[path = "alignment/bam.rs"] +mod bam; + +pub(crate) use bam::BamReportLookup; + +impl report_workspace::VariantLookup for CramReportLookup { + fn lookup_variants( + &self, + specs: &[VariantSpec], + ) -> Result, RuntimeError> { + let mut reader = self.reader.borrow_mut(); + let mut out = Vec::with_capacity(specs.len()); + for spec in specs { + out.push(observe_cram_variant(&mut reader, &self.label, spec)?); + } + Ok(out) + } +} + +fn observe_cram_variant( + reader: &mut noodles::cram::io::indexed_reader::IndexedReader, + label: &str, + variant: &VariantSpec, +) -> Result { + let assembly = variant + .grch38 + .as_ref() + .map(|_| Assembly::Grch38) + .or_else(|| variant.grch37.as_ref().map(|_| Assembly::Grch37)); + let locus = variant + .grch38 + .as_ref() + .or(variant.grch37.as_ref()) + .ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} has no GRCh37/GRCh38 locus", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + let locus = GenomicLocus { + chrom: locus.chrom.clone(), + start: locus.start, + end: locus.end, + }; + let kind = variant.kind.unwrap_or(VariantKind::Snp); + match kind { + VariantKind::Snp => { + let ref_char = variant + .reference + .as_deref() + .and_then(|s| s.chars().next()) + .ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} missing reference allele", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + let alt_char = variant + .alternate + .as_deref() + .and_then(|s| s.chars().next()) + .ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} missing alternate allele", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + bioscript_formats::observe_cram_snp_with_reader( + reader, + label, + &locus, + ref_char, + alt_char, + variant.rsids.first().cloned(), + assembly, + ) + } + VariantKind::Deletion => { + let deletion_length = variant.deletion_length.ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} missing deletion_length", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + let reference = variant.reference.as_deref().unwrap_or("I"); + let alternate = variant.alternate.as_deref().unwrap_or("D"); + bioscript_formats::observe_cram_deletion_with_reader( + reader, + label, + &locus, + deletion_length, + reference, + alternate, + variant.rsids.first().cloned(), + assembly, + ) + } + VariantKind::Insertion | VariantKind::Indel => { + let reference = variant.reference.as_deref().ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} missing reference allele", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + let alternate = variant.alternate.as_deref().ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} missing alternate allele", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + bioscript_formats::observe_cram_indel_with_reader( + reader, + label, + &locus, + reference, + alternate, + variant.rsids.first().cloned(), + assembly, + ) + } + other => Err(RuntimeError::Io(format!( + "variant {} kind {:?} not supported on CRAM via wasm", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant"), + other + ))), + } +} diff --git a/rust/bioscript-wasm/src/report_lookup/alignment/bam.rs b/rust/bioscript-wasm/src/report_lookup/alignment/bam.rs new file mode 100644 index 0000000..d5743db --- /dev/null +++ b/rust/bioscript-wasm/src/report_lookup/alignment/bam.rs @@ -0,0 +1,474 @@ +use super::*; + +pub(crate) struct BamReportLookup { + pub(crate) reader: std::cell::RefCell< + noodles::bam::io::indexed_reader::IndexedReader>, + >, + pub(crate) label: String, +} + +impl report_workspace::VariantLookup for BamReportLookup { + fn lookup_variants( + &self, + specs: &[VariantSpec], + ) -> Result, RuntimeError> { + let mut reader = self.reader.borrow_mut(); + let mut out = Vec::with_capacity(specs.len()); + for spec in specs { + out.push(observe_bam_variant(&mut reader, &self.label, spec)?); + } + Ok(out) + } +} + +/// Build a minimal 23andMe-style text from the observations we already +/// computed. Format: `rsid\tchrom\tpos\tgenotype` per line. The runtime's +/// delimited-text loader reads this back as a `RsidMap`/`Delimited` backend +/// so analysis scripts can call `bioscript.load_genotypes(input_file)` and +/// have rsid lookups answered from the cached table. +#[allow(dead_code)] +fn synthesize_genotype_text_from_observations(observations: &[serde_json::Value]) -> String { + let mut out = String::from("# rsid\tchromosome\tposition\tgenotype\n"); + for observation in observations { + let rsid = observation + .get("rsid") + .and_then(serde_json::Value::as_str) + .unwrap_or(""); + if rsid.is_empty() { + continue; + } + let chrom = observation + .get("chrom") + .and_then(serde_json::Value::as_str) + .unwrap_or(""); + let pos = observation + .get("pos_start") + .and_then(|v| { + v.as_i64() + .or_else(|| v.as_str().and_then(|s| s.parse::().ok())) + }) + .unwrap_or(0); + let genotype = observation + .get("genotype_display") + .and_then(serde_json::Value::as_str) + .filter(|s| !s.is_empty() && *s != "??") + .unwrap_or("--"); + out.push_str(&format!("{rsid}\t{chrom}\t{pos}\t{genotype}\n")); + } + out +} + +fn observe_bam_variant( + reader: &mut noodles::bam::io::indexed_reader::IndexedReader>, + label: &str, + variant: &VariantSpec, +) -> Result { + let assembly = variant + .grch38 + .as_ref() + .map(|_| Assembly::Grch38) + .or_else(|| variant.grch37.as_ref().map(|_| Assembly::Grch37)); + let locus = variant + .grch38 + .as_ref() + .or(variant.grch37.as_ref()) + .ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} has no GRCh37/GRCh38 locus", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + let locus = GenomicLocus { + chrom: locus.chrom.clone(), + start: locus.start, + end: locus.end, + }; + match variant.kind.unwrap_or(VariantKind::Snp) { + VariantKind::Snp => { + let ref_char = variant + .reference + .as_deref() + .and_then(|s| s.chars().next()) + .ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} missing reference allele", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + let alt_char = variant + .alternate + .as_deref() + .and_then(|s| s.chars().next()) + .ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} missing alternate allele", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + observe_bam_snp_with_reader( + reader, + label, + &locus, + ref_char, + alt_char, + variant.rsids.first().cloned(), + assembly, + ) + } + VariantKind::Deletion => { + observe_bam_deletion_with_reader(reader, label, &locus, variant, assembly) + } + VariantKind::Insertion | VariantKind::Indel => { + let reference = variant.reference.as_deref().ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} missing reference allele", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + let alternate = variant.alternate.as_deref().ok_or_else(|| { + RuntimeError::Io(format!( + "variant {} missing alternate allele", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant") + )) + })?; + observe_bam_indel_with_reader( + reader, + label, + &locus, + reference, + alternate, + variant.rsids.first().cloned(), + assembly, + ) + } + other => Err(RuntimeError::Io(format!( + "variant {} kind {:?} not supported on BAM via wasm", + variant + .rsids + .first() + .map(|s| s.as_str()) + .unwrap_or("variant"), + other + ))), + } +} + +fn observe_bam_snp_with_reader( + reader: &mut noodles::bam::io::indexed_reader::IndexedReader>, + label: &str, + locus: &GenomicLocus, + reference: char, + alternate: char, + matched_rsid: Option, + assembly: Option, +) -> Result { + use noodles::core::Position; + let mut counts = BamSnpPileupCounts::default(); + let header = read_bam_header(reader, label)?; + let region = bam_region(&header, locus)?; + let target_position = Position::try_from(usize::try_from(locus.start).map_err(|_| { + RuntimeError::InvalidArguments("SNP locus start is out of range".to_owned()) + })?) + .map_err(|_| RuntimeError::InvalidArguments("SNP locus start is out of range".to_owned()))?; + + let query = reader + .query(&header, ®ion) + .map_err(|err| RuntimeError::Io(format!("failed to query BAM {label}: {err}")))?; + for result in query.records() { + let record = result + .map_err(|err| RuntimeError::Io(format!("failed to read BAM record {label}: {err}")))?; + let flags = record.flags(); + if flags.is_unmapped() { + counts.filtered_unmapped += 1; + continue; + } + if flags.is_secondary() { + counts.filtered_secondary += 1; + continue; + } + if flags.is_qc_fail() { + counts.filtered_qc_fail += 1; + continue; + } + if flags.is_duplicate() { + counts.filtered_duplicate += 1; + continue; + } + if flags.is_segmented() && !flags.is_properly_segmented() { + counts.filtered_improper_pair += 1; + continue; + } + + let Some((base, base_quality)) = + bam_base_quality_at_reference_position(&record, target_position)? + else { + continue; + }; + let normalized_base = normalize_pileup_base(base); + let is_reverse = flags.is_reverse_complemented(); + if let Some(base) = normalized_base { + counts.raw_depth += 1; + *counts.raw_base_counts.entry(base.to_string()).or_insert(0) += 1; + let strand_counts = if is_reverse { + &mut counts.raw_reverse_counts + } else { + &mut counts.raw_forward_counts + }; + *strand_counts.entry(base.to_string()).or_insert(0) += 1; + if base == reference { + counts.raw_ref_count += 1; + } else if base == alternate { + counts.raw_alt_count += 1; + } + } + + if base_quality < 13 { + counts.filtered_low_base_quality += 1; + continue; + } + + let Some(base) = normalized_base else { + counts.filtered_non_acgt += 1; + continue; + }; + + counts.filtered_depth += 1; + *counts + .filtered_base_counts + .entry(base.to_string()) + .or_insert(0) += 1; + if base == reference { + counts.filtered_ref_count += 1; + } else if base == alternate { + counts.filtered_alt_count += 1; + } + } + + let ref_count = counts.filtered_ref_count; + let alt_count = counts.filtered_alt_count; + let depth = counts.filtered_depth; + let evidence = counts.evidence_lines( + &format!("{}:{}-{}", locus.chrom, locus.start, locus.end), + locus.start, + ); + Ok(VariantObservation { + backend: "bam".to_owned(), + matched_rsid, + assembly, + genotype: infer_snp_genotype(reference, alternate, ref_count, alt_count, depth), + ref_count: Some(ref_count), + alt_count: Some(alt_count), + depth: Some(depth), + raw_counts: counts.raw_base_counts, + decision: Some(describe_snp_decision_rule( + reference, alternate, ref_count, alt_count, depth, + )), + evidence, + }) +} + +fn observe_bam_deletion_with_reader( + reader: &mut noodles::bam::io::indexed_reader::IndexedReader>, + label: &str, + locus: &GenomicLocus, + variant: &VariantSpec, + assembly: Option, +) -> Result { + let deletion_length = variant.deletion_length.ok_or_else(|| { + RuntimeError::InvalidArguments("deletion variant requires deletion_length".to_owned()) + })?; + let reference = variant.reference.clone().unwrap_or_else(|| "I".to_owned()); + let alternate = variant.alternate.clone().unwrap_or_else(|| "D".to_owned()); + let anchor_pos = locus.start.saturating_sub(1); + let anchor_locus = GenomicLocus { + chrom: locus.chrom.clone(), + start: anchor_pos, + end: anchor_pos, + }; + + let mut alt_count = 0u32; + let mut ref_count = 0u32; + let mut depth = 0u32; + + let header = read_bam_header(reader, label)?; + let region = bam_region(&header, &anchor_locus)?; + let query = reader + .query(&header, ®ion) + .map_err(|err| RuntimeError::Io(format!("failed to query BAM {label}: {err}")))?; + for result in query.records() { + let record = result + .map_err(|err| RuntimeError::Io(format!("failed to read BAM record {label}: {err}")))?; + let alignment_record = bam_alignment_record(label, &record)?; + if alignment_record.is_unmapped || !spans_position(&alignment_record, anchor_pos) { + continue; + } + depth += 1; + match indel_at_anchor(&alignment_record, anchor_pos) { + Some((bioscript_formats::alignment::AlignmentOpKind::Deletion, len)) + if len == deletion_length => + { + alt_count += 1; + } + _ => ref_count += 1, + } + } + + Ok(VariantObservation { + backend: "bam".to_owned(), + matched_rsid: variant.rsids.first().cloned(), + assembly, + genotype: infer_copy_number_genotype(&reference, &alternate, ref_count, alt_count, depth), + ref_count: Some(ref_count), + alt_count: Some(alt_count), + depth: Some(depth), + raw_counts: BTreeMap::new(), + decision: Some(describe_copy_number_decision_rule( + &reference, &alternate, ref_count, alt_count, depth, + )), + evidence: vec![format!( + "observed BAM deletion anchor {}:{} len={} depth={} ref_count={} alt_count={}", + locus.chrom, anchor_pos, deletion_length, depth, ref_count, alt_count + )], + }) +} + +fn observe_bam_indel_with_reader( + reader: &mut noodles::bam::io::indexed_reader::IndexedReader>, + label: &str, + locus: &GenomicLocus, + reference: &str, + alternate: &str, + matched_rsid: Option, + assembly: Option, +) -> Result { + let mut alt_count = 0u32; + let mut ref_count = 0u32; + let mut depth = 0u32; + let mut matching_alt_lengths = std::collections::BTreeSet::new(); + + let header = read_bam_header(reader, label)?; + let region = bam_region(&header, locus)?; + let query = reader + .query(&header, ®ion) + .map_err(|err| RuntimeError::Io(format!("failed to query BAM {label}: {err}")))?; + for result in query.records() { + let record = result + .map_err(|err| RuntimeError::Io(format!("failed to read BAM record {label}: {err}")))?; + let alignment_record = bam_alignment_record(label, &record)?; + if alignment_record.is_unmapped || !record_overlaps_locus(&alignment_record, locus) { + continue; + } + let classification = + classify_expected_indel(&alignment_record, locus, reference.len(), alternate)?; + if !classification.covering { + continue; + } + depth += 1; + if classification.matches_alt { + alt_count += 1; + matching_alt_lengths.insert(classification.observed_len); + } else if classification.reference_like { + ref_count += 1; + } + } + + let evidence_label = if matching_alt_lengths.is_empty() { + "none".to_owned() + } else { + matching_alt_lengths + .into_iter() + .map(|len| len.to_string()) + .collect::>() + .join(",") + }; + + Ok(VariantObservation { + backend: "bam".to_owned(), + matched_rsid, + assembly, + genotype: infer_copy_number_genotype(reference, alternate, ref_count, alt_count, depth), + ref_count: Some(ref_count), + alt_count: Some(alt_count), + depth: Some(depth), + raw_counts: BTreeMap::new(), + decision: Some(describe_copy_number_decision_rule( + reference, alternate, ref_count, alt_count, depth, + )), + evidence: vec![format!( + "observed BAM indel at {}:{}-{} depth={} ref_count={} alt_count={} matching_alt_lengths={}", + locus.chrom, locus.start, locus.end, depth, ref_count, alt_count, evidence_label + )], + }) +} + +fn read_bam_header( + reader: &mut noodles::bam::io::indexed_reader::IndexedReader>, + label: &str, +) -> Result { + reader + .get_mut() + .seek(noodles::bgzf::VirtualPosition::MIN) + .map_err(|err| RuntimeError::Io(format!("failed to rewind BAM {label}: {err}")))?; + reader + .read_header() + .map_err(|err| RuntimeError::Io(format!("failed to read BAM header {label}: {err}"))) +} + +fn bam_region( + header: &noodles::sam::Header, + locus: &GenomicLocus, +) -> Result { + let chrom = resolve_bam_reference_name(header, &locus.chrom).ok_or_else(|| { + RuntimeError::Unsupported(format!( + "indexed BAM does not contain contig {} for {}:{}-{}", + locus.chrom, locus.chrom, locus.start, locus.end + )) + })?; + format!("{chrom}:{}-{}", locus.start, locus.end) + .parse() + .map_err(|err| RuntimeError::Io(format!("invalid BAM query region: {err}"))) +} + +fn resolve_bam_reference_name(header: &noodles::sam::Header, chrom: &str) -> Option { + let candidates = [ + chrom.to_owned(), + format!("chr{chrom}"), + chrom.trim_start_matches("chr").to_owned(), + ]; + candidates.into_iter().find(|candidate| { + header.reference_sequences().iter().any(|(name, _)| { + let name_bytes: &[u8] = name.as_ref(); + name_bytes == candidate.as_bytes() + }) + }) +} + +#[path = "bam/pileup.rs"] +mod pileup; + +use pileup::{ + BamSnpPileupCounts, bam_alignment_record, bam_base_quality_at_reference_position, + classify_expected_indel, describe_copy_number_decision_rule, describe_snp_decision_rule, + indel_at_anchor, infer_copy_number_genotype, infer_snp_genotype, normalize_pileup_base, + record_overlaps_locus, spans_position, +}; diff --git a/rust/bioscript-wasm/src/report_lookup/alignment/bam/pileup.rs b/rust/bioscript-wasm/src/report_lookup/alignment/bam/pileup.rs new file mode 100644 index 0000000..bf0fbac --- /dev/null +++ b/rust/bioscript-wasm/src/report_lookup/alignment/bam/pileup.rs @@ -0,0 +1,418 @@ +use super::*; + +pub(super) fn bam_base_quality_at_reference_position( + record: &noodles::bam::Record, + target_position: noodles::core::Position, +) -> Result, RuntimeError> { + use noodles::sam::alignment::record::cigar::op::Kind; + + let Some(alignment_start) = record + .alignment_start() + .transpose() + .map_err(|err| RuntimeError::Io(format!("failed to read BAM alignment start: {err}")))? + else { + return Ok(None); + }; + let mut reference_position = usize::from(alignment_start); + let target = usize::from(target_position); + let mut read_position = 0usize; + let sequence = record.sequence(); + let qualities = record.quality_scores(); + + for result in record.cigar().iter() { + let op = + result.map_err(|err| RuntimeError::Io(format!("failed to read BAM CIGAR: {err}")))?; + let len = op.len(); + match op.kind() { + Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => { + if target >= reference_position && target < reference_position + len { + let offset = target - reference_position; + let read_index = read_position + offset; + let Some(base) = sequence.get(read_index) else { + return Ok(None); + }; + let quality = qualities + .as_ref() + .get(read_index) + .copied() + .unwrap_or(u8::MAX); + return Ok(Some((base, quality))); + } + reference_position += len; + read_position += len; + } + Kind::Insertion | Kind::SoftClip => { + read_position += len; + } + Kind::Deletion | Kind::Skip => { + if target >= reference_position && target < reference_position + len { + return Ok(None); + } + reference_position += len; + } + Kind::HardClip | Kind::Pad => {} + } + } + + Ok(None) +} + +pub(super) fn bam_alignment_record( + label: &str, + record: &noodles::bam::Record, +) -> Result { + use noodles::sam::alignment::Record as _; + + let flags = record.flags(); + let is_unmapped = flags.is_unmapped(); + let start = record + .alignment_start() + .transpose() + .map_err(|err| { + RuntimeError::Io(format!( + "failed to read BAM alignment start from {label}: {err}" + )) + })? + .map(|pos| i64::try_from(usize::from(pos))) + .transpose() + .map_err(|_| { + RuntimeError::Unsupported(format!( + "record alignment start exceeds i64 range in {label}" + )) + })? + .unwrap_or(0); + let end = record + .alignment_end() + .transpose() + .map_err(|err| { + RuntimeError::Io(format!( + "failed to read BAM alignment end from {label}: {err}" + )) + })? + .map(|pos| i64::try_from(usize::from(pos))) + .transpose() + .map_err(|_| { + RuntimeError::Unsupported(format!("record alignment end exceeds i64 range in {label}")) + })? + .unwrap_or(start); + let cigar = record + .cigar() + .iter() + .map(|result| { + result.map(map_bam_op).map_err(|err| { + RuntimeError::Io(format!("failed to read BAM CIGAR from {label}: {err}")) + }) + }) + .collect::, _>>()?; + + Ok(bioscript_formats::alignment::AlignmentRecord { + start, + end, + is_unmapped, + cigar, + }) +} + +fn map_bam_op( + op: noodles::sam::alignment::record::cigar::Op, +) -> bioscript_formats::alignment::AlignmentOp { + use bioscript_formats::alignment::{AlignmentOp, AlignmentOpKind}; + use noodles::sam::alignment::record::cigar::op::Kind; + + let kind = match op.kind() { + Kind::Match => AlignmentOpKind::Match, + Kind::Insertion => AlignmentOpKind::Insertion, + Kind::Deletion => AlignmentOpKind::Deletion, + Kind::Skip => AlignmentOpKind::Skip, + Kind::SoftClip => AlignmentOpKind::SoftClip, + Kind::HardClip => AlignmentOpKind::HardClip, + Kind::Pad => AlignmentOpKind::Pad, + Kind::SequenceMatch => AlignmentOpKind::SequenceMatch, + Kind::SequenceMismatch => AlignmentOpKind::SequenceMismatch, + }; + + AlignmentOp { + kind, + len: op.len(), + } +} + +pub(super) fn normalize_pileup_base(base: u8) -> Option { + match (base as char).to_ascii_uppercase() { + 'A' | 'C' | 'G' | 'T' => Some((base as char).to_ascii_uppercase()), + _ => None, + } +} + +pub(super) struct IndelClassification { + pub(super) covering: bool, + pub(super) reference_like: bool, + pub(super) matches_alt: bool, + pub(super) observed_len: usize, +} + +pub(super) fn record_overlaps_locus( + record: &bioscript_formats::alignment::AlignmentRecord, + locus: &GenomicLocus, +) -> bool { + record.end >= locus.start && record.start <= locus.end +} + +pub(super) fn spans_position( + record: &bioscript_formats::alignment::AlignmentRecord, + pos: i64, +) -> bool { + pos >= record.start.saturating_sub(1) && pos <= record.end +} + +pub(super) fn indel_at_anchor( + record: &bioscript_formats::alignment::AlignmentRecord, + anchor_pos: i64, +) -> Option<(bioscript_formats::alignment::AlignmentOpKind, usize)> { + let mut ref_pos = record.start; + + for op in &record.cigar { + match op.kind { + bioscript_formats::alignment::AlignmentOpKind::Match + | bioscript_formats::alignment::AlignmentOpKind::SequenceMatch + | bioscript_formats::alignment::AlignmentOpKind::SequenceMismatch + | bioscript_formats::alignment::AlignmentOpKind::Skip => { + ref_pos += i64::try_from(op.len).ok()?; + } + bioscript_formats::alignment::AlignmentOpKind::Insertion => { + let anchor = ref_pos.saturating_sub(1); + if anchor == anchor_pos { + return Some(( + bioscript_formats::alignment::AlignmentOpKind::Insertion, + op.len, + )); + } + } + bioscript_formats::alignment::AlignmentOpKind::Deletion => { + let anchor = ref_pos.saturating_sub(1); + if anchor == anchor_pos { + return Some(( + bioscript_formats::alignment::AlignmentOpKind::Deletion, + op.len, + )); + } + ref_pos += i64::try_from(op.len).ok()?; + } + bioscript_formats::alignment::AlignmentOpKind::SoftClip + | bioscript_formats::alignment::AlignmentOpKind::HardClip + | bioscript_formats::alignment::AlignmentOpKind::Pad => {} + } + } + + None +} + +pub(super) fn classify_expected_indel( + record: &bioscript_formats::alignment::AlignmentRecord, + locus: &GenomicLocus, + reference_len: usize, + alternate: &str, +) -> Result { + let alt_len = alternate.len(); + let anchor_start = locus.start.saturating_sub(1); + let anchor_end = locus.end; + + let covering = record.start <= locus.start && record.end >= locus.end; + if !covering { + return Ok(IndelClassification { + covering: false, + reference_like: false, + matches_alt: false, + observed_len: reference_len, + }); + } + + let mut observed_len = reference_len; + + for anchor in anchor_start..=anchor_end { + if let Some((kind, len)) = indel_at_anchor(record, anchor) { + observed_len = match kind { + bioscript_formats::alignment::AlignmentOpKind::Insertion => reference_len + len, + bioscript_formats::alignment::AlignmentOpKind::Deletion => { + reference_len.saturating_sub(len) + } + _ => reference_len, + }; + + return Ok(IndelClassification { + covering: true, + reference_like: false, + matches_alt: observed_len == alt_len, + observed_len, + }); + } + } + + Ok(IndelClassification { + covering: true, + reference_like: true, + matches_alt: false, + observed_len, + }) +} + +#[derive(Default)] +pub(super) struct BamSnpPileupCounts { + pub(super) filtered_depth: u32, + pub(super) filtered_ref_count: u32, + pub(super) filtered_alt_count: u32, + pub(super) filtered_base_counts: BTreeMap, + pub(super) raw_depth: u32, + pub(super) raw_ref_count: u32, + pub(super) raw_alt_count: u32, + pub(super) raw_base_counts: BTreeMap, + pub(super) filtered_low_base_quality: u32, + pub(super) filtered_low_mapping_quality: u32, + pub(super) filtered_non_acgt: u32, + pub(super) filtered_unmapped: u32, + pub(super) filtered_secondary: u32, + pub(super) filtered_qc_fail: u32, + pub(super) filtered_duplicate: u32, + pub(super) filtered_improper_pair: u32, + pub(super) raw_forward_counts: BTreeMap, + pub(super) raw_reverse_counts: BTreeMap, +} + +impl BamSnpPileupCounts { + pub(super) fn evidence_lines(&self, locus: &str, target_pos: i64) -> Vec { + vec![ + format!( + "observed BAM SNP pileup at {locus} target_pos={target_pos} filtered_depth={} ref_count={} alt_count={}", + self.filtered_depth, self.filtered_ref_count, self.filtered_alt_count + ), + format!( + "raw pileup depth={} ref_count={} alt_count={} raw_counts={:?}", + self.raw_depth, self.raw_ref_count, self.raw_alt_count, self.raw_base_counts + ), + format!( + "raw strand counts: forward={:?} reverse={:?}", + self.raw_forward_counts, self.raw_reverse_counts + ), + format!( + "filters applied: min_base_quality=13 min_mapping_quality=0 filtered_low_base_quality={} filtered_low_mapping_quality={} filtered_non_acgt={} filtered_unmapped={} filtered_secondary={} filtered_qc_fail={} filtered_duplicate={} filtered_improper_pair={}", + self.filtered_low_base_quality, + self.filtered_low_mapping_quality, + self.filtered_non_acgt, + self.filtered_unmapped, + self.filtered_secondary, + self.filtered_qc_fail, + self.filtered_duplicate, + self.filtered_improper_pair + ), + ] + } +} + +pub(super) fn infer_snp_genotype( + reference: char, + alternate: char, + ref_count: u32, + alt_count: u32, + depth: u32, +) -> Option { + if depth == 0 || ref_count + alt_count == 0 { + return None; + } + let alt_fraction = f64::from(alt_count) / f64::from(depth); + if alt_fraction >= 0.8 { + Some(format!("{alternate}{alternate}")) + } else if alt_fraction <= 0.2 { + Some(format!("{reference}{reference}")) + } else { + let mut alleles = [ + reference.to_ascii_uppercase(), + alternate.to_ascii_uppercase(), + ]; + alleles.sort_by_key(|allele| match allele { + 'A' => 0, + 'C' => 1, + 'G' => 2, + 'T' => 3, + _ => 99, + }); + Some(alleles.iter().collect()) + } +} + +pub(super) fn describe_snp_decision_rule( + reference: char, + alternate: char, + ref_count: u32, + alt_count: u32, + depth: u32, +) -> String { + if depth == 0 { + return format!( + "no covering reads for SNP; genotype unresolved (ref={reference}, alt={alternate})" + ); + } + if ref_count + alt_count == 0 { + return format!( + "no reads matched the declared SNP alleles; genotype unresolved; counts ref={ref_count} alt={alt_count} depth={depth} for {reference}>{alternate}" + ); + } + + let alt_fraction = f64::from(alt_count) / f64::from(depth); + format!( + "SNP genotype rule: alt_fraction={alt_fraction:.3} with thresholds ref<=0.200, het=(0.200,0.800), alt>=0.800; counts ref={ref_count} alt={alt_count} depth={depth} for {reference}>{alternate}" + ) +} + +pub(super) fn infer_copy_number_genotype( + reference: &str, + alternate: &str, + _ref_count: u32, + alt_count: u32, + depth: u32, +) -> Option { + if depth == 0 { + return None; + } + let alt_fraction = f64::from(alt_count) / f64::from(depth); + if alt_fraction >= 0.8 { + Some(format!("{alternate}{alternate}")) + } else if alt_fraction <= 0.2 { + Some(format!("{reference}{reference}")) + } else { + let mut alleles = [ + reference.to_ascii_uppercase(), + alternate.to_ascii_uppercase(), + ]; + alleles.sort_by_key(|allele| { + allele.chars().next().map_or(u8::MAX, |ch| match ch { + 'A' => 0, + 'C' => 1, + 'G' => 2, + 'T' => 3, + 'I' => 4, + 'D' => 5, + _ => 99, + }) + }); + Some(alleles.concat()) + } +} + +pub(super) fn describe_copy_number_decision_rule( + reference: &str, + alternate: &str, + _ref_count: u32, + alt_count: u32, + depth: u32, +) -> String { + if depth == 0 { + return format!( + "no covering reads for copy-number style variant; genotype unresolved (ref={reference}, alt={alternate})" + ); + } + + let alt_fraction = f64::from(alt_count) / f64::from(depth); + format!( + "copy-number genotype rule: alt_fraction={alt_fraction:.3} with thresholds ref<=0.200, het=(0.200,0.800), alt>=0.800; counts alt={alt_count} depth={depth} for {reference}->{alternate}" + ) +} diff --git a/rust/bioscript-wasm/src/report_workspace.rs b/rust/bioscript-wasm/src/report_workspace.rs index d06a57e..85451a5 100644 --- a/rust/bioscript-wasm/src/report_workspace.rs +++ b/rust/bioscript-wasm/src/report_workspace.rs @@ -18,7 +18,6 @@ pub(super) struct ManifestRowsOutput { /// or a CRAM/VCF-reader-backed lookup that streams through JS-supplied /// `readAt` callbacks. pub(super) trait VariantLookup { - fn lookup_variant(&self, spec: &VariantSpec) -> Result; fn lookup_variants( &self, specs: &[VariantSpec], @@ -26,9 +25,6 @@ pub(super) trait VariantLookup { } impl VariantLookup for GenotypeStore { - fn lookup_variant(&self, spec: &VariantSpec) -> Result { - GenotypeStore::lookup_variant(self, spec) - } fn lookup_variants( &self, specs: &[VariantSpec], @@ -64,14 +60,6 @@ impl PackageWorkspace { .map_err(|err| JsError::new(&format!("failed to parse YAML {path}: {err}"))) } - pub(super) fn schema(&self, path: &str) -> Result { - self.yaml(path)? - .get("schema") - .and_then(serde_yaml::Value::as_str) - .map(ToOwned::to_owned) - .ok_or_else(|| JsError::new(&format!("{path} is missing schema"))) - } - fn resolve(&self, base: &str, relative: &str) -> Result { let base = Path::new(base).parent().unwrap_or_else(|| Path::new("")); normalize_package_path(&base.join(relative).display().to_string()) @@ -82,16 +70,6 @@ impl PackageWorkspace { .map_err(|err| JsError::new(&format!("load variant {path}: {err}"))) } - pub(super) fn load_panel(&self, path: &str) -> Result { - load_panel_manifest_text(path, self.text(path)?) - .map_err(|err| JsError::new(&format!("load panel {path}: {err}"))) - } - - pub(super) fn load_assay(&self, path: &str) -> Result { - load_assay_manifest_text(path, self.text(path)?) - .map_err(|err| JsError::new(&format!("load assay {path}: {err}"))) - } - pub(super) fn run_manifest_rows( &self, manifest_path: &str, @@ -99,126 +77,24 @@ impl PackageWorkspace { participant_id: &str, filters: &[String], ) -> Result { - match self.schema(manifest_path)?.as_str() { - "bioscript:variant:1.0" | "bioscript:variant" => { - let manifest = self.load_variant(manifest_path)?; - let observation = store - .lookup_variant(&manifest.spec) - .map_err(|err| JsError::new(&format!("lookup {}: {err:?}", manifest.name)))?; - let row = variant_row( - manifest_path, - &manifest.name, - &manifest.tags, - &observation, - participant_id, - ); - Ok(ManifestRowsOutput { - rows: vec![row], - observations: vec![observation], - }) - } - "bioscript:panel:1.0" => { - self.run_panel_rows(manifest_path, store, participant_id, filters) - } - "bioscript:assay:1.0" => { - self.run_assay_rows(manifest_path, store, participant_id, filters) - } - other => Err(JsError::new(&format!( - "unsupported manifest schema '{other}'" - ))), - } - } - - fn run_panel_rows( - &self, - manifest_path: &str, - store: &dyn VariantLookup, - participant_id: &str, - filters: &[String], - ) -> Result { - let panel = self.load_panel(manifest_path)?; - let mut rows_by_member: Vec>> = - vec![Vec::new(); panel.members.len()]; - let mut all_observations = Vec::::new(); - let mut variants = Vec::<(usize, String, VariantManifest)>::new(); - for (index, member) in panel.members.iter().enumerate() { - let Some(path) = &member.path else { - return Err(JsError::new("remote panel members are not executable yet")); - }; - let resolved = self.resolve(manifest_path, path)?; - if member.kind == "variant" { - let variant = self.load_variant(&resolved)?; - if matches_filters(&variant, &resolved, filters) { - variants.push((index, resolved, variant)); - } - } else if member.kind == "assay" { - let assay_output = - self.run_assay_rows(&resolved, store, participant_id, filters)?; - rows_by_member[index] = assay_output.rows; - all_observations.extend(assay_output.observations); - } - } - let specs = variants - .iter() - .map(|(_, _, manifest)| manifest.spec.clone()) - .collect::>(); + let tasks = + bioscript_reporting::collect_variant_manifest_tasks(self, manifest_path, filters) + .map_err(|err| JsError::new(&err))?; let observations = store - .lookup_variants(&specs) - .map_err(|err| JsError::new(&format!("panel lookup failed: {err:?}")))?; - for ((member_index, resolved, manifest), observation) in - variants.into_iter().zip(observations) - { - rows_by_member[member_index].push(variant_row( - &resolved, - &manifest.name, - &manifest.tags, - &observation, - participant_id, - )); - all_observations.push(observation); - } - Ok(ManifestRowsOutput { - rows: rows_by_member.into_iter().flatten().collect(), - observations: all_observations, - }) - } - - fn run_assay_rows( - &self, - manifest_path: &str, - store: &dyn VariantLookup, - participant_id: &str, - filters: &[String], - ) -> Result { - let assay = self.load_assay(manifest_path)?; - let mut variants = Vec::<(String, VariantManifest)>::new(); - for member in &assay.members { - if member.kind != "variant" { - continue; - } - let Some(path) = &member.path else { - continue; - }; - let resolved = self.resolve(manifest_path, path)?; - let variant = self.load_variant(&resolved)?; - if matches_filters(&variant, &resolved, filters) { - variants.push((resolved, variant)); - } - } - let specs = variants - .iter() - .map(|(_, manifest)| manifest.spec.clone()) - .collect::>(); - let observations = store - .lookup_variants(&specs) - .map_err(|err| JsError::new(&format!("assay lookup failed: {err:?}")))?; - let mut rows = Vec::with_capacity(variants.len()); - let mut collected = Vec::with_capacity(variants.len()); - for ((resolved, manifest), observation) in variants.into_iter().zip(observations) { + .lookup_variants( + &tasks + .iter() + .map(|task| task.manifest.spec.clone()) + .collect::>(), + ) + .map_err(|err| JsError::new(&format!("manifest lookup failed: {err:?}")))?; + let mut rows = Vec::with_capacity(tasks.len()); + let mut collected = Vec::with_capacity(tasks.len()); + for (task, observation) in tasks.into_iter().zip(observations) { rows.push(variant_row( - &resolved, - &manifest.name, - &manifest.tags, + &task.manifest_path, + &task.manifest.name, + &task.manifest.tags, &observation, participant_id, )); @@ -241,293 +117,43 @@ impl PackageWorkspace { let manifest = self.load_variant(&row_path)?; let value = self.yaml(&row_path)?; let gene = yaml_string(&value, "gene").unwrap_or_default(); - let ref_allele = manifest.spec.reference.clone().unwrap_or_default(); - let alt_allele = manifest.spec.alternate.clone().unwrap_or_default(); - let depth = parse_optional_u32(row.get("depth")); - let ref_count = parse_optional_u32(row.get("ref_count")); - let alt_count = parse_optional_u32(row.get("alt_count")); - let raw_genotype_display = deletion_copy_number_display(row, &manifest, depth, alt_count) - .or_else(|| row.get("genotype").cloned()) - .unwrap_or_default(); - let assembly = row - .get("assembly") - .filter(|value| !value.is_empty()) - .cloned() - .or_else(|| fallback_assembly.map(assembly_row_value)) - .unwrap_or_default(); - let locus = if assembly.eq_ignore_ascii_case("grch37") { - manifest.spec.grch37.as_ref() - } else { - manifest - .spec - .grch38 - .as_ref() - .or(manifest.spec.grch37.as_ref()) - }; - let chrom = locus.map_or(String::new(), |locus| locus.chrom.clone()); - let (genotype, zygosity) = normalize_app_genotype( - &raw_genotype_display, - &ref_allele, - &alt_allele, - manifest.spec.kind, - &chrom, - inferred_sex, - ); - let outcome = if genotype == "./." { - "no_call" - } else if zygosity == "hom_ref" || zygosity == "hem_ref" { - "reference" - } else if zygosity == "het" || zygosity == "hom_alt" || zygosity == "hem_alt" { - "variant" - } else { - "unknown" - }; - let evidence_raw = observation_evidence_raw(row, &chrom, inferred_sex); - let genotype_display = if raw_genotype_display.is_empty() && matches!(outcome, "no_call" | "not_covered") { - "??".to_owned() - } else { - raw_genotype_display.clone() - }; - Ok(serde_json::json!({ - "participant_id": row.get("participant_id").cloned().unwrap_or_default(), - "assay_id": assay_id, - "assay_version": "1.0", - "variant_key": manifest.name, - "variant_path": row_path, - "rsid": row.get("matched_rsid").filter(|value| !value.is_empty()).cloned().or_else(|| manifest.spec.rsids.first().cloned()), - "gene": gene, - "assembly": if assembly.is_empty() { serde_json::Value::Null } else { serde_json::Value::String(assembly.to_uppercase()) }, - "chrom": chrom, - "pos_start": locus.map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.start)), - "pos_end": locus.map_or(serde_json::Value::Null, |locus| serde_json::Value::from(locus.end)), - "ref": ref_allele, - "alt": alt_allele, - "kind": manifest.spec.kind.map_or("unknown".to_owned(), |kind| format!("{kind:?}").to_lowercase()), - "match_status": if row.get("matched_rsid").is_some_and(|value| !value.is_empty()) || !raw_genotype_display.is_empty() { "found" } else { "not_found" }, - "coverage_status": "covered", - "call_status": if genotype == "./." { "no_call" } else { "called" }, - "genotype": genotype, - "genotype_display": genotype_display, - "zygosity": zygosity, - "ref_count": ref_count, - "alt_count": alt_count, - "depth": depth, - "genotype_quality": serde_json::Value::Null, - "allele_balance": serde_json::Value::Null, - "outcome": outcome, - "evidence_type": "genotype_file", - "evidence_raw": evidence_raw, - "source": variant_primary_source_from_yaml(&value), - "facets": serde_json::Value::Null, - })) - } - - pub(super) fn report_manifest_metadata( + Ok(bioscript_reporting::app_observation_from_manifest_row( + bioscript_reporting::AppObservationInput { + row, + row_path: &row_path, + assay_id, + manifest, + gene, + source: variant_primary_source_from_yaml(&value), + observed_alt_alleles: variant_observed_alt_alleles_from_yaml(&value), + inferred_sex, + fallback_assembly, + }, + )) + } + + pub(super) fn report_manifest_context( &self, path: &str, - ) -> Result { - let value = self.yaml(path)?; - let members = value - .get("members") - .and_then(serde_yaml::Value::as_sequence) - .map(|items| { - items - .iter() - .filter_map(serde_yaml::Value::as_mapping) - .map(|mapping| { - serde_json::json!({ - "kind": yaml_mapping_string(mapping, "kind"), - "path": yaml_mapping_string(mapping, "path"), - "version": yaml_mapping_string(mapping, "version"), - }) - }) - .collect::>() - }) - .unwrap_or_default(); - Ok(serde_json::json!({ - "schema": yaml_string(&value, "schema"), - "version": yaml_string(&value, "version"), - "name": yaml_string(&value, "name"), - "label": yaml_string(&value, "label").or_else(|| yaml_string(&value, "name")), - "tags": yaml_string_sequence(&value, "tags"), - "members": members, - })) + ) -> Result { + bioscript_reporting::load_report_manifest_context(self, path) + .map_err(|err| JsError::new(&err)) } +} - pub(super) fn load_manifest_findings( - &self, - path: &str, - ) -> Result, JsError> { - let value = self.yaml(path)?; - let schema = yaml_string(&value, "schema").unwrap_or_default(); - let mut findings = Vec::new(); - if matches!( - schema.as_str(), - "bioscript:variant:1.0" - | "bioscript:variant" - | "bioscript:assay:1.0" - | "bioscript:panel:1.0" - | "bioscript:pgx-findings:1.0" - ) { - if let Some(items) = value - .get("findings") - .and_then(serde_yaml::Value::as_sequence) - { - for item in items { - let json_item = yaml_to_json(item.clone())?; - if let Some(include) = - json_item.get("include").and_then(serde_json::Value::as_str) - { - let include_path = self.resolve(path, include)?; - let mut included = self.load_manifest_findings(&include_path)?; - let inherited_binding = json_item.get("binding").cloned(); - for included_item in &mut included { - if inherited_binding.is_some() - && included_item.get("binding").is_none() - && included_item.get("effects").is_none() - { - if let Some(object) = included_item.as_object_mut() { - object.insert( - "binding".to_owned(), - inherited_binding - .clone() - .unwrap_or(serde_json::Value::Null), - ); - } - } - } - findings.extend(included); - } else { - findings.push(json_item); - } - } - } - } - if matches!( - schema.as_str(), - "bioscript:assay:1.0" | "bioscript:panel:1.0" - ) { - if let Some(items) = value - .get("members") - .and_then(serde_yaml::Value::as_sequence) - { - for member in items { - let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { - continue; - }; - if !matches!(kind, "variant" | "assay") { - continue; - } - let Some(member_path) = member.get("path").and_then(serde_yaml::Value::as_str) - else { - continue; - }; - let resolved = self.resolve(path, member_path)?; - findings.extend(self.load_manifest_findings(&resolved)?); - } - } - } - Ok(findings) +impl bioscript_reporting::ManifestWorkspace for PackageWorkspace { + fn load_text(&self, path: &str) -> Result { + self.text(path) + .map(str::to_owned) + .map_err(|err| format!("{err:?}")) } - pub(super) fn load_manifest_provenance_links( - &self, - path: &str, - ) -> Result, JsError> { - let value = self.yaml(path)?; - let schema = yaml_string(&value, "schema").unwrap_or_default(); - let mut links = BTreeMap::::new(); - collect_manifest_provenance_entries(&value, &mut links)?; - if matches!( - schema.as_str(), - "bioscript:assay:1.0" | "bioscript:panel:1.0" - ) { - if let Some(items) = value - .get("members") - .and_then(serde_yaml::Value::as_sequence) - { - for member in items { - let Some(kind) = member.get("kind").and_then(serde_yaml::Value::as_str) else { - continue; - }; - if !matches!(kind, "variant" | "assay") { - continue; - } - let Some(member_path) = member.get("path").and_then(serde_yaml::Value::as_str) - else { - continue; - }; - let resolved = self.resolve(path, member_path)?; - for item in self.load_manifest_provenance_links(&resolved)? { - if let Some(url) = item.get("url").and_then(serde_json::Value::as_str) { - links.entry(url.to_owned()).or_insert(item); - } - } - } - } - } - Ok(links.into_values().collect()) + fn load_yaml(&self, path: &str) -> Result { + self.yaml(path).map_err(|err| format!("{err:?}")) } -} -fn observation_evidence_raw( - row: &BTreeMap, - chrom: &str, - inferred_sex: Option<&SexInference>, -) -> String { - let mut evidence_raw = row.get("evidence").cloned().unwrap_or_default(); - if !is_haploid_sex_chromosome(chrom) { - return evidence_raw; + fn resolve(&self, base: &str, relative: &str) -> Result { + self.resolve(base, relative) + .map_err(|err| format!("{err:?}")) } - let Some(inferred_sex) = inferred_sex else { - return evidence_raw; - }; - let sex_evidence = sex_inference_evidence_raw(inferred_sex); - if sex_evidence.is_empty() { - return evidence_raw; - } - if evidence_raw.is_empty() { - evidence_raw = sex_evidence; - } else { - evidence_raw.push_str(" | "); - evidence_raw.push_str(&sex_evidence); - } - evidence_raw -} - -fn sex_inference_evidence_raw(inferred_sex: &SexInference) -> String { - let sex = match inferred_sex.sex { - InferredSex::Male => "male", - InferredSex::Female => "female", - InferredSex::Unknown => "unknown", - }; - let confidence = match inferred_sex.confidence { - SexDetectionConfidence::High => "high", - SexDetectionConfidence::Medium => "medium", - SexDetectionConfidence::Low => "low", - }; - let mut fields = vec![ - format!("detected_sex={sex}"), - format!("sex_confidence={confidence}"), - format!("sex_method={}", inferred_sex.method), - ]; - fields.extend( - inferred_sex - .evidence - .iter() - .map(|item| format!("sex_{item}")), - ); - fields.join(" ") -} - -fn is_haploid_sex_chromosome(chrom: &str) -> bool { - matches!( - chrom - .trim() - .trim_start_matches("chr") - .trim_start_matches("CHR") - .to_ascii_uppercase() - .as_str(), - "X" | "Y" | "23" | "24" - ) } diff --git a/rust/bioscript-wasm/src/report_workspace/analysis.rs b/rust/bioscript-wasm/src/report_workspace/analysis.rs index d54c822..c67ae08 100644 --- a/rust/bioscript-wasm/src/report_workspace/analysis.rs +++ b/rust/bioscript-wasm/src/report_workspace/analysis.rs @@ -1,6 +1,7 @@ use super::*; impl PackageWorkspace { + #[allow(clippy::too_many_arguments)] pub(crate) fn run_manifest_analyses( &self, manifest_path: &str, @@ -11,56 +12,27 @@ impl PackageWorkspace { loader: &GenotypeLoadOptions, options: &ReportOptionsInput, ) -> Result, JsError> { - match self.schema(manifest_path)?.as_str() { - "bioscript:panel:1.0" => { - let panel = self.load_panel(manifest_path)?; - let mut analyses = self.run_interpretations( - manifest_path, - &panel.name, - &panel.interpretations, - input_name, - input_bytes, - preloaded_observations, - participant_id, - loader, - options, - )?; - for member in &panel.members { - if member.kind != "assay" { - continue; - } - let Some(path) = &member.path else { - continue; - }; - let resolved = self.resolve(manifest_path, path)?; - analyses.extend(self.run_manifest_analyses( - &resolved, - input_name, - input_bytes, - preloaded_observations, - participant_id, - loader, - options, - )?); - } - Ok(analyses) - } - "bioscript:assay:1.0" => { - let assay = self.load_assay(manifest_path)?; - self.run_interpretations( - manifest_path, - &assay.name, - &assay.interpretations, - input_name, - input_bytes, - preloaded_observations, - participant_id, - loader, - options, - ) - } - _ => Ok(Vec::new()), + let tasks = bioscript_reporting::collect_analysis_manifest_tasks( + self, + manifest_path, + &options.filters, + ) + .map_err(|err| JsError::new(&err))?; + let mut analyses = Vec::new(); + for task in tasks { + analyses.extend(self.run_interpretations( + &task.manifest_path, + &task.manifest_name, + &task.interpretations, + input_name, + input_bytes, + preloaded_observations, + participant_id, + loader, + options, + )?); } + Ok(analyses) } #[allow(clippy::too_many_arguments)] @@ -78,17 +50,17 @@ impl PackageWorkspace { ) -> Result, JsError> { let mut outputs = Vec::new(); for interpretation in interpretations { - if interpretation.kind != "bioscript" { - return Err(JsError::new(&format!( - "analysis '{}' uses unsupported kind '{}'", - interpretation.id, interpretation.kind - ))); - } + bioscript_reporting::validate_bioscript_interpretation(interpretation) + .map_err(|err| JsError::new(&err))?; let script_path = self.resolve(manifest_path, &interpretation.path)?; - let analysis_output_file = format!( - "analysis/{participant_id}/{}.{}", - interpretation.id, - interpretation.output_format.as_deref().unwrap_or("json") + let analysis_format = bioscript_reporting::analysis_output_format( + interpretation.output_format.as_deref(), + ) + .map_err(|err| JsError::new(&err))?; + let analysis_output_file = bioscript_reporting::analysis_output_relative_file( + participant_id, + &interpretation.id, + analysis_format.extension, ); let output_file = options .output_dir @@ -96,7 +68,21 @@ impl PackageWorkspace { .filter(|dir| !dir.is_empty()) .map(|dir| format!("{}/{}", dir.trim_end_matches('/'), analysis_output_file)) .unwrap_or(analysis_output_file); + let observations_output_file = bioscript_reporting::analysis_observations_relative_file( + participant_id, + &interpretation.id, + ); + let observations_file = options + .output_dir + .as_deref() + .filter(|dir| !dir.is_empty()) + .map(|dir| format!("{}/{}", dir.trim_end_matches('/'), observations_output_file)) + .unwrap_or(observations_output_file); let mut virtual_text_files = self.files.clone(); + virtual_text_files.insert( + observations_file.clone(), + bioscript_reporting::render_analysis_observations_tsv(preloaded_observations), + ); let mut virtual_binary_files = BTreeMap::new(); virtual_binary_files.insert(input_name.to_owned(), input_bytes.to_vec()); let limits = ResourceLimits::new() @@ -123,6 +109,10 @@ impl PackageWorkspace { vec![ ("input_file", MontyObject::String(input_name.to_owned())), ("output_file", MontyObject::String(output_file.clone())), + ( + "observations_file", + MontyObject::String(observations_file.clone()), + ), ( "participant_id", MontyObject::String(participant_id.to_owned()), @@ -139,41 +129,21 @@ impl PackageWorkspace { interpretation.id )) })?; - let format = interpretation - .output_format - .as_deref() - .unwrap_or("json") - .to_ascii_lowercase(); - let (rows, row_headers) = parse_analysis_output_text(text, &format)?; - outputs.push(serde_json::json!({ - "schema": "bioscript:analysis-output:1.0", - "version": "1.0", - "participant_id": participant_id, - "assay_id": manifest_name, - "analysis_id": interpretation.id, - "analysis_label": interpretation.label, - "kind": interpretation.kind, - "output_format": format, - "manifest_path": manifest_path, - "script_path": script_path, - "output_file": output_file, - "derived_from": interpretation.derived_from, - "emits": interpretation.emits.iter().map(|emit| serde_json::json!({ - "key": emit.key, - "label": emit.label, - "value_type": emit.value_type, - "format": emit.format, - })).collect::>(), - "logic": interpretation.logic.as_ref().map(|logic| serde_json::json!({ - "description": logic.description, - "source": logic.source.as_ref().map(|source| serde_json::json!({ - "name": source.name, - "url": source.url, - })), - })), - "row_headers": row_headers, - "rows": rows, - })); + let (rows, row_headers) = parse_analysis_output_text(text, analysis_format.format)?; + outputs.push(bioscript_reporting::analysis_output_json( + bioscript_reporting::AnalysisOutputJsonInput { + participant_id, + assay_id: manifest_name, + interpretation, + output_format: analysis_format.format, + manifest_path, + script_path: &script_path, + output_file: &output_file, + observations_file: Some(&observations_file), + row_headers, + rows, + }, + )); } Ok(outputs) }