Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 23 additions & 65 deletions rust/bioscript-formats/src/genotype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use zip::ZipArchive;
use bioscript_core::{RuntimeError, VariantObservation, VariantSpec};

mod backends;
mod cache;
mod common;
mod cram_backend;
mod delimited;
Expand All @@ -19,6 +20,7 @@ mod types;
mod vcf;
mod vcf_tokens;

pub(crate) use cache::{match_cached_observation, required_cache_miss};
pub(crate) use common::{describe_query, normalize_genotype, variant_sort_key};
pub use cram_backend::{
observe_cram_deletion_with_reader, observe_cram_indel_with_reader, observe_cram_snp_with_reader,
Expand All @@ -27,14 +29,17 @@ pub(crate) use delimited::{
COMMENT_PREFIXES, DelimitedColumnIndexes, Delimiter, detect_delimiter, parse_streaming_row,
};
use delimited::{RowParser, scan_delimited_variants};
use io::{detect_source_format, is_bgzf_path, read_lines_from_reader, select_zip_entry};
use io::{
detect_source_format, is_bgzf_path, looks_like_vcf_lines, read_lines_from_reader,
select_zip_entry,
};
pub use types::{
BackendCapabilities, GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, QueryKind,
};
use types::{CramBackend, DelimitedBackend, QueryBackend, RsidMapBackend, VcfBackend};
pub use vcf::{
choose_variant_locus_for_assembly, imputed_reference_observation, observe_vcf_snp_with_reader,
observe_vcf_variant_with_reader,
choose_variant_locus_for_assembly, detect_vcf_assembly, imputed_reference_observation,
observe_vcf_snp_with_reader, observe_vcf_variant_with_reader,
};
use vcf::{lookup_indexed_vcf_variants, scan_vcf_variants};
pub(crate) use vcf_tokens::genotype_from_vcf_gt;
Expand Down Expand Up @@ -119,12 +124,16 @@ impl GenotypeStore {
}

pub fn from_bytes(name: &str, bytes: &[u8]) -> Result<Self, RuntimeError> {
// The report pipeline hands us a fixed virtual path (`/input/genotypes`)
// with no extension, so we cannot rely on `name` alone for format
// detection the way `from_file_with_options` can. Sniff the leading
// bytes so a zip/VCF payload is recognised regardless of the name.
let lower = name.to_ascii_lowercase();
if lower.ends_with(".zip") {
if lower.ends_with(".zip") || bytes_look_like_zip(bytes) {
return Self::from_zip_bytes(name, bytes);
}
let reader = BufReader::new(Cursor::new(bytes));
if lower.ends_with(".vcf") {
if lower.ends_with(".vcf") || bytes_look_like_vcf(bytes) {
return Self::from_vcf_reader(reader, name);
}
Self::from_delimited_reader(GenotypeSourceFormat::Text, reader, name)
Expand Down Expand Up @@ -425,68 +434,17 @@ impl GenotypeStore {
}
}

/// Match a `VariantSpec` against a pre-resolved observation list. Tries rsid
/// equality first (most common case for `PGx` panels), then falls back to a
/// chrom+pos+ref+alt match against either `GRCh37` or `GRCh38` loci so cached
/// observations from a CRAM lookup (which may have been done on one assembly)
/// can satisfy a script that supplies the spec on the other.
fn match_cached_observation<'a>(
observations: &'a [VariantObservation],
spec: &VariantSpec,
) -> Option<&'a VariantObservation> {
if let Some(matched) = observations.iter().find(|obs| {
obs.matched_rsid
.as_deref()
.is_some_and(|rsid| spec.rsids.iter().any(|target| target == rsid))
}) {
return Some(matched);
}
let assembly_loci = [spec.grch37.as_ref(), spec.grch38.as_ref()]
.into_iter()
.flatten()
.collect::<Vec<_>>();
let target_ref = spec.reference.as_deref();
let target_alt = spec.alternate.as_deref();
observations.iter().find(|obs| {
let evidence_match = assembly_loci.iter().any(|loci| {
obs.evidence
.iter()
.any(|line| line.contains(&loci.chrom) && line.contains(&loci.start.to_string()))
});
if !evidence_match {
return false;
}
match (target_ref, target_alt) {
(Some(r), Some(a)) => obs
.evidence
.iter()
.any(|line| line.contains(r) && line.contains(a)),
_ => true,
}
})
/// Reports whether `bytes` begins with one of the four-byte ZIP record signatures.
///
/// Three leading signatures are accepted: `PK\x03\x04`, `PK\x05\x06` and `PK\x07\x08`.
/// Inputs shorter than four bytes never match.
fn bytes_look_like_zip(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}

fn required_cache_miss(spec: &VariantSpec) -> RuntimeError {
let rsids = if spec.rsids.is_empty() {
"<none>".to_owned()
} else {
spec.rsids.join("|")
};
let loci = [
spec.grch37
.as_ref()
.map(|locus| format!("grch37:{}:{}-{}", locus.chrom, locus.start, locus.end)),
spec.grch38
.as_ref()
.map(|locus| format!("grch38:{}:{}-{}", locus.chrom, locus.start, locus.end)),
]
.into_iter()
.flatten()
.collect::<Vec<_>>()
.join(",");
RuntimeError::InvalidArguments(format!(
"required preloaded genotype observation missing for rsids={rsids} loci={loci}"
))
/// Reports whether the start of `bytes` reads like a VCF file.
///
/// At most the first 8192 bytes are inspected. They are decoded as UTF-8 with invalid
/// sequences replaced, split into lines, and handed to `looks_like_vcf_lines`.
fn bytes_look_like_vcf(bytes: &[u8]) -> bool {
    const SNIFF_LIMIT: usize = 8192;
    // `get` yields `None` when the payload is shorter than the limit; inspect all of it then.
    let head = bytes.get(..SNIFF_LIMIT).unwrap_or(bytes);
    let probe_lines = String::from_utf8_lossy(head)
        .lines()
        .map(String::from)
        .collect::<Vec<String>>();
    looks_like_vcf_lines(&probe_lines)
}

#[cfg(test)]
Expand Down
65 changes: 65 additions & 0 deletions rust/bioscript-formats/src/genotype/cache.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
use bioscript_core::{RuntimeError, VariantObservation, VariantSpec};

/// Finds the pre-resolved observation that satisfies `spec`, if any.
///
/// The lookup runs in two passes:
///
/// 1. **rsid pass**: returns the first observation whose `matched_rsid` equals any entry
///    of `spec.rsids` (the common case for `PGx` panels).
/// 2. **locus pass**: returns the first observation with an evidence line that contains
///    both the chromosome and the start position of the spec's `GRCh37` or `GRCh38` locus.
///    When the spec carries both a reference and an alternate allele, some evidence line
///    must also contain both. Checking either assembly lets observations cached from a
///    lookup on one assembly satisfy a spec supplied on the other.
///
/// Returns `None` when neither pass finds a match.
pub(crate) fn match_cached_observation<'a>(
    observations: &'a [VariantObservation],
    spec: &VariantSpec,
) -> Option<&'a VariantObservation> {
    let rsid_hit = observations.iter().find(|obs| {
        obs.matched_rsid
            .as_deref()
            .is_some_and(|rsid| spec.rsids.iter().any(|target| target == rsid))
    });
    if rsid_hit.is_some() {
        return rsid_hit;
    }

    // Render each start position once up front rather than once per evidence line of
    // every observation.
    let assembly_loci = [spec.grch37.as_ref(), spec.grch38.as_ref()]
        .into_iter()
        .flatten()
        .map(|locus| (locus, locus.start.to_string()))
        .collect::<Vec<_>>();
    // Allele filtering only applies when both reference and alternate are present.
    let alleles = spec.reference.as_deref().zip(spec.alternate.as_deref());

    observations.iter().find(|obs| {
        // NOTE(review): these are plain substring tests against free-form evidence text,
        // so a short chromosome name or position can match unrelated digits. Confirm the
        // evidence line format before tightening.
        let locus_seen = assembly_loci.iter().any(|(locus, start)| {
            obs.evidence
                .iter()
                .any(|line| line.contains(&locus.chrom) && line.contains(start.as_str()))
        });
        if !locus_seen {
            return false;
        }
        match alleles {
            Some((reference, alternate)) => obs
                .evidence
                .iter()
                .any(|line| line.contains(reference) && line.contains(alternate)),
            None => true,
        }
    })
}

/// Builds the error reported when a required variant has no preloaded observation.
///
/// The message lists the spec's rsids joined with `|` (or `<none>` when there are none)
/// and every locus present on the spec as `assembly:chrom:start-end`, comma separated.
pub(crate) fn required_cache_miss(spec: &VariantSpec) -> RuntimeError {
    let loci = [
        ("grch37", spec.grch37.as_ref()),
        ("grch38", spec.grch38.as_ref()),
    ]
    .into_iter()
    .filter_map(|(assembly, locus)| {
        locus.map(|found| format!("{assembly}:{}:{}-{}", found.chrom, found.start, found.end))
    })
    .collect::<Vec<_>>()
    .join(",");
    let rsids = if spec.rsids.is_empty() {
        String::from("<none>")
    } else {
        spec.rsids.join("|")
    };
    RuntimeError::InvalidArguments(format!(
        "required preloaded genotype observation missing for rsids={rsids} loci={loci}"
    ))
}
2 changes: 1 addition & 1 deletion rust/bioscript-formats/src/genotype/vcf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,6 @@ pub(crate) fn extract_vcf_sample_genotype(
genotype_from_vcf_gt(sample_gt, reference, &alternate_refs)
}

pub(crate) fn detect_vcf_assembly(path: &Path, probe_lines: &[String]) -> Option<Assembly> {
pub fn detect_vcf_assembly(path: &Path, probe_lines: &[String]) -> Option<Assembly> {
detect_assembly(&path.to_string_lossy().to_ascii_lowercase(), probe_lines)
}
2 changes: 1 addition & 1 deletion rust/bioscript-formats/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ mod prepare;

pub use genotype::{
BackendCapabilities, GenotypeLoadOptions, GenotypeSourceFormat, GenotypeStore, QueryKind,
choose_variant_locus_for_assembly, imputed_reference_observation,
choose_variant_locus_for_assembly, detect_vcf_assembly, imputed_reference_observation,
observe_cram_deletion_with_reader, observe_cram_indel_with_reader,
observe_cram_snp_with_reader, observe_vcf_snp_with_reader, observe_vcf_variant_with_reader,
};
Expand Down
2 changes: 1 addition & 1 deletion rust/bioscript-reporting/src/html.rs

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion rust/bioscript-reporting/src/html/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ pub(super) fn render_table_start(out: &mut String, table_id: &str, headers: &[&s
let escaped_id = html_escape(table_id);
let _ = write!(
out,
"<div class=\"table-tools\"><input type=\"search\" placeholder=\"Filter table\" data-filter-for=\"{escaped_id}\" oninput=\"applyTableFilters('{escaped_id}')\"></div><div class=\"table-wrap\"><table id=\"{escaped_id}\"><thead><tr>"
"<div class=\"table-tools\"><input type=\"search\" placeholder=\"Filter table\" data-filter-for=\"{escaped_id}\" oninput=\"scheduleTableFilter('{escaped_id}')\"></div><div class=\"table-wrap\"><table id=\"{escaped_id}\"><thead><tr>"
);
for (index, header) in headers.iter().enumerate() {
let _ = write!(
Expand Down
64 changes: 64 additions & 0 deletions rust/bioscript-reporting/src/manifest.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::{
collections::BTreeSet,
fs,
path::{Path, PathBuf},
};
Expand Down Expand Up @@ -284,6 +285,7 @@ pub fn collect_variant_manifest_tasks(
}
}
}
dedupe_variant_manifest_tasks(&mut tasks);
Ok(tasks)
}
ReportManifestKind::Assay => {
Expand All @@ -305,11 +307,17 @@ pub fn collect_variant_manifest_tasks(
}
}
}
dedupe_variant_manifest_tasks(&mut tasks);
Ok(tasks)
}
}
}

/// Removes tasks whose `manifest_path` already appeared earlier in `tasks`.
///
/// The first task for each path is kept and the relative order of the survivors is
/// unchanged.
fn dedupe_variant_manifest_tasks(tasks: &mut Vec<VariantManifestTask>) {
    let mut seen_paths = BTreeSet::new();
    tasks.retain(|candidate| {
        let first_occurrence = !seen_paths.contains(&candidate.manifest_path);
        if first_occurrence {
            seen_paths.insert(candidate.manifest_path.clone());
        }
        first_occurrence
    });
}

fn load_variant_task(
workspace: &impl ManifestWorkspace,
path: &str,
Expand Down Expand Up @@ -804,6 +812,62 @@ analyses:
assert_eq!(variant_tasks[0].manifest.name, "rs1");
}

// Regression test for task de-duplication: `panel.yaml` lists `catalogue.yaml` directly and
// also lists `assay.yaml`, which lists the same catalogue again. The single variant row in
// `variants.tsv` must therefore produce exactly one task.
#[test]
fn collect_variant_manifest_tasks_dedupes_catalogue_reached_through_panel_and_assay() {
// In-memory workspace holding the four fixture files keyed by path.
let workspace = MapWorkspace {
files: BTreeMap::from([
(
"panel.yaml".to_owned(),
r#"
schema: bioscript:panel:1.0
version: "1.0"
name: panel
members:
- kind: variant-catalogue
path: catalogue.yaml
- kind: assay
path: assay.yaml
"#
.to_owned(),
),
(
"assay.yaml".to_owned(),
r#"
schema: bioscript:assay:1.0
version: "1.0"
name: assay
members:
- kind: variant-catalogue
path: catalogue.yaml
"#
.to_owned(),
),
(
"catalogue.yaml".to_owned(),
r#"
schema: bioscript:variant-catalogue:1.0
version: "1.0"
name: catalogue
variants:
source: variants.tsv
"#
.to_owned(),
),
(
"variants.tsv".to_owned(),
r#"id name rsid gene ref alt kind grch38_chrom grch38_pos
rs1 rs1 rs1 GENE A G snp 1 123
"#
.to_owned(),
),
]),
};

// Collect from the panel root; the third argument is passed as an empty slice here.
let tasks = collect_variant_manifest_tasks(&workspace, "panel.yaml", &[]).unwrap();
// One task only, even though the catalogue is reachable through two paths.
assert_eq!(tasks.len(), 1);
assert_eq!(tasks[0].manifest_path, "catalogue.yaml#rs1");
}

#[test]
fn manifest_context_and_findings_follow_includes_members_and_inherited_bindings() {
let workspace = MapWorkspace {
Expand Down
Loading
Loading