From 0f3cb9eee31aaaf0e309212cac8e0cc8a6c80630 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 17 Sep 2024 11:24:16 +0100 Subject: [PATCH] Make the mpileup BAM_CREF_SKIP filter optional. Mpileup removes alignments using the cigar ref skip operator ("N"). This was originally added in 2011 in samtools/samtools#d1643d6 with the commit message of "fixed a bug in indel calling related to unmapped and refskip reads". Unfortunately I don't know what that bug was, but removing the code shows it still works (at least for some data!). We need better understanding of what's going on and why it was added, but this PR makes it optional, keeping the default as before. Note there appears to be no filtering of BAM_CREF_SKIP in indels-2.0 so the option would be a nop there. This is an alternative PR to #2281. I've leave it to the project maintainer as to what is preferable: removing the (no longer needed?) filtering, or keeping the default behaviour identical and adding a new option instead (which is safer, but possibly leads to accidental bad calls due to not noticing a new option has appeared). Fixes #2277 --- bam2bcf.h | 2 +- bam2bcf_edlib.c | 17 ++++++++++------- bam2bcf_indel.c | 17 ++++++++++------- doc/bcftools.txt | 8 ++++++++ mpileup.c | 10 +++++++++- 5 files changed, 38 insertions(+), 16 deletions(-) diff --git a/bam2bcf.h b/bam2bcf.h index 8f8f8db5..18cb4b76 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -108,7 +108,7 @@ plp_cd_t; typedef struct __bcf_callaux_t { - int fmt_flag, ambig_reads; + int fmt_flag, ambig_reads, ref_skip_reads; int capQ, min_baseQ, max_baseQ, delta_baseQ; int openQ, extQ, tandemQ; // for indels uint32_t min_support, max_support; // for collecting indel candidates diff --git a/bam2bcf_edlib.c b/bam2bcf_edlib.c index 4e0a38c3..5ed31725 100644 --- a/bam2bcf_edlib.c +++ b/bam2bcf_edlib.c @@ -1559,18 +1559,21 @@ int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, } } - int qbeg, qpos, qend, tbeg, tend, kk; + int qbeg, qpos, qend, tbeg, tend; uint8_t *seq = bam_get_seq(p->b); - uint32_t *cigar = bam_get_cigar(p->b); if (p->b->core.flag & BAM_FUNMAP) continue; // FIXME: the following loop should be better moved outside; // nonetheless, realignment should be much slower anyway. - for (kk = 0; kk < p->b->core.n_cigar; ++kk) - if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) - break; - if (kk < p->b->core.n_cigar) - continue; + if (!bca->ref_skip_reads) { + uint32_t *cigar = bam_get_cigar(p->b); + int kk; + for (kk = 0; kk < p->b->core.n_cigar; ++kk) + if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) + break; + if (kk < p->b->core.n_cigar) + continue; + } // determine the start and end of sequences for alignment int left2 = left, right2 = right; diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 975504f8..7c337b48 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -845,18 +845,21 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, } } - int qbeg, qpos, qend, tbeg, tend, kk; + int qbeg, qpos, qend, tbeg, tend; uint8_t *seq = bam_get_seq(p->b); - uint32_t *cigar = bam_get_cigar(p->b); if (p->b->core.flag & BAM_FUNMAP) continue; // FIXME: the following loop should be better moved outside; // nonetheless, realignment should be much slower anyway. - for (kk = 0; kk < p->b->core.n_cigar; ++kk) - if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) - break; - if (kk < p->b->core.n_cigar) - continue; + if (!bca->ref_skip_reads) { + uint32_t *cigar = bam_get_cigar(p->b); + int kk; + for (kk = 0; kk < p->b->core.n_cigar; ++kk) + if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) + break; + if (kk < p->b->core.n_cigar) + continue; + } // determine the start and end of sequences for alignment // FIXME: loops over CIGAR multiple times diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 5cced978..951dd1f7 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -2503,6 +2503,14 @@ INFO/DPR .. Deprecated in favor of INFO/AD; Number of high-quality bases for exclude from calling and increment the first value of the AD counter ('incAD0') ['drop'] +*--ref-skip-reads*:: + Include reads with CIGAR 'N' (ref-skip) operators in their alignment. + By default these are filtered out. Enabling this option may be + necessary for calling on some RNASeq pipelines. This works with + the default caller and --indels-cns. The experimental -indels-2.0 + option does not filter out alignments with ref-skips so this option + is unnecessary there. + *-e, --ext-prob* 'INT':: Phred-scaled gap extension sequencing error probability. Reducing 'INT' leads to longer indels [20] diff --git a/mpileup.c b/mpileup.c index 943e0f6f..2aa76e67 100644 --- a/mpileup.c +++ b/mpileup.c @@ -68,7 +68,7 @@ typedef struct _mplp_pileup_t mplp_pileup_t; // Data shared by all bam files typedef struct { int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth, - max_indel_depth, max_read_len, ambig_reads; + max_indel_depth, max_read_len, ambig_reads, ref_skip_reads; uint32_t fmt_flag; int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type; int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels @@ -882,6 +882,7 @@ static int mpileup(mplp_conf_t *conf) conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; conf->bca->fmt_flag = conf->fmt_flag; conf->bca->ambig_reads = conf->ambig_reads; + conf->bca->ref_skip_reads = conf->ref_skip_reads; conf->bca->indel_win_size = conf->indel_win_size; conf->bca->indels_v20 = conf->indels_v20; conf->bca->edlib = conf->edlib; @@ -1291,6 +1292,8 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -p, --per-sample-mF Apply -m and -F per-sample for increased sensitivity\n" " -P, --platforms STR Comma separated list of platforms for indels [all]\n" " --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n"); + fprintf(fp, + " --ref-skip-reads Use reads with N CIGAR op [discard by default]\n"); fprintf(fp, " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias); fprintf(fp, @@ -1381,6 +1384,7 @@ int main_mpileup(int argc, char *argv[]) mplp.fmt_flag = B2B_INFO_BQBZ|B2B_INFO_IDV|B2B_INFO_IMF|B2B_INFO_MQ0F|B2B_INFO_MQBZ|B2B_INFO_MQSBZ|B2B_INFO_RPBZ|B2B_INFO_SCBZ|B2B_INFO_SGB|B2B_INFO_VDB|B2B_FMT_AD; mplp.max_read_len = 500; mplp.ambig_reads = B2B_DROP; + mplp.ref_skip_reads = 0; mplp.indel_win_size = 110; mplp.poly_mqual = 0; mplp.seqQ_offset = 120; @@ -1458,6 +1462,7 @@ int main_mpileup(int argc, char *argv[]) {"seed", required_argument, NULL, 13}, {"ambig-reads", required_argument, NULL, 14}, {"ar", required_argument, NULL, 14}, + {"ref-skip-reads", no_argument, NULL, 29}, {"write-index",optional_argument,NULL,'W'}, {"del-bias", required_argument, NULL, 23}, {"poly-mqual", no_argument, NULL, 24}, @@ -1620,6 +1625,9 @@ int main_mpileup(int argc, char *argv[]) if (mplp.seqQ_offset > 200) mplp.seqQ_offset = 200; break; + case 29: + mplp.ref_skip_reads = 1; + break; case 23: mplp.del_bias = atof(optarg); break; case 24: mplp.poly_mqual = 1; break; case 26: mplp.poly_mqual = 0; break;