From 7e3234ee3c6d46c64d4402acb102b390cf7a7401 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 14 Nov 2019 18:27:43 +0000 Subject: [PATCH 1/3] Fixed CRAM decode on ultra-long cigar. We can now have many cigar ops (eg ref-skips) that combine to more than 512Mb. This isn't a full fix for long references as CRAM sequence positions are still 32-bit. However this is a starting point down that road and there are still sizes between 28-bit and 32-bit where the CIGAR failed to operate correctly. --- cram/cram_decode.c | 110 +++++++++++++++++++++++++-------------------- cram/cram_encode.c | 4 ++ 2 files changed, 65 insertions(+), 49 deletions(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index dba7cf89c..6f1528dc0 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1087,6 +1087,25 @@ static inline int add_md_char(cram_slice *s, int decode_md, char c, int32_t *md_ return -1; } +static int add_cigar(cram_slice *s, uint64_t cig_len, int cig_op) { + int nlen = 1 + cig_len/(1<<28); + if (s->ncigar + nlen >= s->cigar_alloc) { + s->cigar_alloc = (s->ncigar + nlen) < 1024 ? 1024 : (s->ncigar + nlen)*2; + uint32_t *c2 = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar)); + if (!c2) + return -1; + s->cigar = c2; + } + + while (cig_len) { + int sub_len = cig_len < (1<<28) ? cig_len : (1<<28)-1; + s->cigar[s->ncigar++] = (sub_len<<4) + cig_op; + cig_len -= sub_len; + } + + return 0; +} + /* * Internal part of cram_decode_slice(). * Generates the sequence, quality and cigar components. @@ -1097,13 +1116,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, int has_MD, int has_NM) { int prev_pos = 0, f, r = 0, out_sz = 1; int seq_pos = 1; - int cig_len = 0; + uint64_t cig_len = 0; int64_t ref_pos = cr->apos; int32_t fn, i32; enum cigar_op cig_op = BAM_CMATCH; - uint32_t *cigar = s->cigar; - uint32_t ncigar = s->ncigar; - uint32_t cigar_alloc = s->cigar_alloc; uint32_t nm = 0; int32_t md_dist = 0; int orig_aux = 0; @@ -1134,7 +1150,7 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } ref_pos--; // count from 0 - cr->cigar = ncigar; + cr->cigar = s->ncigar; if (!(ds & (CRAM_FC | CRAM_FP))) goto skip_cigar; @@ -1143,13 +1159,6 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, int32_t pos = 0; char op; - if (ncigar+2 >= cigar_alloc) { - cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024; - if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar)))) - return -1; - s->cigar = cigar; - } - if (ds & CRAM_FC) { if (!c->comp_hdr->codecs[DS_FC]) return -1; r |= c->comp_hdr->codecs[DS_FC]->decode(s, @@ -1231,13 +1240,15 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } #ifdef USE_X if (cig_len && cig_op != BAM_CBASE_MATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } cig_op = BAM_CBASE_MATCH; #else if (cig_len && cig_op != BAM_CMATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } cig_op = BAM_CMATCH; @@ -1258,7 +1269,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, int have_sc = 0; if (cig_len) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } switch (CRAM_MAJOR_VERS(fd->version)) { @@ -1297,7 +1309,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (have_sc) { if (r) return r; - cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP; + if (add_cigar(s, out_sz2, BAM_CSOFT_CLIP) < 0) + goto err; cig_op = BAM_CSOFT_CLIP; seq_pos += out_sz2; } @@ -1308,7 +1321,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, unsigned char base; #ifdef USE_X if (cig_len && cig_op != BAM_CBASE_MISMATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } if (ds & CRAM_BS) { @@ -1323,7 +1337,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, #else int ref_base; if (cig_len && cig_op != BAM_CMATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } if (ds & CRAM_BS) { @@ -1365,7 +1380,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'D': { // Deletion; DL if (cig_len && cig_op != BAM_CDEL) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } if (ds & CRAM_DL) { @@ -1419,7 +1435,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, int32_t out_sz2 = 1; if (cig_len && cig_op != BAM_CINS) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } @@ -1441,7 +1458,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'i': { // Insertion (single base); BA if (cig_len && cig_op != BAM_CINS) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } if (ds & CRAM_BA) { @@ -1463,7 +1481,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, int32_t len = 1; if (cig_len && cig_op != BAM_CMATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } @@ -1513,7 +1532,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, int32_t len = 1; if (cig_len && cig_op != BAM_CMATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } @@ -1537,12 +1557,14 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'B': { // Read base; BA, QS #ifdef USE_X if (cig_len && cig_op != BAM_CBASE_MISMATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } #else if (cig_len && cig_op != BAM_CMATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } #endif @@ -1601,7 +1623,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'H': { // hard clip; HC if (cig_len && cig_op != BAM_CHARD_CLIP) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } if (ds & CRAM_HC) { @@ -1618,7 +1641,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'P': { // padding; PD if (cig_len && cig_op != BAM_CPAD) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } if (ds & CRAM_PD) { @@ -1635,7 +1659,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'N': { // Ref skip; RS if (cig_len && cig_op != BAM_CREF_SKIP) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } if (ds & CRAM_RS) { @@ -1722,21 +1747,17 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, ref_pos += cr->len - seq_pos + 1; } - if (ncigar+1 >= cigar_alloc) { - cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024; - if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar)))) - return -1; - s->cigar = cigar; - } #ifdef USE_X if (cig_len && cig_op != BAM_CBASE_MATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } cig_op = BAM_CBASE_MATCH; #else if (cig_len && cig_op != BAM_CMATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } cig_op = BAM_CMATCH; @@ -1752,17 +1773,11 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (cig_len) { - if (ncigar >= cigar_alloc) { - cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024; - if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar)))) - return -1; - s->cigar = cigar; - } - - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; } - cr->ncigar = ncigar - cr->cigar; + cr->ncigar = s->ncigar - cr->cigar; cr->aend = ref_pos; //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos); @@ -1787,10 +1802,6 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } } - s->cigar = cigar; - s->cigar_alloc = cigar_alloc; - s->ncigar = ncigar; - if (cr->cram_flags & CRAM_FLAG_NO_SEQ) cr->len = 0; @@ -1834,6 +1845,7 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, hts_log_error("CRAM CIGAR extends beyond slice reference extents"); return -1; + err: block_err: return -1; } diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 6c52c5892..8cbf9391c 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -2773,6 +2773,10 @@ static int process_one_read(cram_fd *fd, cram_container *c, c->num_bases += cr->len; cr->apos = bam_pos(b)+1; + if (cr->apos >= UINT32_MAX) { + hts_log_error("Sequence position out of valid range"); + goto err; + } if (c->pos_sorted) { if (cr->apos < s->last_apos) { c->pos_sorted = 0; From 3c079af92b55619e268b94639ffa1bf1e5970fdf Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 15 Nov 2019 11:36:25 +0000 Subject: [PATCH 2/3] Fix sam_hdr_dup to cope with long refs. The h->sdict hash is used to track references that are > 4Gb in size. The dup code didn't copy this. This manifested itself as CRAM SQ headers being truncated (read SAM hdr, dup, write as CRAM hdr). To fix this a function was written that creates or updates the sdict from the hrecs parsed header structs. It's possible this should be called directly from the sam_hdr_create function (part of the SAM format parser) instead of manually keeping track of sdict itself, however doing so would require initialising the new header structs so I haven't done this. This is a general utility, so perhaps should be made a public part of the header API. However IMO the new header API should hide this nuance away and just return the correct data, also ensuring that header updates work correctly and honour the text form. Since c83c9e26 the header API also was using the 32-bit capped target_len in preference to the parsed text from SQ LN fields when they differed. I am assuming this was a decision in what takes priority in BAM where the sequence names and lengths exist in both text and binary form. This commit reverses this and makes the text form always take priority. As this is at least required in some scenarios (long references) it seems easier to simply make it apply in all scenarios. --- header.c | 32 ++++-------------------------- sam.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++ test/test.pl | 6 ++++++ 3 files changed, 65 insertions(+), 28 deletions(-) diff --git a/header.c b/header.c index 5a4a77d3d..08e000cae 100644 --- a/header.c +++ b/header.c @@ -47,7 +47,6 @@ typedef khash_t(rm) rmhash_t; static int sam_hdr_link_pg(sam_hdr_t *bh); static int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap); -static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...); #define MAX_ERROR_QUOTE 320 // Prevent over-long error messages @@ -182,14 +181,10 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs, if (hrecs->ref[nref].ty == NULL) { // Attach header line to existing stub entry. hrecs->ref[nref].ty = h_type; - // Check lengths match; correct if not. - if (len != hrecs->ref[nref].len) { - char tmp[32]; - snprintf(tmp, sizeof(tmp), "%" PRIhts_pos, - hrecs->ref[nref].len); - if (sam_hrecs_update(hrecs, h_type, "LN", tmp, NULL) < 0) - return -1; - } + // The old hrecs length may be incorrect as it is initially + // populated from h->target_len which has a max 32-bit size. + // We always take our latest data as canonical. + hrecs->ref[nref].len = len; if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0) return -1; @@ -2485,25 +2480,6 @@ int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap) { return 0; } -/* - * Adds or updates tag key,value pairs in a header line. - * Eg for adding M5 tags to @SQ lines or updating sort order for the - * @HD line. - * - * Specify multiple key,value pairs ending in NULL. - * - * Returns 0 on success - * -1 on failure - */ -static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...) { - va_list args; - int res; - va_start(args, type); - res = sam_hrecs_vupdate(hrecs, type, args); - va_end(args); - return res; -} - /* * Looks for a specific key in a single sam header line identified by *type. * If prev is non-NULL it also fills this out with the previous tag, to diff --git a/sam.c b/sam.c index 7cd976024..ee1df8225 100644 --- a/sam.c +++ b/sam.c @@ -131,6 +131,53 @@ void sam_hdr_destroy(sam_hdr_t *bh) free(bh); } +/* + * Create or update the header sdict field from the new + * parsed hrecs data. This is important for sam_hdr_dup or + * when reading / creating a new header. + * + * Returns 0 on success, + * -1 on failure + */ +static int sam_hdr_update_sdict(sam_hdr_t *h) { + int i; + khash_t(s2i) *long_refs = h->sdict; + + if (!h->hrecs) + return 0; + + for (i = 0; i < h->n_targets; i++) { + hts_pos_t len = sam_hdr_tid2len(h, i); + const char *name = sam_hdr_tid2name(h, i); + khint_t k; + if (len < UINT32_MAX) { + if (!long_refs) + continue; + + // Check if we have an old length, if so remove it. + k = kh_get(s2i, long_refs, name); + if (k < kh_end(long_refs)) + kh_del(s2i, long_refs, k); + } + + if (!long_refs) { + if (!(h->sdict = long_refs = kh_init(s2i))) + goto err; + } + + // Add / update length + int absent; + k = kh_put(s2i, long_refs, name, &absent); + if (absent < 0) + goto err; + kh_val(long_refs, k) = len; + } + return 0; + + err: + return -1; +} + sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0) { if (h0 == NULL) return NULL; @@ -179,6 +226,14 @@ sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0) h->text[h->l_text] = '\0'; } + if (h0->sdict) { + // We have a reason to use new API, so build it so we + // can replicate sdict. + if (sam_hdr_fill_hrecs(h) < 0 || + sam_hdr_update_sdict(h) < 0) + goto fail; + } + return h; fail: diff --git a/test/test.pl b/test/test.pl index ca0d766c1..69cc7b5e1 100755 --- a/test/test.pl +++ b/test/test.pl @@ -645,6 +645,12 @@ sub test_view testv $opts, "./test_view $tv_args -p longrefs/longref.tmp.sam_ longrefs/longref.tmp.sam.gz"; testv $opts, "./compare_sam.pl longrefs/longref.sam longrefs/longref.tmp.sam_"; + # CRAM disabled for now as the positions cannot be 32-bit. (These tests are useful for + # checking SQ headers only.) + # testv $opts, "./test_view $tv_args -C -o no_ref -p longrefs/longref.tmp.cram longrefs/longref.sam"; + # testv $opts, "./test_view $tv_args -p longrefs/longref.tmp.sam_ longrefs/longref.tmp.cram"; + # testv $opts, "./compare_sam.pl longrefs/longref.sam longrefs/longref.tmp.sam_"; + # Build index and compare with on-the-fly one made earlier. test_compare $opts, "$$opts{path}/test_index -c longrefs/longref.tmp.sam.gz", "longrefs/longref.tmp.sam.gz.csi.otf", "longrefs/longref.tmp.sam.gz.csi", gz=>1; From 441ae6d51a9c296ca5487fbbf046e4ebe2fd980e Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Mon, 18 Nov 2019 14:44:55 +0000 Subject: [PATCH 3/3] Create/update long reference length dictionary (`sdict`) together with `target_name` and `target_len`. Rename `rebuild_target_arrays` to `sam_hdr_rebuild_target_arrays`. Remove unnecessary external condition on `hrecs->refs_changed`. Rearrange the position of `sam_hrecs_rebuild_text`, to reflect its internal usage. --- header.c | 87 +++++++++++++++++++++++++++++++++++++------------------- header.h | 2 +- sam.c | 55 ----------------------------------- 3 files changed, 59 insertions(+), 85 deletions(-) diff --git a/header.c b/header.c index 08e000cae..e34168aac 100644 --- a/header.c +++ b/header.c @@ -744,6 +744,23 @@ static int sam_hrecs_rebuild_lines(const sam_hrecs_t *hrecs, kstring_t *ks) { return 0; } +/* + * Reconstructs a kstring from the header hash table. + * Returns 0 on success + * -1 on failure + */ +int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks) { + ks->l = 0; + + if (!hrecs->h || !hrecs->h->size) { + return kputsn("", 0, ks) >= 0 ? 0 : -1; + } + if (sam_hrecs_rebuild_lines(hrecs, ks) != 0) + return -1; + + return 0; +} + static int sam_hrecs_parse_lines(sam_hrecs_t *hrecs, const char *hdr, size_t len) { size_t i, lno; @@ -911,6 +928,8 @@ int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs, // hrecs->refs_changed is the first ref that has been updated, so ones // before that can be skipped. int i; + khint_t k; + khash_t(s2i) *long_refs = bh->sdict; for (i = refs_changed; i < hrecs->nref; i++) { if (i >= bh->n_targets || strcmp(bh->target_name[i], hrecs->ref[i].name) != 0) { @@ -922,13 +941,38 @@ int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs, } if (hrecs->ref[i].len < UINT32_MAX) { bh->target_len[i] = hrecs->ref[i].len; + + if (!long_refs) + continue; + + // Check if we have an old length, if so remove it. + k = kh_get(s2i, long_refs, bh->target_name[i]); + if (k < kh_end(long_refs)) + kh_del(s2i, long_refs, k); } else { bh->target_len[i] = UINT32_MAX; + + if (!long_refs) { + if (!(bh->sdict = long_refs = kh_init(s2i))) + return -1; + } + + // Add / update length + int absent; + k = kh_put(s2i, long_refs, bh->target_name[i], &absent); + if (absent < 0) + return -1; + kh_val(long_refs, k) = hrecs->ref[i].len; } } // Free up any names that have been removed for (; i < bh->n_targets; i++) { + if (long_refs) { + k = kh_get(s2i, long_refs, bh->target_name[i]); + if (k < kh_end(long_refs)) + kh_del(s2i, long_refs, k); + } free(bh->target_name[i]); } @@ -936,7 +980,11 @@ int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs, return 0; } -static int rebuild_target_arrays(sam_hdr_t *bh) { +/*! Rebuild own target_name and target_len arrays from internal hrecs + * + * @return 0 on success; -1 on failure + */ +static int sam_hdr_rebuild_target_arrays(sam_hdr_t *bh) { if (!bh || !bh->hrecs) return -1; @@ -1097,7 +1145,7 @@ int sam_hdr_fill_hrecs(sam_hdr_t *bh) { bh->hrecs = hrecs; - if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0) + if (sam_hdr_rebuild_target_arrays(bh) != 0) return -1; return 0; @@ -1192,11 +1240,9 @@ int sam_hdr_rebuild(sam_hdr_t *bh) { if (!(hrecs = bh->hrecs)) return bh->text ? 0 : -1; - if (hrecs->refs_changed >= 0) { - if (rebuild_target_arrays(bh) < 0) { - hts_log_error("Header target array rebuild has failed"); - return -1; - } + if (sam_hdr_rebuild_target_arrays(bh) < 0) { + hts_log_error("Header target array rebuild has failed"); + return -1; } /* If header text wasn't changed or header is empty, don't rebuild it. */ @@ -1253,7 +1299,7 @@ int sam_hdr_add_lines(sam_hdr_t *bh, const char *lines, size_t len) { if (sam_hrecs_parse_lines(hrecs, lines, len) != 0) return -1; - if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0) + if (sam_hdr_rebuild_target_arrays(bh) != 0) return -1; hrecs->dirty = 1; @@ -1288,7 +1334,7 @@ int sam_hdr_add_line(sam_hdr_t *bh, const char *type, ...) { va_end(args); if (ret == 0) { - if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0) + if (sam_hdr_rebuild_target_arrays(bh) != 0) return -1; if (hrecs->dirty) @@ -1383,7 +1429,7 @@ int sam_hdr_remove_line_id(sam_hdr_t *bh, const char *type, const char *ID_key, int ret = sam_hrecs_remove_line(hrecs, type, type_found); if (ret == 0) { - if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0) + if (sam_hdr_rebuild_target_arrays(bh) != 0) return -1; if (hrecs->dirty) @@ -1423,7 +1469,7 @@ int sam_hdr_remove_line_pos(sam_hdr_t *bh, const char *type, int position) { int ret = sam_hrecs_remove_line(hrecs, type, type_found); if (ret == 0) { - if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0) + if (sam_hdr_rebuild_target_arrays(bh) != 0) return -1; if (hrecs->dirty) @@ -1553,7 +1599,7 @@ int sam_hdr_update_line(sam_hdr_t *bh, const char *type, ret = sam_hrecs_update_hashes(hrecs, TYPEKEY(type), ty); if (!ret && hrecs->refs_changed >= 0) - ret = rebuild_target_arrays(bh); + ret = sam_hdr_rebuild_target_arrays(bh); if (!ret && hrecs->dirty) redact_header_text(bh); @@ -1900,23 +1946,6 @@ int sam_hdr_remove_tag_id(sam_hdr_t *bh, return ret; } -/* - * Reconstructs a kstring from the header hash table. - * Returns 0 on success - * -1 on failure - */ -int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks) { - ks->l = 0; - - if (!hrecs->h || !hrecs->h->size) { - return kputsn("", 0, ks) >= 0 ? 0 : -1; - } - if (sam_hrecs_rebuild_lines(hrecs, ks) != 0) - return -1; - - return 0; -} - /* * Looks up a reference sequence by name and returns the numerical ID. * Returns -1 if unknown reference; -2 if header could not be parsed. diff --git a/header.h b/header.h index 810a3dda1..c6fbd7f5f 100644 --- a/header.h +++ b/header.h @@ -249,7 +249,7 @@ sam_hrecs_t *sam_hrecs_new(void); */ sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *hrecs); -/*! Update sam_hdr_t target_name and target_len arrays +/*! Update sam_hdr_t target_name and target_len arrays from source sam_hrecs_t * * sam_hdr_t and sam_hrecs_t are specified separately so that sam_hdr_dup * can use it to construct target arrays from the source header. diff --git a/sam.c b/sam.c index ee1df8225..7cd976024 100644 --- a/sam.c +++ b/sam.c @@ -131,53 +131,6 @@ void sam_hdr_destroy(sam_hdr_t *bh) free(bh); } -/* - * Create or update the header sdict field from the new - * parsed hrecs data. This is important for sam_hdr_dup or - * when reading / creating a new header. - * - * Returns 0 on success, - * -1 on failure - */ -static int sam_hdr_update_sdict(sam_hdr_t *h) { - int i; - khash_t(s2i) *long_refs = h->sdict; - - if (!h->hrecs) - return 0; - - for (i = 0; i < h->n_targets; i++) { - hts_pos_t len = sam_hdr_tid2len(h, i); - const char *name = sam_hdr_tid2name(h, i); - khint_t k; - if (len < UINT32_MAX) { - if (!long_refs) - continue; - - // Check if we have an old length, if so remove it. - k = kh_get(s2i, long_refs, name); - if (k < kh_end(long_refs)) - kh_del(s2i, long_refs, k); - } - - if (!long_refs) { - if (!(h->sdict = long_refs = kh_init(s2i))) - goto err; - } - - // Add / update length - int absent; - k = kh_put(s2i, long_refs, name, &absent); - if (absent < 0) - goto err; - kh_val(long_refs, k) = len; - } - return 0; - - err: - return -1; -} - sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0) { if (h0 == NULL) return NULL; @@ -226,14 +179,6 @@ sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0) h->text[h->l_text] = '\0'; } - if (h0->sdict) { - // We have a reason to use new API, so build it so we - // can replicate sdict. - if (sam_hdr_fill_hrecs(h) < 0 || - sam_hdr_update_sdict(h) < 0) - goto fail; - } - return h; fail: