From ac71f8cb76fe5d129002e1dacd815bcc3965dee8 Mon Sep 17 00:00:00 2001 From: Fabian Wahle Date: Fri, 22 May 2026 20:54:51 +0200 Subject: [PATCH] Add chunked hFILE input scheme --- hfile.c | 2 + hfile_internal.h | 4 + hts.c | 13 +- htslib/hfile.h | 4 + multipart.c | 392 +++++++++++++++++++++++++++++++++++++++++++++++ test/test.pl | 26 ++++ 6 files changed, 438 insertions(+), 3 deletions(-) diff --git a/hfile.c b/hfile.c index 097085353..050a3d143 100644 --- a/hfile.c +++ b/hfile.c @@ -1111,6 +1111,7 @@ static int init_add_plugin(void *obj, int (*init)(struct hFILE_plugin *), static int load_hfile_plugins(void) { static const struct hFILE_scheme_handler + chunked = { hopen_chunked_manifest, hfile_always_local, "built-in", 80 }, data = { hopen_mem, hfile_always_local, "built-in", 80 }, file = { hopen_fd_fileuri, hfile_always_local, "built-in", 80 }, preload = { hopen_preload, is_preload_url_remote, "built-in", 80 }; @@ -1119,6 +1120,7 @@ static int load_hfile_plugins(void) if (schemes == NULL) return -1; + hfile_add_scheme_handler("chunked", &chunked); hfile_add_scheme_handler("data", &data); hfile_add_scheme_handler("file", &file); hfile_add_scheme_handler("preload", &preload); diff --git a/hfile_internal.h b/hfile_internal.h index 3c1fc9ab8..be814fd2e 100644 --- a/hfile_internal.h +++ b/hfile_internal.h @@ -57,6 +57,10 @@ struct BGZF; */ struct hFILE *bgzf_hfile(struct BGZF *fp); +/* Opens chunked: manifests as seekable logical files. */ +struct hFILE *hopen_chunked_manifest(const char *url, const char *mode) + HTS_RESULT_USED; + /*! @abstract Closes all hFILE plugins that have been loaded */ diff --git a/hts.c b/hts.c index bb16cdff4..e8fda4787 100644 --- a/hts.c +++ b/hts.c @@ -2827,11 +2827,17 @@ static int idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt) int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt) { int ret, save; + const char *fn_local; + if (idx == NULL || fn == NULL) { errno = EINVAL; return -1; } - char *fnidx = (char*)calloc(1, strlen(fn) + 5); + + fn_local = strncmp(fn, "chunked:", 8) == 0 && !hisremote(fn + 8) + ? fn + 8 : fn; + + char *fnidx = (char*)calloc(1, strlen(fn_local) + 5); if (fnidx == NULL) return -1; - strcpy(fnidx, fn); + strcpy(fnidx, fn_local); switch (fmt) { case HTS_FMT_BAI: strcat(fnidx, ".bai"); break; case HTS_FMT_CSI: strcat(fnidx, ".csi"); break; @@ -4776,8 +4782,9 @@ int hts_idx_check_local(const char *fn, int fmt, char **fnidx) { break; } } else { + if (strncmp(fn, "chunked:", 8) == 0 && !hisremote(fn + 8)) fn_tmp = fn + 8; // Borrowed from hopen_fd_fileuri() - if (strncmp(fn, "file://localhost/", 17) == 0) fn_tmp = fn + 16; + else if (strncmp(fn, "file://localhost/", 17) == 0) fn_tmp = fn + 16; else if (strncmp(fn, "file:///", 8) == 0) fn_tmp = fn + 7; else fn_tmp = fn; #if defined(_WIN32) || defined(__MSYS__) diff --git a/htslib/hfile.h b/htslib/hfile.h index cbf6bf68e..c230235f3 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -69,6 +69,10 @@ The usual `fopen(3)` _mode_ letters are supported: one of `r` (read), `w` (write), `a` (append), optionally followed by any of `+` (update), `e` (close on `exec(2)`), `x` (create exclusively), `:` (indicates scheme-specific variable arguments follow). + +The built-in `chunked:` scheme opens a read-only manifest containing one +chunk filename per line and presents the chunks as one seekable logical file. +Blank lines and lines beginning with `#` are ignored. */ HTSLIB_EXPORT hFILE *hopen(const char *filename, const char *mode, ...) HTS_RESULT_USED; diff --git a/multipart.c b/multipart.c index 12d0df282..0ec2b568b 100644 --- a/multipart.c +++ b/multipart.c @@ -37,6 +37,12 @@ DEALINGS IN THE SOFTWARE. */ #ifndef EPROTO #define EPROTO ENOEXEC #endif +#ifndef ENOTSUP +#define ENOTSUP EINVAL +#endif +#ifndef EOVERFLOW +#define EOVERFLOW ERANGE +#endif typedef struct hfile_part { char *url; @@ -140,6 +146,392 @@ static const struct hFILE_backend multipart_backend = multipart_read, multipart_write, multipart_seek, NULL, multipart_close }; +typedef struct hfile_chunked_part { + char *url; + off_t offset; + off_t length; +} hfile_chunked_part; + +typedef struct { + hFILE base; + hfile_chunked_part *parts; + size_t nparts, maxparts, current; + hFILE *currentfp; + off_t length, position; +} hFILE_chunked; + +static int chunked_has_uri_scheme(const char *s) +{ + int n = 0; + + while (isalnum_c(s[n]) || s[n] == '+' || s[n] == '-' || s[n] == '.') + n++; + + return n > 1 && s[n] == ':'; +} + +static int chunked_is_absolute_path(const char *s) +{ + if (s[0] == '/') + return 1; +#if defined(_WIN32) || defined(__MSYS__) + if (isalpha_c(s[0]) && s[1] == ':' && (s[2] == '/' || s[2] == '\\')) + return 1; +#endif + return 0; +} + +static char *chunked_manifest_dir(const char *manifest_name) +{ + const char *slash; + char *dir; + size_t len; + + if (chunked_has_uri_scheme(manifest_name)) + return NULL; + + slash = strrchr(manifest_name, '/'); + if (!slash) + return NULL; + + len = slash - manifest_name + 1; + dir = malloc(len + 1); + if (!dir) { + errno = ENOMEM; + return NULL; + } + + memcpy(dir, manifest_name, len); + dir[len] = '\0'; + return dir; +} + +static char *chunked_resolve_name(const char *base, const char *name) +{ + char *copy; + + if (!base || chunked_is_absolute_path(name) || chunked_has_uri_scheme(name)) + return strdup(name); + + size_t base_len = strlen(base), name_len = strlen(name); + copy = malloc(base_len + name_len + 1); + if (!copy) + return NULL; + + memcpy(copy, base, base_len); + memcpy(copy + base_len, name, name_len + 1); + return copy; +} + +static void chunked_free_parts(hFILE_chunked *fp) +{ + size_t i; + + for (i = 0; i < fp->nparts; i++) + free(fp->parts[i].url); + free(fp->parts); +} + +static int chunked_close_current(hFILE_chunked *fp) +{ + if (fp->currentfp) { + hFILE *part = fp->currentfp; + fp->currentfp = NULL; + if (hclose(part) < 0) + return -1; + } + + return 0; +} + +static size_t chunked_find_part(hFILE_chunked *fp, off_t offset) +{ + size_t lo = 0, hi = fp->nparts; + + while (lo < hi) { + size_t mid = lo + (hi - lo) / 2; + off_t start = fp->parts[mid].offset; + off_t end = start + fp->parts[mid].length; + + if (offset < start) + hi = mid; + else if (offset >= end) + lo = mid + 1; + else + return mid; + } + + return lo; +} + +static int chunked_open_current(hFILE_chunked *fp) +{ + hfile_chunked_part *part; + off_t offset; + + if (fp->current >= fp->nparts) + return 0; + if (fp->currentfp) + return 0; + + part = &fp->parts[fp->current]; + offset = fp->position - part->offset; + + fp->currentfp = hopen(part->url, "r"); + if (!fp->currentfp) + return -1; + + if (offset != 0 && hseek(fp->currentfp, offset, SEEK_SET) < 0) { + int save = errno; + hclose_abruptly(fp->currentfp); + fp->currentfp = NULL; + errno = save; + return -1; + } + + return 0; +} + +static ssize_t chunked_read(hFILE *fpv, void *buffer, size_t nbytes) +{ + hFILE_chunked *fp = (hFILE_chunked *) fpv; + + if (nbytes == 0) + return 0; + + while (fp->current < fp->nparts) { + hfile_chunked_part *part = &fp->parts[fp->current]; + off_t end = part->offset + part->length; + off_t remaining; + size_t request = nbytes; + ssize_t n; + + if (part->length == 0 || fp->position >= end) { + if (chunked_close_current(fp) < 0) + return -1; + fp->current++; + continue; + } + + if (fp->position < part->offset) { + errno = EIO; + return -1; + } + + if (chunked_open_current(fp) < 0) + return -1; + + remaining = end - fp->position; + if (remaining < (off_t) request) + request = remaining; + + n = hread(fp->currentfp, buffer, request); + if (n < 0) + return -1; + if (n == 0) { + errno = EIO; + return -1; + } + + fp->position += n; + return n; + } + + return 0; +} + +static ssize_t chunked_write(hFILE *fpv, const void *buffer, size_t nbytes) +{ + errno = EROFS; + return -1; +} + +static off_t chunked_seek(hFILE *fpv, off_t offset, int whence) +{ + hFILE_chunked *fp = (hFILE_chunked *) fpv; + off_t base, target; + size_t part; + + switch (whence) { + case SEEK_SET: + base = 0; + break; + case SEEK_CUR: + base = htell(&fp->base); + break; + case SEEK_END: + base = fp->length; + break; + default: + errno = EINVAL; + return -1; + } + + if ((offset < 0 && offset < -base) + || (offset > 0 && offset > fp->length - base)) { + errno = EINVAL; + return -1; + } + target = base + offset; + + part = target == fp->length ? fp->nparts : chunked_find_part(fp, target); + if (part == fp->nparts && target != fp->length) { + errno = EINVAL; + return -1; + } + + if (part != fp->current && chunked_close_current(fp) < 0) + return -1; + + fp->current = part; + fp->position = target; + + if (fp->currentfp) { + off_t part_offset = target - fp->parts[fp->current].offset; + if (hseek(fp->currentfp, part_offset, SEEK_SET) < 0) + return -1; + } + + return target; +} + +static int chunked_close(hFILE *fpv) +{ + hFILE_chunked *fp = (hFILE_chunked *) fpv; + int ret = chunked_close_current(fp); + + chunked_free_parts(fp); + return ret; +} + +static const struct hFILE_backend chunked_backend = +{ + chunked_read, chunked_write, chunked_seek, NULL, chunked_close +}; + +static int chunked_part_length(const char *url, off_t *length) +{ + hFILE *part = hopen(url, "r"); + off_t len; + + if (!part) + return -1; + + len = hseek(part, 0, SEEK_END); + if (len < 0) { + int save = errno; + hclose_abruptly(part); + errno = save; + return -1; + } + + if (hclose(part) < 0) + return -1; + + *length = len; + return 0; +} + +hFILE *hopen_chunked_manifest(const char *url, const char *mode) +{ + hFILE_chunked *fp = NULL; + hFILE *manifest = NULL; + kstring_t line = KS_INITIALIZE; + const char *manifest_name = url + 8; // len("chunked:") = 8 + char *manifest_dir = NULL; + + if (!strchr(mode, 'r') || strchr(mode, '+') || strchr(mode, 'w') + || strchr(mode, 'a')) { + errno = ENOTSUP; + return NULL; + } + + if (*manifest_name == '\0') { + errno = EINVAL; + return NULL; + } + + fp = (hFILE_chunked *) hfile_init(sizeof(*fp), mode, 0); + if (!fp) return NULL; + + fp->parts = NULL; + fp->nparts = fp->maxparts = 0; + fp->current = 0; + fp->currentfp = NULL; + fp->length = fp->position = 0; + + manifest = hopen(manifest_name, "r"); + if (!manifest) + goto fail; + + errno = 0; + manifest_dir = chunked_manifest_dir(manifest_name); + if (!manifest_dir && errno) + goto fail; + + while (ks_clear(&line), khgetline(&line, manifest) == 0) { + char *chunk_name; + off_t chunk_length; + hfile_chunked_part *part; + + if (line.l == 0 || line.s[0] == '#') + continue; + + chunk_name = chunked_resolve_name(manifest_dir, line.s); + if (!chunk_name) + goto fail; + + if (chunked_part_length(chunk_name, &chunk_length) < 0) { + free(chunk_name); + goto fail; + } + if (chunk_length < 0 || chunk_length > INT64_MAX - fp->length) { + free(chunk_name); + errno = EOVERFLOW; + goto fail; + } + + hts_expand(hfile_chunked_part, fp->nparts + 1, fp->maxparts, fp->parts); + part = &fp->parts[fp->nparts++]; + part->url = chunk_name; + part->offset = fp->length; + part->length = chunk_length; + fp->length += chunk_length; + } + + if (herrno(manifest)) + goto fail; + if (hclose(manifest) < 0) { + manifest = NULL; + goto fail; + } + manifest = NULL; + + if (fp->nparts == 0) { + errno = EINVAL; + goto fail; + } + + fp->base.backend = &chunked_backend; + + free(manifest_dir); + ks_free(&line); + return &fp->base; + + fail: + { + int save = errno; + if (manifest) + hclose_abruptly(manifest); + free(manifest_dir); + ks_free(&line); + chunked_free_parts(fp); + hfile_destroy((hFILE *) fp); + errno = save; + } + return NULL; +} + // Returns 'v' (valid value), 'i' (invalid; required GA4GH field missing), // or upon encountering an unexpected token, that token's type. // Explicit `return '?'` means a JSON parsing error, typically a member key diff --git a/test/test.pl b/test/test.pl index eaa65ea30..a12f862b2 100755 --- a/test/test.pl +++ b/test/test.pl @@ -863,6 +863,32 @@ sub test_view testv $opts, "./test_view $tv_args -p $src_sam.bam.sam $src_sam.bam"; testv $opts, "./compare_sam.pl $src_sam $src_sam.bam.sam"; + my $chunk_manifest = "$src_sam.bam.chunks"; + open(my $chunk_in, '<', "$src_sam.bam") + || die "Couldn't open $src_sam.bam : $!\n"; + binmode($chunk_in); + open(my $chunk_list, '>', $chunk_manifest) + || die "Couldn't open $chunk_manifest : $!\n"; + my $chunk_no = 0; + while (read($chunk_in, my $chunk, 101)) { + my $chunk_name = sprintf("%s.bam.part%02d", $src_sam, $chunk_no++); + open(my $chunk_out, '>', $chunk_name) + || die "Couldn't open $chunk_name : $!\n"; + binmode($chunk_out); + print $chunk_out $chunk; + close($chunk_out) || die "Error on closing $chunk_name : $!\n"; + (my $chunk_entry = $chunk_name) =~ s{.*/}{}; + print $chunk_list "$chunk_entry\n"; + } + close($chunk_list) || die "Error on closing $chunk_manifest : $!\n"; + close($chunk_in) || die "Error on closing $src_sam.bam : $!\n"; + + testv $opts, "./test_view $tv_args -p $src_sam.bam.chunked.sam chunked:$chunk_manifest"; + testv $opts, "./compare_sam.pl $src_sam $src_sam.bam.chunked.sam"; + testv $opts, "./test_index -b chunked:$chunk_manifest"; + testv $opts, "./test_view $tv_args -p $src_sam.bam.chunked.region.sam chunked:$chunk_manifest ref:1-100000"; + testv $opts, "./compare_sam.pl $src_sam $src_sam.bam.chunked.region.sam"; + if ($test_view_failures == 0) { passed($opts, "BAM records spanning multiple BGZF block tests"); } else {