From 4e206589d7223dbffc6ae95e1eef50ada4caa2e2 Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Wed, 24 Dec 2025 12:38:58 -0500 Subject: [PATCH] Convert DTA notes to UTF-8 --- src/stata/readstat_dta_read.c | 19 ++++++++++++++++++- src/test/test_list.h | 15 +++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/stata/readstat_dta_read.c b/src/stata/readstat_dta_read.c index ce07c333..27d86d14 100644 --- a/src/stata/readstat_dta_read.c +++ b/src/stata/readstat_dta_read.c @@ -184,6 +184,7 @@ static readstat_error_t dta_read_expansion_fields(dta_ctx_t *ctx) { readstat_error_t retval = READSTAT_OK; readstat_io_t *io = ctx->io; char *buffer = NULL; + char *utf8_note = NULL; if (ctx->expansion_len_len == 0) return READSTAT_OK; @@ -271,7 +272,21 @@ static readstat_error_t dta_read_expansion_fields(dta_ctx_t *ctx) { int index = 0; if (strncmp(&buffer[0], "_dta", 4) == 0 && sscanf(&buffer[ctx->ch_metadata_len], "note%d", &index) == 1) { - if (ctx->handle.note(index, &buffer[2*ctx->ch_metadata_len], ctx->user_ctx) != READSTAT_HANDLER_OK) { + const char *note_src = &buffer[2*ctx->ch_metadata_len]; + size_t note_src_len = len - 2*ctx->ch_metadata_len; + size_t utf8_note_len = 4*note_src_len + 1; + + if ((utf8_note = readstat_realloc(utf8_note, utf8_note_len)) == NULL) { + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + + retval = readstat_convert(utf8_note, utf8_note_len, note_src, + strnlen(note_src, note_src_len), ctx->converter); + if (retval != READSTAT_OK) + goto cleanup; + + if (ctx->handle.note(index, utf8_note, ctx->user_ctx) != READSTAT_HANDLER_OK) { retval = READSTAT_ERROR_USER_ABORT; goto cleanup; } @@ -291,6 +306,8 @@ static readstat_error_t dta_read_expansion_fields(dta_ctx_t *ctx) { cleanup: if (buffer) free(buffer); + if (utf8_note) + free(utf8_note); return retval; } diff --git a/src/test/test_list.h b/src/test/test_list.h index 90f4f6d1..bd4cc169 100644 --- a/src/test/test_list.h +++ b/src/test/test_list.h @@ -795,6 +795,21 @@ static rt_test_group_t _test_groups[] = { .label_set = "somelbl" } } + }, + { + .label = "UTF-8 note", + .test_formats = RT_FORMAT_DTA_118_AND_NEWER, + .rows = 0, + .notes_count = 1, + .notes = { + "Stra" "\xc3\x9f" "e" + }, + .columns = { + { + .name = "var1", + .type = READSTAT_TYPE_DOUBLE + } + } } } },