diff --git a/src/readstat.h b/src/readstat.h
index bf2c375e..2d058f10 100644
--- a/src/readstat.h
+++ b/src/readstat.h
@@ -298,6 +298,20 @@ typedef int (*readstat_value_label_handler)(const char *val_labels,
         readstat_value_t value, const char *label, void *ctx);
 typedef void (*readstat_error_handler)(const char *error_message, void *ctx);
 typedef int (*readstat_progress_handler)(double progress, void *ctx);
+typedef int (*readstat_invalid_string_handler)(char *dst, size_t dst_len,
+        const char *src, size_t src_len, int obs_index, readstat_variable_t *variable,
+        void *ctx);
+
+int readstat_invalid_string_info(char *dst, size_t dst_len, const char *src, size_t src_len,
+        int obs_index, readstat_variable_t *variable, void *ctx);
+int readstat_invalid_string_copy(char *dst, size_t dst_len, const char *src, size_t src_len,
+        int obs_index, readstat_variable_t *variable, void *ctx);
+int readstat_invalid_string_skip(char *dst, size_t dst_len, const char *src, size_t src_len,
+        int obs_index, readstat_variable_t *variable, void *ctx);
+int readstat_invalid_string_utf8(char *dst, size_t dst_len, const char *src, size_t src_len,
+        int obs_index, readstat_variable_t *variable, void *ctx);
+int readstat_invalid_string_cp1252(char *dst, size_t dst_len, const char *src, size_t src_len,
+        int obs_index, readstat_variable_t *variable, void *ctx);
 
 #if defined(_MSC_VER)
 #include
@@ -334,14 +348,15 @@ typedef struct readstat_io_s {
 } readstat_io_t;
 
 typedef struct readstat_callbacks_s {
-    readstat_metadata_handler      metadata;
-    readstat_note_handler          note;
-    readstat_variable_handler      variable;
-    readstat_fweight_handler       fweight;
-    readstat_value_handler         value;
-    readstat_value_label_handler   value_label;
-    readstat_error_handler         error;
-    readstat_progress_handler      progress;
+    readstat_metadata_handler        metadata;
+    readstat_note_handler            note;
+    readstat_variable_handler        variable;
+    readstat_fweight_handler         fweight;
+    readstat_value_handler           value;
+    readstat_value_label_handler     value_label;
+    readstat_error_handler           error;
+    readstat_progress_handler        progress;
+    readstat_invalid_string_handler  invalid_string;
 } readstat_callbacks_t;
 
 typedef struct readstat_parser_s {
@@ -365,6 +380,7 @@ readstat_error_t readstat_set_value_handler(readstat_parser_t *parser, readstat_
 readstat_error_t readstat_set_value_label_handler(readstat_parser_t *parser, readstat_value_label_handler value_label_handler);
 readstat_error_t readstat_set_error_handler(readstat_parser_t *parser, readstat_error_handler error_handler);
 readstat_error_t readstat_set_progress_handler(readstat_parser_t *parser, readstat_progress_handler progress_handler);
+readstat_error_t readstat_set_invalid_string_handler(readstat_parser_t *parser, readstat_invalid_string_handler invalid_string_handler);
 
 readstat_error_t readstat_set_open_handler(readstat_parser_t *parser, readstat_open_handler open_handler);
 readstat_error_t readstat_set_close_handler(readstat_parser_t *parser, readstat_close_handler close_handler);
diff --git a/src/readstat_convert.c b/src/readstat_convert.c
index 1e54ca58..6676ce9d 100644
--- a/src/readstat_convert.c
+++ b/src/readstat_convert.c
@@ -17,9 +17,9 @@ readstat_error_t readstat_convert(char *dst, size_t dst_len, const char *src, si
         char *dst_end = dst;
         size_t status = iconv(converter, (readstat_iconv_inbuf_t)&src, &src_len, &dst_end, &dst_left);
         if (status == (size_t)-1) {
-            if (errno == E2BIG) {
+            if (errno == E2BIG) { /* E2BIG indicates that the output buffer is not large enough */
                 return READSTAT_ERROR_CONVERT_LONG_STRING;
-            } else if (errno == EILSEQ) {
+            } else if (errno == EILSEQ) { /* EILSEQ indicates an invalid multibyte sequence */
                 return READSTAT_ERROR_CONVERT_BAD_STRING;
             } else if (errno != EINVAL) { /* EINVAL indicates improper truncation; accept it */
                 return READSTAT_ERROR_CONVERT;
@@ -34,3 +34,138 @@ readstat_error_t readstat_convert(char *dst, size_t dst_len, const char *src, si
     }
     return READSTAT_OK;
 }
+
+/*
+ * Built-in handlers for strings that readstat_convert() rejects with
+ * READSTAT_ERROR_CONVERT_BAD_STRING. src/src_len are the raw bytes of the value
+ * (length-delimited; a terminating NUL is not guaranteed) and dst/dst_len is
+ * the buffer that receives the NUL-terminated replacement. The readers continue
+ * with dst when READSTAT_HANDLER_OK is returned and abort the parse otherwise.
+ */
+
+/* Length of src once trailing ASCII space padding has been dropped. The file
+ * formats pad with ASCII spaces even when the data use a non-ASCII encoding. */
+static size_t invalid_string_trimmed_len(const char *src, size_t src_len) {
+    while (src_len && src[src_len-1] == ' ') {
+        src_len--;
+    }
+    return src_len;
+}
+
+/* Report the offending value on stdout and abort the parse; dst is left untouched. */
+int readstat_invalid_string_info(char *dst, size_t dst_len, const char *src, size_t src_len, int obs_index, readstat_variable_t *variable, void *ctx) {
+    /* src is length-delimited, so bound the output with src_len instead of
+     * trusting a terminating NUL */
+    printf("Invalid string in variable %s, row %d: \"%.*s\"\n", variable->name, obs_index, (int)src_len, src);
+
+    return READSTAT_HANDLER_ABORT;
+}
+
+/* Copy the raw bytes to dst unchanged, minus trailing space padding. Aborts when
+ * dst cannot hold the bytes plus the terminating NUL. */
+int readstat_invalid_string_copy(char *dst, size_t dst_len, const char *src, size_t src_len, int obs_index, readstat_variable_t *variable, void *ctx) {
+    src_len = invalid_string_trimmed_len(src, src_len);
+
+    /* ">=" reserves one byte for the NUL without overflowing src_len + 1 */
+    if (src_len >= dst_len) {
+        return READSTAT_HANDLER_ABORT;
+    }
+
+    memcpy(dst, src, src_len);
+    dst[src_len] = '\0';
+
+    return READSTAT_HANDLER_OK;
+}
+
+/* Replace the invalid value with an empty string. */
+int readstat_invalid_string_skip(char *dst, size_t dst_len, const char *src, size_t src_len, int obs_index, readstat_variable_t *variable, void *ctx) {
+    if (dst_len == 0) {
+        return READSTAT_HANDLER_ABORT; /* no room even for the NUL */
+    }
+    dst[0] = '\0';
+
+    return READSTAT_HANDLER_OK;
+}
+
+/* Treat the value as UTF-8 and substitute U+FFFD (the Unicode replacement
+ * character) for every byte iconv rejects. Aborts when dst is too small. */
+int readstat_invalid_string_utf8(char *dst, size_t dst_len, const char *src, size_t src_len, int obs_index, readstat_variable_t *variable, void *ctx) {
+    int retval = READSTAT_HANDLER_ABORT;
+    char *dst_end = dst;
+    const char *src_end = src;
+    size_t src_left = invalid_string_trimmed_len(src, src_len);
+    size_t dst_left;
+    iconv_t converter;
+
+    if (dst_len == 0) {
+        return READSTAT_HANDLER_ABORT; /* dst_len - 1 below would wrap around */
+    }
+    dst_left = dst_len - 1; /* keep one byte for the terminating NUL */
+
+    converter = iconv_open("UTF-8", "UTF-8");
+    if (converter == (iconv_t)-1) {
+        return READSTAT_HANDLER_ABORT;
+    }
+
+    while (src_left > 0) {
+        size_t status = iconv(converter, (readstat_iconv_inbuf_t)&src_end, &src_left, &dst_end, &dst_left);
+        if (status != (size_t)-1) {
+            continue;
+        }
+        if (errno == EILSEQ) { /* EILSEQ indicates an invalid multibyte sequence */
+            if (dst_left < 3) {
+                goto cleanup;
+            }
+            /* emit U+FFFD (EF BF BD in UTF-8) and step over the offending byte */
+            dst_end[0] = (char) 0xEF;
+            dst_end[1] = (char) 0xBF;
+            dst_end[2] = (char) 0xBD;
+            dst_end += 3;
+            dst_left -= 3;
+            src_end += 1;
+            src_left -= 1;
+        } else if (errno == EINVAL) { /* EINVAL indicates improper truncation; accept it */
+            break;
+        } else { /* E2BIG (output buffer too small) or an unexpected failure */
+            goto cleanup;
+        }
+    }
+    *dst_end = '\0';
+    retval = READSTAT_HANDLER_OK;
+
+cleanup:
+    /* reached on every path after iconv_open() so the descriptor is never leaked */
+    iconv_close(converter);
+    return retval;
+}
+
+/* Re-decode the value as WINDOWS-1252 (a common encoding mix-up) and store the
+ * UTF-8 result in dst. Aborts when iconv cannot convert it. */
+int readstat_invalid_string_cp1252(char *dst, size_t dst_len, const char *src, size_t src_len, int obs_index, readstat_variable_t *variable, void *ctx) {
+    char *dst_end = dst;
+    size_t dst_left;
+    size_t status;
+    iconv_t converter;
+
+    if (dst_len == 0) {
+        return READSTAT_HANDLER_ABORT; /* dst_len - 1 below would wrap around */
+    }
+    dst_left = dst_len - 1; /* keep one byte for the terminating NUL */
+    src_len = invalid_string_trimmed_len(src, src_len);
+
+    converter = iconv_open("UTF-8", "WINDOWS-1252");
+    if (converter == (iconv_t)-1) {
+        return READSTAT_HANDLER_ABORT;
+    }
+
+    status = iconv(converter, (readstat_iconv_inbuf_t)&src, &src_len, &dst_end, &dst_left);
+    /* close before inspecting status so a failed conversion does not leak */
+    iconv_close(converter);
+
+    if (status == (size_t)-1) {
+        return READSTAT_HANDLER_ABORT;
+    }
+    *dst_end = '\0';
+
+    return READSTAT_HANDLER_OK;
+}
diff --git a/src/readstat_parser.c b/src/readstat_parser.c
index d0814000..bd61bba8 100644
--- a/src/readstat_parser.c
+++ b/src/readstat_parser.c
@@ -59,6 +59,11 @@ readstat_error_t readstat_set_progress_handler(readstat_parser_t *parser, readst
     return READSTAT_OK;
 }
 
+readstat_error_t readstat_set_invalid_string_handler(readstat_parser_t *parser, readstat_invalid_string_handler invalid_string_handler) {
+    parser->handlers.invalid_string = invalid_string_handler;
+    return READSTAT_OK;
+}
+
 readstat_error_t readstat_set_fweight_handler(readstat_parser_t *parser, readstat_fweight_handler fweight_handler) {
     parser->handlers.fweight = fweight_handler;
     return READSTAT_OK;
diff --git a/src/sas/readstat_sas7bdat_read.c b/src/sas/readstat_sas7bdat_read.c
index cfc27500..898bf0f2 100644
--- a/src/sas/readstat_sas7bdat_read.c
+++ b/src/sas/readstat_sas7bdat_read.c
@@ -399,7 +399,15 @@ static readstat_error_t sas7bdat_handle_data_value(readstat_variable_t *variable
     if (col_info->type == READSTAT_TYPE_STRING) {
         retval = readstat_convert(ctx->scratch_buffer, ctx->scratch_buffer_len,
                 col_data, col_info->width, ctx->converter);
-        if (retval != READSTAT_OK) {
+        if (retval == READSTAT_ERROR_CONVERT_BAD_STRING && ctx->handle.invalid_string) {
+            if (ctx->handle.invalid_string(ctx->scratch_buffer, ctx->scratch_buffer_len,
+                        col_data, col_info->width, ctx->parsed_row_count+1,
+                        variable, ctx->user_ctx) != READSTAT_HANDLER_OK) {
+                retval = READSTAT_ERROR_USER_ABORT;
+                goto cleanup;
+            }
+            retval = READSTAT_OK; /* handler supplied a replacement; clear the conversion error */
+        } else if (retval != READSTAT_OK) {
             if (ctx->handle.error) {
                 snprintf(ctx->error_buf, sizeof(ctx->error_buf),
                         "ReadStat: Error converting string (row=%u, col=%u) to specified encoding: %.*s",
diff --git a/src/sas/readstat_xport_read.c b/src/sas/readstat_xport_read.c
index 0bbb433a..98c5eaa5 100644
--- a/src/sas/readstat_xport_read.c
+++ b/src/sas/readstat_xport_read.c
@@ -565,8 +565,17 @@ static readstat_error_t xport_process_row(xport_ctx_t *ctx, const char *row, siz
                 }
                 retval = readstat_convert(string, 4*variable->storage_width+1,
                         &row[pos], variable->storage_width, ctx->converter);
-                if (retval != READSTAT_OK)
+                if (retval == READSTAT_ERROR_CONVERT_BAD_STRING && ctx->handle.invalid_string) {
+                    if (ctx->handle.invalid_string(string, 4*variable->storage_width+1,
+                                &row[pos], variable->storage_width, ctx->parsed_row_count+1,
+                                variable, ctx->user_ctx) != READSTAT_HANDLER_OK) {
+                        retval = READSTAT_ERROR_USER_ABORT;
+                        goto cleanup;
+                    }
+                    retval = READSTAT_OK; /* handler supplied a replacement; clear the conversion error */
+                } else if (retval != READSTAT_OK) {
                     goto cleanup;
+                }
 
                 value.v.string_value = string;
             } else {
diff --git a/src/spss/readstat_por_read.c b/src/spss/readstat_por_read.c
index 44dc0f48..e27071bf 100644
--- a/src/spss/readstat_por_read.c
+++ b/src/spss/readstat_por_read.c
@@ -616,7 +616,15 @@ static readstat_error_t read_por_file_data(por_ctx_t *ctx) {
                 }
                 rs_retval = readstat_convert(output_string, sizeof(output_string),
                         input_string, strlen(input_string), ctx->converter);
-                if (rs_retval != READSTAT_OK) {
+                if (rs_retval == READSTAT_ERROR_CONVERT_BAD_STRING && ctx->handle.invalid_string) {
+                    if (ctx->handle.invalid_string(output_string, sizeof(output_string),
+                                input_string, strlen(input_string), ctx->obs_count+1,
+                                ctx->variables[i], ctx->user_ctx) != READSTAT_HANDLER_OK) {
+                        rs_retval = READSTAT_ERROR_USER_ABORT;
+                        goto cleanup;
+                    }
+                    rs_retval = READSTAT_OK; /* handler supplied a replacement; clear the conversion error */
+                } else if (rs_retval != READSTAT_OK) {
                     goto cleanup;
                 }
                 value.v.string_value = output_string;
diff --git a/src/spss/readstat_sav_read.c b/src/spss/readstat_sav_read.c
index 184f5343..1c78caf8 100644
--- a/src/spss/readstat_sav_read.c
+++ b/src/spss/readstat_sav_read.c
@@ -726,8 +726,17 @@ static readstat_error_t sav_process_row(unsigned char *buffer, size_t buffer_len
                     if (!ctx->variables[var_info->index]->skip) {
                         retval = readstat_convert(ctx->utf8_string, ctx->utf8_string_len,
                                 ctx->raw_string, raw_str_used, ctx->converter);
-                        if (retval != READSTAT_OK)
+                        if (retval == READSTAT_ERROR_CONVERT_BAD_STRING && ctx->handle.invalid_string) {
+                            if (ctx->handle.invalid_string(ctx->utf8_string, ctx->utf8_string_len,
+                                        ctx->raw_string, raw_str_used, ctx->current_row+1,
+                                        ctx->variables[var_info->index], ctx->user_ctx) != READSTAT_HANDLER_OK) {
+                                retval = READSTAT_ERROR_USER_ABORT;
+                                goto done;
+                            }
+                            retval = READSTAT_OK; /* handler supplied a replacement; clear the conversion error */
+                        } else if (retval != READSTAT_OK) {
                             goto done;
+                        }
                         value.v.string_value = ctx->utf8_string;
                         if (ctx->handle.value(ctx->current_row, ctx->variables[var_info->index],
                                     value, ctx->user_ctx) != READSTAT_HANDLER_OK) {
diff --git a/src/stata/readstat_dta_read.c b/src/stata/readstat_dta_read.c
index ce07c333..9e40d89a 100644
--- a/src/stata/readstat_dta_read.c
+++ b/src/stata/readstat_dta_read.c
@@ -619,8 +619,17 @@ static readstat_error_t dta_handle_row(const unsigned char *buf, dta_ctx_t *ctx)
             size_t str_len = strnlen((const char *)&buf[offset], max_len);
             retval = readstat_convert(str_buf, sizeof(str_buf),
                     (const char *)&buf[offset], str_len, ctx->converter);
-            if (retval != READSTAT_OK)
+            if (retval == READSTAT_ERROR_CONVERT_BAD_STRING && ctx->handle.invalid_string) {
+                if (ctx->handle.invalid_string(str_buf, sizeof(str_buf),
+                            (const char *)&buf[offset], str_len, ctx->current_row+1,
+                            ctx->variables[j], ctx->user_ctx) != READSTAT_HANDLER_OK) {
+                    retval = READSTAT_ERROR_USER_ABORT;
+                    goto cleanup;
+                }
+                retval = READSTAT_OK; /* handler supplied a replacement; clear the conversion error */
+            } else if (retval != READSTAT_OK) {
                 goto cleanup;
+            }
             value.v.string_value = str_buf;
         } else if (value.type == READSTAT_TYPE_STRING_REF) {
             dta_strl_t key = dta_interpret_strl_vo_bytes(ctx, &buf[offset]);
diff --git a/src/txt/readstat_txt_read.c b/src/txt/readstat_txt_read.c
index e48f687a..df557e5e 100644
--- a/src/txt/readstat_txt_read.c
+++ b/src/txt/readstat_txt_read.c
@@ -27,8 +27,16 @@ static readstat_error_t handle_value(readstat_parser_t *parser, iconv_t converte
     if (readstat_type_class(variable->type) == READSTAT_TYPE_CLASS_STRING) {
         converted_value = malloc(4*len+1);
         error = readstat_convert(converted_value, 4 * len + 1, bytes, len, converter);
-        if (error != READSTAT_OK)
+        if (error == READSTAT_ERROR_CONVERT_BAD_STRING && parser->handlers.invalid_string) {
+            if (parser->handlers.invalid_string(converted_value, 4 * len + 1,
+                        bytes, len, obs_index+1, variable, ctx) != READSTAT_HANDLER_OK) {
+                error = READSTAT_ERROR_USER_ABORT;
+                goto cleanup;
+            }
+            error = READSTAT_OK; /* handler supplied a replacement; clear the conversion error */
+        } else if (error != READSTAT_OK) {
             goto cleanup;
+        }
         value.v.string_value = converted_value;
     } else {
         char *endptr = NULL;