diff --git a/src/readstat_convert.c b/src/readstat_convert.c
index 874cf97..08db351 100644
--- a/src/readstat_convert.c
+++ b/src/readstat_convert.c
@@ -15,15 +15,33 @@ readstat_error_t readstat_convert(char *dst, size_t dst_len, const char *src, si
     } else if (converter) {
         size_t dst_left = dst_len - 1;
         char *dst_end = dst;
-        size_t status = iconv(converter, (readstat_iconv_inbuf_t)&src, &src_len, &dst_end, &dst_left);
-        if (status == (size_t)-1) {
-            if (errno == E2BIG) {
-                return READSTAT_ERROR_CONVERT_LONG_STRING;
-            } else if (errno == EILSEQ) {
-                return READSTAT_ERROR_CONVERT_BAD_STRING;
-            } else if (errno != EINVAL) { /* EINVAL indicates improper truncation; accept it */
-                return READSTAT_ERROR_CONVERT;
+
+        /* Try conversion, with retry logic for invalid sequences */
+        while (src_len > 0) {
+            size_t status = iconv(converter, (readstat_iconv_inbuf_t)&src, &src_len, &dst_end, &dst_left);
+
+            if (status == (size_t)-1) {
+                if (errno == E2BIG) {
+                    return READSTAT_ERROR_CONVERT_LONG_STRING;
+                } else if (errno == EILSEQ) {
+                    /* Invalid byte sequence - replace with '?' and continue.
+                     * This handles files with encoding errors more gracefully
+                     * than failing completely. */
+                    if (dst_left == 0) {
+                        /* No room for the replacement: report overflow like
+                         * E2BIG instead of silently truncating the output. */
+                        return READSTAT_ERROR_CONVERT_LONG_STRING;
+                    }
+                    *dst_end++ = '?';
+                    dst_left--;
+                    src++;
+                    src_len--;
+                    continue;
+                } else if (errno != EINVAL) { /* EINVAL indicates improper truncation; accept it */
+                    return READSTAT_ERROR_CONVERT;
+                }
             }
+            break;
         }
         dst[dst_len - dst_left - 1] = '\0';
     } else if (src_len + 1 > dst_len) {
diff --git a/src/spss/readstat_sav.h b/src/spss/readstat_sav.h index e417ac4..a76ba60 100644 --- a/src/spss/readstat_sav.h +++ b/src/spss/readstat_sav.h @@ -122,6 +122,7 @@ typedef struct sav_ctx_s { #define SAV_RECORD_SUBTYPE_INTEGER_INFO 3 #define SAV_RECORD_SUBTYPE_FP_INFO 4 #define SAV_RECORD_SUBTYPE_MULTIPLE_RESPONSE_SETS 7 +#define SAV_RECORD_SUBTYPE_MULTIPLE_RESPONSE_SETS_V14 19 #define SAV_RECORD_SUBTYPE_PRODUCT_INFO 10 #define SAV_RECORD_SUBTYPE_VAR_DISPLAY 11 #define SAV_RECORD_SUBTYPE_LONG_VAR_NAME 13 diff --git a/src/spss/readstat_sav_parse_mr_name.c b/src/spss/readstat_sav_parse_mr_name.c index 4bfad0c..f3ee191 100644 --- a/src/spss/readstat_sav_parse_mr_name.c +++ b/src/spss/readstat_sav_parse_mr_name.c @@ -1,3 +1,4 @@ + #line 1 "src/spss/readstat_sav_parse_mr_name.rl" #include #include @@ -6,61 +7,65 @@ #include "../readstat_malloc.h" -#line 8 "src/spss/readstat_sav_parse_mr_name.c" -static const signed char _mr_extractor_actions[] = { - 0, 1, 0, 1, 1, 1, 2, 1, - 3, 1, 4, 0 +#line 11 "src/spss/readstat_sav_parse_mr_name.c" +static const char _mr_extractor_actions[] = { + 0, 1, 0, 1, 1, 1, 2, 1, + 3, 1, 4 }; -static const signed char _mr_extractor_key_offsets[] = { - 0, 0, 8, 17, 19, 22, 24, 27, - 36, 48, 0 +static const char _mr_extractor_key_offsets[] = { + 0, 0, 8, 17, 20, 23, 25, 28, + 40, 41, 42, 44, 45, 57, 61 }; static const char _mr_extractor_trans_keys[] = { - 46, 95, 48, 57, 65, 90, 97, 122, - 46, 61, 95, 48, 57, 65, 90, 97, - 122, 67, 68, 32, 48, 57, 48, 57, - 32, 48, 57, 32, 46, 95, 48, 57, - 65, 90, 97, 122, 0, 32, 46, 95, - 9, 13, 48, 57, 65, 90, 97, 122, - 46, 95, 48, 57, 65, 90, 97, 122, - 0 + 46, 95, 48, 57, 65, 
90, 97, 122, + 46, 61, 95, 48, 57, 65, 90, 97, + 122, 69, 67, 68, 32, 48, 57, 48, + 57, 32, 48, 57, 0, 32, 46, 95, + 9, 13, 48, 57, 65, 90, 97, 122, + 32, 49, 32, 49, 32, 0, 32, 46, + 95, 9, 13, 48, 57, 65, 90, 97, + 122, 0, 32, 9, 13, 0, 32, 46, + 95, 9, 13, 48, 57, 65, 90, 97, + 122, 0 +}; + +static const char _mr_extractor_single_lengths[] = { + 0, 2, 3, 1, 1, 0, 1, 4, + 1, 1, 2, 1, 4, 2, 4 }; -static const signed char _mr_extractor_single_lengths[] = { - 0, 2, 3, 0, 1, 0, 1, 3, - 4, 2, 0 +static const char _mr_extractor_range_lengths[] = { + 0, 3, 3, 1, 1, 1, 1, 4, + 0, 0, 0, 0, 4, 1, 4 }; -static const signed char _mr_extractor_range_lengths[] = { - 0, 3, 3, 1, 1, 1, 1, 3, - 4, 3, 0 +static const char _mr_extractor_index_offsets[] = { + 0, 0, 6, 13, 16, 19, 21, 24, + 33, 35, 37, 40, 42, 51, 55 }; -static const signed char _mr_extractor_index_offsets[] = { - 0, 0, 6, 13, 15, 18, 20, 23, - 30, 39, 0 +static const char _mr_extractor_indicies[] = { + 0, 0, 0, 0, 0, 1, 0, 2, + 0, 0, 0, 0, 1, 4, 3, 1, + 5, 6, 1, 7, 1, 8, 7, 1, + 9, 9, 10, 10, 9, 10, 10, 10, + 1, 11, 1, 12, 1, 6, 13, 1, + 6, 1, 14, 15, 10, 10, 14, 10, + 10, 10, 1, 14, 14, 14, 1, 14, + 14, 10, 10, 14, 10, 10, 10, 1, + 0 }; -static const signed char _mr_extractor_cond_targs[] = { - 2, 2, 2, 2, 2, 0, 2, 3, - 2, 2, 2, 2, 0, 4, 0, 5, - 4, 0, 6, 0, 7, 6, 0, 7, - 8, 8, 8, 8, 8, 0, 9, 9, - 8, 8, 9, 8, 8, 8, 0, 8, - 8, 8, 8, 8, 0, 0, 1, 2, - 3, 4, 5, 6, 7, 8, 9, 0 +static const char _mr_extractor_trans_targs[] = { + 2, 0, 3, 4, 8, 5, 4, 6, + 12, 14, 7, 9, 10, 11, 13, 12 }; -static const signed char _mr_extractor_cond_actions[] = { - 0, 0, 0, 0, 0, 0, 0, 1, - 0, 0, 0, 0, 0, 3, 0, 5, - 0, 0, 0, 0, 7, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 9, 9, - 0, 0, 9, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0 +static const char _mr_extractor_trans_actions[] = { + 0, 0, 1, 3, 3, 5, 0, 0, + 7, 9, 0, 0, 0, 0, 0, 0 }; static const int mr_extractor_start = 1; @@ -68,321 +73,295 @@ static const int 
mr_extractor_start = 1; static const int mr_extractor_en_main = 1; -#line 107 "src/spss/readstat_sav_parse_mr_name.rl" +#line 121 "src/spss/readstat_sav_parse_mr_name.rl" readstat_error_t extract_mr_data(const char *line, mr_set_t *result) { - readstat_error_t retval = READSTAT_OK; - - // Variables needed for Ragel operation - int cs = 0; - char *p = (char *)line; - char *start = p; - char *pe = p + strlen(p) + 1; - - // Variables needed for passing Ragel intermediate results - char mr_type = '\0'; - int mr_counted_value = -1; - int mr_subvar_count = 0; - char **mr_subvariables = NULL; - char *mr_name = NULL; - char *mr_label = NULL; - - // Execute Ragel finite state machine (FSM) - -#line 89 "src/spss/readstat_sav_parse_mr_name.c" + readstat_error_t retval = READSTAT_OK; + + // Variables needed for Ragel operation + int cs = 0; + char *p = (char *)line; + char *start = p; + char *pe = p + strlen(p) + 1; + + // Variables needed for passing Ragel intermediate results + char mr_type = '\0'; + int mr_counted_value = -1; + int mr_subvar_count = 0; + char **mr_subvariables = NULL; + char *mr_name = NULL; + char *mr_label = NULL; + + // Execute Ragel finite state machine (FSM) + +#line 99 "src/spss/readstat_sav_parse_mr_name.c" { - cs = (int)mr_extractor_start; + cs = mr_extractor_start; } - -#line 127 "src/spss/readstat_sav_parse_mr_name.rl" - -#line 94 "src/spss/readstat_sav_parse_mr_name.c" +#line 142 "src/spss/readstat_sav_parse_mr_name.rl" + +#line 106 "src/spss/readstat_sav_parse_mr_name.c" { - int _klen; - unsigned int _trans = 0; - const char * _keys; - const signed char * _acts; - unsigned int _nacts; - _resume: {} - if ( p == pe ) - goto _out; - _keys = ( _mr_extractor_trans_keys + (_mr_extractor_key_offsets[cs])); - _trans = (unsigned int)_mr_extractor_index_offsets[cs]; - - _klen = (int)_mr_extractor_single_lengths[cs]; - if ( _klen > 0 ) { - const char *_lower = _keys; - const char *_upper = _keys + _klen - 1; - const char *_mid; - while ( 1 ) { - if ( 
_upper < _lower ) { - _keys += _klen; - _trans += (unsigned int)_klen; - break; - } - - _mid = _lower + ((_upper-_lower) >> 1); - if ( ( (*( p))) < (*( _mid)) ) - _upper = _mid - 1; - else if ( ( (*( p))) > (*( _mid)) ) - _lower = _mid + 1; - else { - _trans += (unsigned int)(_mid - _keys); - goto _match; - } + int _klen; + unsigned int _trans; + const char *_acts; + unsigned int _nacts; + const char *_keys; + + if ( p == pe ) + goto _test_eof; + if ( cs == 0 ) + goto _out; +_resume: + _keys = _mr_extractor_trans_keys + _mr_extractor_key_offsets[cs]; + _trans = _mr_extractor_index_offsets[cs]; + + _klen = _mr_extractor_single_lengths[cs]; + if ( _klen > 0 ) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + _klen - 1; + while (1) { + if ( _upper < _lower ) + break; + + _mid = _lower + ((_upper-_lower) >> 1); + if ( (*p) < *_mid ) + _upper = _mid - 1; + else if ( (*p) > *_mid ) + _lower = _mid + 1; + else { + _trans += (unsigned int)(_mid - _keys); + goto _match; } } - - _klen = (int)_mr_extractor_range_lengths[cs]; - if ( _klen > 0 ) { - const char *_lower = _keys; - const char *_upper = _keys + (_klen<<1) - 2; - const char *_mid; - while ( 1 ) { - if ( _upper < _lower ) { - _trans += (unsigned int)_klen; - break; - } - - _mid = _lower + (((_upper-_lower) >> 1) & ~1); - if ( ( (*( p))) < (*( _mid)) ) - _upper = _mid - 2; - else if ( ( (*( p))) > (*( _mid + 1)) ) - _lower = _mid + 2; - else { - _trans += (unsigned int)((_mid - _keys)>>1); - break; - } + _keys += _klen; + _trans += _klen; + } + + _klen = _mr_extractor_range_lengths[cs]; + if ( _klen > 0 ) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + (_klen<<1) - 2; + while (1) { + if ( _upper < _lower ) + break; + + _mid = _lower + (((_upper-_lower) >> 1) & ~1); + if ( (*p) < _mid[0] ) + _upper = _mid - 2; + else if ( (*p) > _mid[1] ) + _lower = _mid + 2; + else { + _trans += (unsigned int)((_mid - _keys)>>1); + goto _match; } } - - _match: {} - cs 
= (int)_mr_extractor_cond_targs[_trans]; - - if ( _mr_extractor_cond_actions[_trans] != 0 ) { - - _acts = ( _mr_extractor_actions + (_mr_extractor_cond_actions[_trans])); - _nacts = (unsigned int)(*( _acts)); - _acts += 1; - while ( _nacts > 0 ) { - switch ( (*( _acts)) ) - { - case 0: { - { + _trans += _klen; + } + +_match: + _trans = _mr_extractor_indicies[_trans]; + cs = _mr_extractor_trans_targs[_trans]; + + if ( _mr_extractor_trans_actions[_trans] == 0 ) + goto _again; + + _acts = _mr_extractor_actions + _mr_extractor_trans_actions[_trans]; + _nacts = (unsigned int) *_acts++; + while ( _nacts-- > 0 ) + { + switch ( *_acts++ ) + { + case 0: #line 10 "src/spss/readstat_sav_parse_mr_name.rl" - - mr_name = (char *)readstat_malloc(p - start + 1); - if (mr_name == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - memcpy(mr_name, start, p - start); - mr_name[p - start] = '\0'; - } - -#line 177 "src/spss/readstat_sav_parse_mr_name.c" - - break; - } - case 1: { - { + { + mr_name = (char *)readstat_malloc(p - start + 1); + if (mr_name == NULL) { + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + memcpy(mr_name, start, p - start); + mr_name[p - start] = '\0'; + } + break; + case 1: #line 20 "src/spss/readstat_sav_parse_mr_name.rl" - - mr_type = *p; - start = p + 1; - } - -#line 188 "src/spss/readstat_sav_parse_mr_name.c" - - break; - } - case 2: { - { + { + mr_type = *p; + start = p + 1; + } + break; + case 2: #line 25 "src/spss/readstat_sav_parse_mr_name.rl" - - int n_cv_digs = p - start; - char *n_dig_str = (char *)readstat_malloc(n_cv_digs + 1); - if (n_dig_str == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - memcpy(n_dig_str, start, n_cv_digs); - n_dig_str[n_cv_digs] = '\0'; - int n_digs = strtol(n_dig_str, NULL, 10); - free(n_dig_str); - if (n_digs != 0) { - char *cv = (char *)readstat_malloc(n_digs + 1); - if (cv == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - memcpy(cv, p + 1, n_digs); - cv[n_digs] = '\0'; - 
mr_counted_value = strtol(cv, NULL, 10); - free(cv); - p = p + 1 + n_digs; - start = p + 1; - } - else { - mr_counted_value = -1; - } - } - -#line 223 "src/spss/readstat_sav_parse_mr_name.c" - - break; - } - case 3: { - { + { + int n_cv_digs = p - start; + char *n_dig_str = (char *)readstat_malloc(n_cv_digs + 1); + if (n_dig_str == NULL) { + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + memcpy(n_dig_str, start, n_cv_digs); + n_dig_str[n_cv_digs] = '\0'; + int n_digs = strtol(n_dig_str, NULL, 10); + free(n_dig_str); + if (n_digs != 0) { + char *cv = (char *)readstat_malloc(n_digs + 1); + if (cv == NULL) { + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + memcpy(cv, p + 1, n_digs); + cv[n_digs] = '\0'; + mr_counted_value = strtol(cv, NULL, 10); + free(cv); + p = p + 1 + n_digs; + start = p + 1; + } + else { + mr_counted_value = -1; + } + } + break; + case 3: #line 54 "src/spss/readstat_sav_parse_mr_name.rl" - - char *lbl_len_str = (char *)readstat_malloc(p - start + 1); - if (lbl_len_str == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - memcpy(lbl_len_str, start, p - start); - lbl_len_str[p - start] = '\0'; - int len = strtol(lbl_len_str, NULL, 10); - free(lbl_len_str); - mr_label = (char *)readstat_malloc(len + 1); - if (mr_label == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - memcpy(mr_label, p + 1, len); - mr_label[len] = '\0'; - p = p + 1 + len; - start = p + 1; - } - -#line 250 "src/spss/readstat_sav_parse_mr_name.c" - - break; - } - case 4: { - { + { + char *lbl_len_str = (char *)readstat_malloc(p - start + 1); + if (lbl_len_str == NULL) { + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + memcpy(lbl_len_str, start, p - start); + lbl_len_str[p - start] = '\0'; + int len = strtol(lbl_len_str, NULL, 10); + free(lbl_len_str); + mr_label = (char *)readstat_malloc(len + 1); + if (mr_label == NULL) { + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + memcpy(mr_label, p + 1, len); + mr_label[len] = '\0'; + p 
= p + 1 + len; + start = p + 1; + } + break; + case 4: #line 75 "src/spss/readstat_sav_parse_mr_name.rl" - - int len = p - start; - char *subvar = (char *)readstat_malloc(len + 1); - if (subvar == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - memcpy(subvar, start, len); - subvar[len] = '\0'; - start = p + 1; - char **new_subvariables = readstat_realloc(mr_subvariables, sizeof(char *) * (mr_subvar_count + 1)); - if (new_subvariables == NULL) { - free(subvar); - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - mr_subvariables = new_subvariables; - mr_subvariables[mr_subvar_count++] = subvar; - } - -#line 276 "src/spss/readstat_sav_parse_mr_name.c" - - break; - } - } - _nacts -= 1; - _acts += 1; - } - - } - - if ( cs != 0 ) { - p += 1; - goto _resume; + { + int len = p - start; + char *subvar = (char *)readstat_malloc(len + 1); + if (subvar == NULL) { + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + memcpy(subvar, start, len); + subvar[len] = '\0'; + start = p + 1; + char **new_subvariables = readstat_realloc(mr_subvariables, sizeof(char *) * (mr_subvar_count + 1)); + if (new_subvariables == NULL) { + free(subvar); + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + mr_subvariables = new_subvariables; + mr_subvariables[mr_subvar_count++] = subvar; + } + break; +#line 275 "src/spss/readstat_sav_parse_mr_name.c" } - _out: {} - } - -#line 128 "src/spss/readstat_sav_parse_mr_name.rl" - - - // Check if FSM finished successfully - if (cs < -#line 296 "src/spss/readstat_sav_parse_mr_name.c" -9 -#line 131 "src/spss/readstat_sav_parse_mr_name.rl" - || p != pe) { - retval = READSTAT_ERROR_BAD_MR_STRING; - goto cleanup; } - - (void)mr_extractor_en_main; - - // Assign parsed values to output parameter - result->name = mr_name; - result->label = mr_label; - result->type = mr_type; - result->counted_value = mr_counted_value; - result->subvariables = mr_subvariables; - result->num_subvars = mr_subvar_count; - if (result->type == 'D') { - 
result->is_dichotomy = 1; + +_again: + if ( cs == 0 ) + goto _out; + if ( ++p != pe ) + goto _resume; + _test_eof: {} + _out: {} } - - cleanup: - if (retval != READSTAT_OK) { - if (mr_subvariables != NULL) { - for (int i = 0; i < mr_subvar_count; i++) { - if (mr_subvariables[i] != NULL) free(mr_subvariables[i]); - } - free(mr_subvariables); - } - if (mr_name != NULL) free(mr_name); - if (mr_label != NULL) free(mr_label); - } - return retval; + +#line 143 "src/spss/readstat_sav_parse_mr_name.rl" + + // Check if FSM finished successfully + if (cs < 12 || p != pe) { + retval = READSTAT_ERROR_BAD_MR_STRING; + goto cleanup; + } + + (void)mr_extractor_en_main; + + // Assign parsed values to output parameter + result->name = mr_name; + result->label = mr_label; + result->type = mr_type; + result->counted_value = mr_counted_value; + result->subvariables = mr_subvariables; + result->num_subvars = mr_subvar_count; + if (result->type == 'D' || result->type == 'E') { + result->is_dichotomy = 1; + } + +cleanup: + if (retval != READSTAT_OK) { + if (mr_subvariables != NULL) { + for (int i = 0; i < mr_subvar_count; i++) { + if (mr_subvariables[i] != NULL) free(mr_subvariables[i]); + } + free(mr_subvariables); + } + if (mr_name != NULL) free(mr_name); + if (mr_label != NULL) free(mr_label); + } + return retval; } readstat_error_t parse_mr_line(const char *line, mr_set_t *result) { - *result = (mr_set_t){0}; - return extract_mr_data(line, result); + *result = (mr_set_t){0}; + return extract_mr_data(line, result); } -#line 335 "src/spss/readstat_sav_parse_mr_name.c" -static const signed char _mr_parser_actions[] = { - 0, 1, 0, 0 +#line 330 "src/spss/readstat_sav_parse_mr_name.c" +static const char _mr_parser_actions[] = { + 0, 1, 0 }; -static const signed char _mr_parser_key_offsets[] = { - 0, 0, 1, 2, 4, 0 +static const char _mr_parser_key_offsets[] = { + 0, 0, 1, 2, 4 }; static const char _mr_parser_trans_keys[] = { 36, 10, 0, 10, 10, 0 }; -static const signed char 
_mr_parser_single_lengths[] = { - 0, 1, 1, 2, 1, 0 +static const char _mr_parser_single_lengths[] = { + 0, 1, 1, 2, 1 +}; + +static const char _mr_parser_range_lengths[] = { + 0, 0, 0, 0, 0 }; -static const signed char _mr_parser_range_lengths[] = { - 0, 0, 0, 0, 0, 0 +static const char _mr_parser_index_offsets[] = { + 0, 0, 2, 4, 7 }; -static const signed char _mr_parser_index_offsets[] = { - 0, 0, 2, 4, 7, 0 +static const char _mr_parser_indicies[] = { + 0, 1, 2, 0, 3, 2, 0, 2, + 0, 0 }; -static const signed char _mr_parser_cond_targs[] = { - 2, 0, 3, 2, 4, 3, 2, 3, - 2, 0, 1, 2, 3, 4, 0 +static const char _mr_parser_trans_targs[] = { + 2, 0, 3, 4 }; -static const signed char _mr_parser_cond_actions[] = { - 0, 0, 1, 0, 0, 1, 0, 1, - 0, 0, 0, 0, 0, 0, 0 +static const char _mr_parser_trans_actions[] = { + 0, 0, 1, 0 }; static const int mr_parser_start = 1; @@ -390,157 +369,166 @@ static const int mr_parser_start = 1; static const int mr_parser_en_main = 1; -#line 202 "src/spss/readstat_sav_parse_mr_name.rl" +#line 216 "src/spss/readstat_sav_parse_mr_name.rl" readstat_error_t parse_mr_string(const char *line, mr_set_t **mr_sets, size_t *n_mr_lines) { - readstat_error_t retval = READSTAT_OK; - int cs = 0; - char *p = (char *)line; - char *start = p; - char *pe = p + strlen(p) + 1; - *mr_sets = NULL; - *n_mr_lines = 0; - - -#line 385 "src/spss/readstat_sav_parse_mr_name.c" + readstat_error_t retval = READSTAT_OK; + int cs = 0; + char *p = (char *)line; + char *start = p; + char *pe = p + strlen(p) + 1; + *mr_sets = NULL; + *n_mr_lines = 0; + + +#line 386 "src/spss/readstat_sav_parse_mr_name.c" { - cs = (int)mr_parser_start; + cs = mr_parser_start; } - -#line 213 "src/spss/readstat_sav_parse_mr_name.rl" - -#line 390 "src/spss/readstat_sav_parse_mr_name.c" +#line 228 "src/spss/readstat_sav_parse_mr_name.rl" + +#line 393 "src/spss/readstat_sav_parse_mr_name.c" { - int _klen; - unsigned int _trans = 0; - const char * _keys; - const signed char * _acts; - unsigned int 
_nacts; - _resume: {} - if ( p == pe ) - goto _out; - _keys = ( _mr_parser_trans_keys + (_mr_parser_key_offsets[cs])); - _trans = (unsigned int)_mr_parser_index_offsets[cs]; - - _klen = (int)_mr_parser_single_lengths[cs]; - if ( _klen > 0 ) { - const char *_lower = _keys; - const char *_upper = _keys + _klen - 1; - const char *_mid; - while ( 1 ) { - if ( _upper < _lower ) { - _keys += _klen; - _trans += (unsigned int)_klen; - break; - } - - _mid = _lower + ((_upper-_lower) >> 1); - if ( ( (*( p))) < (*( _mid)) ) - _upper = _mid - 1; - else if ( ( (*( p))) > (*( _mid)) ) - _lower = _mid + 1; - else { - _trans += (unsigned int)(_mid - _keys); - goto _match; - } - } - } - - _klen = (int)_mr_parser_range_lengths[cs]; - if ( _klen > 0 ) { - const char *_lower = _keys; - const char *_upper = _keys + (_klen<<1) - 2; - const char *_mid; - while ( 1 ) { - if ( _upper < _lower ) { - _trans += (unsigned int)_klen; - break; - } - - _mid = _lower + (((_upper-_lower) >> 1) & ~1); - if ( ( (*( p))) < (*( _mid)) ) - _upper = _mid - 2; - else if ( ( (*( p))) > (*( _mid + 1)) ) - _lower = _mid + 2; - else { - _trans += (unsigned int)((_mid - _keys)>>1); - break; - } + int _klen; + unsigned int _trans; + const char *_acts; + unsigned int _nacts; + const char *_keys; + + if ( p == pe ) + goto _test_eof; + if ( cs == 0 ) + goto _out; +_resume: + _keys = _mr_parser_trans_keys + _mr_parser_key_offsets[cs]; + _trans = _mr_parser_index_offsets[cs]; + + _klen = _mr_parser_single_lengths[cs]; + if ( _klen > 0 ) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + _klen - 1; + while (1) { + if ( _upper < _lower ) + break; + + _mid = _lower + ((_upper-_lower) >> 1); + if ( (*p) < *_mid ) + _upper = _mid - 1; + else if ( (*p) > *_mid ) + _lower = _mid + 1; + else { + _trans += (unsigned int)(_mid - _keys); + goto _match; } } - - _match: {} - cs = (int)_mr_parser_cond_targs[_trans]; - - if ( _mr_parser_cond_actions[_trans] != 0 ) { - - _acts = ( _mr_parser_actions 
+ (_mr_parser_cond_actions[_trans])); - _nacts = (unsigned int)(*( _acts)); - _acts += 1; - while ( _nacts > 0 ) { - switch ( (*( _acts)) ) - { - case 0: { - { -#line 172 "src/spss/readstat_sav_parse_mr_name.rl" - - char *mln = (char *)readstat_malloc(p - start); - if (mln == NULL) { - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - memcpy(mln, start + 1, p - start); - mln[p - start - 1] = '\0'; - mr_set_t *new_mr_sets = readstat_realloc(*mr_sets, ((*n_mr_lines) + 1) * sizeof(mr_set_t)); - if (new_mr_sets == NULL) { - free(mln); - retval = READSTAT_ERROR_MALLOC; - goto cleanup; - } - *mr_sets = new_mr_sets; - retval = parse_mr_line(mln, &(*mr_sets)[*n_mr_lines]); - free(mln); - if (retval != READSTAT_OK) { - goto cleanup; - } - (*n_mr_lines)++; - start = p + 1; - } - -#line 487 "src/spss/readstat_sav_parse_mr_name.c" - - break; - } - } - _nacts -= 1; - _acts += 1; + _keys += _klen; + _trans += _klen; + } + + _klen = _mr_parser_range_lengths[cs]; + if ( _klen > 0 ) { + const char *_lower = _keys; + const char *_mid; + const char *_upper = _keys + (_klen<<1) - 2; + while (1) { + if ( _upper < _lower ) + break; + + _mid = _lower + (((_upper-_lower) >> 1) & ~1); + if ( (*p) < _mid[0] ) + _upper = _mid - 2; + else if ( (*p) > _mid[1] ) + _lower = _mid + 2; + else { + _trans += (unsigned int)((_mid - _keys)>>1); + goto _match; } - } - - if ( cs != 0 ) { - p += 1; - goto _resume; + _trans += _klen; + } + +_match: + _trans = _mr_parser_indicies[_trans]; + cs = _mr_parser_trans_targs[_trans]; + + if ( _mr_parser_trans_actions[_trans] == 0 ) + goto _again; + + _acts = _mr_parser_actions + _mr_parser_trans_actions[_trans]; + _nacts = (unsigned int) *_acts++; + while ( _nacts-- > 0 ) + { + switch ( *_acts++ ) + { + case 0: +#line 186 "src/spss/readstat_sav_parse_mr_name.rl" + { + char *mln = (char *)readstat_malloc(p - start); + if (mln == NULL) { + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + memcpy(mln, start + 1, p - start); + mln[p - start - 1] = '\0'; + 
mr_set_t *new_mr_sets = readstat_realloc(*mr_sets, ((*n_mr_lines) + 1) * sizeof(mr_set_t)); + if (new_mr_sets == NULL) { + free(mln); + retval = READSTAT_ERROR_MALLOC; + goto cleanup; + } + *mr_sets = new_mr_sets; + retval = parse_mr_line(mln, &(*mr_sets)[*n_mr_lines]); + free(mln); + if (retval != READSTAT_OK) { + goto cleanup; + } + (*n_mr_lines)++; + start = p + 1; + } + break; +#line 493 "src/spss/readstat_sav_parse_mr_name.c" } - _out: {} } - -#line 214 "src/spss/readstat_sav_parse_mr_name.rl" - - if (cs < -#line 506 "src/spss/readstat_sav_parse_mr_name.c" -4 -#line 216 "src/spss/readstat_sav_parse_mr_name.rl" - || p != pe) { - retval = READSTAT_ERROR_BAD_MR_STRING; - goto cleanup; +_again: + if ( cs == 0 ) + goto _out; + if ( ++p != pe ) + goto _resume; + _test_eof: {} + _out: {} } - - (void)mr_parser_en_main; - - cleanup: - return retval; + +#line 229 "src/spss/readstat_sav_parse_mr_name.rl" + + if (cs < 4 || p != pe) { + retval = READSTAT_ERROR_BAD_MR_STRING; + goto cleanup; + } + + (void)mr_parser_en_main; + +cleanup: + if (retval != READSTAT_OK && *mr_sets != NULL) { + // Free all successfully parsed MR sets + for (size_t i = 0; i < *n_mr_lines; i++) { + if ((*mr_sets)[i].name != NULL) free((*mr_sets)[i].name); + if ((*mr_sets)[i].label != NULL) free((*mr_sets)[i].label); + if ((*mr_sets)[i].subvariables != NULL) { + for (size_t j = 0; j < (*mr_sets)[i].num_subvars; j++) { + if ((*mr_sets)[i].subvariables[j] != NULL) { + free((*mr_sets)[i].subvariables[j]); + } + } + free((*mr_sets)[i].subvariables); + } + } + free(*mr_sets); + *mr_sets = NULL; + *n_mr_lines = 0; + } + return retval; } diff --git a/src/spss/readstat_sav_parse_mr_name.rl b/src/spss/readstat_sav_parse_mr_name.rl index 817638b..52e6820 100644 --- a/src/spss/readstat_sav_parse_mr_name.rl +++ b/src/spss/readstat_sav_parse_mr_name.rl @@ -94,14 +94,28 @@ nc = (alnum | '_' | '.' 
); # name character (including dots) name = nc+ '=' > extract_mr_name; - type = ('C' | 'D'){1} > extract_mr_type; + + # Define types + c_type = 'C' > extract_mr_type; + d_type = 'D' > extract_mr_type; + e_type = 'E' > extract_mr_type; + + # For type E, we need an optional pattern for the additional parameters + e_params = ' ' ('1' | '11') ' '; + counted_value = digit* ' ' > extract_counted_value; label = digit+ ' '+ > extract_label; end = (space | '\0'); # subvar token terminator subvariable = (nc+ end >extract_subvar); - main := name type counted_value label subvariable+; + # Define patterns for each type + c_pattern = c_type counted_value label subvariable* end*; + d_pattern = d_type counted_value label subvariable* end*; + e_pattern = e_type e_params counted_value label subvariable* end*; + + # Main pattern is one of the type patterns + main := name (c_pattern | d_pattern | e_pattern); write data nofinal noerror; }%% @@ -142,7 +156,7 @@ readstat_error_t extract_mr_data(const char *line, mr_set_t *result) { result->counted_value = mr_counted_value; result->subvariables = mr_subvariables; result->num_subvars = mr_subvar_count; - if (result->type == 'D') { + if (result->type == 'D' || result->type == 'E') { result->is_dichotomy = 1; } @@ -221,5 +235,23 @@ readstat_error_t parse_mr_string(const char *line, mr_set_t **mr_sets, size_t *n (void)mr_parser_en_main; cleanup: + if (retval != READSTAT_OK && *mr_sets != NULL) { + // Free all successfully parsed MR sets + for (size_t i = 0; i < *n_mr_lines; i++) { + if ((*mr_sets)[i].name != NULL) free((*mr_sets)[i].name); + if ((*mr_sets)[i].label != NULL) free((*mr_sets)[i].label); + if ((*mr_sets)[i].subvariables != NULL) { + for (size_t j = 0; j < (*mr_sets)[i].num_subvars; j++) { + if ((*mr_sets)[i].subvariables[j] != NULL) { + free((*mr_sets)[i].subvariables[j]); + } + } + free((*mr_sets)[i].subvariables); + } + } + free(*mr_sets); + *mr_sets = NULL; + *n_mr_lines = 0; + } return retval; } diff --git 
a/src/spss/readstat_sav_read.c b/src/spss/readstat_sav_read.c
index 731d8ad..ebd33dc 100644
--- a/src/spss/readstat_sav_read.c
+++ b/src/spss/readstat_sav_read.c
@@ -981,6 +981,9 @@ static readstat_error_t sav_parse_machine_integer_info_record(const void *data,
         // but the field only has room for two bytes). So to prevent the client
         // from receiving an invalid byte sequence, we ram everything through
         // our iconv machinery.
+
+        // Invalid byte sequences are now handled in readstat_convert(), which
+        // replaces each bad byte with '?'; this works cross-platform (unlike //IGNORE)
         iconv_t converter = iconv_open(dst_charset, src_charset);
         if (converter == (iconv_t)-1) {
             return READSTAT_ERROR_UNSUPPORTED_CHARSET;
@@ -1364,14 +1367,56 @@ static readstat_error_t sav_parse_records_pass1(sav_ctx_t *ctx) {
                     retval = sav_parse_machine_integer_info_record(data_buf, data_len, ctx);
                     if (retval != READSTAT_OK)
                         goto cleanup;
-                } else if (subtype == SAV_RECORD_SUBTYPE_MULTIPLE_RESPONSE_SETS) {
-                    if (ctx->mr_sets != NULL) {
-                        retval = READSTAT_ERROR_BAD_MR_STRING;
-                        goto cleanup;
-                    }
+                } else if (subtype == SAV_RECORD_SUBTYPE_MULTIPLE_RESPONSE_SETS || subtype == SAV_RECORD_SUBTYPE_MULTIPLE_RESPONSE_SETS_V14) {
+                    // Files may contain multiple MR set records (subtype 7 and/or 19)
+                    // Save existing MR sets to merge with new ones
+                    mr_set_t *old_mr_sets = ctx->mr_sets;
+                    size_t old_count = ctx->multiple_response_sets_length;
+
+                    // Reset context to load new MR sets
+                    ctx->mr_sets = NULL;
+                    ctx->multiple_response_sets_length = 0;
+
                     retval = sav_read_multiple_response_sets(data_len, ctx);
-                    if (retval != READSTAT_OK)
+                    if (retval != READSTAT_OK) {
+                        // Restore old MR sets to context so they get cleaned up properly
+                        ctx->mr_sets = old_mr_sets;
+                        ctx->multiple_response_sets_length = old_count;
                         goto cleanup;
+                    }
+
+                    // Merge with existing MR sets if any
+                    if (old_mr_sets != NULL && old_count > 0) {
+                        size_t total_count = old_count + ctx->multiple_response_sets_length;
+                        mr_set_t *merged = readstat_realloc(old_mr_sets, total_count * sizeof(mr_set_t));
+                        if (merged == NULL) {
+                            // The newly parsed sets are not referenced anywhere else;
+                            // release them before restoring the old sets, or they leak
+                            for (size_t set_idx = 0; set_idx < ctx->multiple_response_sets_length; set_idx++) {
+                                mr_set_t *new_set = &ctx->mr_sets[set_idx];
+                                for (size_t sub_idx = 0; sub_idx < (size_t)new_set->num_subvars; sub_idx++) {
+                                    free(new_set->subvariables[sub_idx]);
+                                }
+                                free(new_set->subvariables);
+                                free(new_set->name);
+                                free(new_set->label);
+                            }
+                            free(ctx->mr_sets);
+
+                            // Restore old MR sets to context so they get cleaned up properly
+                            ctx->mr_sets = old_mr_sets;
+                            ctx->multiple_response_sets_length = old_count;
+                            retval = READSTAT_ERROR_MALLOC;
+                            goto cleanup;
+                        }
+
+                        // Append new MR sets after existing ones
+                        memcpy(merged + old_count, ctx->mr_sets, ctx->multiple_response_sets_length * sizeof(mr_set_t));
+                        free(ctx->mr_sets);
+
+                        ctx->mr_sets = merged;
+                        ctx->multiple_response_sets_length = total_count;
+                    }
                 } else {
                     if (io->seek(data_len, READSTAT_SEEK_CUR, io->io_ctx) == -1) {
                         retval = READSTAT_ERROR_SEEK;