diff --git a/ext/json/ext/json.h b/ext/json/ext/json.h index 78b0eee0..a29d37be 100644 --- a/ext/json/ext/json.h +++ b/ext/json/ext/json.h @@ -28,8 +28,13 @@ #else # define JSON_UNREACHABLE_RETURN UNREACHABLE_RETURN #endif + /* shims */ +#ifndef UNDEF_P +#define UNDEF_P(val) (val == Qundef) +#endif + #if SIZEOF_UINT64_T == SIZEOF_LONG_LONG # define INT64T2NUM(x) LL2NUM(x) # define UINT64T2NUM(x) ULL2NUM(x) diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index 5559561e..46da2bb6 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -5,7 +5,7 @@ static VALUE mJSON, eNestingError, eParserError, Encoding_UTF_8; static VALUE CNaN, CInfinity, CMinusInfinity; -static ID i_new, i_try_convert, i_uminus, i_encode, i_at_line, i_at_column; +static ID i_new, i_try_convert, i_uminus, i_encode, i_at_line, i_at_column, i_at_eos; static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_allow_comments, sym_allow_control_characters, sym_allow_invalid_escape, sym_symbolize_names, @@ -58,6 +58,20 @@ typedef struct rvalue_cache_struct { VALUE entries[JSON_RVALUE_CACHE_CAPA]; } rvalue_cache; +static void rvalue_cache_mark(rvalue_cache *cache) +{ + for (int index = 0; index < cache->length; index++) { + rb_gc_mark_movable(cache->entries[index]); + } +} + +static void rvalue_cache_compact(rvalue_cache *cache) +{ + for (int index = 0; index < cache->length; index++) { + cache->entries[index] = rb_gc_location(cache->entries[index]); + } +} + static rb_encoding *enc_utf8; #define JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH 55 @@ -206,7 +220,7 @@ static rvalue_stack *rvalue_stack_spill(rvalue_stack *old_stack, VALUE *handle, static rvalue_stack *rvalue_stack_grow(rvalue_stack *stack, VALUE *handle, rvalue_stack **stack_ref) { - long required = stack->capa * 2; + long required = stack->capa ? stack->capa * 2 : RVALUE_STACK_INITIAL_CAPA; if (stack->type == RVALUE_STACK_STACK_ALLOCATED) { stack = rvalue_stack_spill(stack, handle, stack_ref); @@ -222,8 +236,10 @@ static VALUE rvalue_stack_push(rvalue_stack *stack, VALUE value, VALUE *handle, if (RB_UNLIKELY(stack->head >= stack->capa)) { stack = rvalue_stack_grow(stack, handle, stack_ref); } + stack->ptr[stack->head] = value; stack->head++; + return value; } @@ -372,7 +388,7 @@ typedef struct json_frame_struct { enum json_frame_type type; enum json_frame_phase phase; long value_stack_head; // rvalue_stack->head when this container opened - const char *start_cursor; // object frames only (the '{'); NULL otherwise + size_t start_offset; // object frames only (the '{'); NULL otherwise } json_frame; typedef struct json_frame_stack_struct { @@ -421,7 +437,7 @@ static json_frame_stack *json_frame_stack_spill(json_frame_stack *old_stack, VAL static json_frame_stack *json_frame_stack_grow(json_frame_stack *stack, VALUE *handle, json_frame_stack **stack_ref) { - long required = stack->capa * 2; + long required = stack->capa ? stack->capa * 2 : JSON_FRAME_STACK_INITIAL_CAPA; if (stack->type == RVALUE_STACK_STACK_ALLOCATED) { stack = json_frame_stack_spill(stack, handle, stack_ref); @@ -643,26 +659,49 @@ static VALUE build_parse_error_message(const char *format, JSON_ParserState *sta return message; } -static VALUE parse_error_new(VALUE message, long line, long column) +static VALUE parse_error_new(VALUE message, long line, long column, bool eos) { VALUE exc = rb_exc_new_str(eParserError, message); rb_ivar_set(exc, i_at_line, LONG2NUM(line)); rb_ivar_set(exc, i_at_column, LONG2NUM(column)); + if (eos) { + rb_ivar_set(exc, i_at_eos, Qtrue); + } return exc; } -NORETURN(static) void raise_parse_error(const char *format, JSON_ParserState *state) +NORETURN(static) void raise_parse_error(const char *format, JSON_ParserState *state, bool eos) { long line, column; cursor_position(state, &line, &column); VALUE message = build_parse_error_message(format, state, line, column); - rb_exc_raise(parse_error_new(message, line, column)); + rb_exc_raise(parse_error_new(message, line, column, eos)); +} + +NORETURN(static) void raise_eos_error(const char *format, JSON_ParserState *state) +{ + raise_parse_error(format, state, true); } -NORETURN(static) void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at) +NORETURN(static) void raise_syntax_error(const char *format, JSON_ParserState *state) +{ + raise_parse_error(format, state, false); +} + +NORETURN(static) void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at, bool eos) { state->cursor = at; - raise_parse_error(format, state); + raise_parse_error(format, state, eos); +} + +NORETURN(static) void raise_eos_error_at(const char *format, JSON_ParserState *state, const char *at) +{ + raise_parse_error_at(format, state, at, true); +} + +NORETURN(static) void raise_syntax_error_at(const char *format, JSON_ParserState *state, const char *at) +{ + raise_parse_error_at(format, state, at, false); } /* unicode */ @@ -687,7 +726,7 @@ static const signed char digit_values[256] = { static uint32_t unescape_unicode(JSON_ParserState *state, const char *sp, const char *spe) { if (RB_UNLIKELY(sp > spe - 4)) { - raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2); + raise_eos_error_at("incomplete unicode character escape sequence at %s", state, sp - 2); } const unsigned char *p = (const unsigned char *)sp; @@ -698,7 +737,7 @@ static uint32_t unescape_unicode(JSON_ParserState *state, const char *sp, const const signed char b3 = digit_values[p[3]]; if (RB_UNLIKELY((signed char)(b0 | b1 | b2 | b3) < 0)) { - raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2); + raise_syntax_error_at("incomplete unicode character escape sequence at %s", state, sp - 2); } return ((uint32_t)b0 << 12) | ((uint32_t)b1 << 8) | ((uint32_t)b2 << 4) | (uint32_t)b3; @@ -715,7 +754,7 @@ NOINLINE(static) void json_eat_comments(JSON_ParserState *state, JSON_ParserConfig *config) { if (config->on_comment == JSON_RAISE) { - raise_parse_error("unexpected token %s", state); + raise_syntax_error("unexpected token %s", state); } const char *start = state->cursor; @@ -737,7 +776,7 @@ json_eat_comments(JSON_ParserState *state, JSON_ParserConfig *config) while (true) { const char *next_match = memchr(state->cursor, '*', state->end - state->cursor); if (!next_match) { - raise_parse_error_at("unterminated comment, expected closing '*/'", state, start); + raise_eos_error_at("unterminated comment, expected closing '*/'", state, start); } state->cursor = next_match + 1; @@ -749,7 +788,7 @@ json_eat_comments(JSON_ParserState *state, JSON_ParserConfig *config) break; } default: - raise_parse_error_at("unexpected token %s", state, start); + raise_parse_error_at("unexpected token %s", state, start, eos(state)); break; } @@ -945,13 +984,13 @@ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_Parser uint32_t sur = unescape_unicode(state, pe + 2, stringEnd); if (RB_UNLIKELY((sur & 0xFC00) != 0xDC00)) { - raise_parse_error_at("invalid surrogate pair at %s", state, p); + raise_syntax_error_at("invalid surrogate pair at %s", state, p); } ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) | (sur & 0x3FF)); pe += 5; } else { - raise_parse_error_at("incomplete surrogate pair at %s", state, p); + raise_syntax_error_at("incomplete surrogate pair at %s", state, p); break; } } @@ -961,20 +1000,22 @@ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_Parser p = ++pe; break; } + case 0: + return Qundef; default: if ((unsigned char)*pe < 0x20) { if (!config->allow_control_characters) { if (*pe == '\n') { - raise_parse_error_at("Invalid unescaped newline character (\\n) in string: %s", state, pe - 1); + raise_syntax_error_at("Invalid unescaped newline character (\\n) in string: %s", state, pe - 1); } - raise_parse_error_at("invalid ASCII control character in string: %s", state, pe - 1); + raise_syntax_error_at("invalid ASCII control character in string: %s", state, pe - 1); } } if (config->allow_invalid_escape) { APPEND_CHAR(*pe); } else { - raise_parse_error_at("invalid escape character in string: %s", state, pe - 1); + raise_syntax_error_at("invalid escape character in string: %s", state, pe - 1); } break; } @@ -1129,7 +1170,7 @@ NORETURN(static) void raise_duplicate_key_error(JSON_ParserState *state, VALUE d long line, column; cursor_position(state, &line, &column); rb_str_concat(message, build_parse_error_message("", state, line, column)) ; - rb_exc_raise(parse_error_new(message, line, column)); + rb_exc_raise(parse_error_new(message, line, column, false)); } NOINLINE(static) void json_on_duplicate_key(JSON_ParserState *state, JSON_ParserConfig *config, size_t count, const VALUE *pairs) @@ -1265,7 +1306,7 @@ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfi } default: if (!config->allow_control_characters) { - raise_parse_error("invalid ASCII control character in string: %s", state); + raise_syntax_error("invalid ASCII control character in string: %s", state); } break; } @@ -1273,8 +1314,7 @@ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfi state->cursor++; } while (string_scan(state)); - raise_parse_error("unexpected end of input, expected closing \"", state); - return Qfalse; + return Qundef; } ALWAYS_INLINE(static) VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name) @@ -1283,7 +1323,7 @@ ALWAYS_INLINE(static) VALUE json_parse_string(JSON_ParserState *state, JSON_Pars const char *start = state->cursor; if (RB_UNLIKELY(!string_scan(state))) { - raise_parse_error("unexpected end of input, expected closing \"", state); + return Qundef; } VALUE string; @@ -1382,7 +1422,7 @@ static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig int mantissa_digits = json_parse_digits(state, &mantissa); if (RB_UNLIKELY((first_digit == '0' && mantissa_digits > 1) || (negative && mantissa_digits == 0))) { - raise_parse_error_at("invalid number: %s", state, start); + return Qundef; } // Parse fractional part @@ -1395,7 +1435,7 @@ static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig mantissa_digits += fractional_digits; if (RB_UNLIKELY(!fractional_digits)) { - raise_parse_error_at("invalid number: %s", state, start); + return Qundef; } } @@ -1415,7 +1455,7 @@ static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig int exponent_digits = json_parse_digits(state, &abs_exponent); if (RB_UNLIKELY(!exponent_digits)) { - raise_parse_error_at("invalid number: %s", state, start); + return Qundef; } if (RB_UNLIKELY(exponent_digits >= 20 || abs_exponent > (uint64_t)INT64_MAX)) { @@ -1460,16 +1500,16 @@ static inline long json_frame_entry_count(const json_frame *frame, const rvalue_ // moves on to expecting a ',' or its closing bracket. The caller passes the // frame it already has in hand -- the one that was expecting the value -- which // after a container close is the freshly re-exposed parent. -static inline void json_value_completed(json_frame *frame) +static inline enum json_frame_phase json_value_completed(json_frame *frame) { JSON_ASSERT((int)JSON_PHASE_DONE == (int)JSON_FRAME_ROOT); JSON_ASSERT((int)JSON_PHASE_ARRAY_COMMA == (int)JSON_FRAME_ARRAY); JSON_ASSERT((int)JSON_PHASE_OBJECT_COMMA == (int)JSON_FRAME_OBJECT); - frame->phase = (enum json_frame_phase) frame->type; + return frame->phase = (enum json_frame_phase) frame->type; } -ALWAYS_INLINE(static) bool json_match_keyword(JSON_ParserState *state, const char *keyword, size_t offset) +ALWAYS_INLINE(static) void json_match_keyword(JSON_ParserState *state, bool resumable, const char *keyword, size_t offset) { // It is assumed that since `keyword` is always a literal, the compiler is able to constantize this // `strlen` and several other computations in that routine, such as eliminating the `if (resumable)` branch. @@ -1480,9 +1520,11 @@ ALWAYS_INLINE(static) bool json_match_keyword(JSON_ParserState *state, const cha // That's why we sometime compare starting from the first byte and sometimes from the second. if (rest(state) >= len && (memcmp(state->cursor + offset, keyword + offset, len - offset) == 0)) { state->cursor += len; - return true; + return; } - return false; + + bool eos = rest(state) < len && memcmp(state->cursor, keyword, rest(state)) == 0; + raise_parse_error("unexpected token %s", state, eos); } // Parse an arbitrary JSON value iteratively. This is a state machine driven @@ -1490,7 +1532,10 @@ ALWAYS_INLINE(static) bool json_match_keyword(JSON_ParserState *state, const cha // resume purely from the frame stack. A JSON_FRAME_ROOT frame sits at the // bottom of the stack, so the stack is never empty mid-parse and the document // itself is just another frame whose value, once parsed, leaves its phase DONE. -static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) +// When invoked in resumable mode, it returns true after parsing a complete document. +// If reaching EOS without having parsed a complete document, either returns false +// of raise a JSON::ParserError tagged with `@eos=true`. +ALWAYS_INLINE(static) bool json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config, bool resumable) { json_frame *frame = json_frame_stack_peek(state->frames); @@ -1507,7 +1552,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) JSON_PHASE_DONE: { // The root document value is parsed; it is the lone survivor on // the rvalue stack. - return *rvalue_stack_peek(state->value_stack, 1); + return true; } JSON_PHASE_VALUE: { @@ -1516,68 +1561,110 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) VALUE value; switch (peek(state)) { case 'n': - if (json_match_keyword(state, "null", 0)) { - value = Qnil; - break; - } - raise_parse_error("unexpected token %s", state); + json_match_keyword(state, resumable, "null", 0); + value = Qnil; + break; case 't': - if (json_match_keyword(state, "true", 0)) { - value = Qtrue; - break; - } - raise_parse_error("unexpected token %s", state); + json_match_keyword(state, resumable, "true", 0); + value = Qtrue; + break; case 'f': - if (json_match_keyword(state, "false", 1)) { - value = Qfalse; - break; - } - raise_parse_error("unexpected token %s", state); + json_match_keyword(state, resumable, "false", 1); + value = Qfalse; + break; case 'N': - // Note: memcmp with a small power of two compile to an integer comparison - if (config->allow_nan && json_match_keyword(state, "NaN", 1)) { - value = CNaN; - break; + if (!config->allow_nan) { + raise_syntax_error("unexpected token %s", state); } - raise_parse_error("unexpected token %s", state); + + json_match_keyword(state, resumable, "NaN", 1); + value = CNaN; + break; case 'I': - if (config->allow_nan && json_match_keyword(state, "Infinity", 0)) { - value = CInfinity; - break; + if (!config->allow_nan) { + raise_syntax_error("unexpected token %s", state); } - raise_parse_error("unexpected token %s", state); + + json_match_keyword(state, resumable, "Infinity", 0); + value = CInfinity; + break; case '-': { + const char *start = state->cursor; state->cursor++; - if (config->allow_nan && json_match_keyword(state, "Infinity", 0)) { - value = CMinusInfinity; - } else { - value = json_parse_negative_number(state, config); + + value = json_parse_negative_number(state, config); + + if (RB_UNLIKELY(UNDEF_P(value))) { + if (config->allow_nan && peek(state) == 'I') { + state->cursor = start; + json_match_keyword(state, resumable, "-Infinity", 1); + value = CMinusInfinity; + break; + } + } + + // Top level numbers are ambiguous when parsing streams, we can't + // know if we parsed all the digits if we hit EOS. + if (resumable && eos(state)) { + state->cursor = start; + return false; + } + + if (RB_UNLIKELY(UNDEF_P(value))) { + raise_syntax_error_at("invalid number: %s", state, start); } break; } - case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { + const char *start = state->cursor; value = json_parse_positive_number(state, config); + + // Top level numbers are ambiguous when parsing streams, we can't + // know if we parsed all the digits if we hit EOS. + if (resumable && eos(state)) { + state->cursor = start; + return false; + } + + if (RB_UNLIKELY(UNDEF_P(value))) { + raise_syntax_error_at("invalid number: %s", state, start); + } break; + } - case '"': + case '"': { // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"} + const char *start = state->cursor; value = json_parse_string(state, config, false); + + if (RB_UNLIKELY(UNDEF_P(value))) { + bool is_eos = eos(state); + if (resumable) { + state->cursor = start; + } + raise_parse_error("unexpected end of input, expected closing \"", state, is_eos); + } break; + } case '[': { - state->cursor++; + const char *start = state->cursor++; json_eat_whitespace(state, config); - if (peek(state) == ']') { + const char next = peek(state); + if (next == ']') { state->cursor++; value = json_decode_array(state, config, 0); break; + } else if (resumable && next == 0) { + state->cursor = start; + return false; } state->current_nesting++; @@ -1594,6 +1681,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) }); goto JSON_PHASE_VALUE; } + case '{': { const char *object_start_cursor = state->cursor; @@ -1604,6 +1692,9 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) state->cursor++; value = json_decode_object(state, config, 0); break; + } else if (resumable && eos(state)) { + state->cursor = object_start_cursor; + return false; } state->current_nesting++; @@ -1616,16 +1707,20 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) .type = JSON_FRAME_OBJECT, .phase = JSON_PHASE_OBJECT_KEY, .value_stack_head = state->value_stack->head, - .start_cursor = object_start_cursor, + .start_offset = object_start_cursor - state->start, }); goto JSON_PHASE_OBJECT_KEY; } case 0: - raise_parse_error("unexpected end of input", state); + if (resumable) { + return false; + } else { + raise_eos_error("unexpected end of input", state); + } default: - raise_parse_error("unexpected character: %s", state); + raise_syntax_error("unexpected character: %s", state); } json_push_value(state, config, value); @@ -1636,10 +1731,10 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) case JSON_PHASE_ARRAY_COMMA: goto JSON_PHASE_ARRAY_COMMA; case JSON_PHASE_OBJECT_COMMA: goto JSON_PHASE_OBJECT_COMMA; case JSON_PHASE_VALUE: goto JSON_PHASE_VALUE; - case JSON_PHASE_OBJECT_KEY: JSON_UNREACHABLE_RETURN(Qundef); + case JSON_PHASE_OBJECT_KEY: JSON_UNREACHABLE_RETURN(false); case JSON_PHASE_OBJECT_COLON: goto JSON_PHASE_OBJECT_COLON; } - JSON_UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); } JSON_PHASE_OBJECT_KEY: { @@ -1647,21 +1742,34 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) json_eat_whitespace(state, config); + const char *start = state->cursor; + if (RB_LIKELY(peek(state) == '"')) { - json_push_value(state, config, json_parse_string(state, config, true)); + VALUE string = json_parse_string(state, config, true); + if (UNDEF_P(string)) { + if (resumable) { + state->cursor = start; + return false; + } else { + raise_syntax_error("unexpected end of input, expected closing \"", state); + } + } + json_push_value(state, config, string); frame->phase = JSON_PHASE_OBJECT_COLON; goto JSON_PHASE_OBJECT_COLON; + } else if (resumable && eos(state)) { + return false; } else { // The message differs for the first key vs. a key after a // ',': the first is the only one reached with nothing pushed // for this object yet. if (json_frame_entry_count(frame, state->value_stack) == 0) { - raise_parse_error("expected object key, got %s", state); + raise_syntax_error("expected object key, got %s", state); } else { - raise_parse_error("expected object key, got: %s", state); + raise_syntax_error("expected object key, got: %s", state); } } - JSON_UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); } JSON_PHASE_OBJECT_COLON: { @@ -1673,16 +1781,18 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) state->cursor++; frame->phase = JSON_PHASE_VALUE; goto JSON_PHASE_VALUE; + } else if (resumable && eos(state)) { + return false; } else { // First colon (only the first pair's key is pushed, nothing // else) vs. a later one. if (json_frame_entry_count(frame, state->value_stack) == 1) { - raise_parse_error("expected ':' after object key", state); + raise_syntax_error("expected ':' after object key", state); } else { - raise_parse_error("expected ':' after object key, got: %s", state); + raise_syntax_error("expected ':' after object key, got: %s", state); } } - JSON_UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); } JSON_PHASE_ARRAY_COMMA: { @@ -1708,9 +1818,11 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) long count = json_frame_entry_count(frame, state->value_stack); state->current_nesting--; state->in_array--; - json_frame_stack_pop(state->frames); + json_push_value(state, config, json_decode_array(state, config, count)); + json_frame_stack_pop(state->frames); frame = json_frame_stack_peek(state->frames); + json_value_completed(frame); switch (frame->phase) { @@ -1718,13 +1830,15 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) case JSON_PHASE_ARRAY_COMMA: goto JSON_PHASE_ARRAY_COMMA; case JSON_PHASE_OBJECT_COMMA: goto JSON_PHASE_OBJECT_COMMA; case JSON_PHASE_VALUE: goto JSON_PHASE_VALUE; - case JSON_PHASE_OBJECT_KEY: JSON_UNREACHABLE_RETURN(Qundef); + case JSON_PHASE_OBJECT_KEY: JSON_UNREACHABLE_RETURN(false); case JSON_PHASE_OBJECT_COLON: goto JSON_PHASE_OBJECT_COLON; } + } else if (resumable && next_char == 0) { + return false; } else { - raise_parse_error("expected ',' or ']' after array value", state); + raise_syntax_error("expected ',' or ']' after array value", state); } - JSON_UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); } JSON_PHASE_OBJECT_COMMA: { @@ -1735,9 +1849,9 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) if (RB_LIKELY(next_char == ',')) { state->cursor++; + json_eat_whitespace(state, config); if (config->allow_trailing_comma) { - json_eat_whitespace(state, config); if (peek(state) == '}') { // Trailing comma: stay in COMMA to close on the next iteration. goto JSON_PHASE_OBJECT_COMMA; @@ -1753,7 +1867,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) // Temporary rewind cursor in case an error is raised const char *final_cursor = state->cursor; - state->cursor = frame->start_cursor; + state->cursor = state->start + frame->start_offset; VALUE object = json_decode_object(state, config, count); state->cursor = final_cursor; @@ -1767,13 +1881,15 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) case JSON_PHASE_ARRAY_COMMA: goto JSON_PHASE_ARRAY_COMMA; case JSON_PHASE_OBJECT_COMMA: goto JSON_PHASE_OBJECT_COMMA; case JSON_PHASE_VALUE: goto JSON_PHASE_VALUE; - case JSON_PHASE_OBJECT_KEY: JSON_UNREACHABLE_RETURN(Qundef); + case JSON_PHASE_OBJECT_KEY: JSON_UNREACHABLE_RETURN(false); case JSON_PHASE_OBJECT_COLON: goto JSON_PHASE_OBJECT_COLON; } + } else if (resumable && next_char == 0) { + return false; } else { - raise_parse_error("expected ',' or '}' after object value, got: %s", state); + raise_syntax_error("expected ',' or '}' after object value, got: %s", state); } - JSON_UNREACHABLE_RETURN(Qundef); + JSON_UNREACHABLE_RETURN(false); } JSON_UNREACHABLE_RETURN(Qundef); @@ -1783,7 +1899,7 @@ static void json_ensure_eof(JSON_ParserState *state, JSON_ParserConfig *config) { json_eat_whitespace(state, config); if (!eos(state)) { - raise_parse_error("unexpected token at end of stream %s", state); + raise_syntax_error("unexpected token at end of stream %s", state); } } @@ -1982,7 +2098,11 @@ static VALUE cParser_parse(JSON_ParserConfig *config, VALUE src) }; JSON_ParserState *state = &_state; - VALUE result = json_parse_any(state, config); + json_parse_any(state, config, false); + + // The root document value is parsed; it is the lone survivor on + // the rvalue stack. + VALUE result = *rvalue_stack_peek(state->value_stack, 1); // This may be skipped in case of exception, but // it won't cause a leak. @@ -2058,6 +2178,359 @@ static VALUE cJSON_parser_s_allocate(VALUE klass) return TypedData_Make_Struct(klass, JSON_ParserConfig, &JSON_ParserConfig_type, config); } +typedef struct JSON_ResumableParserStruct { + JSON_ParserConfig config; + JSON_ParserState state; // TODO: Should this actually be on the heap? Maybe it should be on the stack and reset on every parse. + rvalue_stack value_stack; + json_frame_stack frames; + VALUE buffer; + bool in_use; +} JSON_ResumableParser; + +static void JSON_ResumableParser_mark(void *ptr) +{ + JSON_ResumableParser *parser = (JSON_ResumableParser *)ptr; + JSON_ParserConfig_mark(&parser->config); + rvalue_stack_mark(&parser->value_stack); + rvalue_cache_mark(&parser->state.name_cache); + rb_gc_mark(parser->buffer); // pin the buffer +} + +static void JSON_ResumableParser_free(void *ptr) +{ + JSON_ResumableParser *parser = (JSON_ResumableParser *)ptr; + rvalue_stack_free_buffer(&parser->value_stack); + json_frame_stack_free_buffer(&parser->frames); +} + +static size_t JSON_ResumableParser_memsize(const void *ptr) +{ + const JSON_ResumableParser *parser = (const JSON_ResumableParser *)ptr; + size_t memsize = JSON_ParserConfig_memsize(&parser->config); + memsize += rvalue_stack_memsize(&parser->value_stack); + memsize += json_frame_stack_memsize(&parser->frames); +#ifndef HAVE_RUBY_TYPED_EMBEDDABLE + memsize += ( + sizeof(JSON_ResumableParser) + - sizeof(JSON_ParserState) + - sizeof(JSON_ParserConfig) + - sizeof(rvalue_stack) + - sizeof(json_frame_stack) + ); +#endif + return memsize; +} + +static void JSON_ResumableParser_compact(void *ptr) +{ + JSON_ResumableParser *parser = (JSON_ResumableParser *)ptr; + JSON_ParserConfig_compact(&parser->config); + rvalue_stack_compact(&parser->value_stack); + rvalue_cache_compact(&parser->state.name_cache); + parser->buffer = rb_gc_location(parser->buffer); +} + +static const rb_data_type_t JSON_ResumableParser_type = { + .wrap_struct_name = "JSON::Ext::ResumableParser", + .function = { + JSON_ResumableParser_mark, + JSON_ResumableParser_free, + JSON_ResumableParser_memsize, + JSON_ResumableParser_compact, + }, + // TODO: Figure how to set RUBY_TYPED_WB_PROTECTED + // Need to the the rvalue_stack, marked by another object. + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_EMBEDDABLE, +}; + +static VALUE cResumableParser_allocate(VALUE klass) +{ + JSON_ResumableParser *parser; + VALUE obj = TypedData_Make_Struct(klass, JSON_ResumableParser, &JSON_ResumableParser_type, parser); + parser->state.in_array++; + return obj; +} + +static inline JSON_ResumableParser *cResumableParser_get(VALUE self) +{ + JSON_ResumableParser *parser; + TypedData_Get_Struct(self, JSON_ResumableParser, &JSON_ResumableParser_type, parser); + return parser; +} + +// TODO: doc +// TODO: optional arg +static VALUE cResumableParser_initialize(VALUE self, VALUE opts) +{ + rb_check_frozen(self); + JSON_ResumableParser *parser = cResumableParser_get(self); + + parser_config_init(&parser->config, opts, self); + + return self; +} + +// TODO: doc +static VALUE cResumableParser_feed(VALUE self, VALUE str) +{ + rb_check_frozen(self); + str = convert_encoding(str); + if (!RSTRING_LEN(str)) { + return self; + } + + JSON_ResumableParser *parser = cResumableParser_get(self); + + size_t offset = parser->state.cursor - parser->state.start; + const size_t remaining = parser->state.end - parser->state.cursor; + + if (!remaining) { + parser->buffer = RB_OBJ_FROZEN_RAW(str) ? str : rb_obj_hide(rb_str_new_shared(str)); + offset = 0; + } else { + JSON_ASSERT(parser->buffer); + + const size_t size = parser->state.end - parser->state.start; + const size_t consumed = size - remaining; + char *old_ptr = RSTRING_PTR(parser->buffer); + + if (RB_OBJ_FROZEN_RAW(parser->buffer)) { + VALUE new_buffer = rb_obj_hide(rb_str_buf_new(remaining + RSTRING_LEN(str))); + MEMCPY(RSTRING_PTR(new_buffer), old_ptr + consumed, char, remaining); + rb_str_set_len(new_buffer, remaining); + offset = 0; + parser->buffer = new_buffer; + } else if (consumed > (size / 2) && size >= 512) { + MEMMOVE(old_ptr, old_ptr + consumed, char, remaining); + rb_str_set_len(parser->buffer, remaining); + offset = 0; + } + rb_str_append(parser->buffer, str); + } + + long len; + const char *start; + RSTRING_GETMEM(parser->buffer, start, len); + parser->state.start = start; + parser->state.end = start + len; + parser->state.cursor = parser->state.start + offset; + + return self; +} + +struct json_parse_any_args { + JSON_ParserState *state; + JSON_ParserConfig *config; +}; + +static VALUE json_parse_any_resumable_safe(VALUE _args) +{ + struct json_parse_any_args *args = (struct json_parse_any_args *)_args; + return (VALUE)json_parse_any(args->state, args->config, true); +} + +static JSON_ResumableParser *ResumableParser_acquire(VALUE self, bool lock) +{ + JSON_ResumableParser *parser = cResumableParser_get(self); + + if (lock) { + if (parser->in_use) { + rb_raise(rb_eArgError, "ResumableParser can't be used recursively"); + } + parser->in_use = true; + } + + // self may have moved, so we need to update all pointers + // Investigate: We might be better off keeping JSON_ParserState on the stack + // and only persist what we need. + parser->state.value_stack_handle = &self; + parser->state.frame_stack_handle = &self; + parser->state.value_stack = &parser->value_stack; + parser->state.frames = &parser->frames; + + return parser; +} + +// TODO: doc +static VALUE cResumableParser_parse(VALUE self) +{ + JSON_ResumableParser *parser = ResumableParser_acquire(self, true); + if (!parser->buffer) { + return Qfalse; + } + + if (parser->frames.head == 0) { + json_frame_stack_push(&parser->state, (json_frame){ + .type = JSON_FRAME_ROOT, + .phase = JSON_PHASE_VALUE, + }); + } + + VALUE Vsource = parser->buffer; // Prevent compaction + + json_frame *frame = json_frame_stack_peek(&parser->frames); + + if (frame->phase == JSON_PHASE_DONE) { + JSON_ASSERT(parser->value_stack.head == 1); + JSON_ASSERT(parser->frames.head == 1); + + frame->phase = JSON_PHASE_VALUE; + rvalue_stack_pop(parser->state.value_stack, 1); + } + + struct json_parse_any_args args = { + .state = &parser->state, + .config = &parser->config, + }; + int status; + bool complete = rb_protect(json_parse_any_resumable_safe, (VALUE)&args, &status); + parser->in_use = false; + if (status) { + complete = false; + if (RTEST(rb_ivar_get(rb_errinfo(), rb_intern("@eos")))) { + complete = false; // is an EOS error + } else { + rb_jump_tag(status); // reraise + } + } + RB_GC_GUARD(Vsource); + return complete ? Qtrue : Qfalse; +} + +// TODO: doc +static VALUE cResumableParser_value_p(VALUE self) +{ + JSON_ResumableParser *parser = ResumableParser_acquire(self, false); + + if (parser->value_stack.head > 0) { + json_frame *frame = json_frame_stack_peek(&parser->frames); + if (frame->phase == JSON_PHASE_DONE) { + return Qtrue; + } + } + return Qfalse; +} + +// TODO: doc +static VALUE cResumableParser_value(VALUE self) +{ + JSON_ResumableParser *parser = ResumableParser_acquire(self, false); + + if (parser->frames.head > 0) { + json_frame *frame = json_frame_stack_peek(&parser->frames); + + if (frame->phase == JSON_PHASE_DONE) { + VALUE result = *rvalue_stack_peek(parser->state.value_stack, 1); + rvalue_stack_pop(parser->state.value_stack, 1); + json_frame_stack_pop(parser->state.frames); + return result; + } + } + rb_raise(rb_eArgError, "no ready value"); // TODO: Figure out the best exception and message +} + +// TODO: doc +static VALUE cResumableParser_clear(VALUE self) +{ + JSON_ResumableParser *parser = ResumableParser_acquire(self, false); + parser->buffer = 0; + parser->frames.head = 0; + parser->value_stack.head = 0; + parser->state.name_cache.length = 0; + parser->state.start = parser->state.cursor = parser->state.end = NULL; + return self; +} + +// TODO: doc +static VALUE cResumableParser_partial_value(VALUE self) +{ + JSON_ResumableParser *original_parser = ResumableParser_acquire(self, false); + JSON_ResumableParser parser = *original_parser; + + if (parser.value_stack.head == 0) { + return Qnil; + } + + json_frame *frame = json_frame_stack_peek(parser.state.frames); + long missing_object_value = 0; + if (frame->type == JSON_FRAME_OBJECT && (frame->phase == JSON_PHASE_VALUE || frame->phase == JSON_PHASE_OBJECT_COLON)) { + missing_object_value = 1; + } + + // Copy the value stack as we need to mutate it. + long capa = parser.value_stack.head; + parser.value_stack.capa = (capa + missing_object_value); + VALUE tmpbuf, *value_stack_buffer = ALLOCV_N(VALUE, tmpbuf, capa + missing_object_value); + MEMCPY(value_stack_buffer, parser.value_stack.ptr, VALUE, parser.value_stack.capa); + parser.value_stack.ptr = value_stack_buffer; + + JSON_ParserState *state = &parser.state; + JSON_ParserConfig *config = &parser.config; + + if (missing_object_value) { + rvalue_stack_push(state->value_stack, Qnil, NULL, &state->value_stack); + } + + VALUE partial_result = Qundef; + + while (UNDEF_P(partial_result)) { + frame = json_frame_stack_peek(state->frames); + + switch (frame->type) { + case JSON_FRAME_ROOT: { + partial_result = *rvalue_stack_peek(state->value_stack, 1); + break; + } + + case JSON_FRAME_ARRAY: { + long count = json_frame_entry_count(frame, state->value_stack); + json_push_value(state, config, json_decode_array(state, config, count)); + json_frame_stack_pop(state->frames); + + break; + } + + case JSON_FRAME_OBJECT: { + long count = json_frame_entry_count(frame, state->value_stack); + json_push_value(state, config, json_decode_object(state, config, count)); + json_frame_stack_pop(state->frames); + break; + } + + default: { + JSON_UNREACHABLE_RETURN(Qundef); + break; + } + } + } + + ALLOCV_END(tmpbuf); + return partial_result; +} + +// TODO: doc +static VALUE cResumableParser_rest(VALUE self) +{ + JSON_ResumableParser *parser = cResumableParser_get(self); + + if (!parser->buffer) { + return rb_utf8_str_new("", 0); + } + + size_t offset = parser->state.cursor - parser->state.start; + const char *ptr; + long len; + RSTRING_GETMEM(parser->buffer, ptr, len); + return rb_utf8_str_new(ptr + offset, len - offset); +} + +// TODO: doc +static VALUE cResumableParser_eos_p(VALUE self) +{ + JSON_ResumableParser *parser = cResumableParser_get(self); + return eos(&parser->state) ? Qtrue : Qfalse; +} + void Init_parser(void) { #ifdef HAVE_RB_EXT_RACTOR_SAFE @@ -2077,12 +2550,24 @@ void Init_parser(void) eNestingError = rb_path2class("JSON::NestingError"); rb_define_alloc_func(cParserConfig, cJSON_parser_s_allocate); - rb_define_method(cParserConfig, "initialize", cParserConfig_initialize, 1); + rb_define_private_method(cParserConfig, "initialize", cParserConfig_initialize, 1); rb_define_method(cParserConfig, "parse", cParserConfig_parse, 1); VALUE cParser = rb_define_class_under(mExt, "Parser", rb_cObject); rb_define_singleton_method(cParser, "parse", cParser_m_parse, 2); + VALUE cResumableParser = rb_define_class_under(mExt, "ResumableParser", rb_cObject); + rb_define_alloc_func(cResumableParser, cResumableParser_allocate); + rb_define_private_method(cResumableParser, "initialize", cResumableParser_initialize, 1); + rb_define_method(cResumableParser, "<<", cResumableParser_feed, 1); + rb_define_method(cResumableParser, "parse", cResumableParser_parse, 0); + rb_define_method(cResumableParser, "value", cResumableParser_value, 0); + rb_define_method(cResumableParser, "value?", cResumableParser_value_p, 0); + rb_define_method(cResumableParser, "partial_value", cResumableParser_partial_value, 0); + rb_define_method(cResumableParser, "clear", cResumableParser_clear, 0); + rb_define_method(cResumableParser, "rest", cResumableParser_rest, 0); + rb_define_method(cResumableParser, "eos?", cResumableParser_eos_p, 0); + rb_global_variable(&CNaN); CNaN = rb_const_get(mJSON, rb_intern("NaN")); @@ -2113,6 +2598,7 @@ void Init_parser(void) i_encode = rb_intern("encode"); i_at_line = rb_intern("@line"); i_at_column = rb_intern("@column"); + i_at_eos = rb_intern("@eos"); binary_encindex = rb_ascii8bit_encindex(); utf8_encindex = rb_utf8_encindex(); diff --git a/test/json/resumable_parser_test.rb b/test/json/resumable_parser_test.rb new file mode 100644 index 00000000..1004a697 --- /dev/null +++ b/test/json/resumable_parser_test.rb @@ -0,0 +1,218 @@ +# frozen_string_literal: true +require_relative 'test_helper' + +class JSONResumageParserTest < Test::Unit::TestCase + include JSON + + def setup + omit "JRuby not supported" if RUBY_ENGINE == "jruby" + @parser = new_parser + end + + def test_value + refute_predicate @parser, :value? + assert_raise(ArgumentError) { @parser.value } + + @parser << '[]' + + refute_predicate @parser, :value? + assert_raise(ArgumentError) { @parser.value } + + assert @parser.parse + + assert_predicate @parser, :value? + assert_equal [], @parser.value + + refute_predicate @parser, :value? + assert_raise(ArgumentError) { @parser.value } + end + + def test_clear + @parser << '[1, 2, 3' + refute @parser.parse + assert_equal '3', @parser.rest + + @parser.clear + + assert_equal '', @parser.rest + refute_predicate @parser, :value? + + @parser << '[1, 2, 3][' + assert @parser.parse + assert_predicate @parser, :value? + assert_equal '[', @parser.rest + + @parser.clear + + assert_equal '', @parser.rest + refute_predicate @parser, :value? + end + + def test_parse_document_direct + @parser << '[true]' + assert_equal true, @parser.parse + assert_equal [true], @parser.value + end + + def test_parse_multiple_documents_direct + @parser << '[true]{}[1, 2, 3]' + + assert_equal true, @parser.parse + assert_equal [true], @parser.value + + assert_equal true, @parser.parse + assert_equal({}, @parser.value) + + assert_equal true, @parser.parse + assert_equal [1, 2, 3], @parser.value + end + + def test_parse_top_level_keywords + assert_resumed_parsing('true') + assert_resumed_parsing('false') + assert_resumed_parsing('null') + + assert_parse_stream([true, false, nil], 'truefalsenull') + end + + def test_parse_top_level_numbers + assert_parse_stream([1, 2, 3], '1 2 3 ') + assert_parse_stream([1, 2], '1 2 3') # Parser can't know if the number is terminated + assert_parse_stream([3], ' ') + assert_parse_stream([1, 2, 3, true], '1 2 3true') + assert_parse_stream([-1, 2.34, 5.0e67], '-1 2.34 5e67 ') + end + + def test_parse_byte_by_byte_array + assert_resumed_parsing('[]') + assert_resumed_parsing('[ ]') + assert_resumed_parsing('[true]') + assert_resumed_parsing('[12]') + assert_resumed_parsing('[ 12 ]') + assert_resumed_parsing('[ 12.3 ]') + assert_resumed_parsing('[ 12.3e12 ]') + assert_resumed_parsing('[ 1e12 ]') + assert_resumed_parsing('[-12]') + assert_resumed_parsing('[ -12 ]') + assert_resumed_parsing('[ -12.3 ]') + assert_resumed_parsing('[ -12.3e12 ]') + assert_resumed_parsing('[ -1e12 ]') + end + + def test_parse_byte_by_byte_object + assert_resumed_parsing('{}') + assert_resumed_parsing('{ }') + assert_resumed_parsing('{"test" : true}') + assert_resumed_parsing('{ "test":12, "value" : { "key": 42} }') + end + + def test_parse_byte_by_byte_string + assert_resumed_parsing(JSON.generate('test')) + assert_resumed_parsing(JSON.generate('te\\st')) + assert_resumed_parsing('"te\\u2028st"') + assert_resumed_parsing(JSON.generate("te\u2028st")) + assert_resumed_parsing(JSON.generate("te \u2028 st")) + end + + def test_parse_byte_by_byte_numbers + assert_resumed_parsing('123 ') + end + + def test_rest + @parser << '[1, 2, 3, "unterminated string' + refute @parser.parse + assert_equal '"unterminated string', @parser.rest + end + + def test_eos + assert_predicate @parser, :eos? + + @parser << '[1, 2, 3]' + refute_predicate @parser, :eos? + + assert @parser.parse + assert_predicate @parser, :eos? + + @parser << '123' + refute_predicate @parser, :eos? + + refute @parser.parse + refute_predicate @parser, :eos? + + @parser << ' ' + assert @parser.parse + assert_equal 123, @parser.value + refute_predicate @parser, :eos? + + refute @parser.parse + assert_predicate @parser, :eos? + end + + def test_partial_value + assert_nil @parser.partial_value + assert_partial_value [1, 2, 3], '[1, 2, 3, "unterminated string' + assert_partial_value({ "a" => 1, "b" => { "c" => nil } }, '{ "a": 1, "b": { "c": "unterminated string') + assert_partial_value({ "a" => 1, "b" => { "c" => nil } }, '{ "a": 1, "b": { "c"') + assert_partial_value([1, { "a" => 1, "b" => { "c" => nil } }], '[1, { "a": 1, "b": { "c"') + end + + def test_partial_value_missing + assert_nil @parser.partial_value + end + + def test_assert_reentrency_prevented + called = false + parser = nil + callback = ->(_o) do + unless called + called = true + parser.parse + end + end + parser = new_parser(on_load: callback) + parser << '[]' + error = assert_raise ArgumentError do + parser.parse + end + assert_equal "ResumableParser can't be used recursively", error.message + end + + private + + def assert_partial_value(expected, json) + parser = new_parser + parser << json + refute parser.parse + 2.times do + assert_equal expected, parser.partial_value + end + end + + def assert_resumed_parsing(json, parser = @parser) + expected = JSON.parse(json) + + last_parsed_byte_index = 0 + json.each_byte do |byte| + parser << byte.chr + last_parsed_byte_index += 1 + break if parser.parse + end + actual = parser.value + assert_equal expected, actual + remaining_bytes = (json.bytesize - last_parsed_byte_index) + assert_equal 0, remaining_bytes, "unconsumed bytes: #{actual.inspect}, remaining: #{json.byteslice(-1, remaining_bytes).inspect}" + end + + def assert_parse_stream(expected, json, parser = @parser) + actual = [] + parser << json + while parser.parse + actual << parser.value + end + assert_equal(expected, actual) + end + + def new_parser(options = {}) + JSON::Ext::ResumableParser.new(options) + end +end