From 3cffc44510da21a9021ff0b36bb5cf10cbaa6f2f Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Thu, 19 Mar 2026 14:14:54 -0400 Subject: [PATCH 1/4] Lazily build offsets from a packed array --- ext/prism/extension.c | 8 +++----- lib/prism/parse_result.rb | 24 +++++++++++++++++------- rbi/generated/prism/parse_result.rbi | 12 ++++++++---- sig/generated/prism/parse_result.rbs | 16 +++++++++++----- templates/ext/prism/api_node.c.erb | 6 +----- 5 files changed, 40 insertions(+), 26 deletions(-) diff --git a/ext/prism/extension.c b/ext/prism/extension.c index a3ff4a1632..9f9169cfff 100644 --- a/ext/prism/extension.c +++ b/ext/prism/extension.c @@ -802,16 +802,14 @@ parse_lex_input(const uint8_t *input, size_t input_length, const pm_options_t *o pm_node_t *node = pm_parse(parser); - // Here we need to update the Source object to have the correct - // encoding for the source string and the correct newline offsets. - // We do it here because we've already created the Source object and given - // it over to all of the tokens, and both of these are only set after pm_parse(). + /* Update the Source object with the correct encoding and line offsets, + * which are only available after pm_parse() completes. */ rb_encoding *encoding = rb_enc_find(pm_parser_encoding_name(parser)); rb_enc_associate(source_string, encoding); const pm_line_offset_list_t *line_offsets = pm_parser_line_offsets(parser); for (size_t index = 0; index < line_offsets->size; index++) { - rb_ary_push(offsets, ULONG2NUM(line_offsets->offsets[index])); + rb_ary_store(offsets, (long) index, ULONG2NUM(line_offsets->offsets[index])); } if (pm_options_freeze(options)) { diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb index 5c4d4fcb8a..4d1fa2c296 100644 --- a/lib/prism/parse_result.rb +++ b/lib/prism/parse_result.rb @@ -58,16 +58,26 @@ def self.for(source, start_line, offsets) # The line number where this source starts. attr_reader :start_line #: Integer - # The list of newline byte offsets in the source code. - attr_reader :offsets #: Array[Integer] + # The list of newline byte offsets in the source code. When initialized from + # the C extension, this may be a packed binary string of uint32_t values + # that is lazily unpacked on first access. + #-- + #: () -> Array[Integer] + def offsets + offsets = @offsets + return offsets if offsets.is_a?(Array) + @offsets = offsets.unpack("L*") + end - # Create a new source object with the given source code. + # Create a new source object with the given source code. The offsets + # parameter can be either an Array of Integer byte offsets or a packed + # binary string of uint32_t values (from the C extension). #-- - #: (String source, Integer start_line, Array[Integer] offsets) -> void + #: (String source, Integer start_line, Array[Integer] | String offsets) -> void def initialize(source, start_line, offsets) @source = source - @start_line = start_line # set after parsing is done - @offsets = offsets # set after parsing is done + @start_line = start_line + @offsets = offsets end # Replace the value of start_line with the given value. @@ -81,7 +91,7 @@ def replace_start_line(start_line) #-- #: (Array[Integer] offsets) -> void def replace_offsets(offsets) - @offsets.replace(offsets) + @offsets = offsets end # Returns the encoding of the source code, which is set by parameters to the diff --git a/rbi/generated/prism/parse_result.rbi b/rbi/generated/prism/parse_result.rbi index ddced69934..f20ba90ef5 100644 --- a/rbi/generated/prism/parse_result.rbi +++ b/rbi/generated/prism/parse_result.rbi @@ -28,12 +28,16 @@ module Prism sig { returns(Integer) } attr_reader :start_line - # The list of newline byte offsets in the source code. + # The list of newline byte offsets in the source code. When initialized from + # the C extension, this may be a packed binary string of uint32_t values + # that is lazily unpacked on first access. sig { returns(T::Array[Integer]) } - attr_reader :offsets + def offsets; end - # Create a new source object with the given source code. - sig { params(source: String, start_line: Integer, offsets: T::Array[Integer]).void } + # Create a new source object with the given source code. The offsets + # parameter can be either an Array of Integer byte offsets or a packed + # binary string of uint32_t values (from the C extension). + sig { params(source: String, start_line: Integer, offsets: ::T.any(T::Array[Integer], String)).void } def initialize(source, start_line, offsets); end # Replace the value of start_line with the given value. diff --git a/sig/generated/prism/parse_result.rbs b/sig/generated/prism/parse_result.rbs index d2b4035960..1f3b8a8d54 100644 --- a/sig/generated/prism/parse_result.rbs +++ b/sig/generated/prism/parse_result.rbs @@ -33,13 +33,19 @@ module Prism # The line number where this source starts. attr_reader start_line: Integer - # The list of newline byte offsets in the source code. - attr_reader offsets: Array[Integer] + # The list of newline byte offsets in the source code. When initialized from + # the C extension, this may be a packed binary string of uint32_t values + # that is lazily unpacked on first access. + # -- + # : () -> Array[Integer] + def offsets: () -> Array[Integer] - # Create a new source object with the given source code. + # Create a new source object with the given source code. The offsets + # parameter can be either an Array of Integer byte offsets or a packed + # binary string of uint32_t values (from the C extension). # -- - # : (String source, Integer start_line, Array[Integer] offsets) -> void - def initialize: (String source, Integer start_line, Array[Integer] offsets) -> void + # : (String source, Integer start_line, Array[Integer] | String offsets) -> void + def initialize: (String source, Integer start_line, Array[Integer] | String offsets) -> void # Replace the value of start_line with the given value. # -- diff --git a/templates/ext/prism/api_node.c.erb b/templates/ext/prism/api_node.c.erb index 506c2e87f8..42a30ce0f2 100644 --- a/templates/ext/prism/api_node.c.erb +++ b/templates/ext/prism/api_node.c.erb @@ -81,11 +81,7 @@ pm_source_new(const pm_parser_t *parser, rb_encoding *encoding, bool freeze) { VALUE source_string = rb_enc_str_new((const char *) start, pm_parser_end(parser) - start, encoding); const pm_line_offset_list_t *line_offsets = pm_parser_line_offsets(parser); - VALUE offsets = rb_ary_new_capa(line_offsets->size); - - for (size_t index = 0; index < line_offsets->size; index++) { - rb_ary_push(offsets, ULONG2NUM(line_offsets->offsets[index])); - } + VALUE offsets = rb_str_new((const char *) line_offsets->offsets, line_offsets->size * sizeof(uint32_t)); if (freeze) { rb_obj_freeze(source_string); From bbcb5690ce663f0316d2091f433143fde5d10237 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Thu, 19 Mar 2026 14:22:02 -0400 Subject: [PATCH 2/4] Use an arena for building the Prism AST --- templates/ext/prism/api_node.c.erb | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/templates/ext/prism/api_node.c.erb b/templates/ext/prism/api_node.c.erb index 42a30ce0f2..71f7fe273e 100644 --- a/templates/ext/prism/api_node.c.erb +++ b/templates/ext/prism/api_node.c.erb @@ -1,6 +1,7 @@ #line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" #include "prism/extension.h" #include "prism/internal/allocator.h" +#include "prism/internal/arena.h" #include @@ -101,8 +102,8 @@ typedef struct pm_node_stack_node { } pm_node_stack_node_t; static void -pm_node_stack_push(pm_node_stack_node_t **stack, const pm_node_t *visit) { - pm_node_stack_node_t *node = xmalloc(sizeof(pm_node_stack_node_t)); +pm_node_stack_push(pm_arena_t *arena, pm_node_stack_node_t **stack, const pm_node_t *visit) { + pm_node_stack_node_t *node = (pm_node_stack_node_t *) pm_arena_alloc(arena, sizeof(pm_node_stack_node_t), PRISM_ALIGNOF(pm_node_stack_node_t)); node->prev = *stack; node->visit = visit; node->visited = false; @@ -115,7 +116,6 @@ pm_node_stack_pop(pm_node_stack_node_t **stack) { const pm_node_t *visit = current->visit; *stack = current->prev; - xfree_sized(current, sizeof(pm_node_stack_node_t)); return visit; } @@ -147,8 +147,9 @@ pm_ast_new(const pm_parser_t *parser, const pm_node_t *node, rb_encoding *encodi pm_ast_constants_each_data_t constants_data = { .constants = constants, .encoding = encoding }; pm_parser_constants_each(parser, pm_ast_constants_each, &constants_data); + pm_arena_t *node_arena = pm_arena_new(); pm_node_stack_node_t *node_stack = NULL; - pm_node_stack_push(&node_stack, node); + pm_node_stack_push(node_arena, &node_stack, node); VALUE value_stack = rb_ary_new(); while (node_stack != NULL) { @@ -171,10 +172,10 @@ pm_ast_new(const pm_parser_t *parser, const pm_node_t *node, rb_encoding *encodi <%- node.fields.each do |field| -%> <%- case field -%> <%- when Prism::Template::NodeField, Prism::Template::OptionalNodeField -%> - pm_node_stack_push(&node_stack, (pm_node_t *) cast-><%= field.name %>); + pm_node_stack_push(node_arena, &node_stack, (pm_node_t *) cast-><%= field.name %>); <%- when Prism::Template::NodeListField -%> for (size_t index = 0; index < cast-><%= field.name %>.size; index++) { - pm_node_stack_push(&node_stack, (pm_node_t *) cast-><%= field.name %>.nodes[index]); + pm_node_stack_push(node_arena, &node_stack, (pm_node_t *) cast-><%= field.name %>.nodes[index]); } <%- end -%> <%- end -%> @@ -276,6 +277,7 @@ pm_ast_new(const pm_parser_t *parser, const pm_node_t *node, rb_encoding *encodi } } + pm_arena_free(node_arena); return rb_ary_pop(value_stack); } From 9df357af897e93f65cfe63c071c979a3d054e638 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Thu, 19 Mar 2026 17:18:21 -0400 Subject: [PATCH 3/4] Fast path for returning from parse_arguments_list --- src/prism.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/prism.c b/src/prism.c index 53c33cee39..5e2d37a8e0 100644 --- a/src/prism.c +++ b/src/prism.c @@ -12589,6 +12589,14 @@ match4(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4); } +/** + * Returns true if the current token is any of the six given types. + */ +static PRISM_INLINE bool +match6(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_token_type_t type3, pm_token_type_t type4, pm_token_type_t type5, pm_token_type_t type6) { + return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4) || match1(parser, type5) || match1(parser, type6); +} + /** * Returns true if the current token is any of the seven given types. */ @@ -15091,6 +15099,16 @@ parse_block(pm_parser_t *parser, uint16_t depth) { */ static bool parse_arguments_list(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_block, uint8_t flags, uint16_t depth) { + /* Fast path: if the current token can't begin an expression and isn't + * a parenthesis, block opener, or splat/block-pass operator, there are + * no arguments to parse. */ + if ( + !token_begins_expression_p(parser->current.type) && + !match6(parser, PM_TOKEN_PARENTHESIS_LEFT, PM_TOKEN_KEYWORD_DO, PM_TOKEN_KEYWORD_DO_BLOCK, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR, PM_TOKEN_UAMPERSAND) + ) { + return false; + } + bool found = false; bool parsed_command_args = false; From 56cdcbbb8c7ef02bfd58d0ac61bce558f5782d37 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Fri, 20 Mar 2026 12:18:17 -0400 Subject: [PATCH 4/4] Provide a single-entry cache on parser for avoiding constant hashes --- include/prism/internal/parser.h | 12 ++++++++++++ src/prism.c | 14 +++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/include/prism/internal/parser.h b/include/prism/internal/parser.h index dbed71e737..3afe226757 100644 --- a/include/prism/internal/parser.h +++ b/include/prism/internal/parser.h @@ -582,6 +582,18 @@ struct pm_parser_t { */ uint32_t node_id; + /* + * A single-entry cache for pm_parser_constant_id_raw. Avoids redundant + * constant pool lookups when the same token is resolved multiple times + * (e.g., once during lexing for local variable detection, and again + * during parsing for node creation). + */ + struct { + const uint8_t *start; + const uint8_t *end; + pm_constant_id_t id; + } constant_cache; + /* The current state of the lexer. */ pm_lex_state_t lex_state; diff --git a/src/prism.c b/src/prism.c index 5e2d37a8e0..1fa4a46ed8 100644 --- a/src/prism.c +++ b/src/prism.c @@ -1120,7 +1120,19 @@ pm_locals_order(pm_parser_t *parser, pm_locals_t *locals, pm_constant_id_list_t */ static PRISM_INLINE pm_constant_id_t pm_parser_constant_id_raw(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { - return pm_constant_pool_insert_shared(&parser->metadata_arena, &parser->constant_pool, start, (size_t) (end - start)); + /* Fast path: if this is the same token as the last lookup (same pointer + * range), return the cached result. */ + if (start == parser->constant_cache.start && end == parser->constant_cache.end) { + return parser->constant_cache.id; + } + + pm_constant_id_t id = pm_constant_pool_insert_shared(&parser->metadata_arena, &parser->constant_pool, start, (size_t) (end - start)); + + parser->constant_cache.start = start; + parser->constant_cache.end = end; + parser->constant_cache.id = id; + + return id; } /**