diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php index f291064e..c583b8db 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php @@ -29,7 +29,7 @@ class WP_MySQL_Parser extends WP_Parser { * @return bool Whether a query was successfully parsed. */ public function next_query(): bool { - if ( $this->position >= count( $this->tokens ) ) { + if ( $this->position >= $this->token_count ) { return false; } $this->current_ast = $this->parse(); diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index 8c17b458..754ee6c9 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -29,10 +29,58 @@ class WP_Parser_Grammar { public $rules; public $rule_names; public $fragment_ids; - public $lookahead_is_match_possible = array(); + + /** + * Per-rule branch selector keyed by the next token id. + * + * When set, `$branches_for_token[$rule_id][$token_id]` is the ordered list + * of branch indexes in `$rules[$rule_id]` that can possibly match when the + * current token has the given id. Nullable branches appear in every entry. + * + * If an entry does not exist for the current token, `$nullable_branches` + * is consulted. If both are empty, the rule cannot match and the parser + * returns immediately. + * + * Rules whose FIRST set could not be computed do not appear in the map; + * for those the parser falls back to trying every branch. + * + * @var array> + */ + public $branches_for_token = array(); + + /** + * Per-rule list of nullable branch indexes. + * + * @var array + */ + public $nullable_branches = array(); + + /** + * Per-rule flag indicating every (rule, token) selector entry points + * to exactly one branch. 
The parser uses this to skip the outer + * foreach when a single candidate is the only possibility. + * + * @var array + */ + public $single_candidate_rules = array(); + public $lowest_non_terminal_id; public $highest_terminal_id; + /** + * Cached id of the grammar's start rule, populated lazily on first parse. + * + * @var int|null + */ + public $start_rule_id; + + /** + * Cached id of the selectStatement rule, populated lazily on first parse. + * + * @var int|null + */ + public $select_statement_rule_id; + public function __construct( array $rules ) { $this->inflate( $rules ); } @@ -56,8 +104,8 @@ private function inflate( $grammar ) { $this->highest_terminal_id = $this->lowest_non_terminal_id - 1; foreach ( $grammar['rules_names'] as $rule_index => $rule_name ) { - $this->rule_names[ $rule_index + $grammar['rules_offset'] ] = $rule_name; - $this->rules[ $rule_index + $grammar['rules_offset'] ] = array(); + $rule_id = $rule_index + $grammar['rules_offset']; + $this->rule_names[ $rule_id ] = $rule_name; /** * Treat all intermediate rules as fragments to inline before returning @@ -75,7 +123,7 @@ private function inflate( $grammar ) { * They are prefixed with a "%" to be distinguished from the original rules. */ if ( '%' === $rule_name[0] ) { - $this->fragment_ids[ $rule_index + $grammar['rules_offset'] ] = true; + $this->fragment_ids[ $rule_id ] = true; } } @@ -85,55 +133,284 @@ private function inflate( $grammar ) { $this->rules[ $rule_id ] = $branches; } - /** - * Compute a rule => [token => true] lookup table for each rule - * that starts with a terminal OR with another rule that already - * has a lookahead mapping. - * - * This is similar to left-factoring the grammar, even if not quite - * the same. - * - * This enables us to quickly bail out from checking branches that - * cannot possibly match the current token. This increased the parser - * speed by a whopping 80%! 
- * - * @TODO: Explore these possible next steps: - * - * * Compute a rule => [token => branch[]] list lookup table and only - * process the branches that have a chance of matching the current token. - * * Actually left-factor the grammar as much as possible. This, however, - * could inflate the serialized grammar size. - */ - // 5 iterations seem to give us all the speed gains we can get from this. - for ( $i = 0; $i < 5; $i++ ) { - foreach ( $grammar['grammar'] as $rule_index => $branches ) { - $rule_id = $rule_index + $grammar['rules_offset']; - if ( isset( $this->lookahead_is_match_possible[ $rule_id ] ) ) { + $this->inline_single_branch_fragments(); + $this->strip_epsilon_markers(); + $this->build_branch_selectors(); + } + + /** + * Remove explicit `EMPTY_RULE_ID` markers from branches. + * + * The epsilon marker is a zero-width, always-matching symbol used in the + * grammar to express optional productions. At parse time it would still + * be walked and "continued" over for no effect, so stripping it ahead of + * time removes a per-symbol branch in the hot loop. + * + * A pure-epsilon branch (`[EMPTY_RULE_ID]`) becomes an empty branch (`[]`) + * which the parser already handles: the inner symbol loop does nothing and + * the rule returns a successful empty match. + */ + private function strip_epsilon_markers() { + foreach ( $this->rules as $rule_id => $branches ) { + foreach ( $branches as $i => $branch ) { + if ( in_array( self::EMPTY_RULE_ID, $branch, true ) ) { + $this->rules[ $rule_id ][ $i ] = array_values( + array_filter( + $branch, + static function ( $s ) { + return self::EMPTY_RULE_ID !== $s; + } + ) + ); + } + } + } + } + + /** + * Inline single-branch fragment rules into their call sites. + * + * The grammar contains many single-branch fragment rules that exist only + * to factor shared sub-sequences out of larger productions. 
At runtime + * the parser would descend into each such fragment via a recursive call + * just to walk the same symbol sequence and splice the results back into + * the parent. Expanding them in-place at build time eliminates that call + * chain without changing the resulting AST because fragment children are + * already flattened into the parent node. + * + * Fragments with two or more alternatives (e.g., `%EOF_zero_or_one`) are + * left intact because they represent real choices that must be evaluated + * against the current token. + */ + private function inline_single_branch_fragments() { + $rules = $this->rules; + $fragment_ids = $this->fragment_ids ?? array(); + $low_nt = $this->lowest_non_terminal_id; + + // Precompute the set of single-branch fragments that are candidates + // for inlining. + $inlinable = array(); + foreach ( $fragment_ids as $rule_id => $_ ) { + if ( isset( $rules[ $rule_id ] ) && 1 === count( $rules[ $rule_id ] ) ) { + $inlinable[ $rule_id ] = true; + } + } + + // Depth-first expansion memoized per rule, with cycle detection. + $expanded = array(); + $visiting = array(); + $expand_branch = function ( array $branch ) use ( &$expand_branch, &$expanded, &$visiting, $rules, $low_nt, $inlinable ) { + $out = array(); + foreach ( $branch as $sym ) { + if ( $sym < $low_nt ) { + $out[] = $sym; continue; } - $rule_lookup = array(); - $first_symbol_can_be_expanded_to_all_terminals = true; + if ( ! isset( $inlinable[ $sym ] ) ) { + $out[] = $sym; + continue; + } + if ( isset( $visiting[ $sym ] ) ) { + // Cycle: leave the reference in place. + $out[] = $sym; + continue; + } + if ( ! isset( $expanded[ $sym ] ) ) { + $visiting[ $sym ] = true; + $expanded[ $sym ] = $expand_branch( $rules[ $sym ][0] ); + unset( $visiting[ $sym ] ); + } + foreach ( $expanded[ $sym ] as $s ) { + $out[] = $s; + } + } + return $out; + }; + + // Rewrite every rule's branches with fragments inlined. 
+ foreach ( $this->rules as $rule_id => $branches ) { + $new_branches = array(); + foreach ( $branches as $branch ) { + $new_branches[] = $expand_branch( $branch ); + } + $this->rules[ $rule_id ] = $new_branches; + } + } + + /** + * Compute FIRST and NULLABLE sets for every non-terminal, then denormalize + * them into a per-rule map of `token_id => branch_index[]` so the parser + * can jump straight to the branches that can possibly match the current + * token. + * + * This replaces the previous coarse "can any branch match this token?" + * lookahead. On the MySQL corpus the fine-grained selector skips ~60% + * of the branch attempts that the parser used to try and fail. + */ + private function build_branch_selectors() { + $rules = $this->rules; + $low_nt = $this->lowest_non_terminal_id; + $empty_rule = self::EMPTY_RULE_ID; + $rule_ids = array_keys( $rules ); + $nullable = array(); + $first_sets = array(); + + foreach ( $rule_ids as $rule_id ) { + $nullable[ $rule_id ] = false; + $first_sets[ $rule_id ] = array(); + } + + // Iterate to fixpoint. FIRST and NULLABLE set monotonically grow. + do { + $changed = false; + foreach ( $rule_ids as $rule_id ) { + $branches = $rules[ $rule_id ]; foreach ( $branches as $branch ) { - $terminals = false; - $branch_starts_with_terminal = $branch[0] < $this->lowest_non_terminal_id; - if ( $branch_starts_with_terminal ) { - $terminals = array( $branch[0] ); - } elseif ( isset( $this->lookahead_is_match_possible[ $branch[0] ] ) ) { - $terminals = array_keys( $this->lookahead_is_match_possible[ $branch[0] ] ); + $branch_nullable = true; + foreach ( $branch as $symbol ) { + if ( $empty_rule === $symbol ) { + // ε: contributes nothing to FIRST, stays nullable. + continue; + } + if ( $symbol < $low_nt ) { + // Terminal. + if ( ! isset( $first_sets[ $rule_id ][ $symbol ] ) ) { + $first_sets[ $rule_id ][ $symbol ] = true; + $changed = true; + } + $branch_nullable = false; + break; + } + // Non-terminal. 
+ foreach ( $first_sets[ $symbol ] as $tid => $_ ) { + if ( ! isset( $first_sets[ $rule_id ][ $tid ] ) ) { + $first_sets[ $rule_id ][ $tid ] = true; + $changed = true; + } + } + if ( ! $nullable[ $symbol ] ) { + $branch_nullable = false; + break; + } + } + if ( $branch_nullable && ! $nullable[ $rule_id ] ) { + $nullable[ $rule_id ] = true; + $changed = true; } + } + } + } while ( $changed ); - if ( false === $terminals ) { - $first_symbol_can_be_expanded_to_all_terminals = false; + // Build per-(rule, token) branch indices. + foreach ( $rule_ids as $rule_id ) { + $branches = $rules[ $rule_id ]; + $selector = array(); + $nullable_branch_ids = array(); + foreach ( $branches as $idx => $branch ) { + $branch_first = array(); + $branch_nullable = true; + foreach ( $branch as $symbol ) { + if ( $empty_rule === $symbol ) { + continue; + } + if ( $symbol < $low_nt ) { + $branch_first[ $symbol ] = true; + $branch_nullable = false; break; } - foreach ( $terminals as $terminal ) { - $rule_lookup[ $terminal ] = true; + foreach ( $first_sets[ $symbol ] as $tid => $_ ) { + $branch_first[ $tid ] = true; + } + if ( ! $nullable[ $symbol ] ) { + $branch_nullable = false; + break; } } - if ( $first_symbol_can_be_expanded_to_all_terminals ) { - $this->lookahead_is_match_possible[ $rule_id ] = $rule_lookup; + foreach ( $branch_first as $tid => $_ ) { + $selector[ $tid ][] = $idx; + } + if ( $branch_nullable ) { + $nullable_branch_ids[] = $idx; } } + + // Nullable branches also match when the current token is not in + // any branch's FIRST set. Fold them into every populated entry + // so the runtime lookup is a single array access. 
+ if ( $nullable_branch_ids ) { + $merged = array(); + foreach ( $selector as $tid => $idx_list ) { + $merged[ $tid ] = self::merge_sorted( $idx_list, $nullable_branch_ids ); + } + $selector = $merged; + $this->nullable_branches[ $rule_id ] = true; + } + if ( $selector ) { + // Expand branch indexes to the branch symbol sequences so + // the parser can foreach candidate branches without an + // extra $branches[$idx] indirection on every attempt. Many + // tokens inside the same rule end up pointing to the same + // branch-id list, so deduplicate by signature and let + // copy-on-write share one sequences array across all of + // them. Without this the nested table would be ~40 MB; with + // it, ~1 MB. + $by_signature = array(); + $all_single_candidates = true; + foreach ( $selector as $tid => $idx_list ) { + if ( 1 !== count( $idx_list ) ) { + $all_single_candidates = false; + } + $sig = implode( ',', $idx_list ); + if ( isset( $by_signature[ $sig ] ) ) { + $selector[ $tid ] = $by_signature[ $sig ]; + } else { + $seqs = array(); + foreach ( $idx_list as $idx ) { + $seqs[] = $branches[ $idx ]; + } + $by_signature[ $sig ] = $seqs; + $selector[ $tid ] = $seqs; + } + } + $this->branches_for_token[ $rule_id ] = $selector; + if ( $all_single_candidates ) { + $this->single_candidate_rules[ $rule_id ] = true; + } + } + } + } + + /** + * Merge two ascending int arrays into one ascending int array without + * duplicates. Preserves original branch order as required by the parser. 
+ * + * @param int[] $a + * @param int[] $b + * @return int[] + */ + private static function merge_sorted( array $a, array $b ): array { + $i = 0; + $j = 0; + $na = count( $a ); + $nb = count( $b ); + $out = array(); + while ( $i < $na && $j < $nb ) { + if ( $a[ $i ] < $b[ $j ] ) { + $out[] = $a[ $i++ ]; + } elseif ( $a[ $i ] > $b[ $j ] ) { + $out[] = $b[ $j++ ]; + } else { + $out[] = $a[ $i ]; + ++$i; + ++$j; + } + } + while ( $i < $na ) { + $out[] = $a[ $i++ ]; + } + while ( $j < $nb ) { + $out[] = $b[ $j++ ]; } + return $out; } } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php index e2d67018..70fadfd2 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php @@ -9,23 +9,36 @@ * In this way, a parser node constitutes a recursive structure that represents * a parse (sub)tree at each level of the full grammar tree. */ -class WP_Parser_Node { +final class WP_Parser_Node { /** * @TODO: Review and document these properties and their visibility. */ public $rule_id; public $rule_name; - private $children = array(); + private $children; - public function __construct( $rule_id, $rule_name ) { + public function __construct( $rule_id, $rule_name, array $children = array() ) { $this->rule_id = $rule_id; $this->rule_name = $rule_name; + $this->children = $children; } public function append_child( $node ) { $this->children[] = $node; } + /** + * Return the children array by reference for efficient fragment inlining. + * + * Returning a reference lets the parser iterate children without copying + * the array. The returned reference must not be mutated by callers. + * + * @return array + */ + public function &get_children_ref(): array { + return $this->children; + } + /** * Flatten the matched rule fragments as if their children were direct * descendants of the current rule. 
diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 4436892f..b9c2ba8b 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -11,85 +11,176 @@ class WP_Parser { protected $grammar; protected $tokens; + protected $token_count; protected $position; + // Grammar data cached as instance fields so the hot path avoids an extra + // property hop via $this->grammar on every recursive call. + private $rule_names; + private $fragment_ids; + private $branches_for_token; + private $nullable_branches; + private $highest_terminal_id; + private $select_statement_rule_id; + private $single_candidate_rules; + public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { - $this->grammar = $grammar; - $this->tokens = $tokens; - $this->position = 0; + $this->grammar = $grammar; + $this->token_count = count( $tokens ); + // Append an end-of-input sentinel token whose id is EMPTY_RULE_ID + // (0). The hot path can then read $tokens[$pos]->id unconditionally + // when $pos is the current cursor, because the sentinel naturally + // fails to match any real grammar terminal while feeding the + // nullable-fallback branch of the selector check. + $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); + $this->tokens = $tokens; + $this->position = 0; + $this->rule_names = $grammar->rule_names; + $this->fragment_ids = $grammar->fragment_ids ?? array(); + $this->branches_for_token = $grammar->branches_for_token; + $this->nullable_branches = $grammar->nullable_branches; + $this->highest_terminal_id = $grammar->highest_terminal_id; + $this->single_candidate_rules = $grammar->single_candidate_rules ?? array(); + + // The INTO negative-lookahead only fires for selectStatement. Cache + // the rule id so the per-call check is an int compare instead of a + // string compare. 
+ if ( null === $grammar->select_statement_rule_id ) { + $grammar->select_statement_rule_id = $grammar->get_rule_id( 'selectStatement' ); + } + $this->select_statement_rule_id = $grammar->select_statement_rule_id; } public function parse() { // @TODO: Make the starting rule lookup non-grammar-specific. - $query_rule_id = $this->grammar->get_rule_id( 'query' ); - $ast = $this->parse_recursive( $query_rule_id ); + // Cache the query rule id on the grammar - get_rule_id() does a + // linear array_search over all rule names which, on the MySQL + // grammar, costs a few microseconds per lookup. + $grammar = $this->grammar; + if ( null === $grammar->start_rule_id ) { + $grammar->start_rule_id = $grammar->get_rule_id( 'query' ); + } + $ast = $this->parse_recursive( $grammar->start_rule_id ); return false === $ast ? null : $ast; } + /** + * Parse a single non-terminal rule. + * + * This function is only called for non-terminal rule ids. Terminals are + * matched inline inside the branch loop below to avoid a function-call + * round trip per consumed token. + */ private function parse_recursive( $rule_id ) { - $is_terminal = $rule_id <= $this->grammar->highest_terminal_id; - if ( $is_terminal ) { - if ( $this->position >= count( $this->tokens ) ) { - return false; - } + $tokens = $this->tokens; + $position = $this->position; - if ( WP_Parser_Grammar::EMPTY_RULE_ID === $rule_id ) { - return true; - } - - if ( $this->tokens[ $this->position ]->id === $rule_id ) { - ++$this->position; - return $this->tokens[ $this->position - 1 ]; - } + // Narrow the set of branches worth trying using the precomputed FIRST + // sets. When no entry exists for the current token but the rule is + // nullable, all candidate branches would match empty, so we return + // immediately without entering any branch. 
+ $tid = $tokens[ $position ]->id; + if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { + $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; + } elseif ( isset( $this->nullable_branches[ $rule_id ] ) ) { + return true; + } else { return false; } - $branches = $this->grammar->rules[ $rule_id ]; - if ( ! count( $branches ) ) { - return false; - } + $highest_terminal_id = $this->highest_terminal_id; + $is_fragment = isset( $this->fragment_ids[ $rule_id ] ); + $is_select_statement = $rule_id === $this->select_statement_rule_id; - // Bale out from processing the current branch if none of its rules can - // possibly match the current token. - if ( isset( $this->grammar->lookahead_is_match_possible[ $rule_id ] ) ) { - $token_id = $this->tokens[ $this->position ]->id; - if ( - ! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ $token_id ] ) && - ! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ WP_Parser_Grammar::EMPTY_RULE_ID ] ) - ) { + // Fast path for rules where every (rule, token) selector entry + // points to exactly one branch - about 55% of nonterminal calls + // on the MySQL corpus. Skipping the outer foreach avoids the + // foreach iterator setup for those calls. + if ( isset( $this->single_candidate_rules[ $rule_id ] ) ) { + // Single-candidate fast path: the rule has exactly one branch + // to try for this token, so skip the outer foreach and the + // $branch_matches bookkeeping - every failure path just + // rewinds the position and returns false directly. 
+ $branch = $candidate_branches[0]; + $children = array(); + foreach ( $branch as $subrule_id ) { + if ( $subrule_id <= $highest_terminal_id ) { + if ( $tokens[ $this->position ]->id === $subrule_id ) { + $children[] = $tokens[ $this->position ]; + ++$this->position; + continue; + } + $this->position = $position; + return false; + } + + $subnode = $this->parse_recursive( $subrule_id ); + if ( false === $subnode ) { + $this->position = $position; + return false; + } + if ( true === $subnode ) { + continue; + } + if ( is_array( $subnode ) ) { + foreach ( $subnode as $c ) { + $children[] = $c; + } + } else { + $children[] = $subnode; + } + } + + if ( $is_select_statement && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id ) { + $this->position = $position; return false; } + if ( ! $children ) { + return true; + } + if ( $is_fragment ) { + return $children; + } + return new WP_Parser_Node( $rule_id, $this->rule_names[ $rule_id ], $children ); } - $rule_name = $this->grammar->rule_names[ $rule_id ]; - $starting_position = $this->position; - foreach ( $branches as $branch ) { - $this->position = $starting_position; - $node = new WP_Parser_Node( $rule_id, $rule_name ); + $branch_matches = false; + $children = array(); + foreach ( $candidate_branches as $branch ) { + $this->position = $position; + $children = array(); $branch_matches = true; foreach ( $branch as $subrule_id ) { + if ( $subrule_id <= $highest_terminal_id ) { + // The sentinel at $tokens[$token_count] has id 0 so it + // cannot match any real terminal, making the range check + // unnecessary here. + if ( $tokens[ $this->position ]->id === $subrule_id ) { + $children[] = $tokens[ $this->position ]; + ++$this->position; + continue; + } + $branch_matches = false; + break; + } + $subnode = $this->parse_recursive( $subrule_id ); if ( false === $subnode ) { $branch_matches = false; break; - } elseif ( true === $subnode ) { - /* - * The subrule was matched without actually matching a token. 
- * This means a special empty "ε" (epsilon) rule was matched. - * An "ε" rule in a grammar matches an empty input of 0 bytes. - * It is used to represent optional grammar productions. - */ - continue; - } elseif ( is_array( $subnode ) && 0 === count( $subnode ) ) { - continue; } - if ( is_array( $subnode ) && ! count( $subnode ) ) { + if ( true === $subnode ) { continue; } - if ( isset( $this->grammar->fragment_ids[ $subrule_id ] ) ) { - $node->merge_fragment( $subnode ); + if ( is_array( $subnode ) ) { + // Fragment results are returned directly as a children + // array so the parser does not allocate a Parser_Node + // that would immediately be unwrapped into the parent. + foreach ( $subnode as $c ) { + $children[] = $c; + } } else { - $node->append_child( $subnode ); + $children[] = $subnode; } } @@ -100,25 +191,36 @@ private function parse_recursive( $rule_id ) { // for right-associative rules, which could solve this. // See: https://github.com/mysql/mysql-workbench/blob/8.0.38/library/parsers/grammars/MySQLParser.g4#L994 // See: https://github.com/antlr/antlr4/issues/488 - $la = $this->tokens[ $this->position ] ?? null; - if ( $la && 'selectStatement' === $rule_name && WP_MySQL_Lexer::INTO_SYMBOL === $la->id ) { + if ( + $branch_matches + && $is_select_statement + && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id + ) { $branch_matches = false; } - if ( true === $branch_matches ) { + if ( $branch_matches ) { break; } } if ( ! $branch_matches ) { - $this->position = $starting_position; + $this->position = $position; return false; } - if ( ! $node->has_child() ) { + if ( ! $children ) { return true; } - return $node; + // Fragments exist only to group symbols for reuse; their "node" would + // get inlined into the parent on the very next step. Return the raw + // children array so the caller can splice it without allocating a + // throwaway WP_Parser_Node. 
+ if ( $is_fragment ) { + return $children; + } + + return new WP_Parser_Node( $rule_id, $this->rule_names[ $rule_id ], $children ); } } diff --git a/packages/mysql-on-sqlite/tests/tools/bench-compiled-parser.php b/packages/mysql-on-sqlite/tests/tools/bench-compiled-parser.php new file mode 100644 index 00000000..785142e3 --- /dev/null +++ b/packages/mysql-on-sqlite/tests/tools/bench-compiled-parser.php @@ -0,0 +1,92 @@ += $limit ) { + break; + } +} +fclose( $handle ); + +$all_tokens = array(); +foreach ( $queries as $q ) { + $all_tokens[] = ( new WP_MySQL_Lexer( $q ) )->remaining_tokens(); +} +echo 'Loaded ', count( $queries ), " queries\n"; + +function bench( $label, callable $factory, array $tokens_list, $runs ) { + $results = array(); + for ( $r = 0; $r < $runs; $r++ ) { + $fail = 0; + $start = microtime( true ); + foreach ( $tokens_list as $tokens ) { + $parser = $factory( $tokens ); + $ast = $parser->parse(); + if ( null === $ast ) { + ++$fail; + } + } + $dur = microtime( true ) - $start; + $results[] = $dur; + printf( "%-15s run %d: %.4fs, %d QPS, %d failures\n", $label, $r + 1, $dur, count( $tokens_list ) / $dur, $fail ); + } + sort( $results ); + $best = $results[0]; + $avg = array_sum( $results ) / count( $results ); + printf( "%-15s best %.4fs (%d QPS) avg %.4fs (%d QPS)\n", $label, $best, count( $tokens_list ) / $best, $avg, count( $tokens_list ) / $avg ); +} + +bench( + 'interpreted', + fn( $tokens ) => new WP_MySQL_Parser( $grammar, $tokens ), + $all_tokens, + $runs +); +bench( + 'compiled', + fn( $tokens ) => new WP_MySQL_Compiled_Parser( $tokens ), + $all_tokens, + $runs +); diff --git a/packages/mysql-on-sqlite/tests/tools/bench-final.php b/packages/mysql-on-sqlite/tests/tools/bench-final.php new file mode 100644 index 00000000..1dabebcf --- /dev/null +++ b/packages/mysql-on-sqlite/tests/tools/bench-final.php @@ -0,0 +1,61 @@ +remaining_tokens(); +} +$count = count( $queries ); +printf( "Loaded %d queries\n", $count ); + +$durations = array(); +for 
( $i = 0; $i < $runs; $i++ ) { + $start = microtime( true ); + $fail = 0; + foreach ( $all_tokens as $t ) { + if ( null === ( new WP_MySQL_Parser( $grammar, $t ) )->parse() ) { + ++$fail; + } + } + $d = microtime( true ) - $start; + $durations[] = $d; +} +sort( $durations ); +$best = $durations[0]; +$med = $durations[ (int) ( count( $durations ) / 2 ) ]; +$avg = array_sum( $durations ) / count( $durations ); +printf( "best %.4fs %6d QPS\n", $best, $count / $best ); +printf( "med %.4fs %6d QPS\n", $med, $count / $med ); +printf( "avg %.4fs %6d QPS\n", $avg, $count / $avg ); diff --git a/packages/mysql-on-sqlite/tests/tools/bench-hot-rules.php b/packages/mysql-on-sqlite/tests/tools/bench-hot-rules.php new file mode 100644 index 00000000..c15c5f4e --- /dev/null +++ b/packages/mysql-on-sqlite/tests/tools/bench-hot-rules.php @@ -0,0 +1,151 @@ +grammar = $g; + $this->token_count = count( $tokens ); + $tokens[] = new WP_Parser_Token( 0, 0, 0, '' ); + $this->tokens = $tokens; + $this->position = 0; + $this->rule_names = $g->rule_names; + $this->fragment_ids = $g->fragment_ids ?? array(); + $this->branches_for_token = $g->branches_for_token; + $this->nullable_branches = $g->nullable_branches; + $this->highest_terminal_id = $g->highest_terminal_id; + $this->sel_rid = $g->get_rule_id( 'selectStatement' ); + } + public function parse() { + $rid = $this->grammar->get_rule_id( 'query' ); + return $this->r( $rid ); + } + private function r( $rid ) { + self::$counts[ $rid ] = ( self::$counts[ $rid ] ?? 
0 ) + 1; + $tokens = $this->tokens; + $position = $this->position; + $tid = $tokens[ $position ]->id; + if ( isset( $this->branches_for_token[ $rid ][ $tid ] ) ) { + $cb = $this->branches_for_token[ $rid ][ $tid ]; + } elseif ( isset( $this->nullable_branches[ $rid ] ) ) { + return true; + } else { + return false; + } + $htid = $this->highest_terminal_id; + $is_fragment = isset( $this->fragment_ids[ $rid ] ); + $is_sel = $rid === $this->sel_rid; + $ok = false; + $kids = array(); + foreach ( $cb as $branch ) { + $this->position = $position; + $kids = array(); + $ok = true; + foreach ( $branch as $sid ) { + if ( $sid <= $htid ) { + if ( $tokens[ $this->position ]->id === $sid ) { + $kids[] = $tokens[ $this->position ]; + ++$this->position; + continue; + } + $ok = false; + break; + } + $sn = $this->r( $sid ); + if ( false === $sn ) { + $ok = false; + break; + } + if ( true === $sn ) { + continue; + } + if ( is_array( $sn ) ) { + foreach ( $sn as $c ) { + $kids[] = $c; + } + } else { + $kids[] = $sn; + } + } + if ( $ok && $is_sel && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id ) { + $ok = false; + } + if ( $ok ) { + break; + } + } + if ( ! $ok ) { + $this->position = $position; + return false; + } + if ( ! $kids ) { + return true; + } + if ( $is_fragment ) { + return $kids; + } + return new WP_Parser_Node( $rid, $this->rule_names[ $rid ], $kids ); + } +} + +$grammar = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' ); +$handle = fopen( __DIR__ . '/../mysql/data/mysql-server-tests-queries.csv', 'r' ); +$queries = array(); +$header = true; +while ( ( $r = fgetcsv( $handle, null, ',', '"', '\\' ) ) !== false ) { + if ( $header ) { + $header = false; + continue; + } + if ( null !== $r[0] ) { + $queries[] = $r[0]; + } +} +$queries = array_slice( $queries, 0, (int) ( $argv[1] ?? 
10000 ) ); +$all_tokens = array(); +foreach ( $queries as $q ) { + $all_tokens[] = ( new WP_MySQL_Lexer( $q ) )->remaining_tokens(); +} + +foreach ( $all_tokens as $t ) { + ( new HR_Parser( $grammar, $t ) )->parse(); +} +arsort( HR_Parser::$counts ); +$total = array_sum( HR_Parser::$counts ); +$cumsum = 0; +$covered = array(); +$i = 0; +foreach ( HR_Parser::$counts as $rid => $cnt ) { + $cumsum += $cnt; + $covered[ $rid ] = true; + $pct = 100 * $cumsum / $total; + if ( in_array( ++$i, array( 10, 25, 50, 100, 200, 500 ), true ) || $pct >= 80 ) { + printf( "After top %d rules: cumulative %.1f%% (%s of %s calls)\n", $i, $pct, number_format( $cumsum ), number_format( $total ) ); + if ( $pct >= 95 ) { + break; + } + } +} diff --git a/packages/mysql-on-sqlite/tests/tools/bench-parser-split.php b/packages/mysql-on-sqlite/tests/tools/bench-parser-split.php new file mode 100644 index 00000000..107f3cbe --- /dev/null +++ b/packages/mysql-on-sqlite/tests/tools/bench-parser-split.php @@ -0,0 +1,95 @@ += $limit ) { + break; + } +} +fclose( $handle ); +echo 'Loaded ', count( $queries ), " queries\n"; + +// Pre-tokenize all queries once. The tokens are reused across runs, so the +// parser starts from a cold AST cache each iteration but a warm token cache. +$lex_start = microtime( true ); +$all_tokens = array(); +foreach ( $queries as $query ) { + $lexer = new WP_MySQL_Lexer( $query ); + $all_tokens[] = $lexer->remaining_tokens(); +} +$lex_duration = microtime( true ) - $lex_start; +printf( "Lex: %.4fs, %d QPS\n", $lex_duration, count( $queries ) / $lex_duration ); + +// Parse benchmark. 
+$results = array(); +for ( $r = 0; $r < $runs; $r++ ) { + $failures = 0; + $start = microtime( true ); + foreach ( $all_tokens as $tokens ) { + $parser = new WP_MySQL_Parser( $grammar, $tokens ); + $ast = $parser->parse(); + if ( null === $ast ) { + ++$failures; + } + } + $duration = microtime( true ) - $start; + $qps = count( $queries ) / $duration; + $results[] = array( $duration, $qps, $failures ); + printf( "Run %d: %.4fs, %d QPS, %d failures\n", $r + 1, $duration, $qps, $failures ); +} + +if ( $runs > 1 ) { + $durations = array_column( $results, 0 ); + sort( $durations ); + $best = $durations[0]; + printf( "Best: %.4fs, %d QPS\n", $best, count( $queries ) / $best ); + $avg = array_sum( $durations ) / count( $durations ); + printf( "Avg: %.4fs, %d QPS\n", $avg, count( $queries ) / $avg ); +} diff --git a/packages/mysql-on-sqlite/tests/tools/compare-asts.php b/packages/mysql-on-sqlite/tests/tools/compare-asts.php new file mode 100644 index 00000000..41be0f1d --- /dev/null +++ b/packages/mysql-on-sqlite/tests/tools/compare-asts.php @@ -0,0 +1,67 @@ +id . ',' . $n->start . ',' . $n->length . ')'; + } + $out = 'n(' . $n->rule_name; + foreach ( $n->get_children() as $c ) { + $out .= ',' . ast_signature( $c ); + } + return $out . ')'; +} + +$grammar = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' ); +$handle = fopen( __DIR__ . '/../mysql/data/mysql-server-tests-queries.csv', 'r' ); +$header = true; +$limit = (int) ( $argv[1] ?? 
PHP_INT_MAX ); +$n = 0; +$miss = 0; +while ( ( $row = fgetcsv( $handle, null, ',', '"', '\\' ) ) !== false && $n < $limit ) { + if ( $header ) { + $header = false; + continue; + } + if ( null === $row[0] ) { + continue; + } + ++$n; + $tokens1 = ( new WP_MySQL_Lexer( $row[0] ) )->remaining_tokens(); + $tokens2 = ( new WP_MySQL_Lexer( $row[0] ) )->remaining_tokens(); + $a1 = ( new WP_MySQL_Parser( $grammar, $tokens1 ) )->parse(); + $a2 = ( new WP_MySQL_Compiled_Parser( $tokens2 ) )->parse(); + $s1 = ast_signature( $a1 ); + $s2 = ast_signature( $a2 ); + if ( $s1 !== $s2 ) { + ++$miss; + if ( $miss <= 5 ) { + echo "MISMATCH query #$n:\n"; + echo ' ', substr( $row[0], 0, 200 ), "\n"; + echo ' interpreter: ', substr( $s1, 0, 300 ), "\n"; + echo ' compiled: ', substr( $s2, 0, 300 ), "\n"; + } + } +} +echo "Checked $n queries, $miss mismatches.\n"; diff --git a/packages/mysql-on-sqlite/tests/tools/compile-grammar.php b/packages/mysql-on-sqlite/tests/tools/compile-grammar.php new file mode 100644 index 00000000..79ddff8b --- /dev/null +++ b/packages/mysql-on-sqlite/tests/tools/compile-grammar.php @@ -0,0 +1,364 @@ + src/mysql/class-wp-mysql-compiled-parser.php + */ + +require_once __DIR__ . '/../../src/parser/class-wp-parser-grammar.php'; +require_once __DIR__ . '/../../src/parser/class-wp-parser-node.php'; +require_once __DIR__ . '/../../src/parser/class-wp-parser-token.php'; +require_once __DIR__ . '/../../src/parser/class-wp-parser.php'; +require_once __DIR__ . '/../../src/mysql/class-wp-mysql-token.php'; +require_once __DIR__ . '/../../src/mysql/class-wp-mysql-lexer.php'; +require_once __DIR__ . '/../../src/mysql/class-wp-mysql-parser.php'; + +$grammar = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' ); +$query_rid = $grammar->get_rule_id( 'query' ); +$select_rid = $grammar->get_rule_id( 'selectStatement' ); +$htid = $grammar->highest_terminal_id; +$into_symbol = WP_MySQL_Lexer::INTO_SYMBOL; + +// Reachability + fragment reference count. 
+$visited = array(); +$refs = array(); +$queue = array( $query_rid ); +while ( $queue ) { + $r = array_pop( $queue ); + if ( isset( $visited[ $r ] ) ) { + continue; + } + $visited[ $r ] = true; + foreach ( $grammar->rules[ $r ] as $branch ) { + foreach ( $branch as $sym ) { + if ( $sym > $htid ) { + $refs[ $sym ] = ( $refs[ $sym ] ?? 0 ) + 1; + if ( ! isset( $visited[ $sym ] ) ) { + $queue[] = $sym; + } + } + } + } +} + +// Decide which rules get inlined. +// Inline a fragment only if it is reachable AND single-branch (the simple +// case where we can splice its symbols into the parent branch). Multi-branch +// fragments require splatting which can explode parent branch counts; keep +// them as methods for now. +$inline_fragments = array(); +foreach ( $grammar->fragment_ids as $rid => $_ ) { + if ( + isset( $visited[ $rid ] ) + && isset( $grammar->rules[ $rid ] ) + && 1 === count( $grammar->rules[ $rid ] ) + ) { + $inline_fragments[ $rid ] = true; + } +} + +// Rules that will get a method. +$kept = array(); +foreach ( $visited as $rid => $_ ) { + if ( ! isset( $inline_fragments[ $rid ] ) ) { + $kept[ $rid ] = true; + } +} + +/** + * Compute the flattened symbol sequence for a branch, splicing any inlined + * single-branch fragments in place. Cycles fall back to leaving the reference. + */ +$flatten = function ( array $branch ) use ( &$flatten, $grammar, $inline_fragments, $htid ) { + static $expanding = array(); + $out = array(); + foreach ( $branch as $sym ) { + if ( $sym <= $htid ) { + $out[] = $sym; + continue; + } + if ( ! isset( $inline_fragments[ $sym ] ) ) { + $out[] = $sym; + continue; + } + if ( count( $grammar->rules[ $sym ] ) !== 1 ) { + // Multi-branch fragment: keep as call to avoid + // exponential parent-branch explosion. Future work could splat + // selected cases where branch count stays small.
+ $out[] = $sym; + continue; + } + if ( isset( $expanding[ $sym ] ) ) { + $out[] = $sym; + continue; + } + $expanding[ $sym ] = true; + foreach ( $flatten( $grammar->rules[ $sym ][0] ) as $s ) { + $out[] = $s; + } + unset( $expanding[ $sym ] ); + } + return $out; +}; + +/** + * PHP-safe method name for a rule id. + */ +$method_name = function ( $rid ) use ( $grammar ) { + $raw = $grammar->rule_names[ $rid ]; + // Fragment names start with "%" - turn that into "f_". + $clean = '%' === $raw[0] ? 'f_' . substr( $raw, 1 ) : $raw; + $clean = preg_replace( '/[^A-Za-z0-9_]/', '_', $clean ); + return 'r_' . $clean . '_' . $rid; +}; + +/** + * Emit code that matches a single symbol in a branch, appending on success + * and running the caller-supplied $fail_stmt on failure. No `goto` is + * emitted: emit_group_body() passes `break;` for multi-branch groups so a + * failed attempt leaves its do-while(false) wrapper. + * + * For single-branch groups it passes a statement that restores the position + * and returns false immediately. + */ +$emit_symbol = function ( $sym, $indent, $fail_stmt, $skip_check = false ) use ( $grammar, $htid, $inline_fragments, &$method_name, &$flatten, &$emit_symbol ) { + $out = ''; + if ( $sym <= $htid ) { + // Inline terminal match. The caller may tell us the token at the + // current position is already known to match (via switch case + // dispatch), in which case the check is redundant. + if ( ! $skip_check ) { + $out .= $indent . "if (\$tokens[\$this->position]->id !== $sym) $fail_stmt\n"; + } + $out .= $indent . "\$children[] = \$tokens[\$this->position];\n"; + $out .= $indent . "++\$this->position;\n"; + return $out; + } + + $is_fragment = isset( $grammar->fragment_ids[ $sym ] ); + $method = $method_name( $sym ); + $out .= $indent . "\$sub = \$this->$method();\n"; + $out .= $indent .
"if (false === \$sub) $fail_stmt\n"; + $nullable = isset( $grammar->nullable_branches[ $sym ] ); + if ( $is_fragment ) { + if ( $nullable ) { + $out .= $indent . "if (true !== \$sub) { foreach (\$sub as \$c) \$children[] = \$c; }\n"; + } else { + $out .= $indent . "foreach (\$sub as \$c) \$children[] = \$c;\n"; + } + } else { + if ( $nullable ) { + $out .= $indent . "if (true !== \$sub) \$children[] = \$sub;\n"; + } else { + $out .= $indent . "\$children[] = \$sub;\n"; + } + } + return $out; +}; + +/** + * Emit the body of a rule method. + */ +$emit_method = function ( $rid ) use ( $grammar, $htid, $select_rid, $into_symbol, $inline_fragments, &$method_name, &$flatten, &$emit_symbol ) { + $name = $method_name( $rid ); + $is_fragment = isset( $grammar->fragment_ids[ $rid ] ); + $is_select = $rid === $select_rid; + $rule_name = $grammar->rule_names[ $rid ]; + $nullable = isset( $grammar->nullable_branches[ $rid ] ); + + // Per-token selector. Entries are lists of branch symbol sequences (the + // runtime format). Group tokens whose branch list is identical so their + // switch cases share a body. + $selector = $grammar->branches_for_token[ $rid ] ?? array(); + $groups = array(); + foreach ( $selector as $tid => $branch_seqs ) { + $sig_parts = array(); + foreach ( $branch_seqs as $seq ) { + $sig_parts[] = implode( ',', $seq ); + } + $key = implode( '|', $sig_parts ); + $groups[ $key ]['branches'] = $branch_seqs; + $groups[ $key ]['tids'][] = $tid; + } + + $code = "\tprivate function $name() {\n"; + $code .= "\t\t\$tokens = \$this->tokens;\n"; + $code .= "\t\t\$position = \$this->position;\n"; + $code .= "\t\t\$tid = \$tokens[\$position]->id;\n"; + + // "One of N terminals" fast path. When every branch is a single + // terminal, the entire rule collapses to: check accept set, consume + // one token, return. A rule like `%f1282` (406 terminal choices) + // compiles to ~8 lines instead of ~2.8k. 
+ $all_single_terminal = true; + $accept = array(); + foreach ( $grammar->rules[ $rid ] as $b ) { + if ( 1 !== count( $b ) || $b[0] > $htid || 0 === $b[0] ) { + $all_single_terminal = false; + break; + } + $accept[ $b[0] ] = true; + } + if ( $all_single_terminal && $accept ) { + $keys = array_keys( $accept ); + sort( $keys ); + $lookup = '[' . implode( '=>1,', $keys ) . '=>1]'; + $code .= "\t\tstatic \$ok = $lookup;\n"; + $code .= "\t\tif (!isset(\$ok[\$tid])) return " . ( $nullable ? 'true' : 'false' ) . ";\n"; + $code .= "\t\t\$t = \$tokens[\$position];\n"; + $code .= "\t\t\$this->position = \$position + 1;\n"; + if ( $is_select ) { + // selectStatement is never single-terminal, but guard anyway. + $code .= "\t\tif (\$tokens[\$position + 1]->id === $into_symbol) { \$this->position = \$position; return false; }\n"; + } + if ( $is_fragment ) { + $code .= "\t\treturn array(\$t);\n"; + } else { + $code .= "\t\treturn new WP_Parser_Node($rid, " . var_export( $rule_name, true ) . ", array(\$t));\n"; + } + $code .= "\t}\n"; + return $code; + } + + if ( count( $groups ) === 1 ) { + // All accepting tokens reach the same branch list. A bare isset() + // check against a shared lookup table is much smaller than the + // equivalent 200-way switch case list and lets PHP resolve + // dispatch in a single hash lookup. + $only = reset( $groups ); + $tids = $only['tids']; + sort( $tids ); + $lookup = '[' . implode( '=>1,', $tids ) . '=>1]'; + $code .= "\t\tstatic \$first = $lookup;\n"; + $code .= "\t\tif (!isset(\$first[\$tid])) return " . ( $nullable ? 'true' : 'false' ) . ";\n"; + // We cannot hand $known_tids here: the single-branch-group fast + // path covers many tokens, so the branch's first symbol may not be + // a specific one of them. 
+ $code .= emit_group_body( $only['branches'], $grammar, $rid, $rule_name, $is_fragment, $is_select, $into_symbol, $htid, $inline_fragments, $method_name, $flatten, $emit_symbol, false ); + // All branches failed; emit_group_body already reset the position. + $code .= "\t\treturn " . ( $nullable ? 'true' : 'false' ) . ";\n"; + } else { + $code .= "\t\tswitch (\$tid) {\n"; + foreach ( $groups as $g ) { + // Pack case labels onto as few lines as practical (~10 per + // line); single-label cases on their own line for readability. + $tids = $g['tids']; + $chunks = array_chunk( $tids, 10 ); + foreach ( $chunks as $chunk ) { + $code .= "\t\t\t" . implode( ' ', array_map( fn( $t ) => "case $t:", $chunk ) ) . "\n"; + } + $code .= emit_group_body( $g['branches'], $grammar, $rid, $rule_name, $is_fragment, $is_select, $into_symbol, $htid, $inline_fragments, $method_name, $flatten, $emit_symbol, true, $g['tids'] ); + } + $code .= "\t\t}\n"; + $code .= "\t\treturn " . ( $nullable ? 'true' : 'false' ) . ";\n"; + } + $code .= "\t}\n"; + return $code; +}; + +function emit_group_body( array $branch_seqs, WP_Parser_Grammar $g, $rid, $rule_name, $is_fragment, $is_select, $into_symbol, $htid, $inline_fragments, $method_name, $flatten, $emit_symbol, $in_switch = true, array $known_tids = array() ) { + $indent = $in_switch ? "\t\t\t\t" : "\t\t"; + $out = ''; + $count = count( $branch_seqs ); + + foreach ( $branch_seqs as $n => $raw_branch ) { + $branch = $flatten( $raw_branch ); + $is_last = ( $n === $count - 1 ); + + // The switch dispatch guarantees the current token matches a case + // label, so if there's exactly one label and the branch starts + // with that same terminal we can skip the redundant id check. + $first_is_known_terminal = false; + if ( count( $known_tids ) === 1 && $branch && $branch[0] === $known_tids[0] ) { + $first_is_known_terminal = true; + } + + if ( $count > 1 ) { + // Multi-branch: wrap each attempt in do-while(false). 
Break + // falls through to the next attempt; the final break falls + // through to the switch-level break / rule-level fall-through. + $out .= $indent . "do {\n"; + $inner_indent = $indent . "\t"; + $fail_stmt = 'break;'; + $out .= $inner_indent . "\$children = array();\n"; + $out .= $inner_indent . "\$this->position = \$position;\n"; + foreach ( $branch as $i => $sym ) { + $skip_check = ( 0 === $i && $first_is_known_terminal ); + $out .= $emit_symbol( $sym, $inner_indent, $fail_stmt, $skip_check ); + } + if ( $is_select ) { + $out .= $inner_indent . "if (\$tokens[\$this->position]->id === $into_symbol) break;\n"; + } + $out .= emit_branch_return( $inner_indent, $rid, $rule_name, $is_fragment ); + $out .= $indent . "} while (false);\n"; + } else { + // Single branch: no alternatives to try, just inline. + $out .= $indent . "\$children = array();\n"; + $fail_stmt = '{ $this->position = $position; return false; }'; + foreach ( $branch as $i => $sym ) { + $skip_check = ( 0 === $i && $first_is_known_terminal ); + $out .= $emit_symbol( $sym, $indent, $fail_stmt, $skip_check ); + } + if ( $is_select ) { + $out .= $indent . "if (\$tokens[\$this->position]->id === $into_symbol) { \$this->position = \$position; return false; }\n"; + } + $out .= emit_branch_return( $indent, $rid, $rule_name, $is_fragment ); + if ( $in_switch ) { + $out .= $indent . "break;\n"; + } + return $out; + } + } + // Multi-branch group fell through all do-while attempts: reset and + // break out of the switch (or return to the rule-level fallback). + $out .= $indent . "\$this->position = \$position;\n"; + if ( $in_switch ) { + $out .= $indent . "break;\n"; + } + return $out; +} + +function emit_branch_return( $indent, $rid, $rule_name, $is_fragment ) { + $out = ''; + $out .= $indent . "if (!\$children) return true;\n"; + if ( $is_fragment ) { + $out .= $indent . "return \$children;\n"; + } else { + $out .= $indent . 'return new WP_Parser_Node(' . $rid . ', ' . var_export( $rule_name, true ) . 
", \$children);\n"; + } + return $out; +} + +// Emit the class. The generated parser is self-contained: it bakes every +// FIRST set, rule name, and branch structure into the emitted code, so no +// WP_Parser_Grammar has to be loaded at runtime. +echo "tokens = \$tokens;\n"; +echo "\t\t\$this->position = 0;\n"; +echo "\t}\n\n"; +echo "\tpublic function parse() {\n"; +echo "\t\t\$ast = \$this->" . $method_name( $query_rid ) . "();\n"; +echo "\t\treturn false === \$ast ? null : \$ast;\n"; +echo "\t}\n\n"; + +// Sort for deterministic output. +ksort( $kept ); +foreach ( $kept as $rid => $_ ) { + echo $emit_method( $rid ); + echo "\n"; +} + +echo "}\n"; diff --git a/packages/mysql-on-sqlite/tests/tools/dump-inflated-grammar.php b/packages/mysql-on-sqlite/tests/tools/dump-inflated-grammar.php new file mode 100644 index 00000000..88b7f370 --- /dev/null +++ b/packages/mysql-on-sqlite/tests/tools/dump-inflated-grammar.php @@ -0,0 +1,27 @@ + /tmp/mysql-grammar-inflated.php + */ + +require_once __DIR__ . '/../../src/parser/class-wp-parser-grammar.php'; + +$g = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' ); + +$data = array( + 'rules' => $g->rules, + 'rule_names' => $g->rule_names, + 'fragment_ids' => $g->fragment_ids ?? array(), + 'branches_for_token' => $g->branches_for_token, + 'nullable_branches' => $g->nullable_branches, + 'lowest_non_terminal_id' => $g->lowest_non_terminal_id, + 'highest_terminal_id' => $g->highest_terminal_id, +); + +echo "pcre2_compile_8( + FFI::cast( 'PCRE2_SPTR8', FFI::addr( FFI::new( 'char[' . strlen( $pat_buf ) . ']' ) ) ), + 0, // We'll set length below in real code. + 0, + FFI::addr( $err_code ), + FFI::addr( $err_off ), + null +); + +// The above is wrong because we didn't actually copy the pattern bytes +// into the buffer. Let's do it properly. +$pat_arr = $ffi->new( 'char[' . strlen( $pat_buf ) . 
']' ); +FFI::memcpy( $pat_arr, $pat_buf, strlen( $pat_buf ) ); +$code = $ffi->pcre2_compile_8( + FFI::cast( 'PCRE2_SPTR8', FFI::addr( $pat_arr ) ), + strlen( $pat_buf ), + 0, + FFI::addr( $err_code ), + FFI::addr( $err_off ), + null +); +if ( null === $code ) { + $buf = $ffi->new( 'char[256]' ); + $ffi->pcre2_get_error_message_8( $err_code->cdata, FFI::cast( 'PCRE2_UCHAR8 *', FFI::addr( $buf ) ), 256 ); + echo 'compile failed: code=', $err_code->cdata, ' offset=', $err_off->cdata, ' msg=', FFI::string( FFI::addr( $buf ) ), "\n"; + exit( 1 ); +} +echo "Pattern compiled OK\n"; + +// Try setting up a callout via FFI. +$callout_log = array(); +$mctx = $ffi->pcre2_match_context_create_8( null ); +$callout_cb = function ( $blockptr, $data ) use ( &$callout_log ) { + // $blockptr is FFI\CData type pcre2_callout_block_8*. + $blk = $blockptr; + $callout_log[] = array( + 'num' => $blk->callout_number, + 'pos' => $blk->current_position, + 'mat' => $blk->start_match, + ); + return 0; // continue matching +}; +// Cast our PHP closure to a C function pointer. PHP FFI supports this +// for callbacks via `FFI::cast` on a closure. +$cb_type = 'int (*)(pcre2_callout_block_8 *, void *)'; +echo "Trying to bind callout callback...\n"; +try { + $cb_ffi = $ffi->new( $cb_type ); + echo "Callback type created.\n"; + // PHP FFI does not directly support binding a closure to a function + // pointer in arbitrary C signatures - this typically needs a Zend + // FFI extension feature or libffi closures. +} catch ( \Throwable $e ) { + echo 'Could not bind: ', $e->getMessage(), "\n"; +} + +// Even attempting to call pcre2_set_callout_8 with a closure tends to +// fail. Document and stop. 
+echo "\nConclusion: PHP FFI cannot bind a PHP callback to a C function pointer in stock PHP, so it cannot supply a PCRE2 callout function.\n"; diff --git a/packages/mysql-on-sqlite/tests/tools/exp-regex-hybrid.php b/packages/mysql-on-sqlite/tests/tools/exp-regex-hybrid.php new file mode 100644 index 00000000..e7bc5902 --- /dev/null +++ b/packages/mysql-on-sqlite/tests/tools/exp-regex-hybrid.php @@ -0,0 +1,231 @@ +lowest_non_terminal_id; + $rules = $grammar->rules; + $nullable = array(); + $first = array(); + foreach ( $rules as $rid => $_ ) { + $nullable[ $rid ] = false; + $first[ $rid ] = array(); + } + do { + $changed = false; + foreach ( $rules as $rid => $branches ) { + foreach ( $branches as $branch ) { + $bn = true; + foreach ( $branch as $sym ) { + if ( $sym < $low_nt ) { + if ( ! isset( $first[ $rid ][ $sym ] ) ) { + $first[ $rid ][ $sym ] = true; + $changed = true; + } + $bn = false; + break; + } + foreach ( $first[ $sym ] as $tid => $_ ) { + if ( ! isset( $first[ $rid ][ $tid ] ) ) { + $first[ $rid ][ $tid ] = true; + $changed = true; + } + } + if ( ! $nullable[ $sym ] ) { + $bn = false; + break; + } + } + if ( $bn && ! $nullable[ $rid ] ) { + $nullable[ $rid ] = true; + $changed = true; + } + } + } + } while ( $changed ); + + $single_candidate_rules = $grammar->single_candidate_rules ?? 
array(); + $select_rid = $grammar->get_rule_id( 'selectStatement' ); + $into_char = mb_chr( WP_MySQL_Lexer::INTO_SYMBOL + TOKEN_OFFSET, 'UTF-8' ); + + $compiled = array(); + $compile = function ( $rid ) use ( &$compile, &$compiled, $rules, $low_nt, $single_candidate_rules, $select_rid, $into_char ) { + if ( isset( $compiled[ $rid ] ) ) { + return $compiled[ $rid ]; + } + $alts = array(); + $st = isset( $single_candidate_rules[ $rid ] ); + foreach ( $rules[ $rid ] as $branch ) { + $alt = ''; + foreach ( $branch as $i => $sym ) { + if ( $sym < $low_nt ) { + $alt .= mb_chr( $sym + TOKEN_OFFSET, 'UTF-8' ); + } else { + $alt .= "RREF{$sym}RREF"; + } + if ( 0 === $i && $st ) { + $alt .= '(*THEN)'; + } + } + $alts[] = $alt; + } + $body = '(?:' . implode( '|', $alts ) . ')'; + if ( $rid === $select_rid ) { + $body .= '(?!' . $into_char . ')'; + } + $compiled[ $rid ] = $body; + return $compiled[ $rid ]; + }; + foreach ( array_keys( $rules ) as $rid ) { + $compile( $rid ); + } + + // Inline single-use rules. + do { + $changed = false; + $refs = array(); + foreach ( $compiled as $rid => $_ ) { + $refs[ $rid ] = 0; + } + foreach ( $compiled as $rid => $body ) { + if ( preg_match_all( '/RREF(\d+)RREF/', $body, $m ) ) { + foreach ( $m[1] as $r ) { + $refs[ (int) $r ] = ( $refs[ (int) $r ] ?? 0 ) + 1; + } + } + } + foreach ( $compiled as $rid => $body ) { + if ( ( $refs[ $rid ] ?? 
0 ) !== 1 || strpos( $body, "RREF{$rid}RREF" ) !== false ) { + continue; + } + foreach ( $compiled as $cr => $cb ) { + if ( strpos( $cb, "RREF{$rid}RREF" ) !== false ) { + $compiled[ $cr ] = str_replace( "RREF{$rid}RREF", $body, $cb ); + unset( $compiled[ $rid ] ); + $changed = true; + break 2; + } + } + } + } while ( $changed ); + + $rule_to_idx = array(); + foreach ( $compiled as $rid => $_ ) { + $rule_to_idx[ $rid ] = count( $rule_to_idx ); + } + $define = ''; + foreach ( $compiled as $rid => $body ) { + $body = preg_replace_callback( + '/RREF(\d+)RREF/', + function ( $m ) use ( $rule_to_idx ) { + return '(?&r' . $rule_to_idx[ (int) $m[1] ] . ')'; + }, + $body + ); + $define .= "(?{$body})"; + } + $start_rid = $grammar->get_rule_id( 'query' ); + return '/(?(DEFINE)' . $define . ')\\A(?&r' . $rule_to_idx[ $start_rid ] . ')\\z/u'; +} + +$grammar = new WP_Parser_Grammar( require __DIR__ . '/../../src/mysql/mysql-grammar.php' ); +$pattern = compile_regex( $grammar ); + +ini_set( 'pcre.backtrack_limit', '1000000000' ); +ini_set( 'pcre.recursion_limit', '10000000' ); +ini_set( 'pcre.jit', '1' ); +ini_set( 'pcre.jit_stacksize', '32M' ); + +$handle = fopen( __DIR__ . '/../mysql/data/mysql-server-tests-queries.csv', 'r' ); +$queries = array(); +$header = true; +while ( ( $r = fgetcsv( $handle, null, ',', '"', '\\' ) ) !== false ) { + if ( $header ) { + $header = false; + continue; + } + if ( null !== $r[0] ) { + $queries[] = $r[0]; + } +} +$queries = array_slice( $queries, 0, (int) ( $argv[1] ?? 10000 ) ); + +// Pre-tokenize and pre-encode. +$pairs = array(); +foreach ( $queries as $q ) { + $tokens = ( new WP_MySQL_Lexer( $q ) )->remaining_tokens(); + $enc = ''; + foreach ( $tokens as $t ) { + $enc .= mb_chr( $t->id + TOKEN_OFFSET, 'UTF-8' ); + } + $pairs[] = array( $tokens, $enc ); +} +printf( "Loaded %d queries\n", count( $pairs ) ); + +// 1. Just regex match. 
+$start = microtime( true ); +$ok = 0; +foreach ( $pairs as $p ) { + if ( @preg_match( $pattern, $p[1] ) === 1 ) { + ++$ok; + } +} +$d = microtime( true ) - $start; +printf( "regex only: %.4fs (%d QPS, %d/%d match)\n", $d, count( $pairs ) / $d, $ok, count( $pairs ) ); + +// 2. Just parser (build AST). +$start = microtime( true ); +$ok = 0; +foreach ( $pairs as $p ) { + if ( ( new WP_MySQL_Parser( $grammar, $p[0] ) )->parse() ) { + ++$ok; + } +} +$d = microtime( true ) - $start; +printf( "parser only (AST): %.4fs (%d QPS, %d/%d match)\n", $d, count( $pairs ) / $d, $ok, count( $pairs ) ); + +// 3. Hybrid: regex first; on success run the parser to build AST. Pure +// overhead: same parser runs, plus the regex. +$start = microtime( true ); +$ok = 0; +$regex_failed = 0; +foreach ( $pairs as $p ) { + if ( @preg_match( $pattern, $p[1] ) !== 1 ) { + ++$regex_failed; + continue; + } + if ( ( new WP_MySQL_Parser( $grammar, $p[0] ) )->parse() ) { + ++$ok; + } +} +$d = microtime( true ) - $start; +printf( + "regex + parser: %.4fs (%d QPS, %d/%d match, %d regex-rejected)\n", + $d, + count( $pairs ) / $d, + $ok, + count( $pairs ), + $regex_failed +); diff --git a/packages/mysql-on-sqlite/tests/tools/exp-regex-v3.php b/packages/mysql-on-sqlite/tests/tools/exp-regex-v3.php new file mode 100644 index 00000000..256c51e0 --- /dev/null +++ b/packages/mysql-on-sqlite/tests/tools/exp-regex-v3.php @@ -0,0 +1,288 @@ +lowest_non_terminal_id; + +// Count how many times each rule is referenced. +function ref_counts( WP_Parser_Grammar $g ) { + $low_nt = $g->lowest_non_terminal_id; + $refs = array(); + foreach ( $g->rules as $rid => $branches ) { + $refs[ $rid ] = 0; + } + foreach ( $g->rules as $rid => $branches ) { + foreach ( $branches as $b ) { + foreach ( $b as $sym ) { + if ( $sym >= $low_nt ) { + $refs[ $sym ] = ( $refs[ $sym ] ?? 0 ) + 1; + } + } + } + } + return $refs; +} + +// FIRST and NULLABLE. 
+$rules = $grammar->rules; +$nullable = array(); +$first = array(); +foreach ( $rules as $rid => $_ ) { + $nullable[ $rid ] = false; + $first[ $rid ] = array(); +} +do { + $changed = false; + foreach ( $rules as $rid => $branches ) { + foreach ( $branches as $branch ) { + $bn = true; + foreach ( $branch as $sym ) { + if ( $sym < $low_nt ) { + if ( ! isset( $first[ $rid ][ $sym ] ) ) { + $first[ $rid ][ $sym ] = true; + $changed = true; + } + $bn = false; + break; + } + foreach ( $first[ $sym ] as $tid => $_ ) { + if ( ! isset( $first[ $rid ][ $tid ] ) ) { + $first[ $rid ][ $tid ] = true; + $changed = true; + } + } + if ( ! $nullable[ $sym ] ) { + $bn = false; + break; + } + } + if ( $bn && ! $nullable[ $rid ] ) { + $nullable[ $rid ] = true; + $changed = true; + } + } + } +} while ( $changed ); + +// Compile each rule into a "regex body" string. Inline single-use +// non-recursive rules into their callers transitively via memoization. +$single_candidate_rules = $grammar->single_candidate_rules ?? array(); +$select_rid = $grammar->get_rule_id( 'selectStatement' ); +$into_char = token_char( WP_MySQL_Lexer::INTO_SYMBOL ); +$compiled = array(); +$visiting = array(); +$compile_rule = function ( $rid ) use ( &$compile_rule, &$compiled, &$visiting, $rules, $first, $nullable, $low_nt, $single_candidate_rules, $select_rid, $into_char ) { + if ( isset( $compiled[ $rid ] ) ) { + return $compiled[ $rid ]; + } + $visiting[ $rid ] = true; + $alts = array(); + $safe_then = isset( $single_candidate_rules[ $rid ] ); + foreach ( $rules[ $rid ] as $branch ) { + $alt = ''; + foreach ( $branch as $i => $sym ) { + if ( $sym < $low_nt ) { + $alt .= token_char( $sym ); + } else { + $alt .= "RREF{$sym}RREF"; + } + // (*THEN) commits the alternative once the first symbol matches. + // Only safe when sibling branches of this rule have disjoint + // FIRST sets - that property is captured by + // $grammar->single_candidate_rules. 
Outside that set, multiple + // branches can share a first token and committing prematurely + // would yield spurious match failures. + if ( 0 === $i && $safe_then ) { + $alt .= '(*THEN)'; + } + } + $alts[] = $alt; + } + unset( $visiting[ $rid ] ); + $body = '(?:' . implode( '|', $alts ) . ')'; + if ( $rid === $select_rid ) { + // Mirror the negative lookahead the parser uses: a successful + // selectStatement match must not be followed by INTO. Otherwise + // the surrounding rule should pick a different alternative. + $body .= '(?!' . $into_char . ')'; + } + $compiled[ $rid ] = $body; + return $compiled[ $rid ]; +}; + +// First pass: compile every rule once. +foreach ( array_keys( $rules ) as $rid ) { + $compile_rule( $rid ); +} + +// Second pass: inline single-use non-recursive rules. A rule is +// inlinable if its body doesn't reference itself transitively. Repeat +// to fixpoint - inlining changes ref counts. +$inlined_count = 0; +do { + $changed = false; + $refs = array(); + foreach ( $compiled as $rid => $body ) { + $refs[ $rid ] = 0; + } + foreach ( $compiled as $rid => $body ) { + if ( preg_match_all( '/RREF(\d+)RREF/', $body, $m ) ) { + foreach ( $m[1] as $r ) { + $refs[ (int) $r ] = ( $refs[ (int) $r ] ?? 0 ) + 1; + } + } + } + foreach ( $compiled as $rid => $body ) { + if ( ( $refs[ $rid ] ?? 0 ) !== 1 ) { + continue; + } + // Don't inline recursive rules. + if ( strpos( $body, "RREF{$rid}RREF" ) !== false ) { + continue; + } + // Replace the single reference somewhere. + foreach ( $compiled as $caller_rid => $caller_body ) { + if ( strpos( $caller_body, "RREF{$rid}RREF" ) !== false ) { + $compiled[ $caller_rid ] = str_replace( "RREF{$rid}RREF", $body, $caller_body ); + unset( $compiled[ $rid ] ); + ++$inlined_count; + $changed = true; + break 2; // restart from top so refs recount with the new state + } + } + } +} while ( $changed ); + +// Now compile remaining rules with named subroutines. 
+$rule_to_idx = array(); +$idx_to_rule = array(); +foreach ( $compiled as $rid => $_ ) { + $rule_to_idx[ $rid ] = count( $idx_to_rule ); + $idx_to_rule[] = $rid; +} + +$define = ''; +foreach ( $idx_to_rule as $rid ) { + $body = $compiled[ $rid ]; + // Replace RREF placeholders with named-group references. + $body = preg_replace_callback( + '/RREF(\d+)RREF/', + function ( $m ) use ( $rule_to_idx ) { + $rid = (int) $m[1]; + return '(?&r' . $rule_to_idx[ $rid ] . ')'; + }, + $body + ); + $define .= "(?{$body})"; +} + +$start_rid = $grammar->get_rule_id( 'query' ); +$pattern = '/(?(DEFINE)' . $define . ')\\A(?&r' . $rule_to_idx[ $start_rid ] . ')\\z/u'; +printf( + "Inlined %d rules. Final rules: %d. Pattern: %s bytes\n", + $inlined_count, + count( $idx_to_rule ), + number_format( strlen( $pattern ) ) +); + +ini_set( 'pcre.backtrack_limit', '1000000000' ); +ini_set( 'pcre.recursion_limit', '10000000' ); +ini_set( 'pcre.jit', '1' ); + +$t = microtime( true ); +$ok = @preg_match( $pattern, "\xff", $m ); +printf( + "Compile: %.2fms, ok=%s, err=%s\n", + ( microtime( true ) - $t ) * 1000, + var_export( $ok, true ), + preg_last_error_msg() +); +if ( false === $ok && PREG_BAD_UTF8_ERROR !== preg_last_error() ) { + echo "Pattern doesn't compile cleanly. Bailing.\n"; + exit( 1 ); +} + +$handle = fopen( __DIR__ . '/../mysql/data/mysql-server-tests-queries.csv', 'r' ); +$queries = array(); +$header = true; +while ( ( $r = fgetcsv( $handle, null, ',', '"', '\\' ) ) !== false ) { + if ( $header ) { + $header = false; + continue; } + if ( null !== $r[0] ) { + $queries[] = $r[0]; + } +} +$queries = array_slice( $queries, 0, (int) ( $argv[1] ?? 
5000 ) ); + +$encoded = array(); +foreach ( $queries as $q ) { + $tokens = ( new WP_MySQL_Lexer( $q ) )->remaining_tokens(); + $s = ''; + foreach ( $tokens as $t ) { + $s .= token_char( $t->id ); + } + $encoded[] = $s; +} + +$t = microtime( true ); +$matched = 0; +$failed = 0; +$errors = 0; +$failed_examples = array(); +$slow = array(); +foreach ( $encoded as $i => $s ) { + $qstart = microtime( true ); + $r = @preg_match( $pattern, $s ); + $qd = microtime( true ) - $qstart; + if ( 1 === $r ) { + ++$matched; + } elseif ( 0 === $r ) { + ++$failed; + if ( count( $failed_examples ) < 10 ) { + $failed_examples[] = substr( str_replace( "\n", ' ', $queries[ $i ] ), 0, 120 ); + } + } else { + ++$errors; } + if ( $qd > 0.005 && count( $slow ) < 3 ) { + $slow[] = sprintf( '%6.0fms: %s', $qd * 1000, substr( str_replace( "\n", ' ', $queries[ $i ] ), 0, 100 ) ); + } +} +$d = microtime( true ) - $t; +printf( + "Matched=%d, Failed=%d, Errors=%d, time=%.4fs (%d QPS)\n", + $matched, + $failed, + $errors, + $d, + count( $encoded ) / $d +); +echo "\nFailed queries:\n"; +foreach ( $failed_examples as $e ) { + echo " $e\n"; +} +echo "\nSlow queries:\n"; +foreach ( $slow as $e ) { + echo " $e\n"; +}