diff --git a/.flake8 b/.flake8 deleted file mode 100644 index ea058021..00000000 --- a/.flake8 +++ /dev/null @@ -1,3 +0,0 @@ -[flake8] -max-line-length = 88 -max-complexity = 8 \ No newline at end of file diff --git a/.github/workflows/auto-merge-dependabot.yml b/.github/workflows/auto-merge-dependabot.yml index 6c73eb04..93779f1c 100644 --- a/.github/workflows/auto-merge-dependabot.yml +++ b/.github/workflows/auto-merge-dependabot.yml @@ -23,9 +23,7 @@ jobs: if: "${{ steps.metadata.outputs.update-type == 'version-update:semver-minor' || steps.metadata.outputs.update-type == - 'version-update:semver-patch' || - steps.metadata.outputs.dependency-names == - 'black' }}" + 'version-update:semver-patch' }}" # https://cli.github.com/manual/gh_pr_merge run: gh pr merge --auto --squash "$PR_URL" diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml deleted file mode 100644 index 2e7dab02..00000000 --- a/.github/workflows/black.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: Code formatting - -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] - -jobs: - build: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v6 - - name: Set up Python - uses: actions/setup-python@v6 - with: - python-version: 3.x - - - name: Install black - run: | - # black = "^21.7b0" - export BLACK_VERSION=$(grep black pyproject.toml | egrep -o '\^[0-9a-z.]+' | sed 's/\^//g') - - set -x - pip install black==${BLACK_VERSION} - - # https://pypi.org/project/black/ - - name: Check code formatting - run: | - black --check . 
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000..6f6a09a2 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,30 @@ +name: Lint + +on: + push: + branches: [ master ] + pull_request: + +jobs: + lint: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v6 + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + + - name: Install Poetry + uses: snok/install-poetry@v1.4.1 + with: + version: latest + virtualenvs-create: true + virtualenvs-in-project: true + + - name: Install dependencies with poetry + run: poetry install --no-root + + - name: Lint with ruff + run: make lint diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 8d3532b1..95990755 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -42,9 +42,6 @@ jobs: - name: Install Python wheel support to speed up things run: pip install wheel - - name: Pre-install black - run: pip install black - # https://github.com/marketplace/actions/install-poetry-action - name: Install Poetry uses: snok/install-poetry@v1.4.1 @@ -77,8 +74,5 @@ jobs: pip install coveralls poetry run coveralls --service=github - - name: Lint with pylint - run: make lint - - name: Build a distribution package run: poetry build -vvv diff --git a/.github/workflows/type-check.yml b/.github/workflows/type-check.yml new file mode 100644 index 00000000..6d3383de --- /dev/null +++ b/.github/workflows/type-check.yml @@ -0,0 +1,30 @@ +name: Type Check + +on: + push: + branches: [ master ] + pull_request: + +jobs: + type-check: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v6 + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + + - name: Install Poetry + uses: snok/install-poetry@v1.4.1 + with: + version: latest + virtualenvs-create: true + virtualenvs-in-project: true + + - name: Install dependencies with poetry + run: poetry install 
--no-root + + - name: Type check with mypy + run: make type_check diff --git a/AGENTS.md b/AGENTS.md index 0abed2d6..82922557 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -13,31 +13,78 @@ This file contains important information about the sql-metadata repository for A **Technology Stack:** - Python 3.10+ -- sqlparse library for tokenization +- sqlglot library for SQL parsing and AST construction +- sqlparse used only for legacy tokenization fallback - Poetry for dependency management - pytest for testing -- flake8 and pylint for linting +- ruff for linting and formatting ## Repository Structure ``` sql-metadata/ -├── sql_metadata/ # Main package -│ ├── parser.py # Core Parser class -│ ├── token.py # SQLToken and EmptyToken classes -│ ├── keywords_lists.py # SQL keyword definitions -│ └── __init__.py -├── test/ # Test suite +├── sql_metadata/ # Main package +│ ├── parser.py # Public facade — Parser class +│ ├── ast_parser.py # ASTParser — thin orchestrator, composes SqlCleaner + DialectParser +│ ├── sql_cleaner.py # SqlCleaner — raw SQL preprocessing (no sqlglot dependency) +│ ├── dialect_parser.py # DialectParser — dialect detection, parsing, quality validation +│ ├── column_extractor.py # ColumnExtractor — single-pass DFS column/alias extraction +│ ├── table_extractor.py # TableExtractor — table extraction with position sorting +│ ├── nested_resolver.py # NestedResolver — CTE/subquery names, bodies, resolution +│ ├── query_type_extractor.py # QueryTypeExtractor — query type detection +│ ├── comments.py # Comment extraction/stripping (pure functions) +│ ├── keywords_lists.py # QueryType/TokenType enums, keyword sets +│ ├── utils.py # UniqueList, flatten_list, shared helpers +│ ├── generalizator.py # Query anonymisation +│ └── __init__.py # Exports: Parser, QueryType +├── test/ # Test suite (25 test files) │ ├── test_with_statements.py │ ├── test_getting_tables.py │ ├── test_getting_columns.py -│ └── ... 
(30+ test files) -├── pyproject.toml # Poetry configuration -├── Makefile # Common commands -├── .flake8 # Flake8 configuration +│ └── ... +├── ARCHITECTURE.md # Detailed architecture docs with Mermaid diagrams +├── pyproject.toml # Poetry configuration +├── Makefile # Common commands └── README.md ``` +## Architecture Overview + +The v3 architecture uses sqlglot to build an AST, then walks it with specialised extractor classes composed by a thin `Parser` facade. See [ARCHITECTURE.md](ARCHITECTURE.md) for detailed module deep dives, traced walkthroughs, and Mermaid diagrams. + +### Pipeline + +``` +Raw SQL → SqlCleaner (preprocessing) + → DialectParser (dialect detection, sqlglot.parse()) + → sqlglot AST (cached by ASTParser) + → TableExtractor (tables, table aliases) + → ColumnExtractor (columns, column aliases — single-pass DFS) + → NestedResolver (CTE/subquery names + bodies, column resolution) + → Final metadata (cached on Parser) +``` + +### Key Design Patterns + +- **Composition over inheritance** — `Parser` composes `ASTParser`, `TableExtractor`, `ColumnExtractor`, `NestedResolver`, `QueryTypeExtractor` +- **Lazy evaluation with caching** — properties compute on first access, cache the result +- **Single-pass DFS** — `ColumnExtractor` walks AST in `arg_types` key order (mirrors SQL text order) +- **Multi-dialect retry** — `ASTParser` tries several sqlglot dialects, picks first non-degraded result +- **Graceful regex fallbacks** — degrades to regex when sqlglot parse fails + +### Class Responsibilities + +| Class | Owns | Does NOT own | +|-------|------|-------------| +| `Parser` | Facade, caching, regex fallbacks, value extraction | No extraction logic | +| `ASTParser` | Orchestration, lazy AST caching | No preprocessing, no parsing | +| `SqlCleaner` | Raw SQL preprocessing (REPLACE rewrite, comment strip, CTE normalisation) | No AST, no sqlglot | +| `DialectParser` | Dialect detection, sqlglot parsing, parse-quality validation | No preprocessing | +| 
`ColumnExtractor` | Column names, column aliases (during DFS walk) | CTE/subquery name extraction (standalone) | +| `TableExtractor` | Table names, table aliases, position sorting | Nothing else | +| `NestedResolver` | CTE/subquery names, CTE/subquery bodies, column resolution | Column extraction | +| `QueryTypeExtractor` | Query type detection | Nothing else | + ## Development Workflow ### Setup @@ -55,107 +102,45 @@ poetry run pytest test/test_with_statements.py::test_name # Run specific test ### Linting ```bash -make lint # Run flake8 and pylint -poetry run flake8 sql_metadata/ -poetry run pylint sql_metadata/ +make lint # Run ruff check with auto-fix +poetry run ruff check --fix sql_metadata ``` ### Code Formatting ```bash -make format # Run black formatter +make format # Run ruff formatter +poetry run ruff format . +``` + +### Type Checking +```bash +poetry run mypy sql_metadata ``` ### Coverage ```bash make coverage # Run tests with coverage report +poetry run pytest -vv --cov=sql_metadata --cov-report=term-missing ``` **Important:** The project has a 100% test coverage requirement (`fail_under = 100` in pyproject.toml). 
+### Verification after changes +After making code changes, always run all three checks: +```bash +poetry run pytest -vv --cov=sql_metadata --cov-report=term-missing # tests + coverage +poetry run mypy sql_metadata # type checking +poetry run ruff check sql_metadata # linting +``` + ## Code Quality Standards -### Flake8 Configuration (.flake8) -- Max line length: Not explicitly set (defaults apply) +### Ruff Configuration (pyproject.toml) +- Max line length: 88 - Max complexity: 8 (C901 error for complexity > 8) +- Enabled rule sets: E, F, W (pycodestyle/pyflakes), C90 (mccabe), I (isort) - Exceptions: Use `# noqa: C901` for complex but necessary functions -### Complexity Suppression Pattern -When a function legitimately needs higher complexity, suppress the warning: -```python -@property -def complex_method(self) -> Type: # noqa: C901 - """Method with necessary complexity""" -``` - -Examples in codebase: -- `parser.py:134`: `tokens` property -- `parser.py:450`: `with_names` property -- `parser.py:822`: `_resolve_nested_query` method - -### Pylint -The Parser class has `# pylint: disable=R0902` to suppress "too many instance attributes" warnings. - -## Parser Architecture - -### Core Class: `Parser` -Located in `sql_metadata/parser.py` - -The Parser class uses sqlparse to tokenize SQL and then processes tokens to extract metadata. - -**Key Properties (lazy evaluation):** -- `tokens` - Tokenized SQL -- `tables` - Tables referenced in query -- `columns` - Columns referenced -- `with_names` - CTE (Common Table Expression) names -- `with_queries` - CTE definitions -- `query_type` - Type of SQL query -- `subqueries` - Subquery definitions - -**Important Pattern:** Most properties cache their results: -```python -@property -def example(self): - if self._example is not None: - return self._example - # ... computation ... 
- self._example = result - return self._example -``` - -### Token Processing - -The parser processes `SQLToken` objects which have properties like: -- `value` - The token text -- `normalized` - Uppercased token value -- `next_token` - Next token in sequence -- `previous_token` - Previous token -- `next_token_not_comment` - Next non-comment token -- `is_as_keyword` - Boolean flag -- `is_with_query_end` - Boolean flag for WITH clause boundaries -- `token_type` - Type classification - -### WITH Statement Parsing - -Located in `parser.py:450` (`with_names` property) - -**Key Logic:** -1. Iterates through tokens looking for "WITH" keywords -2. Enters a while loop that stays in WITH block until finding ending keywords -3. Processes each CTE by finding "AS" keywords and extracting names -4. Advances through tokens until finding `is_with_query_end` -5. Checks if at end of WITH block using `WITH_ENDING_KEYWORDS` - -**WITH_ENDING_KEYWORDS** (from `keywords_lists.py`): -- UPDATE -- SELECT -- DELETE -- REPLACE -- INSERT - -**Common Pitfall:** Malformed SQL with consecutive AS keywords (e.g., `WITH a AS (...) AS b`) can cause infinite loops if not properly detected and handled. - -**Solution Pattern:** After processing a WITH clause, always check if the next token is another AS keyword (which indicates malformed SQL) and raise `ValueError("This query is wrong")`. - ## Error Handling Patterns ### Malformed SQL Detection @@ -163,7 +148,7 @@ Located in `parser.py:450` (`with_names` property) The codebase has established patterns for handling malformed SQL: 1. **Detect the malformed pattern early** -2. **Raise `ValueError("This query is wrong")`** - This is the standard error message +2. **Raise `ValueError("This query is wrong")`** — This is the standard error message 3. 
**Use pytest.raises in tests:** ```python parser = Parser(malformed_query) @@ -171,39 +156,14 @@ with pytest.raises(ValueError, match="This query is wrong"): parser.tables ``` -Examples: -- `test_with_statements.py:500-528`: Tests for malformed WITH queries -- `parser.py:679`: Detection in `_handle_with_name_save` - -### Infinite Loop Prevention - -When processing tokens in loops: -1. Always ensure the token advances in each iteration -2. Check for malformed patterns before looping back -3. Have clear exit conditions - -Pattern: -```python -while condition and token.next_token: - if some_pattern: - # ... process ... - if exit_condition: - break - else: - # Always advance token to prevent infinite loop - token = token.next_token - else: - token = token.next_token -``` - ## Testing Patterns ### Test Organization Tests are organized by feature/SQL clause: -- `test_with_statements.py` - WITH clause (CTEs) -- `test_getting_tables.py` - Table extraction -- `test_getting_columns.py` - Column extraction -- `test_query_type.py` - Query type detection +- `test_with_statements.py` — WITH clause (CTEs) +- `test_getting_tables.py` — Table extraction +- `test_getting_columns.py` — Column extraction +- `test_query_type.py` — Query type detection - Database-specific: `test_mssql_server.py`, `test_postgress.py`, `test_hive.py`, etc. ### Test Naming Convention @@ -231,134 +191,47 @@ def test_malformed_case(): - Every bug fix needs a test that would have caught the bug - Coverage must remain at 100% +### Test Comments +Reference issues in test comments: +```python +def test_issue_fix(): + # Test for issue #556 - malformed WITH query causes infinite loop + # https://github.com/macbre/sql-metadata/issues/556 +``` + ## Git Workflow ### Commit Message Format Following the established pattern: ``` -Brief description of change +Brief description of change Resolves #issue-number. -More detailed explanation of what was wrong and why. 
- -The issue was: [explain the problem] - -This fix: -- Bullet point 1 -- Bullet point 2 -- Bullet point 3 - Co-Authored-By: Claude ``` ### Branch Naming - Feature: `feature/description` - Bug fix: `fix/description` -- Example: `fix/parser-tables-hangs` - -### Recent Commits (as of 2026-03-04) -``` -1fbfee4 Drop Python 3.9 support (#604) -d0e6fc6 Parser.columns drops column named 'source' when it is the last column in a SELECT statement (#603) -``` - -## Common Issues and Solutions - -### Issue: Parser Hangs/Infinite Loop - -**Symptoms:** Parser never returns when calling `.tables` or other properties - -**Common Causes:** -1. Token not advancing in a while loop -2. Malformed SQL not detected early enough -3. Missing exit condition in nested loops - -**Solution Checklist:** -- [ ] Ensure token advances in all loop branches -- [ ] Check for malformed SQL patterns and raise ValueError -- [ ] Verify exit conditions are reachable -- [ ] Add timeout test to verify fix - -### Issue: Flake8 Complexity Warning (C901) - -**When it happens:** Function exceeds complexity threshold of 8 - -**Solutions:** -1. Refactor to reduce complexity (preferred) -2. Use `# noqa: C901` if complexity is necessary (see examples in codebase) - -### Issue: Tests Pass Locally but Coverage Fails - -**Cause:** Missing test coverage for new code paths - -**Solution:** -```bash -poetry run pytest -vv --cov=sql_metadata --cov-report=term-missing -``` -This shows which lines are not covered. 
- -## Important Files - -### `sql_metadata/parser.py` -- **Lines 134-200:** Token processing and initialization -- **Lines 450-482:** WITH clause parsing (with_names property) -- **Lines 484-580:** WITH queries extraction -- **Lines 669-700:** `_handle_with_name_save` helper method -- **Lines 822+:** Nested query resolution - -### `sql_metadata/keywords_lists.py` -Defines SQL keyword sets: -- `WITH_ENDING_KEYWORDS` (line 40) -- `SUBQUERY_PRECEDING_KEYWORDS` -- `TABLE_ADJUSTMENT_KEYWORDS` -- `KEYWORDS_BEFORE_COLUMNS` -- `SUPPORTED_QUERY_TYPES` - -### `test/test_with_statements.py` -Comprehensive tests for WITH clause parsing: -- Valid multi-CTE queries -- CTEs with column definitions -- Nested WITH statements -- Malformed SQL detection (lines 500-540) - -## Debugging Tips - -### Running Single Test with Timeout -```bash -timeout 5 poetry run pytest test/test_file.py::test_name -vv -``` - -### Testing Infinite Loop Fix -```bash -timeout 3 poetry run python -c "from sql_metadata import Parser; Parser(query).tables" -``` -If it times out, there's still an infinite loop. 
- -### Inspecting Token Flow -Add debug prints in parser.py: -```python -print(f"Token: {token.value}, Next: {token.next_token.value if token.next_token else None}") -``` ## Dependencies ### Production -- **sqlparse** (>=0.4.1, <0.6.0): SQL tokenization +- **sqlglot** (^30.0.3): SQL parsing and AST construction +- **sqlparse** (>=0.4.1, <0.6.0): Legacy tokenization ### Development -- **pytest** (^8.4.2): Testing framework -- **pytest-cov** (^7.0.0): Coverage reporting -- **black** (^25.11): Code formatting -- **flake8** (^7.3.0): Linting -- **pylint** (^3.3.9): Advanced linting -- **coverage** (^7.10): Coverage measurement +- **pytest** (^9.0.2): Testing framework +- **pytest-cov** (^7.1.0): Coverage reporting +- **ruff** (^0.11): Linting and formatting +- **coverage** (^7.13): Coverage measurement ## Version Information -- **Current Version:** 2.19.0 -- **Python Support:** ^3.10 (Python 3.9 support dropped in #604) +- **Current Version:** 2.20.0 +- **Python Support:** ^3.10 - **License:** MIT - **Homepage:** https://github.com/macbre/sql-metadata @@ -375,32 +248,14 @@ def my_property(self): return self._my_property ``` -### 2. Token Advancement Safety -In loops, ensure every branch advances: -```python -while condition: - if pattern_match: - # ... process ... - if should_exit: - flag = False - else: - token = token.next_token # MUST advance - else: - token = token.next_token # MUST advance -``` - -### 3. Error Messages +### 2. Error Messages Use consistent error messages: -- `"This query is wrong"` - for malformed SQL +- `"This query is wrong"` — for malformed SQL +- `"Empty queries are not supported!"` — for empty input - Keep messages simple and consistent with existing patterns -### 4. Test Comments -Reference issues in test comments: -```python -def test_issue_fix(): - # Test for issue #556 - malformed WITH query causes infinite loop - # https://github.com/macbre/sql-metadata/issues/556 -``` +### 3. 
Prefer sqlglot over manual parsing +Always use sqlglot AST features (node types, `find_all`, `arg_types` traversal) rather than regex or manual string parsing when possible. ## Quick Reference Commands @@ -423,17 +278,20 @@ make coverage # Coverage report poetry run python -c "from sql_metadata import Parser; print(Parser('SELECT * FROM t').tables)" ``` -## Notes for Future Work +## Debugging Tips -### Potential Improvements -1. Consider refactoring `with_names` property to reduce complexity below 8 -2. Add more detailed error messages for different types of malformed SQL -3. Consider extracting token advancement logic into helper methods +### Inspecting the AST +```python +from sql_metadata import Parser +p = Parser("SELECT a FROM t") +print(p._ast_parser.ast) # sqlglot AST tree +print(repr(p._ast_parser.ast)) # Detailed node repr +``` -### Technical Debt -- Poetry dev-dependencies section is deprecated (migrate to poetry.group.dev.dependencies) -- Consider adding type hints more comprehensively -- Some test files could be consolidated +### Running Single Test with Timeout +```bash +timeout 5 poetry run pytest test/test_file.py::test_name -vv +``` ## Last Updated -2026-03-04 - Initial creation after fixing issue #556 (infinite loop in WITH statement parsing) +2026-03-31 — Rewritten for v3 architecture (sqlglot-based, class extractors) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 00000000..74df9c22 --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,569 @@ +# Architecture + +sql-metadata v3 is a Python library that parses SQL queries and extracts metadata (tables, columns, aliases, CTEs, subqueries, etc.). It delegates SQL parsing to [sqlglot](https://github.com/tobymao/sqlglot) for AST construction, then walks the resulting tree with specialised extractors. 
+ +## Module Map + +| Module | Role | Key Class/Function | +|--------|------|--------------------| +| [`parser.py`](sql_metadata/parser.py) | Public facade — composes all extractors via lazy properties | `Parser` | +| [`ast_parser.py`](sql_metadata/ast_parser.py) | Thin orchestrator — composes SqlCleaner + DialectParser, caches AST | `ASTParser` | +| [`sql_cleaner.py`](sql_metadata/sql_cleaner.py) | Raw SQL preprocessing (no sqlglot dependency) | `SqlCleaner`, `CleanResult` | +| [`dialect_parser.py`](sql_metadata/dialect_parser.py) | Dialect detection, sqlglot parsing, parse-quality validation | `DialectParser`, `HashVarDialect`, `BracketedTableDialect` | +| [`column_extractor.py`](sql_metadata/column_extractor.py) | Single-pass DFS column/alias extraction | `ColumnExtractor` | +| [`table_extractor.py`](sql_metadata/table_extractor.py) | Table extraction with position-based sorting | `TableExtractor` | +| [`nested_resolver.py`](sql_metadata/nested_resolver.py) | CTE/subquery name and body extraction, nested column resolution | `NestedResolver` | +| [`query_type_extractor.py`](sql_metadata/query_type_extractor.py) | Query type detection from AST root node | `QueryTypeExtractor` | +| [`comments.py`](sql_metadata/comments.py) | Comment extraction/stripping via tokenizer gaps | `extract_comments`, `strip_comments` | +| [`keywords_lists.py`](sql_metadata/keywords_lists.py) | Keyword sets, `QueryType` and `TokenType` enums | — | +| [`utils.py`](sql_metadata/utils.py) | `UniqueList` (deduplicating list), `flatten_list`, `_make_reverse_cte_map` | — | +| [`generalizator.py`](sql_metadata/generalizator.py) | Query anonymisation for log aggregation | `Generalizator` | + +--- + +## High-Level Pipeline + +```mermaid +flowchart TB + SQL["Raw SQL string"] + + subgraph AST_CONSTRUCTION["ASTParser (ast_parser.py)"] + direction TB + PP["SqlCleaner\n(sql_cleaner.py)"] + DP["DialectParser\n(dialect_parser.py)"] + PP --> DP + end + + SQL --> AST_CONSTRUCTION + AST_CONSTRUCTION --> 
AST["sqlglot AST"] + + subgraph EXTRACTION["Parallel Extractors"] + direction TB + TE["TableExtractor\n(table_extractor.py)"] + CE["ColumnExtractor\n(column_extractor.py)"] + QT["QueryTypeExtractor\n(query_type_extractor.py)"] + end + + AST --> EXTRACTION + + TE --> TA["tables, tables_aliases"] + CE --> COLS["columns, aliases"] + QT --> QTR["query_type"] + + TA --> NR + COLS --> NR + + subgraph RESOLVE["NestedResolver (nested_resolver.py)"] + direction TB + NR["Resolve subquery.column\nreferences"] + NE["Extract CTE/subquery\nnames and bodies"] + end + + RESOLVE --> FINAL["Final metadata\n(cached on Parser)"] + + COM["comments.py"] -.-> AST_CONSTRUCTION + COM -.-> FINAL +``` + +The `Parser` class ([`parser.py`](sql_metadata/parser.py)) is a thin facade that orchestrates these components through lazy cached properties. No extraction work happens until a property like `.columns` or `.tables` is first accessed. + +--- + +## Module Deep Dives + +### Parser — the facade + +**File:** [`parser.py`](sql_metadata/parser.py) | **Class:** `Parser` + +The constructor (`__init__`) stores the raw SQL and initialises ~20 cache fields to `None`. It creates an `ASTParser` instance (lazy — no parsing yet) and defers everything else. 
+ +**Composition:** + +```mermaid +flowchart LR + P["Parser"] + P --> AP["ASTParser\n(self._ast_parser)"] + P --> TE["TableExtractor\n(created per .tables call)"] + P --> CE["ColumnExtractor\n(via extract_all())"] + P --> NR["NestedResolver\n(self._resolver, lazy)"] + P --> QTE["QueryTypeExtractor\n(via extract_query_type())"] +``` + +**Public properties:** + +| Property | Returns | Triggers | +|----------|---------|----------| +| `query` | Preprocessed SQL (normalised quoting) | — | +| `query_type` | `QueryType` enum | `QueryTypeExtractor(ast, raw_query).extract()` | +| `tokens` | `List[str]` of token strings | sqlglot tokenizer | +| `columns` | Column names | AST parse → TableExtractor → `ColumnExtractor.extract()` → NestedResolver | +| `columns_dict` | Columns by clause section | `.columns` | +| `columns_aliases` | `{alias: target_column}` | `.columns` | +| `columns_aliases_names` | List of alias names | `.columns` | +| `columns_aliases_dict` | Aliases by clause section | `.columns` | +| `tables` | Table names | AST parse → TableExtractor | +| `tables_aliases` | `{alias: real_table}` | AST parse → TableExtractor | +| `with_names` | CTE names | AST parse → NestedResolver | +| `with_queries` | `{cte_name: body_sql}` | NestedResolver | +| `subqueries` | `{subquery_name: body_sql}` | NestedResolver | +| `subqueries_names` | Subquery aliases (innermost first) | AST parse → NestedResolver | +| `limit_and_offset` | `(limit, offset)` tuple | AST parse (regex fallback) | +| `values` | Literal values from INSERT | AST parse | +| `values_dict` | `{column: value}` pairs | `.values` + `.columns` | +| `comments` | Comment strings | sqlglot tokenizer | +| `without_comments` | SQL sans comments | sqlglot tokenizer | +| `generalize` | Anonymised SQL | Generalizator | + +**Caching pattern** — every property checks its cache field first: + +```python +@property +def tables(self) -> List[str]: + if self._tables is not None: + return self._tables + # ... compute and cache ... 
+ self._tables = result + return self._tables +``` + +**Regex fallbacks** — when `sqlglot.parse()` fails (raises `ValueError`), the parser falls back to regex extraction for columns (`_extract_columns_regex`) and LIMIT/OFFSET (`_extract_limit_regex`) rather than raising an error. + +--- + +### ASTParser — Orchestrator + +**File:** [`ast_parser.py`](sql_metadata/ast_parser.py) | **Class:** `ASTParser` + +Thin orchestrator that composes `SqlCleaner` and `DialectParser`. Instantiated once per `Parser` — actual parsing is deferred until `.ast` is first accessed. Exposes `.ast`, `.dialect`, `.is_replace`, and `.cte_name_map` properties. + +--- + +### SqlCleaner — Raw SQL Preprocessing + +**File:** [`sql_cleaner.py`](sql_metadata/sql_cleaner.py) | **Class:** `SqlCleaner` + +Pure string transformations with no sqlglot dependency. `SqlCleaner.clean(sql)` returns a `CleanResult` namedtuple with the cleaned SQL, `is_replace` flag, and CTE name map. + +#### Preprocessing pipeline + +```mermaid +flowchart LR + A["1. REPLACE INTO\n→ INSERT INTO"] --> B["2. SELECT...INTO\nvars stripped"] + B --> C["3. Strip\ncomments"] + C --> D["4. Normalise\nqualified CTE names"] + D --> E["5. Strip DB2\nisolation clauses"] + E --> F["6. 
Strip outer\nparentheses"] +``` + +| Step | Why | Example | +|------|-----|---------| +| REPLACE INTO rewrite | sqlglot parses `REPLACE INTO` as opaque `Command` | `REPLACE INTO t` → `INSERT INTO t` (flag set) | +| SELECT...INTO strip | Prevents sqlglot from treating variables as tables | `SELECT x INTO @v FROM t` → `SELECT x FROM t` | +| Comment stripping | Uses `strip_comments_for_parsing()` from `comments.py` | `SELECT /* hi */ 1` → `SELECT 1` | +| CTE name normalisation | sqlglot can't parse `WITH db.name AS (...)` | `db.cte` → `db__DOT__cte` (reverse map stored) | +| DB2 isolation clauses | Removes trailing `WITH UR/CS/RS/RR` | `SELECT 1 WITH UR` → `SELECT 1` | +| Outer paren stripping | sqlglot can't parse `((UPDATE ...))` | `((UPDATE t SET x=1))` → `UPDATE t SET x=1` | + +--- + +### DialectParser — Dialect Detection and Parsing + +**File:** [`dialect_parser.py`](sql_metadata/dialect_parser.py) | **Class:** `DialectParser` + +Combines dialect heuristics, `sqlglot.parse()` calls, and parse-quality validation. `DialectParser().parse(clean_sql)` returns `(ast, dialect)`. 
+ +**Custom dialects (defined in same file):** + +- `HashVarDialect` — treats `#` as part of identifiers for MSSQL temp tables (`#temp`) and template variables (`#VAR#`) +- `BracketedTableDialect` — TSQL subclass for `[bracket]` quoting; also signals `TableExtractor` to preserve brackets in output + +#### Dialect detection + +`_detect_dialects(sql)` inspects the SQL for syntax hints and returns an ordered list of dialects to try: + +```mermaid +flowchart TD + SQL["Cleaned SQL"] + SQL --> H{"#WORD\nvariables?"} + H -->|Yes| HD["[HashVarDialect, None, mysql]"] + H -->|No| BT{"Backticks?"} + BT -->|Yes| MY["[mysql, None]"] + BT -->|No| BR{"Brackets\nor TOP?"} + BR -->|Yes| BD["[BracketedTableDialect, None, mysql]"] + BR -->|No| UN{"UNIQUE?"} + UN -->|Yes| UO["[None, mysql, oracle]"] + UN -->|No| LV{"LATERAL VIEW?"} + LV -->|Yes| SP["[spark, None, mysql]"] + LV -->|No| DF["[None, mysql]"] +``` + +#### Multi-dialect retry + +`_try_dialects` iterates through the dialect list. For each dialect: + +1. Parse with `sqlglot.parse()` (warnings suppressed) +2. Check for degradation via `_is_degraded` — phantom tables (`IGNORE`, `""`), keyword-as-column names (`UNIQUE`, `DISTINCT`) +3. If degraded and not the last dialect, try the next one +4. If all fail, raise `ValueError("This query is wrong")` + +--- + +### ColumnExtractor — columns and aliases + +**File:** [`column_extractor.py`](sql_metadata/column_extractor.py) | **Class:** `ColumnExtractor` + +Performs a single-pass depth-first walk of the AST in `arg_types` key order (which mirrors left-to-right SQL text order). Collects columns and column aliases into a `_Collector` accumulator. Returns an `ExtractionResult` frozen dataclass — consumed directly by `Parser.columns` and friends. 
+ +`Parser` calls `ColumnExtractor` directly (no wrapper functions): + +```python +extractor = ColumnExtractor(ast, table_aliases, cte_name_map) +result = extractor.extract() # returns ExtractionResult +result.columns # UniqueList of column names +result.columns_dict # columns by clause section +result.alias_map # {alias: target_column} +``` + +#### Data flow + +```mermaid +flowchart TB + AST["sqlglot AST"] --> EXT["ColumnExtractor.extract()"] + TA["table_aliases\n(from TableExtractor)"] --> EXT + EXT --> WALK["_walk() — DFS in\narg_types key order"] + WALK --> COLL["_Collector\n(mutable accumulator)"] + COLL --> RES["ExtractionResult\n(frozen dataclass)"] +``` + +#### DFS dispatch + +The walk visits each node and dispatches to specialised handlers: + +| AST Node Type | Handler | What it does | +|---------------|---------|-------------| +| `exp.Star` | `_handle_star` | Adds `*` (skips if inside function like `COUNT(*)`) | +| `exp.ColumnDef` | (inline) | Adds column name for CREATE TABLE DDL | +| `exp.Identifier` | `_handle_identifier` | Adds column if in JOIN USING context | +| `exp.CTE` | `_handle_cte` | Records CTE name, processes column definitions | +| `exp.Column` | `_handle_column` | Main handler — resolves table alias, builds full name | +| `exp.Subquery` (aliased) | (inline) | Records subquery name and depth for ordering | + +**Special processing** in `_process_child_key`: +- SELECT expressions → `_handle_select_exprs` → iterates expressions, detects aliases +- INSERT schema → `_handle_insert_schema` → extracts column list from `INSERT INTO t(col1, col2)` +- JOIN USING → `_handle_join_using` → extracts column identifiers + +#### Clause classification + +`_classify_clause` maps each `arg_types` key to a `columns_dict` section: + +| Key | Section | +|-----|---------| +| `expressions` (under `Select`) | `"select"` | +| `expressions` (under `Update`) | `"update"` | +| `where` | `"where"` | +| `group` | `"group_by"` | +| `order` | `"order_by"` | +| `having` | 
`"having"` | +| `on`, `using` | `"join"` | + +#### Alias handling + +`_handle_alias` processes `SELECT expr AS alias`: + +1. If the aliased expression contains a subquery → walk it recursively, extract its SELECT columns as the alias target +2. If the expression has columns → add them, then register the alias mapping (unless it's a self-alias like `SELECT col AS col`) +3. If no columns (e.g., `SELECT 1 AS num`) → register the alias with no target + +#### Date-part function filtering + +`_is_date_part_unit` prevents extracting unit keywords as columns in functions like `DATEADD(day, 1, col)` — `day` is a keyword, not a column reference. + +--- + +### TableExtractor — tables and table aliases + +**File:** [`table_extractor.py`](sql_metadata/table_extractor.py) | **Class:** `TableExtractor` + +Walks the AST for `exp.Table` and `exp.Lateral` nodes, builds fully-qualified table names, and sorts results by first occurrence in the raw SQL. + +#### Extraction flow + +```mermaid +flowchart TB + AST["sqlglot AST"] --> CHECK{"exp.Command?"} + CHECK -->|Yes| REGEX["Regex fallback\n(_extract_tables_from_command)"] + CHECK -->|No| CREATE{"exp.Create?"} + CREATE -->|Yes| TARGET["Extract CREATE target"] + CREATE -->|No| SKIP["skip"] + TARGET --> COLLECT + SKIP --> COLLECT["_collect_all()\nWalk exp.Table + exp.Lateral"] + COLLECT --> FILTER["Filter out CTE names"] + FILTER --> SORT["Sort by _first_position()\n(regex in raw SQL)"] + SORT --> ORDER["_place_tables_in_order()\nCREATE target goes first"] +``` + +**Key algorithms:** + +- **Name construction** — `_table_full_name` assembles `catalog.db.name`, with special handling for bracket mode (TSQL) and double-dot notation (`catalog..name`) +- **Position sorting** — `_first_position` finds each table name in the raw SQL via regex, preferring matches after table-introducing keywords (`FROM`, `JOIN`, `TABLE`, `INTO`, `UPDATE`). This ensures output order matches left-to-right reading order. 
+- **CTE filtering** — table names matching known CTE names are excluded, so only real tables appear in the output + +**Alias extraction** — `extract_aliases` walks `exp.Table` nodes looking for aliases: + +```sql +SELECT * FROM users u JOIN orders o ON u.id = o.user_id +-- ^ ^ +-- alias="u" alias="o" +-- Result: {"u": "users", "o": "orders"} +``` + +--- + +### NestedResolver — CTE/subquery names, bodies, and resolution + +**File:** [`nested_resolver.py`](sql_metadata/nested_resolver.py) | **Class:** `NestedResolver` + +Handles the complete "look inside nested queries" concern. Created lazily by `Parser._get_resolver()`. + +#### Four responsibilities + +**1. Name extraction** — extract CTE and subquery names from the AST: + +- `extract_cte_names(ast, cte_name_map)` — static method, walks `exp.CTE` nodes and collects their aliases (with reverse CTE name map applied) +- `extract_subquery_names(ast)` — static method, post-order walk collecting aliased `exp.Subquery` names + +Called directly by `Parser.with_names` and `Parser.subqueries_names`. + +**2. Body extraction** — render CTE/subquery AST nodes back to SQL: + +- `extract_cte_bodies` — finds `exp.CTE` nodes in the AST, renders their body via `_PreservingGenerator` +- `extract_subquery_bodies` — post-order walk so inner subqueries appear before outer ones +- `_PreservingGenerator` — custom sqlglot `Generator` that preserves function signatures sqlglot would normalise (e.g., keeps `IFNULL` instead of converting to `COALESCE`, keeps `DIV` instead of `CAST(... / ... AS INT)`) + +**3. 
Column resolution** — `resolve()` runs two phases: + +```mermaid +flowchart TB + INPUT["columns from ColumnExtractor"] + INPUT --> P1["Phase 1: _resolve_sub_queries()\nReplace subquery.column refs\nwith actual columns"] + P1 --> P2["Phase 2: _resolve_bare_through_nested()\nDrop bare names that are\naliases in nested queries"] + P2 --> OUTPUT["Resolved columns"] +``` + +Phase 1 example: +```sql +SELECT sq.name FROM (SELECT name FROM users) sq +-- "sq.name" → resolved through subquery → "name" +``` + +Phase 2 example: +```sql +WITH cte AS (SELECT id, name AS label FROM users) +SELECT label FROM cte +-- "label" is an alias inside the CTE → dropped from columns, added to aliases +``` + +**4. Recursive sub-Parser instantiation** — when resolving `subquery.column`, the resolver creates a new `Parser(body_sql)` for each nested query body (cached in `_subqueries_parsers` / `_with_parsers`). This means the full pipeline runs recursively for each CTE/subquery. + +#### Alias resolution with cycle detection + +`_resolve_column_alias` follows alias chains with a `visited` set to prevent infinite loops: + +```python +# a → b → c (resolves to "c") +# a → b → a (cycle detected, stops at "a") +``` + +--- + +### QueryTypeExtractor + +**File:** [`query_type_extractor.py`](sql_metadata/query_type_extractor.py) | **Class:** `QueryTypeExtractor` + +Maps the AST root node type to a `QueryType` enum value via `_SIMPLE_TYPE_MAP`: + +| AST Node | QueryType | +|----------|-----------| +| `exp.Select`, `exp.Union`, `exp.Intersect`, `exp.Except` | `SELECT` | +| `exp.Insert` | `INSERT` | +| `exp.Update` | `UPDATE` | +| `exp.Delete` | `DELETE` | +| `exp.Create` | `CREATE` | +| `exp.Alter` | `ALTER` | +| `exp.Drop` | `DROP` | +| `exp.TruncateTable` | `TRUNCATE` | +| `exp.Merge` | `MERGE` | + +Special handling: +- Parenthesised queries → `_unwrap_parens` strips `Paren`/`Subquery` wrappers +- `exp.Command` → `_resolve_command_type` checks for `CREATE FUNCTION` / `ALTER` +- `REPLACE INTO` → detected 
via `ASTParser.is_replace` flag, patched in `Parser.query_type` + +--- + +### Comments + +**File:** [`comments.py`](sql_metadata/comments.py) + +A collection of pure stateless functions (no class). Exploits the fact that sqlglot's tokenizer skips comments — comments live in the *gaps* between consecutive token positions. + +**Algorithm:** + +1. Tokenize the SQL with the appropriate tokenizer +2. For each gap between token `[i].end` and token `[i+1].start`, scan for comment delimiters (`--`, `/* */`, `#`) +3. Collect or strip the matches + +**Tokenizer selection** — `_choose_tokenizer`: +- If SQL contains `#` used as a comment (not a variable) → MySQL tokenizer (treats `#` as comment delimiter) +- Otherwise → default sqlglot tokenizer +- `_has_hash_variables` distinguishes `#temp` (MSSQL) and `#VAR#` (template) from `# comment` (MySQL) + +**Two stripping variants:** +- `strip_comments` — public API, preserves `#VAR` references +- `strip_comments_for_parsing` — internal, always strips `#` comments (needed before `sqlglot.parse()`) + +--- + +### Supporting Modules + +**[`keywords_lists.py`](sql_metadata/keywords_lists.py)** — keyword sets used for token classification and query type mapping: +- `KEYWORDS_BEFORE_COLUMNS` — keywords after which columns appear (`SELECT`, `WHERE`, `ON`, etc.) +- `TABLE_ADJUSTMENT_KEYWORDS` — keywords after which tables appear (`FROM`, `JOIN`, `INTO`, etc.) +- `COLUMNS_SECTIONS` — maps keywords to `columns_dict` section names +- `QueryType` — string enum (`str, Enum`) for direct comparison (`parser.query_type == "SELECT"`) + +**[`utils.py`](sql_metadata/utils.py):** +- `UniqueList` — deduplicating list with O(1) membership checks via internal `set`. Used everywhere to collect columns, tables, aliases. +- `flatten_list` — recursively flattens nested lists from multi-column alias resolution. +- `_make_reverse_cte_map` — builds reverse mapping from placeholder CTE names to originals, shared by `ColumnExtractor` and `NestedResolver`. 
+ +**[`generalizator.py`](sql_metadata/generalizator.py)** — anonymises SQL for log aggregation: strips comments, replaces literals with `X`, numbers with `N`, collapses `IN(...)` lists to `(XYZ)`. + +--- + +## Traced Walkthrough + +Let's trace `Parser("SELECT a AS x FROM t").columns_aliases` step by step. + +```mermaid +sequenceDiagram + participant User + participant Parser + participant ASTParser + participant sqlglot + participant TableExtractor + participant ColumnExtractor + participant NestedResolver + + User->>Parser: .columns_aliases + Parser->>Parser: .columns (not cached yet) + + Note over Parser: Need AST and table_aliases + + Parser->>ASTParser: .ast (first access) + ASTParser->>ASTParser: SqlCleaner.clean() + Note over ASTParser: No REPLACE, no comments,
no qualified CTEs + ASTParser->>ASTParser: DialectParser().parse() + Note over ASTParser: No special syntax →
[None, "mysql"] + ASTParser->>sqlglot: sqlglot.parse(sql, dialect=None) + sqlglot-->>ASTParser: exp.Select AST + + Parser->>Parser: .tables_aliases + Parser->>TableExtractor: extract_aliases(tables) + Note over TableExtractor: No aliases on "t" + TableExtractor-->>Parser: {} + + Parser->>ColumnExtractor: ColumnExtractor(ast, {}, {}).extract() + Note over ColumnExtractor: _walk() DFS begins + + Note over ColumnExtractor: Visit Select node →
_walk_children() + Note over ColumnExtractor: key="expressions" + Select →
_handle_select_exprs() + Note over ColumnExtractor: expr[0] is Alias "x" →
_handle_alias() + Note over ColumnExtractor: inner is Column "a" →
_flat_columns() → ["a"]
add_column("a", "select")
add_alias("x", "a", "select") + Note over ColumnExtractor: key="from" →
skip (Table, not Column) + + ColumnExtractor-->>Parser: ExtractionResult (frozen dataclass) + + Note over Parser: result.columns=["a"]
result.alias_map={"x": "a"} + + Parser->>NestedResolver: resolve(columns, ...) + Note over NestedResolver: No subqueries or CTEs
→ columns unchanged + + NestedResolver-->>Parser: (["a"], {...}, {"x": "a"}) + + Parser-->>User: {"x": "a"} +``` + +**What happened:** + +1. **`Parser.__init__`** — stored raw SQL, created `ASTParser` (lazy) +2. **`.columns_aliases`** accessed → triggers `.columns` (not cached) +3. **`.columns`** needs the AST → accesses `self._ast_parser.ast` +4. **`ASTParser.ast`** (first access) → `SqlCleaner.clean()` → `DialectParser().parse()` → `sqlglot.parse()` +5. **`.tables_aliases`** needed for column extraction → `TableExtractor.extract_aliases()` → `{}` (no aliases on `t`) +6. **`ColumnExtractor(ast, {}, {}).extract()`** → DFS walk: + - Visits `Select` node, key `"expressions"` → `_handle_select_exprs()` + - Finds `Alias(Column("a"), "x")` → `_handle_alias()` → records column `"a"` in select section, alias `"x"` → `"a"` + - Key `"from"` → finds `Table("t")`, not a column node, skipped +7. **`NestedResolver.resolve()`** — no subqueries or CTEs, columns pass through unchanged +8. **Result cached** — `_columns = ["a"]`, `_columns_aliases = {"x": "a"}` + +--- + +## Dependency Graph + +```mermaid +flowchart TB + INIT["__init__.py"] + INIT --> P["parser.py"] + + P --> AST["ast_parser.py"] + P --> EXT["column_extractor.py"] + P --> TAB["table_extractor.py"] + P --> RES["nested_resolver.py"] + P --> QT["query_type_extractor.py"] + P --> COM["comments.py"] + P --> GEN["generalizator.py"] + P --> KW["keywords_lists.py"] + P --> UT["utils.py"] + + AST --> SC["sql_cleaner.py"] + AST --> DP["dialect_parser.py"] + + SC --> COM + DP --> COM + DP -.->|"sqlglot.parse()"| SG["sqlglot"] + TAB --> DP + + EXT -.-> SG + EXT --> UT + TAB -.-> SG + RES -.-> SG + RES --> UT + RES -->|"sub-Parser\n(recursive)"| P + QT -.-> SG + QT --> KW + COM -.->|"Tokenizer"| SG + GEN --> COM + + style SG fill:#f0f0f0,stroke:#999 +``` + +Note the circular dependency: `nested_resolver.py` imports `Parser` from `parser.py` to create sub-Parser instances for nested queries. 
This import is deferred (inside method bodies) to avoid import-time cycles. + +--- + +## Key Design Patterns + +**Lazy evaluation with caching** — every `Parser` property computes on first access and caches the result. This means you pay zero cost for properties you never access. + +**Composition over inheritance** — `Parser` doesn't subclass anything meaningful. It composes `ASTParser` (which itself composes `SqlCleaner` and `DialectParser`), `TableExtractor`, `ColumnExtractor`, `NestedResolver`, and `QueryTypeExtractor` as separate concerns. + +**Single-pass DFS extraction** — `ColumnExtractor` walks the AST exactly once in `arg_types` key order. Because sqlglot's `arg_types` keys are ordered to mirror left-to-right SQL text, the walk naturally processes clauses in source order. + +**Multi-dialect retry with degradation detection** — rather than guessing one dialect, `DialectParser` tries several in order and picks the first that doesn't produce a degraded result (phantom tables, keyword-as-column names). + +**Graceful regex fallbacks** — when the AST parse fails entirely, the parser degrades to regex-based extraction for columns (INSERT INTO pattern) and LIMIT/OFFSET rather than raising an error. + +**Recursive sub-parsing** — `NestedResolver` creates fresh `Parser` instances for CTE/subquery bodies. This reuses the entire pipeline recursively, with caching to avoid re-parsing the same body twice. diff --git a/CLAUDE.md b/CLAUDE.md index 43c994c2..2ace935b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1 +1,5 @@ @AGENTS.md + +## Rules + +- **Never change test files to match incorrect code output.** Tests define the expected behavior. If a test fails, fix the source code, not the test. The only exception is when a feature is explicitly removed (like `get_query_tokens` in the v3 migration). 
diff --git a/Makefile b/Makefile index 226c0adf..b686ed9c 100644 --- a/Makefile +++ b/Makefile @@ -8,11 +8,13 @@ coverage: poetry run pytest -vv --cov=sql_metadata --cov-report=term --cov-report=html lint: - poetry run flake8 sql_metadata - poetry run pylint sql_metadata + poetry run ruff check --fix sql_metadata format: - poetry run black . + poetry run ruff format . + +type_check: + poetry run mypy sql_metadata publish: # run git tag -a v0.0.0 before running make publish diff --git a/README.md b/README.md index 4cd34512..95c67976 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![PyPI](https://img.shields.io/pypi/v/sql_metadata.svg)](https://pypi.python.org/pypi/sql_metadata) [![Tests](https://github.com/macbre/sql-metadata/actions/workflows/python-ci.yml/badge.svg)](https://github.com/macbre/sql-metadata/actions/workflows/python-ci.yml) [![Coverage Status](https://coveralls.io/repos/github/macbre/sql-metadata/badge.svg?branch=master&1)](https://coveralls.io/github/macbre/sql-metadata?branch=master) -Code style: black +[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) [![Maintenance](https://img.shields.io/badge/maintained%3F-yes-green.svg)](https://github.com/macbre/sql-metadata/graphs/commit-activity) [![Downloads](https://pepy.tech/badge/sql-metadata/month)](https://pepy.tech/project/sql-metadata) diff --git a/poetry.lock b/poetry.lock index 6631f283..e807a295 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,87 +1,4 @@ -# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. - -[[package]] -name = "astroid" -version = "4.0.4" -description = "An abstract syntax tree for Python with inference support." 
-optional = false -python-versions = ">=3.10.0" -groups = ["dev"] -files = [ - {file = "astroid-4.0.4-py3-none-any.whl", hash = "sha256:52f39653876c7dec3e3afd4c2696920e05c83832b9737afc21928f2d2eb7a753"}, - {file = "astroid-4.0.4.tar.gz", hash = "sha256:986fed8bcf79fb82c78b18a53352a0b287a73817d6dbcfba3162da36667c49a0"}, -] - -[package.dependencies] -typing-extensions = {version = ">=4", markers = "python_version < \"3.11\""} - -[[package]] -name = "black" -version = "26.3.1" -description = "The uncompromising code formatter." -optional = false -python-versions = ">=3.10" -groups = ["dev"] -files = [ - {file = "black-26.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:86a8b5035fce64f5dcd1b794cf8ec4d31fe458cf6ce3986a30deb434df82a1d2"}, - {file = "black-26.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5602bdb96d52d2d0672f24f6ffe5218795736dd34807fd0fd55ccd6bf206168b"}, - {file = "black-26.3.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6c54a4a82e291a1fee5137371ab488866b7c86a3305af4026bdd4dc78642e1ac"}, - {file = "black-26.3.1-cp310-cp310-win_amd64.whl", hash = "sha256:6e131579c243c98f35bce64a7e08e87fb2d610544754675d4a0e73a070a5aa3a"}, - {file = "black-26.3.1-cp310-cp310-win_arm64.whl", hash = "sha256:5ed0ca58586c8d9a487352a96b15272b7fa55d139fc8496b519e78023a8dab0a"}, - {file = "black-26.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:28ef38aee69e4b12fda8dba75e21f9b4f979b490c8ac0baa7cb505369ac9e1ff"}, - {file = "black-26.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bf9bf162ed91a26f1adba8efda0b573bc6924ec1408a52cc6f82cb73ec2b142c"}, - {file = "black-26.3.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:474c27574d6d7037c1bc875a81d9be0a9a4f9ee95e62800dab3cfaadbf75acd5"}, - {file = "black-26.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:5e9d0d86df21f2e1677cc4bd090cd0e446278bcbbe49bf3659c308c3e402843e"}, - {file = 
"black-26.3.1-cp311-cp311-win_arm64.whl", hash = "sha256:9a5e9f45e5d5e1c5b5c29b3bd4265dcc90e8b92cf4534520896ed77f791f4da5"}, - {file = "black-26.3.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b5e6f89631eb88a7302d416594a32faeee9fb8fb848290da9d0a5f2903519fc1"}, - {file = "black-26.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:41cd2012d35b47d589cb8a16faf8a32ef7a336f56356babd9fcf70939ad1897f"}, - {file = "black-26.3.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f76ff19ec5297dd8e66eb64deda23631e642c9393ab592826fd4bdc97a4bce7"}, - {file = "black-26.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:ddb113db38838eb9f043623ba274cfaf7d51d5b0c22ecb30afe58b1bb8322983"}, - {file = "black-26.3.1-cp312-cp312-win_arm64.whl", hash = "sha256:dfdd51fc3e64ea4f35873d1b3fb25326773d55d2329ff8449139ebaad7357efb"}, - {file = "black-26.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:855822d90f884905362f602880ed8b5df1b7e3ee7d0db2502d4388a954cc8c54"}, - {file = "black-26.3.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8a33d657f3276328ce00e4d37fe70361e1ec7614da5d7b6e78de5426cb56332f"}, - {file = "black-26.3.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f1cd08e99d2f9317292a311dfe578fd2a24b15dbce97792f9c4d752275c1fa56"}, - {file = "black-26.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:c7e72339f841b5a237ff14f7d3880ddd0fc7f98a1199e8c4327f9a4f478c1839"}, - {file = "black-26.3.1-cp313-cp313-win_arm64.whl", hash = "sha256:afc622538b430aa4c8c853f7f63bc582b3b8030fd8c80b70fb5fa5b834e575c2"}, - {file = "black-26.3.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:2d6bfaf7fd0993b420bed691f20f9492d53ce9a2bcccea4b797d34e947318a78"}, - {file = "black-26.3.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:f89f2ab047c76a9c03f78d0d66ca519e389519902fa27e7a91117ef7611c0568"}, - {file = 
"black-26.3.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b07fc0dab849d24a80a29cfab8d8a19187d1c4685d8a5e6385a5ce323c1f015f"}, - {file = "black-26.3.1-cp314-cp314-win_amd64.whl", hash = "sha256:0126ae5b7c09957da2bdbd91a9ba1207453feada9e9fe51992848658c6c8e01c"}, - {file = "black-26.3.1-cp314-cp314-win_arm64.whl", hash = "sha256:92c0ec1f2cc149551a2b7b47efc32c866406b6891b0ee4625e95967c8f4acfb1"}, - {file = "black-26.3.1-py3-none-any.whl", hash = "sha256:2bd5aa94fc267d38bb21a70d7410a89f1a1d318841855f698746f8e7f51acd1b"}, - {file = "black-26.3.1.tar.gz", hash = "sha256:2c50f5063a9641c7eed7795014ba37b0f5fa227f3d408b968936e24bc0566b07"}, -] - -[package.dependencies] -click = ">=8.0.0" -mypy-extensions = ">=0.4.3" -packaging = ">=22.0" -pathspec = ">=1.0.0" -platformdirs = ">=2" -pytokens = ">=0.4.0,<0.5.0" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""} - -[package.extras] -colorama = ["colorama (>=0.4.3)"] -d = ["aiohttp (>=3.10)"] -jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] -uvloop = ["uvloop (>=0.15.2) ; sys_platform != \"win32\"", "winloop (>=0.5.0) ; sys_platform == \"win32\""] - -[[package]] -name = "click" -version = "8.1.8" -description = "Composable command line interface toolkit" -optional = false -python-versions = ">=3.7" -groups = ["dev"] -files = [ - {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, - {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "platform_system == \"Windows\""} +# This file is automatically @generated by Poetry 2.3.2 and should not be changed by hand. [[package]] name = "colorama" @@ -90,7 +7,7 @@ description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" groups = ["dev"] -markers = "platform_system == \"Windows\" or sys_platform == \"win32\"" +markers = "sys_platform == \"win32\"" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, @@ -218,22 +135,6 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 [package.extras] toml = ["tomli ; python_full_version <= \"3.11.0a6\""] -[[package]] -name = "dill" -version = "0.4.0" -description = "serialize all of Python" -optional = false -python-versions = ">=3.8" -groups = ["dev"] -files = [ - {file = "dill-0.4.0-py3-none-any.whl", hash = "sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049"}, - {file = "dill-0.4.0.tar.gz", hash = "sha256:0633f1d2df477324f53a895b02c901fb961bdbf65a17122586ea7019292cbcf0"}, -] - -[package.extras] -graph = ["objgraph (>=1.7.2)"] -profile = ["gprof2dot (>=2022.7.29)"] - [[package]] name = "exceptiongroup" version = "1.2.2" @@ -250,23 +151,6 @@ files = [ [package.extras] test = ["pytest (>=6)"] -[[package]] -name = "flake8" -version = "7.3.0" -description = "the modular source code checker: pep8 pyflakes and co" -optional = false -python-versions = ">=3.9" -groups = ["dev"] -files = [ - {file = "flake8-7.3.0-py2.py3-none-any.whl", hash = "sha256:b9696257b9ce8beb888cdbe31cf885c90d31928fe202be0889a7cdafad32f01e"}, - {file = "flake8-7.3.0.tar.gz", hash = "sha256:fe044858146b9fc69b551a4b490d69cf960fcb78ad1edcb84e7fbb1b4a8e3872"}, -] - -[package.dependencies] -mccabe = ">=0.7.0,<0.8.0" -pycodestyle = ">=2.14.0,<2.15.0" -pyflakes = ">=3.4.0,<3.5.0" - [[package]] name = "iniconfig" version = "2.1.0" @@ -280,33 +164,168 @@ files = [ ] [[package]] -name = "isort" -version = "6.0.1" -description = "A Python 
utility / library to sort Python imports." +name = "librt" +version = "0.8.1" +description = "Mypyc runtime library" optional = false -python-versions = ">=3.9.0" +python-versions = ">=3.9" groups = ["dev"] +markers = "platform_python_implementation != \"PyPy\"" files = [ - {file = "isort-6.0.1-py3-none-any.whl", hash = "sha256:2dc5d7f65c9678d94c88dfc29161a320eec67328bc97aad576874cb4be1e9615"}, - {file = "isort-6.0.1.tar.gz", hash = "sha256:1cb5df28dfbc742e490c5e41bad6da41b805b0a8be7bc93cd0fb2a8a890ac450"}, + {file = "librt-0.8.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:81fd938344fecb9373ba1b155968c8a329491d2ce38e7ddb76f30ffb938f12dc"}, + {file = "librt-0.8.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5db05697c82b3a2ec53f6e72b2ed373132b0c2e05135f0696784e97d7f5d48e7"}, + {file = "librt-0.8.1-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d56bc4011975f7460bea7b33e1ff425d2f1adf419935ff6707273c77f8a4ada6"}, + {file = "librt-0.8.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5cdc0f588ff4b663ea96c26d2a230c525c6fc62b28314edaaaca8ed5af931ad0"}, + {file = "librt-0.8.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:97c2b54ff6717a7a563b72627990bec60d8029df17df423f0ed37d56a17a176b"}, + {file = "librt-0.8.1-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8f1125e6bbf2f1657d9a2f3ccc4a2c9b0c8b176965bb565dd4d86be67eddb4b6"}, + {file = "librt-0.8.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8f4bb453f408137d7581be309b2fbc6868a80e7ef60c88e689078ee3a296ae71"}, + {file = "librt-0.8.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:c336d61d2fe74a3195edc1646d53ff1cddd3a9600b09fa6ab75e5514ba4862a7"}, + {file = "librt-0.8.1-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:eb5656019db7c4deacf0c1a55a898c5bb8f989be904597fcb5232a2f4828fa05"}, + {file = 
"librt-0.8.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c25d9e338d5bed46c1632f851babf3d13c78f49a225462017cf5e11e845c5891"}, + {file = "librt-0.8.1-cp310-cp310-win32.whl", hash = "sha256:aaab0e307e344cb28d800957ef3ec16605146ef0e59e059a60a176d19543d1b7"}, + {file = "librt-0.8.1-cp310-cp310-win_amd64.whl", hash = "sha256:56e04c14b696300d47b3bc5f1d10a00e86ae978886d0cee14e5714fafb5df5d2"}, + {file = "librt-0.8.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:681dc2451d6d846794a828c16c22dc452d924e9f700a485b7ecb887a30aad1fd"}, + {file = "librt-0.8.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3b4350b13cc0e6f5bec8fa7caf29a8fb8cdc051a3bae45cfbfd7ce64f009965"}, + {file = "librt-0.8.1-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ac1e7817fd0ed3d14fd7c5df91daed84c48e4c2a11ee99c0547f9f62fdae13da"}, + {file = "librt-0.8.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:747328be0c5b7075cde86a0e09d7a9196029800ba75a1689332348e998fb85c0"}, + {file = "librt-0.8.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f0af2bd2bc204fa27f3d6711d0f360e6b8c684a035206257a81673ab924aa11e"}, + {file = "librt-0.8.1-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d480de377f5b687b6b1bc0c0407426da556e2a757633cc7e4d2e1a057aa688f3"}, + {file = "librt-0.8.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d0ee06b5b5291f609ddb37b9750985b27bc567791bc87c76a569b3feed8481ac"}, + {file = "librt-0.8.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:9e2c6f77b9ad48ce5603b83b7da9ee3e36b3ab425353f695cba13200c5d96596"}, + {file = "librt-0.8.1-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:439352ba9373f11cb8e1933da194dcc6206daf779ff8df0ed69c5e39113e6a99"}, + {file = "librt-0.8.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:82210adabbc331dbb65d7868b105185464ef13f56f7f76688565ad79f648b0fe"}, + 
{file = "librt-0.8.1-cp311-cp311-win32.whl", hash = "sha256:52c224e14614b750c0a6d97368e16804a98c684657c7518752c356834fff83bb"}, + {file = "librt-0.8.1-cp311-cp311-win_amd64.whl", hash = "sha256:c00e5c884f528c9932d278d5c9cbbea38a6b81eb62c02e06ae53751a83a4d52b"}, + {file = "librt-0.8.1-cp311-cp311-win_arm64.whl", hash = "sha256:f7cdf7f26c2286ffb02e46d7bac56c94655540b26347673bea15fa52a6af17e9"}, + {file = "librt-0.8.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a28f2612ab566b17f3698b0da021ff9960610301607c9a5e8eaca62f5e1c350a"}, + {file = "librt-0.8.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:60a78b694c9aee2a0f1aaeaa7d101cf713e92e8423a941d2897f4fa37908dab9"}, + {file = "librt-0.8.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:758509ea3f1eba2a57558e7e98f4659d0ea7670bff49673b0dde18a3c7e6c0eb"}, + {file = "librt-0.8.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:039b9f2c506bd0ab0f8725aa5ba339c6f0cd19d3b514b50d134789809c24285d"}, + {file = "librt-0.8.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5bb54f1205a3a6ab41a6fd71dfcdcbd278670d3a90ca502a30d9da583105b6f7"}, + {file = "librt-0.8.1-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:05bd41cdee35b0c59c259f870f6da532a2c5ca57db95b5f23689fcb5c9e42440"}, + {file = "librt-0.8.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:adfab487facf03f0d0857b8710cf82d0704a309d8ffc33b03d9302b4c64e91a9"}, + {file = "librt-0.8.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:153188fe98a72f206042be10a2c6026139852805215ed9539186312d50a8e972"}, + {file = "librt-0.8.1-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:dd3c41254ee98604b08bd5b3af5bf0a89740d4ee0711de95b65166bf44091921"}, + {file = "librt-0.8.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e0d138c7ae532908cbb342162b2611dbd4d90c941cd25ab82084aaf71d2c0bd0"}, + {file 
= "librt-0.8.1-cp312-cp312-win32.whl", hash = "sha256:43353b943613c5d9c49a25aaffdba46f888ec354e71e3529a00cca3f04d66a7a"}, + {file = "librt-0.8.1-cp312-cp312-win_amd64.whl", hash = "sha256:ff8baf1f8d3f4b6b7257fcb75a501f2a5499d0dda57645baa09d4d0d34b19444"}, + {file = "librt-0.8.1-cp312-cp312-win_arm64.whl", hash = "sha256:0f2ae3725904f7377e11cc37722d5d401e8b3d5851fb9273d7f4fe04f6b3d37d"}, + {file = "librt-0.8.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7e6bad1cd94f6764e1e21950542f818a09316645337fd5ab9a7acc45d99a8f35"}, + {file = "librt-0.8.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cf450f498c30af55551ba4f66b9123b7185362ec8b625a773b3d39aa1a717583"}, + {file = "librt-0.8.1-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:eca45e982fa074090057132e30585a7e8674e9e885d402eae85633e9f449ce6c"}, + {file = "librt-0.8.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0c3811485fccfda840861905b8c70bba5ec094e02825598bb9d4ca3936857a04"}, + {file = "librt-0.8.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e4af413908f77294605e28cfd98063f54b2c790561383971d2f52d113d9c363"}, + {file = "librt-0.8.1-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5212a5bd7fae98dae95710032902edcd2ec4dc994e883294f75c857b83f9aba0"}, + {file = "librt-0.8.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e692aa2d1d604e6ca12d35e51fdc36f4cda6345e28e36374579f7ef3611b3012"}, + {file = "librt-0.8.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:4be2a5c926b9770c9e08e717f05737a269b9d0ebc5d2f0060f0fe3fe9ce47acb"}, + {file = "librt-0.8.1-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:fd1a720332ea335ceb544cf0a03f81df92abd4bb887679fd1e460976b0e6214b"}, + {file = "librt-0.8.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2af9e01e0ef80d95ae3c720be101227edae5f2fe7e3dc63d8857fadfc5a1d"}, + {file = 
"librt-0.8.1-cp313-cp313-win32.whl", hash = "sha256:086a32dbb71336627e78cc1d6ee305a68d038ef7d4c39aaff41ae8c9aa46e91a"}, + {file = "librt-0.8.1-cp313-cp313-win_amd64.whl", hash = "sha256:e11769a1dbda4da7b00a76cfffa67aa47cfa66921d2724539eee4b9ede780b79"}, + {file = "librt-0.8.1-cp313-cp313-win_arm64.whl", hash = "sha256:924817ab3141aca17893386ee13261f1d100d1ef410d70afe4389f2359fea4f0"}, + {file = "librt-0.8.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:6cfa7fe54fd4d1f47130017351a959fe5804bda7a0bc7e07a2cdbc3fdd28d34f"}, + {file = "librt-0.8.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:228c2409c079f8c11fb2e5d7b277077f694cb93443eb760e00b3b83cb8b3176c"}, + {file = "librt-0.8.1-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7aae78ab5e3206181780e56912d1b9bb9f90a7249ce12f0e8bf531d0462dd0fc"}, + {file = "librt-0.8.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:172d57ec04346b047ca6af181e1ea4858086c80bdf455f61994c4aa6fc3f866c"}, + {file = "librt-0.8.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6b1977c4ea97ce5eb7755a78fae68d87e4102e4aaf54985e8b56806849cc06a3"}, + {file = "librt-0.8.1-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:10c42e1f6fd06733ef65ae7bebce2872bcafd8d6e6b0a08fe0a05a23b044fb14"}, + {file = "librt-0.8.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:4c8dfa264b9193c4ee19113c985c95f876fae5e51f731494fc4e0cf594990ba7"}, + {file = "librt-0.8.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:01170b6729a438f0dedc4a26ed342e3dc4f02d1000b4b19f980e1877f0c297e6"}, + {file = "librt-0.8.1-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:7b02679a0d783bdae30d443025b94465d8c3dc512f32f5b5031f93f57ac32071"}, + {file = "librt-0.8.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:190b109bb69592a3401fe1ffdea41a2e73370ace2ffdc4a0e8e2b39cdea81b78"}, + {file = 
"librt-0.8.1-cp314-cp314-win32.whl", hash = "sha256:e70a57ecf89a0f64c24e37f38d3fe217a58169d2fe6ed6d70554964042474023"}, + {file = "librt-0.8.1-cp314-cp314-win_amd64.whl", hash = "sha256:7e2f3edca35664499fbb36e4770650c4bd4a08abc1f4458eab9df4ec56389730"}, + {file = "librt-0.8.1-cp314-cp314-win_arm64.whl", hash = "sha256:0d2f82168e55ddefd27c01c654ce52379c0750ddc31ee86b4b266bcf4d65f2a3"}, + {file = "librt-0.8.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2c74a2da57a094bd48d03fa5d196da83d2815678385d2978657499063709abe1"}, + {file = "librt-0.8.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a355d99c4c0d8e5b770313b8b247411ed40949ca44e33e46a4789b9293a907ee"}, + {file = "librt-0.8.1-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:2eb345e8b33fb748227409c9f1233d4df354d6e54091f0e8fc53acdb2ffedeb7"}, + {file = "librt-0.8.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9be2f15e53ce4e83cc08adc29b26fb5978db62ef2a366fbdf716c8a6c8901040"}, + {file = "librt-0.8.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:785ae29c1f5c6e7c2cde2c7c0e148147f4503da3abc5d44d482068da5322fd9e"}, + {file = "librt-0.8.1-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1d3a7da44baf692f0c6aeb5b2a09c5e6fc7a703bca9ffa337ddd2e2da53f7732"}, + {file = "librt-0.8.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5fc48998000cbc39ec0d5311312dda93ecf92b39aaf184c5e817d5d440b29624"}, + {file = "librt-0.8.1-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:e96baa6820280077a78244b2e06e416480ed859bbd8e5d641cf5742919d8beb4"}, + {file = "librt-0.8.1-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:31362dbfe297b23590530007062c32c6f6176f6099646bb2c95ab1b00a57c382"}, + {file = "librt-0.8.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cc3656283d11540ab0ea01978378e73e10002145117055e03722417aeab30994"}, + 
{file = "librt-0.8.1-cp314-cp314t-win32.whl", hash = "sha256:738f08021b3142c2918c03692608baed43bc51144c29e35807682f8070ee2a3a"}, + {file = "librt-0.8.1-cp314-cp314t-win_amd64.whl", hash = "sha256:89815a22daf9c51884fb5dbe4f1ef65ee6a146e0b6a8df05f753e2e4a9359bf4"}, + {file = "librt-0.8.1-cp314-cp314t-win_arm64.whl", hash = "sha256:bf512a71a23504ed08103a13c941f763db13fb11177beb3d9244c98c29fb4a61"}, + {file = "librt-0.8.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3dff3d3ca8db20e783b1bc7de49c0a2ab0b8387f31236d6a026597d07fcd68ac"}, + {file = "librt-0.8.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:08eec3a1fc435f0d09c87b6bf1ec798986a3544f446b864e4099633a56fcd9ed"}, + {file = "librt-0.8.1-cp39-cp39-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:e3f0a41487fd5fad7e760b9e8a90e251e27c2816fbc2cff36a22a0e6bcbbd9dd"}, + {file = "librt-0.8.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bacdb58d9939d95cc557b4dbaa86527c9db2ac1ed76a18bc8d26f6dc8647d851"}, + {file = "librt-0.8.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b6d7ab1f01aa753188605b09a51faa44a3327400b00b8cce424c71910fc0a128"}, + {file = "librt-0.8.1-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:4998009e7cb9e896569f4be7004f09d0ed70d386fa99d42b6d363f6d200501ac"}, + {file = "librt-0.8.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2cc68eeeef5e906839c7bb0815748b5b0a974ec27125beefc0f942715785b551"}, + {file = "librt-0.8.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:0bf69d79a23f4f40b8673a947a234baeeb133b5078b483b7297c5916539cf5d5"}, + {file = "librt-0.8.1-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:22b46eabd76c1986ee7d231b0765ad387d7673bbd996aa0d0d054b38ac65d8f6"}, + {file = "librt-0.8.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:237796479f4d0637d6b9cbcb926ff424a97735e68ade6facf402df4ec93375ed"}, + {file = 
"librt-0.8.1-cp39-cp39-win32.whl", hash = "sha256:4beb04b8c66c6ae62f8c1e0b2f097c1ebad9295c929a8d5286c05eae7c2fc7dc"}, + {file = "librt-0.8.1-cp39-cp39-win_amd64.whl", hash = "sha256:64548cde61b692dc0dc379f4b5f59a2f582c2ebe7890d09c1ae3b9e66fa015b7"}, + {file = "librt-0.8.1.tar.gz", hash = "sha256:be46a14693955b3bd96014ccbdb8339ee8c9346fbe11c1b78901b55125f14c73"}, ] -[package.extras] -colors = ["colorama"] -plugins = ["setuptools"] - [[package]] -name = "mccabe" -version = "0.7.0" -description = "McCabe checker, plugin for flake8" +name = "mypy" +version = "1.19.1" +description = "Optional static typing for Python" optional = false -python-versions = ">=3.6" +python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, - {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, + {file = "mypy-1.19.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5f05aa3d375b385734388e844bc01733bd33c644ab48e9684faa54e5389775ec"}, + {file = "mypy-1.19.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:022ea7279374af1a5d78dfcab853fe6a536eebfda4b59deab53cd21f6cd9f00b"}, + {file = "mypy-1.19.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee4c11e460685c3e0c64a4c5de82ae143622410950d6be863303a1c4ba0e36d6"}, + {file = "mypy-1.19.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de759aafbae8763283b2ee5869c7255391fbc4de3ff171f8f030b5ec48381b74"}, + {file = "mypy-1.19.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ab43590f9cd5108f41aacf9fca31841142c786827a74ab7cc8a2eacb634e09a1"}, + {file = "mypy-1.19.1-cp310-cp310-win_amd64.whl", hash = "sha256:2899753e2f61e571b3971747e302d5f420c3fd09650e1951e99f823bc3089dac"}, + {file = "mypy-1.19.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:d8dfc6ab58ca7dda47d9237349157500468e404b17213d44fc1cb77bce532288"}, + {file = "mypy-1.19.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e3f276d8493c3c97930e354b2595a44a21348b320d859fb4a2b9f66da9ed27ab"}, + {file = "mypy-1.19.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2abb24cf3f17864770d18d673c85235ba52456b36a06b6afc1e07c1fdcd3d0e6"}, + {file = "mypy-1.19.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a009ffa5a621762d0c926a078c2d639104becab69e79538a494bcccb62cc0331"}, + {file = "mypy-1.19.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f7cee03c9a2e2ee26ec07479f38ea9c884e301d42c6d43a19d20fb014e3ba925"}, + {file = "mypy-1.19.1-cp311-cp311-win_amd64.whl", hash = "sha256:4b84a7a18f41e167f7995200a1d07a4a6810e89d29859df936f1c3923d263042"}, + {file = "mypy-1.19.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a8174a03289288c1f6c46d55cef02379b478bfbc8e358e02047487cad44c6ca1"}, + {file = "mypy-1.19.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ffcebe56eb09ff0c0885e750036a095e23793ba6c2e894e7e63f6d89ad51f22e"}, + {file = "mypy-1.19.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b64d987153888790bcdb03a6473d321820597ab8dd9243b27a92153c4fa50fd2"}, + {file = "mypy-1.19.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c35d298c2c4bba75feb2195655dfea8124d855dfd7343bf8b8c055421eaf0cf8"}, + {file = "mypy-1.19.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:34c81968774648ab5ac09c29a375fdede03ba253f8f8287847bd480782f73a6a"}, + {file = "mypy-1.19.1-cp312-cp312-win_amd64.whl", hash = "sha256:b10e7c2cd7870ba4ad9b2d8a6102eb5ffc1f16ca35e3de6bfa390c1113029d13"}, + {file = "mypy-1.19.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e3157c7594ff2ef1634ee058aafc56a82db665c9438fd41b390f3bde1ab12250"}, + {file = 
"mypy-1.19.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdb12f69bcc02700c2b47e070238f42cb87f18c0bc1fc4cdb4fb2bc5fd7a3b8b"}, + {file = "mypy-1.19.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f859fb09d9583a985be9a493d5cfc5515b56b08f7447759a0c5deaf68d80506e"}, + {file = "mypy-1.19.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9a6538e0415310aad77cb94004ca6482330fece18036b5f360b62c45814c4ef"}, + {file = "mypy-1.19.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:da4869fc5e7f62a88f3fe0b5c919d1d9f7ea3cef92d3689de2823fd27e40aa75"}, + {file = "mypy-1.19.1-cp313-cp313-win_amd64.whl", hash = "sha256:016f2246209095e8eda7538944daa1d60e1e8134d98983b9fc1e92c1fc0cb8dd"}, + {file = "mypy-1.19.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:06e6170bd5836770e8104c8fdd58e5e725cfeb309f0a6c681a811f557e97eac1"}, + {file = "mypy-1.19.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:804bd67b8054a85447c8954215a906d6eff9cabeabe493fb6334b24f4bfff718"}, + {file = "mypy-1.19.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:21761006a7f497cb0d4de3d8ef4ca70532256688b0523eee02baf9eec895e27b"}, + {file = "mypy-1.19.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:28902ee51f12e0f19e1e16fbe2f8f06b6637f482c459dd393efddd0ec7f82045"}, + {file = "mypy-1.19.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:481daf36a4c443332e2ae9c137dfee878fcea781a2e3f895d54bd3002a900957"}, + {file = "mypy-1.19.1-cp314-cp314-win_amd64.whl", hash = "sha256:8bb5c6f6d043655e055be9b542aa5f3bdd30e4f3589163e85f93f3640060509f"}, + {file = "mypy-1.19.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7bcfc336a03a1aaa26dfce9fff3e287a3ba99872a157561cbfcebe67c13308e3"}, + {file = "mypy-1.19.1-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:b7951a701c07ea584c4fe327834b92a30825514c868b1f69c30445093fdd9d5a"}, + {file = "mypy-1.19.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b13cfdd6c87fc3efb69ea4ec18ef79c74c3f98b4e5498ca9b85ab3b2c2329a67"}, + {file = "mypy-1.19.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f28f99c824ecebcdaa2e55d82953e38ff60ee5ec938476796636b86afa3956e"}, + {file = "mypy-1.19.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c608937067d2fc5a4dd1a5ce92fd9e1398691b8c5d012d66e1ddd430e9244376"}, + {file = "mypy-1.19.1-cp39-cp39-win_amd64.whl", hash = "sha256:409088884802d511ee52ca067707b90c883426bd95514e8cfda8281dc2effe24"}, + {file = "mypy-1.19.1-py3-none-any.whl", hash = "sha256:f1235f5ea01b7db5468d53ece6aaddf1ad0b88d9e7462b86ef96fe04995d7247"}, + {file = "mypy-1.19.1.tar.gz", hash = "sha256:19d88bb05303fe63f71dd2c6270daca27cb9401c4ca8255fe50d1d920e0eb9ba"}, ] +[package.dependencies] +librt = {version = ">=0.6.2", markers = "platform_python_implementation != \"PyPy\""} +mypy_extensions = ">=1.0.0" +pathspec = ">=0.9.0" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing_extensions = ">=4.6.0" + +[package.extras] +dmypy = ["psutil (>=4.0)"] +faster-cache = ["orjson"] +install-types = ["pip"] +mypyc = ["setuptools (>=50)"] +reports = ["lxml"] + [[package]] name = "mypy-extensions" version = "1.1.0" @@ -349,23 +368,6 @@ optional = ["typing-extensions (>=4)"] re2 = ["google-re2 (>=1.1)"] tests = ["pytest (>=9)", "typing-extensions (>=4.15)"] -[[package]] -name = "platformdirs" -version = "4.3.7" -description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
-optional = false -python-versions = ">=3.9" -groups = ["dev"] -files = [ - {file = "platformdirs-4.3.7-py3-none-any.whl", hash = "sha256:a03875334331946f13c549dbd8f4bac7a13a50a895a0eb1e8c6a8ace80d40a94"}, - {file = "platformdirs-4.3.7.tar.gz", hash = "sha256:eb437d586b6a0986388f0d6f74aa0cde27b48d0e3d66843640bfb6bdcdb6e351"}, -] - -[package.extras] -docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"] -test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.4)", "pytest-cov (>=6)", "pytest-mock (>=3.14)"] -type = ["mypy (>=1.14.1)"] - [[package]] name = "pluggy" version = "1.5.0" @@ -382,30 +384,6 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] -[[package]] -name = "pycodestyle" -version = "2.14.0" -description = "Python style guide checker" -optional = false -python-versions = ">=3.9" -groups = ["dev"] -files = [ - {file = "pycodestyle-2.14.0-py2.py3-none-any.whl", hash = "sha256:dd6bf7cb4ee77f8e016f9c8e74a35ddd9f67e1d5fd4184d86c3b98e07099f42d"}, - {file = "pycodestyle-2.14.0.tar.gz", hash = "sha256:c4b5b517d278089ff9d0abdec919cd97262a3367449ea1c8b49b91529167b783"}, -] - -[[package]] -name = "pyflakes" -version = "3.4.0" -description = "passive checker of Python programs" -optional = false -python-versions = ">=3.9" -groups = ["dev"] -files = [ - {file = "pyflakes-3.4.0-py2.py3-none-any.whl", hash = "sha256:f742a7dbd0d9cb9ea41e9a24a918996e8170c799fa528688d40dd582c8265f4f"}, - {file = "pyflakes-3.4.0.tar.gz", hash = "sha256:b24f96fafb7d2ab0ec5075b7350b3d2d2218eab42003821c06344973d3ea2f58"}, -] - [[package]] name = "pygments" version = "2.19.1" @@ -421,36 +399,6 @@ files = [ [package.extras] windows-terminal = ["colorama (>=0.4.6)"] -[[package]] -name = "pylint" -version = "4.0.5" -description = "python code static checker" -optional = false -python-versions = ">=3.10.0" -groups = ["dev"] -files = [ - {file = "pylint-4.0.5-py3-none-any.whl", hash = 
"sha256:00f51c9b14a3b3ae08cff6b2cdd43f28165c78b165b628692e428fb1f8dc2cf2"}, - {file = "pylint-4.0.5.tar.gz", hash = "sha256:8cd6a618df75deb013bd7eb98327a95f02a6fb839205a6bbf5456ef96afb317c"}, -] - -[package.dependencies] -astroid = ">=4.0.2,<=4.1.dev0" -colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} -dill = [ - {version = ">=0.2", markers = "python_version < \"3.11\""}, - {version = ">=0.3.7", markers = "python_version >= \"3.12\""}, - {version = ">=0.3.6", markers = "python_version == \"3.11\""}, -] -isort = ">=5,<5.13 || >5.13,<9" -mccabe = ">=0.6,<0.8" -platformdirs = ">=2.2" -tomli = {version = ">=1.1", markers = "python_version < \"3.11\""} -tomlkit = ">=0.10.1" - -[package.extras] -spelling = ["pyenchant (>=3.2,<4.0)"] -testutils = ["gitpython (>3)"] - [[package]] name = "pytest" version = "9.0.2" @@ -496,59 +444,49 @@ pytest = ">=7" testing = ["process-tests", "pytest-xdist", "virtualenv"] [[package]] -name = "pytokens" -version = "0.4.1" -description = "A Fast, spec compliant Python 3.14+ tokenizer that runs on older Pythons." +name = "ruff" +version = "0.11.13" +description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false -python-versions = ">=3.8" +python-versions = ">=3.7" groups = ["dev"] files = [ - {file = "pytokens-0.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2a44ed93ea23415c54f3face3b65ef2b844d96aeb3455b8a69b3df6beab6acc5"}, - {file = "pytokens-0.4.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:add8bf86b71a5d9fb5b89f023a80b791e04fba57960aa790cc6125f7f1d39dfe"}, - {file = "pytokens-0.4.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:670d286910b531c7b7e3c0b453fd8156f250adb140146d234a82219459b9640c"}, - {file = "pytokens-0.4.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:4e691d7f5186bd2842c14813f79f8884bb03f5995f0575272009982c5ac6c0f7"}, - {file = "pytokens-0.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:27b83ad28825978742beef057bfe406ad6ed524b2d28c252c5de7b4a6dd48fa2"}, - {file = "pytokens-0.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d70e77c55ae8380c91c0c18dea05951482e263982911fc7410b1ffd1dadd3440"}, - {file = "pytokens-0.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4a58d057208cb9075c144950d789511220b07636dd2e4708d5645d24de666bdc"}, - {file = "pytokens-0.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b49750419d300e2b5a3813cf229d4e5a4c728dae470bcc89867a9ad6f25a722d"}, - {file = "pytokens-0.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d9907d61f15bf7261d7e775bd5d7ee4d2930e04424bab1972591918497623a16"}, - {file = "pytokens-0.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:ee44d0f85b803321710f9239f335aafe16553b39106384cef8e6de40cb4ef2f6"}, - {file = "pytokens-0.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:140709331e846b728475786df8aeb27d24f48cbcf7bcd449f8de75cae7a45083"}, - {file = "pytokens-0.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:6d6c4268598f762bc8e91f5dbf2ab2f61f7b95bdc07953b602db879b3c8c18e1"}, - {file = "pytokens-0.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:24afde1f53d95348b5a0eb19488661147285ca4dd7ed752bbc3e1c6242a304d1"}, - {file = "pytokens-0.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5ad948d085ed6c16413eb5fec6b3e02fa00dc29a2534f088d3302c47eb59adf9"}, - {file = "pytokens-0.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:3f901fe783e06e48e8cbdc82d631fca8f118333798193e026a50ce1b3757ea68"}, - {file = "pytokens-0.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8bdb9d0ce90cbf99c525e75a2fa415144fd570a1ba987380190e8b786bc6ef9b"}, - {file = "pytokens-0.4.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5502408cab1cb18e128570f8d598981c68a50d0cbd7c61312a90507cd3a1276f"}, - {file = "pytokens-0.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:29d1d8fb1030af4d231789959f21821ab6325e463f0503a61d204343c9b355d1"}, - {file = "pytokens-0.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:970b08dd6b86058b6dc07efe9e98414f5102974716232d10f32ff39701e841c4"}, - {file = "pytokens-0.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:9bd7d7f544d362576be74f9d5901a22f317efc20046efe2034dced238cbbfe78"}, - {file = "pytokens-0.4.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4a14d5f5fc78ce85e426aa159489e2d5961acf0e47575e08f35584009178e321"}, - {file = "pytokens-0.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97f50fd18543be72da51dd505e2ed20d2228c74e0464e4262e4899797803d7fa"}, - {file = "pytokens-0.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dc74c035f9bfca0255c1af77ddd2d6ae8419012805453e4b0e7513e17904545d"}, - {file = "pytokens-0.4.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = 
"sha256:f66a6bbe741bd431f6d741e617e0f39ec7257ca1f89089593479347cc4d13324"}, - {file = "pytokens-0.4.1-cp314-cp314-win_amd64.whl", hash = "sha256:b35d7e5ad269804f6697727702da3c517bb8a5228afa450ab0fa787732055fc9"}, - {file = "pytokens-0.4.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:8fcb9ba3709ff77e77f1c7022ff11d13553f3c30299a9fe246a166903e9091eb"}, - {file = "pytokens-0.4.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79fc6b8699564e1f9b521582c35435f1bd32dd06822322ec44afdeba666d8cb3"}, - {file = "pytokens-0.4.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d31b97b3de0f61571a124a00ffe9a81fb9939146c122c11060725bd5aea79975"}, - {file = "pytokens-0.4.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:967cf6e3fd4adf7de8fc73cd3043754ae79c36475c1c11d514fc72cf5490094a"}, - {file = "pytokens-0.4.1-cp314-cp314t-win_amd64.whl", hash = "sha256:584c80c24b078eec1e227079d56dc22ff755e0ba8654d8383b2c549107528918"}, - {file = "pytokens-0.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:da5baeaf7116dced9c6bb76dc31ba04a2dc3695f3d9f74741d7910122b456edc"}, - {file = "pytokens-0.4.1-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11edda0942da80ff58c4408407616a310adecae1ddd22eef8c692fe266fa5009"}, - {file = "pytokens-0.4.1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0fc71786e629cef478cbf29d7ea1923299181d0699dbe7c3c0f4a583811d9fc1"}, - {file = "pytokens-0.4.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:dcafc12c30dbaf1e2af0490978352e0c4041a7cde31f4f81435c2a5e8b9cabb6"}, - {file = "pytokens-0.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:42f144f3aafa5d92bad964d471a581651e28b24434d184871bd02e3a0d956037"}, - {file = "pytokens-0.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:34bcc734bd2f2d5fe3b34e7b3c0116bfb2397f2d9666139988e7a3eb5f7400e3"}, - {file = 
"pytokens-0.4.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:941d4343bf27b605e9213b26bfa1c4bf197c9c599a9627eb7305b0defcfe40c1"}, - {file = "pytokens-0.4.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3ad72b851e781478366288743198101e5eb34a414f1d5627cdd585ca3b25f1db"}, - {file = "pytokens-0.4.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:682fa37ff4d8e95f7df6fe6fe6a431e8ed8e788023c6bcc0f0880a12eab80ad1"}, - {file = "pytokens-0.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:30f51edd9bb7f85c748979384165601d028b84f7bd13fe14d3e065304093916a"}, - {file = "pytokens-0.4.1-py3-none-any.whl", hash = "sha256:26cef14744a8385f35d0e095dc8b3a7583f6c953c2e3d269c7f82484bf5ad2de"}, - {file = "pytokens-0.4.1.tar.gz", hash = "sha256:292052fe80923aae2260c073f822ceba21f3872ced9a68bb7953b348e561179a"}, + {file = "ruff-0.11.13-py3-none-linux_armv6l.whl", hash = "sha256:4bdfbf1240533f40042ec00c9e09a3aade6f8c10b6414cf11b519488d2635d46"}, + {file = "ruff-0.11.13-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:aef9c9ed1b5ca28bb15c7eac83b8670cf3b20b478195bd49c8d756ba0a36cf48"}, + {file = "ruff-0.11.13-py3-none-macosx_11_0_arm64.whl", hash = "sha256:53b15a9dfdce029c842e9a5aebc3855e9ab7771395979ff85b7c1dedb53ddc2b"}, + {file = "ruff-0.11.13-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ab153241400789138d13f362c43f7edecc0edfffce2afa6a68434000ecd8f69a"}, + {file = "ruff-0.11.13-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6c51f93029d54a910d3d24f7dd0bb909e31b6cd989a5e4ac513f4eb41629f0dc"}, + {file = "ruff-0.11.13-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1808b3ed53e1a777c2ef733aca9051dc9bf7c99b26ece15cb59a0320fbdbd629"}, + {file = "ruff-0.11.13-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:d28ce58b5ecf0f43c1b71edffabe6ed7f245d5336b17805803312ec9bc665933"}, + {file = 
"ruff-0.11.13-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:55e4bc3a77842da33c16d55b32c6cac1ec5fb0fbec9c8c513bdce76c4f922165"}, + {file = "ruff-0.11.13-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:633bf2c6f35678c56ec73189ba6fa19ff1c5e4807a78bf60ef487b9dd272cc71"}, + {file = "ruff-0.11.13-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4ffbc82d70424b275b089166310448051afdc6e914fdab90e08df66c43bb5ca9"}, + {file = "ruff-0.11.13-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:4a9ddd3ec62a9a89578c85842b836e4ac832d4a2e0bfaad3b02243f930ceafcc"}, + {file = "ruff-0.11.13-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:d237a496e0778d719efb05058c64d28b757c77824e04ffe8796c7436e26712b7"}, + {file = "ruff-0.11.13-py3-none-musllinux_1_2_i686.whl", hash = "sha256:26816a218ca6ef02142343fd24c70f7cd8c5aa6c203bca284407adf675984432"}, + {file = "ruff-0.11.13-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:51c3f95abd9331dc5b87c47ac7f376db5616041173826dfd556cfe3d4977f492"}, + {file = "ruff-0.11.13-py3-none-win32.whl", hash = "sha256:96c27935418e4e8e77a26bb05962817f28b8ef3843a6c6cc49d8783b5507f250"}, + {file = "ruff-0.11.13-py3-none-win_amd64.whl", hash = "sha256:29c3189895a8a6a657b7af4e97d330c8a3afd2c9c8f46c81e2fc5a31866517e3"}, + {file = "ruff-0.11.13-py3-none-win_arm64.whl", hash = "sha256:b4385285e9179d608ff1d2fb9922062663c658605819a6876d8beef0c30b7f3b"}, + {file = "ruff-0.11.13.tar.gz", hash = "sha256:26fa247dc68d1d4e72c179e08889a25ac0c7ba4d78aecfc835d49cbfd60bf514"}, +] + +[[package]] +name = "sqlglot" +version = "30.0.3" +description = "An easily customizable SQL parser and transpiler" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "sqlglot-30.0.3-py3-none-any.whl", hash = "sha256:5489cc98b5666f1fafc21e0304ca286e513e142aa054ee5760806a2139d07a05"}, + {file = "sqlglot-30.0.3.tar.gz", hash = 
"sha256:35ba7514c132b54f87fd1732a65a73615efa9fd83f6e1eed0a315bc9ee3e1027"}, ] [package.extras] -dev = ["black", "build", "mypy", "pytest", "pytest-cov", "setuptools", "tox", "twine", "wheel"] +c = ["sqlglotc (==30.0.3)"] +dev = ["duckdb (>=0.6)", "pandas", "pandas-stubs", "pdoc", "pre-commit", "pyperf", "python-dateutil", "pytz", "ruff (==0.15.6)", "setuptools_scm", "sqlglot-mypy (>=1.19.1.post1)", "types-python-dateutil", "types-pytz", "typing_extensions"] +rs = ["sqlglotc (==30.0.3)", "sqlglotrs (==0.13.0)"] [[package]] name = "sqlparse" @@ -573,7 +511,7 @@ description = "A lil' TOML parser" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version == \"3.10\"" +markers = "python_full_version <= \"3.11.0a6\"" files = [ {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"}, @@ -609,32 +547,19 @@ files = [ {file = "tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff"}, ] -[[package]] -name = "tomlkit" -version = "0.13.2" -description = "Style preserving TOML library" -optional = false -python-versions = ">=3.8" -groups = ["dev"] -files = [ - {file = "tomlkit-0.13.2-py3-none-any.whl", hash = "sha256:7a974427f6e119197f670fbbbeae7bef749a6c14e793db934baefc1b5f03efde"}, - {file = "tomlkit-0.13.2.tar.gz", hash = "sha256:fff5fe59a87295b278abd31bec92c15d9bc4a06885ab12bcea52c71119392e79"}, -] - [[package]] name = "typing-extensions" -version = "4.13.2" -description = "Backported and Experimental Type Hints for Python 3.8+" +version = "4.15.0" +description = "Backported and Experimental Type Hints for Python 3.9+" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["dev"] -markers = "python_version == \"3.10\"" files = [ - {file = 
"typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c"}, - {file = "typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef"}, + {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, + {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, ] [metadata] lock-version = "2.1" python-versions = "^3.10" -content-hash = "a44741e2c45e6702fb176a07d1bacb6b4f3e887d907bb2d8c1439785edded9c3" +content-hash = "bf0ac67ffa320d1ed6a0f60a19f6a0243d54233d3c754ef5fbb3b3fd47a1ff03" diff --git a/pyproject.toml b/pyproject.toml index f402d82c..827a00e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,19 +15,38 @@ packages = [ [tool.poetry.dependencies] python = "^3.10" sqlparse = ">=0.4.1,<0.6.0" +sqlglot = "^30.0.3" -[tool.poetry.dev-dependencies] -black = "^26.3" +[tool.poetry.group.dev.dependencies] coverage = {extras = ["toml"], version = "^7.13"} -pylint = "^4.0.5" pytest = "^9.0.2" pytest-cov = "^7.1.0" -flake8 = "^7.3.0" +ruff = "^0.11" +mypy = "^1.19" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" +[tool.ruff] +line-length = 88 +target-version = "py310" + +[tool.ruff.lint] +select = ["E", "F", "W", "C90", "I"] + +[tool.ruff.lint.mccabe] +max-complexity = 8 + +[tool.mypy] +python_version = "3.10" +warn_return_any = true +warn_unused_configs = true +check_untyped_defs = true +disallow_untyped_defs = true +disallow_any_generics = true +ignore_missing_imports = true + [tool.coverage.run] relative_files = true diff --git a/sql_metadata/__init__.py b/sql_metadata/__init__.py index cb4048f5..183e47bf 100644 --- a/sql_metadata/__init__.py +++ b/sql_metadata/__init__.py @@ -1,10 +1,22 @@ -""" -Module for parsing sql queries and returning columns, -tables, names 
of with statements etc. +"""Parse SQL queries and extract structural metadata. + +The ``sql-metadata`` package analyses SQL statements and returns the +tables, columns, aliases, CTE definitions, subqueries, values, comments, +and query type they contain. The primary entry point is :class:`Parser`:: + + from sql_metadata import Parser + + parser = Parser("SELECT id, name FROM users WHERE active = 1") + print(parser.tables) # ['users'] + print(parser.columns) # ['id', 'name', 'active'] + +Under the hood the library delegates to `sqlglot `_ +for AST construction and tokenization, with custom dialect handling for +MSSQL, MySQL, Hive/Spark, and TSQL bracket notation. """ -# pylint:disable=unsubscriptable-object -from sql_metadata.parser import Parser +from sql_metadata.exceptions import InvalidQueryDefinition from sql_metadata.keywords_lists import QueryType +from sql_metadata.parser import Parser -__all__ = ["Parser", "QueryType"] +__all__ = ["InvalidQueryDefinition", "Parser", "QueryType"] diff --git a/sql_metadata/ast_parser.py b/sql_metadata/ast_parser.py new file mode 100644 index 00000000..fe33a191 --- /dev/null +++ b/sql_metadata/ast_parser.py @@ -0,0 +1,107 @@ +"""Wrap ``sqlglot.parse()`` to produce an AST from raw SQL strings. + +Thin orchestrator that composes :class:`~sql_cleaner.SqlCleaner` (raw SQL +preprocessing) and :class:`~dialect_parser.DialectParser` (dialect +detection, parsing, quality validation) so that downstream extractors +always receive a clean ``sqlglot.exp.Expression`` tree (or ``None`` / +``ValueError``). +""" + +from sqlglot import exp +from sqlglot.dialects.dialect import DialectType + +from sql_metadata.dialect_parser import DialectParser +from sql_metadata.sql_cleaner import SqlCleaner + + +class ASTParser: + """Lazy wrapper around SQL parsing with dialect auto-detection. + + Instantiated once per :class:`Parser` with the raw SQL string. 
The + actual parsing is deferred until :attr:`ast` is first accessed, at + which point the SQL is cleaned and parsed through one or more sqlglot + dialects until a satisfactory AST is obtained. + + :param sql: Raw SQL query string. + :type sql: str + """ + + def __init__(self, sql: str) -> None: + self._raw_sql = sql + self._ast: exp.Expression | None = None + self._dialect: DialectType = None + self._parsed = False + self._is_replace = False + self._cte_name_map: dict[str, str] = {} + + @property + def ast(self) -> exp.Expression | None: + """The sqlglot AST for the query, lazily parsed on first access. + + :returns: Root AST node, or ``None`` for empty/comment-only queries. + :rtype: exp.Expression + :raises ValueError: If the SQL is malformed and cannot be parsed. + """ + if self._parsed: + return self._ast + self._parsed = True + self._ast = self._parse(self._raw_sql) + return self._ast + + @property + def dialect(self) -> DialectType: + """The sqlglot dialect that produced the current AST. + + Set as a side-effect of :attr:`ast` access. May be ``None`` + (default dialect), a string like ``"mysql"``, or a custom + :class:`Dialect` subclass such as :class:`HashVarDialect`. + """ + _ = self.ast + return self._dialect + + @property + def is_replace(self) -> bool: + """Whether the original query was a ``REPLACE INTO`` statement. + + ``REPLACE INTO`` is rewritten to ``INSERT INTO`` before parsing + (sqlglot otherwise produces an opaque ``Command`` node). This + flag allows :attr:`Parser.query_type` to restore the correct + :class:`QueryType.REPLACE` value. + """ + _ = self.ast + return self._is_replace + + @property + def cte_name_map(self) -> dict[str, str]: + """Map of placeholder CTE names back to their original qualified form. + + Keys are underscore-separated placeholders (``db__DOT__name``), + values are the original dotted names (``db.name``). 
+ """ + _ = self.ast + return self._cte_name_map + + def _parse(self, sql: str) -> exp.Expression | None: + """Parse *sql* into a sqlglot AST. + + Delegates preprocessing to :class:`SqlCleaner` and dialect + detection / parsing to :class:`DialectParser`. + + :param sql: Raw SQL string (may include comments). + :type sql: str + :returns: Root AST node, or ``None`` for empty input. + :rtype: exp.Expression | None + :raises ValueError: If the SQL is malformed. + """ + if not sql or not sql.strip(): + return None + + result = SqlCleaner.clean(sql) + if result.sql is None: + return None + + self._is_replace = result.is_replace + self._cte_name_map = result.cte_name_map + + ast, self._dialect = DialectParser().parse(result.sql) + return ast diff --git a/sql_metadata/column_extractor.py b/sql_metadata/column_extractor.py new file mode 100644 index 00000000..7718a0ef --- /dev/null +++ b/sql_metadata/column_extractor.py @@ -0,0 +1,1087 @@ +"""Single-pass SQL metadata extraction from a sqlglot AST. + +Walks the AST in ``arg_types``-key order (which mirrors the left-to-right +SQL text order) and collects columns, column aliases, CTE names, and +subquery names into a :class:`_Collector` accumulator. The +:class:`ColumnExtractor` class encapsulates the walk and all helper methods, +replacing the earlier flat-function design with a cohesive class. + +The public entry point is :meth:`ColumnExtractor.extract`, which returns an +:class:`ExtractionResult` dataclass consumed by :attr:`Parser.columns` +and friends. 
+""" + +from dataclasses import dataclass +from typing import Any + +from sqlglot import exp + +from sql_metadata.exceptions import InvalidQueryDefinition +from sql_metadata.utils import UniqueList, last_segment + +# --------------------------------------------------------------------------- +# Result dataclass +# --------------------------------------------------------------------------- + + +@dataclass(frozen=True) +class ExtractionResult: + """Immutable container for column extraction results. + + Replaces the earlier 7-tuple return value with named fields. + """ + + columns: UniqueList + columns_dict: dict[str, UniqueList] + alias_names: UniqueList + alias_dict: dict[str, UniqueList] | None + alias_map: dict[str, str | list[str]] + cte_names: UniqueList + subquery_names: UniqueList + output_columns: list[str] + + +# --------------------------------------------------------------------------- +# Clause classification (pure functions, no state) +# --------------------------------------------------------------------------- + + +#: Simple key → clause-name lookup for most ``arg_types`` keys. +_CLAUSE_MAP: dict[str, str] = { + "where": "where", + "group": "group_by", + "order": "order_by", + "having": "having", +} + +#: Keys that map to the ``"join"`` clause section. +_JOIN_KEYS = frozenset({"on", "using"}) + + +def _classify_expressions_clause(parent_type: type) -> str: + """Resolve the clause for an ``"expressions"`` key based on the parent node. + + :param parent_type: The type of the parent AST node. + :returns: ``"update"``, ``"select"``, or ``""`` for other parents. + """ + if parent_type is exp.Update: + return "update" + if parent_type is exp.Select: + return "select" + return "" + + +def _classify_clause(key: str, parent_type: type) -> str: + """Map an ``arg_types`` key and parent node type to a ``columns_dict`` section. + + :param key: The ``arg_types`` key through which the child was reached. + :param parent_type: The type of the parent AST node. 
+ :returns: Section name string, or ``""`` if the key does not map. + """ + if key == "expressions": + return _classify_expressions_clause(parent_type) + if key in _JOIN_KEYS: + return "join" + return _CLAUSE_MAP.get(key, "") + + +# --------------------------------------------------------------------------- +# Pure helpers (no state) +# --------------------------------------------------------------------------- + + +def _dfs(node: exp.Expression) -> Any: + """Yield *node* and all its descendants in depth-first order. + + :param node: Root expression node. + :yields: Each expression node in DFS pre-order. + """ + yield node + for child in node.iter_expressions(): + yield from _dfs(child) + + +#: Functions whose first argument is a date-part unit keyword, not a column. +_DATE_PART_FUNCTIONS = frozenset( + { + "dateadd", + "datediff", + "datepart", + "datename", + "date_add", + "date_sub", + "date_diff", + "date_trunc", + "timestampadd", + "timestampdiff", + } +) + + +def _is_date_part_unit(node: exp.Column) -> bool: + """Return True if *node* is the first arg of a date-part function.""" + parent = node.parent + if ( + isinstance(parent, exp.Anonymous) + and parent.this.lower() in _DATE_PART_FUNCTIONS + ): + exprs = parent.expressions + return len(exprs) > 0 and exprs[0] is node + return False + + +# --------------------------------------------------------------------------- +# Collector — accumulates results during AST walk +# --------------------------------------------------------------------------- + + +class _Collector: + """Mutable accumulator for metadata gathered during the AST walk. + + :param table_aliases: Pre-computed table alias → real name mapping. 
+ """ + + __slots__ = ( + "ta", + "columns", + "columns_dict", + "alias_names", + "alias_dict", + "alias_map", + "cte_names", + "cte_alias_names", + "subquery_items", + "output_columns", + ) + + def __init__(self, table_aliases: dict[str, str]): + self.ta = table_aliases + self.columns = UniqueList() + self.columns_dict: dict[str, UniqueList] = {} + self.alias_names = UniqueList() + self.alias_dict: dict[str, UniqueList] = {} + self.alias_map: dict[str, str | list[str]] = {} + self.cte_names = UniqueList() + self.cte_alias_names: set[str] = set() + self.subquery_items: list[tuple[int, str]] = [] + self.output_columns: list[str] = [] + + def add_column(self, name: str, clause: str) -> None: + """Record a column name, filing it into the appropriate section.""" + self.columns.append(name) + if clause: + self.columns_dict.setdefault(clause, UniqueList()).append(name) + + def add_alias(self, name: str, target: Any, clause: str) -> None: + """Record a column alias and its target expression.""" + self.alias_names.append(name) + if clause: + self.alias_dict.setdefault(clause, UniqueList()).append(name) + if target is not None: + self.alias_map[name] = target + + +# --------------------------------------------------------------------------- +# arg_types keys to skip during the walk. +# --------------------------------------------------------------------------- + +_SKIP_KEYS = frozenset({"conflict", "returning", "alternative"}) + + +# --------------------------------------------------------------------------- +# ColumnExtractor — the main class +# --------------------------------------------------------------------------- + + +class ColumnExtractor: + """Single-pass DFS extraction of columns, aliases, CTEs, and subqueries. + + Walks the AST in ``arg_types``-key order (which mirrors the left-to-right + SQL text order) and collects all metadata into an internal + :class:`_Collector`. Call :meth:`extract` to run the walk and return an + :class:`ExtractionResult`. 
+ + The class is designed around a single public entry point + (:meth:`extract`), which triggers a recursive depth-first traversal of + the sqlglot AST. Specialised handler methods process leaf-like nodes + (columns, aliases, CTEs, subqueries) while the walk engine manages + clause classification and child iteration. + + :param ast: Root sqlglot AST node (e.g. ``Select``, ``Insert``, + ``Create``). + :param table_aliases: Pre-computed mapping of table alias names to + their real (resolved) table names. + :param cte_name_map: Optional mapping of placeholder CTE names + (produced by :class:`SqlCleaner`) back to the original qualified + CTE names. + """ + + def __init__( + self, + ast: exp.Expression, + table_aliases: dict[str, str], + cte_name_map: dict[str, str] | None = None, + ): + self._ast = ast + self._table_aliases = table_aliases + self._cte_name_map = cte_name_map or {} + self._collector = _Collector(table_aliases) + self._reverse_cte_map = self._cte_name_map + + # ------------------------------------------------------------------- + # Public API + # ------------------------------------------------------------------- + + def extract(self) -> ExtractionResult: + """Run the full extraction walk and return an immutable result. + + Orchestrates the three-phase extraction process: + + 1. **Seed** — pre-populate CTE names so downstream handlers can + recognise CTE column-alias references. + 2. **Walk** — depth-first traversal of the AST, dispatching each + node to the appropriate handler. + 3. **Finalise** — restore qualified CTE names, sort subquery + names, and package everything into an :class:`ExtractionResult`. + + For ``CREATE TABLE`` statements without a ``SELECT`` body (pure + DDL), only ``ColumnDef`` nodes are collected during the walk. + + Example SQL:: + + SELECT a, b FROM t WHERE a > 1 + + :returns: An :class:`ExtractionResult` containing columns, + aliases, CTE names, subquery names, and output columns. 
+ """ + c = self._collector + + self._seed_cte_names() + + # Reset cte_names — walk will re-collect them in text order + c.cte_names = UniqueList() + self._walk(self._ast) + + # Restore qualified CTE names (reverse placeholder mapping) + final_cte = UniqueList() + for name in c.cte_names: + final_cte.append(self._reverse_cte_map.get(name, name)) + + alias_dict = c.alias_dict if c.alias_dict else None + return ExtractionResult( + columns=c.columns, + columns_dict=c.columns_dict, + alias_names=c.alias_names, + alias_dict=alias_dict, + alias_map=c.alias_map, + cte_names=final_cte, + subquery_names=self._build_subquery_names(), + output_columns=c.output_columns, + ) + + # ------------------------------------------------------------------- + # Setup helpers + # ------------------------------------------------------------------- + + def _seed_cte_names(self) -> None: + """Pre-populate CTE names in the collector before the main walk. + + Scans the AST for all ``CTE`` nodes and records their alias + names. This allows :meth:`_handle_column` to recognize + references like ``cte_name.col`` as CTE column-alias references + rather than regular columns. + + Example SQL:: + + WITH sales AS (SELECT id FROM orders) SELECT sales.id FROM sales + + The seed step records ``"sales"`` so that ``sales.id`` in the + outer SELECT can be identified as a CTE-qualified reference. + """ + for cte in self._ast.find_all(exp.CTE): + alias = cte.alias + if alias: + self._collector.cte_names.append( + self._reverse_cte_map.get(alias, alias) + ) + + def _build_subquery_names(self) -> UniqueList: + """Sort collected subquery items by depth and return their names. + + Subqueries are collected during the walk with their nesting + depth. This method sorts them innermost-first (descending depth) + and returns a :class:`UniqueList` of alias names in that order. 
+ + Example SQL:: + + SELECT (SELECT 1) AS a, (SELECT 2) AS b FROM t + + :returns: A :class:`UniqueList` of subquery alias names, ordered + from innermost to outermost. + """ + c = self._collector + c.subquery_items.sort(key=lambda x: -x[0]) + names = UniqueList() + for _, name in c.subquery_items: + names.append(name) + return names + + # ------------------------------------------------------------------- + # DFS walk engine + # ------------------------------------------------------------------- + + def _walk( + self, node: exp.Expression, clause: str = "", depth: int = 0 + ) -> None: + """Perform a depth-first walk of the AST in ``arg_types`` key order. + + This is the core recursive method. For each node it first + attempts leaf dispatch via :meth:`_dispatch_leaf`. If the node + is not a leaf, it iterates the node's ``arg_types`` keys in + declaration order (which mirrors SQL text order) and recurses + into each populated child. + + :param node: The current AST node to process. + :param clause: The current SQL clause context (e.g. ``"select"``, + ``"where"``). Propagated to child nodes and used to file + columns into ``columns_dict`` sections. + :param depth: Current nesting depth, used to sort subqueries by + depth (innermost first). + """ + assert node is not None + + if self._dispatch_leaf(node, clause, depth): + return + + if hasattr(node, "arg_types"): + self._walk_children(node, clause, depth) + + def _walk_children(self, node: exp.Expression, clause: str, depth: int) -> None: + """Iterate and recurse into children of *node* in ``arg_types`` key order. + + For each child key, determines the SQL clause context (e.g. + ``"where"`` → ``where``, ``"on"`` → ``join``) via + :func:`_classify_clause`. Special-case keys (SELECT expressions, + INSERT schema, JOIN USING) are routed to dedicated handlers via + :meth:`_process_child_key`; all others get the default recursive + walk via :meth:`_recurse_child`. 
+ + :param node: Parent AST node whose children are being iterated. + :param clause: Inherited clause context from the parent. + :param depth: Current nesting depth. + """ + for key in node.arg_types: + if key in _SKIP_KEYS: + continue + child = node.args.get(key) + if child is None: + continue + + new_clause = _classify_clause(key, type(node)) or clause + + if not self._process_child_key(node, key, child, new_clause, depth): + self._recurse_child(child, new_clause, depth) + + def _dispatch_leaf(self, node: exp.Expression, clause: str, depth: int) -> bool: + """Dispatch leaf-like AST nodes to their specialised handlers. + + Checks if *node* is a terminal or semi-terminal node type that + should be handled directly rather than recursed into. Each + branch delegates to the appropriate handler and returns ``True`` + to stop further recursion, or ``False`` to let the walk continue. + + :param node: The AST node to inspect. + :param clause: Current clause context. + :param depth: Current nesting depth. + :returns: ``True`` if the node was handled (caller should stop + recursion), ``False`` to continue the walk. + """ + if self._is_literal_values_without_subquery(node): + # e.g. INSERT INTO t VALUES (1, 2) — skip literal value lists + return True + if isinstance(node, (exp.Star, exp.ColumnDef, exp.Identifier)): + if isinstance(node, exp.ColumnDef): + # e.g. CREATE TABLE t (col INT) — collect ColumnDef names + self._collector.add_column(node.name, clause) + # Star and Identifier are terminal — no further recursion + return True + if isinstance(node, exp.CTE): + # e.g. WITH cte AS (SELECT ...) — delegate to CTE handler + self._handle_cte(node, depth) + return True + if isinstance(node, exp.Column): + # e.g. SELECT t.col FROM t — delegate to column handler + self._handle_column(node, clause) + return True + if isinstance(node, exp.Subquery) and node.alias: + # e.g. 
SELECT (SELECT 1) AS sub — record named subquery + self._collector.subquery_items.append((depth, node.alias)) + return False + + def _process_child_key( + self, node: exp.Expression, key: str, child: Any, clause: str, depth: int + ) -> bool: + """Route special ``arg_types`` keys to dedicated handlers. + + Intercepts three specific key/parent combinations that need + custom processing instead of the default recursive walk: + + - ``"expressions"`` on a ``SELECT`` — column list with aliases + - ``"this"`` on an ``INSERT`` — schema with target column names + - ``"using"`` on a ``JOIN`` — shared column identifiers + + Example SQL:: + + SELECT a, b AS c FROM t JOIN t2 USING (id) + + :param node: Parent AST node. + :param key: The ``arg_types`` key for the child. + :param child: The child node or list of nodes. + :param clause: Current clause context. + :param depth: Current nesting depth. + :returns: ``True`` if handled by a specialised handler, + ``False`` for default recursive walk. + """ + if key == "expressions" and isinstance(node, exp.Select): + # e.g. SELECT a, b, c — handle the SELECT expression list + self._handle_select_exprs(child, clause, depth) + return True + if isinstance(node, exp.Insert) and key == "this": + # e.g. INSERT INTO t (col1, col2) — extract schema columns + self._handle_insert_schema(node) + return True + if key == "using" and isinstance(node, exp.Join): + # e.g. JOIN t2 USING (id) — extract shared join columns + self._handle_join_using(child) + return True + return False + + def _recurse_child(self, child: Any, clause: str, depth: int) -> None: + """Recursively walk a child value, handling both single nodes and lists. + + This is the default recursion path for ``arg_types`` children + that are not intercepted by :meth:`_process_child_key`. + + :param child: A single :class:`~sqlglot.expressions.Expression` + or a list of expressions. + :param clause: Current clause context to propagate. 
+ :param depth: Current nesting depth (incremented for children). + """ + if isinstance(child, list): + # e.g. GROUP BY a, b — child is a list of Column expressions + for item in child: + if isinstance(item, exp.Expression): + self._walk(item, clause, depth + 1) + elif isinstance(child, exp.Expression): + # e.g. WHERE a > 1 — child is a single expression tree + self._walk(child, clause, depth + 1) + + # ------------------------------------------------------------------- + # Node handlers + # ------------------------------------------------------------------- + + def _handle_select_exprs( + self, exprs: list[exp.Expression], clause: str, depth: int + ) -> None: + """Process the expression list of a SELECT clause. + + Iterates each expression in the SELECT list, dispatching to + the appropriate handler based on node type. Also builds the + ``output_columns`` list which records the projected column + names in their original SELECT order. + + Example SQL:: + + SELECT a, b AS alias, *, COALESCE(c, d) FROM t + + :param exprs: List of expression nodes from ``SELECT.expressions``. + :param clause: Current clause context (typically ``"select"``). + :param depth: Current nesting depth. + """ + assert isinstance(exprs, list) + out = self._collector.output_columns + + for expr in exprs: + if isinstance(expr, exp.Alias): + # e.g. SELECT price * qty AS total + self._handle_alias(expr, clause, depth) + out.append(expr.alias) + elif isinstance(expr, exp.Star): + # e.g. SELECT * + self._collector.add_column("*", clause) + out.append("*") + elif isinstance(expr, exp.Column): + # e.g. SELECT t.col_name + self._handle_column(expr, clause) + out.append(self._column_full_name(expr)) + else: + # e.g. 
SELECT COALESCE(a, b) — function/expression without alias + cols = self._flat_columns(expr) + for col in cols: + self._collector.add_column(col, clause) + out.append(cols[0] if len(cols) == 1 else str(expr)) + + def _handle_alias(self, alias_node: exp.Alias, clause: str, depth: int) -> None: + """Process an ``Alias`` node from a SELECT expression list. + + Handles three cases: + + 1. **Subquery alias** — the alias wraps a subquery (contains a + ``SELECT``). The subquery body is walked recursively, and + the alias target is derived from the subquery's own SELECT + columns. + 2. **Expression alias with columns** — the inner expression + contains one or more column references (e.g. ``a + b AS + total``). Columns are recorded and the alias is mapped to + its source column(s). + 3. **Expression alias without columns** — a literal or star + expression (e.g. ``COUNT(*) AS cnt``). The alias is + recorded with a ``"*"`` or ``None`` target. + + Example SQL:: + + SELECT (SELECT id FROM t) AS sub, a + b AS total, 1 AS one + + :param alias_node: The ``Alias`` AST node. + :param clause: Current clause context. + :param depth: Current nesting depth. + """ + c = self._collector + alias_name = alias_node.alias + inner = alias_node.this + + select = inner.find(exp.Select) + if select: + # Case 1: alias wraps a subquery — e.g. SELECT (SELECT id FROM t) AS sub + self._walk(inner, clause, depth + 1) + target_cols = self._flat_columns_select_only(select) + target = ( + target_cols[0] + if len(target_cols) == 1 + else (target_cols if target_cols else None) + ) + c.add_alias(alias_name, target, clause) + return + + inner_cols = self._flat_columns(inner) + + if inner_cols: + # Case 2: inner expression has column references + # e.g. 
SELECT a + b AS total — record columns a, b + for col in inner_cols: + c.add_column(col, clause) + + unique_inner = UniqueList(inner_cols) + is_self_alias = self._is_self_alias(alias_name, unique_inner) + is_direct = isinstance(inner, exp.Column) + + if is_direct and is_self_alias: + pass # e.g. SELECT col AS col — trivial self-alias, skip + else: + target = None + if not is_self_alias: + # e.g. SELECT a + b AS total → target = ["a", "b"] + target = unique_inner[0] if len(unique_inner) == 1 else unique_inner + c.add_alias(alias_name, target, clause) + else: + # Case 3: no column references — e.g. SELECT COUNT(*) AS cnt + target = None + if inner.find(exp.Star): + # e.g. SELECT * AS all_cols — star target + target = "*" + c.add_alias(alias_name, target, clause) + + def _handle_cte(self, cte: exp.CTE, depth: int) -> None: + """Process a CTE (Common Table Expression) AST node. + + Records the CTE alias as a CTE name. If the CTE declares + explicit column aliases (e.g. ``cte(x, y) AS (...)``), maps + each alias to its corresponding column from the CTE body. + Otherwise, walks the CTE body recursively to extract its + columns normally. + + Example SQL:: + + WITH cte(x, y) AS (SELECT a, b FROM t) SELECT x FROM cte + + :param cte: The ``CTE`` AST node. + :param depth: Current nesting depth. + :raises InvalidQueryDefinition: If the CTE has no alias (invalid SQL). + """ + c = self._collector + alias = cte.alias + if not alias: + raise InvalidQueryDefinition( + "All CTEs require an alias, not a valid SQL" + ) + + c.cte_names.append(alias) + + body = cte.this + + if self._has_cte_explicit_column_definitions(cte): + # e.g. 
WITH stats(total, avg) AS (SELECT SUM(x), AVG(x) FROM t) + table_alias = cte.args.get("alias") + assert table_alias is not None + body_cols = self._flat_columns(body) + real_cols = [x for x in body_cols if x != "*"] + cte_col_names = [col.name for col in table_alias.columns] + + for col in body_cols: + c.add_column(col, "select") + + for i, cte_col in enumerate(cte_col_names): + if i < len(real_cols): + # Map CTE alias to body column by position + target = real_cols[i] + elif "*" in body_cols: + # Body uses SELECT * — map alias to "*" + target = "*" + else: + # More aliases than body columns — no target + target = None + c.add_alias(cte_col, target, "select") + c.cte_alias_names.add(cte_col) + elif self._is_cte_with_query_body(body): + # CTE without column aliases — e.g. WITH cte AS (SELECT a ...) + self._walk(body, "", depth + 1) + + def _handle_insert_schema(self, node: exp.Insert) -> None: + """Extract target column names from the Schema of an INSERT statement. + + Looks for the ``Schema`` node inside the INSERT AST and records + each column identifier as an ``"insert"``-clause column. + + Example SQL:: + + INSERT INTO users (name, email) VALUES ('a', 'b') + + :param node: The ``Insert`` AST node. + """ + schema = node.find(exp.Schema) + if schema and schema.expressions: + for col_id in schema.expressions: + name = col_id.name if hasattr(col_id, "name") else str(col_id) + self._collector.add_column(name, "insert") + + def _handle_join_using(self, child: Any) -> None: + """Extract column identifiers from a ``JOIN ... USING`` clause. + + Iterates the identifier list and records each as a + ``"join"``-clause column. + + Example SQL:: + + SELECT * FROM orders JOIN customers USING (customer_id) + + :param child: The USING clause child — a list of identifier + nodes. + """ + if isinstance(child, list): + # e.g. 
USING (id, name) — child is a list of Identifier nodes + for item in child: + if hasattr(item, "name"): + self._collector.add_column(item.name, "join") + + def _handle_column(self, col: exp.Column, clause: str) -> None: + """Process a ``Column`` AST node during the walk. + + Handles several column forms: + + - **Table-qualified star** — ``t.*`` is recorded as + ``"resolved_table.*"``. + - **CTE column-alias reference** — ``cte.col`` where ``col`` + is a known CTE alias is filed into ``alias_dict`` instead of + ``columns``. + - **Bare alias reference** — a bare name matching a known alias + (e.g. in ``ORDER BY alias``) is filed into ``alias_dict``. + - **Regular column** — everything else is recorded via the + fully-qualified name. + + Example SQL:: + + SELECT t.id, t.*, alias_col FROM t ORDER BY alias_col + + :param col: The ``Column`` AST node. + :param clause: Current clause context. + """ + c = self._collector + + star = col.find(exp.Star) + if star: + # e.g. SELECT t.* — table-qualified star + table = col.table + if table: + table = self._resolve_table_alias(table) + c.add_column(f"{table}.*", clause) + return + + if self._is_cte_column_alias_reference(col): + # e.g. SELECT cte.x — CTE column alias reference + c.alias_dict.setdefault(clause, UniqueList()).append(col.name) + return + + full = self._column_full_name(col) + + unqualified = col.name + if self._is_unqualified_alias_reference(col): + # e.g. ORDER BY alias_name — name matches a known alias + c.alias_dict.setdefault(clause, UniqueList()).append(unqualified) + return + + # e.g. SELECT t.col — regular column, no alias match + c.add_column(full, clause) + + # ------------------------------------------------------------------- + # Column name resolution + # ------------------------------------------------------------------- + + def _resolve_table_alias(self, col_table: str) -> str: + """Replace a table alias with the real table name if mapped. 
+ + Looks up *col_table* in the pre-computed ``table_aliases`` dict. + If found, returns the resolved real table name; otherwise + returns the input unchanged. + + Example:: + + # Given table_aliases = {"t": "users"} + _resolve_table_alias("t") # → "users" + + :param col_table: A table name or alias string. + :returns: The resolved table name, or *col_table* if no mapping + exists. + """ + return self._table_aliases.get(col_table, col_table) + + def _column_full_name(self, col: exp.Column) -> str: + """Build a dot-separated fully-qualified column name. + + Resolves the table alias portion (if present) and assembles + the name from up to four parts: ``catalog.db.table.column``. + Trailing ``#`` characters are stripped from the column name + (used by some dialects for temp-table markers). + + Example SQL:: + + SELECT catalog.schema.t.col FROM t + + :param col: A ``Column`` AST node. + :returns: The fully-qualified column name string + (e.g. ``"users.name"``). + """ + name = col.name.rstrip("#") + table = col.table + db = col.args.get("db") + catalog = col.args.get("catalog") + + if table: + # e.g. SELECT t.col — table-qualified column + resolved = self._resolve_table_alias(table) + parts = [] + if catalog: + # e.g. SELECT catalog.schema.t.col — has catalog prefix + parts.append( + catalog.name if isinstance(catalog, exp.Expression) else catalog + ) + if db: + # e.g. SELECT schema.t.col — has db/schema prefix + parts.append(db.name if isinstance(db, exp.Expression) else db) + parts.append(resolved) + parts.append(name) + return ".".join(parts) + # e.g. SELECT col — bare column name without table qualifier + return name + + @staticmethod + def _is_star_inside_function(star: exp.Star) -> bool: + """Check whether a ``*`` node sits inside a function call. + + Uses sqlglot's ``find_ancestor`` to walk the parent chain and + look for ``Func`` (built-in functions) or ``Anonymous`` + (user-defined function) nodes. 
A star inside a function like + ``COUNT(*)`` should not be recorded as a standalone column. + + Example SQL:: + + SELECT COUNT(*) FROM t + + :param star: A ``Star`` AST node. + :returns: ``True`` if the star is inside a function call. + """ + return star.find_ancestor(exp.Func, exp.Anonymous) is not None + + # ------------------------------------------------------------------- + # Predicate helpers + # ------------------------------------------------------------------- + + @staticmethod + def _is_literal_values_without_subquery( + node: exp.Expression, + ) -> bool: + """Check whether *node* is a VALUES clause with only literal values. + + Returns ``True`` for plain ``VALUES (1, 2), (3, 4)`` rows and + ``False`` when the VALUES clause contains a subquery + (``VALUES (SELECT ...)``). Literal value lists are skipped + during the walk because they contain no column references. + + Example SQL:: + + INSERT INTO t VALUES (1, 2) -- True + INSERT INTO t VALUES (SELECT x ...) -- False + + :param node: An AST node to test. + :returns: ``True`` if the node is a literal-only VALUES clause. + """ + return isinstance(node, exp.Values) and not node.find( + exp.Select + ) + + def _is_cte_column_alias_reference( + self, col: exp.Column + ) -> bool: + """Check whether *col* references a known CTE column alias. + + Returns ``True`` when the column is table-qualified with a CTE + name and the column name matches one of the CTE's declared + column aliases (recorded during CTE processing). + + Example SQL:: + + WITH cte AS (...) SELECT cte.x -- True when x is a CTE alias + + :param col: A ``Column`` AST node. + :returns: ``True`` if this is a CTE column-alias reference. + """ + c = self._collector + return bool( + col.table + and col.table in c.cte_names + and col.name in c.cte_alias_names + ) + + def _is_unqualified_alias_reference( + self, col: exp.Column + ) -> bool: + """Check whether *col* is an unqualified reference to a known alias. 
+ + Returns ``True`` when the column has no table qualifier and its + name matches a previously recorded column alias. This typically + occurs in ``ORDER BY``, ``GROUP BY``, or ``HAVING`` clauses + that reference a SELECT alias by name. + + Example SQL:: + + SELECT a AS x ... ORDER BY x -- True (x has no table qualifier) + + :param col: A ``Column`` AST node. + :returns: ``True`` if this is an unqualified alias reference. + """ + c = self._collector + return not col.table and col.name in c.alias_names + + @staticmethod + def _is_self_alias( + alias_name: str, unique_inner: UniqueList + ) -> bool: + """Check whether an alias maps back to itself. + + Returns ``True`` when the alias name is identical to the single + source column (either exactly or by last segment for + table-qualified columns). Self-aliases like + ``SELECT col AS col`` are not recorded as meaningful aliases. + + Example SQL:: + + SELECT col AS col -- True (exact match) + SELECT t.col AS col -- True (last_segment match) + SELECT a + b AS total -- False + + :param alias_name: The alias string. + :param unique_inner: Deduplicated list of source column names. + :returns: ``True`` if the alias is a trivial self-reference. + """ + return len(unique_inner) == 1 and ( + unique_inner[0] == alias_name + or last_segment(unique_inner[0]) == alias_name + ) + + @staticmethod + def _is_standalone_star( + child: exp.Star, seen_stars: set[int] + ) -> bool: + """Check whether a star node is standalone (not consumed by a Column). + + Returns ``True`` when the star has not already been accounted + for by a parent ``Column`` node (e.g. ``t.*``) and is not + directly nested inside a ``Column``. Stars inside functions + like ``COUNT(*)`` are filtered separately by + :meth:`_is_star_inside_function`. + + Example SQL:: + + SELECT * FROM t -- True + SELECT t.* FROM t -- False (consumed by Column parent) + + :param child: A ``Star`` AST node. 
+ :param seen_stars: Set of ``id()`` values for stars already + consumed by a parent ``Column`` node. + :returns: ``True`` if this is a standalone star. + """ + return id(child) not in seen_stars and not isinstance( + child.parent, exp.Column + ) + + @staticmethod + def _has_cte_explicit_column_definitions( + cte: exp.CTE, + ) -> bool: + """Check whether a CTE declares explicit column aliases. + + Returns ``True`` when the CTE has a column definition list in + its signature (e.g. ``cte(x, y)``) and the CTE body is a + ``SELECT`` statement. + + Example SQL:: + + WITH stats(total, avg) AS (SELECT SUM(x), AVG(x) FROM t) -- True + WITH cte AS (SELECT a FROM t) -- False + + :param cte: A ``CTE`` AST node. + :returns: ``True`` if the CTE has explicit column definitions. + """ + table_alias = cte.args.get("alias") + return bool( + table_alias + and table_alias.columns + and cte.this + and isinstance(cte.this, exp.Select) + ) + + @staticmethod + def _is_cte_with_query_body( + body: exp.Expression, + ) -> bool: + """Check whether a CTE body is a walkable query statement. + + Returns ``True`` for standard SQL query bodies (SELECT, UNION, + INTERSECT, EXCEPT) and ``False`` for scalar expression bodies + used by some dialects (e.g. ClickHouse's + ``WITH '2019-08-01' AS ts`` where the body is a Literal, + or ``WITH 1 + 2 AS val`` where the body is an Add). + + :param body: The ``this`` child of a CTE node. + :returns: ``True`` if the body is a query that should be walked. + """ + return isinstance( + body, (exp.Select, exp.Union, exp.Intersect, exp.Except) + ) + + # ------------------------------------------------------------------- + # Flat column extraction + # ------------------------------------------------------------------- + + def _flat_columns_select_only(self, select: exp.Select) -> list[str]: + """Extract column/alias names from a SELECT's immediate expressions. 
+ + Unlike :meth:`_flat_columns`, this does not recurse into the + full AST subtree — it only inspects the top-level expressions + of a SELECT clause. Used by :meth:`_handle_alias` to determine + the alias target for subquery aliases. + + Example SQL:: + + SELECT a, b AS alias, * FROM t + + :param select: A ``Select`` AST node. + :returns: A list of column name / alias name strings in SELECT + order. + """ + cols = [] + for expr in select.expressions or []: + if isinstance(expr, exp.Alias): + # e.g. SELECT b AS alias — use the alias name + cols.append(expr.alias) + elif isinstance(expr, exp.Column): + # e.g. SELECT a — use the fully-qualified column name + cols.append(self._column_full_name(expr)) + elif isinstance(expr, exp.Star): + # e.g. SELECT * — literal star + cols.append("*") + else: + # e.g. SELECT COALESCE(a, b) — extract columns from expression + for col_name in self._flat_columns(expr): + cols.append(col_name) + return cols + + def _flat_columns(self, node: exp.Expression) -> list[str]: + """Extract all column names from an expression subtree via DFS. + + Performs a full depth-first traversal of *node* using + :func:`_dfs` and collects every ``Column`` and standalone + ``Star`` reference found. Tracks already-seen star nodes to + avoid double-counting table-qualified stars (e.g. ``t.*`` + produces both a ``Column`` and a nested ``Star``). + + Example SQL:: + + COALESCE(t.a, b, c) + + :param node: Root expression node to scan. + :returns: A list of column name strings in DFS encounter order. + """ + assert node is not None + cols = [] + seen_stars: set[int] = set() + for child in _dfs(node): + name = self._collect_column_from_node(child, seen_stars) + if name is not None: + cols.append(name) + return cols + + def _collect_column_from_node( + self, child: exp.Expression, seen_stars: set[int] + ) -> str | None: + """Extract a column name from a single DFS-visited node. + + Called by :meth:`_flat_columns` for each node in the traversal. 
+ Handles ``Column`` nodes (resolving table aliases and skipping + date-part unit keywords) and standalone ``Star`` nodes (skipping + stars inside functions like ``COUNT(*)``). + + Example SQL:: + + DATEDIFF(day, start_date, end_date) + + In this example, ``day`` is a date-part unit keyword and should + be skipped, while ``start_date`` and ``end_date`` are real + columns. + + :param child: A single AST node from the DFS traversal. + :param seen_stars: Set of ``id()`` values for ``Star`` nodes + already consumed by a parent ``Column`` (e.g. ``t.*``). + :returns: The column name string, or ``None`` if the node is + not a column reference. + """ + if isinstance(child, exp.Column): + # e.g. SELECT t.col, DATEDIFF(day, a, b) + if _is_date_part_unit(child): + # e.g. DATEDIFF(day, ...) — "day" is a unit keyword, not a column + return None + star = child.find(exp.Star) + if star: + # e.g. SELECT t.* — table-qualified star within a Column node + seen_stars.add(id(star)) + table = child.table + if table: + table = self._resolve_table_alias(table) + return f"{table}.*" + return self._column_full_name(child) # e.g. SELECT t.col + if isinstance(child, exp.Star): + # e.g. SELECT * — standalone star (not inside a Column node) + if self._is_standalone_star(child, seen_stars): + if not self._is_star_inside_function(child): + # e.g. SELECT * FROM t — standalone star, not COUNT(*) + return "*" + return None diff --git a/sql_metadata/comments.py b/sql_metadata/comments.py new file mode 100644 index 00000000..835e7736 --- /dev/null +++ b/sql_metadata/comments.py @@ -0,0 +1,188 @@ +"""Extract and strip SQL comments using the sqlglot tokenizer. + +sqlglot's tokenizer skips comments during tokenization, which means +comments live in the *gaps* between consecutive token positions. This +module exploits that property: it tokenizes the SQL, then scans each gap +for comment delimiters (``--``, ``/* */``, ``#``). 
+ +Two public entry points exist: + +* :func:`extract_comments` — returns the raw comment texts (delimiters + included) for inspection or logging. +* :func:`strip_comments` — returns the SQL with all comments removed and + whitespace normalised, used by :class:`Parser` for the ``without_comments`` + property. + +A third, internal variant :func:`strip_comments_for_parsing` is consumed +by :mod:`_ast` before handing SQL to ``sqlglot.parse()``; it always uses +the MySQL tokenizer so that ``#``-style comments are reliably stripped. +""" + +import re +from typing import Any + +from sqlglot.tokens import Tokenizer + + +def _choose_tokenizer(sql: str) -> Tokenizer: + """Select the appropriate sqlglot tokenizer for *sql*. + + The default sqlglot tokenizer does **not** treat ``#`` as a comment + delimiter, but MySQL does. When ``#`` appears in the SQL and is used + as a comment (not as a variable/template prefix), we switch to the + MySQL tokenizer so that ``#``-style comments are properly skipped. + + :param sql: Raw SQL string to inspect. + :type sql: str + :returns: An instantiated tokenizer (MySQL or default). + :rtype: sqlglot.tokens.Tokenizer + """ + if "#" in sql and not _has_hash_variables(sql): + from sqlglot.dialects.mysql import MySQL + + return MySQL.Tokenizer() + return Tokenizer() + + +def _has_hash_variables(sql: str) -> bool: + """Determine whether ``#`` characters in *sql* are variable references. + + MSSQL uses ``#table`` for temporary tables and some template engines + use ``#VAR#`` placeholders. This function distinguishes those from + MySQL-style ``# comment`` lines so that :func:`_choose_tokenizer` + picks the right dialect. + + Heuristics (checked via regex): + + * ``#WORD#`` — bracketed template variable. + * ``= #WORD`` or ``(#WORD`` — assignment / parameter context. + + :param sql: Raw SQL string. + :type sql: str + :returns: ``True`` if at least one ``#`` looks like a variable prefix. + :rtype: bool + """ + # #WORD# template variable (e.g. 
#VAR#) + if re.search(r"#\w+#", sql): + return True + # = #WORD or (#WORD with optional whitespace before # + if re.search(r"[=(]\s*#\w", sql): + return True + return False + + +def extract_comments(sql: str) -> list[str]: + """Return all comments found in *sql*, with delimiters preserved. + + Tokenizes the SQL, then scans every gap between consecutive token + positions for comment delimiters. Returned strings include the + opening delimiter (``--``, ``/*``, ``#``) and, for block comments, + the closing ``*/``. + + Called by :attr:`Parser.comments`. + + :param sql: Raw SQL string. + :type sql: str + :returns: List of comment strings in source order. + :rtype: List[str] + """ + if not sql: + return [] + try: + tokens = list(_choose_tokenizer(sql).tokenize(sql)) + # TODO: revisit if sqlglot tokenizer starts raising on specific inputs + except Exception: # pragma: no cover + return [] + comments: list[str] = [] + prev_end = -1 + for tok in tokens: + _scan_gap(sql, prev_end + 1, tok.start, comments) + prev_end = tok.end + _scan_gap(sql, prev_end + 1, len(sql), comments) + return comments + + +#: Matches all three SQL comment styles in a single pass: +#: ``/* ... */`` (block, possibly unterminated), ``-- ...``, and ``# ...``. +_COMMENT_RE = re.compile(r"/\*.*?\*/|/\*.*$|--[^\n]*\n?|#[^\n]*\n?", re.DOTALL) + + +def _scan_gap(sql: str, start: int, end: int, out: list[str]) -> None: + """Scan a slice of *sql* for comment delimiters and append matches. + + :param sql: The full SQL string (not just the gap). + :param start: Start index of the gap to scan. + :param end: End index (exclusive) of the gap. + :param out: Mutable list to which discovered comment strings are appended. 
+ """ + out.extend(_COMMENT_RE.findall(sql[start:end])) + + +def _reconstruct_from_tokens(sql: str, tokens: list[Any]) -> str: + """Rebuild SQL from token spans, collapsing gaps to single spaces.""" + if not tokens: + return "" + parts = [sql[tokens[0].start : tokens[0].end + 1]] + for i in range(1, len(tokens)): + if tokens[i].start > tokens[i - 1].end + 1: + parts.append(" ") + parts.append(sql[tokens[i].start : tokens[i].end + 1]) + return "".join(parts).strip() + + +def strip_comments_for_parsing(sql: str) -> str: + """Strip **all** comments — including ``#`` lines — for sqlglot parsing. + + Unlike :func:`strip_comments`, this always uses the MySQL tokenizer + (which treats ``#`` as a comment delimiter) so that hash-style + comments are removed before ``sqlglot.parse()`` sees the SQL. The + only exceptions are ``CREATE FUNCTION`` bodies (which may contain + ``#`` in procedural code) and MSSQL ``#temp`` table references. + + Called exclusively by :meth:`ASTParser._parse` in ``_ast.py``. + + :param sql: Raw SQL string. + :type sql: str + :returns: SQL with all comments removed and whitespace collapsed. + :rtype: str + """ + if not sql: + return sql or "" + # Skip MySQL tokenizer when # is used as variable (not comment) + upper = sql.strip().upper() + if upper.startswith("CREATE FUNCTION") or _has_hash_variables(sql): + tokenizer = Tokenizer() + else: + from sqlglot.dialects.mysql import MySQL + + tokenizer = MySQL.Tokenizer() + try: + tokens = list(tokenizer.tokenize(sql)) + except Exception: + return sql.strip() + return _reconstruct_from_tokens(sql, tokens) + + +def strip_comments(sql: str) -> str: + """Remove comments and normalise whitespace, preserving ``#VAR`` references. + + Reconstructs the SQL from its token spans, inserting a single space + wherever a gap (comment or extra whitespace) existed between two + tokens. Uses :func:`_choose_tokenizer` so that ``#VAR`` template + variables in MSSQL queries are kept intact. 
+ + Called by :attr:`Parser.without_comments` and + :attr:`Generalizator.without_comments`. + + :param sql: Raw SQL string. + :type sql: str + :returns: SQL with comments removed and whitespace normalised. + :rtype: str + """ + if not sql: + return sql or "" + try: + tokens = list(_choose_tokenizer(sql).tokenize(sql)) + except Exception: + return sql.strip() + return _reconstruct_from_tokens(sql, tokens) diff --git a/sql_metadata/compat.py b/sql_metadata/compat.py deleted file mode 100644 index 88eea38e..00000000 --- a/sql_metadata/compat.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -This module provides a temporary compatibility layer -for legacy API dating back to 1.x version. - -Change your old imports: - -from sql_metadata import get_query_columns, get_query_tables - -into: - -from sql_metadata.compat import get_query_columns, get_query_tables - -""" - -# pylint:disable=missing-function-docstring -from typing import List, Optional, Tuple - -import sqlparse -from sqlparse.sql import TokenList -from sqlparse.tokens import Whitespace - -from sql_metadata import Parser - - -def preprocess_query(query: str) -> str: - return Parser(query).query - - -def get_query_tokens(query: str) -> List[sqlparse.sql.Token]: - query = preprocess_query(query) - parsed = sqlparse.parse(query) - - # handle empty queries (#12) - if not parsed: - return [] - - tokens = TokenList(parsed[0].tokens).flatten() - - return [token for token in tokens if token.ttype is not Whitespace] - - -def get_query_columns(query: str) -> List[str]: - return Parser(query).columns - - -def get_query_tables(query: str) -> List[str]: - return Parser(query).tables - - -def get_query_limit_and_offset(query: str) -> Optional[Tuple[int, int]]: - return Parser(query).limit_and_offset - - -def generalize_sql(query: Optional[str] = None) -> Optional[str]: - if query is None: - return None - - return Parser(query).generalize diff --git a/sql_metadata/dialect_parser.py b/sql_metadata/dialect_parser.py new file mode 100644 
index 00000000..848fc365 --- /dev/null +++ b/sql_metadata/dialect_parser.py @@ -0,0 +1,235 @@ +"""SQL dialect detection, parsing, and parse-quality validation. + +Combines dialect heuristics (which sqlglot dialect to try), the actual +``sqlglot.parse()`` call, and degraded-result detection into a single +class so that callers only need to call :meth:`DialectParser.parse`. +""" + +import logging +from typing import Any + +import sqlglot +from sqlglot import exp +from sqlglot.dialects.dialect import Dialect, DialectType +from sqlglot.dialects.redshift import Redshift +from sqlglot.dialects.tsql import TSQL +from sqlglot.errors import ParseError, TokenError +from sqlglot.parsers.redshift import RedshiftParser +from sqlglot.tokens import Tokenizer as BaseTokenizer + +from sql_metadata.comments import _has_hash_variables +from sql_metadata.exceptions import InvalidQueryDefinition + +#: Table names that indicate a degraded parse result. +_BAD_TABLE_NAMES = frozenset({"IGNORE", ""}) + +#: SQL keywords that should not appear as bare column names. +_BAD_COLUMN_NAMES = frozenset({"UNIQUE", "DISTINCT", "SELECT", "FROM", "WHERE"}) + + +# --------------------------------------------------------------------------- +# Custom dialect classes +# --------------------------------------------------------------------------- + + +class HashVarDialect(Dialect): + """Custom sqlglot dialect that treats ``#WORD`` as identifiers. + + MSSQL uses ``#`` to prefix temporary table names (e.g. ``#temp``) + and some template engines use ``#VAR#`` placeholders. The default + sqlglot tokenizer treats ``#`` as an unknown single-character token; + this dialect moves it into ``VAR_SINGLE_TOKENS`` so it becomes part + of a ``VAR`` token instead. 
+ """ + + class Tokenizer(BaseTokenizer): + """Tokenizer subclass that includes ``#`` in variable tokens.""" + + SINGLE_TOKENS = {**BaseTokenizer.SINGLE_TOKENS} + SINGLE_TOKENS.pop("#", None) + VAR_SINGLE_TOKENS = {*BaseTokenizer.VAR_SINGLE_TOKENS, "#"} + + +class _RedshiftAppendParser(RedshiftParser): + """Redshift parser extended with ``ALTER TABLE ... APPEND FROM``.""" + + def _parse_alter_table_append(self) -> "exp.Expr | None": + self._match_text_seq("FROM") + return self._parse_table() + + ALTER_PARSERS = { + **RedshiftParser.ALTER_PARSERS, + "APPEND": lambda self: self._parse_alter_table_append(), + } + + +class RedshiftAppendDialect(Redshift): + """Redshift dialect extended with ``ALTER TABLE ... APPEND FROM`` support. + + Redshift's ``APPEND FROM`` syntax is not natively supported by sqlglot, + which causes the statement to degrade to ``exp.Command``. This dialect + adds an ``APPEND`` entry to ``ALTER_PARSERS`` so the statement is parsed + as a proper ``exp.Alter`` with ``exp.Table`` nodes. + """ + + Parser = _RedshiftAppendParser + + +class BracketedTableDialect(TSQL): + """TSQL dialect for queries containing ``[bracketed]`` identifiers. + + sqlglot's TSQL dialect correctly interprets square-bracket quoting, + which the default dialect does not. This thin subclass exists so + that ``TableExtractor`` can ``isinstance``-check to enable + bracket-preserving table name construction. + """ + + +# --------------------------------------------------------------------------- +# DialectParser +# --------------------------------------------------------------------------- + + +class DialectParser: + """Detect the appropriate sqlglot dialect and parse SQL into an AST.""" + + def parse(self, clean_sql: str) -> tuple[exp.Expression, DialectType]: + """Parse *clean_sql*, returning ``(ast, dialect)``. + + Detects candidate dialects via heuristics, tries each in order, + and returns the first non-degraded result. 
+ + :param clean_sql: Preprocessed SQL string (comments stripped, etc.). + :type clean_sql: str + :returns: 2-tuple of ``(ast_node, winning_dialect)``. + :rtype: tuple + :raises ValueError: If all dialect attempts fail. + """ + dialects = self._detect_dialects(clean_sql) + return self._try_dialects(clean_sql, dialects) + + # -- dialect detection -------------------------------------------------- + + @staticmethod + def _detect_dialects(sql: str) -> list[Any]: + """Choose an ordered list of sqlglot dialects to try for *sql*. + + Heuristics: + + * ``#WORD`` → :class:`HashVarDialect` (MSSQL temp tables). + * Back-ticks → ``"mysql"``. + * Square brackets or ``TOP`` → :class:`BracketedTableDialect`. + * ``UNIQUE`` → try default, MySQL, Oracle. + * ``LATERAL VIEW`` → ``"spark"`` (Hive). + + :param sql: Cleaned SQL string. + :type sql: str + :returns: Ordered list of dialects to attempt. + :rtype: list + """ + upper = sql.upper() + if _has_hash_variables(sql): + return [HashVarDialect, None, "mysql"] + if "`" in sql: + return ["mysql", None] + if "LATERAL VIEW" in upper: + return ["spark", None, "mysql"] + if "[" in sql or " TOP " in upper: + return [BracketedTableDialect, None, "mysql"] + if " UNIQUE " in upper: + return [None, "mysql", "oracle"] + if "APPEND FROM" in upper: + return [RedshiftAppendDialect, None, "mysql"] + return [None, "mysql"] + + # -- parsing ------------------------------------------------------------ + + def _try_dialects( + self, clean_sql: str, dialects: list[Any] + ) -> tuple[exp.Expression, DialectType]: + """Try parsing *clean_sql* with each dialect, returning the best. + + :returns: 2-tuple of ``(ast_node, winning_dialect)``. + :raises ValueError: If all dialect attempts fail. 
+ """ + last_result = None + winning_dialect = None + for dialect in dialects: + try: + result = self._parse_with_dialect(clean_sql, dialect) + if result is None: + continue + last_result = result + winning_dialect = dialect + is_last = dialect == dialects[-1] + if not is_last and self._is_degraded(result, clean_sql): + continue + return result, dialect + except (ParseError, TokenError): + if dialect is not None and dialect == dialects[-1]: + raise InvalidQueryDefinition( + "Query could not be parsed — SQL syntax error" + ) + continue + + # TODO: revisit if sqlglot starts returning None from parse for last dialect + if last_result is not None: # pragma: no cover + return last_result, winning_dialect + raise InvalidQueryDefinition( + "Query could not be parsed — no dialect could handle this SQL" + ) + + @staticmethod + def _parse_with_dialect(clean_sql: str, dialect: Any) -> exp.Expression | None: + """Parse *clean_sql* with a single dialect, suppressing warnings.""" + logger = logging.getLogger("sqlglot") + old_level = logger.level + logger.setLevel(logging.CRITICAL) + try: + results = sqlglot.parse( + clean_sql, + dialect=dialect, + error_level=sqlglot.ErrorLevel.WARN, + ) + finally: + logger.setLevel(old_level) + + if not results or results[0] is None: + return None + result = results[0] + assert result is not None # guaranteed by check above + # TODO: revisit if sqlglot returns top-level Subquery + if isinstance(result, exp.Subquery) and not result.alias: # pragma: no cover + inner = result.this + if isinstance(inner, exp.Expression): + return inner + return result # type: ignore[return-value] + + # -- quality checks ----------------------------------------------------- + + def _is_degraded(self, result: exp.Expression, clean_sql: str) -> bool: + """Return ``True`` when a better dialect should be tried.""" + if isinstance(result, exp.Command) and not self._is_expected_command(clean_sql): + return True + return self._has_parse_issues(result) + + @staticmethod + 
def _is_expected_command(sql: str) -> bool: + """Check whether *sql* legitimately parses as ``exp.Command``.""" + upper = sql.strip().upper() + return upper.startswith("CREATE FUNCTION") + + @staticmethod + def _has_parse_issues(ast: exp.Expression) -> bool: + """Detect signs of a degraded or incorrect parse. + + Checks for table nodes with empty/keyword-like names and column + nodes whose name is a SQL keyword without a table qualifier. + """ + for table in ast.find_all(exp.Table): + if table.name in _BAD_TABLE_NAMES: + return True + for col in ast.find_all(exp.Column): + if col.name.upper() in _BAD_COLUMN_NAMES and not col.table: + return True + return False diff --git a/sql_metadata/exceptions.py b/sql_metadata/exceptions.py new file mode 100644 index 00000000..c698b370 --- /dev/null +++ b/sql_metadata/exceptions.py @@ -0,0 +1,5 @@ +"""Custom exceptions for the sql-metadata package.""" + + +class InvalidQueryDefinition(ValueError): + """Raised when the SQL query is structurally invalid or unsupported.""" diff --git a/sql_metadata/generalizator.py b/sql_metadata/generalizator.py index 97eb35d1..f0639517 100644 --- a/sql_metadata/generalizator.py +++ b/sql_metadata/generalizator.py @@ -1,26 +1,52 @@ -""" -Module used to produce generalized sql out of given query +"""Produce a generalised (anonymised) version of a SQL query. + +Replaces string literals with ``X``, numbers with ``N``, and +multi-value ``IN (...)`` / ``VALUES (...)`` lists with ``(XYZ)`` so +that structurally identical queries can be grouped for analysis +(e.g. slow-query log aggregation). Based on MediaWiki's +``DatabaseBase::generalizeSQL``. """ import re -import sqlparse + +from sql_metadata.comments import strip_comments class Generalizator: - """ - Class used to produce generalized sql out of given query + """Produce a generalised form of a SQL query. + + Strips comments, removes string literals and numeric values, and + collapses repeated ``LIKE`` / ``IN`` / ``VALUES`` clauses. 
Designed + for grouping structurally identical queries in monitoring and logging + pipelines. + + Used by :attr:`Parser.generalize`, which delegates to + :attr:`Generalizator.generalize`. + + :param sql: Raw SQL query string to generalise. + :type sql: str """ def __init__(self, sql: str = ""): + """Initialise with the raw SQL string. + + :param sql: SQL query to generalise. + :type sql: str + """ self._raw_query = sql # SQL queries normalization (#16) @staticmethod def _normalize_likes(sql: str) -> str: - """ - Normalize and wrap LIKE statements + """Normalise and collapse repeated ``LIKE`` clauses. + + Strips ``%`` wildcards, replaces ``LIKE '...'`` with ``LIKE X``, + and collapses consecutive ``or/and ... LIKE X`` clauses into a + single instance with ``...`` suffix. - :type sql str + :param sql: SQL string with LIKE clauses. + :type sql: str + :returns: SQL with LIKE clauses normalised. :rtype: str """ sql = sql.replace("%", "") @@ -29,11 +55,11 @@ def _normalize_likes(sql: str) -> str: sql = re.sub(r"LIKE '[^\']+'", "LIKE X", sql) # or all_groups LIKE X or all_groups LIKE X - matches = re.finditer(r"(or|and) [^\s]+ LIKE X", sql, flags=re.IGNORECASE) - matches = [match.group(0) for match in matches] if matches else None + found = re.finditer(r"(or|and) [^\s]+ LIKE X", sql, flags=re.IGNORECASE) + like_matches = [m.group(0) for m in found] - if matches: - for match in set(matches): + if like_matches: + for match in set(like_matches): sql = re.sub( r"(\s?" + re.escape(match) + ")+", " " + match + " ...", sql ) @@ -42,23 +68,33 @@ def _normalize_likes(sql: str) -> str: @property def without_comments(self) -> str: - """ - Removes comments from SQL query + """Return the SQL with all comments removed. + Delegates to :func:`strip_comments` from ``_comments.py``. + + :returns: Comment-free SQL string. 
:rtype: str """ - sql = sqlparse.format(self._raw_query, strip_comments=True) - sql = sql.replace("\n", " ") - sql = re.sub(r"[ \t]+", " ", sql) - return sql + return strip_comments(self._raw_query) @property def generalize(self) -> str: - """ - Removes most variables from an SQL query - and replaces them with X or N for numbers. + """Return a generalised version of the SQL query. + + Applies the following transformations in order: - Based on Mediawiki's DatabaseBase::generalizeSQL + 1. Strip comments. + 2. Remove double-quotes. + 3. Collapse multiple spaces. + 4. Normalise ``LIKE`` clauses. + 5. Replace escaped characters. + 6. Replace string literals with ``X``. + 7. Collapse whitespace to single spaces. + 8. Replace numbers with ``N``. + 9. Collapse ``IN (...)`` / ``VALUES (...)`` lists to ``(XYZ)``. + + :returns: Generalised SQL string, or ``""`` for empty input. + :rtype: str """ if self._raw_query == "": return "" diff --git a/sql_metadata/keywords_lists.py b/sql_metadata/keywords_lists.py index f086287a..4e4fbc66 100644 --- a/sql_metadata/keywords_lists.py +++ b/sql_metadata/keywords_lists.py @@ -1,11 +1,18 @@ -""" -Module provide lists of sql keywords that should trigger or skip -checks for tables an columns +"""SQL keyword sets and enums used to classify tokens and query types. + +Defines the canonical sets of normalised SQL keywords that the token-based +parser (``token.py``) and the AST-based extractors use to decide when a +token is relevant (e.g. precedes a column or table reference) and to map +query prefixes to :class:`QueryType` values. Keyword values are stored +**without spaces** (``INNERJOIN``, ``ORDERBY``) because the tokeniser +strips whitespace before comparison. """ -# these keywords are followed by columns reference from enum import Enum +#: Normalised keywords after which the next token(s) are column references. 
+#: Used by the token-linked-list walker and by ``COLUMNS_SECTIONS`` to +#: decide which ``columns_dict`` section a column belongs to. KEYWORDS_BEFORE_COLUMNS = { "SELECT", "WHERE", @@ -17,7 +24,9 @@ "USING", } -# normalized list of table preceding keywords +#: Normalised keywords after which the next token is a **table** name. +#: Includes all JOIN variants (whitespace-stripped) as well as INTO, +#: UPDATE, TABLE, and the DDL guard ``IFNOTEXISTS``. TABLE_ADJUSTMENT_KEYWORDS = { "FROM", "JOIN", @@ -36,10 +45,14 @@ "IFNOTEXISTS", } -# next statement beginning after with statement +#: Keywords that signal the end of a ``WITH`` (CTE) block and the start +#: of the main statement body. Used by the legacy token-based WITH parser +#: and referenced in ``_ast.py`` for malformed-query detection. WITH_ENDING_KEYWORDS = {"UPDATE", "SELECT", "DELETE", "REPLACE", "INSERT"} -# subquery preceding keywords +#: Keywords that can appear immediately before a parenthesised subquery +#: in a FROM/JOIN position. A subset of ``TABLE_ADJUSTMENT_KEYWORDS`` +#: excluding DML-only entries (INTO, UPDATE, TABLE). SUBQUERY_PRECEDING_KEYWORDS = { "FROM", "JOIN", @@ -54,8 +67,10 @@ "NATURALJOIN", } -# section of a query in which column can exists -# based on last normalized keyword +#: Maps a normalised keyword to the ``columns_dict`` section name that +#: columns following it belong to. For example, columns after ``SELECT`` +#: go into the ``"select"`` section, columns after ``ON``/``USING`` go +#: into ``"join"``. COLUMNS_SECTIONS = { "SELECT": "select", "WHERE": "where", @@ -71,8 +86,11 @@ class QueryType(str, Enum): - """ - Types of supported queries + """Enumeration of SQL statement types recognised by the parser. + + Inherits from :class:`str` so that values are directly comparable to + plain strings (``parser.query_type == "SELECT"``). Returned by + :attr:`Parser.query_type` and by :class:`_query_type.QueryTypeExtractor`. 
""" INSERT = "INSERT" @@ -84,11 +102,16 @@ class QueryType(str, Enum): ALTER = "ALTER TABLE" DROP = "DROP TABLE" TRUNCATE = "TRUNCATE TABLE" + MERGE = "MERGE" class TokenType(str, Enum): - """ - Types of SQLTokens + """Semantic classification assigned to an :class:`SQLToken` during parsing. + + These types are used by the legacy token-based extraction pipeline to + label each token after the keyword-driven classification pass. In the + v3 sqlglot-based pipeline they are still referenced for backward + compatibility in test assertions and token introspection. """ COLUMN = "COLUMN" @@ -100,7 +123,10 @@ class TokenType(str, Enum): PARENTHESIS = "PARENTHESIS" -# cannot fully replace with enum as with/select has the same key +#: Maps normalised query-prefix strings to :class:`QueryType` values. +#: Cannot be replaced by the enum alone because ``WITH`` maps to +#: ``SELECT`` (a CTE followed by its main query) and composite prefixes +#: like ``CREATETABLE`` need their own entries. SUPPORTED_QUERY_TYPES = { "INSERT": QueryType.INSERT, "REPLACE": QueryType.REPLACE, @@ -116,8 +142,10 @@ class TokenType(str, Enum): "TRUNCATETABLE": QueryType.TRUNCATE, } -# all the keywords we care for - rest is ignored in assigning -# the last keyword +#: Union of all keyword sets the tokeniser cares about. Tokens whose +#: normalised value falls outside this set are **not** tracked as the +#: ``last_keyword`` on subsequent tokens, keeping the classification +#: logic focused on structurally significant positions only. RELEVANT_KEYWORDS = { *KEYWORDS_BEFORE_COLUMNS, *TABLE_ADJUSTMENT_KEYWORDS, diff --git a/sql_metadata/nested_resolver.py b/sql_metadata/nested_resolver.py new file mode 100644 index 00000000..b43c57e0 --- /dev/null +++ b/sql_metadata/nested_resolver.py @@ -0,0 +1,658 @@ +"""Nested column resolution and CTE/subquery body extraction. 
+ +The :class:`NestedResolver` class owns the complete "look inside nested +queries" concern: rendering CTE/subquery AST nodes back to SQL, parsing +those bodies with sub-:class:`Parser` instances, and resolving +``subquery.column`` references to actual columns. +""" + +from __future__ import annotations + +import copy +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from sql_metadata.parser import Parser + +from sqlglot import exp +from sqlglot.generator import Generator + +from sql_metadata.utils import ( + UniqueList, + last_segment, +) + +# --------------------------------------------------------------------------- +# Custom SQL generator — preserves function signatures +# --------------------------------------------------------------------------- + + +class _PreservingGenerator(Generator): + """Custom SQL generator that preserves function signatures. + + sqlglot normalises certain functions when rendering SQL (e.g. + ``IFNULL`` → ``COALESCE``, ``DIV`` → ``CAST(… / … AS INT)``). + This generator overrides those transformations so that the output + only differs from the input in keyword/function-name casing and + explicit ``AS`` insertion. 
+ """ + + TRANSFORMS = { + **Generator.TRANSFORMS, + exp.CurrentDate: lambda self, e: "CURRENT_DATE()", + exp.IntDiv: lambda self, e: ( + f"{self.sql(e, 'this')} DIV {self.sql(e, 'expression')}" + ), + } + + def coalesce_sql(self, expression: exp.Expression) -> str: + args = [expression.this] + expression.expressions + if len(args) == 2: + return f"IFNULL({self.sql(args[0])}, {self.sql(args[1])})" + args_sql = ", ".join(self.sql(a) for a in args) + return f"COALESCE({args_sql})" + + def dateadd_sql(self, expression: exp.Expression) -> str: + return ( + f"DATE_ADD({self.sql(expression, 'this')}, " + f"{self.sql(expression, 'expression')})" + ) + + def datesub_sql(self, expression: exp.Expression) -> str: + return ( + f"DATE_SUB({self.sql(expression, 'this')}, " + f"{self.sql(expression, 'expression')})" + ) + + def tsordsadd_sql(self, expression: exp.Expression) -> str: + this = self.sql(expression, "this") + expr_node = expression.expression + if isinstance(expr_node, exp.Mul): + right = expr_node.expression + if ( + isinstance(right, exp.Neg) + and isinstance(right.this, exp.Literal) + and right.this.this == "1" + ): + left = self.sql(expr_node, "this") + return f"DATE_SUB({this}, {left})" + return f"DATE_ADD({this}, {self.sql(expression, 'expression')})" + + def not_sql(self, expression: exp.Expression) -> str: + child = expression.this + if isinstance(child, exp.Is) and isinstance(child.expression, exp.Null): + return f"{self.sql(child, 'this')} IS NOT NULL" + if isinstance(child, exp.In): + return f"{self.sql(child, 'this')} NOT IN ({self.expressions(child)})" + return super().not_sql(expression) # type: ignore[arg-type, no-any-return] + + +_GENERATOR = _PreservingGenerator() + + +# --------------------------------------------------------------------------- +# Resolution helpers +# --------------------------------------------------------------------------- + + +def _is_qualified_reference(result: list[str]) -> bool: + """Check if result is a single dotted 
reference like ``['cte.col']``.""" + return len(result) == 1 and "." in result[0] + + +def _is_not_already_resolved_qualified_reference( + result: list[str], column: str +) -> bool: + """Check if result is a qualified reference that changed from the input.""" + return _is_qualified_reference(result) and result != [column] + + +# --------------------------------------------------------------------------- +# NestedResolver class +# --------------------------------------------------------------------------- + + +class NestedResolver: + """Resolve column references through subqueries and CTEs. + + Owns the complete lifecycle of nested query resolution: + + 1. **Body extraction** — render CTE/subquery AST nodes back to SQL + via :class:`_PreservingGenerator`. + 2. **Column resolution** — parse bodies with sub-Parsers and resolve + ``subquery.column`` references to actual columns. + 3. **Unqualified alias resolution** — detect column names that are actually + aliases defined inside nested queries. + + :param ast: Root AST node (for body extraction). + """ + + def __init__(self, ast: exp.Expression): + self._ast = ast + + # Lazy caches + self._subqueries_parsers: dict[str, "Parser"] = {} + self._with_parsers: dict[str, "Parser"] = {} + self._columns_aliases: dict[str, str | list[str]] = {} + self._cached_cte_nodes: list[exp.CTE] | None = None + + # Set by resolve() caller + self._subqueries_names: list[str] = [] + self._subqueries: dict[str, str] = {} + self._with_names: list[str] = [] + self._with_queries: dict[str, str] = {} + + # ------------------------------------------------------------------- + # Public API — name extraction + # ------------------------------------------------------------------- + + def extract_cte_names( + self, + cte_name_map: dict[str, str], + ) -> list[str]: + """Extract CTE names from the AST. + + Called by :attr:`Parser.with_names`. + + :param cte_name_map: Mapping of placeholder names to original + qualified names, e.g. 
``{"db__DOT__cte": "db.cte"}``. + Built by :func:`SqlCleaner._normalize_cte_names` because + sqlglot cannot parse dots in CTE names — they get rewritten + to placeholders before parsing. This map restores the + original names in the output. + :returns: List of CTE names, e.g. ``["db.cte", "sales"]``. + """ + return UniqueList([ + cte_name_map.get(cte.alias, cte.alias) for cte in self._cte_nodes() + ]) + + def extract_cte_bodies( + self, + cte_name_map: dict[str, str], + ) -> dict[str, str]: + """Extract CTE body SQL for each CTE in the AST. + + :param cte_name_map: Placeholder-to-original mapping, e.g. + ``{"db__DOT__cte": "db.cte"}``. See :meth:`extract_cte_names` + for details. + :returns: Mapping of ``{cte_name: body_sql}``, + e.g. ``{"db.cte": "SELECT id FROM t"}``. + """ + results: dict[str, str] = {} + for cte in self._cte_nodes(): + alias = cte.alias + original_name = cte_name_map.get(alias, alias) + results[original_name] = self._body_sql(cte.this) + + return results + + @staticmethod + def extract_subqueries( + ast: exp.Expression, + ) -> tuple[list[str], dict[str, str]]: + """Extract subquery names and bodies in a single post-order walk. + + Aliased subqueries keep their alias as the name. Unaliased + subqueries (e.g. ``WHERE id IN (SELECT …)``) get auto-generated + names ``subquery_1``, ``subquery_2``, etc. + + Example SQL:: + + SELECT * FROM (SELECT id FROM t) AS sub + WHERE id IN (SELECT id FROM t2) + + :returns: ``(names, bodies)`` where *names* is ordered innermost-first, + e.g. ``(["subquery_1", "sub"], {...})``. 
+ """ + names: list[str] = UniqueList() + bodies: dict[str, str] = {} + NestedResolver._walk_subqueries(ast, names, bodies, 0) + return names, bodies + + # ------------------------------------------------------------------- + # Public API — column resolution + # ------------------------------------------------------------------- + + def resolve( + self, + columns: "UniqueList", + columns_dict: dict[str, UniqueList], + columns_aliases: dict[str, str | list[str]], + subqueries_names: list[str], + subqueries: dict[str, str], + with_names: list[str], + with_queries: dict[str, str], + ) -> tuple[UniqueList, dict[str, UniqueList], dict[str, str | list[str]]]: + """Resolve columns that reference subqueries or CTEs. + + Two-phase resolution: + + 1. Replace ``subquery.column`` references with the actual column + from the subquery/CTE definition. + 2. Drop unqualified column names that are actually aliases defined + inside a nested query. + + Also applies the same resolution to *columns_dict*. + + Example SQL:: + + WITH cte AS (SELECT a FROM t) + SELECT cte.a FROM cte + + :returns: Tuple of ``(columns, columns_dict, columns_aliases)``. + """ + self._subqueries_names = subqueries_names + self._subqueries = subqueries + self._with_names = with_names + self._with_queries = with_queries + self._columns_aliases = columns_aliases + + # For columns drop aliases as we need only actual columns + columns = self._resolve_and_filter(columns, drop_unqualified_aliases=True) + + if columns_dict: + # For columns_dict do not drop aliases but instead resolve them to columns. + # That ensures the column is present in all the relevant sections regardless + # if it's called directly or by alias i.e. SELECT a AS x FROM tbl ORDER BY x + # the column a should appear both in select and order_by sections. 
+ for section, cols in list(columns_dict.items()): + columns_dict[section] = self._resolve_and_filter( + cols, drop_unqualified_aliases=False + ) + + return columns, columns_dict, self._columns_aliases + + def resolve_column_alias( + self, alias: str | list[str], columns_aliases: dict[str, str | list[str]] + ) -> list[str]: + """Public interface for alias resolution (used by parser.py). + + Example SQL:: + + SELECT a AS x FROM t ORDER BY x + + Resolves ``"x"`` → ``"a"`` using the alias map. + """ + return self._resolve_column_alias(alias, columns_aliases) + + # ------------------------------------------------------------------- + # Resolution pipeline — callers before callees + # ------------------------------------------------------------------- + + def _resolve_and_filter( + self, columns: "UniqueList", drop_unqualified_aliases: bool = True + ) -> "UniqueList": + """Apply subquery/CTE resolution and unqualified-alias handling. + + Phase 1: resolve ``sub.col`` references via :meth:`_resolve_sub_queries`. + Phase 2: detect unqualified names that are nested-query aliases. + + Example SQL:: + + SELECT sub.id FROM (SELECT id FROM users) AS sub + + Phase 1 resolves ``sub.id`` → ``id``. + Phase 2 checks if ``id`` is a nested alias (it is not, so it stays). + """ + resolved: list[str] = UniqueList() + for col in columns: + resolved.extend(self._resolve_sub_queries(col)) + + final = UniqueList() + for col in resolved: + if "." in col: + # e.g. schema.col — skip unqualified alias resolution + final.append(col) + continue + new_cols = self._resolve_unqualified_through_nested(col) + if new_cols != [col]: + # e.g. SELECT x FROM (SELECT a AS x FROM t) AS sub + # — "x" resolved to "a", drop the alias from columns + if not drop_unqualified_aliases: + final.extend(new_cols) + continue + # e.g. 
SELECT id FROM t — no alias match, keep as-is + final.append(col) + return final + + def _resolve_sub_queries(self, column: str) -> list[str]: + """Resolve a ``subquery.column`` reference to actual column(s). + + Tries subquery sources first, then CTE sources. + + Example SQL:: + + SELECT sub.id FROM (SELECT id FROM users) AS sub + + Resolves ``"sub.id"`` → ``["id"]``. + """ + result: list[str] = [column] + for names, defs, cache in self._nested_sources(): + if _is_qualified_reference(result): + # e.g. "sub.id" — still a qualified reference, try next source + result = self._resolve_nested_query( + subquery_alias=result[0], + nested_queries_names=names, + nested_queries=defs, + already_parsed=cache, + ) + # Recursively resolve chained CTE references: c3.a → c2.a → c1.a → a + if _is_not_already_resolved_qualified_reference(result, column): + return self._resolve_sub_queries(result[0]) + return result + + def _resolve_unqualified_through_nested( + self, col_name: str + ) -> list[str]: + """Resolve an unqualified column name through subquery/CTE alias definitions. + + Checks subquery aliases first (``check_columns=True``), then CTE + aliases (``check_columns=False``). + + Example SQL:: + + SELECT x FROM (SELECT a AS x FROM users) AS sub + + Resolves ``"x"`` → ``["a"]`` (found as alias in subquery body). + """ + for i, (names, defs, cache) in enumerate(self._nested_sources()): + # check_columns for subqueries only — prevents CTE aliases + # from claiming subquery columns, e.g. in: + # WITH cte AS (SELECT x AS name FROM t1) + # SELECT name FROM (SELECT name FROM t2) AS sub + # "name" is a real column in sub, not the CTE alias. 
def _resolve_unqualified_through_nested(self, col_name: str) -> list[str]:
    """Resolve an unqualified column through subquery/CTE alias definitions.

    Subquery aliases are consulted first (with ``check_columns=True``),
    CTE aliases second.

    Example SQL::

        SELECT x FROM (SELECT a AS x FROM users) AS sub

    Resolves ``"x"`` to ``["a"]`` (alias found in the subquery body).
    """
    for source_index, (names, definitions, cache) in enumerate(
        self._nested_sources()
    ):
        # check_columns only for the subquery source — stops CTE aliases
        # from claiming columns that really exist in a subquery, e.g.:
        #   WITH cte AS (SELECT x AS name FROM t1)
        #   SELECT name FROM (SELECT name FROM t2) AS sub
        # where "name" is a genuine column of sub, not the CTE alias.
        found = self._lookup_alias_in_nested(
            col_name, names, definitions, cache, check_columns=source_index == 0
        )
        if found is not None:
            return found
    return [col_name]


def _lookup_alias_in_nested(
    self,
    col_name: str,
    names: list[str],
    definitions: dict[str, str],
    parser_cache: dict[str, "Parser"],
    check_columns: bool = False,
) -> list[str] | None:
    """Search nested query bodies for *col_name* as a column alias.

    Each nested query is parsed (and cached) and three outcomes exist:

    1. **Alias match** — resolved to the underlying column(s)::

           WITH cte AS (SELECT a AS x FROM t) SELECT x FROM cte
           -- "x" → ["a"]; multi-column aliases (a + b AS y) also work

    2. **Direct column match** (only when ``check_columns`` is set) —
       the name is a real column of the nested query, kept as-is.

    3. **No match** — returns ``None`` so the caller can try other
       sources or keep the column unchanged.
    """
    from sql_metadata.parser import Parser

    for nested_name in names:
        nested_parser = parser_cache.setdefault(
            nested_name, Parser(definitions[nested_name])
        )
        if col_name in nested_parser.columns_aliases_names:
            # Outcome 1: follow the whole alias chain, e.g.
            # SELECT col1 AS a …, SELECT a AS x … — "x" → ["col1"]
            resolved = self._resolve_column_alias(
                col_name, nested_parser.columns_aliases
            )
            if self._columns_aliases is not None:
                # Record only the one-step mapping (x → a, not x → col1)
                # so the outer columns_aliases mirrors the SQL as written.
                self._columns_aliases[col_name] = nested_parser.columns_aliases.get(
                    col_name, resolved
                )
            return resolved
        if check_columns and col_name in nested_parser.columns:
            # Outcome 2: real column of the subquery
            return [col_name]
    # Outcome 3: not found anywhere
    return None


@staticmethod
def _resolve_nested_query(
    subquery_alias: str,
    nested_queries_names: list[str],
    nested_queries: dict[str, str],
    already_parsed: dict[str, "Parser"],
) -> list[str]:
    """Resolve a ``prefix.column`` reference through a nested query.

    Splits on ``"."``; when the prefix is a known nested-query name the
    query body is parsed and the column resolved inside it.

    Example SQL::

        SELECT sub.id FROM (SELECT id FROM users) AS sub

    ``"sub.id"`` → prefix ``"sub"`` matches → returns ``["id"]``.
    """
    from sql_metadata.parser import Parser

    pieces = subquery_alias.split(".")
    if len(pieces) != 2 or pieces[0] not in nested_queries_names:
        # e.g. "table.col" or "schema.table.col" — not a subquery reference
        return [subquery_alias]
    query_name, column_name = pieces[0], pieces[-1]
    subparser = already_parsed.setdefault(
        query_name, Parser(nested_queries[query_name])
    )
    return NestedResolver._resolve_column_in_subparser(
        column_name, subparser, subquery_alias
    )
+ + Three resolution paths: + + 1. Column name is a known alias in the subparser → resolve it. + 2. Column name is ``*`` → return all subparser columns. + 3. Otherwise → fall back to positional/wildcard matching. + + Example SQL (path 1 — alias):: + + SELECT sub.x FROM (SELECT a AS x FROM t) AS sub + + ``"x"`` is an alias → resolves to ``["a"]``. + + Example SQL (path 2 — star):: + + SELECT sub.* FROM (SELECT a, b FROM t) AS sub + + ``"*"`` → returns ``["a", "b"]``. + """ + if column_name in subparser.columns_aliases_names: + # e.g. sub.x where x is aliased to a → resolve alias chain + return subparser._resolve_column_alias(column_name) + if column_name == "*": + # e.g. sub.* → return all columns from subquery + return subparser.columns + return NestedResolver._find_column_fallback( + column_name, subparser, original_ref + ) + + @staticmethod + def _find_column_fallback( + column_name: str, subparser: "Parser", original_ref: str + ) -> list[str]: + """Find a column by name in the subparser with wildcard fallbacks. + + Tries to match *column_name* against the last segment of each + subparser column. If no match is found, checks for wildcard + columns (``*`` or ``table.*``) before giving up. + + Example SQL (positional match):: + + SELECT sub.id FROM (SELECT users.id FROM users) AS sub + + ``"id"`` matches ``"users.id"`` by last segment → ``["users.id"]``. + + Example SQL (wildcard fallback):: + + SELECT sub.id FROM (SELECT * FROM users) AS sub + + ``"id"`` not found, but subparser has ``*`` → returns ``["id"]``. + """ + try: + idx = [last_segment(x) for x in subparser.columns].index(column_name) + except ValueError: + if "*" in subparser.columns: + # e.g. SELECT * FROM t — subquery selects everything + return [column_name] + for table in subparser.tables: + if f"{table}.*" in subparser.columns: + # e.g. SELECT t.* FROM t — table-qualified wildcard + return [column_name] + # e.g. column not found in subquery at all — keep original ref + return [original_ref] + # e.g. 
"id" matched at position idx → return fully-qualified form + return [subparser.columns[idx]] + + # ------------------------------------------------------------------- + # Alias resolution + # ------------------------------------------------------------------- + + def _resolve_column_alias( + self, + alias: str | list[str], + columns_aliases: dict[str, str | list[str]], + visited: set[str] | None = None, + ) -> list[str]: + """Recursively resolve a column alias to its underlying column(s). + + Follows alias chains until a non-alias column is reached. + Tracks visited aliases to prevent infinite loops on circular + definitions. + + Example SQL:: + + WITH cte AS (SELECT a AS x FROM t) SELECT x AS y FROM cte + + Resolving ``"y"`` → ``"x"`` → ``["a"]``. + """ + visited = visited or set() + if isinstance(alias, list): + # e.g. alias mapped to multiple columns — resolve each + return [ + item + for x in alias + for item in self._resolve_column_alias(x, columns_aliases, visited) + ] + while alias in columns_aliases and alias not in visited: + visited.add(alias) + alias = columns_aliases[alias] + if isinstance(alias, list): + # e.g. alias mapped to [col1, col2] — resolve list recursively + return self._resolve_column_alias(alias, columns_aliases, visited) + return [alias] + + # ------------------------------------------------------------------- + # Shared helpers + # ------------------------------------------------------------------- + + def _nested_sources( + self, + ) -> list[tuple[list[str], dict[str, str], dict[str, "Parser"]]]: + """Return the (names, defs, cache) tuples for subqueries then CTEs. + + Subqueries are checked first because they are more specific than + CTEs — a column reference ``sub.col`` should resolve against the + subquery named ``sub`` before falling back to a CTE with the + same name. 
+ """ + return [ + (self._subqueries_names, self._subqueries, self._subqueries_parsers), + (self._with_names, self._with_queries, self._with_parsers), + ] + + def _cte_nodes(self) -> list[exp.CTE]: + """Return all ``exp.CTE`` nodes from the AST (cached). + + Example SQL:: + + WITH a AS (SELECT 1), b AS (SELECT 2) SELECT * FROM a, b + + Returns two ``exp.CTE`` nodes (for ``a`` and ``b``). + """ + if self._cached_cte_nodes is None: + self._cached_cte_nodes = list(self._ast.find_all(exp.CTE)) + return self._cached_cte_nodes + + # ------------------------------------------------------------------- + # Body extraction helpers + # ------------------------------------------------------------------- + + @staticmethod + def _body_sql(node: exp.Expression) -> str: + """Render an AST node to SQL, stripping identifier quoting. + + Example SQL:: + + WITH cte AS (SELECT "id" FROM "users") ... + + Renders the CTE body as ``SELECT id FROM users`` (quotes stripped). + """ + body = copy.deepcopy(node) + for ident in body.find_all(exp.Identifier): + ident.set("quoted", False) + return _GENERATOR.generate(body) + + @staticmethod + def _walk_subqueries( + node: exp.Expression, + names: list[str], + bodies: dict[str, str], + counter: int, + ) -> int: + """Post-order walk collecting subquery names and bodies. + + Returns the updated *counter* so unnamed subqueries are numbered + sequentially. + + Example SQL:: + + SELECT * FROM (SELECT 1) AS named, (SELECT 2) + + Produces names ``["named", "subquery_1"]`` with corresponding bodies. + """ + for child in node.iter_expressions(): + counter = NestedResolver._walk_subqueries( + child, names, bodies, counter + ) + if isinstance(node, exp.Subquery): + if node.alias: + # e.g. (SELECT 1) AS named — use the explicit alias + name = node.alias + else: + # e.g. 
WHERE id IN (SELECT 1) — auto-generate name + counter += 1 + name = f"subquery_{counter}" + names.append(name) + bodies[name] = NestedResolver._body_sql(node.this) + return counter diff --git a/sql_metadata/parser.py b/sql_metadata/parser.py index 122075c8..43fe8239 100644 --- a/sql_metadata/parser.py +++ b/sql_metadata/parser.py @@ -1,34 +1,45 @@ -# pylint: disable=C0302 -""" -This module provides SQL query parsing functions +"""SQL query parsing facade. + +Thin facade that composes the specialised extractors via lazy properties: + +* :class:`~ast_parser.ASTParser` — AST construction and dialect detection. +* :class:`~column_extractor.ColumnExtractor` — single-pass column/alias extraction. +* :class:`~table_extractor.TableExtractor` — table extraction with position sorting. +* :class:`~nested_resolver.NestedResolver` — CTE/subquery name and body extraction, + nested column resolution. +* :mod:`query_type_extractor` — query type detection. +* :mod:`comments` — comment extraction. """ import logging import re -from typing import Dict, List, Optional, Set, Tuple, Union +from typing import Any -import sqlparse -from sqlparse.sql import Token -from sqlparse.tokens import Name, Number, Whitespace +from sqlglot import exp +from sql_metadata.ast_parser import ASTParser +from sql_metadata.column_extractor import ColumnExtractor +from sql_metadata.comments import extract_comments, strip_comments from sql_metadata.generalizator import Generalizator -from sql_metadata.keywords_lists import ( - COLUMNS_SECTIONS, - KEYWORDS_BEFORE_COLUMNS, - TokenType, - RELEVANT_KEYWORDS, - SUBQUERY_PRECEDING_KEYWORDS, - SUPPORTED_QUERY_TYPES, - TABLE_ADJUSTMENT_KEYWORDS, - WITH_ENDING_KEYWORDS, -) -from sql_metadata.token import EmptyToken, SQLToken -from sql_metadata.utils import UniqueList, flatten_list - - -class Parser: # pylint: disable=R0902 - """ - Main class to parse sql query +from sql_metadata.keywords_lists import QueryType +from sql_metadata.nested_resolver import NestedResolver 
+from sql_metadata.query_type_extractor import QueryTypeExtractor +from sql_metadata.table_extractor import TableExtractor +from sql_metadata.utils import UniqueList + + +class Parser: + """Parse a SQL query and extract metadata. + + The primary public interface of the ``sql-metadata`` library. Given a + raw SQL string, the parser lazily extracts tables, columns, aliases, + CTE definitions, subqueries, values, comments, and more — each + available as a cached property. + + :param sql: The SQL query string to parse. + :type sql: str + :param disable_logging: If ``True``, suppress all log output. + :type disable_logging: bool """ def __init__(self, sql: str = "", disable_logging: bool = False) -> None: @@ -36,1091 +47,499 @@ def __init__(self, sql: str = "", disable_logging: bool = False) -> None: self._logger.disabled = disable_logging self._raw_query = sql - self._query = self._preprocess_query() - self._query_type = None - - self._tokens = None - - self._columns = None - self._columns_dict = None - self._columns_aliases_names = None - self._columns_aliases = None - self._columns_with_tables_aliases = {} - self._columns_aliases_dict = None - - self._tables = None - self._table_aliases = None - - self._with_names = None - self._with_queries = None - self._with_queries_columns = None - self._subqueries = None - self._subqueries_names = None - self._subqueries_parsers = {} - self._with_parsers = {} - - self._limit_and_offset = None - - self._values = None - self._values_dict = None - - self._subquery_level = 0 - self._nested_level = 0 - self._parenthesis_level = 0 - self._open_parentheses: List[SQLToken] = [] - self._preceded_keywords: List[SQLToken] = [] - self._aliases_to_check = None - self._is_in_nested_function = False - self._is_in_with_block = False - self._with_columns_candidates = {} - self._column_aliases_max_subquery_level = {} - - self.sqlparse_tokens = None - self.non_empty_tokens = None - self.tokens_length = None + self._query_type: str | None = None + 
def _get_resolver(self) -> "NestedResolver":
    """Return (and cache) the :class:`NestedResolver` for this query.

    :raises AssertionError: if the AST is ``None`` — callers reach this
        only after a successful parse, so a ``None`` AST is a bug.
    """
    if self._resolver is None:
        ast = self._ast_parser.ast
        assert ast is not None
        self._resolver = NestedResolver(ast)
    return self._resolver


@property
def query(self) -> str:
    """Return the preprocessed SQL query as a single line.

    Newlines are flattened to spaces, then the double spaces that this
    flattening can produce are collapsed to single spaces.
    """
    # BUG FIX: the second replace targeted a single space (" " -> " "),
    # a no-op; it must collapse the double spaces ("  " -> " ") left by
    # replacing "\n" with " " on lines with leading whitespace.
    return self._preprocess_query().replace("\n", " ").replace("  ", " ")
re.Match[str]) -> str: + return re.sub('"', "", match.group()) + + def replace_back_quotes_in_string(match: re.Match[str]) -> str: + return re.sub("", '"', match.group()) + + query = re.sub(r"'.*?'", replace_quotes_in_string, self._raw_query) + query = re.sub(r'"([^`]+?)"', r"`\1`", query) + query = re.sub(r"'.*?'", replace_back_quotes_in_string, query) + return query + + # ------------------------------------------------------------------- + # Query type + # ------------------------------------------------------------------- @property def query_type(self) -> str: - """ - Returns type of the query. - Currently supported queries are: - select, insert, update, replace, create table, alter table, with + select - """ + """Return the type of the SQL query.""" if self._query_type: return self._query_type - if not self._tokens: - _ = self.tokens - - # remove comment tokens to not confuse the logic below (see #163) - tokens: List[SQLToken] = list( - filter(lambda token: not token.is_comment, self._tokens or []) - ) - - if not tokens: - raise ValueError("Empty queries are not supported!") - - index = ( - 0 - if not tokens[0].is_left_parenthesis - else tokens[0] - .find_nearest_token( - value=False, value_attribute="is_left_parenthesis", direction="right" - ) - .position - ) - if tokens[index].normalized == "CREATE": - switch = self._get_switch_by_create_query(tokens, index) - elif tokens[index].normalized in ("ALTER", "DROP", "TRUNCATE"): - switch = tokens[index].normalized + tokens[index + 1].normalized - else: - switch = tokens[index].normalized - self._query_type = SUPPORTED_QUERY_TYPES.get(switch, "UNSUPPORTED") - if self._query_type == "UNSUPPORTED": - # do not log the full query - # https://github.com/macbre/sql-metadata/issues/543 - shorten_query = " ".join(self._raw_query.split(" ")[:3]) - - self._logger.error("Not supported query type: %s", shorten_query) - raise ValueError("Not supported query type!") + try: + ast = self._ast_parser.ast + except ValueError: + ast 
@property
def query_type(self) -> str:
    """Return the type of the SQL query (SELECT, INSERT, …).

    A failed parse yields a ``None`` AST, which the extractor handles;
    INSERTs produced by a REPLACE statement are reported as REPLACE.
    """
    if self._query_type:
        return self._query_type
    try:
        ast = self._ast_parser.ast
    except ValueError:
        ast = None
    detected = QueryTypeExtractor(ast, self._raw_query).extract()
    if detected == QueryType.INSERT and self._ast_parser.is_replace:
        detected = QueryType.REPLACE
    self._query_type = detected
    return self._query_type


@property
def tokens(self) -> list[str]:
    """Return the SQL as a list of token strings.

    Backtick and double-quote wrapping is stripped from each token;
    empty/whitespace-only queries tokenize to ``[]``.
    """
    if self._tokens is not None:
        return self._tokens
    if not self._raw_query or not self._raw_query.strip():
        self._tokens = []
        return self._tokens
    from sql_metadata.comments import _choose_tokenizer

    try:
        tokenizer = _choose_tokenizer(self._raw_query)
        raw_tokens = list(tokenizer.tokenize(self._raw_query))
    # TODO: revisit if sqlglot tokenizer starts raising on specific inputs
    except Exception:  # pragma: no cover
        raw_tokens = []
    self._tokens = [tok.text.strip("`").strip('"') for tok in raw_tokens]
    return self._tokens
@property
def columns(self) -> list[str]:
    """Return the list of column names referenced in the query.

    Populates every column-related cache (sections, aliases, output
    columns, subqueries) as a side effect. Falls back to regex-based
    extraction when the query cannot be parsed into an AST.
    """
    if self._columns is not None:
        return self._columns
    try:
        ast = self._ast_parser.ast
        tables_aliases = self.tables_aliases
    except ValueError:
        # Unparseable query — regex fallback, no section/alias data.
        self._set_empty_column_results(
            UniqueList(self._extract_columns_regex())
        )
        return self._columns

    if ast is None:  # pragma: no cover — tables_aliases raises for None ast
        self._set_empty_column_results(UniqueList())
        return self._columns

    extractor = ColumnExtractor(ast, tables_aliases, self._ast_parser.cte_name_map)
    result = extractor.extract()

    self._columns = result.columns
    self._columns_dict = result.columns_dict
    self._columns_aliases_names = result.alias_names
    self._columns_aliases_dict = result.alias_dict
    self._columns_aliases = result.alias_map if result.alias_map else {}
    self._output_columns = result.output_columns

    # Use only aliased subquery names for column resolution —
    # auto-generated names (subquery_1, …) are never referenced in SQL.
    aliased_names = result.subquery_names
    all_names, all_bodies = NestedResolver.extract_subqueries(ast)
    aliased_bodies = {k: v for k, v in all_bodies.items() if k in aliased_names}
    resolver = self._get_resolver()
    self._columns, self._columns_dict, self._columns_aliases = resolver.resolve(
        self._columns,
        self._columns_dict,
        self._columns_aliases,
        aliased_names,
        aliased_bodies,
        self.with_names,
        self.with_queries,
    )
    # Cache full results for the public subqueries properties
    self._subqueries_names = all_names
    self._subqueries = all_bodies
    return self._columns


def _set_empty_column_results(self, columns: "UniqueList") -> None:
    """Populate all column caches for the no-AST fallback paths.

    Extracted helper: the two fallback branches in :attr:`columns`
    previously duplicated these six assignments verbatim.
    """
    self._columns = columns
    self._columns_dict = {}
    self._columns_aliases_names = UniqueList()
    self._columns_aliases_dict = {}
    self._columns_aliases = {}
    self._output_columns = []


@property
def columns_dict(self) -> dict[str, "UniqueList"]:
    """Return column names organised by query section.

    Aliases used in non-select sections are resolved to their underlying
    columns, so e.g. ``SELECT a AS x FROM t ORDER BY x`` reports ``a``
    under both ``select`` and ``order_by``.
    """
    if self._columns_dict is None:
        _ = self.columns  # populates the section caches
    assert self._columns_dict is not None
    if self.columns_aliases_dict:
        resolver = self._get_resolver()
        for section, aliases in self.columns_aliases_dict.items():
            for alias in aliases:
                resolved = resolver.resolve_column_alias(
                    alias, self.columns_aliases
                )
                for column in resolved:
                    self._columns_dict.setdefault(section, UniqueList()).append(
                        column
                    )
    return self._columns_dict
list(self._columns_with_tables_aliases.keys()) - + self.columns_aliases_names - + ["*"] - ) - for token in self.tokens: - if token.is_potential_column_alias( - column_aliases=column_aliases, - columns_aliases_names=self.columns_aliases_names, - ): - token_check = ( - token.previous_token - if not token.previous_token.is_as_keyword - else token.get_nth_previous(2) - ) - if token_check.is_column_definition_end: - alias_of = self._resolve_subquery_alias(token=token) - elif token_check.is_partition_clause_end: - start_token = token.find_nearest_token( - True, value_attribute="is_partition_clause_start" - ) - alias_of = self._find_all_columns_between_tokens( - start_token=start_token, end_token=token - ) - elif token.is_in_with_columns: - # columns definition is to the right in subquery - # we are in: with with_name () as (subquery) - alias_of = self._find_column_for_with_column_alias(token) - else: - alias_of = self._resolve_function_alias(token=token) - if token.value != alias_of: - # skip aliases of self, like sum(column) as column - column_aliases[token.value] = alias_of - - self._columns_aliases = column_aliases + def columns_aliases(self) -> dict[str, str | list[str]]: + """Return the alias-to-column mapping for column aliases.""" + if self._columns_aliases is None: + _ = self.columns + assert self._columns_aliases is not None return self._columns_aliases @property - def columns_aliases_dict(self) -> Dict[str, List[str]]: - """ - Returns dictionary of column names divided into section of the query in which - given column is present. 
- - Sections consist of: select, where, order_by, group_by, join, insert and update - """ - if self._columns_aliases_dict: - return self._columns_aliases_dict - _ = self.columns_aliases_names + def columns_aliases_dict(self) -> dict[str, UniqueList] | None: + """Return column alias names organised by query section.""" + if self._columns_aliases_dict is None: + _ = self.columns return self._columns_aliases_dict @property - def columns_aliases_names(self) -> List[str]: - """ - Extract names of the column aliases used in query - """ - if self._columns_aliases_names is not None: - return self._columns_aliases_names - column_aliases_names = UniqueList() - with_names = self.with_names - subqueries_names = self.subqueries_names - for token in self._not_parsed_tokens: - if token.is_potential_alias: - if token.value in column_aliases_names: - self._handle_column_alias_subquery_level_update(token=token) - elif ( - token.is_a_valid_alias - and token.value not in with_names + subqueries_names - ): - column_aliases_names.append(token.value) - self._handle_column_alias_subquery_level_update(token=token) - - self._columns_aliases_names = column_aliases_names + def columns_aliases_names(self) -> list[str]: + """Return the names of all column aliases used in the query.""" + if self._columns_aliases_names is None: + _ = self.columns + assert self._columns_aliases_names is not None return self._columns_aliases_names @property - def tables(self) -> List[str]: - """ - Return the list of tables this query refers to - """ - if self._tables is not None: - return self._tables - tables = UniqueList() - with_names = self.with_names - - for token in self._not_parsed_tokens: - if token.is_potential_table_name: - if ( - token.is_alias_of_table_or_alias_of_subquery - or token.is_with_statement_nested_in_subquery - or token.is_constraint_definition_inside_create_table_clause( - query_type=self.query_type - ) - or token.is_columns_alias_of_with_query_or_column_in_insert_query( - 
@property
def output_columns(self) -> list[str]:
    """Return the ordered list of SELECT output column names.

    Real columns and aliases appear in their original position, e.g.
    ``SELECT a, b AS c FROM t`` returns ``["a", "c"]``.
    """
    if self._output_columns is None:
        _ = self.columns  # populates the output-column cache
    assert self._output_columns is not None
    return self._output_columns


@property
def tables(self) -> list[str]:
    """Return the list of table names referenced in the query.

    CTE names (including placeholder names from the AST parser's CTE
    name map) are excluded from the result.
    """
    if self._tables is not None:
        return self._tables
    _ = self.query_type  # raises for unsupported queries before extraction
    ast = self._ast_parser.ast
    assert ast is not None  # guaranteed by query_type raising on None
    cte_names = set(self.with_names) | set(self._ast_parser.cte_name_map)
    self._tables = TableExtractor(
        ast,
        self._raw_query,
        cte_names,
        dialect=self._ast_parser.dialect,
    ).extract()
    return self._tables


@property
def tables_aliases(self) -> dict[str, str]:
    """Return the table alias mapping for this query.

    E.g. ``SELECT a.* FROM users1 AS a JOIN users2 AS b ON …`` gives
    ``{"a": "users1", "b": "users2"}``.
    """
    if self._table_aliases is not None:
        return self._table_aliases
    ast = self._ast_parser.ast
    assert ast is not None  # guaranteed by prior tables/query_type access
    self._table_aliases = TableExtractor(ast).extract_aliases(self.tables)
    return self._table_aliases
- potential_table_name = token.previous_token.value - - if potential_table_name in tables: - token.token_type = TokenType.TABLE_ALIAS - aliases[token.value] = potential_table_name - - self._table_aliases = aliases + ast = self._ast_parser.ast + assert ast is not None # guaranteed by prior tables/query_type access + extractor = TableExtractor(ast) + self._table_aliases = extractor.extract_aliases(self.tables) return self._table_aliases - @property - def with_names(self) -> List[str]: # noqa: C901 - """ - Returns with statements aliases list from a given query + # ------------------------------------------------------------------- + # CTEs and subqueries + # ------------------------------------------------------------------- - E.g. WITH database1.tableFromWith AS (SELECT * FROM table3) - SELECT "xxxxx" FROM database1.tableFromWith alias - LEFT JOIN database2.table2 ON ("tt"."ttt"."fff" = "xx"."xxx") - will return ["database1.tableFromWith"] - """ + @property + def with_names(self) -> list[str]: + """Return the CTE (Common Table Expression) names from the query.""" if self._with_names is not None: return self._with_names - with_names = UniqueList() - for token in self._not_parsed_tokens: - if token.previous_token.normalized == "WITH": - self._is_in_with_block = True - while self._is_in_with_block and token.next_token: - if token.next_token.is_as_keyword: - self._handle_with_name_save(token=token, with_names=with_names) - while token.next_token and not token.is_with_query_end: - token = token.next_token - is_end_of_with_block = ( - token.next_token_not_comment is None - or token.next_token_not_comment.normalized - in WITH_ENDING_KEYWORDS - ) - if is_end_of_with_block: - self._is_in_with_block = False - elif token.next_token and token.next_token.is_as_keyword: - # Malformed SQL like "... AS (...) AS ..." 
- raise ValueError("This query is wrong") - else: - # Advance token to prevent infinite loop - token = token.next_token - else: - token = token.next_token - - self._with_names = with_names + resolver = self._get_resolver() + self._with_names = resolver.extract_cte_names( + self._ast_parser.cte_name_map + ) return self._with_names @property - def with_queries(self) -> Dict[str, str]: - """ - Returns "WITH" subqueries with names - - E.g. WITH tableFromWith AS (SELECT * FROM table3) - SELECT "xxxxx" FROM database1.tableFromWith alias - LEFT JOIN database2.table2 ON ("tt"."ttt"."fff" = "xx"."xxx") - will return {"tableFromWith": "SELECT * FROM table3"} - """ + def with_queries(self) -> dict[str, str]: + """Return the SQL body for each CTE defined in the query.""" if self._with_queries is not None: return self._with_queries - with_queries = {} - with_queries_columns = {} - for name in self.with_names: - token = self.tokens[0].find_nearest_token( - name, value_attribute="value", direction="right" - ) - if token.next_token.is_with_columns_start: - with_queries_columns[name] = True - else: - with_queries_columns[name] = False - current_with_query = [] - with_start = token.find_nearest_token( - True, value_attribute="is_with_query_start", direction="right" - ) - with_end = with_start.find_nearest_token( - True, value_attribute="is_with_query_end", direction="right" - ) - query_token = with_start.next_token - while query_token is not None and query_token != with_end: - current_with_query.append(query_token) - query_token = query_token.next_token - with_query_text = "".join([x.stringified_token for x in current_with_query]) - with_queries[name] = with_query_text - self._with_queries = with_queries - self._with_queries_columns = with_queries_columns + resolver = self._get_resolver() + self._with_queries = resolver.extract_cte_bodies( + self._ast_parser.cte_name_map + ) return self._with_queries @property - def subqueries(self) -> Dict: - """ - Returns a dictionary with all 
sub-queries existing in query - """ + def subqueries(self) -> dict[str, str]: + """Return the SQL body for each subquery in the query.""" if self._subqueries is not None: return self._subqueries - subqueries = {} - token = self.tokens[0] - while token.next_token: - if token.previous_token.is_subquery_start: - current_subquery = [] - current_level = token.subquery_level - inner_token = token - while ( - inner_token.next_token - and not inner_token.next_token.subquery_level < current_level - ): - current_subquery.append(inner_token) - inner_token = inner_token.next_token - - query_name = None - if inner_token.next_token.value in self.subqueries_names: - query_name = inner_token.next_token.value - elif inner_token.next_token.is_as_keyword: - query_name = inner_token.next_token.next_token.value - - subquery_text = "".join([x.stringified_token for x in current_subquery]) - if query_name is not None: - subqueries[query_name] = subquery_text - - token = token.next_token - - self._subqueries = subqueries + ast = self._ast_parser.ast + assert ast is not None + self._subqueries_names, self._subqueries = ( + NestedResolver.extract_subqueries(ast) + ) return self._subqueries @property - def subqueries_names(self) -> List[str]: - """ - Returns sub-queries aliases list from a given query + def subqueries_names(self) -> list[str]: + """Return the names of all subqueries (innermost first). - e.g. SELECT COUNT(1) FROM - (SELECT std.task_id FROM some_task_detail std WHERE std.STATUS = 1) a - JOIN (SELECT st.task_id FROM some_task st WHERE task_type_id = 80) b - ON a.task_id = b.task_id; - will return ["a", "b"] + Aliased subqueries use their alias; unaliased ones get + auto-generated names (``subquery_1``, ``subquery_2``, …). 
""" if self._subqueries_names is not None: return self._subqueries_names - subqueries_names = UniqueList() - for token in self.tokens: - if (token.previous_token.is_subquery_end and not token.is_as_keyword) or ( - token.previous_token.is_as_keyword - and token.get_nth_previous(2).is_subquery_end - ): - token.token_type = TokenType.SUB_QUERY_NAME - subqueries_names.append(str(token)) - - self._subqueries_names = subqueries_names + ast = self._ast_parser.ast + assert ast is not None + self._subqueries_names, self._subqueries = ( + NestedResolver.extract_subqueries(ast) + ) return self._subqueries_names + # ------------------------------------------------------------------- + # Limit, offset, values + # ------------------------------------------------------------------- + + @staticmethod + def _extract_int_from_node(node: Any) -> int | None: + """Safely extract an integer value from a Limit or Offset node.""" + if not node: + return None + try: + return int(node.expression.this) + except (ValueError, AttributeError, TypeError): + return None + @property - def values(self) -> List: - """ - Returns list of values from insert queries - """ + def limit_and_offset(self) -> tuple[int, int] | None: + """Return the LIMIT and OFFSET values, if present.""" + if self._limit_and_offset is not None: + return self._limit_and_offset + + from sqlglot import exp + + ast = self._ast_parser.ast + if ast is None: + return None + + select = ast if isinstance(ast, exp.Select) else ast.find(exp.Select) + if select is None: + return None + + limit_val = self._extract_int_from_node(select.args.get("limit")) + offset_val = self._extract_int_from_node(select.args.get("offset")) + + if limit_val is None: + return self._extract_limit_regex() + + self._limit_and_offset = limit_val, offset_val or 0 + return self._limit_and_offset + + @property + def values(self) -> list[Any]: + """Return the list of literal values from INSERT/REPLACE queries.""" if self._values: return self._values - values = [] - 
for token in self._not_parsed_tokens: - if ( - token.last_keyword_normalized == "VALUES" - and token.is_in_parenthesis - and token.next_token.is_punctuation - ): - if token.is_integer: - value = int(token.value) - elif token.is_float: - value = float(token.value) - else: - value = token.value.strip("'\"") - values.append(value) - self._values = values + self._values = self._extract_values() return self._values @property - def values_dict(self) -> Dict: - """ - Returns dictionary of column-value pairs. - If columns are not set the auto generated column_ are added. - """ + def values_dict(self) -> dict[str, Any] | None: + """Return column-value pairs from INSERT/REPLACE queries.""" values = self.values if self._values_dict or not values: return self._values_dict - columns = self.columns + try: + columns = self.columns + # TODO: revisit if .columns starts propagating ValueError to callers + except ValueError: # pragma: no cover + columns = [] + + is_multi = values and isinstance(values[0], list) + first_row = values[0] if is_multi else values if not columns: - columns = [f"column_{ind + 1}" for ind in range(len(values))] - values_dict = dict(zip(columns, values)) - self._values_dict = values_dict + columns = [f"column_{ind + 1}" for ind in range(len(first_row))] + + if is_multi: + self._values_dict = { + col: [row[i] for row in values] for i, col in enumerate(columns) + } + else: + self._values_dict = dict(zip(columns, values)) return self._values_dict + # ------------------------------------------------------------------- + # Comments and generalization + # ------------------------------------------------------------------- + @property - def comments(self) -> List[str]: - """ - Return comments from SQL query - """ - return [x.value for x in self.tokens if x.is_comment] + def comments(self) -> list[str]: + """Return all comments from the SQL query.""" + return extract_comments(self._raw_query) @property def without_comments(self) -> str: - """ - Removes comments from 
SQL query - """ - return Generalizator(self._raw_query).without_comments + """Return the SQL with all comments removed.""" + return strip_comments(self._raw_query) @property def generalize(self) -> str: - """ - Removes most variables from an SQL query - and replaces them with X or N for numbers. - - Based on Mediawiki's DatabaseBase::generalizeSQL - """ + """Return a generalised (anonymised) version of the query.""" return Generalizator(self._raw_query).generalize - @property - def _not_parsed_tokens(self): - """ - Returns only tokens that have no type assigned yet - """ - return [x for x in self.tokens if x.token_type is None] - - def _handle_column_save(self, token: SQLToken, columns: List[str]): - column = token.table_prefixed_column(self.tables_aliases) - if self._is_with_query_already_resolved(column): - self._add_to_columns_aliases_subsection(token=token, left_expand=False) - token.token_type = TokenType.COLUMN_ALIAS - return - column = self._resolve_sub_queries(column) - self._add_to_columns_with_tables(token, column) - self._add_to_columns_subsection( - keyword=token.last_keyword_normalized, column=column - ) - token.token_type = TokenType.COLUMN - columns.extend(column) - - @staticmethod - def _handle_with_name_save(token: SQLToken, with_names: List[str]) -> None: - if token.is_right_parenthesis: - # inside columns of with statement - # like: with (col1, col2) as (subquery) - token.is_with_columns_end = True - token.is_nested_function_end = False - start_token = token.find_nearest_token("(") - # like: with (col1, col2) as (subquery) as ..., it enters an infinite loop. 
- # return exception - if start_token.is_with_query_start: - raise ValueError("This query is wrong") # pragma: no cover - start_token.is_with_columns_start = True - start_token.is_nested_function_start = False - prev_token = start_token.previous_token - prev_token.token_type = TokenType.WITH_NAME - with_names.append(prev_token.value) - else: - token.token_type = TokenType.WITH_NAME - with_names.append(token.value) - - def _handle_column_alias_subquery_level_update(self, token: SQLToken) -> None: - token.token_type = TokenType.COLUMN_ALIAS - self._add_to_columns_aliases_subsection(token=token) - current_level = self._column_aliases_max_subquery_level.setdefault( - token.value, 0 - ) - if token.subquery_level > current_level: - self._column_aliases_max_subquery_level[token.value] = token.subquery_level - - def _resolve_subquery_alias(self, token: SQLToken) -> Union[str, List[str]]: - # nested subquery like select a, (select a as b from x) as column - start_token = token.find_nearest_token( - True, value_attribute="is_column_definition_start" - ) - if start_token.next_token.normalized == "SELECT": - # we have a subquery - alias_token = start_token.next_token.find_nearest_token( - self._aliases_to_check, - direction="right", - value_attribute="value", - ) - return self._resolve_alias_to_column(alias_token) + # ------------------------------------------------------------------- + # Internal extraction helpers + # ------------------------------------------------------------------- - # chain of functions or redundant parenthesis - return self._find_all_columns_between_tokens( - start_token=start_token, end_token=token - ) + def _extract_values(self) -> list[Any]: + """Extract literal values from INSERT/REPLACE query AST.""" + from sqlglot import exp - def _resolve_function_alias(self, token: SQLToken) -> Union[str, List[str]]: - # it can be one function or a chain of functions - # like: sum(a) + sum(b) as alias - # or operation on columns like: col1 + col2 as alias - 
start_token = token.find_nearest_token( - [",", "SELECT"], value_attribute="normalized" - ) - while start_token.is_in_nested_function: - start_token = start_token.find_nearest_token( - [",", "SELECT"], value_attribute="normalized" - ) - return self._find_all_columns_between_tokens( - start_token=start_token, end_token=token - ) - - def _add_to_columns_subsection(self, keyword: str, column: Union[str, List[str]]): - """ - Add columns to the section in which it appears in query - """ - section = COLUMNS_SECTIONS[keyword] - self._columns_dict = self._columns_dict or {} - current_section = self._columns_dict.setdefault(section, UniqueList()) - if isinstance(column, str): - current_section.append(column) - else: - current_section.extend(column) - - def _add_to_columns_aliases_subsection( - self, token: SQLToken, left_expand: bool = True - ) -> None: - """ - Add alias to the section in which it appears in query - """ - keyword = token.last_keyword_normalized - alias = token.value if left_expand else token.value.split(".")[-1] - if ( - token.last_keyword_normalized in ["FROM", "WITH"] - and token.find_nearest_token("(").is_with_columns_start - ): - keyword = "SELECT" - section = COLUMNS_SECTIONS[keyword] - self._columns_aliases_dict = self._columns_aliases_dict or {} - self._columns_aliases_dict.setdefault(section, UniqueList()).append(alias) - - def _add_to_columns_with_tables( - self, token: SQLToken, column: Union[str, List[str]] - ) -> None: - if isinstance(column, list) and len(column) == 1: - column = column[0] - self._columns_with_tables_aliases[token.value] = column - - def _resolve_column_alias( - self, alias: Union[str, List[str]], visited: Set = None - ) -> Union[str, List]: - """ - Returns a column name for a given alias - """ - visited = visited or set() - if isinstance(alias, list): - return [self._resolve_column_alias(x, visited) for x in alias] - while alias in self.columns_aliases and alias not in visited: - visited.add(alias) - alias = 
self.columns_aliases[alias] - if isinstance(alias, list): - return self._resolve_column_alias(alias, visited) - return alias - - def _resolve_alias_to_column(self, alias_token: SQLToken) -> str: - """ - Resolves aliases of tables to already resolved columns - """ - if alias_token.value in self._columns_with_tables_aliases: - alias_of = self._columns_with_tables_aliases[alias_token.value] - else: - alias_of = alias_token.value - return alias_of - - def _resolve_sub_queries(self, column: str) -> List[str]: - """ - Resolve column names coming from sub queries and with queries to actual - column names as they appear in the query - """ - column = self._resolve_nested_query( - subquery_alias=column, - nested_queries_names=self.subqueries_names, - nested_queries=self.subqueries, - already_parsed=self._subqueries_parsers, - ) - if isinstance(column, str): - column = self._resolve_nested_query( - subquery_alias=column, - nested_queries_names=self.with_names, - nested_queries=self.with_queries, - already_parsed=self._with_parsers, - ) - return column if isinstance(column, list) else [column] + try: + ast = self._ast_parser.ast + except ValueError: + return [] + + if ast is None: + return [] + + values_node = ast.find(exp.Values) + if not values_node: + return [] + + rows = [] + for tup in values_node.expressions: + if isinstance(tup, exp.Tuple): + rows.append([self._convert_value(val) for val in tup.expressions]) + # TODO: revisit if sqlglot stops wrapping VALUES items in Tuple + else: # pragma: no cover + rows.append([self._convert_value(tup)]) + if len(rows) == 1: + return rows[0] + return rows @staticmethod - # pylint:disable=too-many-return-statements - def _resolve_nested_query( # noqa: C901 - subquery_alias: str, - nested_queries_names: List[str], - nested_queries: Dict, - already_parsed: Dict, - ) -> Union[str, List[str]]: - """ - Resolves subquery reference to the actual column in the subquery - """ - parts = subquery_alias.split(".") - if len(parts) != 2 or parts[0] 
not in nested_queries_names: - return subquery_alias - sub_query, column_name = parts[0], parts[-1] - sub_query_definition = nested_queries.get(sub_query) - subparser = already_parsed.setdefault(sub_query, Parser(sub_query_definition)) - # in subquery you cannot have more than one column with given name - # so it either has to have an alias or only one column with given name exists - if column_name in subparser.columns_aliases_names: - resolved_column = subparser._resolve_column_alias( # pylint: disable=W0212 - column_name - ) - if isinstance(resolved_column, list): - resolved_column = flatten_list(resolved_column) - return resolved_column - return [resolved_column] + def _convert_value(val: exp.Expression) -> int | float | str: + """Convert a sqlglot literal AST node to a Python type.""" + from sqlglot import exp + + if isinstance(val, exp.Literal): + if val.is_int: + return int(val.this) + if val.is_number: + return float(val.this) + return str(val.this) + if isinstance(val, exp.Neg): + inner = val.this + if isinstance(inner, exp.Literal): + if inner.is_int: + return -int(inner.this) + return -float(inner.this) + return str(val) + + def _extract_limit_regex(self) -> tuple[int, int] | None: + """Extract LIMIT and OFFSET using regex as a fallback.""" + sql = strip_comments(self._raw_query) + match = re.search(r"LIMIT\s+(\d+)\s*,\s*(\d+)", sql, re.IGNORECASE) + if match: + offset_val = int(match.group(1)) + limit_val = int(match.group(2)) + self._limit_and_offset = limit_val, offset_val + return self._limit_and_offset - if column_name == "*": - return subparser.columns - try: - column_index = [x.split(".")[-1] for x in subparser.columns].index( - column_name - ) - except ValueError as exc: - # handle case when column name is used but subquery select all by wildcard - if "*" in subparser.columns: - return column_name - for table in subparser.tables: - if f"{table}.*" in subparser.columns: - return column_name - raise exc # pragma: no cover - resolved_column = 
subparser.columns[column_index] - return [resolved_column] - - def _is_with_query_already_resolved(self, col_alias: str) -> bool: - """ - Checks if columns comes from a with query that has columns defined - cause if it does that means that column name is an alias and is already - resolved in aliases. - """ - parts = col_alias.split(".") - if len(parts) != 2 or parts[0] not in self.with_names: - return False - if self._with_queries_columns.get(parts[0]): - return True - return False - - def _determine_opening_parenthesis_type(self, token: SQLToken): - """ - Determines the type of left parenthesis in query - """ - if token.previous_token.normalized in SUBQUERY_PRECEDING_KEYWORDS: - # inside subquery / derived table - token.is_subquery_start = True - self._subquery_level += 1 - self._preceded_keywords.append(token.last_keyword_normalized) - token.subquery_level = self._subquery_level - elif token.previous_token.normalized in KEYWORDS_BEFORE_COLUMNS.union({","}): - # we are in columns and in a column subquery definition - token.is_column_definition_start = True - elif ( - token.previous_token_not_comment.is_as_keyword - and token.last_keyword_normalized != "WINDOW" - ): - # window clause also contains AS keyword, but it is not a query - token.is_with_query_start = True - elif ( - token.last_keyword_normalized == "TABLE" - and token.find_nearest_token("(") is EmptyToken - ): - token.is_create_table_columns_declaration_start = True - elif token.previous_token.normalized == "OVER": - token.is_partition_clause_start = True - else: - # nested function - token.is_nested_function_start = True - self._nested_level += 1 - self._is_in_nested_function = True - self._open_parentheses.append(token) - self._parenthesis_level += 1 - - def _determine_closing_parenthesis_type(self, token: SQLToken): - """ - Determines the type of right parenthesis in query - """ - last_open_parenthesis = self._open_parentheses.pop(-1) - if last_open_parenthesis.is_subquery_start: - 
token.is_subquery_end = True - self._subquery_level -= 1 - elif last_open_parenthesis.is_column_definition_start: - token.is_column_definition_end = True - elif last_open_parenthesis.is_with_query_start: - token.is_with_query_end = True - elif last_open_parenthesis.is_create_table_columns_declaration_start: - token.is_create_table_columns_declaration_end = True - elif last_open_parenthesis.is_partition_clause_start: - token.is_partition_clause_end = True - else: - token.is_nested_function_end = True - self._nested_level -= 1 - if self._nested_level == 0: - self._is_in_nested_function = False - self._parenthesis_level -= 1 - - def _find_column_for_with_column_alias(self, token: SQLToken) -> str: - start_token = token.find_nearest_token( - True, direction="right", value_attribute="is_with_query_start" + match = re.search( + r"LIMIT\s+(\d+)(?:\s+OFFSET\s+(\d+))?", + sql, + re.IGNORECASE, ) - if start_token not in self._with_columns_candidates: - end_token = start_token.find_nearest_token( - True, direction="right", value_attribute="is_with_query_end" - ) - columns = self._find_all_columns_between_tokens( - start_token=start_token, end_token=end_token - ) - self._with_columns_candidates[start_token] = columns - if isinstance(self._with_columns_candidates[start_token], list): - alias_of = self._with_columns_candidates[start_token].pop(0) - else: - alias_of = self._with_columns_candidates[start_token] - return alias_of - - def _find_all_columns_between_tokens( - self, start_token: SQLToken, end_token: SQLToken - ) -> Union[str, List[str]]: - """ - Returns a list of columns between two tokens - """ - loop_token = start_token - aliases = UniqueList() - while loop_token.next_token != end_token: - if loop_token.next_token.value in self._aliases_to_check: - alias_token = loop_token.next_token - if ( - alias_token.normalized != "*" - or alias_token.is_wildcard_not_operator - ): - aliases.append(self._resolve_alias_to_column(alias_token)) - loop_token = loop_token.next_token - 
return aliases[0] if len(aliases) == 1 else aliases - - def _preprocess_query(self) -> str: - """ - Perform initial query cleanup - """ - if self._raw_query == "": - return "" - - # python re does not have variable length look back/forward - # so we need to replace all the " (double quote) for a - # temporary placeholder as we DO NOT want to replace those - # in the strings as this is something that user provided - def replace_quotes_in_string(match): - return re.sub('"', "", match.group()) - - def replace_back_quotes_in_string(match): - return re.sub("", '"', match.group()) - - # unify quoting in queries, replace double quotes to backticks - # it's best to keep the quotes as they can have keywords - # or digits at the beginning so we only strip them in SQLToken - # as double quotes are not properly handled in sqlparse - query = re.sub(r"'.*?'", replace_quotes_in_string, self._raw_query) - query = re.sub(r'"([^`]+?)"', r"`\1`", query) - query = re.sub(r"'.*?'", replace_back_quotes_in_string, query) - - return query - - def _determine_last_relevant_keyword(self, token: SQLToken, last_keyword: str): - if token.value == "," and token.last_keyword_normalized == "ON": - return "FROM" - if token.is_keyword and "".join(token.normalized.split()) in RELEVANT_KEYWORDS: - if ( - not ( - token.normalized == "FROM" - and token.get_nth_previous(3).normalized == "EXTRACT" - ) - and not ( - token.normalized == "ORDERBY" - and len(self._open_parentheses) > 0 - and self._open_parentheses[-1].is_partition_clause_start - ) - and not (token.normalized == "USING" and last_keyword == "SELECT") - and not (token.normalized == "IFNOTEXISTS") - ): - last_keyword = token.normalized - return last_keyword - - def _is_token_part_of_complex_identifier( - self, token: sqlparse.tokens.Token, index: int - ) -> bool: - """ - Checks if token is a part of complex identifier like - .
. or
. - """ - if token.is_keyword: - return False - return str(token) == "." or ( - index + 1 < self.tokens_length - and str(self.non_empty_tokens[index + 1]) == "." + if match: + limit_val = int(match.group(1)) + offset_val = int(match.group(2)) if match.group(2) else 0 + self._limit_and_offset = limit_val, offset_val + return self._limit_and_offset + return None + + def _extract_columns_regex(self) -> list[str]: + """Extract column names from ``INTO ... (col1, col2)`` using regex.""" + match = re.search( + r"INTO\s+\S+\s*\(([^)]+)\)", + self._raw_query, + re.IGNORECASE, ) - - def _combine_qualified_names(self, index: int, token: SQLToken) -> None: - """ - Combines names like .
. or
. - """ - value = token.value - is_complex = True - while is_complex: - value, is_complex = self._combine_tokens(index=index, value=value) - index = index - 1 - token.value = value - - def _combine_tokens(self, index: int, value: str) -> Tuple[str, bool]: - """ - Checks if complex identifier is longer and follows back until it's finished - """ - if index > 1: - prev_value = self.non_empty_tokens[index - 1] - if not self._is_token_part_of_complex_identifier(prev_value, index - 1): - return value, False - prev_value = str(prev_value).strip("`") - value = f"{prev_value}{value}" - return value, True - return value, False - - def _get_sqlparse_tokens(self, parsed) -> None: - """ - Flattens the tokens and removes whitespace - """ - self.sqlparse_tokens = parsed[0].tokens - sqlparse_tokens = self._flatten_sqlparse() - self.non_empty_tokens = [ - token - for token in sqlparse_tokens - if token.ttype is not Whitespace and token.ttype.parent is not Whitespace - ] - self.tokens_length = len(self.non_empty_tokens) - - def _flatten_sqlparse(self): - for token in self.sqlparse_tokens: - # sqlparse returns mysql digit starting identifiers as group - # check https://github.com/andialbrecht/sqlparse/issues/337 - is_grouped_mysql_digit_name = ( - token.is_group - and len(token.tokens) == 2 - and token.tokens[0].ttype is Number.Integer - and ( - token.tokens[1].is_group and token.tokens[1].tokens[0].ttype is Name - ) - ) - if token.is_group and not is_grouped_mysql_digit_name: - yield from token.flatten() - elif is_grouped_mysql_digit_name: - # we have digit starting name - new_tok = Token( - value=f"{token.tokens[0].normalized}" - f"{token.tokens[1].tokens[0].normalized}", - ttype=token.tokens[1].tokens[0].ttype, - ) - new_tok.parent = token.parent - yield new_tok - if len(token.tokens[1].tokens) > 1: - # unfortunately there might be nested groups - remaining_tokens = token.tokens[1].tokens[1:] - for tok in remaining_tokens: - if tok.is_group: - yield from tok.flatten() - else: - 
yield tok - else: - yield token - - @staticmethod - def _get_switch_by_create_query(tokens: List[SQLToken], index: int) -> str: - """ - Return the switch that creates query type. - """ - switch = tokens[index].normalized + tokens[index + 1].normalized - - # Hive CREATE FUNCTION - if any( - index + i < len(tokens) and tokens[index + i].normalized == "FUNCTION" - for i in (1, 2) - ): - switch = "CREATEFUNCTION" - - return switch - - @staticmethod - def _parse(sql: str) -> Tuple[sqlparse.sql.Statement]: - """ - Parse the SQL query using sqlparse library - """ - return sqlparse.parse(sql) + if not match: + return [] + cols = [] + for col in match.group(1).split(","): + col = col.strip().strip("`").strip('"').strip("'") + if col: + cols.append(col) + return cols + + def _resolve_column_alias(self, alias: str | list[str]) -> list[str]: + """Recursively resolve a column alias (delegates to NestedResolver).""" + resolver = self._get_resolver() + return resolver.resolve_column_alias(alias, self.columns_aliases) diff --git a/sql_metadata/py.typed b/sql_metadata/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/sql_metadata/query_type_extractor.py b/sql_metadata/query_type_extractor.py new file mode 100644 index 00000000..6d13ff5a --- /dev/null +++ b/sql_metadata/query_type_extractor.py @@ -0,0 +1,114 @@ +"""Extract the query type from a sqlglot AST root node. + +The :class:`QueryTypeExtractor` class maps the top-level AST node to a +:class:`QueryType` enum value, handling parenthesised wrappers, set +operations, and opaque ``Command`` nodes. +""" + +import logging +from typing import NoReturn + +from sqlglot import exp + +from sql_metadata.exceptions import InvalidQueryDefinition +from sql_metadata.keywords_lists import QueryType + +logger = logging.getLogger(__name__) + + +#: Direct AST type → QueryType mapping for simple cases. 
+_SIMPLE_TYPE_MAP = { + exp.Select: QueryType.SELECT, + exp.Union: QueryType.SELECT, + exp.Intersect: QueryType.SELECT, + exp.Except: QueryType.SELECT, + exp.Insert: QueryType.INSERT, + exp.Update: QueryType.UPDATE, + exp.Delete: QueryType.DELETE, + exp.Create: QueryType.CREATE, + exp.Alter: QueryType.ALTER, + exp.Drop: QueryType.DROP, + exp.TruncateTable: QueryType.TRUNCATE, + exp.Merge: QueryType.MERGE, +} + + +class QueryTypeExtractor: + """Determine the query type from a sqlglot AST root node. + + :param ast: Root AST node (may be ``None``). + :param raw_query: Original SQL string (for error messages). + """ + + def __init__( + self, + ast: exp.Expression | None, + raw_query: str, + ): + self._ast = ast + self._raw_query = raw_query + + def extract(self) -> QueryType: + """Determine the :class:`QueryType` for the parsed SQL. + + :returns: The detected query type. + :raises ValueError: If the query is empty, malformed, or + unsupported. + """ + if self._ast is None: + self._raise_for_none_ast() + + root = self._unwrap_parens(self._ast) + node_type = type(root) + + if node_type is exp.With: + raise InvalidQueryDefinition( + "WITH clause without a main statement is not valid SQL" + ) + + simple = _SIMPLE_TYPE_MAP.get(node_type) + if simple is not None: + return simple + + if node_type is exp.Command: + result = self._resolve_command_type(root) + if result is not None: + return result + + shorten_query = " ".join(self._raw_query.split(" ")[:3]) + logger.error("Not supported query type: %s", shorten_query) + raise InvalidQueryDefinition("Not supported query type!") + + @staticmethod + def _unwrap_parens(ast: exp.Expression) -> exp.Expression: + """Remove Paren and Subquery wrappers to reach the real statement.""" + # TODO: revisit if sqlglot stops stripping outer parens before this is called + if isinstance(ast, (exp.Paren, exp.Subquery)): # pragma: no cover + return QueryTypeExtractor._unwrap_parens(ast.this) + return ast + + @staticmethod + def 
_resolve_command_type(root: exp.Expression) -> QueryType | None: + """Determine query type for an opaque ``exp.Command`` node. + + Hive ``CREATE FUNCTION ... USING JAR ... WITH SERDEPROPERTIES`` + is not supported by any sqlglot dialect and degrades to + ``exp.Command(this='CREATE', ...)``. This fallback extracts + the query type from the command text so callers still get + ``QueryType.CREATE``. + """ + expression_text = str(root.this).upper() if root.this else "" + if expression_text == "CREATE": + return QueryType.CREATE + return None + + def _raise_for_none_ast(self) -> "NoReturn": + """Raise an appropriate error when the AST is None.""" + from sql_metadata.comments import strip_comments + + stripped = strip_comments(self._raw_query) if self._raw_query else "" + if stripped.strip(): + raise InvalidQueryDefinition( + "Could not parse the query — the SQL syntax appears to be invalid" + ) + raise InvalidQueryDefinition("Empty queries are not supported!") diff --git a/sql_metadata/sql_cleaner.py b/sql_metadata/sql_cleaner.py new file mode 100644 index 00000000..fd1dbe72 --- /dev/null +++ b/sql_metadata/sql_cleaner.py @@ -0,0 +1,180 @@ +"""Raw SQL preprocessing before AST construction. + +Pure string transformations — no sqlglot dependency. Handles comment +stripping, ``REPLACE INTO`` rewriting, qualified CTE name normalisation, +DB2 isolation-level clauses, malformed-query rejection, and redundant +outer-parenthesis removal. +""" + +import itertools +import re +from typing import NamedTuple + +from sql_metadata.comments import strip_comments_for_parsing as _strip_comments +from sql_metadata.exceptions import InvalidQueryDefinition +from sql_metadata.utils import DOT_PLACEHOLDER + + +class CleanResult(NamedTuple): + """Result of :meth:`SqlCleaner.clean`.""" + + sql: str | None + is_replace: bool + cte_name_map: dict[str, str] + + +def _strip_outer_parens(sql: str) -> str: + """Strip redundant outer parentheses from *sql*. 
+ + Needed because sqlglot cannot parse double-wrapped non-SELECT + statements like ``((UPDATE ...))``. Uses ``itertools.accumulate`` + to verify balanced parens in one pass, with recursion for nesting. + """ + s = sql.strip() + + def _is_wrapped(text: str) -> bool: + if len(text) < 2 or text[0] != "(" or text[-1] != ")": + return False + inner = text[1:-1] + depths = list( + itertools.accumulate( + (1 if c == "(" else -1 if c == ")" else 0) for c in inner + ) + ) + return not depths or min(depths) >= 0 + + if _is_wrapped(s): + return _strip_outer_parens(s[1:-1].strip()) + return s + + +def _normalize_cte_names(sql: str) -> tuple[str, dict[str, str]]: + """Replace qualified CTE names with simple placeholders. + + sqlglot cannot parse ``WITH db.cte_name AS (...)`` because it + interprets ``db.cte_name`` as a table reference. This function + rewrites such names to ``db__DOT__cte_name`` and returns a mapping + so that the original qualified names can be restored after extraction. + + :param sql: SQL string that may contain qualified CTE names. + :type sql: str + :returns: A 2-tuple of ``(modified_sql, {placeholder: original_name})``. + :rtype: tuple + """ + name_map = {} + # Find WITH ... 
AS patterns with qualified names + pattern = re.compile( + r"(\bWITH\s+|,\s*)(\w+\.\w+)(\s+AS\s*\()", + re.IGNORECASE, + ) + + def replacer(match: re.Match[str]) -> str: + prefix = match.group(1) + qualified_name = match.group(2) + suffix = match.group(3) + placeholder = qualified_name.replace(".", DOT_PLACEHOLDER) + name_map[placeholder] = qualified_name + return f"{prefix}{placeholder}{suffix}" + + modified = pattern.sub(replacer, sql) + + # Also replace references to qualified CTE names in FROM/JOIN clauses + for placeholder, original in name_map.items(): + # Replace references but not the definition (already replaced) + # Use word boundary to avoid partial matches + modified = re.sub( + r"\b" + re.escape(original) + r"\b", + placeholder, + modified, + ) + + return modified, name_map + + +class SqlCleaner: + """Preprocess raw SQL strings before dialect parsing.""" + + @staticmethod + def clean(sql: str) -> CleanResult: + """Apply all preprocessing steps to raw SQL. + + Steps (in order): + + 1. Rewrite ``REPLACE INTO`` → ``INSERT INTO``. + 2. Rewrite ``SELECT...INTO var FROM`` → ``SELECT...FROM``. + 3. Strip comments. + 4. Normalise qualified CTE names. + 5. Strip DB2 isolation-level clauses. + 6. Detect malformed ``WITH...AS(...) AS`` patterns. + 7. Strip redundant outer parentheses. + + :param sql: Raw SQL string. + :type sql: str + :returns: Cleaning result with preprocessed SQL (``None`` if + effectively empty), replace flag, and CTE name map. + :rtype: CleanResult + :raises ValueError: If a malformed WITH pattern is detected. + """ + is_replace = False + if re.match(r"\s*REPLACE\b", sql, re.IGNORECASE): + sql = re.sub( + r"\bREPLACE\s+INTO\b", + "INSERT INTO", + sql, + count=1, + flags=re.IGNORECASE, + ) + is_replace = True + + # Rewrite SELECT...INTO var1,var2 FROM → SELECT...FROM + # so sqlglot doesn't treat variables as tables. 
+ sql = re.sub( + r"(?i)(\bSELECT\b.+?)\bINTO\b.+?\bFROM\b", + r"\1FROM", + sql, + count=1, + flags=re.DOTALL, + ) + + clean_sql = _strip_comments(sql) + if not clean_sql.strip(): + return CleanResult(sql=None, is_replace=is_replace, cte_name_map={}) + + clean_sql, cte_name_map = _normalize_cte_names(clean_sql) + clean_sql = re.sub( + r"\bwith\s+(ur|cs|rs|rr)\s*$", "", clean_sql, flags=re.IGNORECASE + ).strip() + + SqlCleaner._detect_malformed_with(clean_sql) + + clean_sql = _strip_outer_parens(clean_sql) + if not clean_sql.strip(): + return CleanResult( + sql=None, is_replace=is_replace, cte_name_map=cte_name_map + ) + + return CleanResult( + sql=clean_sql, is_replace=is_replace, cte_name_map=cte_name_map + ) + + @staticmethod + def _detect_malformed_with(clean_sql: str) -> None: + """Raise ``ValueError`` if the SQL contains a malformed WITH pattern. + + Detects ``WITH...AS(...) AS `` or + ``WITH...AS(...) AS `` — an extra ``AS`` token + after the CTE body that indicates malformed SQL. + + :param clean_sql: Preprocessed SQL string. + :type clean_sql: str + :raises ValueError: If a malformed WITH pattern is found. + """ + if not re.match(r"\s*WITH\b", clean_sql, re.IGNORECASE): + return + main_kw = r"(?:SELECT|INSERT|UPDATE|DELETE)" + if re.search( + r"\)\s+AS\s+" + main_kw + r"\b", clean_sql, re.IGNORECASE + ) or re.search(r"\)\s+AS\s+\w+\s+" + main_kw + r"\b", clean_sql, re.IGNORECASE): + raise InvalidQueryDefinition( + "Malformed WITH clause — extra AS keyword after CTE body" + ) diff --git a/sql_metadata/table_extractor.py b/sql_metadata/table_extractor.py new file mode 100644 index 00000000..04321536 --- /dev/null +++ b/sql_metadata/table_extractor.py @@ -0,0 +1,461 @@ +"""Extract tables and table aliases from a sqlglot AST. 
+ +The :class:`TableExtractor` class walks the AST for ``exp.Table`` and +``exp.Lateral`` nodes, builds fully-qualified table names (optionally +preserving ``[bracket]`` notation for TSQL), and sorts results by their +first occurrence in the raw SQL so the output order matches left-to-right +reading order. CTE names are excluded from the result so that only *real* +tables are reported. +""" + +import re + +from sqlglot import exp +from sqlglot.dialects.dialect import DialectType + +from sql_metadata.utils import UniqueList + +# --------------------------------------------------------------------------- +# Pure static helpers (no instance state needed) +# --------------------------------------------------------------------------- + + +def _assemble_dotted_name( + catalog: str, db: str, name: str, *, preserve_empty: bool = False +) -> str: + """Assemble a dot-joined table name from catalog, db, and name parts. + + When *preserve_empty* is ``True``, empty segments are kept so that + double-dot notation (e.g. ``server..table``) is preserved. + + .. code-block:: sql + + -- preserve_empty=False (default) + SELECT * FROM mydb.dbo.users -- → "mydb.dbo.users" + -- preserve_empty=True + SELECT * FROM server..users -- → "server..users" + + :param catalog: Catalog / server segment (may be empty). + :param db: Database / schema segment (may be empty). + :param name: Table name segment. + :param preserve_empty: Keep empty segments for double-dot notation. + :returns: Dot-joined name string. + """ + return ".".join( + part for part in [catalog, db, name] if part or preserve_empty + ) + + +def _ident_str(node: exp.Identifier) -> str: + """Return an identifier string, wrapping it in ``[brackets]`` if quoted. + + TSQL uses square brackets for quoting — this helper preserves that + notation so the output matches the original SQL style. + + .. 
code-block:: sql + + SELECT * FROM [dbo].[Users] -- → "[dbo]", "[Users]" + SELECT * FROM dbo.Users -- → "dbo", "Users" + + :param node: An ``exp.Identifier`` AST node. + :returns: The identifier text, optionally bracket-wrapped. + """ + return f"[{node.name}]" if node.quoted else node.name + + +def _collect_node_parts(node: object, parts: list[str]) -> None: + """Append identifier strings from *node* into *parts*. + + Handles both simple ``exp.Identifier`` nodes and ``exp.Dot`` nodes + that contain two identifiers (e.g. ``schema.table``). + + :param node: An AST node — either ``exp.Identifier`` or ``exp.Dot``. + :param parts: Accumulator list to append identifier strings into. + """ + if isinstance(node, exp.Identifier): + # e.g. SELECT * FROM [Users] — single identifier + parts.append(_ident_str(node)) + elif isinstance(node, exp.Dot): + # e.g. SELECT * FROM [dbo].[Users] — dotted pair + for sub in [node.this, node.expression]: + if isinstance(sub, exp.Identifier): + parts.append(_ident_str(sub)) + + +def _bracketed_full_name(table: exp.Table) -> str: + """Build a table name preserving ``[bracket]`` notation from AST nodes. + + Walks the ``catalog``, ``db``, and ``this`` args of an ``exp.Table`` + node, collecting bracket-preserved identifier parts. + + .. code-block:: sql + + SELECT * FROM [mydb].[dbo].[Users] -- → "[mydb].[dbo].[Users]" + SELECT * FROM [Users] -- → "[Users]" + + :param table: An ``exp.Table`` AST node. + :returns: Dot-joined bracket-preserved name, or ``""`` if no parts found. + """ + parts: list[str] = [] + for key in ["catalog", "db", "this"]: + node = table.args.get(key) + if node is not None: + _collect_node_parts(node, parts) + return ".".join(parts) if parts else "" + + +def _ends_with_table_keyword(before: str) -> bool: + """Check whether *before* ends with a table-introducing keyword. + + Used to determine if a table name appears right after ``FROM``, + ``JOIN``, ``TABLE``, ``INTO``, or ``UPDATE``. 
+ + :param before: Upper-cased SQL text preceding the candidate table name. + :returns: ``True`` if the text ends with a table keyword. + """ + return any(before.endswith(kw) for kw in _TABLE_CONTEXT_KEYWORDS) + + +def _is_in_comma_list_after_keyword(before: str) -> bool: + """Check whether a comma-preceded name belongs to a table list. + + Looks backward for the nearest table-introducing keyword (e.g. ``FROM``) + and verifies that no interrupting keyword (e.g. ``WHERE``, ``SELECT``) + appears between it and the comma. This handles multi-table ``FROM`` + clauses. + + .. code-block:: sql + + SELECT * FROM t1, t2, t3 -- t2 and t3 are in comma list after FROM + + :param before: Upper-cased SQL text preceding the comma + candidate name. + :returns: ``True`` if the name is part of a comma-separated table list. + """ + best_kw_pos = -1 + for kw in _TABLE_CONTEXT_KEYWORDS: + kw_pos = before.rfind(kw) + if kw_pos > best_kw_pos: + best_kw_pos = kw_pos + if best_kw_pos < 0: + # no table keyword found at all + return False + between = before[best_kw_pos:] + # e.g. FROM t1 WHERE ... , x — WHERE interrupts, so x is not a table + return not any(ik in between for ik in _INTERRUPTING_KEYWORDS) + + +#: SQL keywords that introduce a table-name context. +_TABLE_CONTEXT_KEYWORDS = {"FROM", "JOIN", "TABLE", "INTO", "UPDATE"} + +#: Keywords that interrupt a comma-separated table list. +_INTERRUPTING_KEYWORDS = {"SELECT", "WHERE", "ORDER", "GROUP", "HAVING", "SET"} + + +# --------------------------------------------------------------------------- +# TableExtractor class +# --------------------------------------------------------------------------- + + +class TableExtractor: + """Extract table names and aliases from a sqlglot AST. + + Encapsulates the raw SQL string and AST needed for position-based + table sorting, bracket-mode detection, and CTE name filtering. + + The extraction pipeline: + + 1. Collect all ``exp.Table`` nodes from the AST. + 2. 
Build fully-qualified names (with bracket preservation for TSQL). + 3. Filter out CTE names so only real tables are reported. + 4. Sort by first occurrence in the raw SQL for left-to-right order. + + :param ast: Root AST node produced by sqlglot. + :param raw_sql: Original SQL string, used for position-based sorting. + :param cte_names: Set of CTE names to exclude from the result. + :param dialect: The dialect used to parse the AST. + """ + + def __init__( + self, + ast: exp.Expression, + raw_sql: str = "", + cte_names: set[str] | None = None, + dialect: DialectType = None, + ): + self._ast = ast + self._raw_sql = raw_sql + self._upper_sql = raw_sql.upper() + self._cte_names = cte_names or set() + + from sql_metadata.dialect_parser import BracketedTableDialect + + self._bracket_mode = isinstance(dialect, type) and issubclass( + dialect, BracketedTableDialect + ) + self._cached_table_nodes: list[exp.Table] | None = None + + # ------------------------------------------------------------------- + # Public API + # ------------------------------------------------------------------- + + def extract(self) -> list[str]: + """Extract table names, excluding CTE definitions. + + For ``CREATE TABLE`` statements, the target table is always placed + first in the result regardless of its position in the SQL text. + All other tables are sorted by their first occurrence in the raw + SQL (left-to-right reading order). + + .. code-block:: sql + + SELECT * FROM users JOIN orders ON ... -- → ["users", "orders"] + CREATE TABLE new_t AS SELECT * FROM src -- → ["new_t", "src"] + + :returns: Ordered list of unique table names. + """ + create_target = None + if isinstance(self._ast, exp.Create): + # e.g. CREATE TABLE t AS SELECT ... 
— extract target first + create_target = self._extract_create_target() + + collected = self._collect_all() + collected_sorted = sorted(collected, key=lambda t: self._first_position(t)) + return UniqueList( + [create_target, *collected_sorted] if create_target + else collected_sorted + ) + + def extract_aliases(self, tables: list[str]) -> dict[str, str]: + """Extract table alias mappings from the AST. + + Walks all ``exp.Table`` nodes and maps each alias back to its + fully-qualified table name, but only if the table appears in the + provided *tables* list. + + .. code-block:: sql + + SELECT u.id FROM users u -- → {"u": "users"} + + :param tables: List of known table names (from :meth:`extract`). + :returns: Mapping of ``{alias: table_name}``. + """ + aliases = {} + for table in self._table_nodes(): + alias = table.alias + if not alias: + # e.g. SELECT * FROM users — no alias, skip + continue + full_name = self._table_full_name(table) + if full_name in tables: + aliases[alias] = full_name + + return aliases + + # ------------------------------------------------------------------- + # Collection helpers + # ------------------------------------------------------------------- + + def _extract_create_target(self) -> str | None: + """Extract the target table name from a ``CREATE TABLE`` statement. + + The ``CREATE`` node's ``this`` arg may be a ``Table`` directly or a + ``Schema`` wrapping one — both cases are handled. + + .. code-block:: sql + + CREATE TABLE my_table (id INT) -- → "my_table" + CREATE TABLE my_table AS SELECT * FROM src -- → "my_table" + + :returns: Target table name, or ``None`` if it cannot be determined. + """ + target = self._ast.this + target_table = ( + # e.g. CREATE TABLE t (col INT) — target.this is Schema, find Table inside + target.find(exp.Table) if not isinstance(target, exp.Table) + # e.g. CREATE TABLE t AS SELECT ... 
— target.this is Table directly + else target + ) + name = self._table_full_name(target_table) + return name or None + + def _collect_all(self) -> UniqueList: + """Collect table names from all ``exp.Table`` AST nodes. + + Iterates over every ``exp.Table`` node, builds the full name, and + filters out CTE names so that only real tables are collected. + + .. code-block:: sql + + WITH cte AS (SELECT 1) SELECT * FROM cte, real_table + -- cte is filtered out → collects only "real_table" + + :returns: :class:`UniqueList` of table names (unsorted). + """ + collected = UniqueList() + for table in self._table_nodes(): + full_name = self._table_full_name(table) + if full_name and full_name not in self._cte_names: + # e.g. FROM users — real table, collect it + collected.append(full_name) + # else: e.g. FROM cte_name — CTE reference, skip + return collected + + def _table_nodes(self) -> list[exp.Table]: + """Return all ``exp.Table`` nodes from the AST (cached). + + Uses ``find_all(exp.Table)`` which performs a DFS traversal, finding + tables in subqueries, CTEs, and joins. Results are cached so + repeated calls (from :meth:`extract_aliases`, :meth:`_collect_all`) + don't re-walk the tree. + + :returns: List of ``exp.Table`` AST nodes. + """ + if self._cached_table_nodes is None: + self._cached_table_nodes = list(self._ast.find_all(exp.Table)) + return self._cached_table_nodes + + # ------------------------------------------------------------------- + # Table name construction + # ------------------------------------------------------------------- + + def _table_full_name(self, table: exp.Table) -> str: + """Build a fully-qualified table name from an ``exp.Table`` node. + + In bracket mode (TSQL), delegates to :func:`_bracketed_full_name` to + preserve ``[square bracket]`` quoting. Otherwise, assembles a + dot-joined name from catalog, db, and name parts. Double-dot + notation (``server..table``) is detected from the raw SQL. + + .. 
code-block:: sql + + SELECT * FROM mydb.dbo.users -- → "mydb.dbo.users" + SELECT * FROM [dbo].[Users] -- (TSQL) → "[dbo].[Users]" + SELECT * FROM server..users -- → "server..users" + + :param table: An ``exp.Table`` AST node. + :returns: Fully-qualified table name string. + """ + name = table.name + + if self._bracket_mode: + # e.g. SELECT * FROM [dbo].[Users] — preserve bracket notation + bracketed = _bracketed_full_name(table) + if bracketed: + return bracketed + + # e.g. SELECT * FROM server..table — detect double-dot in raw SQL + has_double_dot = bool(name and f"..{name}" in self._raw_sql) + return _assemble_dotted_name( + table.catalog, table.db, name, preserve_empty=has_double_dot + ) + + # ------------------------------------------------------------------- + # Position detection + # ------------------------------------------------------------------- + + def _first_position(self, name: str) -> int: + """Find the first occurrence of a table name in a table context. + + Position sorting ensures the output order matches the left-to-right + reading order of the SQL. First tries to find the name after a + table-introducing keyword (``FROM``, ``JOIN``, etc.); if not found, + falls back to any whole-word occurrence; if still not found, returns + the SQL length (pushing unknown names to the end). + + .. code-block:: sql + + SELECT * FROM b JOIN a ON ... -- a at pos ~22, b at pos ~14 → [b, a] + + :param name: Table name to locate. + :returns: Character position (0-based), or ``len(sql)`` if not found. + """ + name_upper = name.upper() + + # try 1: find after a table keyword (FROM, JOIN, etc.) + pos = self._find_word_in_table_context(name_upper) + if pos >= 0: + return pos + + # try 2: find as a bare word anywhere in the SQL + pos = self._find_word(name_upper) + return pos if pos >= 0 else len(self._raw_sql) + + def _find_word_in_table_context(self, name_upper: str) -> int: + """Find a table name that appears after a table-introducing keyword. 
+ + Scans all whole-word occurrences of *name_upper* and returns the + position of the first one that is directly preceded by a table + keyword (``FROM``, ``JOIN``, etc.) or is part of a comma-separated + table list following such a keyword. + + .. code-block:: sql + + SELECT t.id FROM users t -- "users" preceded by FROM → match + SELECT * FROM t1, t2 -- "t2" preceded by comma after FROM → match + SELECT users FROM other -- "users" in SELECT list → no match here + + :param name_upper: Upper-cased table name to search for. + :returns: Position of the match, or ``-1`` if not found in table context. + """ + for match in self._word_pattern(name_upper).finditer(self._upper_sql): + pos: int = int(match.start()) + before = self._upper_sql[:pos].rstrip() + if _ends_with_table_keyword(before): + # e.g. FROM users — directly after table keyword + return pos + if before.endswith(",") and _is_in_comma_list_after_keyword(before): + # e.g. FROM t1, t2 — part of comma-separated list + return pos + return -1 + + def _find_word(self, name_upper: str, start: int = 0) -> int: + """Find *name_upper* as a whole word in the upper-cased SQL. + + Uses a cached regex pattern that respects word boundaries and + handles optionally-quoted segments for dotted names. + + :param name_upper: Upper-cased name to search for. + :param start: Position to start searching from. + :returns: Position of the match, or ``-1`` if not found. + """ + match = self._word_pattern(name_upper).search(self._upper_sql, start) + return int(match.start()) if match else -1 + + _pattern_cache: dict[str, re.Pattern[str]] = {} + + # Optional quote wrappers — cover backticks, single/double quotes, and brackets + _OPT_OPEN_QUOTE = r"""[`"'\[]?""" + _OPT_CLOSE_QUOTE = r"""[`"'\]]?""" + + @staticmethod + def _word_pattern(name_upper: str) -> re.Pattern[str]: + """Build a regex matching *name_upper* as a whole word (cached). 
+ + For qualified names (containing dots), each segment may be optionally + wrapped in backticks, single/double quotes, or brackets — so the + pattern for ``SCHEMA.TABLE`` also matches ``"SCHEMA"."TABLE"``, + ``[SCHEMA].[TABLE]``, or ```SCHEMA`.`TABLE```. + + The pattern is compiled once and cached in a class-level dict for + reuse across calls and instances. + + .. code-block:: sql + + SELECT * FROM schema.table -- matched by SCHEMA.TABLE + SELECT * FROM "schema"."table" -- also matched + SELECT * FROM [schema].[table] -- also matched + + :param name_upper: Upper-cased table name (may contain dots). + :returns: Compiled regex pattern with word-boundary assertions. + """ + pat = TableExtractor._pattern_cache.get(name_upper) + if pat is None: + oq = TableExtractor._OPT_OPEN_QUOTE + cq = TableExtractor._OPT_CLOSE_QUOTE + segments = name_upper.split(".") + inner = r"\.".join( + oq + re.escape(seg) + cq for seg in segments + ) + pat = re.compile(r"(? str: # pragma: no cover - """ - Representation - useful for debugging - """ - repr_str = ["=".join([str(k), str(v)]) for k, v in self.__dict__.items()] - return f"SQLToken({','.join(repr_str)})" - - @property - def normalized(self) -> str: - """ - Property returning uppercase value without end lines and spaces - """ - return self.value.translate(str.maketrans("", "", " \n\t\r")).upper() - - @property - def stringified_token(self) -> str: - """ - Returns string representation with whitespace or not - used to rebuild query - from list of tokens - """ - if self.previous_token: - if ( - self.normalized in [")", ".", ","] - or self.previous_token.normalized in ["(", "."] - or ( - self.is_left_parenthesis - and self.previous_token.normalized - not in RELEVANT_KEYWORDS.union({"*", ",", "IN", "NOTIN"}) - ) - ): - return str(self) - return f" {self}" - return str(self) # pragma: no cover - - @property - def last_keyword_normalized(self) -> str: - """ - Property returning uppercase last keyword without end lines and spaces - """ - 
if self.last_keyword: - return self.last_keyword.translate(str.maketrans("", "", " \n\t\r")).upper() - return "" - - @property - def is_in_parenthesis(self) -> bool: - """ - Property checks if token is surrounded with brackets () - """ - return self.parenthesis_level > 0 - - @property - def is_create_table_columns_definition(self) -> bool: - """ - Checks if given token is inside columns definition in - create table query like: create table name () - """ - open_parenthesis = self.find_nearest_token( - True, value_attribute="is_create_table_columns_declaration_start" - ) - if open_parenthesis is EmptyToken: - return False - close_parenthesis = self.find_nearest_token( - True, - direction="right", - value_attribute="is_create_table_columns_declaration_end", - ) - return ( - open_parenthesis is not EmptyToken and close_parenthesis is not EmptyToken - ) - - @property - def is_keyword_column_name(self) -> bool: - """ - Checks if given keyword can be a column name in SELECT query - """ - return ( - self.is_keyword - and self.normalized not in RELEVANT_KEYWORDS - and self.previous_token.normalized in [",", "SELECT"] - and self.next_token.normalized in [",", "AS", "FROM"] - ) - - @property - def is_alias_without_as(self) -> bool: - """ - Checks if a given token is an alias without as keyword, - like: SELECT col , col2 from table - """ - return ( - self.next_token.normalized in [",", "FROM"] - and self.previous_token.normalized not in ["*", ",", ".", "(", "SELECT"] - and not self.previous_token.is_keyword - and ( - self.last_keyword_normalized == "SELECT" - or self.previous_token.is_column_definition_end - or self.previous_token.is_partition_clause_end - ) - and not self.previous_token.is_comment - ) - - @property - def is_alias_definition(self): - """ - Returns if current token is a definition of an alias. - Note that aliases can also be used in other queries and be a part - of other nested columns with aliases. 
- - Note that this function only check if alias token is a token with - alias definition, it's not suitable for determining IF token is an alias - as it's more complicated and this method would match - also i.e. sub-queries names - """ - return ( - self.is_alias_without_as - or self.previous_token.normalized == "AS" - or self.is_in_with_columns - ) - - @property - def is_alias_of_self(self) -> bool: - """ - Checks if a given token is an alias but at the same time - is also an alias of self, so not really an alias - """ - - end_of_column = self.find_nearest_token( - [",", "FROM"], value_attribute="normalized", direction="right" - ) - while end_of_column.is_in_nested_function: - end_of_column = end_of_column.find_nearest_token( - [",", "FROM"], value_attribute="normalized", direction="right" - ) - return end_of_column.previous_token.normalized == self.normalized - - @property - def is_in_with_columns(self) -> bool: - """ - Checks if token is inside with colums part of a query - """ - return ( - self.find_nearest_token("(").is_with_columns_start - and self.find_nearest_token(")", direction="right").is_with_columns_end - ) - - @property - def is_wildcard_not_operator(self): - """ - Determines if * encountered in query is a wildcard like select <*> from aa - or is that an operator like Select aa <*> bb as cc from dd - """ - return self.normalized == "*" and ( - self.previous_token.value in [",", ".", "SELECT"] - or (self.previous_token.value == "(") - and self.next_token.value == ")" - ) - - @property - def is_potential_table_name(self) -> bool: - """ - Checks if token is a possible candidate for table name - """ - return ( - (self.is_name or self.is_keyword) - and self.last_keyword_normalized in TABLE_ADJUSTMENT_KEYWORDS - and self.previous_token.normalized not in ["AS", "WITH"] - and self.normalized - not in ["AS", "SELECT", "IF", "SET", "WITH", "IFNOTEXISTS"] - ) - - @property - def is_with_statement_nested_in_subquery(self) -> bool: - """ - Checks if token is with 
statement nested in subquery - """ - return ( - self.normalized == "WITH" - and self.previous_token.is_left_parenthesis - and self.get_nth_previous(2).normalized == "FROM" - ) - - @property - def is_alias_of_table_or_alias_of_subquery(self) -> bool: - """ - Checks if token is alias of table or alias of subquery - - It's not a list of tables, e.g. SELECT * FROM foo, bar - hence, it can be the case of alias without AS, e.g. SELECT * FROM foo bar - or an alias of subquery (SELECT * FROM foo) bar - """ - is_alias_without_as = ( - self.previous_token.normalized != self.last_keyword_normalized - and not self.previous_token.is_punctuation - and not self.previous_token.normalized == "IFNOTEXISTS" - ) - return is_alias_without_as or self.previous_token.is_right_parenthesis - - @property - def is_a_wildcard_in_select_statement(self) -> bool: - """ - Checks if token is a wildcard in select statement - - Handle * wildcard in select part, but ignore count(*) - """ - return ( - self.is_wildcard - and self.last_keyword_normalized == "SELECT" - and not self.previous_token.is_left_parenthesis - ) - - @property - def is_potential_column_name(self) -> bool: - """ - Checks if token is a potential column name - """ - return ( - self.last_keyword_normalized in KEYWORDS_BEFORE_COLUMNS - and self.previous_token.normalized not in ["AS", ")"] - and not self.is_alias_without_as - ) - - @property - def is_conversion_specifier(self) -> bool: - """ - Checks if token is a format or data type in cast or convert - """ - return ( - self.previous_token.normalized in ["AS", "USING"] - and self.is_in_nested_function - ) - - @property - def is_column_name_inside_insert_clause(self) -> bool: - """ - Checks if token is a column name inside insert clause, - e.g. INSERT INTO `foo` (col1, `col2`) VALUES (..) 
- """ - return ( - self.last_keyword_normalized == "INTO" - and self.previous_token.is_punctuation - ) - - @property - def is_potential_alias(self) -> bool: - """ - Checks if given token can possibly be an alias - """ - return self.is_name or ( - self.is_keyword - and self.previous_token.normalized == "AS" - and self.last_keyword_normalized == "SELECT" - ) - - @property - def is_a_valid_alias(self) -> bool: - """ - Checks if given token meets the alias criteria - """ - return ( - self.last_keyword_normalized in KEYWORDS_BEFORE_COLUMNS - and self.normalized not in ["DIV"] - and self.is_alias_definition - and not self.is_in_nested_function - or self.is_in_with_columns - ) - - @property - def next_token_not_comment(self): - """ - Property returning next non-comment token - """ - if self.next_token and self.next_token.is_comment: - return self.next_token.next_token_not_comment - return self.next_token - - @property - def previous_token_not_comment(self): - """ - Property returning previous non-comment token - """ - if self.previous_token and self.previous_token.is_comment: - return self.previous_token.previous_token_not_comment - return self.previous_token - - def is_constraint_definition_inside_create_table_clause( - self, query_type: str - ) -> bool: - """ - Checks if token is constraint definition inside create table clause - - Used to handle CREATE TABLE queries (#35) to skip keyword that are withing - parenthesis-wrapped list of column - """ - return ( - query_type == QueryType.CREATE.value - and self.is_in_parenthesis - and self.is_create_table_columns_definition - ) - - def is_columns_alias_of_with_query_or_column_in_insert_query( - self, with_names: List[str] - ) -> bool: - """ - Check if token is column alias of with query or column in insert query - - We are in of INSERT INTO
(), - or columns of with statement: with () as ... - """ - return self.is_in_parenthesis and ( - self.find_nearest_token("(").previous_token.value in with_names - or self.last_keyword_normalized == "INTO" - ) - - def is_sub_query_alias(self, subqueries_names: List[str]) -> bool: - """ - Checks for aliases of sub-queries i.e.: SELECT from (...) - """ - return ( - self.previous_token.is_right_parenthesis and self.value in subqueries_names - ) - - def is_with_query_name(self, with_names: List[str]) -> bool: - """ - checks for names of the with queries as (subquery) - """ - return self.next_token.normalized == "AS" and self.value in with_names - - def is_sub_query_name_or_with_name_or_function_name( - self, sub_queries_names: List[str], with_names: List[str] - ) -> bool: - """ - Check for non applicable names: with, subquery or custom function - """ - return ( - self.is_sub_query_alias(subqueries_names=sub_queries_names) - or self.is_with_query_name(with_names=with_names) - or self.next_token.is_left_parenthesis - ) - - def is_not_an_alias_or_is_self_alias_outside_of_subquery( - self, columns_aliases_names: List[str], max_subquery_level: Dict - ) -> bool: - """ - Checks if token is not alias or alias of self outside of sub query - """ - return ( - self.value not in columns_aliases_names - or self.token_is_alias_of_self_not_from_subquery( - aliases_levels=max_subquery_level - ) - or self.token_name_is_same_as_alias_not_from_subquery( - aliases_levels=max_subquery_level - ) - ) - - def is_table_definition_suffix_in_non_select_create_table( - self, query_type: str - ) -> bool: - """ - Checks if we are after create table definition. - - Ignore annotations outside the parenthesis with the list of columns - e.g. 
) CHARACTER SET utf8; - """ - return ( - query_type == QueryType.CREATE - and not self.is_in_parenthesis - and self.find_nearest_token("SELECT", value_attribute="normalized") - is EmptyToken - ) - - def is_column_definition_inside_create_table(self, query_type: str) -> bool: - """ - Checks for column names in create table - - Previous token is either ( or , -> indicates the column name - """ - return ( - query_type == QueryType.CREATE - and self.is_in_parenthesis - and self.previous_token.is_punctuation - and self.last_keyword_normalized == "TABLE" - ) - - def is_potential_column_alias( - self, columns_aliases_names: List[str], column_aliases: Dict - ) -> bool: - """ - Checks if column can be an alias - """ - return ( - self.value in columns_aliases_names - and self.value not in column_aliases - and not self.previous_token.is_nested_function_start - and self.is_alias_definition - ) - - def token_is_alias_of_self_not_from_subquery(self, aliases_levels: Dict) -> bool: - """ - Checks if token is also an alias, but is an alias of self that is not - coming from a subquery, that means it's a valid column - """ - return ( - self.last_keyword_normalized == "SELECT" - and self.is_alias_of_self - and self.subquery_level == aliases_levels[self.value] - ) - - def token_name_is_same_as_alias_not_from_subquery( - self, aliases_levels: Dict - ) -> bool: - """ - Checks if token is also an alias, but is an alias of self that is not - coming from a subquery, that means it's a valid column - """ - return ( - self.last_keyword_normalized == "SELECT" - and self.next_token.normalized == "AS" - and self.subquery_level == aliases_levels[self.value] - ) - - def table_prefixed_column(self, table_aliases: Dict) -> str: - """ - Substitutes table alias with actual table name - """ - value = self.value - if "." 
in value: - parts = value.split(".") - if len(parts) > 4: # pragma: no cover - raise ValueError(f"Wrong columns name: {value}") - parts[0] = table_aliases.get(parts[0], parts[0]) - value = ".".join(parts) - return value - - def get_nth_previous(self, level: int) -> "SQLToken": - """ - Function iterates previous tokens getting nth previous token - """ - assert level >= 1 - if self.previous_token: - if level > 1: - return self.previous_token.get_nth_previous(level=level - 1) - return self.previous_token - return EmptyToken # pragma: no cover - - def find_nearest_token( - self, - value: Union[Union[str, bool], List[Union[str, bool]]], - direction: str = "left", - value_attribute: str = "value", - ) -> "SQLToken": - """ - Returns token with given value to the left or right. - If value is not found it returns EmptyToken. - """ - if not isinstance(value, list): - value = [value] - attribute = "previous_token" if direction == "left" else "next_token" - token = self - while getattr(token, attribute): - tok_value = getattr(getattr(token, attribute), value_attribute) - if tok_value in value: - return getattr(token, attribute) - token = getattr(token, attribute) - return EmptyToken - - -EmptyToken = SQLToken() # pylint: disable=invalid-name diff --git a/sql_metadata/utils.py b/sql_metadata/utils.py index ccde60a4..df6d0453 100644 --- a/sql_metadata/utils.py +++ b/sql_metadata/utils.py @@ -1,35 +1,59 @@ -""" -Module with various utils +"""Utility classes and functions shared across the sql-metadata package. + +Provides ``UniqueList``, a deduplicating list used to collect columns, +tables, aliases, and CTE names while preserving insertion order, and +a ``last_segment`` helper for qualified name handling. """ -from typing import Any, List, Sequence +from typing import Any, Iterable +#: Placeholder used to encode dots in qualified CTE names so that sqlglot +#: does not misinterpret ``db.cte_name`` as a table reference. 
+DOT_PLACEHOLDER = "__DOT__" -class UniqueList(list): - """ - List that keeps it's items unique + +class UniqueList(list[str]): + """A list subclass that silently rejects duplicate items. + + Used throughout the extraction pipeline (``_extract.py``, ``parser.py``) + to collect columns, tables, aliases, CTE names, and subquery names while + guaranteeing uniqueness and preserving first-insertion order. Maintains + an internal ``set`` for O(1) membership checks. """ + def __init__(self, iterable: Any = None, **kwargs: Any) -> None: + self._seen: set[str] = set() + if iterable is not None: + super().__init__(**kwargs) + self.extend(iterable) + else: + super().__init__(**kwargs) + self._seen = set(self) + def append(self, item: Any) -> None: - if item not in self: + """Append *item* only if it is not already present (O(1) check).""" + if item not in self._seen: + self._seen.add(item) super().append(item) - def extend(self, items: Sequence[Any]) -> None: + def extend(self, items: Iterable[Any]) -> None: # type: ignore[override] + """Extend the list with *items*, skipping duplicates.""" for item in items: self.append(item) - def __sub__(self, other) -> List: - return [x for x in self if x not in other] + def __contains__(self, item: Any) -> bool: + """O(1) membership check using the internal set.""" + return item in self._seen + + def __sub__(self, other: Any) -> list[str]: + """Return a plain list of elements in *self* that are not in *other*.""" + other_set = set(other) + return [x for x in self if x not in other_set] + + + +def last_segment(name: str) -> str: + """Return the last dot-separated segment of a qualified name.""" + return name.rsplit(".", 1)[-1] -def flatten_list(input_list: List) -> List[str]: - """ - Flattens list of string and lists if there are nested lists. 
- """ - result = [] - for item in input_list: - if isinstance(item, list): - result.extend(flatten_list(item)) - else: - result.append(item) - return result diff --git a/test/test_aliases.py b/test/test_aliases.py index 1d822fde..97d0d656 100644 --- a/test/test_aliases.py +++ b/test/test_aliases.py @@ -16,9 +16,9 @@ def test_get_query_table_aliases(): assert Parser( "SELECT bar AS value FROM foo AS f INNER JOIN dimensions AS d ON f.id = d.id" ).tables_aliases == {"f": "foo", "d": "dimensions"} - assert ( - Parser("SELECT e.foo FROM (SELECT * FROM bar) AS e").tables_aliases == {} - ), "Sub-query aliases are ignored" + assert Parser("SELECT e.foo FROM (SELECT * FROM bar) AS e").tables_aliases == {}, ( + "Sub-query aliases are ignored" + ) assert Parser( "SELECT a.* FROM product_a AS a " "JOIN product_b AS b ON a.ip_address = b.ip_address" @@ -44,3 +44,27 @@ def test_tables_aliases_are_resolved(): "users1.ip_address", "users2.ip_address", ] + + +def test_column_alias_same_as_join_table_alias(): + # solved: https://github.com/macbre/sql-metadata/issues/424 + query = """ + SELECT + dependent_schema.name as dependent_schema, + relationships.dependent_name as dependent_name + FROM relationships + JOIN schema AS dependent_schema + ON relationships.dependent_schema_id = dependent_schema.id + JOIN schema AS referenced_schema + ON relationships.referenced_schema_id = referenced_schema.id + GROUP BY dependent_schema, dependent_name + ORDER BY dependent_schema, dependent_name + """ + parser = Parser(query) + assert parser.tables == ["relationships", "schema"] + assert parser.tables_aliases == { + "dependent_schema": "schema", + "referenced_schema": "schema", + } + assert "schema.name" in parser.columns + assert "relationships.dependent_name" in parser.columns diff --git a/test/test_alter.py b/test/test_alter.py index 572dba2c..69c8188a 100644 --- a/test/test_alter.py +++ b/test/test_alter.py @@ -11,3 +11,9 @@ def test_alter_table_indices_index(): parser = Parser("ALTER TABLE 
foo_table ADD INDEX `idx_foo` (`bar`);") assert parser.query_type == QueryType.ALTER assert parser.tables == ["foo_table"] + + +def test_alter_table_add_column(): + """ALTER TABLE ADD COLUMN is parsed correctly.""" + p = Parser("ALTER TABLE t ADD COLUMN new_col INT") + assert p.query_type == "ALTER TABLE" diff --git a/test/test_column_aliases.py b/test/test_column_aliases.py index d0a1d336..bf6e4d06 100644 --- a/test/test_column_aliases.py +++ b/test/test_column_aliases.py @@ -24,15 +24,10 @@ def test_column_aliases_with_subquery(): """ parser = Parser(query) assert parser.tables == ["data_contracts_report"] - assert parser.subqueries_names == ["sq2", "sq"] - assert parser.subqueries == { - "sq": "SELECT count(C2) as C2Count, BusinessSource, yearweek(Start1) Start1, " - "yearweek(End1) End1 from (SELECT ContractID as C2, BusinessSource, " - "StartDate as Start1, EndDate as End1 from data_contracts_report) sq2 " - "group by 2, 3, 4", - "sq2": "SELECT ContractID as C2, BusinessSource, StartDate as Start1, EndDate " - "as End1 from data_contracts_report", - } + assert parser.subqueries_names == ["sq2", "sq", "subquery_1"] + assert "sq" in parser.subqueries + assert "sq2" in parser.subqueries + assert "subquery_1" in parser.subqueries assert parser.columns == [ "SignDate", "BusinessSource", diff --git a/test/test_comments.py b/test/test_comments.py index 9a93bb5a..16db5f99 100644 --- a/test/test_comments.py +++ b/test/test_comments.py @@ -155,61 +155,37 @@ def test_inline_comments_with_hash(): assert parser.comments == [] -def test_next_token_not_comment_single(): - query = """ - SELECT column_1 -- comment_1 - FROM table_1 - """ +def test_without_comments_for_multiline_query(): + query = """SELECT * -- comment + FROM table + WHERE table.id = '123'""" parser = Parser(query) - column_1_tok = parser.tokens[1] - - assert column_1_tok.next_token.is_comment - assert not column_1_tok.next_token_not_comment.is_comment - assert column_1_tok.next_token.next_token == 
column_1_tok.next_token_not_comment + assert parser.without_comments == """SELECT * FROM table WHERE table.id = '123'""" -def test_next_token_not_comment_multiple(): - query = """ - SELECT column_1 -- comment_1 - - /* - comment_2 - */ - - # comment_3 - FROM table_1 - """ +def test_table_after_comment_not_ignored(): + # solved: https://github.com/macbre/sql-metadata/issues/251 + query = """SELECT c1 FROM + --Comment-- + d1, d2, d3""" parser = Parser(query) - column_1_tok = parser.tokens[1] + assert parser.tables == ["d1", "d2", "d3"] + assert parser.columns == ["c1"] + assert parser.columns_dict == {"select": ["c1"]} - assert column_1_tok.next_token.is_comment - assert column_1_tok.next_token.next_token.is_comment - assert column_1_tok.next_token.next_token.next_token.is_comment - assert not column_1_tok.next_token_not_comment.is_comment - assert ( - column_1_tok.next_token.next_token.next_token.next_token - == column_1_tok.next_token_not_comment - ) +def test_extract_comments_empty_string(): + """Extracting comments from empty SQL returns empty list.""" + assert Parser("").comments == [] -def test_next_token_not_comment_on_non_comments(): - query = """ - SELECT column_1 - FROM table_1 - """ - parser = Parser(query) - select_tok = parser.tokens[0] - assert select_tok.next_token == select_tok.next_token_not_comment - assert ( - select_tok.next_token.next_token - == select_tok.next_token_not_comment.next_token_not_comment - ) +def test_strip_comments_empty_string(): + """Stripping comments from empty SQL returns empty string.""" + assert Parser("").without_comments == "" -def test_without_comments_for_multiline_query(): - query = """SELECT * -- comment - FROM table - WHERE table.id = '123'""" - parser = Parser(query) - assert parser.without_comments == """SELECT * FROM table WHERE table.id = '123'""" +def test_strip_comments_for_parsing_empty(): + """SqlCleaner handles empty strings via strip_comments_for_parsing.""" + from sql_metadata.comments import 
strip_comments_for_parsing + + assert strip_comments_for_parsing("") == "" diff --git a/test/test_compat.py b/test/test_compat.py deleted file mode 100644 index 3883774f..00000000 --- a/test/test_compat.py +++ /dev/null @@ -1,61 +0,0 @@ -from sqlparse.tokens import Punctuation, Wildcard - -from sql_metadata.compat import ( - get_query_columns, - get_query_tables, - get_query_limit_and_offset, - generalize_sql, - preprocess_query, - get_query_tokens, -) - - -def test_get_query_columns(): - assert ["*"] == get_query_columns("SELECT * FROM `test_table`") - assert ["foo", "id"] == get_query_columns( - "SELECT foo, count(*) as bar FROM `test_table` WHERE id = 3" - ) - - -def test_get_query_tables(): - assert ["test_table"] == get_query_tables("SELECT * FROM `test_table`") - assert ["test_table", "second_table"] == get_query_tables( - "SELECT foo FROM test_table, second_table WHERE id = 1" - ) - - -def test_get_query_limit_and_offset(): - assert (200, 927600) == get_query_limit_and_offset( - "SELECT * FOO foo LIMIT 927600,200" - ) - - -def test_generalize_sql(): - assert generalize_sql() is None - assert "SELECT * FROM foo;" == generalize_sql("SELECT * FROM foo;") - assert "SELECT * FROM foo WHERE id = N" == generalize_sql( - "SELECT * FROM foo WHERE id = 123" - ) - assert "SELECT test FROM foo" == generalize_sql("SELECT /* foo */ test FROM foo") - - -def test_preprocess_query(): - assert "SELECT * FROM foo WHERE id = 123" == preprocess_query( - "SELECT * FROM foo WHERE id = 123" - ) - assert "SELECT /* foo */ test FROM `foo`.`bar`" == preprocess_query( - "SELECT /* foo */ test\nFROM `foo`.`bar`" - ) - - -def test_get_query_tokens(): - tokens = get_query_tokens("SELECT * FROM foo;") - assert len(tokens) == 5 - - assert tokens[0].normalized == "SELECT" - assert tokens[1].ttype is Wildcard - assert tokens[2].normalized == "FROM" - assert tokens[3].normalized == "foo" - assert tokens[4].ttype is Punctuation - - assert [] == get_query_tokens("") diff --git 
a/test/test_create_table.py b/test/test_create_table.py index 6c065d75..2e5d1fec 100644 --- a/test/test_create_table.py +++ b/test/test_create_table.py @@ -1,11 +1,11 @@ import pytest -from sql_metadata import Parser +from sql_metadata import InvalidQueryDefinition, Parser from sql_metadata.keywords_lists import QueryType def test_is_create_table_query(): - with pytest.raises(ValueError): + with pytest.raises(InvalidQueryDefinition): assert Parser("BEGIN").query_type assert Parser("SELECT * FROM `foo` ()").query_type == QueryType.SELECT @@ -78,7 +78,7 @@ def test_creating_table_as_select_with_with_clause(): parser = Parser(qry) assert parser.query_type == QueryType.CREATE assert parser.with_names == ["sub"] - assert parser.with_queries == {"sub": "select it_id from internal_table"} + assert parser.with_queries == {"sub": "SELECT it_id FROM internal_table"} assert parser.columns == [ "it_id", "*", @@ -170,3 +170,75 @@ def test_create_temporary_table(): assert parser.query_type == QueryType.CREATE assert parser.tables == ["new_tbl", "orig_tbl"] assert parser.columns == ["*"] + + +def test_create_index_extracts_table(): + """CREATE INDEX correctly extracts the target table.""" + p = Parser("CREATE INDEX idx ON t (col)") + assert "t" in p.tables + + +def test_create_table_with_columns_only(): + """CREATE TABLE with column definitions (no SELECT) extracts columns.""" + p = Parser("CREATE TABLE users (id INT, name VARCHAR(100), active BOOL)") + assert p.columns == ["id", "name", "active"] + assert p.tables == ["users"] + + +def test_create_table_with_column_defs_and_select(): + """CREATE TABLE with both column definitions and AS SELECT.""" + p = Parser("CREATE TABLE t (id INT) AS SELECT a FROM t2") + assert p.columns == ["id", "a"] + assert p.tables == ["t", "t2"] + + +def test_ctas_with_redshift_distkey_sortkey(): + # Solved: https://github.com/macbre/sql-metadata/issues/367 + p = Parser( + "CREATE TABLE my_table distkey(col1) sortkey(col1, col3) " + "AS SELECT col1, 
col2, col3 FROM source_table" + ) + assert p.tables == ["my_table", "source_table"] + assert p.columns == ["col1", "col2", "col3"] + + +def test_create_table_mysql_charset_and_collate(): + # Solved: https://github.com/macbre/sql-metadata/issues/358 + p = Parser("""CREATE TABLE `jeecg_order_main` ( + `id` varchar(32) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL, + `order_code` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL, + `order_date` datetime NULL DEFAULT NULL, + `order_money` double(10, 3) NULL DEFAULT NULL, + `bpm_status` varchar(3) CHARACTER SET utf8 COLLATE utf8_general_ci NULL, + PRIMARY KEY (`id`) USING BTREE + ) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci""") + assert p.tables == ["jeecg_order_main"] + assert p.columns == ["id", "order_code", "order_date", "order_money", "bpm_status"] + + +def test_create_table_with_comments_and_keyword_columns(): + # Solved: https://github.com/macbre/sql-metadata/issues/507 + p = Parser(""" + CREATE TABLE accounts ( + id INTEGER, /* comment */ + username TEXT UNIQUE, + status TEXT, + online_at INTEGER, + hash TEXT UNIQUE, + uid TEXT UNIQUE, + test INTEGER, + usage INTEGER, + PRIMARY KEY (id) + ) + """) + assert p.tables == ["accounts"] + assert p.columns == [ + "id", "username", "status", "online_at", "hash", "uid", "test", "usage" + ] + + +def test_create_table_as_select_with_cte_same_name(): + """CREATE TABLE target should be reported even when a CTE shares its name.""" + query = "CREATE TABLE foo AS WITH foo AS (SELECT 1 as id) SELECT * FROM foo" + parser = Parser(query) + assert parser.tables == ["foo"] diff --git a/test/test_edge_cases.py b/test/test_edge_cases.py new file mode 100644 index 00000000..9e685ef3 --- /dev/null +++ b/test/test_edge_cases.py @@ -0,0 +1,23 @@ +"""Edge-case tests for internals not covered by feature-specific test files.""" + +from sql_metadata.sql_cleaner import SqlCleaner +from sql_metadata.utils import UniqueList + + +def 
test_unique_list_subtraction(): + """UniqueList.__sub__ returns elements not present in the other list.""" + ul = UniqueList(["a", "b", "c", "d"]) + result = ul - ["b", "d"] + assert result == ["a", "c"] + + +def test_unique_list_deduplicates_on_init(): + """UniqueList removes duplicates when constructed from an iterable.""" + ul = UniqueList(["x", "y", "x", "z", "y"]) + assert list(ul) == ["x", "y", "z"] + + +def test_clean_empty_after_paren_strip(): + """SQL that becomes empty after outer-paren stripping.""" + result = SqlCleaner.clean("(())") + assert result.sql is None diff --git a/test/test_getting_columns.py b/test/test_getting_columns.py index 902e4e51..8ebad638 100644 --- a/test/test_getting_columns.py +++ b/test/test_getting_columns.py @@ -113,6 +113,52 @@ def test_columns_with_order_by(): "foo", "id", ] + # Star inside COUNT(*) in ORDER BY should not be extracted as a column + assert Parser( + "SELECT dept FROM employees GROUP BY dept ORDER BY COUNT(*) DESC" + ).columns == ["dept"] + + +def test_output_columns(): + # Solved: https://github.com/macbre/sql-metadata/issues/468 + parser = Parser("""SELECT + dj.field_1, + cardinality(dj.field_1) as field_1_count, + dj.field_2, + cardinality(dj.field_2) as field_2_count, + dj.field_3 as field_3 + FROM dj""") + assert parser.output_columns == [ + "dj.field_1", "field_1_count", "dj.field_2", "field_2_count", "field_3" + ] + + # Simple alias + assert Parser("SELECT a, b AS c FROM t").output_columns == ["a", "c"] + + # Star + assert Parser("SELECT * FROM t").output_columns == ["*"] + + # Self-alias preserves original name + assert Parser("SELECT a AS a FROM t").output_columns == ["a"] + + # Non-SELECT query returns empty list + assert Parser("CREATE TABLE t (id INT)").output_columns == [] + + # Solved: https://github.com/macbre/sql-metadata/issues/421 + # Window function alias resolved in output_columns + parser = Parser("""SELECT + DATE_TRUNC('month', o.order_date) AS month, + c.customer_id, + SUM(oi.quantity * 
oi.unit_price) AS revenue, + ROW_NUMBER() OVER (PARTITION BY c.customer_id + ORDER BY SUM(oi.quantity * oi.unit_price) DESC) AS revenue_rank + FROM orders o + JOIN customers c ON o.customer_id = c.customer_id + JOIN order_items oi ON o.order_id = oi.order_id""") + assert parser.output_columns == [ + "month", "customers.customer_id", "revenue", "revenue_rank" + ] + assert "revenue_rank" in parser.columns_aliases def test_update_and_replace(): @@ -304,6 +350,24 @@ def test_columns_and_sql_functions(): ).columns == ["col", "col2", "col3", "col4", "col5"] +def test_odbc_escape_function(): + # Solved: https://github.com/macbre/sql-metadata/issues/391 + parser = Parser( + "SELECT Calendar_year_lookup.Yr, " + "{fn concat('Q', Calendar_year_lookup.Qtr)}, " + "sum(Shop_facts.Amount_sold) " + "FROM Calendar_year_lookup, Shop_facts " + "GROUP BY Calendar_year_lookup.Yr, " + "{fn concat('Q', Calendar_year_lookup.Qtr)}" + ) + assert parser.tables == ["Calendar_year_lookup", "Shop_facts"] + assert parser.columns == [ + "Calendar_year_lookup.Yr", + "Calendar_year_lookup.Qtr", + "Shop_facts.Amount_sold", + ] + + def test_columns_starting_with_keywords(): query = """ SELECT `schema_name`, full_table_name, `column_name`, `catalog_name`, @@ -533,7 +597,7 @@ def test_double_inner_join(): parser = Parser(query) assert "loan.account_id" in parser.columns - assert parser.tables == ["loan", "account"] + assert parser.tables == ["loan", "account", "district"] def test_keyword_column_source(): @@ -555,3 +619,157 @@ def test_keyword_column_source(): # Test with 'source' as only column parser = Parser("select source from my_table") assert parser.columns == ["source"] + + +def test_sum_case_when_columns(): + # solved: https://github.com/macbre/sql-metadata/issues/579 + query = """ + SELECT CAST( + SUM(CASE WHEN segment = 'Premium' THEN 1 ELSE 0 END) AS REAL) * 100 / + COUNT(*) AS premiumpercentage + FROM gasstations WHERE country = 'SVK'""" + parser = Parser(query) + assert parser.columns == 
["segment", "country"] + assert parser.columns_dict == {"select": ["segment"], "where": ["country"]} + assert parser.tables == ["gasstations"] + + +def test_quoted_column_with_whitespace(): + # solved: https://github.com/macbre/sql-metadata/issues/578 + query = ( + """SELECT COUNT(*) FROM examination WHERE "Examination Date" > '1997-01-01'""" + ) + parser = Parser(query) + assert parser.columns == ["Examination Date"] + assert parser.columns_dict == {"where": ["Examination Date"]} + assert parser.tables == ["examination"] + + +def test_coalesce_in_joins(): + # solved: https://github.com/macbre/sql-metadata/issues/559 + query = """ + select OPR.ID, OPR.year from operations OPR + INNER JOIN my_db_name.ipps_wage_index_annual WI ON OPR.year = WI.cms_year + INNER JOIN my_db_name.geo_county_cbsa CBS + ON WI.cbsa_cd = COALESCE(CBS.metropolitan_division_code, CBS.cbsa_code, SUBSTRING(CBS.ssa_codes, 1, 2))""" + parser = Parser(query) + assert parser.columns == [ + "operations.ID", + "operations.year", + "my_db_name.ipps_wage_index_annual.cms_year", + "my_db_name.ipps_wage_index_annual.cbsa_cd", + "my_db_name.geo_county_cbsa.metropolitan_division_code", + "my_db_name.geo_county_cbsa.cbsa_code", + "my_db_name.geo_county_cbsa.ssa_codes", + ] + assert parser.columns_dict == { + "join": [ + "operations.year", + "my_db_name.ipps_wage_index_annual.cms_year", + "my_db_name.ipps_wage_index_annual.cbsa_cd", + "my_db_name.geo_county_cbsa.metropolitan_division_code", + "my_db_name.geo_county_cbsa.cbsa_code", + "my_db_name.geo_county_cbsa.ssa_codes", + ], + "select": ["operations.ID", "operations.year"], + } + assert parser.tables == [ + "operations", + "my_db_name.ipps_wage_index_annual", + "my_db_name.geo_county_cbsa", + ] + + +def test_uid_pad_parsed_as_columns(): + # solved: https://github.com/macbre/sql-metadata/issues/412 + parser = Parser("SELECT * FROM t1 WHERE uid = 4") + assert parser.tables == ["t1"] + assert parser.columns == ["*", "uid"] + assert parser.columns_dict == 
{"select": ["*"], "where": ["uid"]} + + parser2 = Parser("SELECT * FROM t1 WHERE pad = 4") + assert parser2.tables == ["t1"] + assert parser2.columns == ["*", "pad"] + assert parser2.columns_dict == {"select": ["*"], "where": ["pad"]} + + +def test_dateadd_unit_not_column(): + # solved: https://github.com/macbre/sql-metadata/issues/411 + query = """ + SELECT + dateadd(dd, 30, DateReleased), + dateadd(WK, 2, DateReleased) + FROM test a + """ + parser = Parser(query) + assert parser.tables == ["test"] + assert parser.columns == ["DateReleased"] + assert parser.tables_aliases == {"a": "test"} + assert parser.columns_dict == {"select": ["DateReleased"]} + + +def test_backtick_column_with_operation(): + # solved: https://github.com/macbre/sql-metadata/issues/448 + query = "SELECT `col1 with space` / `col2_anything` FROM table1" + parser = Parser(query) + assert parser.tables == ["table1"] + assert parser.columns == ["col1 with space", "col2_anything"] + assert parser.columns_dict == { + "select": ["col1 with space", "col2_anything"], + } + + +def test_separator_not_column(): + # solved: https://github.com/macbre/sql-metadata/issues/400 + query = """ + SELECT JoinedMonth, + group_concat( + distinct FirstName + order by FirstName + separator '/') as FirstName + FROM customers + GROUP BY JoinedMonth + """ + parser = Parser(query) + assert parser.columns == ["JoinedMonth", "FirstName"] + columns_lower = [c.lower() for c in parser.columns] + assert "separator" not in columns_lower + + +def test_mssql_top_columns(): + # solved: https://github.com/macbre/sql-metadata/issues/318 + query = "SELECT TOP 10 id, name FROM foo" + parser = Parser(query) + assert parser.tables == ["foo"] + assert parser.columns == ["id", "name"] + assert parser.columns_dict == {"select": ["id", "name"]} + + +def test_columns_regex_fallback_on_invalid_insert(): + """Invalid INSERT falls back to regex for column extraction.""" + p = Parser("INSERT INTO t (col1, col2, col3) GARBAGE GARBAGE GARBAGE") + 
assert p.columns == ["col1", "col2", "col3"] + + +def test_columns_via_regex_on_completely_invalid_sql(): + """Totally invalid SQL with INTO...(cols) pattern uses regex fallback.""" + p = Parser("INTO tbl (col_a, col_b) FROM TO WHERE") + assert p.columns == ["col_a", "col_b"] + + +def test_cte_with_more_column_aliases_than_body(): + """CTE defines more column names than the body SELECT produces.""" + p = Parser( + "WITH cte(a, b, c) AS (SELECT x FROM t) " + "SELECT a FROM cte" + ) + assert "a" in p.columns_aliases_names + + +def test_cte_with_table_star_in_body(): + """CTE body uses table.* — exercises _flat_columns with table-qualified star.""" + p = Parser( + "WITH cte(a) AS (SELECT t.* FROM t) " + "SELECT a FROM cte" + ) + assert "t.*" in p.columns or "a" in p.columns_aliases_names diff --git a/test/test_getting_tables.py b/test/test_getting_tables.py index d6617037..d50f362b 100644 --- a/test/test_getting_tables.py +++ b/test/test_getting_tables.py @@ -286,11 +286,16 @@ def test_table_name_with_group_by(): == expected_tables ) - assert Parser(""" + assert ( + Parser( + """ SELECT s.cust_id,count(s.cust_id) FROM SH.sales s GROUP BY s.cust_id HAVING s.cust_id != '1660' AND s.cust_id != '2' - """.strip()).tables == expected_tables + """.strip() + ).tables + == expected_tables + ) def test_datasets(): @@ -777,3 +782,174 @@ def test_subquery_followed_by_tables(): "customer_address", "customer", ] + + +def test_joined_on_datetrunc(): + # solved: https://github.com/macbre/sql-metadata/issues/555 + query = """SELECT * + FROM test t + join test_1 t1 + on datetrunc('day', t.test_date) = datetrunc('day', t1.test_date)""" + parser = Parser(query) + assert parser.tables == ["test", "test_1"] + assert parser.columns == ["*", "test.test_date", "test_1.test_date"] + + +def test_ifnull_in_on_clause(): + # solved: https://github.com/macbre/sql-metadata/issues/534 + query = ( + "SELECT * FROM table1 a " + "LEFT JOIN table2 b ON ifnull(a.col1, '') = ifnull(b.col1, '')" + ) + 
parser = Parser(query) + assert parser.tables == ["table1", "table2"] + assert parser.columns == ["*", "table1.col1", "table2.col1"] + assert parser.tables_aliases == {"a": "table1", "b": "table2"} + assert parser.columns_dict == { + "select": ["*"], + "join": ["table1.col1", "table2.col1"], + } + + +def test_nvl_in_join_condition(): + # solved: https://github.com/macbre/sql-metadata/issues/446 + query = "SELECT 1 FROM t1 JOIN t2 ON t1.t2_id = nvl(t2.id, t2.uid)" + parser = Parser(query) + assert parser.tables == ["t1", "t2"] + assert parser.columns == ["t1.t2_id", "t2.id", "t2.uid"] + assert parser.columns_dict == {"join": ["t1.t2_id", "t2.id", "t2.uid"]} + + +def test_where_not_table_alias(): + # solved: https://github.com/macbre/sql-metadata/issues/451 + parser = Parser("SELECT name FROM employee WHERE age > 25") + assert parser.tables == ["employee"] + assert parser.columns == ["name", "age"] + assert parser.tables_aliases == {} + assert parser.columns_dict == {"select": ["name"], "where": ["age"]} + + +def test_column_not_in_tables_with_not_in(): + # solved: https://github.com/macbre/sql-metadata/issues/457 + query = """ + SELECT * + FROM TABLE1 + WHERE + SNAPSHOTDATE = (SELECT MAX(SNAPSHOTDATE) FROM TABLE1) + AND (MTYPE NOT IN ('Item1', 'Item2')) + """ + parser = Parser(query) + assert parser.tables == ["TABLE1"] + assert parser.columns == ["*", "SNAPSHOTDATE", "MTYPE"] + assert parser.columns_dict == { + "select": ["*", "SNAPSHOTDATE"], + "where": ["SNAPSHOTDATE", "MTYPE"], + } + + +def test_update_alias_not_extra_table(): + # solved: https://github.com/macbre/sql-metadata/issues/370 + query = "UPDATE a SET b=1 FROM schema1.testtable AS a" + parser = Parser(query) + assert "schema1.testtable" in parser.tables + assert parser.tables_aliases == {"a": "schema1.testtable"} + assert parser.columns == ["b"] + + +def test_select_into_vars_not_tables(): + # solved: https://github.com/macbre/sql-metadata/issues/397 + query = "SELECT C1, C2 INTO VAR1, VAR2 FROM 
TEST_TABLE" + parser = Parser(query) + assert parser.tables == ["TEST_TABLE"] + assert parser.columns == ["C1", "C2"] + assert parser.columns_dict == {"select": ["C1", "C2"]} + + +def test_presto_unnest_not_table(): + # solved: https://github.com/macbre/sql-metadata/issues/284 + query = """ + SELECT col_ + FROM my_table + CROSS JOIN UNNEST(my_col) AS t(col_) + """ + parser = Parser(query) + assert parser.tables == ["my_table"] + assert "col_" in parser.columns + + +def test_bigquery_unnest_not_table(): + # Solved: https://github.com/macbre/sql-metadata/issues/352 + p = Parser( + "SELECT A, B, metrics.C, metrics.D " + "FROM table1, UNNEST(metrics) as metrics" + ) + assert p.tables == ["table1"] + assert "metrics" in p.columns + + +def test_from_order_does_not_affect_tables(): + # solved: https://github.com/macbre/sql-metadata/issues/335 + query1 = "SELECT aa FROM (SELECT bb FROM bbb GROUP BY bb) AS a, omg" + query2 = "SELECT aa FROM omg, (SELECT bb FROM bbb GROUP BY bb) AS a" + parser1 = Parser(query1) + parser2 = Parser(query2) + assert set(parser1.tables) == {"bbb", "omg"} + assert set(parser2.tables) == {"bbb", "omg"} + assert set(parser1.columns) == {"aa", "bb"} + assert set(parser2.columns) == {"aa", "bb"} + + +def test_complex_subquery_join_tables(): + # solved: https://github.com/macbre/sql-metadata/issues/324 + query = """ + SELECT * FROM + ( (SELECT a1, a2 FROM ta1) tt1 + LEFT JOIN + (SELECT b1, b2 FROM tb1) tt2 + ON tt1.a1 = tt2.b1) tt3 + """ + parser = Parser(query) + assert parser.tables == ["ta1", "tb1"] + assert parser.columns == ["*", "a1", "a2", "b1", "b2"] + + +def test_on_keyword_not_table_alias(): + # solved: https://github.com/macbre/sql-metadata/issues/537 + parser = Parser( + """ + WITH + database1.tableFromWith AS (SELECT aa.* FROM table3 as aa + left join table4 on aa.col1=table4.col2), + test as (SELECT * from table3) + SELECT "xxxxx" + FROM database1.tableFromWith alias + LEFT JOIN database2.table2 ON ("tt"."ttt"."fff" = "xx"."xxx") + """ + 
) + assert parser.tables == ["table3", "table4", "database2.table2"] + assert "on" not in parser.tables_aliases + assert "ON" not in parser.tables_aliases + assert parser.tables_aliases == {"aa": "table3"} + + +def test_unmatched_parentheses_graceful(): + # solved: https://github.com/macbre/sql-metadata/issues/532 + # Should not raise IndexError; graceful handling of malformed SQL + try: + parser = Parser("SELECT arrayJoin(tags.key)) FROM foo") + _ = parser.tables + except (ValueError, Exception): + pass + + +def test_degraded_parse_falls_through_to_last_dialect(): + """SELECT UNIQUE triggers multi-dialect retry.""" + p = Parser("SELECT UNIQUE col FROM t") + assert "t" in p.tables + + +def test_parenthesized_select_unwrapping(): + """Parenthesized top-level SELECT is correctly unwrapped.""" + p = Parser("(SELECT a, b FROM t)") + assert p.tables == ["t"] + assert p.columns == ["a", "b"] diff --git a/test/test_hive.py b/test/test_hive.py index 7dd00b49..b532b35d 100644 --- a/test/test_hive.py +++ b/test/test_hive.py @@ -46,3 +46,113 @@ def test_complex_hive_query(): "rollup_wiki_beacon_pageviews", "statsdb.dimension_wikis", ] == Parser(dag).tables + + +def test_hive_alter_table_drop_partition(): + # solved: https://github.com/macbre/sql-metadata/issues/495 + query = "ALTER TABLE table_name DROP IF EXISTS PARTITION (dt = 20240524)" + parser = Parser(query) + assert parser.tables == ["table_name"] + assert "PARTITION" not in parser.tables + assert "dt" not in parser.tables + + +def test_hive_insert_overwrite_with_partition(): + # solved: https://github.com/macbre/sql-metadata/issues/502 + query = """ + INSERT OVERWRITE TABLE tbl PARTITION (dt='20240101') + SELECT col1, col2 FROM table1 + JOIN table2 ON table1.id = table2.id + """ + parser = Parser(query) + assert parser.tables == ["tbl", "table1", "table2"] + assert "dt" not in parser.tables + assert parser.columns == ["col1", "col2", "table1.id", "table2.id"] + assert parser.columns_dict == { + "select": ["col1", 
def test_lateral_view_not_in_tables():
    # Solved: https://github.com/macbre/sql-metadata/issues/369
    """A LATERAL VIEW alias must not be reported as a table."""
    sql = """SELECT event_day, action_type
    FROM t
    LATERAL VIEW EXPLODE(ARRAY(1, 2)) lv AS action_type"""
    p = Parser(sql)
    assert p.tables == ["t"]
    assert p.columns == ["event_day", "action_type"]


def test_array_subscript_with_lateral_view():
    # Solved: https://github.com/macbre/sql-metadata/issues/369
    """Array subscript ``[n]`` must not trigger the MSSQL bracketed dialect."""
    sql = """SELECT max(split(fourth_category, '~')[2]) AS ch_4th_class
    FROM t
    LATERAL VIEW EXPLODE(ARRAY(1, 2)) lv AS action_type"""
    p = Parser(sql)
    assert p.tables == ["t"]
def test_limit_and_offset_on_update():
    """An UPDATE statement carries no LIMIT — the property is None."""
    result = Parser("UPDATE t SET col = 1 WHERE id = 5").limit_and_offset
    assert result is None


def test_limit_and_offset_on_insert():
    """An INSERT statement carries no LIMIT — the property is None."""
    result = Parser("INSERT INTO t (a) VALUES (1)").limit_and_offset
    assert result is None


def test_limit_with_parameter_placeholder():
    """A non-numeric LIMIT placeholder cannot be converted to int — None."""
    result = Parser("SELECT col FROM t LIMIT :limit").limit_and_offset
    assert result is None
def test_limit_regex_standard_via_subquery():
    """Regex fallback picks up the standard LIMIT inside the subquery."""
    parsed = Parser(
        "SELECT * FROM (SELECT id FROM t LIMIT 30) AS sub FETCH FIRST 5 ROWS ONLY"
    )
    assert parsed.limit_and_offset == (30, 0)


def test_limit_regex_with_offset_via_subquery():
    """Regex fallback finds LIMIT/OFFSET when the outer query is unparseable."""
    parsed = Parser(
        "SELECT * FROM (SELECT id FROM t LIMIT 50 OFFSET 100) AS sub LIMIT ALL"
    )
    assert parsed.limit_and_offset == (50, 100)


def test_limit_and_offset_comment_only():
    """Comment-only SQL has no AST, so LIMIT/OFFSET resolves to None."""
    parsed = Parser("/* just a comment */")
    assert parsed.limit_and_offset is None


def test_mssql_catalog_double_dot():
    """SQL Server three-part name with an empty db part: catalog..table."""
    parsed = Parser("SELECT * FROM mydb..orders")
    assert "mydb..orders" in parsed.tables
+++ b/test/test_multiple_subqueries.py @@ -81,6 +81,7 @@ def test_multiple_subqueries(): assert parser.subqueries_names == [ "jrah2", "main_qry", + "subquery_1", "days_sqry", "days_final_qry", "subdays", @@ -135,87 +136,92 @@ def test_multiple_subqueries(): "presentation.job_request_id", ] assert parser.subqueries == { - "days_final_qry": "SELECT PROJECT_ID, days_to_offer, (SELECT count(distinct " - "jro.job_request_application_id) from job_request_offer jro " - "left join job_request_application jra2 on " - "jro.job_request_application_id = jra2.id where " - "jra2.job_request_id = PROJECT_ID and " - "jro.first_presented_date is not null and " - "jro.first_presented_date <= InitialChangeDate) as RowNo " - "from (SELECT jr.id as PROJECT_ID, 5 * " + "days_final_qry": "SELECT PROJECT_ID, days_to_offer, (SELECT COUNT(DISTINCT " + "jro.job_request_application_id) FROM job_request_offer AS jro " + "LEFT JOIN job_request_application AS jra2 ON " + "jro.job_request_application_id = jra2.id WHERE " + "jra2.job_request_id = PROJECT_ID AND " + "jro.first_presented_date IS NOT NULL AND " + "jro.first_presented_date <= InitialChangeDate) AS RowNo " + "FROM (SELECT jr.id AS PROJECT_ID, 5 * " "(DATEDIFF(jro.first_presented_date, jr.creation_date) DIV " "7) + " "MID('0123444401233334012222340111123400001234000123440', 7 " "* WEEKDAY(jr.creation_date) + " - "WEEKDAY(jro.first_presented_date) + 1, 1) as " + "WEEKDAY(jro.first_presented_date) + 1, 1) AS " "days_to_offer, jro.job_request_application_id, " - "jro.first_presented_date as InitialChangeDate from " - "presentation pr left join presentation_job_request_offer " - "pjro on pr.id = pjro.presentation_id left join " - "job_request_offer jro on pjro.job_request_offer_id = " - "jro.id left join job_request jr on pr.job_request_id = " - "jr.id where jro.first_presented_date is not null) " + "jro.first_presented_date AS InitialChangeDate FROM " + "presentation AS pr LEFT JOIN presentation_job_request_offer " + "AS pjro ON pr.id = 
pjro.presentation_id LEFT JOIN " + "job_request_offer AS jro ON pjro.job_request_offer_id = " + "jro.id LEFT JOIN job_request AS jr ON pr.job_request_id = " + "jr.id WHERE jro.first_presented_date IS NOT NULL) AS " "days_sqry", - "days_sqry": "SELECT jr.id as PROJECT_ID, 5 * " + "days_sqry": "SELECT jr.id AS PROJECT_ID, 5 * " "(DATEDIFF(jro.first_presented_date, jr.creation_date) DIV 7) + " "MID('0123444401233334012222340111123400001234000123440', 7 * " "WEEKDAY(jr.creation_date) + WEEKDAY(jro.first_presented_date) + " - "1, 1) as days_to_offer, jro.job_request_application_id, " - "jro.first_presented_date as InitialChangeDate from presentation " - "pr left join presentation_job_request_offer pjro on pr.id = " - "pjro.presentation_id left join job_request_offer jro on " - "pjro.job_request_offer_id = jro.id left join job_request jr on " - "pr.job_request_id = jr.id where jro.first_presented_date is not " - "null", - "jrah2": "SELECT jro2.job_request_application_id, max(case when " - "jro2.first_interview_scheduled_date is not null then 1 else 0 end) " - "as IS_INTERVIEW, max(case when jro2.first_presented_date is not " - "null then 1 else 0 end) as IS_PRESENTATION from job_request_offer " - "jro2 group by 1", - "main_qry": "SELECT jr.id as PROJECT_ID, 5 * " - "(DATEDIFF(ifnull(lc.creation_date, now()), jr.creation_date) DIV " + "1, 1) AS days_to_offer, jro.job_request_application_id, " + "jro.first_presented_date AS InitialChangeDate FROM presentation " + "AS pr LEFT JOIN presentation_job_request_offer AS pjro ON pr.id = " + "pjro.presentation_id LEFT JOIN job_request_offer AS jro ON " + "pjro.job_request_offer_id = jro.id LEFT JOIN job_request AS jr ON " + "pr.job_request_id = jr.id WHERE jro.first_presented_date IS NOT " + "NULL", + "jrah2": "SELECT jro2.job_request_application_id, MAX(CASE WHEN " + "jro2.first_interview_scheduled_date IS NOT NULL THEN 1 ELSE 0 END) " + "AS IS_INTERVIEW, MAX(CASE WHEN jro2.first_presented_date IS NOT " + "NULL THEN 1 ELSE 0 END) 
AS IS_PRESENTATION FROM job_request_offer " + "AS jro2 GROUP BY 1", + "main_qry": "SELECT jr.id AS PROJECT_ID, 5 * " + "(DATEDIFF(IFNULL(lc.creation_date, NOW()), jr.creation_date) DIV " "7) + MID('0123444401233334012222340111123400001234000123440', 7 " - "* WEEKDAY(jr.creation_date) + WEEKDAY(ifnull(lc.creation_date, " - "now())) + 1, 1) as LIFETIME, count(distinct case when " - "jra.application_source = 'VERAMA' then jra.id else null end) " - "NUM_APPLICATIONS, count(distinct jra.id) NUM_CANDIDATES, " - "sum(case when jro.stage = 'DEAL' then 1 else 0 end) as " - "NUM_CONTRACTED, sum(ifnull(IS_INTERVIEW, 0)) as NUM_INTERVIEWED, " - "sum(ifnull(IS_PRESENTATION, 0)) as NUM_OFFERED from job_request " - "jr left join job_request_application jra on jr.id = " - "jra.job_request_id left join job_request_offer jro on " - "jro.job_request_application_id = jra.id left join lifecycle lc " - "on lc.object_id = jr.id and lc.lifecycle_object_type = " - "'JOB_REQUEST' and lc.event = 'JOB_REQUEST_CLOSED' left join " - "(SELECT jro2.job_request_application_id, max(case when " - "jro2.first_interview_scheduled_date is not null then 1 else 0 " - "end) as IS_INTERVIEW, max(case when jro2.first_presented_date is " - "not null then 1 else 0 end) as IS_PRESENTATION from " - "job_request_offer jro2 group by 1) jrah2 on jra.id = " - "jrah2.job_request_application_id left join client u on " - "jr.client_id = u.id where jr.from_point_break = 0 and u.name not " - "in ('Test', 'Demo Client') group by 1, 2", - "subdays": "SELECT PROJECT_ID, sum(case when RowNo = 1 then days_to_offer " - "else null end) as DAYS_OFFER1, sum(case when RowNo = 2 then " - "days_to_offer else null end) as DAYS_OFFER2, sum(case when RowNo " - "= 3 then days_to_offer else null end) as DAYS_OFFER3 from (SELECT " - "PROJECT_ID, days_to_offer, (SELECT count(distinct " - "jro.job_request_application_id) from job_request_offer jro left " - "join job_request_application jra2 on " - "jro.job_request_application_id = jra2.id 
where " - "jra2.job_request_id = PROJECT_ID and jro.first_presented_date is " - "not null and jro.first_presented_date <= InitialChangeDate) as " - "RowNo from (SELECT jr.id as PROJECT_ID, 5 * " + "* WEEKDAY(jr.creation_date) + WEEKDAY(IFNULL(lc.creation_date, " + "NOW())) + 1, 1) AS LIFETIME, COUNT(DISTINCT CASE WHEN " + "jra.application_source = 'VERAMA' THEN jra.id ELSE NULL END) " + "AS NUM_APPLICATIONS, COUNT(DISTINCT jra.id) AS NUM_CANDIDATES, " + "SUM(CASE WHEN jro.stage = 'DEAL' THEN 1 ELSE 0 END) AS " + "NUM_CONTRACTED, SUM(IFNULL(IS_INTERVIEW, 0)) AS NUM_INTERVIEWED, " + "SUM(IFNULL(IS_PRESENTATION, 0)) AS NUM_OFFERED FROM job_request " + "AS jr LEFT JOIN job_request_application AS jra ON jr.id = " + "jra.job_request_id LEFT JOIN job_request_offer AS jro ON " + "jro.job_request_application_id = jra.id LEFT JOIN lifecycle AS lc " + "ON lc.object_id = jr.id AND lc.lifecycle_object_type = " + "'JOB_REQUEST' AND lc.event = 'JOB_REQUEST_CLOSED' LEFT JOIN " + "(SELECT jro2.job_request_application_id, MAX(CASE WHEN " + "jro2.first_interview_scheduled_date IS NOT NULL THEN 1 ELSE 0 " + "END) AS IS_INTERVIEW, MAX(CASE WHEN jro2.first_presented_date IS " + "NOT NULL THEN 1 ELSE 0 END) AS IS_PRESENTATION FROM " + "job_request_offer AS jro2 GROUP BY 1) AS jrah2 ON jra.id = " + "jrah2.job_request_application_id LEFT JOIN client AS u ON " + "jr.client_id = u.id WHERE jr.from_point_break = 0 AND u.name NOT " + "IN ('Test', 'Demo Client') GROUP BY 1, 2", + "subquery_1": "SELECT COUNT(DISTINCT jro.job_request_application_id) FROM " + "job_request_offer AS jro LEFT JOIN job_request_application AS jra2 ON " + "jro.job_request_application_id = jra2.id WHERE jra2.job_request_id = " + "PROJECT_ID AND jro.first_presented_date IS NOT NULL AND " + "jro.first_presented_date <= InitialChangeDate", + "subdays": "SELECT PROJECT_ID, SUM(CASE WHEN RowNo = 1 THEN days_to_offer " + "ELSE NULL END) AS DAYS_OFFER1, SUM(CASE WHEN RowNo = 2 THEN " + "days_to_offer ELSE NULL END) AS 
DAYS_OFFER2, SUM(CASE WHEN RowNo " + "= 3 THEN days_to_offer ELSE NULL END) AS DAYS_OFFER3 FROM (SELECT " + "PROJECT_ID, days_to_offer, (SELECT COUNT(DISTINCT " + "jro.job_request_application_id) FROM job_request_offer AS jro LEFT " + "JOIN job_request_application AS jra2 ON " + "jro.job_request_application_id = jra2.id WHERE " + "jra2.job_request_id = PROJECT_ID AND jro.first_presented_date IS " + "NOT NULL AND jro.first_presented_date <= InitialChangeDate) AS " + "RowNo FROM (SELECT jr.id AS PROJECT_ID, 5 * " "(DATEDIFF(jro.first_presented_date, jr.creation_date) DIV 7) + " "MID('0123444401233334012222340111123400001234000123440', 7 * " "WEEKDAY(jr.creation_date) + WEEKDAY(jro.first_presented_date) + " - "1, 1) as days_to_offer, jro.job_request_application_id, " - "jro.first_presented_date as InitialChangeDate from presentation " - "pr left join presentation_job_request_offer pjro on pr.id = " - "pjro.presentation_id left join job_request_offer jro on " - "pjro.job_request_offer_id = jro.id left join job_request jr on " - "pr.job_request_id = jr.id where jro.first_presented_date is not " - "null) days_sqry) days_final_qry group by PROJECT_ID", + "1, 1) AS days_to_offer, jro.job_request_application_id, " + "jro.first_presented_date AS InitialChangeDate FROM presentation " + "AS pr LEFT JOIN presentation_job_request_offer AS pjro ON pr.id = " + "pjro.presentation_id LEFT JOIN job_request_offer AS jro ON " + "pjro.job_request_offer_id = jro.id LEFT JOIN job_request AS jr ON " + "pr.job_request_id = jr.id WHERE jro.first_presented_date IS NOT " + "NULL) AS days_sqry) AS days_final_qry GROUP BY PROJECT_ID", } @@ -259,9 +265,9 @@ def test_multiline_queries(): } assert parser.subqueries == { - "a": "SELECT std.task_id as new_task_id " - "FROM some_task_detail std WHERE std.STATUS = 1", - "b": "SELECT st.task_id FROM some_task st WHERE task_type_id = 80", + "a": "SELECT std.task_id AS new_task_id " + "FROM some_task_detail AS std WHERE std.STATUS = 1", + "b": "SELECT 
st.task_id FROM some_task AS st WHERE task_type_id = 80", } parser2 = Parser(parser.subqueries["a"]) @@ -417,8 +423,8 @@ def test_readme_query(): ON a.task_id = b.task_id; """) assert parser.subqueries == { - "a": "SELECT std.task_id FROM some_task_detail std WHERE std.STATUS = 1", - "b": "SELECT st.task_id FROM some_task st WHERE task_type_id = 80", + "a": "SELECT std.task_id FROM some_task_detail AS std WHERE std.STATUS = 1", + "b": "SELECT st.task_id FROM some_task AS st WHERE task_type_id = 80", } assert parser.subqueries_names == ["a", "b"] assert parser.columns == [ @@ -432,3 +438,173 @@ def test_readme_query(): "select": ["some_task_detail.task_id", "some_task.task_id"], "where": ["some_task_detail.STATUS", "task_type_id"], } + + +def test_subquery_extraction_with_case(): + # solved: https://github.com/macbre/sql-metadata/issues/469 + query = """ + SELECT o_year, + sum(case when nation = 'KENYA' then volume else 0 end) + / sum(volume) as mkt_share + FROM ( + SELECT extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) as volume, + n2.n_name as nation + FROM part, supplier, lineitem, orders, customer, + nation n1, nation n2, region + WHERE p_partkey = l_partkey + AND s_suppkey = l_suppkey + AND l_orderkey = o_orderkey + AND o_custkey = c_custkey + AND c_nationkey = n1.n_nationkey + AND n1.n_regionkey = r_regionkey + AND r_name = 'AFRICA' + AND s_nationkey = n2.n_nationkey + AND o_orderdate BETWEEN date '1995-01-01' AND date '1996-12-31' + AND p_type = 'PROMO POLISHED NICKEL' + ) as all_nations + GROUP BY o_year + ORDER BY o_year + """ + parser = Parser(query) + assert "part" in parser.tables + assert "supplier" in parser.tables + assert "lineitem" in parser.tables + assert "orders" in parser.tables + assert "customer" in parser.tables + assert "nation" in parser.tables + assert "region" in parser.tables + assert "o_orderdate" in parser.columns + + +def test_column_alias_same_as_subquery_alias(): + # solved: 
def test_subquery_in_select_closing_parens():
    # solved: https://github.com/macbre/sql-metadata/issues/447
    """A scalar subquery in the SELECT list is resolved with its own alias."""
    query = """
    SELECT a.pt_no, b.pt_name,
    (SELECT dept_name FROM depart d WHERE a.dept_cd = d.dept_cd),
    a.c_no, a.cls
    FROM clinmt a, tbamv b
    """
    p = Parser(query)
    assert p.tables == ["depart", "clinmt", "tbamv"]
    assert p.tables_aliases == {"a": "clinmt", "b": "tbamv", "d": "depart"}
    for column in (
        "clinmt.pt_no",
        "tbamv.pt_name",
        "dept_name",
        "clinmt.c_no",
        "clinmt.cls",
    ):
        assert column in p.columns


def test_subquery_alias_with_inner_column():
    """Alias wrapping a scalar subquery that returns a plain column."""
    parsed = Parser("SELECT (SELECT col FROM t LIMIT 1) AS x FROM s")
    assert "x" in parsed.columns_aliases_names


def test_subquery_alias_with_inner_star():
    """Alias wrapping a scalar subquery that uses SELECT *."""
    parsed = Parser("SELECT (SELECT * FROM t LIMIT 1) AS x FROM s")
    assert "x" in parsed.columns_aliases_names


def test_subquery_alias_with_inner_alias():
    """Alias wrapping a scalar subquery that itself returns an alias."""
    parsed = Parser("SELECT (SELECT col AS c FROM t LIMIT 1) AS x FROM s")
    assert "x" in parsed.columns_aliases_names
def test_subquery_alias_with_aggregate_column():
    # Related to https://github.com/macbre/sql-metadata/issues/528
    """MAX(col) resolves the alias to a real column, unlike COUNT(*)."""
    parsed = Parser(
        "SELECT ap.[AccountId], "
        "(SELECT MAX(t.[Id]) FROM [Transactions] t "
        "WHERE t.[AccountId] = ap.[AccountId]) AS MaxTransactionId "
        "FROM [AccountProfiles] ap"
    )
    assert parsed.tables == ["[Transactions]", "[AccountProfiles]"]
    assert parsed.columns == ["ap.AccountId", "t.Id", "t.AccountId"]
    expected_columns_dict = {
        "select": ["ap.AccountId", "t.Id"],
        "where": ["t.AccountId", "ap.AccountId"],
    }
    assert parsed.columns_dict == expected_columns_dict
    assert parsed.columns_aliases == {"MaxTransactionId": "t.Id"}


def test_unaliased_subquery():
    # Solved: https://github.com/macbre/sql-metadata/issues/365
    """An unaliased subquery receives a synthetic ``subquery_N`` name."""
    query = """SELECT * FROM customers
    WHERE id IN (
    SELECT customer_id FROM reservations
    WHERE year(reservation_date) = year(now())
    GROUP BY customer_id
    ORDER BY count(*) DESC LIMIT 1
    )"""
    parsed = Parser(query)
    assert parsed.tables == ["customers", "reservations"]
    assert parsed.subqueries_names == ["subquery_1"]
    assert "subquery_1" in parsed.subqueries


def test_multiple_unaliased_subqueries():
    """Each unaliased subquery gets its own sequential synthetic name."""
    parsed = Parser(
        "SELECT * FROM t WHERE a IN (SELECT id FROM t2) AND b IN (SELECT id FROM t3)"
    )
    assert parsed.subqueries_names == ["subquery_1", "subquery_2"]
    for name in ("subquery_1", "subquery_2"):
        assert name in parsed.subqueries


def test_mixed_aliased_and_unaliased_subqueries():
    """Explicit aliases and synthetic names coexist in subqueries."""
    parsed = Parser(
        "SELECT * FROM (SELECT id FROM t2) sub WHERE a IN (SELECT id FROM t3)"
    )
    for name in ("sub", "subquery_1"):
        assert name in parsed.subqueries_names
        assert name in parsed.subqueries
def test_subquery_bodies_empty_when_no_subquery():
    """A query without subqueries yields an empty subqueries dict."""
    parsed = Parser("SELECT * FROM t")
    assert parsed.subqueries == {}


def test_subquery_names_empty_when_no_subquery():
    """A query without subqueries yields an empty subqueries_names list."""
    parsed = Parser("SELECT * FROM t")
    assert parsed.subqueries_names == []


def test_empty_query_property():
    """The ``query`` property of an empty-string parser is the empty string."""
    parsed = Parser("")
    assert parsed.query == ""


def test_tokens_caching():
    """Repeated ``tokens`` access returns the exact same cached list object."""
    parsed = Parser("SELECT col FROM t")
    assert parsed.tokens is parsed.tokens
def test_merge_into_query_type():
    # solved: https://github.com/macbre/sql-metadata/issues/354
    """MERGE INTO is recognized as its own query type with correct metadata."""
    query = """
    MERGE INTO wines w
    USING (VALUES('Chateau Lafite 2003', '24')) v
    ON v.column1 = w.winename
    WHEN NOT MATCHED THEN INSERT VALUES(v.column1, v.column2)
    WHEN MATCHED THEN UPDATE SET stock = stock + v.column2
    """
    p = Parser(query)
    assert p.query_type == QueryType.MERGE
    assert p.tables == ["wines"]
    expected_columns = [
        "v.column1",
        "wines.winename",
        "v.column2",
        "stock",
    ]
    assert p.columns == expected_columns
    assert p.tables_aliases == {"w": "wines"}


def test_create_temporary_table():
    # solved: https://github.com/macbre/sql-metadata/issues/439
    """CREATE TEMPORARY TABLE ... AS SELECT reports both tables involved."""
    p = Parser("CREATE TEMPORARY TABLE tablname AS SELECT * FROM source_table")
    assert p.query_type == QueryType.CREATE
    for table in ("tablname", "source_table"):
        assert table in p.tables
    assert p.columns == ["*"]
def test_unrecognized_command_type():
    """A query parsed as a generic Command that isn't ALTER/CREATE is rejected
    with ``InvalidQueryDefinition``."""
    with pytest.raises(InvalidQueryDefinition, match="Not supported query type"):
        Parser("SHOW TABLES").query_type


def test_deeply_parenthesized_query():
    """A triple-parenthesized SELECT is fully unwrapped and parsed."""
    p = Parser("(((SELECT col FROM t)))")
    assert p.query_type == "SELECT"
    assert p.tables == ["t"]
    assert p.columns == ["col"]


def test_execute_command_not_supported():
    """EXECUTE parses as a generic Command but isn't a known statement type —
    the parser raises ``InvalidQueryDefinition`` (not a plain ValueError)."""
    with pytest.raises(InvalidQueryDefinition, match="Not supported query type"):
        Parser("EXECUTE sp_help").query_type
def test_values_on_invalid_sql():
    """Unparseable SQL yields an empty values list instead of raising."""
    from sql_metadata import Parser

    parsed = Parser(";;;")
    assert parsed.values == []


def test_values_on_comment_only_sql():
    """Comment-only SQL yields an empty values list."""
    from sql_metadata import Parser

    parsed = Parser("/* just a comment */")
    assert parsed.values == []


def test_negative_integer_values():
    """A negative integer in VALUES is extracted as an int."""
    parsed = Parser("INSERT INTO scores (player, points) VALUES ('alice', -42)")
    assert parsed.values == ["alice", -42]
    assert parsed.values_dict == {"player": "alice", "points": -42}


def test_negative_float_values():
    """A negative float in VALUES is extracted as a float."""
    parsed = Parser("INSERT INTO measurements (sensor, reading) VALUES ('temp', -3.14)")
    assert parsed.values == ["temp", -3.14]
    assert parsed.values_dict == {"sensor": "temp", "reading": -3.14}


def test_insert_with_null_value():
    """NULL in VALUES falls back to its string representation."""
    parsed = Parser("INSERT INTO t (a, b) VALUES (1, NULL)")
    assert parsed.values == [1, "NULL"]
    assert parsed.values_dict == {"a": 1, "b": "NULL"}


def test_insert_with_scalar_subquery_in_values():
    """Columns referenced by a scalar subquery inside VALUES are extracted."""
    parsed = Parser(
        "INSERT INTO orders (customer_id) "
        "VALUES ((SELECT id FROM customers WHERE email = 'foo@bar.com'))"
    )
    assert parsed.tables == ["orders", "customers"]
    assert parsed.columns == ["customer_id", "id", "email"]


def test_insert_multi_row_values():
    # Solved: https://github.com/macbre/sql-metadata/issues/558
    """Multi-row VALUES produce per-row lists and column-grouped dicts."""
    parsed = Parser("INSERT INTO t (field1, field2) VALUES (1, 2), (3, 4)")
    assert parsed.values == [[1, 2], [3, 4]]
    assert parsed.values_dict == {"field1": [1, 3], "field2": [2, 4]}
(CURRENT_TIMESTAMP)") + assert len(p.values) == 1 diff --git a/test/test_with_statements.py b/test/test_with_statements.py index 07805d0c..51b4a88e 100644 --- a/test/test_with_statements.py +++ b/test/test_with_statements.py @@ -1,6 +1,6 @@ import pytest -from sql_metadata import Parser +from sql_metadata import InvalidQueryDefinition, Parser from sql_metadata.keywords_lists import QueryType @@ -19,9 +19,9 @@ def test_with_statements(): assert parser.tables == ["table3", "table4", "database2.table2"] assert parser.with_names == ["database1.tableFromWith", "test"] assert parser.with_queries == { - "database1.tableFromWith": "SELECT aa.* FROM table3 as aa left join table4 on " + "database1.tableFromWith": "SELECT aa.* FROM table3 AS aa LEFT JOIN table4 ON " "aa.col1 = table4.col2", - "test": "SELECT * from table3", + "test": "SELECT * FROM table3", } parser = Parser(""" WITH @@ -143,18 +143,17 @@ def test_complicated_with(): assert parser.query_type == QueryType.SELECT assert parser.with_names == ["uisd_filter_table"] assert parser.with_queries == { - "uisd_filter_table": "select session_id, srch_id, srch_ci, srch_co, srch_los, " - "srch_sort_type, impr_list from uisd where datem <= " - "date_sub(date_add(current_date(), 92), 7 * 52) and " - "lower(srch_sort_type) in ('expertpicks', 'recommended') " - "and srch_ci <= date_sub(date_add(current_date(), 92), 7 " - "* 52) and srch_co >= date_sub(date_add(current_date(), " + "uisd_filter_table": "SELECT session_id, srch_id, srch_ci, srch_co, srch_los, " + "srch_sort_type, impr_list FROM uisd WHERE datem <= " + "DATE_SUB(DATE_ADD(CURRENT_DATE(), 92), 7 * 52) AND " + "LOWER(srch_sort_type) IN ('expertpicks', 'recommended') " + "AND srch_ci <= DATE_SUB(DATE_ADD(CURRENT_DATE(), 92), 7 " + "* 52) AND srch_co >= DATE_SUB(DATE_ADD(CURRENT_DATE(), " "1), 7 * 52)" } assert parser.tables == [ "uisd", - "impr_list", - ] # this one is wrong too should be table + ] assert parser.columns == [ "session_id", "srch_id", @@ -268,9 +267,9 
@@ def test_resolving_with_columns_with_nested_tables_prefixes(): parser = Parser(query) assert parser.with_names == ["query1", "query2"] assert parser.with_queries == { - "query1": "SELECT t5.c1, t5.c2, t6.c4 FROM t5 left join t6 on t5.link1 = " + "query1": "SELECT t5.c1, t5.c2, t6.c4 FROM t5 LEFT JOIN t6 ON t5.link1 = " "t6.link2", - "query2": "SELECT c3, c7 FROM t7 union all select c4, c12 from t8", + "query2": "SELECT c3, c7 FROM t7 UNION ALL SELECT c4, c12 FROM t8", } assert parser.tables == ["t5", "t6", "t7", "t8"] assert parser.columns_aliases == {} @@ -353,12 +352,12 @@ def test_nested_with_statement_in_create_table(): assert parser.with_names == ["sub", "abc"] assert parser.subqueries_names == ["table_a"] assert parser.with_queries == { - "abc": "select * from other_table", - "sub": "select it_id from internal_table", + "abc": "SELECT * FROM other_table", + "sub": "SELECT it_id FROM internal_table", } assert parser.subqueries == { - "table_a": "with abc as(select * from other_table) select name, age, it_id " - "from table_z join abc on (table_z.it_id = abc.it_id)" + "table_a": "WITH abc AS (SELECT * FROM other_table) SELECT name, age, it_id " + "FROM table_z JOIN abc ON (table_z.it_id = abc.it_id)" } assert parser.query_type == QueryType.CREATE @@ -444,7 +443,7 @@ def test_window_in_with(): assert parser.with_names == ["cte_1"] assert parser.columns == ["column_1", "column_2"] assert parser.with_queries == { - "cte_1": "SELECT column_1, column_2 FROM table_1 WINDOW window_1 AS(PARTITION BY column_2)" + "cte_1": "SELECT column_1, column_2 FROM table_1 WINDOW window_1 AS (PARTITION BY column_2)" } assert parser.tables == ["table_1"] @@ -500,7 +499,7 @@ def test_as_was_preceded_by_with_query(): SELECT 1; """ parser = Parser(query) - with pytest.raises(ValueError, match="This query is wrong"): + with pytest.raises(InvalidQueryDefinition): parser.tables query = """ @@ -509,7 +508,7 @@ def test_as_was_preceded_by_with_query(): SELECT 1; """ parser = 
Parser(query) - with pytest.raises(ValueError, match="This query is wrong"): + with pytest.raises(InvalidQueryDefinition): parser.tables query = """ @@ -518,7 +517,7 @@ def test_as_was_preceded_by_with_query(): SELECT 1; """ parser = Parser(query) - with pytest.raises(ValueError, match="This query is wrong"): + with pytest.raises(InvalidQueryDefinition): parser.tables @@ -530,5 +529,246 @@ def test_malformed_with_query_hang(): WHERE domain =e''$.f') AS g FROM h;""" parser = Parser(query) - with pytest.raises(ValueError, match="This query is wrong"): + with pytest.raises(InvalidQueryDefinition): parser.tables + + +def test_nested_cte_not_in_tables(): + # solved: https://github.com/macbre/sql-metadata/issues/314 + query = """ + WITH CTE_ROOT_1 as ( + WITH CTE_CHILD as ( + SELECT a FROM table_1 as t + ) + SELECT a FROM CTE_CHILD + ), + CTE_ROOT_2 as ( + SELECT b FROM table_2 + ) + SELECT a, b, c + FROM table_3 t3 + LEFT JOIN CTE_ROOT_1 cr1 on t3.id = cr1.id + LEFT JOIN CTE_ROOT_2 cr2 on t3.id = cr2.id + LEFT JOIN table_4 t4 on t3.id = t4.id + """ + parser = Parser(query) + assert parser.tables == ["table_1", "table_2", "table_3", "table_4"] + assert parser.columns == [ + "a", + "b", + "c", + "table_3.id", + "cr1.id", + "cr2.id", + "table_4.id", + ] + assert parser.tables_aliases == { + "t3": "table_3", + "t4": "table_4", + "t": "table_1", + } + + +def test_nested_with_name_not_table(): + # solved: https://github.com/macbre/sql-metadata/issues/413 + query = """ + WITH + A as ( + WITH intermediate_query as ( + SELECT id, some_column FROM table_one + ) + SELECT id, some_column FROM intermediate_query + ), + B as ( + SELECT id, other_column FROM table_two + ) + SELECT A.id, some_column, other_column + FROM A + INNER JOIN B ON A.id = B.id + """ + parser = Parser(query) + assert parser.tables == ["table_one", "table_two"] + assert parser.columns == ["id", "some_column", "other_column"] + + +def test_cte_alias_reuse(): + # solved: 
https://github.com/macbre/sql-metadata/issues/262 + query = """ + WITH + cte_one AS (SELECT cte_id, cte_name FROM cte_one_table), + cte_two AS (SELECT B.cte_id FROM cte_one B), + cte_three AS (SELECT B.id FROM (SELECT id FROM table_two) B) + SELECT * FROM cte_two + """ + parser = Parser(query) + assert parser.tables == ["cte_one_table", "table_two"] + assert "cte_id" in parser.columns + assert "cte_name" in parser.columns + + +def test_group_by_not_table_alias_in_cte(): + # solved: https://github.com/macbre/sql-metadata/issues/526 + query = """ + WITH [CTE1] AS ( + SELECT [Col1], MAX([Col2]) AS [MaxCol2] + FROM [Table1] + GROUP BY [Col1] + ) + SELECT t3.[Qty1], t4.[Code], t3.[DateCol] + FROM [Table1] t3 + JOIN [CTE1] t1 ON t3.[Col1] = t1.[Col1] AND t3.[DateCol] = t1.[MaxCol2] + JOIN [Table2] t4 ON t4.[ID] = t3.[Col2] + """ + parser = Parser(query) + aliases = parser.tables_aliases + assert "GROUP BY" not in aliases + assert "[Table1]" in parser.tables + assert "[Table2]" in parser.tables + + +def test_coalesce_three_args_in_cte(): + """COALESCE with 3+ args should render as COALESCE, not IFNULL.""" + p = Parser( + "WITH cte AS (SELECT COALESCE(a, b, c) FROM t) " + "SELECT * FROM cte" + ) + body = p.with_queries["cte"] + assert "COALESCE" in body.upper() + + +def test_date_add_in_cte(): + """DATE_ADD in a CTE body should be preserved by the custom generator.""" + p = Parser( + "WITH cte AS (SELECT DATE_ADD(created, INTERVAL 1 DAY) FROM events) " + "SELECT * FROM cte" + ) + body = p.with_queries["cte"] + assert "DATE_ADD" in body.upper() + + +def test_date_sub_in_cte(): + """DATE_SUB in a CTE body should be preserved by the custom generator.""" + p = Parser( + "WITH cte AS (SELECT DATE_SUB(created, INTERVAL 1 DAY) FROM events) " + "SELECT * FROM cte" + ) + body = p.with_queries["cte"] + assert "DATE_SUB" in body.upper() + + +def test_not_expression_in_cte(): + """NOT applied to a boolean expression (not IS NULL or IN) in CTE body.""" + p = Parser( + "WITH cte AS 
(SELECT * FROM t WHERE NOT (active > 0)) " + "SELECT * FROM cte" + ) + body = p.with_queries["cte"] + assert "NOT" in body.upper() + + +def test_nested_resolver_unresolvable_reference(): + """A dotted column reference not matching any CTE/subquery stays as-is.""" + p = Parser( + "WITH cte AS (SELECT id FROM t) " + "SELECT nonexistent.col FROM cte" + ) + assert "nonexistent.col" in p.columns + + +def test_cte_with_subquery_and_star_alias(): + # Solved: https://github.com/macbre/sql-metadata/issues/392 + p = Parser("""with x as (select d.nbr, d.af_pk + from test_db.test_table3 d) + select q.hx_id, q.text + from (select prod_code, s.* + from testdb.test_table s + inner join testdb.test_table2 p on s.s1_fk = p.p1_sk + ) q + inner join x on q.s2_fk = x.af_pk""") + assert p.tables == [ + "test_db.test_table3", "testdb.test_table", "testdb.test_table2" + ] + assert p.with_names == ["x"] + assert "testdb.test_table.*" in p.columns + + +def test_bracketed_select_with_cte_and_column_alias(): + # Solved: https://github.com/macbre/sql-metadata/issues/326 + p = Parser("""with a as (select id, a from tbl1), + with b as (select id, b from tbl2) + (select a.id, a.a + b.b as t + from a left join b on a.id = b.id)""") + assert p.tables == ["tbl1", "tbl2"] + assert p.with_names == ["a", "b"] + assert p.columns == ["id", "a", "b"] + + +def test_cte_without_alias_raises(): + """CTE without a name is invalid SQL.""" + with pytest.raises(InvalidQueryDefinition, match="All CTEs require an alias"): + Parser("WITH AS (SELECT 1) SELECT * FROM t").columns + + +def test_with_queries_empty_when_no_cte(): + """A query with no CTEs returns empty with_queries.""" + p = Parser("SELECT * FROM t") + assert p.with_queries == {} + + +def test_cte_subquery_full_resolution(): + """Subquery + CTE: CTE-qualified columns fully resolved.""" + parser = Parser(""" + WITH c AS (SELECT id, name FROM t1) + SELECT s.id, t2.name + FROM (SELECT c.id FROM c) AS s + JOIN t2 ON s.id = t2.id + """) + assert 
parser.tables == ["t1", "t2"] + assert "c.id" not in parser.columns + assert "id" in parser.columns + + +def test_chained_cte_qualified_columns_resolved(): + """CTE-qualified columns should resolve through chained CTEs.""" + # 2-level chain + p = Parser(""" + WITH c1 AS (SELECT a FROM t1), + c2 AS (SELECT c1.a FROM c1) + SELECT c2.a FROM c2 + """) + assert p.tables == ["t1"] + assert p.columns == ["a"] + + # 3-level chain + p = Parser(""" + WITH c1 AS (SELECT a FROM t1), + c2 AS (SELECT c1.a FROM c1), + c3 AS (SELECT c2.a FROM c2) + SELECT c3.a FROM c3 + """) + assert p.tables == ["t1"] + assert p.columns == ["a"] + + +def test_chained_cte_with_subquery(): + """CTE-qualified columns in subqueries wrapping chained CTEs.""" + p = Parser(""" + WITH c1 AS (SELECT a FROM t1), + c2 AS (SELECT c1.a FROM c1) + SELECT s.a FROM (SELECT c2.a FROM c2) AS s + """) + assert p.tables == ["t1"] + assert p.columns == ["a"] + + +def test_chained_cte_cross_reference(): + """4-level CTE chain where level 3 references both level 2 and level 1.""" + p = Parser(""" + WITH c1 AS (SELECT a, b FROM t1), + c2 AS (SELECT c1.a FROM c1), + c3 AS (SELECT c2.a, c1.b FROM c2 JOIN c1 ON c2.a = c1.a), + c4 AS (SELECT c3.a, c3.b FROM c3) + SELECT c4.a, c4.b FROM c4 + """) + assert p.tables == ["t1"] + assert p.columns == ["a", "b"]