Skip to content

Commit 4d2cc6d

Browse files
authored
Merge pull request #19 from microsoft/tokenization_fixes
Tokenization fixes
2 parents 01f34fb + 061c466 commit 4d2cc6d

File tree

10 files changed

+259
-24
lines changed

10 files changed

+259
-24
lines changed

flowquery-py/src/parsing/parser.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ def _parse_create(self) -> Optional[Operation]:
326326
if not self.token.is_colon():
327327
raise ValueError("Expected ':' for relationship type")
328328
self.set_next_token()
329-
if not self.token.is_identifier():
329+
if not self.token.is_identifier_or_keyword():
330330
raise ValueError("Expected relationship type identifier")
331331
rel_type = self.token.value or ""
332332
self.set_next_token()
@@ -450,17 +450,17 @@ def _parse_node(self) -> Optional[Node]:
450450
self.set_next_token()
451451
self._skip_whitespace_and_comments()
452452
identifier: Optional[str] = None
453-
if self.token.is_identifier():
453+
if self.token.is_identifier_or_keyword():
454454
identifier = self.token.value
455455
self.set_next_token()
456456
self._skip_whitespace_and_comments()
457457
label: Optional[str] = None
458458
peek = self.peek()
459-
if not self.token.is_colon() and peek is not None and peek.is_identifier():
459+
if not self.token.is_colon() and peek is not None and peek.is_identifier_or_keyword():
460460
raise ValueError("Expected ':' for node label")
461-
if self.token.is_colon() and (peek is None or not peek.is_identifier()):
461+
if self.token.is_colon() and (peek is None or not peek.is_identifier_or_keyword()):
462462
raise ValueError("Expected node label identifier")
463-
if self.token.is_colon() and peek is not None and peek.is_identifier():
463+
if self.token.is_colon() and peek is not None and peek.is_identifier_or_keyword():
464464
self.set_next_token()
465465
label = cast(str, self.token.value) # Guaranteed by is_identifier check
466466
self.set_next_token()
@@ -495,13 +495,13 @@ def _parse_relationship(self) -> Optional[Relationship]:
495495
return None
496496
self.set_next_token()
497497
variable: Optional[str] = None
498-
if self.token.is_identifier():
498+
if self.token.is_identifier_or_keyword():
499499
variable = self.token.value
500500
self.set_next_token()
501501
if not self.token.is_colon():
502502
raise ValueError("Expected ':' for relationship type")
503503
self.set_next_token()
504-
if not self.token.is_identifier():
504+
if not self.token.is_identifier_or_keyword():
505505
raise ValueError("Expected relationship type identifier")
506506
rel_type: str = self.token.value or ""
507507
self.set_next_token()
@@ -633,14 +633,14 @@ def _parse_expressions(
633633
def _parse_operand(self, expression: Expression) -> bool:
634634
"""Parse a single operand (without operators). Returns True if an operand was parsed."""
635635
self._skip_whitespace_and_comments()
636-
if self.token.is_identifier() and (self.peek() is None or not self.peek().is_left_parenthesis()):
636+
if self.token.is_identifier_or_keyword() and (self.peek() is None or not self.peek().is_left_parenthesis()):
637637
identifier = self.token.value or ""
638638
reference = Reference(identifier, self._variables.get(identifier))
639639
self.set_next_token()
640640
lookup = self._parse_lookup(reference)
641641
expression.add_node(lookup)
642642
return True
643-
elif self.token.is_identifier() and self.peek() is not None and self.peek().is_left_parenthesis():
643+
elif self.token.is_identifier_or_keyword() and self.peek() is not None and self.peek().is_left_parenthesis():
644644
func = self._parse_predicate_function() or self._parse_function()
645645
if func is not None:
646646
lookup = self._parse_lookup(func)
@@ -650,7 +650,7 @@ def _parse_operand(self, expression: Expression) -> bool:
650650
self.token.is_left_parenthesis()
651651
and self.peek() is not None
652652
and (
653-
self.peek().is_identifier()
653+
self.peek().is_identifier_or_keyword()
654654
or self.peek().is_colon()
655655
or self.peek().is_right_parenthesis()
656656
)
@@ -734,7 +734,7 @@ def _parse_lookup(self, node: ASTNode) -> ASTNode:
734734
while True:
735735
if self.token.is_dot():
736736
self.set_next_token()
737-
if not self.token.is_identifier() and not self.token.is_keyword():
737+
if not self.token.is_identifier_or_keyword():
738738
raise ValueError("Expected identifier")
739739
lookup = Lookup()
740740
lookup.index = Identifier(self.token.value or "")
@@ -847,7 +847,7 @@ def _parse_alias(self) -> Optional[Alias]:
847847
self._expect_previous_token_to_be_whitespace_or_comment()
848848
self.set_next_token()
849849
self._expect_and_skip_whitespace_and_comments()
850-
if not self.token.is_identifier():
850+
if not self.token.is_identifier_or_keyword():
851851
raise ValueError("Expected identifier")
852852
alias = Alias(self.token.value or "")
853853
self.set_next_token()

flowquery-py/src/tokenization/token.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,24 @@ def IDENTIFIER(value: str) -> Token:
106106
def is_identifier(self) -> bool:
107107
return self._type == TokenType.IDENTIFIER or self._type == TokenType.BACKTICK_STRING
108108

109+
def is_keyword_that_cannot_be_identifier(self) -> bool:
110+
"""Returns True for keywords that have special expression-level roles
111+
and should not be treated as identifiers (NULL, CASE, WHEN, THEN, ELSE, END)."""
112+
return self.is_keyword() and (
113+
self.is_null()
114+
or self.is_case()
115+
or self.is_when()
116+
or self.is_then()
117+
or self.is_else()
118+
or self.is_end()
119+
)
120+
121+
def is_identifier_or_keyword(self) -> bool:
122+
"""Returns True if the token is an identifier or a keyword that can be used as an identifier."""
123+
return self.is_identifier() or (
124+
self.is_keyword() and not self.is_keyword_that_cannot_be_identifier()
125+
)
126+
109127
# String token
110128

111129
@staticmethod

flowquery-py/tests/compute/test_runner.py

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1584,4 +1584,63 @@ async def test_schema_returns_nodes_and_relationships_with_sample_data(self):
15841584
assert chases["sample"] is not None
15851585
assert "left_id" not in chases["sample"]
15861586
assert "right_id" not in chases["sample"]
1587-
assert "speed" in chases["sample"]
1587+
assert "speed" in chases["sample"]
1588+
1589+
@pytest.mark.asyncio
1590+
async def test_reserved_keywords_as_identifiers(self):
1591+
"""Test reserved keywords as identifiers."""
1592+
runner = Runner("""
1593+
WITH 1 AS return
1594+
RETURN return
1595+
""")
1596+
await runner.run()
1597+
results = runner.results
1598+
assert len(results) == 1
1599+
assert results[0]["return"] == 1
1600+
1601+
@pytest.mark.asyncio
1602+
async def test_reserved_keywords_as_parts_of_identifiers(self):
1603+
"""Test reserved keywords as parts of identifiers."""
1604+
runner = Runner("""
1605+
unwind [
1606+
{from: "Alice", to: "Bob", organizer: "Charlie"},
1607+
{from: "Bob", to: "Charlie", organizer: "Alice"},
1608+
{from: "Charlie", to: "Alice", organizer: "Bob"}
1609+
] as data
1610+
return data.from as from, data.to as to, data.organizer as organizer
1611+
""")
1612+
await runner.run()
1613+
results = runner.results
1614+
assert len(results) == 3
1615+
assert results[0] == {"from": "Alice", "to": "Bob", "organizer": "Charlie"}
1616+
assert results[1] == {"from": "Bob", "to": "Charlie", "organizer": "Alice"}
1617+
assert results[2] == {"from": "Charlie", "to": "Alice", "organizer": "Bob"}
1618+
1619+
@pytest.mark.asyncio
1620+
async def test_reserved_keywords_as_relationship_types_and_labels(self):
1621+
"""Test reserved keywords as relationship types and labels."""
1622+
await Runner("""
1623+
CREATE VIRTUAL (:Return) AS {
1624+
unwind [
1625+
{id: 1, name: 'Node 1'},
1626+
{id: 2, name: 'Node 2'}
1627+
] as record
1628+
RETURN record.id as id, record.name as name
1629+
}
1630+
""").run()
1631+
await Runner("""
1632+
CREATE VIRTUAL (:Return)-[:With]-(:Return) AS {
1633+
unwind [
1634+
{left_id: 1, right_id: 2}
1635+
] as record
1636+
RETURN record.left_id as left_id, record.right_id as right_id
1637+
}
1638+
""").run()
1639+
runner = Runner("""
1640+
MATCH (a:Return)-[:With]->(b:Return)
1641+
RETURN a.name AS name1, b.name AS name2
1642+
""")
1643+
await runner.run()
1644+
results = runner.results
1645+
assert len(results) == 1
1646+
assert results[0] == {"name1": "Node 1", "name2": "Node 2"}

flowquery-py/tests/parsing/test_parser.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -719,3 +719,12 @@ def test_relationship_with_properties(self):
719719
assert isinstance(relationship, Relationship)
720720
assert relationship.properties.get("since") is not None
721721
assert relationship.properties["since"].value() == 2022
722+
723+
def test_case_statement_with_keywords_as_identifiers(self):
724+
"""Test that CASE/WHEN/THEN/ELSE/END are not treated as identifiers."""
725+
parser = Parser()
726+
ast = parser.parse("RETURN CASE WHEN 1 THEN 2 ELSE 3 END")
727+
assert "Case" in ast.print()
728+
assert "When" in ast.print()
729+
assert "Then" in ast.print()
730+
assert "Else" in ast.print()

flowquery-py/tests/tokenization/test_tokenizer.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,3 +162,37 @@ def test_relationship_with_hops(self):
162162
tokens = tokenizer.tokenize()
163163
assert tokens is not None
164164
assert len(tokens) > 0
165+
166+
def test_reserved_keywords_as_identifiers(self):
167+
"""Test reserved keywords as identifiers."""
168+
tokenizer = Tokenizer("""
169+
WITH 1 AS return
170+
RETURN return
171+
""")
172+
tokens = tokenizer.tokenize()
173+
assert tokens is not None
174+
assert len(tokens) > 0
175+
176+
def test_reserved_keywords_as_part_of_identifiers(self):
177+
"""Test reserved keywords as part of identifiers."""
178+
tokenizer = Tokenizer("""
179+
unwind [
180+
{from: "Alice", to: "Bob", organizer: "Charlie"},
181+
{from: "Bob", to: "Charlie", organizer: "Alice"},
182+
{from: "Charlie", to: "Alice", organizer: "Bob"}
183+
] as data
184+
return data.from, data.to
185+
""")
186+
tokens = tokenizer.tokenize()
187+
assert tokens is not None
188+
assert len(tokens) > 0
189+
190+
def test_reserved_keywords_as_relationship_types_and_labels(self):
191+
"""Test reserved keywords as relationship types and labels."""
192+
tokenizer = Tokenizer("""
193+
MATCH (a:RETURN)-[r:WITH]->(b:RETURN)
194+
RETURN a, b
195+
""")
196+
tokens = tokenizer.tokenize()
197+
assert tokens is not None
198+
assert len(tokens) > 0

src/parsing/parser.ts

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,7 @@ class Parser extends BaseParser {
353353
throw new Error("Expected ':' for relationship type");
354354
}
355355
this.setNextToken();
356-
if (!this.token.isIdentifier()) {
356+
if (!this.token.isIdentifierOrKeyword()) {
357357
throw new Error("Expected relationship type identifier");
358358
}
359359
const type: string = this.token.value || "";
@@ -412,19 +412,19 @@ class Parser extends BaseParser {
412412
this.setNextToken();
413413
this.skipWhitespaceAndComments();
414414
let identifier: string | null = null;
415-
if (this.token.isIdentifier()) {
415+
if (this.token.isIdentifierOrKeyword()) {
416416
identifier = this.token.value || "";
417417
this.setNextToken();
418418
}
419419
this.skipWhitespaceAndComments();
420420
let label: string | null = null;
421-
if (!this.token.isColon() && this.peek()?.isIdentifier()) {
421+
if (!this.token.isColon() && this.peek()?.isIdentifierOrKeyword()) {
422422
throw new Error("Expected ':' for node label");
423423
}
424-
if (this.token.isColon() && !this.peek()?.isIdentifier()) {
424+
if (this.token.isColon() && !this.peek()?.isIdentifierOrKeyword()) {
425425
throw new Error("Expected node label identifier");
426426
}
427-
if (this.token.isColon() && this.peek()?.isIdentifier()) {
427+
if (this.token.isColon() && this.peek()?.isIdentifierOrKeyword()) {
428428
this.setNextToken();
429429
label = this.token.value || "";
430430
this.setNextToken();
@@ -595,15 +595,15 @@ class Parser extends BaseParser {
595595
}
596596
this.setNextToken();
597597
let variable: string | null = null;
598-
if (this.token.isIdentifier()) {
598+
if (this.token.isIdentifierOrKeyword()) {
599599
variable = this.token.value || "";
600600
this.setNextToken();
601601
}
602602
if (!this.token.isColon()) {
603603
throw new Error("Expected ':' for relationship type");
604604
}
605605
this.setNextToken();
606-
if (!this.token.isIdentifier()) {
606+
if (!this.token.isIdentifierOrKeyword()) {
607607
throw new Error("Expected relationship type identifier");
608608
}
609609
const type: string = this.token.value || "";
@@ -745,14 +745,14 @@ class Parser extends BaseParser {
745745
*/
746746
private parseOperand(expression: Expression): boolean {
747747
this.skipWhitespaceAndComments();
748-
if (this.token.isIdentifier() && !this.peek()?.isLeftParenthesis()) {
748+
if (this.token.isIdentifierOrKeyword() && !this.peek()?.isLeftParenthesis()) {
749749
const identifier: string = this.token.value || "";
750750
const reference = new Reference(identifier, this.variables.get(identifier));
751751
this.setNextToken();
752752
const lookup = this.parseLookup(reference);
753753
expression.addNode(lookup);
754754
return true;
755-
} else if (this.token.isIdentifier() && this.peek()?.isLeftParenthesis()) {
755+
} else if (this.token.isIdentifierOrKeyword() && this.peek()?.isLeftParenthesis()) {
756756
const func = this.parsePredicateFunction() || this.parseFunction();
757757
if (func !== null) {
758758
const lookup = this.parseLookup(func);
@@ -761,7 +761,7 @@ class Parser extends BaseParser {
761761
}
762762
} else if (
763763
this.token.isLeftParenthesis() &&
764-
(this.peek()?.isIdentifier() ||
764+
(this.peek()?.isIdentifierOrKeyword() ||
765765
this.peek()?.isColon() ||
766766
this.peek()?.isRightParenthesis())
767767
) {
@@ -857,7 +857,7 @@ class Parser extends BaseParser {
857857
while (true) {
858858
if (this.token.isDot()) {
859859
this.setNextToken();
860-
if (!this.token.isIdentifier() && !this.token.isKeyword()) {
860+
if (!this.token.isIdentifierOrKeyword()) {
861861
throw new Error("Expected identifier");
862862
}
863863
lookup = new Lookup();

src/tokenization/token.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,10 @@ class Token {
104104
return this._type === TokenType.IDENTIFIER || this._type === TokenType.BACKTICK_STRING;
105105
}
106106

107+
public isIdentifierOrKeyword(): boolean {
108+
return this.isIdentifier() || (this.isKeyword() && !this.isKeywordThatCannotBeIdentifier());
109+
}
110+
107111
// String token
108112

109113
public static STRING(value: string, quoteChar: string = '"'): Token {
@@ -387,6 +391,18 @@ class Token {
387391
return this._type === TokenType.KEYWORD;
388392
}
389393

394+
public isKeywordThatCannotBeIdentifier(): boolean {
395+
return (
396+
this.isKeyword() &&
397+
(this.isNull() ||
398+
this.isCase() ||
399+
this.isWhen() ||
400+
this.isThen() ||
401+
this.isElse() ||
402+
this.isEnd())
403+
);
404+
}
405+
390406
public static get WITH(): Token {
391407
return new Token(TokenType.KEYWORD, Keyword.WITH);
392408
}

0 commit comments

Comments
 (0)