From cc577831b636fcf6784eab5ede588b6cea6139d0 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Tue, 21 Apr 2026 19:28:45 -0700 Subject: [PATCH 01/31] Thompson --- src/Control/Lens/Grammar/BackusNaur.hs | 113 ++++++++++++++++++++++++- test/Properties/Kleene.hs | 21 +++++ 2 files changed, 133 insertions(+), 1 deletion(-) diff --git a/src/Control/Lens/Grammar/BackusNaur.hs b/src/Control/Lens/Grammar/BackusNaur.hs index bd47607e..5d61fa20 100644 --- a/src/Control/Lens/Grammar/BackusNaur.hs +++ b/src/Control/Lens/Grammar/BackusNaur.hs @@ -33,6 +33,10 @@ import Data.Bifunctor.Joker import Data.Coerce import Data.Foldable import Data.Function +import qualified Data.IntMap.Strict as IntMap +import Data.IntMap.Strict (IntMap) +import qualified Data.IntSet as IntSet +import Data.IntSet (IntSet) import Data.MemoTrie import qualified Data.Set as Set import Data.Set (Set) @@ -171,6 +175,111 @@ rulesNamed nameX = foldl' (flip inserter) Set.empty where inserter (nameY,y) = if nameX == nameY then Set.insert y else id +data ThompsonState token = ThompsonState (TokenClass token) IntSet + +thompsonFinalState :: Int +thompsonFinalState = 0 + +bypassStates :: Bool -> IntSet -> IntSet +bypassStates True = id +bypassStates False = const IntSet.empty + +containsNonTerminal :: RegEx token -> Bool +containsNonTerminal = \ + case + NonTerminal _ -> True + Sequence rex0 rex1 -> + containsNonTerminal rex0 || containsNonTerminal rex1 + KleeneStar rex -> containsNonTerminal rex + KleeneOpt rex -> containsNonTerminal rex + KleenePlus rex -> containsNonTerminal rex + RegExam (Alternate rex0 rex1) -> + containsNonTerminal rex0 || containsNonTerminal rex1 + _ -> False + +compileThompson + :: RegEx token + -> Int + -> IntSet + -> (IntSet, [(Int, ThompsonState token)], Int, Bool) +compileThompson = go where + go rex nextId dests = case rex of + SeqEmpty -> (IntSet.empty, [], nextId, True) + NonTerminal _ -> (IntSet.empty, [], nextId, False) + Sequence rex0 rex1 -> + let + (firsts1, states1, nextId1, bypass1) = go rex1 nextId dests + (firsts0, states0, nextId0, bypass0) = + go rex0 nextId1 (firsts1 <> bypassStates bypass1 dests) + in + ( firsts0 <> bypassStates bypass0 firsts1 + , states0 <> states1 + , nextId0 + , bypass0 && bypass1 + ) + KleeneStar rex0 -> + let + (firsts, states, nextId', _) = go rex0 nextId (firsts <> dests) + in + (firsts, states, nextId', True) + KleeneOpt rex0 -> + let + (firsts, states, nextId', _) = go rex0 nextId dests + in + (firsts, states, nextId', True) + KleenePlus rex0 -> + let + (firsts, states, nextId', bypass) = go rex0 nextId (firsts <> dests) + in + (firsts, states, nextId', bypass) + RegExam (OneOf chars) + | Set.null chars -> (IntSet.empty, [], nextId, False) + | otherwise -> + ( IntSet.singleton nextId + , [(nextId, ThompsonState (TokenClass (OneOf chars)) dests)] + , nextId + 1 + , False + ) + RegExam (NotOneOf chars catTest) -> + ( IntSet.singleton nextId + , [(nextId, ThompsonState (TokenClass (NotOneOf chars catTest)) dests)] + , nextId + 1 + , False + ) + RegExam (Alternate rex0 rex1) -> + let + (firsts1, states1, nextId1, bypass1) = go rex1 nextId dests + (firsts0, states0, nextId0, bypass0) = go rex0 nextId1 dests + in + ( firsts0 <> firsts1 + , states0 <> states1 + , nextId0 + , bypass0 || bypass1 + ) + +compileThompsonTop + :: RegEx token + -> (IntSet, IntMap (ThompsonState token)) +compileThompsonTop rex = + (firsts <> bypassStates bypass finalStates, IntMap.fromList states) + where + finalStates = IntSet.singleton thompsonFinalState + (firsts, states, _, bypass) = compileThompson rex 1 finalStates + +matchThompson :: Categorized token => [token] -> RegEx token -> Bool +matchThompson word rex = IntSet.member thompsonFinalState finalStates + where + (startStates, states) = compileThompsonTop rex + finalStates = foldl' step startStates word + step activeStates input = IntSet.foldl' advance IntSet.empty activeStates + where + advance nextStates stateId + | stateId == thompsonFinalState = nextStates + | otherwise = case IntMap.lookup stateId states of + Just (ThompsonState exam dests) + | tokenClass exam input -> nextStates <> dests + _ -> nextStates + -- instances instance (Ord rule, NonTerminalSymbol rule) => BackusNaurForm (Bnf rule) where @@ -215,6 +324,8 @@ instance (Categorized token, HasTrie token) (=~) word = δ . diffB word instance (Categorized token, HasTrie token) => Matching [token] (RegEx token) where - word =~ pattern = word =~ liftBnf0 pattern + word =~ pattern + | containsNonTerminal pattern = word =~ liftBnf0 pattern + | otherwise = matchThompson word pattern instance Matching s (APrism s t a b) where word =~ pattern = is pattern word diff --git a/test/Properties/Kleene.hs b/test/Properties/Kleene.hs index 7aa7ca3a..ab5f1c9d 100644 --- a/test/Properties/Kleene.hs +++ b/test/Properties/Kleene.hs @@ -2,6 +2,7 @@ module Properties.Kleene (kleeneProperties) where import Control.Lens.Grammar +import Data.Foldable (for_) import Test.Hspec import Test.Hspec.QuickCheck (prop) import Test.QuickCheck @@ -95,6 +96,26 @@ kleeneProperties = do prop "notAsIn cat = tokenClass (notAsIn cat)" $ \(cat :: GeneralCategory) -> (notAsIn cat :: RegEx Char) == tokenClass (notAsIn cat) + it "matching agrees with lifted Bnf on examples" $ do + let + cases = + [ ("", mempty :: RegEx Char) + , ("a", token 'a') + , ("b", token 'a') + , ("ab", token 'a' <> token 'b') + , ("a", token 'a' >|< token 'b') + , ("bbb", starK (token 'b')) + , ("bbb", plusK (token 'b')) + , ("", optK (token 'b')) + , ("x", oneOf "xyz") + , ("x", notOneOf "abc") + , ("A", asIn UppercaseLetter) + , ("a", notAsIn UppercaseLetter) + , ("abbb", token 'a' <> starK (token 'b')) + , ("cat", terminal "cat" >|< terminal "dog") + ] + for_ cases $ \(word, rex) -> + (word =~ rex) `shouldBe` (word =~ liftBnf0 rex) describe "BooleanAlgebra TokenClass" $ do it "trueB = anyToken" $ (trueB :: TokenClass Char) `shouldBe` anyToken From c928f94e976cd3f5f7063854e0b67eb1da1bd3a2 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Tue, 21 Apr 2026 20:10:20 -0700 Subject: [PATCH 02/31] mawr test --- src/Control/Lens/Grammar/BackusNaur.hs | 3 +- test/Main.hs | 116 +++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 1 deletion(-) diff --git a/src/Control/Lens/Grammar/BackusNaur.hs b/src/Control/Lens/Grammar/BackusNaur.hs index 5d61fa20..1d60d7b4 100644 --- a/src/Control/Lens/Grammar/BackusNaur.hs +++ b/src/Control/Lens/Grammar/BackusNaur.hs @@ -205,7 +205,8 @@ compileThompson compileThompson = go where go rex nextId dests = case rex of SeqEmpty -> (IntSet.empty, [], nextId, True) - NonTerminal _ -> (IntSet.empty, [], nextId, False) + NonTerminal _ -> + error "compileThompson: NonTerminal unsupported; route via diffB" Sequence rex0 rex1 -> let (firsts1, states1, nextId1, bypass1) = go rex1 nextId dests diff --git a/test/Main.hs b/test/Main.hs index 9398364f..4d371397 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -40,6 +40,7 @@ main = do describe "lenvecGrammar" $ for_ lenvecExamples $ testCtxGrammar True lenvecGrammar describe "chainGrammar" $ for_ chainExamples $ testCtxGrammar True chainGrammar describe "Parsector try rollback" tryRollbackTests + describe "Thompson matcher" thompsonMatcherTests describe "Kleene" kleeneProperties describe "meander" meanderProperties @@ -58,6 +59,121 @@ tryRollbackTests = do parsecStream actual `shouldBe` "" parsecResult actual `shouldBe` (Nothing :: Maybe String) +thompsonMatcherTests :: Spec +thompsonMatcherTests = do + let + sandwich = terminal "a" <> starK (terminal "b") <> terminal "a" + evenA = starK (sandwich >|< terminal "b") + nullableSeq = optK (terminal "a") <> terminal "b" + nestedStar = starK (starK (terminal "a")) + onePlusA = plusK (terminal "a") + altThenTerm = (terminal "a" >|< terminal "b") <> terminal "c" + notXY = notOneOf ("xy" :: String) :: RegEx Char + abcClass = oneOf ("abc" :: String) :: RegEx Char + checks :: [(String, RegEx Char, [(String, Bool)])] + checks = + [ ( "paper sandwich language" + , sandwich + , [ ("aa", True) + , ("aba", True) + , ("abbba", True) + , ("", False) + , ("a", False) + , ("ab", False) + , ("baa", False) + ] + ) + , ( "paper even-a language" + , evenA + , [ ("", True) + , ("b", True) + , ("bb", True) + , ("aa", True) + , ("abba", True) + , ("baab", True) + , ("abaaab", True) + , ("a", False) + , ("ababa", False) + ] + ) + , ( "nullable sequence wiring" + , nullableSeq + , [ ("b", True) + , ("ab", True) + , ("", False) + , ("a", False) + ] + ) + , ( "empty and empty-class handling" + , mempty + , [ ("", True) + , ("a", False) + ] + ) + , ( "failing token class" + , zeroK + , [ ("", False) + , ("a", False) + ] + ) + , ( "nested star (a*)*" + , nestedStar + , [ ("", True) + , ("a", True) + , ("aaaa", True) + , ("b", False) + , ("aab", False) + ] + ) + , ( "kleene plus a+" + , onePlusA + , [ ("a", True) + , ("aaa", True) + , ("", False) + , ("ab", False) + , ("b", False) + ] + ) + , ( "alternate then terminal (a|b)c" + , altThenTerm + , [ ("ac", True) + , ("bc", True) + , ("", False) + , ("c", False) + , ("abc", False) + , ("a", False) + ] + ) + , ( "notOneOf \"xy\"" + , notXY + , [ ("a", True) + , ("z", True) + , ("x", False) + , ("y", False) + , ("", False) + , ("ab", False) + ] + ) + , ( "oneOf \"abc\"" + , abcClass + , [ ("a", True) + , ("b", True) + , ("c", True) + , ("d", False) + , ("", False) + , ("ab", False) + ] + ) + ] + + for_ checks $ \(label, rex, cases) -> + for_ cases $ \(word, expected) -> + it (label <> " matches " <> show word) $ do + let thompsonMatch = word =~ rex + let derivativeMatch = word =~ liftBnf0 rex + thompsonMatch `shouldBe` expected + thompsonMatch `shouldBe` derivativeMatch + doctests :: IO () doctests = do let From fb01a85fcb8a1ece04d51441fd299646ced8efa8 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Wed, 22 Apr 2026 06:53:48 -0700 Subject: [PATCH 03/31] earley --- src/Control/Lens/Grammar/BackusNaur.hs | 234 ++++++++++++++++++++++++- test/Examples/Chain.hs | 2 +- test/Examples/Json.hs | 44 ++--- test/Main.hs | 104 ++++++++++- 4 files changed, 354 insertions(+), 30 deletions(-) diff --git a/src/Control/Lens/Grammar/BackusNaur.hs b/src/Control/Lens/Grammar/BackusNaur.hs index 1d60d7b4..bc465d7a 100644 --- a/src/Control/Lens/Grammar/BackusNaur.hs +++ b/src/Control/Lens/Grammar/BackusNaur.hs @@ -37,6 +37,8 @@ import qualified Data.IntMap.Strict as IntMap import Data.IntMap.Strict (IntMap) import qualified Data.IntSet as IntSet import Data.IntSet (IntSet) +import qualified Data.Map.Strict as Map +import Data.Map.Strict (Map) import Data.MemoTrie import qualified Data.Set as Set import Data.Set (Set) @@ -281,6 +283,232 @@ matchThompson word rex = IntSet.member thompsonFinalState finalStates | tokenClass exam input -> nextStates <> dests _ -> nextStates +-- | A state in the Earley-extended Thompson transducer for a `Bnf`. +-- @ESTerminal cls ds@ matches on a token class and transitions to @ds@. +-- @ESNonterm name ds@ is a call point for rule @name@; after @name@ +-- completes, control flows to @ds@. @ESEmit name@ is the final state +-- for rule @name@ and triggers completion during Earley closure. +data EarleyState token + = ESTerminal (TokenClass token) IntSet + | ESNonterm String IntSet + | ESEmit String + +data EarleyTransducer token = EarleyTransducer + { etStates :: IntMap (EarleyState token) + , etRuleEntries :: Map String (IntSet, Bool) + , etAcceptId :: Int + , etStartStates :: IntSet + } + +earleyAcceptId :: Int +earleyAcceptId = 0 + +nullableRules :: Map String [RegEx token] -> Map String Bool +nullableRules ruleMap = iter (Map.map (const False) ruleMap) + where + iter nm = + let nm' = Map.mapWithKey + (\n _ -> any (rexNullable nm) (Map.findWithDefault [] n ruleMap)) nm + in if nm == nm' then nm else iter nm' + rexNullable nm = \case + SeqEmpty -> True + NonTerminal n -> Map.findWithDefault False n nm + Sequence x y -> rexNullable nm x && rexNullable nm y + KleeneStar _ -> True + KleeneOpt _ -> True + KleenePlus x -> rexNullable nm x + RegExam (Alternate x y) -> rexNullable nm x || rexNullable nm y + RegExam (OneOf _) -> False + RegExam (NotOneOf _ _) -> False + +compileEarley + :: forall token. Bnf (RegEx token) -> EarleyTransducer token +compileEarley (Bnf start rules) = EarleyTransducer + { etStates = IntMap.fromList allStates + , etRuleEntries = Map.fromList + [ ( n + , ( Map.findWithDefault IntSet.empty n firstsMap + , Map.findWithDefault False n nullMap + ) + ) + | n <- Map.keys ruleMap + ] + , etAcceptId = earleyAcceptId + , etStartStates = startStates + } + where + ruleMap :: Map String [RegEx token] + ruleMap = foldr + (\(n, r) -> Map.insertWith (++) n [r]) Map.empty (toList rules) + nullMap = nullableRules ruleMap + ruleNames = Map.keys ruleMap + (finalMap, nextIdAfterFinals) = + foldl' alloc (Map.empty, earleyAcceptId + 1) ruleNames + where alloc (m, i) n = (Map.insert n i m, i + 1) + finalStatesList = [(finalMap Map.! n, ESEmit n) | n <- ruleNames] + (rulesStatesList, firstsMap, nextIdAfterRules) = + foldl' compileRule ([], Map.empty, nextIdAfterFinals) (Map.toList ruleMap) + where + compileRule (sts, fm, nid) (name, prods) = + let finalId = finalMap Map.! name + (newSts, newFirsts, nid') = + foldl' compileProd ([], IntSet.empty, nid) prods + compileProd (s, fs, i) prod = + let (f, st, i', _) = + goEarley nullMap prod i (IntSet.singleton finalId) + in (s <> st, fs <> f, i') + in (sts <> newSts, Map.insert name newFirsts fm, nid') + (startFirsts, startStatesRaw, _, startBypass) = + goEarley nullMap start nextIdAfterRules (IntSet.singleton earleyAcceptId) + startStates = + startFirsts <> bypassStates startBypass (IntSet.singleton earleyAcceptId) + allStates = finalStatesList <> rulesStatesList <> startStatesRaw + +goEarley + :: Map String Bool + -> RegEx token + -> Int + -> IntSet + -> (IntSet, [(Int, EarleyState token)], Int, Bool) +goEarley nullMap = go + where + go rex nextId dests = case rex of + SeqEmpty -> (IntSet.empty, [], nextId, True) + NonTerminal name -> + ( IntSet.singleton nextId + , [(nextId, ESNonterm name dests)] + , nextId + 1 + , Map.findWithDefault False name nullMap + ) + Sequence rex0 rex1 -> + let + (firsts1, states1, nextId1, bypass1) = go rex1 nextId dests + (firsts0, states0, nextId0, bypass0) = + go rex0 nextId1 (firsts1 <> bypassStates bypass1 dests) + in + ( firsts0 <> bypassStates bypass0 firsts1 + , states0 <> states1 + , nextId0 + , bypass0 && bypass1 + ) + KleeneStar rex0 -> + let + (firsts, states, nextId', _) = go rex0 nextId (firsts <> dests) + in + (firsts, states, nextId', True) + KleeneOpt rex0 -> + let + (firsts, states, nextId', _) = go rex0 nextId dests + in + (firsts, states, nextId', True) + KleenePlus rex0 -> + let + (firsts, states, nextId', bypass) = go rex0 nextId (firsts <> dests) + in + (firsts, states, nextId', bypass) + RegExam (OneOf chars) + | Set.null chars -> (IntSet.empty, [], nextId, False) + | otherwise -> + ( IntSet.singleton nextId + , [(nextId, ESTerminal (TokenClass (OneOf chars)) dests)] + , nextId + 1 + , False + ) + RegExam (NotOneOf chars catTest) -> + ( IntSet.singleton nextId + , [(nextId, ESTerminal (TokenClass (NotOneOf chars catTest)) dests)] + , nextId + 1 + , False + ) + RegExam (Alternate rex0 rex1) -> + let + (firsts1, states1, nextId1, bypass1) = go rex1 nextId dests + (firsts0, states0, nextId0, bypass0) = go rex0 nextId1 dests + in + ( firsts0 <> firsts1 + , states0 <> states1 + , nextId0 + , bypass0 || bypass1 + ) + +matchEarley + :: Categorized token => [token] -> EarleyTransducer token -> Bool +matchEarley word et = IntSet.member 0 acceptOrigins + where + initialE0 = IntMap.fromList + [ (s, IntSet.singleton 0) | s <- IntSet.toList (etStartStates et) ] + sets0 = IntMap.singleton 0 initialE0 + sets0closed = closureAt et 0 sets0 + (finalSets, n) = runInput 0 sets0closed word + runInput j ss [] = (ss, j) + runInput j ss (x : xs) = + let scanned = scanFrom et j x ss + ss' = IntMap.insert (j + 1) scanned ss + closed = closureAt et (j + 1) ss' + in runInput (j + 1) closed xs + en = IntMap.findWithDefault IntMap.empty n finalSets + acceptOrigins = IntMap.findWithDefault IntSet.empty (etAcceptId et) en + +scanFrom + :: Categorized token + => EarleyTransducer token -> Int -> token + -> IntMap (IntMap IntSet) -> IntMap IntSet +scanFrom et j input ss = IntMap.foldrWithKey advance IntMap.empty e_j + where + e_j = IntMap.findWithDefault IntMap.empty j ss + advance s origs acc = case IntMap.lookup s (etStates et) of + Just (ESTerminal cls ds) | tokenClass cls input -> + IntSet.foldr + (\d -> IntMap.insertWith IntSet.union d origs) acc ds + _ -> acc + +closureAt + :: EarleyTransducer token -> Int + -> IntMap (IntMap IntSet) -> IntMap (IntMap IntSet) +closureAt et j initialSets = loop initialWork initialSets + where + initialE = IntMap.findWithDefault IntMap.empty j initialSets + initialWork = + [ (s, i) | (s, os) <- IntMap.toList initialE, i <- IntSet.toList os ] + loop [] ss = ss + loop ((s, i) : rest) ss = case IntMap.lookup s (etStates et) of + Just (ESNonterm name ds) -> + let (firsts, isNull) = Map.findWithDefault + (IntSet.empty, False) name (etRuleEntries et) + predItems = [(f, j) | f <- IntSet.toList firsts] + nullItems = + if isNull then [(d, i) | d <- IntSet.toList ds] else [] + (ss', new) = addEarleyItems j (predItems <> nullItems) ss + in loop (new <> rest) ss' + Just (ESEmit name) -> + let e_i = IntMap.findWithDefault IntMap.empty i ss + completions = + [ (d, i') + | (t, os) <- IntMap.toList e_i + , Just (ESNonterm n ds) <- [IntMap.lookup t (etStates et)] + , n == name + , i' <- IntSet.toList os + , d <- IntSet.toList ds + ] + (ss', new) = addEarleyItems j completions ss + in loop (new <> rest) ss' + _ -> loop rest ss + +addEarleyItems + :: Int -> [(Int, Int)] -> IntMap (IntMap IntSet) + -> (IntMap (IntMap IntSet), [(Int, Int)]) +addEarleyItems j items ss = foldl' ins (ss, []) items + where + ins (acc, new) (state, origin) = + let e_j = IntMap.findWithDefault IntMap.empty j acc + os = IntMap.findWithDefault IntSet.empty state e_j + in if IntSet.member origin os + then (acc, new) + else + let e_j' = IntMap.insert state (IntSet.insert origin os) e_j + acc' = IntMap.insert j e_j' acc + in (acc', (state, origin) : new) + -- instances instance (Ord rule, NonTerminalSymbol rule) => BackusNaurForm (Bnf rule) where @@ -320,10 +548,10 @@ instance (Ord rule, Monoid rule) => Monoid (Bnf rule) where mempty = liftBnf0 mempty instance (Ord rule, Semigroup rule) => Semigroup (Bnf rule) where (<>) = liftBnf2 (<>) -instance (Categorized token, HasTrie token) +instance Categorized token => Matching [token] (Bnf (RegEx token)) where - (=~) word = δ . diffB word -instance (Categorized token, HasTrie token) + word =~ bnf = matchEarley word (compileEarley bnf) +instance Categorized token => Matching [token] (RegEx token) where word =~ pattern | containsNonTerminal pattern = word =~ liftBnf0 pattern diff --git a/test/Examples/Chain.hs b/test/Examples/Chain.hs index 5d0f964e..1b54285a 100644 --- a/test/Examples/Chain.hs +++ b/test/Examples/Chain.hs @@ -16,7 +16,7 @@ data Chain makePrisms ''Chain -chainGrammar :: CtxGrammar Char Chain +chainGrammar :: Grammar Char Chain chainGrammar = ruleRec "chain" seqG where seqG chn = rule "seq" $ diff --git a/test/Examples/Json.hs b/test/Examples/Json.hs index e3aec526..71305d6c 100644 --- a/test/Examples/Json.hs +++ b/test/Examples/Json.hs @@ -5,7 +5,7 @@ module Examples.Json ) where import Control.Applicative -import Control.Lens +import Control.Lens hiding (element) import Control.Lens.Grammar import qualified Data.Map.Strict as Map import Data.Map.Strict (Map) @@ -24,50 +24,54 @@ data Json -- Generate prisms makePrisms ''Json --- | JSON grammar following the McKeeman Form specification from json.org +-- | JSON grammar following the McKeeman Form specification from json.org. +-- The inner rules are mutually recursive: element ↔ value ↔ array ↔ +-- elements ↔ element, and element ↔ value ↔ object ↔ members ↔ member +-- ↔ element. Only a rule bound via `ruleRec` produces a stub that +-- breaks the cycle; a plain `rule` invocation forces its body, so +-- every cyclic back-edge must instead use the `element` stub produced +-- by the inner `ruleRec "element"` below. jsonGrammar :: Grammar Char Json -jsonGrammar = ruleRec "json" elementG +jsonGrammar = ruleRec "json" $ \_json -> + ruleRec "element" $ \element -> + ws >* valueG element *< ws where - -- element = ws value ws - elementG json = rule "element" $ - ws >* valueG json *< ws - -- value = object | array | string | number | "true" | "false" | "null" - valueG json = rule "value" $ choice + valueG element = rule "value" $ choice [ _JNull >? terminal "null" , _JBool . only True >? terminal "true" , _JBool . only False >? terminal "false" , _JNumber >? numberG , _JString >? stringG - , _JArray >? arrayG json - , _JObject >? objectG json + , _JArray >? arrayG element + , _JObject >? objectG element ] -- object = '{' ws '}' | '{' members '}' - objectG json = rule "object" $ choice + objectG element = rule "object" $ choice [ only Map.empty >? terminal "{" >* ws >* terminal "}" , iso Map.toList Map.fromList >~ - terminal "{" >* membersG json *< terminal "}" + terminal "{" >* membersG element *< terminal "}" ] -- members = member | member ',' members - membersG json = rule "members" $ - several1 (sepWith ",") (memberG json) + membersG element = rule "members" $ + several1 (sepWith ",") (memberG element) -- member = ws string ws ':' element - memberG json = rule "member" $ - ws >* stringG *< ws *< terminal ":" >*< elementG json + memberG element = rule "member" $ + ws >* stringG *< ws *< terminal ":" >*< element -- array = '[' ws ']' | '[' elements ']' - arrayG json = rule "array" $ choice + arrayG element = rule "array" $ choice [ only [] >? terminal "[" >* ws >* terminal "]" - , terminal "[" >* elementsG json *< terminal "]" + , terminal "[" >* elementsG element *< terminal "]" ] -- elements = element | element ',' elements - elementsG json = rule "elements" $ - several1 (sepWith ",") (elementG json) + elementsG element = rule "elements" $ + several1 (sepWith ",") element -- string = '"' characters '"' stringG = rule "string" $ diff --git a/test/Main.hs b/test/Main.hs index 4d371397..39c9f1cf 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -31,16 +31,17 @@ main = do describe "doctest" $ it "should run haddock examples" doctests describe "regexGrammar" $ for_ regexExamples $ testGrammar False regexGrammar - describe "semverGrammar" $ for_ semverExamples $ testCtxGrammar True semverGrammar + describe "semverGrammar" $ for_ semverExamples $ testGrammar True semverGrammar describe "semverCtxGrammar" $ for_ semverExamples $ testCtxGrammar True semverCtxGrammar describe "arithGrammar" $ for_ arithExamples $ testGrammar True arithGrammar describe "jsonGrammar" $ for_ jsonExamples $ testCtxGrammar False jsonGrammar - describe "sexprGrammar" $ for_ sexprExamples $ testCtxGrammar True sexprGrammar - describe "lambdaGrammar" $ for_ lambdaExamples $ testCtxGrammar True lambdaGrammar + describe "sexprGrammar" $ for_ sexprExamples $ testGrammar True sexprGrammar + describe "lambdaGrammar" $ for_ lambdaExamples $ testGrammar True lambdaGrammar describe "lenvecGrammar" $ for_ lenvecExamples $ testCtxGrammar True lenvecGrammar - describe "chainGrammar" $ for_ chainExamples $ testCtxGrammar True chainGrammar + describe "chainGrammar" $ for_ chainExamples $ testGrammar True chainGrammar describe "Parsector try rollback" tryRollbackTests describe "Thompson matcher" thompsonMatcherTests + describe "Earley matcher" earleyMatcherTests describe "Kleene" kleeneProperties describe "meander" meanderProperties @@ -170,9 +171,100 @@ thompsonMatcherTests = do for_ cases $ \(word, expected) -> it (label <> " matches " <> show word) $ do let thompsonMatch = word =~ rex - let derivativeMatch = word =~ liftBnf0 rex + let earleyMatch = word =~ liftBnf0 rex thompsonMatch `shouldBe` expected - thompsonMatch `shouldBe` derivativeMatch + thompsonMatch `shouldBe` earleyMatch + +earleyMatcherTests :: Spec +earleyMatcherTests = do + let + parens :: Bnf (RegEx Char) + parens = ruleRec "S" $ \s -> + mempty >|< terminal "(" <> s <> terminal ")" + -- S = A B; A = "a"; B = "b" + ab :: Bnf (RegEx Char) + ab = rule "A" (terminal "a") + <> rule "B" (terminal "b") + -- Left-recursive: A = A 'x' | 'x' + leftRec :: Bnf (RegEx Char) + leftRec = ruleRec "A" $ \a -> + a <> terminal "x" >|< terminal "x" + -- Mutual recursion with nullable bridges. + -- S = A B; A = 'a'?; B = 'b' + mutualNull :: Bnf (RegEx Char) + mutualNull = rule "A" (optK (terminal "a")) + <> rule "B" (terminal "b") + -- Ambiguous palindromes over 'a','b': + -- P = 'a' P 'a' | 'b' P 'b' | 'a' | 'b' | ε + palindrome :: Bnf (RegEx Char) + palindrome = ruleRec "P" $ \p -> + terminal "a" <> p <> terminal "a" + >|< terminal "b" <> p <> terminal "b" + >|< terminal "a" + >|< terminal "b" + >|< mempty + checks :: [(String, Bnf (RegEx Char), [(String, Bool)])] + checks = + [ ( "balanced parens S = (S) | ε" + , parens + , [ ("", True) + , ("()", True) + , ("(())", True) + , ("((()))", True) + , ("(", False) + , (")", False) + , ("(()", False) + , ("())", False) + ] + ) + , ( "two-rule concat A B" + , ab + , [ ("ab", True) + , ("", False) + , ("a", False) + , ("b", False) + , ("ba", False) + , ("abc", False) + ] + ) + , ( "left recursion A = A x | x" + , leftRec + , [ ("x", True) + , ("xx", True) + , ("xxxx", True) + , ("", False) + , ("y", False) + , ("xy", False) + ] + ) + , ( "nullable nonterminal A?B" + , mutualNull + , [ ("b", True) + , ("ab", True) + , ("", False) + , ("a", False) + , ("bb", False) + ] + ) + , ( "ambiguous palindromes" + , palindrome + , [ ("", True) + , ("a", True) + , ("b", True) + , ("aa", True) + , ("aba", True) + , ("abba", True) + , ("abaaba", True) + , ("ab", False) + , ("abab", False) + , ("abca", False) + ] + ) + ] + for_ checks $ \(label, bnf, cases) -> + for_ cases $ \(word, expected) -> + it (label <> " matches " <> show word) $ + (word =~ bnf) `shouldBe` expected doctests :: IO () doctests = do From c071f0f38fcc27c898c9cf8a7e4f9b822d12a754 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Wed, 22 Apr 2026 06:59:01 -0700 Subject: [PATCH 04/31] Update Json.hs --- test/Examples/Json.hs | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/test/Examples/Json.hs b/test/Examples/Json.hs index 71305d6c..3cb39591 100644 --- a/test/Examples/Json.hs +++ b/test/Examples/Json.hs @@ -32,46 +32,47 @@ makePrisms ''Json -- every cyclic back-edge must instead use the `element` stub produced -- by the inner `ruleRec "element"` below. jsonGrammar :: Grammar Char Json -jsonGrammar = ruleRec "json" $ \_json -> - ruleRec "element" $ \element -> - ws >* valueG element *< ws +jsonGrammar = rule "json" elementG where + elementG = ruleRec "element" $ \json -> + ws >* valueG json *< ws + -- value = object | array | string | number | "true" | "false" | "null" - valueG element = rule "value" $ choice + valueG json = rule "value" $ choice [ _JNull >? terminal "null" , _JBool . only True >? terminal "true" , _JBool . only False >? terminal "false" , _JNumber >? numberG , _JString >? stringG - , _JArray >? arrayG element - , _JObject >? objectG element + , _JArray >? arrayG json + , _JObject >? objectG json ] -- object = '{' ws '}' | '{' members '}' - objectG element = rule "object" $ choice + objectG json = rule "object" $ choice [ only Map.empty >? terminal "{" >* ws >* terminal "}" , iso Map.toList Map.fromList >~ - terminal "{" >* membersG element *< terminal "}" + terminal "{" >* membersG json *< terminal "}" ] -- members = member | member ',' members - membersG element = rule "members" $ - several1 (sepWith ",") (memberG element) + membersG json = rule "members" $ + several1 (sepWith ",") (memberG json) -- member = ws string ws ':' element - memberG element = rule "member" $ - ws >* stringG *< ws *< terminal ":" >*< element + memberG json = rule "member" $ + ws >* stringG *< ws *< terminal ":" >*< json -- array = '[' ws ']' | '[' elements ']' - arrayG element = rule "array" $ choice + arrayG json = rule "array" $ choice [ only [] >? terminal "[" >* ws >* terminal "]" - , terminal "[" >* elementsG element *< terminal "]" + , terminal "[" >* elementsG json *< terminal "]" ] -- elements = element | element ',' elements - elementsG element = rule "elements" $ - several1 (sepWith ",") element + elementsG json = rule "elements" $ + several1 (sepWith ",") json -- string = '"' characters '"' stringG = rule "string" $ From a50c662e7faf7e71e891a79f3db3c3d467b452e1 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Wed, 22 Apr 2026 06:59:24 -0700 Subject: [PATCH 05/31] Update Json.hs --- test/Examples/Json.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Examples/Json.hs b/test/Examples/Json.hs index 3cb39591..eb1b24d6 100644 --- a/test/Examples/Json.hs +++ b/test/Examples/Json.hs @@ -5,7 +5,7 @@ module Examples.Json ) where import Control.Applicative -import Control.Lens hiding (element) +import Control.Lens import Control.Lens.Grammar import qualified Data.Map.Strict as Map import Data.Map.Strict (Map) From ed89f88a6ed87593218153e2f8721659182392f0 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Wed, 22 Apr 2026 07:44:24 -0700 Subject: [PATCH 06/31] cleanup --- src/Control/Lens/Grammar/BackusNaur.hs | 452 ++++++++++--------------- test/Main.hs | 208 ------------ 2 files changed, 170 insertions(+), 490 deletions(-) diff --git a/src/Control/Lens/Grammar/BackusNaur.hs b/src/Control/Lens/Grammar/BackusNaur.hs index bc465d7a..c5868f31 100644 --- a/src/Control/Lens/Grammar/BackusNaur.hs +++ b/src/Control/Lens/Grammar/BackusNaur.hs @@ -177,155 +177,27 @@ rulesNamed nameX = foldl' (flip inserter) Set.empty where inserter (nameY,y) = if nameX == nameY then Set.insert y else id -data ThompsonState token = ThompsonState (TokenClass token) IntSet - -thompsonFinalState :: Int -thompsonFinalState = 0 - -bypassStates :: Bool -> IntSet -> IntSet -bypassStates True = id -bypassStates False = const IntSet.empty - -containsNonTerminal :: RegEx token -> Bool -containsNonTerminal = \ - case - NonTerminal _ -> True - Sequence rex0 rex1 -> - containsNonTerminal rex0 || containsNonTerminal rex1 - KleeneStar rex -> containsNonTerminal rex - KleeneOpt rex -> containsNonTerminal rex - KleenePlus rex -> containsNonTerminal rex - RegExam (Alternate rex0 rex1) -> - containsNonTerminal rex0 || containsNonTerminal rex1 - _ -> False - -compileThompson - :: RegEx token - -> Int - -> IntSet - -> (IntSet, [(Int, ThompsonState token)], Int, Bool) -compileThompson = go where - go rex nextId dests = case rex of - SeqEmpty -> (IntSet.empty, [], nextId, True) - NonTerminal _ -> - error "compileThompson: NonTerminal unsupported; route via diffB" - Sequence rex0 rex1 -> - let - (firsts1, states1, nextId1, bypass1) = go rex1 nextId dests - (firsts0, states0, nextId0, bypass0) = - go rex0 nextId1 (firsts1 <> bypassStates bypass1 dests) - in - ( firsts0 <> bypassStates bypass0 firsts1 - , states0 <> states1 - , nextId0 - , bypass0 && bypass1 - ) - KleeneStar rex0 -> - let - (firsts, states, nextId', _) = go rex0 nextId (firsts <> dests) - in - (firsts, states, nextId', True) - KleeneOpt rex0 -> - let - (firsts, states, nextId', _) = go rex0 nextId dests - in - (firsts, states, nextId', True) - KleenePlus rex0 -> - let - (firsts, states, nextId', bypass) = go rex0 nextId (firsts <> dests) - in - (firsts, states, nextId', bypass) - RegExam (OneOf chars) - | Set.null chars -> (IntSet.empty, [], nextId, False) - | otherwise -> - ( IntSet.singleton nextId - , [(nextId, ThompsonState (TokenClass (OneOf chars)) dests)] - , nextId + 1 - , False - ) - RegExam (NotOneOf chars catTest) -> - ( IntSet.singleton nextId - , [(nextId, ThompsonState (TokenClass (NotOneOf chars catTest)) dests)] - , nextId + 1 - , False - ) - RegExam (Alternate rex0 rex1) -> - let - (firsts1, states1, nextId1, bypass1) = go rex1 nextId dests - (firsts0, states0, nextId0, bypass0) = go rex0 nextId1 dests - in - ( firsts0 <> firsts1 - , states0 <> states1 - , nextId0 - , bypass0 || bypass1 - ) - -compileThompsonTop - :: RegEx token - -> (IntSet, IntMap (ThompsonState token)) -compileThompsonTop rex = - (firsts <> bypassStates bypass finalStates, IntMap.fromList states) - where - finalStates = IntSet.singleton thompsonFinalState - (firsts, states, _, bypass) = compileThompson rex 1 finalStates - -matchThompson :: Categorized token => [token] -> RegEx token -> Bool -matchThompson word rex = IntSet.member thompsonFinalState finalStates - where - (startStates, states) = compileThompsonTop rex - finalStates = foldl' step startStates word - step activeStates input = IntSet.foldl' advance IntSet.empty activeStates - where - advance nextStates stateId - | stateId == thompsonFinalState = nextStates - | otherwise = case IntMap.lookup stateId states of - Just (ThompsonState exam dests) - | tokenClass exam input -> nextStates <> dests - _ -> nextStates - -- | A state in the Earley-extended Thompson transducer for a `Bnf`. --- @ESTerminal cls ds@ matches on a token class and transitions to @ds@. --- @ESNonterm name ds@ is a call point for rule @name@; after @name@ --- completes, control flows to @ds@. @ESEmit name@ is the final state +-- @EarleyTerminal cls ds@ matches on a token class and transitions to @ds@. +-- @EarleyNonTerminal name ds@ is a call point for rule @name@; after @name@ +-- completes, control flows to @ds@. @EarleyEmit name@ is the final state -- for rule @name@ and triggers completion during Earley closure. data EarleyState token - = ESTerminal (TokenClass token) IntSet - | ESNonterm String IntSet - | ESEmit String + = EarleyTerminal (TokenClass token) IntSet + | EarleyNonTerminal String IntSet + | EarleyEmit String data EarleyTransducer token = EarleyTransducer - { etStates :: IntMap (EarleyState token) - , etRuleEntries :: Map String (IntSet, Bool) - , etAcceptId :: Int - , etStartStates :: IntSet + { earleyStates :: IntMap (EarleyState token) + , earleyRules :: Map String (IntSet, Bool) + , earleyAcceptId :: Int + , earleyStartStates :: IntSet } -earleyAcceptId :: Int -earleyAcceptId = 0 - -nullableRules :: Map String [RegEx token] -> Map String Bool -nullableRules ruleMap = iter (Map.map (const False) ruleMap) - where - iter nm = - let nm' = Map.mapWithKey - (\n _ -> any (rexNullable nm) (Map.findWithDefault [] n ruleMap)) nm - in if nm == nm' then nm else iter nm' - rexNullable nm = \case - SeqEmpty -> True - NonTerminal n -> Map.findWithDefault False n nm - Sequence x y -> rexNullable nm x && rexNullable nm y - KleeneStar _ -> True - KleeneOpt _ -> True - KleenePlus x -> rexNullable nm x - RegExam (Alternate x y) -> rexNullable nm x || rexNullable nm y - RegExam (OneOf _) -> False - RegExam (NotOneOf _ _) -> False - -compileEarley - :: forall token. Bnf (RegEx token) -> EarleyTransducer token +compileEarley :: Bnf (RegEx token) -> EarleyTransducer token compileEarley (Bnf start rules) = EarleyTransducer - { etStates = IntMap.fromList allStates - , etRuleEntries = Map.fromList + { earleyStates = IntMap.fromList allStates + , earleyRules = Map.fromList [ ( n , ( Map.findWithDefault IntSet.empty n firstsMap , Map.findWithDefault False n nullMap @@ -333,19 +205,43 @@ compileEarley (Bnf start rules) = EarleyTransducer ) | n <- Map.keys ruleMap ] - , etAcceptId = earleyAcceptId - , etStartStates = startStates + , earleyAcceptId = earleyAcceptId0 + , earleyStartStates = startStates } + where - ruleMap :: Map String [RegEx token] + ruleMap = foldr (\(n, r) -> Map.insertWith (++) n [r]) Map.empty (toList rules) - nullMap = nullableRules ruleMap + + rexNullable nm = \case + SeqEmpty -> True + NonTerminal n -> Map.findWithDefault False n nm + Sequence x y -> rexNullable nm x && rexNullable nm y + KleeneStar _ -> True + KleeneOpt _ -> True + KleenePlus x -> rexNullable nm x + RegExam (Alternate x y) -> rexNullable nm x || rexNullable nm y + RegExam (OneOf _) -> False + RegExam (NotOneOf _ _) -> False + + iterNull nm = + let nm' = Map.mapWithKey + (\n _ -> any (rexNullable nm) (Map.findWithDefault [] n ruleMap)) nm + in if nm == nm' then nm else iterNull nm' + + nullMap = iterNull (Map.map (const False) ruleMap) + ruleNames = Map.keys ruleMap + + earleyAcceptId0 = 0 + (finalMap, nextIdAfterFinals) = - foldl' alloc (Map.empty, earleyAcceptId + 1) ruleNames + foldl' alloc (Map.empty, earleyAcceptId0 + 1) ruleNames where alloc (m, i) n = (Map.insert n i m, i + 1) - finalStatesList = [(finalMap Map.! n, ESEmit n) | n <- ruleNames] + + finalStatesList = [(finalMap Map.! n, EarleyEmit n) | n <- ruleNames] + (rulesStatesList, firstsMap, nextIdAfterRules) = foldl' compileRule ([], Map.empty, nextIdAfterFinals) (Map.toList ruleMap) where @@ -355,159 +251,153 @@ compileEarley (Bnf start rules) = EarleyTransducer foldl' compileProd ([], IntSet.empty, nid) prods compileProd (s, fs, i) prod = let (f, st, i', _) = - goEarley nullMap prod i (IntSet.singleton finalId) + goEarley prod i (IntSet.singleton finalId) in (s <> st, fs <> f, i') in (sts <> newSts, Map.insert name newFirsts fm, nid') + (startFirsts, startStatesRaw, _, startBypass) = - goEarley nullMap start nextIdAfterRules (IntSet.singleton earleyAcceptId) + goEarley start nextIdAfterRules (IntSet.singleton earleyAcceptId0) + startStates = - startFirsts <> bypassStates startBypass (IntSet.singleton earleyAcceptId) + startFirsts <> bypassStates startBypass (IntSet.singleton earleyAcceptId0) + allStates = finalStatesList <> rulesStatesList <> startStatesRaw -goEarley - :: Map String Bool - -> RegEx token - -> Int - -> IntSet - -> (IntSet, [(Int, EarleyState token)], Int, Bool) -goEarley nullMap = go - where - go rex nextId dests = case rex of - SeqEmpty -> (IntSet.empty, [], nextId, True) - NonTerminal name -> - ( IntSet.singleton nextId - , [(nextId, ESNonterm name dests)] - , nextId + 1 - , Map.findWithDefault False name nullMap - ) - Sequence rex0 rex1 -> - let - (firsts1, states1, nextId1, bypass1) = go rex1 nextId dests - (firsts0, states0, nextId0, bypass0) = - go rex0 nextId1 (firsts1 <> bypassStates bypass1 dests) - in - ( firsts0 <> bypassStates bypass0 firsts1 - , states0 <> states1 - , nextId0 - , bypass0 && bypass1 - ) - KleeneStar rex0 -> - let - (firsts, states, nextId', _) = go rex0 nextId (firsts <> dests) - in - (firsts, states, nextId', True) - KleeneOpt rex0 -> - let - (firsts, states, nextId', _) = go rex0 nextId dests - in - (firsts, states, nextId', True) - KleenePlus rex0 -> - let - (firsts, states, nextId', bypass) = go rex0 nextId (firsts <> dests) - in - (firsts, states, nextId', bypass) - RegExam (OneOf chars) - | Set.null chars -> (IntSet.empty, [], nextId, False) - | otherwise -> + bypassStates True = id + bypassStates False = const IntSet.empty + + goEarley rex nextId dests = case rex of + SeqEmpty -> (IntSet.empty, [], nextId, True) + NonTerminal name -> ( IntSet.singleton nextId - , [(nextId, ESTerminal (TokenClass (OneOf chars)) dests)] + , [(nextId, EarleyNonTerminal name dests)] + , nextId + 1 + , Map.findWithDefault False name nullMap + ) + Sequence rex0 rex1 -> + let + (firsts1, states1, nextId1, bypass1) = goEarley rex1 nextId dests + (firsts0, states0, nextId0, bypass0) = + goEarley rex0 nextId1 (firsts1 <> bypassStates bypass1 dests) + in + ( firsts0 <> bypassStates bypass0 firsts1 + , states0 <> states1 + , nextId0 + , bypass0 && bypass1 + ) + KleeneStar rex0 -> + let + (firsts, states, nextId', _) = goEarley rex0 nextId (firsts <> dests) + in + (firsts, states, nextId', True) + KleeneOpt rex0 -> + let + (firsts, states, nextId', _) = goEarley rex0 nextId dests + in + (firsts, states, nextId', True) + KleenePlus rex0 -> + let + (firsts, states, nextId', bypass) = goEarley rex0 nextId (firsts <> dests) + in + (firsts, states, nextId', bypass) + RegExam (OneOf chars) + | Set.null chars -> (IntSet.empty, [], nextId, False) + | otherwise -> + ( IntSet.singleton nextId + , [(nextId, EarleyTerminal (TokenClass (OneOf chars)) dests)] + , nextId + 1 + , False + ) + RegExam (NotOneOf chars catTest) -> + ( IntSet.singleton nextId + , [(nextId, EarleyTerminal (TokenClass (NotOneOf chars catTest)) dests)] , nextId + 1 , False ) - RegExam (NotOneOf chars catTest) -> - ( IntSet.singleton nextId - , [(nextId, ESTerminal (TokenClass (NotOneOf chars catTest)) dests)] - , nextId + 1 - , False - ) - RegExam (Alternate rex0 rex1) -> - let - (firsts1, states1, nextId1, bypass1) = go rex1 nextId dests - (firsts0, states0, nextId0, bypass0) = go rex0 nextId1 dests - in - ( firsts0 <> firsts1 - , states0 <> states1 - , nextId0 - , bypass0 || bypass1 - ) - -matchEarley - :: Categorized token => [token] -> EarleyTransducer token -> Bool + RegExam (Alternate rex0 rex1) -> + let + (firsts1, states1, nextId1, bypass1) = goEarley rex1 nextId dests + (firsts0, states0, nextId0, bypass0) = goEarley rex0 nextId1 dests + in + ( firsts0 <> firsts1 + , states0 <> states1 + , nextId0 + , bypass0 || bypass1 + ) + +matchEarley :: Categorized token => [token] -> EarleyTransducer token -> Bool matchEarley word et = IntSet.member 0 acceptOrigins where + initialE0 = IntMap.fromList - [ (s, IntSet.singleton 0) | s <- IntSet.toList (etStartStates et) ] + [ (s, IntSet.singleton 0) | s <- IntSet.toList (earleyStartStates et) ] + sets0 = IntMap.singleton 0 initialE0 - sets0closed = closureAt et 0 sets0 + + sets0closed = closureAt 0 sets0 + (finalSets, n) = runInput 0 sets0closed word + runInput j ss [] = (ss, j) runInput j ss (x : xs) = - let scanned = scanFrom et j x ss + let scanned = scanFrom j x ss ss' = IntMap.insert (j + 1) scanned ss - closed = closureAt et (j + 1) ss' + closed = closureAt (j + 1) ss' in runInput (j + 1) closed xs + en = IntMap.findWithDefault IntMap.empty n finalSets - acceptOrigins = IntMap.findWithDefault IntSet.empty (etAcceptId et) en -scanFrom - :: Categorized token - => EarleyTransducer token -> Int -> token - -> IntMap (IntMap IntSet) -> IntMap IntSet -scanFrom et j input ss = IntMap.foldrWithKey advance IntMap.empty e_j - where - e_j = IntMap.findWithDefault IntMap.empty j ss - advance s origs acc = case IntMap.lookup s (etStates et) of - Just (ESTerminal cls ds) | tokenClass cls input -> - IntSet.foldr - (\d -> IntMap.insertWith IntSet.union d origs) acc ds - _ -> acc - -closureAt - :: EarleyTransducer token -> Int - -> IntMap (IntMap IntSet) -> IntMap (IntMap IntSet) -closureAt et j initialSets = loop initialWork initialSets - where - initialE = IntMap.findWithDefault IntMap.empty j initialSets - initialWork = - [ (s, i) | (s, os) <- IntMap.toList initialE, i <- IntSet.toList os ] - loop [] ss = ss - loop ((s, i) : rest) ss = case IntMap.lookup s (etStates et) of - Just (ESNonterm name ds) -> - let (firsts, isNull) = Map.findWithDefault - (IntSet.empty, False) name (etRuleEntries et) - predItems = [(f, j) | f <- IntSet.toList firsts] - nullItems = - if isNull then [(d, i) | d <- IntSet.toList ds] else [] - (ss', new) = addEarleyItems j (predItems <> nullItems) ss - in loop (new <> rest) ss' - Just (ESEmit name) -> - let e_i = IntMap.findWithDefault IntMap.empty i ss - completions = - [ (d, i') - | (t, os) <- IntMap.toList e_i - , Just (ESNonterm n ds) <- [IntMap.lookup t (etStates et)] - , n == name - , i' <- IntSet.toList os - , d <- IntSet.toList ds - ] - (ss', new) = addEarleyItems j completions ss - in loop (new <> rest) ss' - _ -> loop rest ss - -addEarleyItems - :: Int -> [(Int, Int)] -> IntMap (IntMap IntSet) - -> (IntMap (IntMap IntSet), [(Int, Int)]) -addEarleyItems j items ss = foldl' ins (ss, []) items - where - ins (acc, new) (state, origin) = - let e_j = IntMap.findWithDefault IntMap.empty j acc - os = IntMap.findWithDefault IntSet.empty state e_j - in if IntSet.member origin os - then (acc, new) - else - let e_j' = IntMap.insert state (IntSet.insert origin os) e_j - acc' = IntMap.insert j e_j' acc - in (acc', (state, origin) : new) + acceptOrigins = IntMap.findWithDefault IntSet.empty (earleyAcceptId et) en + + scanFrom j input ss = IntMap.foldrWithKey advance IntMap.empty e_j + where + e_j = IntMap.findWithDefault IntMap.empty j ss + advance s origs acc = case IntMap.lookup s (earleyStates et) of + Just (EarleyTerminal cls ds) | tokenClass cls input -> + IntSet.foldr + (\d -> IntMap.insertWith IntSet.union d origs) acc ds + _ -> acc + + closureAt j initialSets = loop initialWork initialSets + where + initialE = IntMap.findWithDefault IntMap.empty j initialSets + initialWork = + [ (s, i) | (s, os) <- IntMap.toList initialE, i <- IntSet.toList os ] + loop [] ss = ss + loop ((s, i) : rest) ss = case IntMap.lookup s (earleyStates et) of + Just (EarleyNonTerminal name ds) -> + let (firsts, isNull) = Map.findWithDefault + (IntSet.empty, False) name (earleyRules et) + predItems = [(f, j) | f <- IntSet.toList firsts] + nullItems = + if isNull then [(d, i) | d <- IntSet.toList ds] else [] + (ss', new) = addEarleyItems j (predItems <> nullItems) ss + in loop (new <> rest) ss' + Just (EarleyEmit name) -> + let e_i = IntMap.findWithDefault IntMap.empty i ss + completions = + [ (d, i') + | (t, os) <- IntMap.toList e_i + , Just (EarleyNonTerminal n' ds) <- [IntMap.lookup t (earleyStates et)] + , n' == name + , i' <- IntSet.toList os + , d <- IntSet.toList ds + ] + (ss', new) = addEarleyItems j completions ss + in loop (new <> rest) ss' + _ -> loop rest ss + + addEarleyItems j items ss = foldl' ins (ss, []) items + where + ins (acc, new) (state, origin) = + let e_j = IntMap.findWithDefault IntMap.empty j acc + os = IntMap.findWithDefault IntSet.empty state e_j + in if IntSet.member origin os + then (acc, new) + else + let e_j' = IntMap.insert state (IntSet.insert origin os) e_j + acc' = IntMap.insert j e_j' acc + in (acc', (state, origin) : new) -- instances instance (Ord rule, NonTerminalSymbol rule) @@ -553,8 +443,6 @@ instance Categorized token word =~ bnf = matchEarley word (compileEarley bnf) instance Categorized token => Matching [token] (RegEx token) where - word =~ pattern - | containsNonTerminal pattern = word =~ liftBnf0 pattern - | otherwise = matchThompson word pattern + word =~ pattern = word =~ liftBnf0 pattern instance Matching s (APrism s t a b) where word =~ pattern = is pattern word diff --git a/test/Main.hs b/test/Main.hs index 39c9f1cf..140b217a 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -40,8 +40,6 @@ main = do describe "lenvecGrammar" $ for_ lenvecExamples $ testCtxGrammar True lenvecGrammar describe "chainGrammar" $ for_ chainExamples $ testGrammar True chainGrammar describe "Parsector try rollback" tryRollbackTests - describe "Thompson matcher" thompsonMatcherTests - describe "Earley matcher" earleyMatcherTests describe "Kleene" kleeneProperties describe "meander" meanderProperties @@ -60,212 +58,6 @@ tryRollbackTests = do parsecStream actual `shouldBe` "" parsecResult actual `shouldBe` (Nothing :: Maybe String) -thompsonMatcherTests :: Spec -thompsonMatcherTests = do - let - sandwich = terminal "a" <> starK (terminal "b") <> terminal "a" - evenA = starK (sandwich >|< terminal "b") - nullableSeq = optK (terminal "a") <> terminal "b" - nestedStar = starK (starK (terminal "a")) - onePlusA = plusK (terminal "a") - altThenTerm = (terminal "a" >|< terminal "b") <> terminal "c" - notXY = notOneOf ("xy" :: String) :: RegEx Char - abcClass = oneOf ("abc" :: String) :: RegEx Char - checks :: [(String, RegEx Char, [(String, Bool)])] - checks = - [ ( "paper sandwich language" - , sandwich - , [ ("aa", True) - , ("aba", True) - , ("abbba", True) - , ("", False) - , ("a", False) - , ("ab", False) - , ("baa", False) - ] - ) - , ( "paper even-a language" - , evenA - , [ ("", True) - , ("b", True) - , ("bb", True) - , ("aa", True) - , ("abba", True) - , ("baab", True) - , ("abaaab", True) - , ("a", False) - , ("ababa", False) - ] - ) - , ( "nullable sequence wiring" - , nullableSeq - , [ ("b", True) - , ("ab", True) - , ("", False) - , ("a", False) - ] - ) - , ( "empty and empty-class handling" - , mempty - , [ ("", True) - , ("a", False) - ] - ) - , ( "failing token class" - , zeroK - , [ ("", False) - , ("a", False) - ] - ) - , ( "nested star (a*)*" - , nestedStar - , [ ("", True) - , ("a", True) - , ("aaaa", True) - , ("b", False) - , ("aab", False) - ] - ) - , ( "kleene plus a+" - , onePlusA - , [ ("a", True) - , ("aaa", True) - , ("", False) - , ("ab", False) - , ("b", False) - ] - ) - , ( "alternate then terminal (a|b)c" - , altThenTerm - , [ ("ac", True) - , ("bc", True) - , ("", False) - , ("c", False) - , ("abc", False) - , ("a", False) - ] - ) - , ( "notOneOf \"xy\"" - , notXY - , [ ("a", True) - , ("z", True) - , ("x", False) - , ("y", False) - , ("", False) - , ("ab", False) - ] - ) - , ( "oneOf \"abc\"" - , abcClass - , [ ("a", True) - , ("b", True) - , ("c", True) - , ("d", False) - , ("", False) - , ("ab", False) - ] - ) - ] - - for_ checks $ \(label, rex, cases) -> - for_ cases $ \(word, expected) -> - it (label <> " matches " <> show word) $ do - let thompsonMatch = word =~ rex - let earleyMatch = word =~ liftBnf0 rex - thompsonMatch `shouldBe` expected - thompsonMatch `shouldBe` earleyMatch - -earleyMatcherTests :: Spec -earleyMatcherTests = do - let - parens :: Bnf (RegEx Char) - parens = ruleRec "S" $ \s -> - mempty >|< terminal "(" <> s <> terminal ")" - -- S = A B; A = "a"; B = "b" - ab :: Bnf (RegEx Char) - ab = rule "A" (terminal "a") - <> rule "B" (terminal "b") - -- Left-recursive: A = A 'x' | 'x' - leftRec :: Bnf (RegEx Char) - leftRec = ruleRec "A" $ \a -> - a <> terminal "x" >|< terminal "x" - -- Mutual recursion with nullable bridges. - -- S = A B; A = 'a'?; B = 'b' - mutualNull :: Bnf (RegEx Char) - mutualNull = rule "A" (optK (terminal "a")) - <> rule "B" (terminal "b") - -- Ambiguous palindromes over 'a','b': - -- P = 'a' P 'a' | 'b' P 'b' | 'a' | 'b' | ε - palindrome :: Bnf (RegEx Char) - palindrome = ruleRec "P" $ \p -> - terminal "a" <> p <> terminal "a" - >|< terminal "b" <> p <> terminal "b" - >|< terminal "a" - >|< terminal "b" - >|< mempty - checks :: [(String, Bnf (RegEx Char), [(String, Bool)])] - checks = - [ ( "balanced parens S = (S) | ε" - , parens - , [ ("", True) - , ("()", True) - , ("(())", True) - , ("((()))", True) - , ("(", False) - , (")", False) - , ("(()", False) - , ("())", False) - ] - ) - , ( "two-rule concat A B" - , ab - , [ ("ab", True) - , ("", False) - , ("a", False) - , ("b", False) - , ("ba", False) - , ("abc", False) - ] - ) - , ( "left recursion A = A x | x" - , leftRec - , [ ("x", True) - , ("xx", True) - , ("xxxx", True) - , ("", False) - , ("y", False) - , ("xy", False) - ] - ) - , ( "nullable nonterminal A?B" - , mutualNull - , [ ("b", True) - , ("ab", True) - , ("", False) - , ("a", False) - , ("bb", False) - ] - ) - , ( "ambiguous palindromes" - , palindrome - , [ ("", True) - , ("a", True) - , ("b", True) - , ("aa", True) - , ("aba", True) - , ("abba", True) - , ("abaaba", True) - , ("ab", False) - , ("abab", False) - , ("abca", False) - ] - ) - ] - for_ checks $ \(label, bnf, cases) -> - for_ cases $ \(word, expected) -> - it (label <> " matches " <> show word) $ - (word =~ bnf) `shouldBe` expected - doctests :: IO () doctests = do let From 3495538b71d3ab31cb9241a8f04e2357996a7093 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Wed, 22 Apr 2026 10:16:55 -0700 Subject: [PATCH 07/31] Update Main.hs --- test/Main.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Main.hs b/test/Main.hs index 140b217a..ae792432 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -34,7 +34,7 @@ main = do describe "semverGrammar" $ for_ semverExamples $ testGrammar True semverGrammar describe "semverCtxGrammar" $ for_ semverExamples $ testCtxGrammar True semverCtxGrammar describe "arithGrammar" $ for_ arithExamples $ testGrammar True arithGrammar - describe "jsonGrammar" $ for_ jsonExamples $ testCtxGrammar False jsonGrammar + describe "jsonGrammar" $ for_ jsonExamples $ testGrammar False jsonGrammar describe "sexprGrammar" $ for_ sexprExamples $ testGrammar True sexprGrammar describe "lambdaGrammar" $ for_ lambdaExamples $ testGrammar True lambdaGrammar describe "lenvecGrammar" $ for_ lenvecExamples $ testCtxGrammar True lenvecGrammar From d97009f117097b94053fb08c84886c58259241e3 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Wed, 22 Apr 2026 16:21:32 -0700 Subject: [PATCH 08/31] matching & generation via Thompson-Earley --- distributors.cabal | 2 + package.yaml | 1 + src/Control/Lens/Grammar.hs | 6 + src/Control/Lens/Grammar/BackusNaur.hs | 244 ----------------- src/Control/Lens/Grammar/Kleene.hs | 11 + src/Control/Lens/Grammar/Matching.hs | 353 +++++++++++++++++++++++++ src/Control/Lens/Grammar/Token.hs | 11 + src/Data/Profunctor/Grammar.hs | 1 + test/Main.hs | 42 +-- 9 files changed, 412 insertions(+), 259 deletions(-) create mode 100644 src/Control/Lens/Grammar/Matching.hs diff --git a/distributors.cabal b/distributors.cabal index 546551fa..ed7a02ce 100644 --- a/distributors.cabal +++ b/distributors.cabal @@ -36,6 +36,7 @@ library Control.Lens.Grammar.Internal.NestedPrismTH Control.Lens.Grammar.Internal.Orphanage Control.Lens.Grammar.Kleene + Control.Lens.Grammar.Matching Control.Lens.Grammar.Symbol Control.Lens.Grammar.Token Control.Lens.Grate @@ -98,6 +99,7 @@ library ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints build-depends: MemoTrie >=0.6 && <1 + , QuickCheck >=2.14 && <3 , adjunctions >=4.4 && <5 , base >=4.15 && <5 , bifunctors >=5.5 && <6 diff --git a/package.yaml b/package.yaml index 9f1c9a07..f0e85510 100644 --- a/package.yaml +++ b/package.yaml @@ -27,6 +27,7 @@ dependencies: - distributive >= 0.6 && < 1 - lens >= 5.0 && < 6 - MemoTrie >= 0.6 && < 1 +- QuickCheck >= 2.14 && < 3 - mtl >= 2.2 && < 3 - profunctors >= 5.6 && < 6 - tagged >= 0.8 && < 1 diff --git a/src/Control/Lens/Grammar.hs b/src/Control/Lens/Grammar.hs index 0cf344f7..39720532 100644 --- a/src/Control/Lens/Grammar.hs +++ b/src/Control/Lens/Grammar.hs @@ -24,6 +24,7 @@ module Control.Lens.Grammar , regbnfG , regbnfGrammar , applicativeG + , languageG -- * Context-sensitive grammar , CtxGrammar , printG @@ -45,6 +46,7 @@ import Control.Lens.PartialIso import Control.Lens.Grammar.BackusNaur import Control.Lens.Grammar.Boole import Control.Lens.Grammar.Kleene +import Control.Lens.Grammar.Matching import Control.Lens.Grammar.Token import Control.Lens.Grammar.Symbol import Data.Bifunctor.Joker @@ -67,6 +69,7 @@ import Witherable import Control.Lens.Grammar.BackusNaur as X import Control.Lens.Grammar.Boole as X import Control.Lens.Grammar.Kleene as X +import Control.Lens.Grammar.Matching as X import Control.Lens.Grammar.Symbol as X import Control.Lens.Grammar.Token as X import Control.Lens.PartialIso as X @@ -789,6 +792,9 @@ It can apply to a `RegGrammar`. regbnfG :: Grammar Char a -> RegBnf regbnfG bnf = runGrammor bnf +languageG :: (Applicative f, TokenAlgebra token (f token)) => Grammar token a -> f [[token]] +languageG bnf = languageGen (runGrammor bnf) + {- | `printG` generates a printer from a `CtxGrammar`. Since both `RegGrammar`s and context-free `Grammar`s are `CtxGrammar`s, the type system will allow `printG` to be applied to them. diff --git a/src/Control/Lens/Grammar/BackusNaur.hs b/src/Control/Lens/Grammar/BackusNaur.hs index c5868f31..ebfe058a 100644 --- a/src/Control/Lens/Grammar/BackusNaur.hs +++ b/src/Control/Lens/Grammar/BackusNaur.hs @@ -19,13 +19,10 @@ module Control.Lens.Grammar.BackusNaur , liftBnf0 , liftBnf1 , liftBnf2 - -- * Matching - , Matching (..) , diffB ) where import Control.Lens -import Control.Lens.Extras import Control.Lens.Grammar.Kleene import Control.Lens.Grammar.Token import Control.Lens.Grammar.Symbol @@ -33,12 +30,6 @@ import Data.Bifunctor.Joker import Data.Coerce import Data.Foldable import Data.Function -import qualified Data.IntMap.Strict as IntMap -import Data.IntMap.Strict (IntMap) -import qualified Data.IntSet as IntSet -import Data.IntSet (IntSet) -import qualified Data.Map.Strict as Map -import Data.Map.Strict (Map) import Data.MemoTrie import qualified Data.Set as Set import Data.Set (Set) @@ -113,11 +104,6 @@ liftBnf2 liftBnf2 f (Bnf start0 rules0) (Bnf start1 rules1) = Bnf (f start0 start1) (Set.map coerce rules0 <> Set.map coerce rules1) --- | Does a word match a pattern? -class Matching word pattern | pattern -> word where - (=~) :: word -> pattern -> Bool - infix 2 =~ - {- | The [Brzozowski derivative] (https://dl.acm.org/doi/pdf/10.1145/321239.321249) of a @@ -177,228 +163,6 @@ rulesNamed nameX = foldl' (flip inserter) Set.empty where inserter (nameY,y) = if nameX == nameY then Set.insert y else id --- | A state in the Earley-extended Thompson transducer for a `Bnf`. --- @EarleyTerminal cls ds@ matches on a token class and transitions to @ds@. --- @EarleyNonTerminal name ds@ is a call point for rule @name@; after @name@ --- completes, control flows to @ds@. @EarleyEmit name@ is the final state --- for rule @name@ and triggers completion during Earley closure. -data EarleyState token - = EarleyTerminal (TokenClass token) IntSet - | EarleyNonTerminal String IntSet - | EarleyEmit String - -data EarleyTransducer token = EarleyTransducer - { earleyStates :: IntMap (EarleyState token) - , earleyRules :: Map String (IntSet, Bool) - , earleyAcceptId :: Int - , earleyStartStates :: IntSet - } - -compileEarley :: Bnf (RegEx token) -> EarleyTransducer token -compileEarley (Bnf start rules) = EarleyTransducer - { earleyStates = IntMap.fromList allStates - , earleyRules = Map.fromList - [ ( n - , ( Map.findWithDefault IntSet.empty n firstsMap - , Map.findWithDefault False n nullMap - ) - ) - | n <- Map.keys ruleMap - ] - , earleyAcceptId = earleyAcceptId0 - , earleyStartStates = startStates - } - - where - - ruleMap = foldr - (\(n, r) -> Map.insertWith (++) n [r]) Map.empty (toList rules) - - rexNullable nm = \case - SeqEmpty -> True - NonTerminal n -> Map.findWithDefault False n nm - Sequence x y -> rexNullable nm x && rexNullable nm y - KleeneStar _ -> True - KleeneOpt _ -> True - KleenePlus x -> rexNullable nm x - RegExam (Alternate x y) -> rexNullable nm x || rexNullable nm y - RegExam (OneOf _) -> False - RegExam (NotOneOf _ _) -> False - - iterNull nm = - let nm' = Map.mapWithKey - (\n _ -> any (rexNullable nm) (Map.findWithDefault [] n ruleMap)) nm - in if nm == nm' then nm else iterNull nm' - - nullMap = iterNull (Map.map (const False) ruleMap) - - ruleNames = Map.keys ruleMap - - earleyAcceptId0 = 0 - - (finalMap, nextIdAfterFinals) = - foldl' alloc (Map.empty, earleyAcceptId0 + 1) ruleNames - where alloc (m, i) n = (Map.insert n i m, i + 1) - - finalStatesList = [(finalMap Map.! n, EarleyEmit n) | n <- ruleNames] - - (rulesStatesList, firstsMap, nextIdAfterRules) = - foldl' compileRule ([], Map.empty, nextIdAfterFinals) (Map.toList ruleMap) - where - compileRule (sts, fm, nid) (name, prods) = - let finalId = finalMap Map.! name - (newSts, newFirsts, nid') = - foldl' compileProd ([], IntSet.empty, nid) prods - compileProd (s, fs, i) prod = - let (f, st, i', _) = - goEarley prod i (IntSet.singleton finalId) - in (s <> st, fs <> f, i') - in (sts <> newSts, Map.insert name newFirsts fm, nid') - - (startFirsts, startStatesRaw, _, startBypass) = - goEarley start nextIdAfterRules (IntSet.singleton earleyAcceptId0) - - startStates = - startFirsts <> bypassStates startBypass (IntSet.singleton earleyAcceptId0) - - allStates = finalStatesList <> rulesStatesList <> startStatesRaw - - bypassStates True = id - bypassStates False = const IntSet.empty - - goEarley rex nextId dests = case rex of - SeqEmpty -> (IntSet.empty, [], nextId, True) - NonTerminal name -> - ( IntSet.singleton nextId - , [(nextId, EarleyNonTerminal name dests)] - , nextId + 1 - , Map.findWithDefault False name nullMap - ) - Sequence rex0 rex1 -> - let - (firsts1, states1, nextId1, bypass1) = goEarley rex1 nextId dests - (firsts0, states0, nextId0, bypass0) = - goEarley rex0 nextId1 (firsts1 <> bypassStates bypass1 dests) - in - ( firsts0 <> bypassStates bypass0 firsts1 - , states0 <> states1 - , nextId0 - , bypass0 && bypass1 - ) - KleeneStar rex0 -> - let - (firsts, states, nextId', _) = goEarley rex0 nextId (firsts <> dests) - in - (firsts, states, nextId', True) - KleeneOpt rex0 -> - let - (firsts, states, nextId', _) = goEarley rex0 nextId dests - in - (firsts, states, nextId', True) - KleenePlus rex0 -> - let - (firsts, states, nextId', bypass) = goEarley rex0 nextId (firsts <> dests) - in - (firsts, states, nextId', bypass) - RegExam (OneOf chars) - | Set.null chars -> (IntSet.empty, [], nextId, False) - | otherwise -> - ( IntSet.singleton nextId - , [(nextId, EarleyTerminal (TokenClass (OneOf chars)) dests)] - , nextId + 1 - , False - ) - RegExam (NotOneOf chars catTest) -> - ( IntSet.singleton nextId - , [(nextId, EarleyTerminal (TokenClass (NotOneOf chars catTest)) dests)] - , nextId + 1 - , False - ) - RegExam (Alternate rex0 rex1) -> - let - (firsts1, states1, nextId1, bypass1) = goEarley rex1 nextId dests - (firsts0, states0, nextId0, bypass0) = goEarley rex0 nextId1 dests - in - ( firsts0 <> firsts1 - , states0 <> states1 - , nextId0 - , bypass0 || bypass1 - ) - -matchEarley :: Categorized token => [token] -> EarleyTransducer token -> Bool -matchEarley word et = IntSet.member 0 acceptOrigins - where - - initialE0 = IntMap.fromList - [ (s, IntSet.singleton 0) | s <- IntSet.toList (earleyStartStates et) ] - - sets0 = IntMap.singleton 0 initialE0 - - sets0closed = closureAt 0 sets0 - - (finalSets, n) = runInput 0 sets0closed word - - runInput j ss [] = (ss, j) - runInput j ss (x : xs) = - let scanned = scanFrom j x ss - ss' = IntMap.insert (j + 1) scanned ss - closed = closureAt (j + 1) ss' - in runInput (j + 1) closed xs - - en = IntMap.findWithDefault IntMap.empty n finalSets - - acceptOrigins = IntMap.findWithDefault IntSet.empty (earleyAcceptId et) en - - scanFrom j input ss = IntMap.foldrWithKey advance IntMap.empty e_j - where - e_j = IntMap.findWithDefault IntMap.empty j ss - advance s origs acc = case IntMap.lookup s (earleyStates et) of - Just (EarleyTerminal cls ds) | tokenClass cls input -> - IntSet.foldr - (\d -> IntMap.insertWith IntSet.union d origs) acc ds - _ -> acc - - closureAt j initialSets = loop initialWork initialSets - where - initialE = IntMap.findWithDefault IntMap.empty j initialSets - initialWork = - [ (s, i) | (s, os) <- IntMap.toList initialE, i <- IntSet.toList os ] - loop [] ss = ss - loop ((s, i) : rest) ss = case IntMap.lookup s (earleyStates et) of - Just (EarleyNonTerminal name ds) -> - let (firsts, isNull) = Map.findWithDefault - (IntSet.empty, False) name (earleyRules et) - predItems = [(f, j) | f <- IntSet.toList firsts] - nullItems = - if isNull then [(d, i) | d <- IntSet.toList ds] else [] - (ss', new) = addEarleyItems j (predItems <> nullItems) ss - in loop (new <> rest) ss' - Just (EarleyEmit name) -> - let e_i = IntMap.findWithDefault IntMap.empty i ss - completions = - [ (d, i') - | (t, os) <- IntMap.toList e_i - , Just (EarleyNonTerminal n' ds) <- [IntMap.lookup t (earleyStates et)] - , n' == name - , i' <- IntSet.toList os - , d <- IntSet.toList ds - ] - (ss', new) = addEarleyItems j completions ss - in loop (new <> rest) ss' - _ -> loop rest ss - - addEarleyItems j items ss = foldl' ins (ss, []) items - where - ins (acc, new) (state, origin) = - let e_j = IntMap.findWithDefault IntMap.empty j acc - os = IntMap.findWithDefault IntSet.empty state e_j - in if IntSet.member origin os - then (acc, new) - else - let e_j' = IntMap.insert state (IntSet.insert origin os) e_j - acc' = IntMap.insert j e_j' acc - in (acc', (state, origin) : new) - -- instances instance (Ord rule, NonTerminalSymbol rule) => BackusNaurForm (Bnf rule) where @@ -438,11 +202,3 @@ instance (Ord rule, Monoid rule) => Monoid (Bnf rule) where mempty = liftBnf0 mempty instance (Ord rule, Semigroup rule) => Semigroup (Bnf rule) where (<>) = liftBnf2 (<>) -instance Categorized token - => Matching [token] (Bnf (RegEx token)) where - word =~ bnf = matchEarley word (compileEarley bnf) -instance Categorized token - => Matching [token] (RegEx token) where - word =~ pattern = word =~ liftBnf0 pattern -instance Matching s (APrism s t a b) where - word =~ pattern = is pattern word diff --git a/src/Control/Lens/Grammar/Kleene.hs b/src/Control/Lens/Grammar/Kleene.hs index 52d74473..5ffce0b1 100644 --- a/src/Control/Lens/Grammar/Kleene.hs +++ b/src/Control/Lens/Grammar/Kleene.hs @@ -38,6 +38,9 @@ import Data.Profunctor.Distributor import Data.Set (Set) import qualified Data.Set as Set import GHC.Generics +import Test.QuickCheck.Arbitrary +import Test.QuickCheck.Gen (Gen) +import qualified Test.QuickCheck.Gen as Gen import Text.ParserCombinators.ReadP (ReadP) import qualified Text.ParserCombinators.ReadP as ReadP @@ -225,6 +228,14 @@ instance TokenAlgebra token (f token) tokenClass = Joker . tokenClass instance TokenAlgebra Char (ReadP Char) where tokenClass = ReadP.satisfy . tokenClass +instance (Categorized token, Arbitrary token) => TokenAlgebra token (Gen token) where + tokenClass (TokenClass exam) = case exam of + OneOf xs -> Gen.elements (toList xs) + NotOneOf xs (AndAsIn cat) -> arbitrary `Gen.suchThat` + (\x -> x `notElem` xs && categorize x == cat) + NotOneOf xs (AndNotAsIn cats) -> arbitrary `Gen.suchThat` + (\x -> x `notElem` xs && categorize x `notElem` cats) + Alternate cls1 cls2 -> Gen.oneof [tokenClass cls1, tokenClass cls2] instance Categorized token => Monoid (RegEx token) where mempty = SeqEmpty instance Categorized token => Semigroup (RegEx token) where diff --git a/src/Control/Lens/Grammar/Matching.hs b/src/Control/Lens/Grammar/Matching.hs new file mode 100644 index 00000000..0bb6de50 --- /dev/null +++ b/src/Control/Lens/Grammar/Matching.hs @@ -0,0 +1,353 @@ +{- | +Module : Control.Lens.Grammar.Matching +Description : pattern matching & language generation +Copyright : (C) 2026 - Eitan Chatav +License : BSD-style (see the file LICENSE) +Maintainer : Eitan Chatav +Stability : provisional +Portability : non-portable + +https://www.cs.dartmouth.edu/~doug/nfa.pdf +http://trevorjim.com/papers/ldta-2009.pdf +-} + +module Control.Lens.Grammar.Matching + ( Matching (..) + , languageGen + ) where + +import Control.Lens +import Control.Lens.Extras +import Control.Lens.Grammar.BackusNaur +import Control.Lens.Grammar.Kleene +import Control.Lens.Grammar.Token +import Data.Foldable +import qualified Data.IntMap.Strict as IntMap +import Data.IntMap.Strict (IntMap) +import qualified Data.IntSet as IntSet +import Data.IntSet (IntSet) +import qualified Data.Map.Strict as Map +import Data.Map.Strict (Map) +import qualified Data.Set as Set + +-- | Does a word match a pattern? +class Matching word pattern | pattern -> word where + (=~) :: word -> pattern -> Bool + infix 2 =~ + +-- | A state in the Earley-extended Thompson transducer for a `Bnf`. +-- @EarleyTerminal cls ds@ matches on a token class and transitions to @ds@. +-- @EarleyNonTerminal name ds@ is a call point for rule @name@; after @name@ +-- completes, control flows to @ds@. @EarleyEmit name@ is the final state +-- for rule @name@ and triggers completion during Earley closure. +data EarleyState token + = EarleyTerminal (TokenClass token) IntSet + | EarleyNonTerminal String IntSet + | EarleyEmit String + +data EarleyTransducer token = EarleyTransducer + { earleyStates :: IntMap (EarleyState token) + , earleyRules :: Map String (IntSet, Bool) + , earleyAcceptId :: Int + , earleyStartStates :: IntSet + } + +compileEarley :: Bnf (RegEx token) -> EarleyTransducer token +compileEarley (Bnf start rules) = EarleyTransducer + { earleyStates = IntMap.fromList allStates + , earleyRules = Map.fromList + [ ( n + , ( Map.findWithDefault IntSet.empty n firstsMap + , Map.findWithDefault False n nullMap + ) + ) + | n <- Map.keys ruleMap + ] + , earleyAcceptId = earleyAcceptId0 + , earleyStartStates = startStates + } + + where + + ruleMap = foldr + (\(n, r) -> Map.insertWith (++) n [r]) Map.empty (toList rules) + + rexNullable nm = \case + SeqEmpty -> True + NonTerminal n -> Map.findWithDefault False n nm + Sequence x y -> rexNullable nm x && rexNullable nm y + KleeneStar _ -> True + KleeneOpt _ -> True + KleenePlus x -> rexNullable nm x + RegExam (Alternate x y) -> rexNullable nm x || rexNullable nm y + RegExam (OneOf _) -> False + RegExam (NotOneOf _ _) -> False + + iterNull nm = + let nm' = Map.mapWithKey + (\n _ -> any (rexNullable nm) (Map.findWithDefault [] n ruleMap)) nm + in if nm == nm' then nm else iterNull nm' + + nullMap = iterNull (Map.map (const False) ruleMap) + + ruleNames = Map.keys ruleMap + + earleyAcceptId0 = 0 + + (finalMap, nextIdAfterFinals) = + foldl' alloc (Map.empty, earleyAcceptId0 + 1) ruleNames + where alloc (m, i) n = (Map.insert n i m, i + 1) + + finalStatesList = [(finalMap Map.! n, EarleyEmit n) | n <- ruleNames] + + (rulesStatesList, firstsMap, nextIdAfterRules) = + foldl' compileRule ([], Map.empty, nextIdAfterFinals) (Map.toList ruleMap) + where + compileRule (sts, fm, nid) (name, prods) = + let finalId = finalMap Map.! name + (newSts, newFirsts, nid') = + foldl' compileProd ([], IntSet.empty, nid) prods + compileProd (s, fs, i) prod = + let (f, st, i', _) = + goEarley prod i (IntSet.singleton finalId) + in (s <> st, fs <> f, i') + in (sts <> newSts, Map.insert name newFirsts fm, nid') + + (startFirsts, startStatesRaw, _, startBypass) = + goEarley start nextIdAfterRules (IntSet.singleton earleyAcceptId0) + + startStates = + startFirsts <> bypassStates startBypass (IntSet.singleton earleyAcceptId0) + + allStates = finalStatesList <> rulesStatesList <> startStatesRaw + + bypassStates True = id + bypassStates False = const IntSet.empty + + goEarley rex nextId dests = case rex of + SeqEmpty -> (IntSet.empty, [], nextId, True) + NonTerminal name -> + ( IntSet.singleton nextId + , [(nextId, EarleyNonTerminal name dests)] + , nextId + 1 + , Map.findWithDefault False name nullMap + ) + Sequence rex0 rex1 -> + let + (firsts1, states1, nextId1, bypass1) = goEarley rex1 nextId dests + (firsts0, states0, nextId0, bypass0) = + goEarley rex0 nextId1 (firsts1 <> bypassStates bypass1 dests) + in + ( firsts0 <> bypassStates bypass0 firsts1 + , states0 <> states1 + , nextId0 + , bypass0 && bypass1 + ) + KleeneStar rex0 -> + let + (firsts, states, nextId', _) = goEarley rex0 nextId (firsts <> dests) + in + (firsts, states, nextId', True) + KleeneOpt rex0 -> + let + (firsts, states, nextId', _) = goEarley rex0 nextId dests + in + (firsts, states, nextId', True) + KleenePlus rex0 -> + let + (firsts, states, nextId', bypass) = goEarley rex0 nextId (firsts <> dests) + in + (firsts, states, nextId', bypass) + RegExam (OneOf chars) + | Set.null chars -> (IntSet.empty, [], nextId, False) + | otherwise -> + ( IntSet.singleton nextId + , [(nextId, EarleyTerminal (TokenClass (OneOf chars)) dests)] + , nextId + 1 + , False + ) + RegExam (NotOneOf chars catTest) -> + ( IntSet.singleton nextId + , [(nextId, EarleyTerminal (TokenClass (NotOneOf chars catTest)) dests)] + , nextId + 1 + , False + ) + RegExam (Alternate rex0 rex1) -> + let + (firsts1, states1, nextId1, bypass1) = goEarley rex1 nextId dests + (firsts0, states0, nextId0, bypass0) = goEarley rex0 nextId1 dests + in + ( firsts0 <> firsts1 + , states0 <> states1 + , nextId0 + , bypass0 || bypass1 + ) + +matchEarley :: Categorized token => [token] -> EarleyTransducer token -> Bool +matchEarley word et = acceptsChart n finalSets et + where + + initialE0 = IntMap.fromList + [ (s, IntSet.singleton 0) | s <- IntSet.toList (earleyStartStates et) ] + + sets0 = closeChartAt 0 (IntMap.singleton 0 initialE0) et + + (finalSets, n) = runInput 0 sets0 word + + runInput j ss [] = (ss, j) + runInput j ss (x : xs) = + let scanned = scanFrom j x ss + closed = closeChartAt (j + 1) (IntMap.insert (j + 1) scanned ss) et + in runInput (j + 1) closed xs + + scanFrom j input ss = IntMap.foldrWithKey advance IntMap.empty e_j + where + e_j = IntMap.findWithDefault IntMap.empty j ss + advance s origs acc = case IntMap.lookup s (earleyStates et) of + Just (EarleyTerminal cls ds) | tokenClass cls input -> + IntSet.foldr + (\d -> IntMap.insertWith IntSet.union d origs) acc ds + _ -> acc +-- instances +instance Categorized token + => Matching [token] (Bnf (RegEx token)) where + word =~ bnf = matchEarley word (compileEarley bnf) +instance Categorized token + => Matching [token] (RegEx token) where + word =~ pattern = word =~ liftBnf0 pattern +instance Matching s (APrism s t a b) where + word =~ pattern = is pattern word + +{- | +Generate words recognized by a `Bnf` using Earley chart progression. + +Chart/state progression is deterministic (state id order). Token realization is +random but always valid for the selected terminal class. +-} +languageGen + :: (Applicative f, TokenAlgebra token (f token)) + => Bnf (RegEx token) + -> f [[token]] +languageGen bnf = sequenceA (fmap sampleWord classWords) + where + et = compileEarley bnf + + classWords = enumerateByLength [(0, [], initialChart et)] Set.empty + + sampleWord = traverse tokenClass . reverse + + enumerateByLength [] _ = [] + enumerateByLength frontier seen = + let + (accepted, seen') = acceptedAtFrontier frontier seen + next = concatMap expand frontier + in accepted <> enumerateByLength next seen' + + acceptedAtFrontier frontier seen0 = + let (acceptedRev, seen') = foldl' step ([], seen0) frontier + in (reverse acceptedRev, seen') + where + step (acc, seen) (j, revWord, chart) + | acceptsChart j chart et = + if Set.member revWord seen + then (acc, seen) + else (revWord : acc, Set.insert revWord seen) + | otherwise = (acc, seen) + + expand (j, revWord, chart) = + [ (j + 1, cls : revWord, nextChart) + | (cls, nextChart) <- scanClassOptions j chart et + ] + +initialChart + :: EarleyTransducer token + -> IntMap (IntMap IntSet) +initialChart et = closeChartAt 0 (IntMap.singleton 0 initialE0) et + where + initialE0 = IntMap.fromList + [ (s, IntSet.singleton 0) | s <- IntSet.toList (earleyStartStates et) ] + +acceptsChart + :: Int + -> IntMap (IntMap IntSet) + -> EarleyTransducer token + -> Bool +acceptsChart j chart et = IntSet.member 0 acceptOrigins + where + e_j = IntMap.findWithDefault IntMap.empty j chart + acceptOrigins = IntMap.findWithDefault IntSet.empty (earleyAcceptId et) e_j + +scanClassOptions + :: Categorized token + => Int + -> IntMap (IntMap IntSet) + -> EarleyTransducer token + -> [(TokenClass token, IntMap (IntMap IntSet))] +scanClassOptions j chart et = + [ (cls, closeChartAt (j + 1) (IntMap.insert (j + 1) scanned chart) et) + | (cls, scanned) <- Map.toAscList grouped + ] + where + grouped = IntMap.foldrWithKey advance Map.empty e_j + e_j = IntMap.findWithDefault IntMap.empty j chart + + advance s origs acc = case IntMap.lookup s (earleyStates et) of + Just (EarleyTerminal cls ds) -> + Map.insertWith mergeEarleySet cls scanned acc + where + scanned = IntSet.foldr + (\d -> IntMap.insertWith IntSet.union d origs) IntMap.empty ds + _ -> acc + + mergeEarleySet = IntMap.unionWith IntSet.union + +closeChartAt + :: Int + -> IntMap (IntMap IntSet) + -> EarleyTransducer token + -> IntMap (IntMap IntSet) +closeChartAt j initialChart0 et = loop initialWork initialChart0 + where + initialE = IntMap.findWithDefault IntMap.empty j initialChart0 + initialWork = + [ (s, i) | (s, os) <- IntMap.toList initialE, i <- IntSet.toList os ] + + loop [] chart = chart + loop ((s, i) : rest) chart = case IntMap.lookup s (earleyStates et) of + Just (EarleyNonTerminal name ds) -> + let + (firsts, isNull) = Map.findWithDefault + (IntSet.empty, False) name (earleyRules et) + predItems = [(f, j) | f <- IntSet.toList firsts] + nullItems = + if isNull then [(d, i) | d <- IntSet.toList ds] else [] + (chart', new) = addEarleyItems (predItems <> nullItems) chart + in loop (new <> rest) chart' + Just (EarleyEmit name) -> + let + e_i = IntMap.findWithDefault IntMap.empty i chart + completions = + [ (d, i') + | (t, os) <- IntMap.toList e_i + , Just (EarleyNonTerminal n' ds) <- [IntMap.lookup t (earleyStates et)] + , n' == name + , i' <- IntSet.toList os + , d <- IntSet.toList ds + ] + (chart', new) = addEarleyItems completions chart + in loop (new <> rest) chart' + _ -> loop rest chart + + addEarleyItems items chart = foldl' ins (chart, []) items + where + ins (acc, new) (state, origin) = + let + e_j = IntMap.findWithDefault IntMap.empty j acc + os = IntMap.findWithDefault IntSet.empty state e_j + in if IntSet.member origin os + then (acc, new) + else + let + e_j' = IntMap.insert state (IntSet.insert origin os) e_j + acc' = IntMap.insert j e_j' acc + in (acc', (state, origin) : new) diff --git a/src/Control/Lens/Grammar/Token.hs b/src/Control/Lens/Grammar/Token.hs index 884d5963..449bfb76 100644 --- a/src/Control/Lens/Grammar/Token.hs +++ b/src/Control/Lens/Grammar/Token.hs @@ -22,9 +22,13 @@ import Control.Lens import Control.Lens.PartialIso import Data.Bifunctor.Joker import Data.Char +import Data.Foldable import Data.Profunctor import Data.Profunctor.Monoidal import Data.Word +import Test.QuickCheck.Arbitrary +import Test.QuickCheck.Gen (Gen) +import qualified Test.QuickCheck.Gen as Gen import Text.ParserCombinators.ReadP (ReadP) import qualified Text.ParserCombinators.ReadP as ReadP @@ -137,3 +141,10 @@ instance Tokenized Char (ReadP Char) where notOneOf = ReadP.satisfy . notOneOf asIn = ReadP.satisfy . asIn notAsIn = ReadP.satisfy . notAsIn +instance (Categorized token, Arbitrary token) => Tokenized token (Gen token) where + anyToken = arbitrary @token + token = pure + oneOf = Gen.elements . toList + notOneOf xs = arbitrary `Gen.suchThat` (`notElem` xs) + asIn cat = arbitrary `Gen.suchThat` ((==) cat . categorize) + notAsIn cat = arbitrary `Gen.suchThat` ((/=) cat . categorize) diff --git a/src/Data/Profunctor/Grammar.hs b/src/Data/Profunctor/Grammar.hs index b721355d..1eb2f03c 100644 --- a/src/Data/Profunctor/Grammar.hs +++ b/src/Data/Profunctor/Grammar.hs @@ -27,6 +27,7 @@ import Control.Lens import Control.Lens.Extras import Control.Lens.Grammar.BackusNaur import Control.Lens.Grammar.Kleene +import Control.Lens.Grammar.Matching import Control.Lens.Grammar.Symbol import Control.Lens.Grammar.Token import Control.Monad diff --git a/test/Main.hs b/test/Main.hs index ae792432..610e5363 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -11,6 +11,7 @@ import Data.Profunctor.Types (Star (..)) import System.Environment (lookupEnv) import Test.DocTest import Test.Hspec +import Test.QuickCheck (generate) import qualified Text.Megaparsec as M import Examples.Arithmetic @@ -30,15 +31,15 @@ main = do when shouldRunDoctests $ describe "doctest" $ it "should run haddock examples" doctests - describe "regexGrammar" $ for_ regexExamples $ testGrammar False regexGrammar - describe "semverGrammar" $ for_ semverExamples $ testGrammar True semverGrammar - describe "semverCtxGrammar" $ for_ semverExamples $ testCtxGrammar True semverCtxGrammar - describe "arithGrammar" $ for_ arithExamples $ testGrammar True arithGrammar - describe "jsonGrammar" $ for_ jsonExamples $ testGrammar False jsonGrammar - describe "sexprGrammar" $ for_ sexprExamples $ testGrammar True sexprGrammar - describe "lambdaGrammar" $ for_ lambdaExamples $ testGrammar True lambdaGrammar - describe "lenvecGrammar" $ for_ lenvecExamples $ testCtxGrammar True lenvecGrammar - describe "chainGrammar" $ for_ chainExamples $ testGrammar True chainGrammar + describe "regexGrammar" $ testCfg False regexExamples regexGrammar + describe "semverGrammar" $ testCfg True semverExamples semverGrammar + describe "semverCtxGrammar" $ testCsg True semverExamples semverCtxGrammar + describe "arithGrammar" $ testCfg True arithExamples arithGrammar + describe "jsonGrammar" $ testCfg False jsonExamples jsonGrammar + describe "sexprGrammar" $ testCfg True sexprExamples sexprGrammar + describe "lambdaGrammar" $ testCfg True lambdaExamples lambdaGrammar + describe "lenvecGrammar" $ testCsg True lenvecExamples lenvecGrammar + describe "chainGrammar" $ testCfg True chainExamples chainGrammar describe "Parsector try rollback" tryRollbackTests describe "Kleene" kleeneProperties describe "meander" meanderProperties @@ -117,12 +118,23 @@ meanderProperties = seen `shouldBe` input units `shouldBe` replicate (length input) () -testGrammar :: (Show a, Eq a) => Bool -> Grammar Char a -> (a, String) -> Spec -testGrammar isLL1 grammar (expectedSyntax, expectedString) = do - testCtxGrammar isLL1 grammar (expectedSyntax, expectedString) - it ("should match " <> expectedString <> " correctly") $ do - let actualMatch = expectedString =~ regbnfG grammar - actualMatch `shouldBe` True +testCfg :: (Show a, Eq a) => Bool -> [(a, String)] -> Grammar Char a -> Spec +testCfg isLL1 examples grammar = do + describe "examples" $ for_ examples $ \(expectedSyntax, expectedString) -> do + testCtxGrammar isLL1 grammar (expectedSyntax, expectedString) + it ("should match " <> expectedString <> " correctly") $ do + let actualMatch = expectedString =~ regbnfG grammar + actualMatch `shouldBe` True + describe "generated languageG" $ do + it "should parses with exactly one full parse" $ do + generated <- generate (take 100 <$> languageG grammar) + for_ generated $ \word -> do + let fullParses = [() | (_, "") <- parseG grammar word] + fullParses `shouldBe` [()] + +testCsg :: (Show a, Eq a) => Bool -> [(a, String)] -> CtxGrammar Char a -> Spec +testCsg isLL1 examples grammar = + describe "examples" $ for_ examples $ testCtxGrammar isLL1 grammar testCtxGrammar :: (Show a, Eq a) => Bool -> CtxGrammar Char a -> (a, String) -> Spec testCtxGrammar isLL1 grammar (expectedSyntax, expectedString) = do From 135aec1989b99ef19b9d937b3eeb4764a3fc6384 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Wed, 22 Apr 2026 16:36:34 -0700 Subject: [PATCH 09/31] random sample token classes --- distributors.cabal | 6 ++++++ package.yaml | 3 +++ src/Control/Lens/Grammar/Kleene.hs | 14 ++++++++++++++ src/Control/Lens/Grammar/Token.hs | 14 ++++++++++++++ 4 files changed, 37 insertions(+) diff --git a/distributors.cabal b/distributors.cabal index ed7a02ce..45b73833 100644 --- a/distributors.cabal +++ b/distributors.cabal @@ -108,12 +108,15 @@ library , contravariant >=1.5 && <2 , distributive >=0.6 && <1 , lens >=5.0 && <6 + , monad-loops >=0.4.3 && <1 , mtl >=2.2 && <3 , profunctors >=5.6 && <6 + , random >=1.2 && <2 , tagged >=0.8 && <1 , template-haskell >=2.17 && <3 , text ==2.* , th-abstraction >=0.4 && <1 + , transformers >=0.5 && <1 , vector >=0.12 && <1 , witherable >=0.4 && <1 default-language: Haskell2010 @@ -190,12 +193,15 @@ test-suite test , hspec >=2.7 && <3 , lens >=5.0 && <6 , megaparsec >=9.0 && <10 + , monad-loops >=0.4.3 && <1 , mtl >=2.2 && <3 , profunctors >=5.6 && <6 + , random >=1.2 && <2 , tagged >=0.8 && <1 , template-haskell >=2.17 && <3 , text ==2.* , th-abstraction >=0.4 && <1 + , transformers >=0.5 && <1 , vector >=0.12 && <1 , witherable >=0.4 && <1 default-language: Haskell2010 diff --git a/package.yaml b/package.yaml index f0e85510..1e652244 100644 --- a/package.yaml +++ b/package.yaml @@ -28,7 +28,10 @@ dependencies: - lens >= 5.0 && < 6 - MemoTrie >= 0.6 && < 1 - QuickCheck >= 2.14 && < 3 +- monad-loops >=0.4.3 && < 1 - mtl >= 2.2 && < 3 +- random >= 1.2 && < 2 +- transformers >= 0.5 && < 1 - profunctors >= 5.6 && < 6 - tagged >= 0.8 && < 1 - template-haskell >= 2.17 && < 3 diff --git a/src/Control/Lens/Grammar/Kleene.hs b/src/Control/Lens/Grammar/Kleene.hs index 5ffce0b1..03e8d739 100644 --- a/src/Control/Lens/Grammar/Kleene.hs +++ b/src/Control/Lens/Grammar/Kleene.hs @@ -29,6 +29,8 @@ import Control.Applicative import Control.Lens.Grammar.Boole import Control.Lens.Grammar.Symbol import Control.Lens.Grammar.Token +import Control.Monad.Loops +import Control.Monad.Trans.State.Strict (StateT, state) import Data.Bifunctor.Joker import Data.Foldable import Data.MemoTrie @@ -38,6 +40,7 @@ import Data.Profunctor.Distributor import Data.Set (Set) import qualified Data.Set as Set import GHC.Generics +import System.Random (RandomGen, Random, random) import Test.QuickCheck.Arbitrary import Test.QuickCheck.Gen (Gen) import qualified Test.QuickCheck.Gen as Gen @@ -236,6 +239,17 @@ instance (Categorized token, Arbitrary token) => TokenAlgebra token (Gen token) NotOneOf xs (AndNotAsIn cats) -> arbitrary `Gen.suchThat` (\x -> x `notElem` xs && categorize x `notElem` cats) Alternate cls1 cls2 -> Gen.oneof [tokenClass cls1, tokenClass cls2] +instance (RandomGen g, Monad m, Categorized token, Random token) + => TokenAlgebra token (StateT g m token) where + tokenClass (TokenClass exam) = case exam of + OneOf xs -> oneOf xs + NotOneOf xs (AndAsIn cat) -> + iterateUntil (\x -> x `notElem` xs && categorize x == cat) anyToken + NotOneOf xs (AndNotAsIn cats) -> + iterateUntil (\x -> x `notElem` xs && categorize x `notElem` cats) anyToken + Alternate cls1 cls2 -> do + b <- state random + if (b :: Bool) then tokenClass cls1 else tokenClass cls2 instance Categorized token => Monoid (RegEx token) where mempty = SeqEmpty instance Categorized token => Semigroup (RegEx token) where diff --git a/src/Control/Lens/Grammar/Token.hs b/src/Control/Lens/Grammar/Token.hs index 449bfb76..1bede35c 100644 --- a/src/Control/Lens/Grammar/Token.hs +++ b/src/Control/Lens/Grammar/Token.hs @@ -20,12 +20,15 @@ module Control.Lens.Grammar.Token import Control.Lens import Control.Lens.PartialIso +import Control.Monad.Loops (iterateUntil) import Data.Bifunctor.Joker import Data.Char import Data.Foldable import Data.Profunctor import Data.Profunctor.Monoidal import Data.Word +import Control.Monad.Trans.State.Strict (StateT, state) +import System.Random (RandomGen, Random, random, randomR) import Test.QuickCheck.Arbitrary import Test.QuickCheck.Gen (Gen) import qualified Test.QuickCheck.Gen as Gen @@ -148,3 +151,14 @@ instance (Categorized token, Arbitrary token) => Tokenized token (Gen token) whe notOneOf xs = arbitrary `Gen.suchThat` (`notElem` xs) asIn cat = arbitrary `Gen.suchThat` ((==) cat . categorize) notAsIn cat = arbitrary `Gen.suchThat` ((/=) cat . categorize) +instance (RandomGen g, Monad m, Categorized token, Random token) + => Tokenized token (StateT g m token) where + anyToken = state random + token = pure + oneOf xs = do + let ys = toList xs + i <- state (randomR (0, length ys - 1)) + pure (ys !! i) + notOneOf xs = iterateUntil (`notElem` xs) anyToken + asIn cat = iterateUntil ((== cat) . categorize) anyToken + notAsIn cat = iterateUntil ((/= cat) . categorize) anyToken From 6b91bcb4b37b14275c0789f71f145db4ce9ef8d2 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Wed, 22 Apr 2026 16:56:42 -0700 Subject: [PATCH 10/31] Update Matching.hs --- src/Control/Lens/Grammar/Matching.hs | 68 +++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 12 deletions(-) diff --git a/src/Control/Lens/Grammar/Matching.hs b/src/Control/Lens/Grammar/Matching.hs index 0bb6de50..056fe689 100644 --- a/src/Control/Lens/Grammar/Matching.hs +++ b/src/Control/Lens/Grammar/Matching.hs @@ -14,6 +14,8 @@ http://trevorjim.com/papers/ldta-2009.pdf module Control.Lens.Grammar.Matching ( Matching (..) , languageGen + , expected + , unreachableRules ) where import Control.Lens @@ -29,6 +31,7 @@ import Data.IntSet (IntSet) import qualified Data.Map.Strict as Map import Data.Map.Strict (Map) import qualified Data.Set as Set +import Data.Set (Set) -- | Does a word match a pattern? class Matching word pattern | pattern -> word where @@ -184,21 +187,21 @@ compileEarley (Bnf start rules) = EarleyTransducer ) matchEarley :: Categorized token => [token] -> EarleyTransducer token -> Bool -matchEarley word et = acceptsChart n finalSets et - where - - initialE0 = IntMap.fromList - [ (s, IntSet.singleton 0) | s <- IntSet.toList (earleyStartStates et) ] - - sets0 = closeChartAt 0 (IntMap.singleton 0 initialE0) et - - (finalSets, n) = runInput 0 sets0 word +matchEarley word et = acceptsChart n chart et + where (n, chart) = runEarleyPrefix word et - runInput j ss [] = (ss, j) - runInput j ss (x : xs) = +runEarleyPrefix + :: Categorized token + => [token] + -> EarleyTransducer token + -> (Int, IntMap (IntMap IntSet)) +runEarleyPrefix word et = go 0 (initialChart et) word + where + go j ss [] = (j, ss) + go j ss (x : xs) = let scanned = scanFrom j x ss closed = closeChartAt (j + 1) (IntMap.insert (j + 1) scanned ss) et - in runInput (j + 1) closed xs + in go (j + 1) closed xs scanFrom j input ss = IntMap.foldrWithKey advance IntMap.empty e_j where @@ -208,6 +211,47 @@ matchEarley word et = acceptsChart n finalSets et IntSet.foldr (\d -> IntMap.insertWith IntSet.union d origs) acc ds _ -> acc + +{- | +Token classes that could legally appear next after the given input prefix, +according to the grammar. An empty result means the prefix is a dead end — +no extension can ever be accepted. Useful for autocomplete and for +\"expected one of …\" parse errors. +-} +expected + :: Categorized token + => [token] -> Bnf (RegEx token) -> [TokenClass token] +expected word bnf = map fst (scanClassOptions n chart et) + where + et = compileEarley bnf + (n, chart) = runEarleyPrefix word et + +{- | +Rule names declared in the `Bnf` that can never be entered from the start +expression — dead productions. A non-empty result is a grammar-hygiene +warning: those rules can be deleted without changing the recognized language. +-} +unreachableRules :: Bnf (RegEx token) -> Set String +unreachableRules bnf = + Map.keysSet (earleyRules et) `Set.difference` called + where + et = compileEarley bnf + called = bfs (earleyStartStates et) IntSet.empty Set.empty + + bfs frontier seen calls + | IntSet.null fresh = calls + | otherwise = bfs next (seen <> fresh) calls' + where + fresh = IntSet.difference frontier seen + (next, calls') = IntSet.foldr step (IntSet.empty, calls) fresh + + step s (acc, cs) = case IntMap.lookup s (earleyStates et) of + Just (EarleyTerminal _ ds) -> (acc <> ds, cs) + Just (EarleyNonTerminal name ds) -> + let firsts = maybe IntSet.empty fst (Map.lookup name (earleyRules et)) + in (acc <> ds <> firsts, Set.insert name cs) + Just (EarleyEmit _) -> (acc, cs) + Nothing -> (acc, cs) -- instances instance Categorized token => Matching [token] (Bnf (RegEx token)) where From 4dfbb3a3b6926af5c5897bd7c3ecbbe7d437eb64 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Wed, 22 Apr 2026 21:54:56 -0700 Subject: [PATCH 11/31] cleaning --- src/Control/Lens/Grammar.hs | 2 +- src/Control/Lens/Grammar/Matching.hs | 143 +++++++++++++-------------- 2 files changed, 71 insertions(+), 74 deletions(-) diff --git a/src/Control/Lens/Grammar.hs b/src/Control/Lens/Grammar.hs index 39720532..fa08b090 100644 --- a/src/Control/Lens/Grammar.hs +++ b/src/Control/Lens/Grammar.hs @@ -793,7 +793,7 @@ regbnfG :: Grammar Char a -> RegBnf regbnfG bnf = runGrammor bnf languageG :: (Applicative f, TokenAlgebra token (f token)) => Grammar token a -> f [[token]] -languageG bnf = languageGen (runGrammor bnf) +languageG bnf = languageGen (compileTransducer (runGrammor bnf)) {- | `printG` generates a printer from a `CtxGrammar`. Since both `RegGrammar`s and context-free `Grammar`s are `CtxGrammar`s, diff --git a/src/Control/Lens/Grammar/Matching.hs b/src/Control/Lens/Grammar/Matching.hs index 056fe689..2aae09a6 100644 --- a/src/Control/Lens/Grammar/Matching.hs +++ b/src/Control/Lens/Grammar/Matching.hs @@ -13,9 +13,12 @@ http://trevorjim.com/papers/ldta-2009.pdf module Control.Lens.Grammar.Matching ( Matching (..) + , Transducer (..) + , TransducerState (..) + , compileTransducer , languageGen - , expected - , unreachableRules + , expectedGen + , unreachableGen ) where import Control.Lens @@ -39,26 +42,26 @@ class Matching word pattern | pattern -> word where infix 2 =~ -- | A state in the Earley-extended Thompson transducer for a `Bnf`. --- @EarleyTerminal cls ds@ matches on a token class and transitions to @ds@. --- @EarleyNonTerminal name ds@ is a call point for rule @name@; after @name@ --- completes, control flows to @ds@. @EarleyEmit name@ is the final state +-- @TransducerTokenClass cls ds@ matches on a token class and transitions to @ds@. +-- @TransducerNonTerminal name ds@ is a call point for rule @name@; after @name@ +-- completes, control flows to @ds@. @TransducerEmit name@ is the final state -- for rule @name@ and triggers completion during Earley closure. -data EarleyState token - = EarleyTerminal (TokenClass token) IntSet - | EarleyNonTerminal String IntSet - | EarleyEmit String - -data EarleyTransducer token = EarleyTransducer - { earleyStates :: IntMap (EarleyState token) - , earleyRules :: Map String (IntSet, Bool) - , earleyAcceptId :: Int - , earleyStartStates :: IntSet +data TransducerState token + = TransducerTokenClass (TokenClass token) IntSet + | TransducerNonTerminal String IntSet + | TransducerEmit String + +data Transducer token = Transducer + { transducerStates :: IntMap (TransducerState token) + , transducerRules :: Map String (IntSet, Bool) + , transducerAcceptId :: Int + , transducerStartStates :: IntSet } -compileEarley :: Bnf (RegEx token) -> EarleyTransducer token -compileEarley (Bnf start rules) = EarleyTransducer - { earleyStates = IntMap.fromList allStates - , earleyRules = Map.fromList +compileTransducer :: Bnf (RegEx token) -> Transducer token +compileTransducer (Bnf start rules) = Transducer + { transducerStates = IntMap.fromList allStates + , transducerRules = Map.fromList [ ( n , ( Map.findWithDefault IntSet.empty n firstsMap , Map.findWithDefault False n nullMap @@ -66,8 +69,8 @@ compileEarley (Bnf start rules) = EarleyTransducer ) | n <- Map.keys ruleMap ] - , earleyAcceptId = earleyAcceptId0 - , earleyStartStates = startStates + , transducerAcceptId = transducerAcceptId0 + , transducerStartStates = startStates } where @@ -95,13 +98,13 @@ compileEarley (Bnf start rules) = EarleyTransducer ruleNames = Map.keys ruleMap - earleyAcceptId0 = 0 + transducerAcceptId0 = 0 (finalMap, nextIdAfterFinals) = - foldl' alloc (Map.empty, earleyAcceptId0 + 1) ruleNames + foldl' alloc (Map.empty, transducerAcceptId0 + 1) ruleNames where alloc (m, i) n = (Map.insert n i m, i + 1) - finalStatesList = [(finalMap Map.! n, EarleyEmit n) | n <- ruleNames] + finalStatesList = [(finalMap Map.! n, TransducerEmit n) | n <- ruleNames] (rulesStatesList, firstsMap, nextIdAfterRules) = foldl' compileRule ([], Map.empty, nextIdAfterFinals) (Map.toList ruleMap) @@ -117,10 +120,10 @@ compileEarley (Bnf start rules) = EarleyTransducer in (sts <> newSts, Map.insert name newFirsts fm, nid') (startFirsts, startStatesRaw, _, startBypass) = - goEarley start nextIdAfterRules (IntSet.singleton earleyAcceptId0) + goEarley start nextIdAfterRules (IntSet.singleton transducerAcceptId0) startStates = - startFirsts <> bypassStates startBypass (IntSet.singleton earleyAcceptId0) + startFirsts <> bypassStates startBypass (IntSet.singleton transducerAcceptId0) allStates = finalStatesList <> rulesStatesList <> startStatesRaw @@ -131,7 +134,7 @@ compileEarley (Bnf start rules) = EarleyTransducer SeqEmpty -> (IntSet.empty, [], nextId, True) NonTerminal name -> ( IntSet.singleton nextId - , [(nextId, EarleyNonTerminal name dests)] + , [(nextId, TransducerNonTerminal name dests)] , nextId + 1 , Map.findWithDefault False name nullMap ) @@ -165,13 +168,13 @@ compileEarley (Bnf start rules) = EarleyTransducer | Set.null chars -> (IntSet.empty, [], nextId, False) | otherwise -> ( IntSet.singleton nextId - , [(nextId, EarleyTerminal (TokenClass (OneOf chars)) dests)] + , [(nextId, TransducerTokenClass (TokenClass (OneOf chars)) dests)] , nextId + 1 , False ) RegExam (NotOneOf chars catTest) -> ( IntSet.singleton nextId - , [(nextId, EarleyTerminal (TokenClass (NotOneOf chars catTest)) dests)] + , [(nextId, TransducerTokenClass (TokenClass (NotOneOf chars catTest)) dests)] , nextId + 1 , False ) @@ -186,16 +189,12 @@ compileEarley (Bnf start rules) = EarleyTransducer , bypass0 || bypass1 ) -matchEarley :: Categorized token => [token] -> EarleyTransducer token -> Bool -matchEarley word et = acceptsChart n chart et - where (n, chart) = runEarleyPrefix word et - -runEarleyPrefix +prefixGen :: Categorized token => [token] - -> EarleyTransducer token + -> Transducer token -> (Int, IntMap (IntMap IntSet)) -runEarleyPrefix word et = go 0 (initialChart et) word +prefixGen word et = go 0 (initialChart et) word where go j ss [] = (j, ss) go j ss (x : xs) = @@ -206,8 +205,8 @@ runEarleyPrefix word et = go 0 (initialChart et) word scanFrom j input ss = IntMap.foldrWithKey advance IntMap.empty e_j where e_j = IntMap.findWithDefault IntMap.empty j ss - advance s origs acc = case IntMap.lookup s (earleyStates et) of - Just (EarleyTerminal cls ds) | tokenClass cls input -> + advance s origs acc = case IntMap.lookup s (transducerStates et) of + Just (TransducerTokenClass cls ds) | tokenClass cls input -> IntSet.foldr (\d -> IntMap.insertWith IntSet.union d origs) acc ds _ -> acc @@ -218,25 +217,23 @@ according to the grammar. An empty result means the prefix is a dead end — no extension can ever be accepted. Useful for autocomplete and for \"expected one of …\" parse errors. -} -expected +expectedGen :: Categorized token - => [token] -> Bnf (RegEx token) -> [TokenClass token] -expected word bnf = map fst (scanClassOptions n chart et) + => [token] -> Transducer token -> [TokenClass token] +expectedGen word et = map fst (scanClassOptions n chart et) where - et = compileEarley bnf - (n, chart) = runEarleyPrefix word et + (n, chart) = prefixGen word et {- | Rule names declared in the `Bnf` that can never be entered from the start expression — dead productions. A non-empty result is a grammar-hygiene warning: those rules can be deleted without changing the recognized language. -} -unreachableRules :: Bnf (RegEx token) -> Set String -unreachableRules bnf = - Map.keysSet (earleyRules et) `Set.difference` called +unreachableGen :: Transducer token -> Set String +unreachableGen et = + Map.keysSet (transducerRules et) `Set.difference` called where - et = compileEarley bnf - called = bfs (earleyStartStates et) IntSet.empty Set.empty + called = bfs (transducerStartStates et) IntSet.empty Set.empty bfs frontier seen calls | IntSet.null fresh = calls @@ -245,17 +242,20 @@ unreachableRules bnf = fresh = IntSet.difference frontier seen (next, calls') = IntSet.foldr step (IntSet.empty, calls) fresh - step s (acc, cs) = case IntMap.lookup s (earleyStates et) of - Just (EarleyTerminal _ ds) -> (acc <> ds, cs) - Just (EarleyNonTerminal name ds) -> - let firsts = maybe IntSet.empty fst (Map.lookup name (earleyRules et)) + step s (acc, cs) = case IntMap.lookup s (transducerStates et) of + Just (TransducerTokenClass _ ds) -> (acc <> ds, cs) + Just (TransducerNonTerminal name ds) -> + let firsts = maybe IntSet.empty fst (Map.lookup name (transducerRules et)) in (acc <> ds <> firsts, Set.insert name cs) - Just (EarleyEmit _) -> (acc, cs) + Just (TransducerEmit _) -> (acc, cs) Nothing -> (acc, cs) -- instances instance Categorized token => Matching [token] (Bnf (RegEx token)) where - word =~ bnf = matchEarley word (compileEarley bnf) + word =~ bnf = acceptsChart n chart et + where + et = compileTransducer bnf + (n, chart) = prefixGen word et instance Categorized token => Matching [token] (RegEx token) where word =~ pattern = word =~ liftBnf0 pattern @@ -270,11 +270,10 @@ random but always valid for the selected terminal class. -} languageGen :: (Applicative f, TokenAlgebra token (f token)) - => Bnf (RegEx token) + => Transducer token -> f [[token]] -languageGen bnf = sequenceA (fmap sampleWord classWords) +languageGen et = sequenceA (fmap sampleWord classWords) where - et = compileEarley bnf classWords = enumerateByLength [(0, [], initialChart et)] Set.empty @@ -304,28 +303,28 @@ languageGen bnf = sequenceA (fmap sampleWord classWords) ] initialChart - :: EarleyTransducer token + :: Transducer token -> IntMap (IntMap IntSet) initialChart et = closeChartAt 0 (IntMap.singleton 0 initialE0) et where initialE0 = IntMap.fromList - [ (s, IntSet.singleton 0) | s <- IntSet.toList (earleyStartStates et) ] + [ (s, IntSet.singleton 0) | s <- IntSet.toList (transducerStartStates et) ] acceptsChart :: Int -> IntMap (IntMap IntSet) - -> EarleyTransducer token + -> Transducer token -> Bool acceptsChart j chart et = IntSet.member 0 acceptOrigins where e_j = IntMap.findWithDefault IntMap.empty j chart - acceptOrigins = IntMap.findWithDefault IntSet.empty (earleyAcceptId et) e_j + acceptOrigins = IntMap.findWithDefault IntSet.empty (transducerAcceptId et) e_j scanClassOptions :: Categorized token => Int -> IntMap (IntMap IntSet) - -> EarleyTransducer token + -> Transducer token -> [(TokenClass token, IntMap (IntMap IntSet))] scanClassOptions j chart et = [ (cls, closeChartAt (j + 1) (IntMap.insert (j + 1) scanned chart) et) @@ -335,20 +334,18 @@ scanClassOptions j chart et = grouped = IntMap.foldrWithKey advance Map.empty e_j e_j = IntMap.findWithDefault IntMap.empty j chart - advance s origs acc = case IntMap.lookup s (earleyStates et) of - Just (EarleyTerminal cls ds) -> - Map.insertWith mergeEarleySet cls scanned acc + advance s origs acc = case IntMap.lookup s (transducerStates et) of + Just (TransducerTokenClass cls ds) -> + Map.insertWith (IntMap.unionWith IntSet.union) cls scanned acc where scanned = IntSet.foldr (\d -> IntMap.insertWith IntSet.union d origs) IntMap.empty ds _ -> acc - mergeEarleySet = IntMap.unionWith IntSet.union - closeChartAt :: Int -> IntMap (IntMap IntSet) - -> EarleyTransducer token + -> Transducer token -> IntMap (IntMap IntSet) closeChartAt j initialChart0 et = loop initialWork initialChart0 where @@ -357,23 +354,23 @@ closeChartAt j initialChart0 et = loop initialWork initialChart0 [ (s, i) | (s, os) <- IntMap.toList initialE, i <- IntSet.toList os ] loop [] chart = chart - loop ((s, i) : rest) chart = case IntMap.lookup s (earleyStates et) of - Just (EarleyNonTerminal name ds) -> + loop ((s, i) : rest) chart = case IntMap.lookup s (transducerStates et) of + Just (TransducerNonTerminal name ds) -> let (firsts, isNull) = Map.findWithDefault - (IntSet.empty, False) name (earleyRules et) + (IntSet.empty, False) name (transducerRules et) predItems = [(f, j) | f <- IntSet.toList firsts] nullItems = if isNull then [(d, i) | d <- IntSet.toList ds] else [] (chart', new) = addEarleyItems (predItems <> nullItems) chart in loop (new <> rest) chart' - Just (EarleyEmit name) -> + Just (TransducerEmit name) -> let e_i = IntMap.findWithDefault IntMap.empty i chart completions = [ (d, i') | (t, os) <- IntMap.toList e_i - , Just (EarleyNonTerminal n' ds) <- [IntMap.lookup t (earleyStates et)] + , Just (TransducerNonTerminal n' ds) <- [IntMap.lookup t (transducerStates et)] , n' == name , i' <- IntSet.toList os , d <- IntSet.toList ds From 11d2439cddaedca344c1577fea60a9b87ca9128e Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Wed, 22 Apr 2026 22:26:36 -0700 Subject: [PATCH 12/31] machine head --- distributors.cabal | 2 +- src/Control/Lens/Grammar.hs | 4 +- .../Lens/Grammar/{Matching.hs => Machine.hs} | 211 +++++++++++++----- src/Data/Profunctor/Grammar.hs | 2 +- 4 files changed, 154 insertions(+), 65 deletions(-) rename src/Control/Lens/Grammar/{Matching.hs => Machine.hs} (60%) diff --git a/distributors.cabal b/distributors.cabal index 45b73833..c620f938 100644 --- a/distributors.cabal +++ b/distributors.cabal @@ -36,7 +36,7 @@ library Control.Lens.Grammar.Internal.NestedPrismTH Control.Lens.Grammar.Internal.Orphanage Control.Lens.Grammar.Kleene - Control.Lens.Grammar.Matching + Control.Lens.Grammar.Machine Control.Lens.Grammar.Symbol Control.Lens.Grammar.Token Control.Lens.Grate diff --git a/src/Control/Lens/Grammar.hs b/src/Control/Lens/Grammar.hs index fa08b090..80c23c6f 100644 --- a/src/Control/Lens/Grammar.hs +++ b/src/Control/Lens/Grammar.hs @@ -46,7 +46,7 @@ import Control.Lens.PartialIso import Control.Lens.Grammar.BackusNaur import Control.Lens.Grammar.Boole import Control.Lens.Grammar.Kleene -import Control.Lens.Grammar.Matching +import Control.Lens.Grammar.Machine import Control.Lens.Grammar.Token import Control.Lens.Grammar.Symbol import Data.Bifunctor.Joker @@ -69,7 +69,7 @@ import Witherable import Control.Lens.Grammar.BackusNaur as X import Control.Lens.Grammar.Boole as X import Control.Lens.Grammar.Kleene as X -import Control.Lens.Grammar.Matching as X +import Control.Lens.Grammar.Machine as X import Control.Lens.Grammar.Symbol as X import Control.Lens.Grammar.Token as X import Control.Lens.PartialIso as X diff --git a/src/Control/Lens/Grammar/Matching.hs b/src/Control/Lens/Grammar/Machine.hs similarity index 60% rename from src/Control/Lens/Grammar/Matching.hs rename to src/Control/Lens/Grammar/Machine.hs index 2aae09a6..54bab061 100644 --- a/src/Control/Lens/Grammar/Matching.hs +++ b/src/Control/Lens/Grammar/Machine.hs @@ -1,21 +1,51 @@ {- | -Module : Control.Lens.Grammar.Matching -Description : pattern matching & language generation +Module : Control.Lens.Grammar.Machine +Description : finite-machine compilation and Earley-style matching Copyright : (C) 2026 - Eitan Chatav License : BSD-style (see the file LICENSE) Maintainer : Eitan Chatav Stability : provisional Portability : non-portable -https://www.cs.dartmouth.edu/~doug/nfa.pdf -http://trevorjim.com/papers/ldta-2009.pdf +This module presents a machine-oriented view of grammars: + +1. Compile a grammar with regular right-hand sides into a finite control machine + (`Transducer`) using a Thompson-style construction over regular expressions. +2. Run Earley-style chart recognition over that machine. +3. Derive practical products from the same engine: boolean matching, + "expected token class" reporting, language generation, and dead-rule analysis. + +The implementation follows mainstream Earley terminology (predict/scan/complete) +while adopting the transducer perspective used by Jim and Mandelbaum: + +* Earley item `(q,i)` is represented as "machine state id @q@ with origin @i@" + stored in chart set @E_j@. +* `closeChartAt` performs predict and complete to a fixed point at position @j@. +* `scanFrom`/`scanClassOptions` perform scanner steps to build @E_{j+1}@. +* `TransducerEmit` provides the "completed nonterminal" signal used by complete. + +References: + +* Trevor Jim and Yitzhak Mandelbaum, /Efficient Earley Parsing with Regular + Right-hand Sides/ (LDTA 2009). +* Thompson-style NFA compilation accounts from compiler literature. +* Standard Earley algorithm presentations (state sets, origin indices, + predictor/scanner/completer). -} -module Control.Lens.Grammar.Matching - ( Matching (..) +module Control.Lens.Grammar.Machine + ( + -- * Matching Interface + Matching (..) + + -- * Machine Representation , Transducer (..) , TransducerState (..) + + -- * Compilation , compileTransducer + + -- * Machine Execution Utilities , languageGen , expectedGen , unreachableGen @@ -37,27 +67,73 @@ import qualified Data.Set as Set import Data.Set (Set) -- | Does a word match a pattern? +-- +-- For grammar and regular-expression patterns in this package, matching is +-- performed by chart recognition over a compiled machine (`Transducer`). class Matching word pattern | pattern -> word where (=~) :: word -> pattern -> Bool infix 2 =~ --- | A state in the Earley-extended Thompson transducer for a `Bnf`. --- @TransducerTokenClass cls ds@ matches on a token class and transitions to @ds@. --- @TransducerNonTerminal name ds@ is a call point for rule @name@; after @name@ --- completes, control flows to @ds@. @TransducerEmit name@ is the final state --- for rule @name@ and triggers completion during Earley closure. +-- | A control state in the compiled parsing machine. +-- +-- Read these constructors as machine instructions: +-- +-- * `TransducerTokenClass` is a scanner transition over a terminal class. +-- * `TransducerNonTerminal` is a call site for a nonterminal. +-- * `TransducerEmit` is a completed nonterminal output used by completion. +-- +-- This corresponds to the "finite control + call/return" view of parsing +-- transducers in LDTA 2009. data TransducerState token = TransducerTokenClass (TokenClass token) IntSet | TransducerNonTerminal String IntSet | TransducerEmit String +-- | Finite control machine used by the Earley-style recognizer. +-- +-- Formal machine view (adapted for this parser): +-- +-- @ +-- T = (Q, Σ, Γ, I, F, δ) +-- @ +-- +-- where: +-- +-- * @Q@ (control states) = keys of `transducerStates`. +-- * @Σ@ (terminal alphabet) is represented intensionally by +-- `TokenClass token` labels in `TransducerTokenClass` transitions. +-- * @Γ@ (call alphabet / nonterminal symbols) = keys of `transducerRules` +-- and names carried by `TransducerNonTerminal` / `TransducerEmit`. +-- * @I@ (initial states) = `transducerStartStates`. +-- * @F@ (accepting states) = singleton `{transducerAcceptId}` for recognition. +-- * @δ@ (transition relation) is encoded by `transducerStates` constructors: +-- +-- * `TransducerTokenClass cls ds` contributes terminal transitions on any +-- token matching @cls@ from the current state to each state in @ds@. +-- * `TransducerNonTerminal name ds` contributes call transitions on symbol +-- @name@ with return destinations @ds@ (used by Earley predict/complete). +-- * `TransducerEmit name` is a completion/output state for nonterminal @name@, +-- consumed by Earley completion rather than terminal scanning. +-- +-- `transducerRules` is auxiliary indexing for Earley closure (entry-state set and +-- nullability per nonterminal), not an additional machine component. data Transducer token = Transducer - { transducerStates :: IntMap (TransducerState token) - , transducerRules :: Map String (IntSet, Bool) - , transducerAcceptId :: Int - , transducerStartStates :: IntSet + { transducerStates :: IntMap (TransducerState token) -- ^ @Q@ + , transducerRules :: Map String (IntSet, Bool) -- ^ @Γ@ + , transducerAcceptId :: Int -- ^ @F@ + , transducerStartStates :: IntSet -- ^ @I@ } +-- | Compile a regular-right-side grammar into a parsing transducer. +-- +-- Construction outline: +-- +-- * Regular-expression fragments are lowered in Thompson style by `goEarley`. +-- * Each nonterminal gets a distinguished emit/final state. +-- * Concatenation/alternation/Kleene operators wire state sets and bypassability +-- (nullability) as in regular-expression automata construction. +-- * A fixed-point nullability analysis over the grammar enables null completion +-- during Earley closure. compileTransducer :: Bnf (RegEx token) -> Transducer token compileTransducer (Bnf start rules) = Transducer { transducerStates = IntMap.fromList allStates @@ -191,20 +267,20 @@ compileTransducer (Bnf start rules) = Transducer prefixGen :: Categorized token - => [token] - -> Transducer token + => Transducer token + -> [token] -> (Int, IntMap (IntMap IntSet)) -prefixGen word et = go 0 (initialChart et) word +prefixGen et word = go 0 (initialChart et) word where - go j ss [] = (j, ss) - go j ss (x : xs) = - let scanned = scanFrom j x ss - closed = closeChartAt (j + 1) (IntMap.insert (j + 1) scanned ss) et + go j chart [] = (j, chart) + go j chart (x : xs) = + let scanned = scanFrom j x chart + closed = closeChartAt et (j + 1) (IntMap.insert (j + 1) scanned chart) in go (j + 1) closed xs - scanFrom j input ss = IntMap.foldrWithKey advance IntMap.empty e_j + scanFrom j input chart = IntMap.foldrWithKey advance IntMap.empty eJ where - e_j = IntMap.findWithDefault IntMap.empty j ss + eJ = IntMap.findWithDefault IntMap.empty j chart advance s origs acc = case IntMap.lookup s (transducerStates et) of Just (TransducerTokenClass cls ds) | tokenClass cls input -> IntSet.foldr @@ -212,22 +288,27 @@ prefixGen word et = go 0 (initialChart et) word _ -> acc {- | -Token classes that could legally appear next after the given input prefix, -according to the grammar. An empty result means the prefix is a dead end — -no extension can ever be accepted. Useful for autocomplete and for -\"expected one of …\" parse errors. +Earley scanner frontier summarized as token classes. + +Returns terminal classes that can be scanned next after the given input prefix. +An empty result means the current chart has no scanner transitions, i.e. the +prefix is a dead end for recognition. + +This is the machine-level version of "what terminals are expected next?". -} expectedGen :: Categorized token - => [token] -> Transducer token -> [TokenClass token] -expectedGen word et = map fst (scanClassOptions n chart et) + => Transducer token -> [token] -> [TokenClass token] +expectedGen et word = map fst (scanClassOptions et n chart) where - (n, chart) = prefixGen word et + (n, chart) = prefixGen et word {- | Rule names declared in the `Bnf` that can never be entered from the start expression — dead productions. A non-empty result is a grammar-hygiene warning: those rules can be deleted without changing the recognized language. + +Operationally, this is reachability over control states plus nonterminal calls. -} unreachableGen :: Transducer token -> Set String unreachableGen et = @@ -252,10 +333,10 @@ unreachableGen et = -- instances instance Categorized token => Matching [token] (Bnf (RegEx token)) where - word =~ bnf = acceptsChart n chart et + word =~ bnf = acceptsChart et n chart where et = compileTransducer bnf - (n, chart) = prefixGen word et + (n, chart) = prefixGen et word instance Categorized token => Matching [token] (RegEx token) where word =~ pattern = word =~ liftBnf0 pattern @@ -263,10 +344,14 @@ instance Matching s (APrism s t a b) where word =~ pattern = is pattern word {- | -Generate words recognized by a `Bnf` using Earley chart progression. +Generate words recognized by a grammar machine using chart progression. + +The algorithm performs a breadth-first exploration over scanner frontiers derived +from Earley sets, so words are produced by nondecreasing length. -Chart/state progression is deterministic (state id order). Token realization is -random but always valid for the selected terminal class. +Chart/state progression is deterministic (state id order). Token realization uses +`TokenAlgebra` and may be nondeterministic, but is always valid for the chosen +terminal class. -} languageGen :: (Applicative f, TokenAlgebra token (f token)) @@ -291,7 +376,7 @@ languageGen et = sequenceA (fmap sampleWord classWords) in (reverse acceptedRev, seen') where step (acc, seen) (j, revWord, chart) - | acceptsChart j chart et = + | acceptsChart et j chart = if Set.member revWord seen then (acc, seen) else (revWord : acc, Set.insert revWord seen) @@ -299,40 +384,43 @@ languageGen et = sequenceA (fmap sampleWord classWords) expand (j, revWord, chart) = [ (j + 1, cls : revWord, nextChart) - | (cls, nextChart) <- scanClassOptions j chart et + | (cls, nextChart) <- scanClassOptions et j chart ] initialChart :: Transducer token -> IntMap (IntMap IntSet) -initialChart et = closeChartAt 0 (IntMap.singleton 0 initialE0) et +initialChart et = closeChartAt et 0 (IntMap.singleton 0 initialE0) where initialE0 = IntMap.fromList [ (s, IntSet.singleton 0) | s <- IntSet.toList (transducerStartStates et) ] +-- Accept iff (q_accept, 0) is in E_n. acceptsChart - :: Int + :: Transducer token + -> Int -> IntMap (IntMap IntSet) - -> Transducer token -> Bool -acceptsChart j chart et = IntSet.member 0 acceptOrigins +acceptsChart et j chart = IntSet.member 0 acceptOrigins where - e_j = IntMap.findWithDefault IntMap.empty j chart - acceptOrigins = IntMap.findWithDefault IntSet.empty (transducerAcceptId et) e_j + eJ = IntMap.findWithDefault IntMap.empty j chart + acceptOrigins = IntMap.findWithDefault IntSet.empty (transducerAcceptId et) eJ +-- Group all scanner moves from E_j by token class; each result also carries the +-- closed successor chart at j+1. scanClassOptions - :: Categorized token + :: Transducer token + -> Categorized token => Int -> IntMap (IntMap IntSet) - -> Transducer token -> [(TokenClass token, IntMap (IntMap IntSet))] -scanClassOptions j chart et = - [ (cls, closeChartAt (j + 1) (IntMap.insert (j + 1) scanned chart) et) +scanClassOptions et j chart = + [ (cls, closeChartAt et (j + 1) (IntMap.insert (j + 1) scanned chart)) | (cls, scanned) <- Map.toAscList grouped ] where - grouped = IntMap.foldrWithKey advance Map.empty e_j - e_j = IntMap.findWithDefault IntMap.empty j chart + grouped = IntMap.foldrWithKey advance Map.empty eJ + eJ = IntMap.findWithDefault IntMap.empty j chart advance s origs acc = case IntMap.lookup s (transducerStates et) of Just (TransducerTokenClass cls ds) -> @@ -343,16 +431,17 @@ scanClassOptions j chart et = _ -> acc closeChartAt - :: Int + :: Transducer token + -> Int -> IntMap (IntMap IntSet) - -> Transducer token -> IntMap (IntMap IntSet) -closeChartAt j initialChart0 et = loop initialWork initialChart0 +closeChartAt et j initialChart0 = loop initialWork initialChart0 where - initialE = IntMap.findWithDefault IntMap.empty j initialChart0 + initialEJ = IntMap.findWithDefault IntMap.empty j initialChart0 initialWork = - [ (s, i) | (s, os) <- IntMap.toList initialE, i <- IntSet.toList os ] + [ (s, i) | (s, os) <- IntMap.toList initialEJ, i <- IntSet.toList os ] + -- Earley closure at E_j: apply predict/complete until fixed point. loop [] chart = chart loop ((s, i) : rest) chart = case IntMap.lookup s (transducerStates et) of Just (TransducerNonTerminal name ds) -> @@ -366,10 +455,10 @@ closeChartAt j initialChart0 et = loop initialWork initialChart0 in loop (new <> rest) chart' Just (TransducerEmit name) -> let - e_i = IntMap.findWithDefault IntMap.empty i chart + eI = IntMap.findWithDefault IntMap.empty i chart completions = [ (d, i') - | (t, os) <- IntMap.toList e_i + | (t, os) <- IntMap.toList eI , Just (TransducerNonTerminal n' ds) <- [IntMap.lookup t (transducerStates et)] , n' == name , i' <- IntSet.toList os @@ -383,12 +472,12 @@ closeChartAt j initialChart0 et = loop initialWork initialChart0 where ins (acc, new) (state, origin) = let - e_j = IntMap.findWithDefault IntMap.empty j acc - os = IntMap.findWithDefault IntSet.empty state e_j + eJ = IntMap.findWithDefault IntMap.empty j acc + os = IntMap.findWithDefault IntSet.empty state eJ in if IntSet.member origin os then (acc, new) else let - e_j' = IntMap.insert state (IntSet.insert origin os) e_j - acc' = IntMap.insert j e_j' acc + eJ' = IntMap.insert state (IntSet.insert origin os) eJ + acc' = IntMap.insert j eJ' acc in (acc', (state, origin) : new) diff --git a/src/Data/Profunctor/Grammar.hs b/src/Data/Profunctor/Grammar.hs index 1eb2f03c..ee88cc91 100644 --- a/src/Data/Profunctor/Grammar.hs +++ b/src/Data/Profunctor/Grammar.hs @@ -27,7 +27,7 @@ import Control.Lens import Control.Lens.Extras import Control.Lens.Grammar.BackusNaur import Control.Lens.Grammar.Kleene -import Control.Lens.Grammar.Matching +import Control.Lens.Grammar.Machine import Control.Lens.Grammar.Symbol import Control.Lens.Grammar.Token import Control.Monad From 650d5e4c216a2913ed722022fb76e75630be3d22 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Wed, 22 Apr 2026 22:39:48 -0700 Subject: [PATCH 13/31] Update Machine.hs --- src/Control/Lens/Grammar/Machine.hs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Control/Lens/Grammar/Machine.hs b/src/Control/Lens/Grammar/Machine.hs index 54bab061..bb03e8ee 100644 --- a/src/Control/Lens/Grammar/Machine.hs +++ b/src/Control/Lens/Grammar/Machine.hs @@ -54,6 +54,7 @@ module Control.Lens.Grammar.Machine import Control.Lens import Control.Lens.Extras import Control.Lens.Grammar.BackusNaur +import Control.Lens.Grammar.Boole import Control.Lens.Grammar.Kleene import Control.Lens.Grammar.Token import Data.Foldable @@ -298,8 +299,8 @@ This is the machine-level version of "what terminals are expected next?". -} expectedGen :: Categorized token - => Transducer token -> [token] -> [TokenClass token] -expectedGen et word = map fst (scanClassOptions et n chart) + => Transducer token -> [token] -> TokenClass token +expectedGen et word = anyB fst (scanClassOptions et n chart) where (n, chart) = prefixGen et word @@ -409,9 +410,9 @@ acceptsChart et j chart = IntSet.member 0 acceptOrigins -- Group all scanner moves from E_j by token class; each result also carries the -- closed successor chart at j+1. scanClassOptions - :: Transducer token - -> Categorized token - => Int + :: Categorized token + => Transducer token + -> Int -> IntMap (IntMap IntSet) -> [(TokenClass token, IntMap (IntMap IntSet))] scanClassOptions et j chart = From 98baf121050c0cea4be9beb8626edcdcb6e8a091 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Wed, 22 Apr 2026 23:01:49 -0700 Subject: [PATCH 14/31] Update Machine.hs --- src/Control/Lens/Grammar/Machine.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Control/Lens/Grammar/Machine.hs b/src/Control/Lens/Grammar/Machine.hs index bb03e8ee..03f71e49 100644 --- a/src/Control/Lens/Grammar/Machine.hs +++ b/src/Control/Lens/Grammar/Machine.hs @@ -295,7 +295,7 @@ Returns terminal classes that can be scanned next after the given input prefix. An empty result means the current chart has no scanner transitions, i.e. the prefix is a dead end for recognition. -This is the machine-level version of "what terminals are expected next?". +This is the machine-level version of "what tokens are expected next?". -} expectedGen :: Categorized token From 4a21af78812ad2bb3a428b1eb05dcbeb9e699e6b Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Thu, 23 Apr 2026 06:57:08 -0700 Subject: [PATCH 15/31] Update Machine.hs --- src/Control/Lens/Grammar/Machine.hs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/Control/Lens/Grammar/Machine.hs b/src/Control/Lens/Grammar/Machine.hs index 03f71e49..b0323fcc 100644 --- a/src/Control/Lens/Grammar/Machine.hs +++ b/src/Control/Lens/Grammar/Machine.hs @@ -34,17 +34,13 @@ References: -} module Control.Lens.Grammar.Machine - ( - -- * Matching Interface + ( -- * Matching Interface Matching (..) - -- * Machine Representation , Transducer (..) , TransducerState (..) - -- * Compilation , compileTransducer - -- * Machine Execution Utilities , languageGen , expectedGen From 3aa6039f417266702d6d5b07f1b9d152784c298f Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Fri, 24 Apr 2026 09:41:53 -0700 Subject: [PATCH 16/31] cleaning --- src/Control/Lens/Grammar.hs | 2 +- src/Control/Lens/Grammar/Machine.hs | 222 ++++++++++------------------ 2 files changed, 82 insertions(+), 142 deletions(-) diff --git a/src/Control/Lens/Grammar.hs b/src/Control/Lens/Grammar.hs index 80c23c6f..852c9dd8 100644 --- a/src/Control/Lens/Grammar.hs +++ b/src/Control/Lens/Grammar.hs @@ -793,7 +793,7 @@ regbnfG :: Grammar Char a -> RegBnf regbnfG bnf = runGrammor bnf languageG :: (Applicative f, TokenAlgebra token (f token)) => Grammar token a -> f [[token]] -languageG bnf = languageGen (compileTransducer (runGrammor bnf)) +languageG bnf = languageGen (transducer (runGrammor bnf)) {- | `printG` generates a printer from a `CtxGrammar`. Since both `RegGrammar`s and context-free `Grammar`s are `CtxGrammar`s, diff --git a/src/Control/Lens/Grammar/Machine.hs b/src/Control/Lens/Grammar/Machine.hs index b0323fcc..c90eeda9 100644 --- a/src/Control/Lens/Grammar/Machine.hs +++ b/src/Control/Lens/Grammar/Machine.hs @@ -1,49 +1,24 @@ {- | Module : Control.Lens.Grammar.Machine -Description : finite-machine compilation and Earley-style matching +Description : matching & transducers Copyright : (C) 2026 - Eitan Chatav License : BSD-style (see the file LICENSE) Maintainer : Eitan Chatav Stability : provisional Portability : non-portable -This module presents a machine-oriented view of grammars: - -1. Compile a grammar with regular right-hand sides into a finite control machine - (`Transducer`) using a Thompson-style construction over regular expressions. -2. Run Earley-style chart recognition over that machine. -3. Derive practical products from the same engine: boolean matching, - "expected token class" reporting, language generation, and dead-rule analysis. - -The implementation follows mainstream Earley terminology (predict/scan/complete) -while adopting the transducer perspective used by Jim and Mandelbaum: - -* Earley item `(q,i)` is represented as "machine state id @q@ with origin @i@" - stored in chart set @E_j@. -* `closeChartAt` performs predict and complete to a fixed point at position @j@. -* `scanFrom`/`scanClassOptions` perform scanner steps to build @E_{j+1}@. -* `TransducerEmit` provides the "completed nonterminal" signal used by complete. - -References: - -* Trevor Jim and Yitzhak Mandelbaum, /Efficient Earley Parsing with Regular - Right-hand Sides/ (LDTA 2009). -* Thompson-style NFA compilation accounts from compiler literature. -* Standard Earley algorithm presentations (state sets, origin indices, - predictor/scanner/completer). +See -} module Control.Lens.Grammar.Machine - ( -- * Matching Interface + ( -- * Matching Matching (..) - -- * Machine Representation + -- * Transducer , Transducer (..) - , TransducerState (..) - -- * Compilation - , compileTransducer - -- * Machine Execution Utilities - , languageGen + , TransducerStep (..) + , transducer , expectedGen + , languageGen , unreachableGen ) where @@ -64,76 +39,46 @@ import qualified Data.Set as Set import Data.Set (Set) -- | Does a word match a pattern? --- --- For grammar and regular-expression patterns in this package, matching is --- performed by chart recognition over a compiled machine (`Transducer`). class Matching word pattern | pattern -> word where (=~) :: word -> pattern -> Bool infix 2 =~ --- | A control state in the compiled parsing machine. --- --- Read these constructors as machine instructions: --- --- * `TransducerTokenClass` is a scanner transition over a terminal class. --- * `TransducerNonTerminal` is a call site for a nonterminal. --- * `TransducerEmit` is a completed nonterminal output used by completion. --- --- This corresponds to the "finite control + call/return" view of parsing --- transducers in LDTA 2009. -data TransducerState token - = TransducerTokenClass (TokenClass token) IntSet - | TransducerNonTerminal String IntSet - | TransducerEmit String - --- | Finite control machine used by the Earley-style recognizer. --- --- Formal machine view (adapted for this parser): --- --- @ --- T = (Q, Σ, Γ, I, F, δ) --- @ --- --- where: --- --- * @Q@ (control states) = keys of `transducerStates`. --- * @Σ@ (terminal alphabet) is represented intensionally by --- `TokenClass token` labels in `TransducerTokenClass` transitions. --- * @Γ@ (call alphabet / nonterminal symbols) = keys of `transducerRules` --- and names carried by `TransducerNonTerminal` / `TransducerEmit`. --- * @I@ (initial states) = `transducerStartStates`. --- * @F@ (accepting states) = singleton `{transducerAcceptId}` for recognition. --- * @δ@ (transition relation) is encoded by `transducerStates` constructors: --- --- * `TransducerTokenClass cls ds` contributes terminal transitions on any --- token matching @cls@ from the current state to each state in @ds@. --- * `TransducerNonTerminal name ds` contributes call transitions on symbol --- @name@ with return destinations @ds@ (used by Earley predict/complete). --- * `TransducerEmit name` is a completion/output state for nonterminal @name@, --- consumed by Earley completion rather than terminal scanning. --- --- `transducerRules` is auxiliary indexing for Earley closure (entry-state set and --- nullability per nonterminal), not an additional machine component. +{-| A `Transducer` is a tuple + +@ +T = (Σ, Δ, Q, I ⊆ Q, F ∈ Q, transition ⊆ Q × (Σ ∪ ∆) × Q, output ⊆ Q × ∆) +@ + +* @Σ@ is a (possibly infinite) set of terminal token classes, represented by `TokenClass`es. +* @Δ@ is a finite set of nonterminals, represented by the key set of `transducerRules`. +* @Q@ is a set of states, which is represented by the key set of `transducerRelations`. +* @I@ are initial states represented by `transducerStarts`. +* @F@ is a final state represented by @0@. +* @transition@ is a relation represented by `transducerRelations` + with `TransitionTokenClass` and `TransitionNonTerminal` transitions. +* @output@ is a relation represented by `transducerRelations` with `EmitNonTerminal` outputs. +-} data Transducer token = Transducer - { transducerStates :: IntMap (TransducerState token) -- ^ @Q@ - , transducerRules :: Map String (IntSet, Bool) -- ^ @Γ@ - , transducerAcceptId :: Int -- ^ @F@ - , transducerStartStates :: IntSet -- ^ @I@ + { transducerRelations :: IntMap (TransducerStep token) + , transducerRules :: Map String (IntSet, Bool) + -- ^ an index into `transducerRelations` for nonterminals with precomputed nullability + , transducerStarts :: IntSet } --- | Compile a regular-right-side grammar into a parsing transducer. --- --- Construction outline: --- --- * Regular-expression fragments are lowered in Thompson style by `goEarley`. --- * Each nonterminal gets a distinguished emit/final state. --- * Concatenation/alternation/Kleene operators wire state sets and bypassability --- (nullability) as in regular-expression automata construction. --- * A fixed-point nullability analysis over the grammar enables null completion --- during Earley closure. -compileTransducer :: Bnf (RegEx token) -> Transducer token -compileTransducer (Bnf start rules) = Transducer - { transducerStates = IntMap.fromList allStates +-- | A `TransducerStep` in a `Transducer`. +data TransducerStep token + = TransitionTokenClass (TokenClass token) IntSet + | TransitionNonTerminal String IntSet + | EmitNonTerminal String + +{- | Compile a `RegEx`tended `Bnf` into a `Transducer`, +using a combination of Thompson's algorithm for regular expressions +and Earley's algorithm for context-free grammars. See Jim & Mandelbaum, +[Efficient Earley Parsing with Regular Right-hand Sides](http://trevorjim.com/papers/ldta-2009.pdf). +-} +transducer :: Bnf (RegEx token) -> Transducer token +transducer (Bnf start rules) = Transducer + { transducerRelations = IntMap.fromList allStates , transducerRules = Map.fromList [ ( n , ( Map.findWithDefault IntSet.empty n firstsMap @@ -142,8 +87,7 @@ compileTransducer (Bnf start rules) = Transducer ) | n <- Map.keys ruleMap ] - , transducerAcceptId = transducerAcceptId0 - , transducerStartStates = startStates + , transducerStarts = startStates } where @@ -177,7 +121,7 @@ compileTransducer (Bnf start rules) = Transducer foldl' alloc (Map.empty, transducerAcceptId0 + 1) ruleNames where alloc (m, i) n = (Map.insert n i m, i + 1) - finalStatesList = [(finalMap Map.! n, TransducerEmit n) | n <- ruleNames] + finalStatesList = [(finalMap Map.! n, EmitNonTerminal n) | n <- ruleNames] (rulesStatesList, firstsMap, nextIdAfterRules) = foldl' compileRule ([], Map.empty, nextIdAfterFinals) (Map.toList ruleMap) @@ -188,12 +132,12 @@ compileTransducer (Bnf start rules) = Transducer foldl' compileProd ([], IntSet.empty, nid) prods compileProd (s, fs, i) prod = let (f, st, i', _) = - goEarley prod i (IntSet.singleton finalId) + thompson prod i (IntSet.singleton finalId) in (s <> st, fs <> f, i') in (sts <> newSts, Map.insert name newFirsts fm, nid') (startFirsts, startStatesRaw, _, startBypass) = - goEarley start nextIdAfterRules (IntSet.singleton transducerAcceptId0) + thompson start nextIdAfterRules (IntSet.singleton transducerAcceptId0) startStates = startFirsts <> bypassStates startBypass (IntSet.singleton transducerAcceptId0) @@ -203,19 +147,19 @@ compileTransducer (Bnf start rules) = Transducer bypassStates True = id bypassStates False = const IntSet.empty - goEarley rex nextId dests = case rex of + thompson rex nextId dests = case rex of SeqEmpty -> (IntSet.empty, [], nextId, True) NonTerminal name -> ( IntSet.singleton nextId - , [(nextId, TransducerNonTerminal name dests)] + , [(nextId, TransitionNonTerminal name dests)] , nextId + 1 , Map.findWithDefault False name nullMap ) Sequence rex0 rex1 -> let - (firsts1, states1, nextId1, bypass1) = goEarley rex1 nextId dests + (firsts1, states1, nextId1, bypass1) = thompson rex1 nextId dests (firsts0, states0, nextId0, bypass0) = - goEarley rex0 nextId1 (firsts1 <> bypassStates bypass1 dests) + thompson rex0 nextId1 (firsts1 <> bypassStates bypass1 dests) in ( firsts0 <> bypassStates bypass0 firsts1 , states0 <> states1 @@ -224,37 +168,37 @@ compileTransducer (Bnf start rules) = Transducer ) KleeneStar rex0 -> let - (firsts, states, nextId', _) = goEarley rex0 nextId (firsts <> dests) + (firsts, states, nextId', _) = thompson rex0 nextId (firsts <> dests) in (firsts, states, nextId', True) KleeneOpt rex0 -> let - (firsts, states, nextId', _) = goEarley rex0 nextId dests + (firsts, states, nextId', _) = thompson rex0 nextId dests in (firsts, states, nextId', True) KleenePlus rex0 -> let - (firsts, states, nextId', bypass) = goEarley rex0 nextId (firsts <> dests) + (firsts, states, nextId', bypass) = thompson rex0 nextId (firsts <> dests) in (firsts, states, nextId', bypass) RegExam (OneOf chars) | Set.null chars -> (IntSet.empty, [], nextId, False) | otherwise -> ( IntSet.singleton nextId - , [(nextId, TransducerTokenClass (TokenClass (OneOf chars)) dests)] + , [(nextId, TransitionTokenClass (TokenClass (OneOf chars)) dests)] , nextId + 1 , False ) RegExam (NotOneOf chars catTest) -> ( IntSet.singleton nextId - , [(nextId, TransducerTokenClass (TokenClass (NotOneOf chars catTest)) dests)] + , [(nextId, TransitionTokenClass (TokenClass (NotOneOf chars catTest)) dests)] , nextId + 1 , False ) RegExam (Alternate rex0 rex1) -> let - (firsts1, states1, nextId1, bypass1) = goEarley rex1 nextId dests - (firsts0, states0, nextId0, bypass0) = goEarley rex0 nextId1 dests + (firsts1, states1, nextId1, bypass1) = thompson rex1 nextId dests + (firsts0, states0, nextId0, bypass0) = thompson rex0 nextId1 dests in ( firsts0 <> firsts1 , states0 <> states1 @@ -278,24 +222,21 @@ prefixGen et word = go 0 (initialChart et) word scanFrom j input chart = IntMap.foldrWithKey advance IntMap.empty eJ where eJ = IntMap.findWithDefault IntMap.empty j chart - advance s origs acc = case IntMap.lookup s (transducerStates et) of - Just (TransducerTokenClass cls ds) | tokenClass cls input -> + advance s origs acc = case IntMap.lookup s (transducerRelations et) of + Just (TransitionTokenClass cls ds) | tokenClass cls input -> IntSet.foldr (\d -> IntMap.insertWith IntSet.union d origs) acc ds _ -> acc -{- | -Earley scanner frontier summarized as token classes. - -Returns terminal classes that can be scanned next after the given input prefix. -An empty result means the current chart has no scanner transitions, i.e. the -prefix is a dead end for recognition. - -This is the machine-level version of "what tokens are expected next?". +{- | What token is expected next? +The scanner frontier, `expectedGen` returns `TokenClass` +that can be scanned next after the given input prefix. +A `falseB` result means the current chart has no scanner transitions, +i.e. the prefix is a dead end for recognition. -} expectedGen :: Categorized token - => Transducer token -> [token] -> TokenClass token + => Transducer token -> [token] {- ^ prefix -} -> TokenClass token expectedGen et word = anyB fst (scanClassOptions et n chart) where (n, chart) = prefixGen et word @@ -311,7 +252,7 @@ unreachableGen :: Transducer token -> Set String unreachableGen et = Map.keysSet (transducerRules et) `Set.difference` called where - called = bfs (transducerStartStates et) IntSet.empty Set.empty + called = bfs (transducerStarts et) IntSet.empty Set.empty bfs frontier seen calls | IntSet.null fresh = calls @@ -320,19 +261,19 @@ unreachableGen et = fresh = IntSet.difference frontier seen (next, calls') = IntSet.foldr step (IntSet.empty, calls) fresh - step s (acc, cs) = case IntMap.lookup s (transducerStates et) of - Just (TransducerTokenClass _ ds) -> (acc <> ds, cs) - Just (TransducerNonTerminal name ds) -> + step s (acc, cs) = case IntMap.lookup s (transducerRelations et) of + Just (TransitionTokenClass _ ds) -> (acc <> ds, cs) + Just (TransitionNonTerminal name ds) -> let firsts = maybe IntSet.empty fst (Map.lookup name (transducerRules et)) in (acc <> ds <> firsts, Set.insert name cs) - Just (TransducerEmit _) -> (acc, cs) + Just (EmitNonTerminal _) -> (acc, cs) Nothing -> (acc, cs) -- instances instance Categorized token => Matching [token] (Bnf (RegEx token)) where - word =~ bnf = acceptsChart et n chart + word =~ bnf = acceptsChart n chart where - et = compileTransducer bnf + et = transducer bnf (n, chart) = prefixGen et word instance Categorized token => Matching [token] (RegEx token) where @@ -373,7 +314,7 @@ languageGen et = sequenceA (fmap sampleWord classWords) in (reverse acceptedRev, seen') where step (acc, seen) (j, revWord, chart) - | acceptsChart et j chart = + | acceptsChart j chart = if Set.member revWord seen then (acc, seen) else (revWord : acc, Set.insert revWord seen) @@ -390,18 +331,17 @@ initialChart initialChart et = closeChartAt et 0 (IntMap.singleton 0 initialE0) where initialE0 = IntMap.fromList - [ (s, IntSet.singleton 0) | s <- IntSet.toList (transducerStartStates et) ] + [ (s, IntSet.singleton 0) | s <- IntSet.toList (transducerStarts et) ] -- Accept iff (q_accept, 0) is in E_n. acceptsChart - :: Transducer token - -> Int + :: Int -> IntMap (IntMap IntSet) -> Bool -acceptsChart et j chart = IntSet.member 0 acceptOrigins +acceptsChart j chart = IntSet.member 0 acceptOrigins where eJ = IntMap.findWithDefault IntMap.empty j chart - acceptOrigins = IntMap.findWithDefault IntSet.empty (transducerAcceptId et) eJ + acceptOrigins = IntMap.findWithDefault IntSet.empty 0 eJ -- Group all scanner moves from E_j by token class; each result also carries the -- closed successor chart at j+1. @@ -419,8 +359,8 @@ scanClassOptions et j chart = grouped = IntMap.foldrWithKey advance Map.empty eJ eJ = IntMap.findWithDefault IntMap.empty j chart - advance s origs acc = case IntMap.lookup s (transducerStates et) of - Just (TransducerTokenClass cls ds) -> + advance s origs acc = case IntMap.lookup s (transducerRelations et) of + Just (TransitionTokenClass cls ds) -> Map.insertWith (IntMap.unionWith IntSet.union) cls scanned acc where scanned = IntSet.foldr @@ -440,8 +380,8 @@ closeChartAt et j initialChart0 = loop initialWork initialChart0 -- Earley closure at E_j: apply predict/complete until fixed point. loop [] chart = chart - loop ((s, i) : rest) chart = case IntMap.lookup s (transducerStates et) of - Just (TransducerNonTerminal name ds) -> + loop ((s, i) : rest) chart = case IntMap.lookup s (transducerRelations et) of + Just (TransitionNonTerminal name ds) -> let (firsts, isNull) = Map.findWithDefault (IntSet.empty, False) name (transducerRules et) @@ -450,13 +390,13 @@ closeChartAt et j initialChart0 = loop initialWork initialChart0 if isNull then [(d, i) | d <- IntSet.toList ds] else [] (chart', new) = addEarleyItems (predItems <> nullItems) chart in loop (new <> rest) chart' - Just (TransducerEmit name) -> + Just (EmitNonTerminal name) -> let eI = IntMap.findWithDefault IntMap.empty i chart completions = [ (d, i') | (t, os) <- IntMap.toList eI - , Just (TransducerNonTerminal n' ds) <- [IntMap.lookup t (transducerStates et)] + , Just (TransitionNonTerminal n' ds) <- [IntMap.lookup t (transducerRelations et)] , n' == name , i' <- IntSet.toList os , d <- IntSet.toList ds From b8c7b7868a9ea7cac74a7e479a61d4bca084b23a Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Fri, 24 Apr 2026 10:56:24 -0700 Subject: [PATCH 17/31] cleaning --- src/Control/Lens/Grammar.hs | 6 ++- src/Control/Lens/Grammar/Machine.hs | 83 +++++++++++++++-------------- 2 files changed, 47 insertions(+), 42 deletions(-) diff --git a/src/Control/Lens/Grammar.hs b/src/Control/Lens/Grammar.hs index 852c9dd8..5b2b10e4 100644 --- a/src/Control/Lens/Grammar.hs +++ b/src/Control/Lens/Grammar.hs @@ -24,6 +24,7 @@ module Control.Lens.Grammar , regbnfG , regbnfGrammar , applicativeG + , transducerG , languageG -- * Context-sensitive grammar , CtxGrammar @@ -792,8 +793,11 @@ It can apply to a `RegGrammar`. regbnfG :: Grammar Char a -> RegBnf regbnfG bnf = runGrammor bnf +transducerG :: Categorized token => Grammar token a -> Transducer token +transducerG bnf = transducer (runGrammor bnf) + languageG :: (Applicative f, TokenAlgebra token (f token)) => Grammar token a -> f [[token]] -languageG bnf = languageGen (transducer (runGrammor bnf)) +languageG bnf = languageRun (transducer (runGrammor bnf)) {- | `printG` generates a printer from a `CtxGrammar`. Since both `RegGrammar`s and context-free `Grammar`s are `CtxGrammar`s, diff --git a/src/Control/Lens/Grammar/Machine.hs b/src/Control/Lens/Grammar/Machine.hs index c90eeda9..8f44f1f1 100644 --- a/src/Control/Lens/Grammar/Machine.hs +++ b/src/Control/Lens/Grammar/Machine.hs @@ -6,20 +6,18 @@ License : BSD-style (see the file LICENSE) Maintainer : Eitan Chatav Stability : provisional Portability : non-portable - -See -} module Control.Lens.Grammar.Machine ( -- * Matching Matching (..) -- * Transducer + , transducer + , languageRun + , expectedRun + , unreachableRun , Transducer (..) , TransducerStep (..) - , transducer - , expectedGen - , languageGen - , unreachableGen ) where import Control.Lens @@ -42,6 +40,20 @@ import Data.Set (Set) class Matching word pattern | pattern -> word where (=~) :: word -> pattern -> Bool infix 2 =~ +-- instances +instance Categorized token + => Matching [token] (Transducer token) where + word =~ et = acceptsChart n chart + where + (n, chart) = prefixRun et word +instance Categorized token + => Matching [token] (Bnf (RegEx token)) where + word =~ bnf = word =~ transducer bnf +instance Categorized token + => Matching [token] (RegEx token) where + word =~ pattern = word =~ liftBnf0 pattern +instance Matching s (APrism s t a b) where + word =~ pattern = is pattern word {-| A `Transducer` is a tuple @@ -74,7 +86,14 @@ data TransducerStep token {- | Compile a `RegEx`tended `Bnf` into a `Transducer`, using a combination of Thompson's algorithm for regular expressions and Earley's algorithm for context-free grammars. See Jim & Mandelbaum, -[Efficient Earley Parsing with Regular Right-hand Sides](http://trevorjim.com/papers/ldta-2009.pdf). +[Efficient Earley Parsing with Regular Right-hand Sides] +(http://trevorjim.com/papers/ldta-2009.pdf), +and McIlroy, [Enumerating the strings of regular languages] +(https://www.cs.dartmouth.edu/~doug/nfa.pdf). + +A transducer is a form of [finite state machine] +(https://www.scribd.com/doc/76189520/John-H-Conway-Regular-Algebra-and-Finite-Machines) +that can be run in various ways like `=~`, `expectedRun`, `languageRun` & `unreachableRun`. -} transducer :: Bnf (RegEx token) -> Transducer token transducer (Bnf start rules) = Transducer @@ -206,12 +225,12 @@ transducer (Bnf start rules) = Transducer , bypass0 || bypass1 ) -prefixGen +prefixRun :: Categorized token => Transducer token -> [token] -> (Int, IntMap (IntMap IntSet)) -prefixGen et word = go 0 (initialChart et) word +prefixRun et word = go 0 (initialChart et) word where go j chart [] = (j, chart) go j chart (x : xs) = @@ -229,27 +248,25 @@ prefixGen et word = go 0 (initialChart et) word _ -> acc {- | What token is expected next? -The scanner frontier, `expectedGen` returns `TokenClass` +The scanner frontier, `expectedRun` returns the `TokenClass` that can be scanned next after the given input prefix. A `falseB` result means the current chart has no scanner transitions, i.e. the prefix is a dead end for recognition. -} -expectedGen +expectedRun :: Categorized token => Transducer token -> [token] {- ^ prefix -} -> TokenClass token -expectedGen et word = anyB fst (scanClassOptions et n chart) +expectedRun et word = anyB fst (scanClassOptions et n chart) where - (n, chart) = prefixGen et word + (n, chart) = prefixRun et word {- | -Rule names declared in the `Bnf` that can never be entered from the start +Rule names that can never be entered from the start expression — dead productions. A non-empty result is a grammar-hygiene warning: those rules can be deleted without changing the recognized language. - -Operationally, this is reachability over control states plus nonterminal calls. -} -unreachableGen :: Transducer token -> Set String -unreachableGen et = +unreachableRun :: Transducer token -> Set String +unreachableRun et = Map.keysSet (transducerRules et) `Set.difference` called where called = bfs (transducerStarts et) IntSet.empty Set.empty @@ -268,34 +285,18 @@ unreachableGen et = in (acc <> ds <> firsts, Set.insert name cs) Just (EmitNonTerminal _) -> (acc, cs) Nothing -> (acc, cs) --- instances -instance Categorized token - => Matching [token] (Bnf (RegEx token)) where - word =~ bnf = acceptsChart n chart - where - et = transducer bnf - (n, chart) = prefixGen et word -instance Categorized token - => Matching [token] (RegEx token) where - word =~ pattern = word =~ liftBnf0 pattern -instance Matching s (APrism s t a b) where - word =~ pattern = is pattern word {- | -Generate words recognized by a grammar machine using chart progression. - -The algorithm performs a breadth-first exploration over scanner frontiers derived -from Earley sets, so words are produced by nondecreasing length. - -Chart/state progression is deterministic (state id order). Token realization uses -`TokenAlgebra` and may be nondeterministic, but is always valid for the chosen -terminal class. +`languageRun` lazily produces all words in a language from shortest to longest. +However since `TokenClass`es can resolve to infinite sets of tokens, +and the relevant case of `Char` tokens while not infinite is huge, +it samples tokens in an `Applicative` `TokenAlgebra`. -} -languageGen +languageRun :: (Applicative f, TokenAlgebra token (f token)) - => Transducer token + => Transducer token -- ^ transducer -> f [[token]] -languageGen et = sequenceA (fmap sampleWord classWords) +languageRun et = sequenceA (fmap sampleWord classWords) where classWords = enumerateByLength [(0, [], initialChart et)] Set.empty From 5c2ade7dff3782a4c49cbf75150aada681d8a5e2 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Fri, 24 Apr 2026 14:56:25 -0700 Subject: [PATCH 18/31] cleaning --- src/Control/Lens/Grammar.hs | 7 ++++++- src/Control/Lens/Grammar/Machine.hs | 3 +-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/Control/Lens/Grammar.hs b/src/Control/Lens/Grammar.hs index 5b2b10e4..4eac34df 100644 --- a/src/Control/Lens/Grammar.hs +++ b/src/Control/Lens/Grammar.hs @@ -793,11 +793,16 @@ It can apply to a `RegGrammar`. regbnfG :: Grammar Char a -> RegBnf regbnfG bnf = runGrammor bnf +{- | Compile a `Grammar` into a `Transducer`. + +A transducer is a form of finite state machine +that can be run in various ways like `=~`, `expectedRun`, `languageRun` & `unreachableRun`. +-} transducerG :: Categorized token => Grammar token a -> Transducer token transducerG bnf = transducer (runGrammor bnf) languageG :: (Applicative f, TokenAlgebra token (f token)) => Grammar token a -> f [[token]] -languageG bnf = languageRun (transducer (runGrammor bnf)) +languageG bnf = languageRun (transducerG bnf) {- | `printG` generates a printer from a `CtxGrammar`. Since both `RegGrammar`s and context-free `Grammar`s are `CtxGrammar`s, diff --git a/src/Control/Lens/Grammar/Machine.hs b/src/Control/Lens/Grammar/Machine.hs index 8f44f1f1..be5476c8 100644 --- a/src/Control/Lens/Grammar/Machine.hs +++ b/src/Control/Lens/Grammar/Machine.hs @@ -91,8 +91,7 @@ and Earley's algorithm for context-free grammars. See Jim & Mandelbaum, and McIlroy, [Enumerating the strings of regular languages] (https://www.cs.dartmouth.edu/~doug/nfa.pdf). -A transducer is a form of [finite state machine] -(https://www.scribd.com/doc/76189520/John-H-Conway-Regular-Algebra-and-Finite-Machines) +A transducer is a form of finite state machine that can be run in various ways like `=~`, `expectedRun`, `languageRun` & `unreachableRun`. -} transducer :: Bnf (RegEx token) -> Transducer token From 23559716d167e18a055346ab915f36671ee7d8b1 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Fri, 24 Apr 2026 15:06:42 -0700 Subject: [PATCH 19/31] Update Machine.hs --- src/Control/Lens/Grammar/Machine.hs | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/Control/Lens/Grammar/Machine.hs b/src/Control/Lens/Grammar/Machine.hs index be5476c8..0673a924 100644 --- a/src/Control/Lens/Grammar/Machine.hs +++ b/src/Control/Lens/Grammar/Machine.hs @@ -100,7 +100,7 @@ transducer (Bnf start rules) = Transducer , transducerRules = Map.fromList [ ( n , ( Map.findWithDefault IntSet.empty n firstsMap - , Map.findWithDefault False n nullMap + , Set.member n nullSet ) ) | n <- Map.keys ruleMap @@ -115,7 +115,7 @@ transducer (Bnf start rules) = Transducer rexNullable nm = \case SeqEmpty -> True - NonTerminal n -> Map.findWithDefault False n nm + NonTerminal n -> Set.member n nm Sequence x y -> rexNullable nm x && rexNullable nm y KleeneStar _ -> True KleeneOpt _ -> True @@ -124,14 +124,17 @@ transducer (Bnf start rules) = Transducer RegExam (OneOf _) -> False RegExam (NotOneOf _ _) -> False - iterNull nm = - let nm' = Map.mapWithKey - (\n _ -> any (rexNullable nm) (Map.findWithDefault [] n ruleMap)) nm - in if nm == nm' then nm else iterNull nm' + ruleNames = Map.keys ruleMap - nullMap = iterNull (Map.map (const False) ruleMap) + iterNull ns = + let ns' = Set.fromList + [ n + | n <- ruleNames + , any (rexNullable ns) (Map.findWithDefault [] n ruleMap) + ] + in if ns == ns' then ns else iterNull ns' - ruleNames = Map.keys ruleMap + nullSet = iterNull Set.empty transducerAcceptId0 = 0 @@ -171,7 +174,7 @@ transducer (Bnf start rules) = Transducer ( IntSet.singleton nextId , [(nextId, TransitionNonTerminal name dests)] , nextId + 1 - , Map.findWithDefault False name nullMap + , Set.member name nullSet ) Sequence rex0 rex1 -> let From 05f77056641c7ebda4319dea4e7be6ab368bc943 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Fri, 24 Apr 2026 15:20:59 -0700 Subject: [PATCH 20/31] lazy state --- src/Control/Lens/Grammar/Kleene.hs | 2 +- src/Control/Lens/Grammar/Token.hs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Control/Lens/Grammar/Kleene.hs b/src/Control/Lens/Grammar/Kleene.hs index 03e8d739..85c54c1f 100644 --- a/src/Control/Lens/Grammar/Kleene.hs +++ b/src/Control/Lens/Grammar/Kleene.hs @@ -30,7 +30,7 @@ import Control.Lens.Grammar.Boole import Control.Lens.Grammar.Symbol import Control.Lens.Grammar.Token import Control.Monad.Loops -import Control.Monad.Trans.State.Strict (StateT, state) +import Control.Monad.State (StateT, state) import Data.Bifunctor.Joker import Data.Foldable import Data.MemoTrie diff --git a/src/Control/Lens/Grammar/Token.hs b/src/Control/Lens/Grammar/Token.hs index 1bede35c..5ad21da5 100644 --- a/src/Control/Lens/Grammar/Token.hs +++ b/src/Control/Lens/Grammar/Token.hs @@ -27,7 +27,7 @@ import Data.Foldable import Data.Profunctor import Data.Profunctor.Monoidal import Data.Word -import Control.Monad.Trans.State.Strict (StateT, state) +import Control.Monad.State (StateT, state) import System.Random (RandomGen, Random, random, randomR) import Test.QuickCheck.Arbitrary import Test.QuickCheck.Gen (Gen) From ac3d8172cf72163e1513283c8fa7886414184b2d Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Fri, 24 Apr 2026 15:22:16 -0700 Subject: [PATCH 21/31] Update Grammar.hs --- src/Control/Lens/Grammar.hs | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/Control/Lens/Grammar.hs b/src/Control/Lens/Grammar.hs index 4eac34df..11b2ad0a 100644 --- a/src/Control/Lens/Grammar.hs +++ b/src/Control/Lens/Grammar.hs @@ -801,7 +801,28 @@ that can be run in various ways like `=~`, `expectedRun`, `languageRun` & `unrea transducerG :: Categorized token => Grammar token a -> Transducer token transducerG bnf = transducer (runGrammor bnf) -languageG :: (Applicative f, TokenAlgebra token (f token)) => Grammar token a -> f [[token]] +{- | +`languageG` lazily produces all words in a language from shortest to longest. +However since `TokenClass`es can resolve to infinite sets of tokens, +and the relevant case of `Char` tokens while not infinite is huge, +it samples tokens in an `Applicative` `TokenAlgebra`. + +>>> import Control.Monad.State +>>> import System.Random +>>> let gen = mkStdGen 69 +>>> evalState (take 10 <$> languageG @Char regexGrammar) gen +["","|","\776269","()","[]","\\[","||","|\249908","\770923*","\1008821+"] + +This is useful for generating valid language examples for property tests. + +>>> import Test.QuickCheck +>>> let rg = regbnfG regexGrammar +>>> words100 <- generate (take 100 <$> languageG @Char regexGrammar) +>>> quickCheckWith stdArgs {maxSuccess = 1} (property (all (=~ rg) words100)) ++++ OK, passed 1 test. + +-} +languageG :: (TokenAlgebra token (f token), Applicative f) => Grammar token a -> f [[token]] languageG bnf = languageRun (transducerG bnf) {- | `printG` generates a printer from a `CtxGrammar`. From cc2332573868f3b27badceb79c02338397f83c06 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Fri, 24 Apr 2026 15:38:42 -0700 Subject: [PATCH 22/31] cache optimization For each Earley set E_i with i < j, build a caller index (keyed by nonterminal name) once and reuse it across all EmitNonTerminal completions that share the same origin i, avoiding repeated linear scans of E_i per completion. --- src/Control/Lens/Grammar/Machine.hs | 41 ++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/src/Control/Lens/Grammar/Machine.hs b/src/Control/Lens/Grammar/Machine.hs index 0673a924..ea14d1ac 100644 --- a/src/Control/Lens/Grammar/Machine.hs +++ b/src/Control/Lens/Grammar/Machine.hs @@ -375,15 +375,21 @@ closeChartAt -> Int -> IntMap (IntMap IntSet) -> IntMap (IntMap IntSet) -closeChartAt et j initialChart0 = loop initialWork initialChart0 +closeChartAt et j initialChart0 = loop initialWork initialChart0 IntMap.empty where initialEJ = IntMap.findWithDefault IntMap.empty j initialChart0 initialWork = [ (s, i) | (s, os) <- IntMap.toList initialEJ, i <- IntSet.toList os ] + -- For fixed i < j, E_i does not change while closing E_j. Cache an index + -- from nonterminal name to caller origins/continuations to speed completion. + -- IntMap key: origin index i + -- Map key: nonterminal name + -- Value: list of (caller origins, continuation destinations) + -- Earley closure at E_j: apply predict/complete until fixed point. - loop [] chart = chart - loop ((s, i) : rest) chart = case IntMap.lookup s (transducerRelations et) of + loop [] chart _ = chart + loop ((s, i) : rest) chart callerCache = case IntMap.lookup s (transducerRelations et) of Just (TransitionNonTerminal name ds) -> let (firsts, isNull) = Map.findWithDefault @@ -392,21 +398,36 @@ closeChartAt et j initialChart0 = loop initialWork initialChart0 nullItems = if isNull then [(d, i) | d <- IntSet.toList ds] else [] (chart', new) = addEarleyItems (predItems <> nullItems) chart - in loop (new <> rest) chart' + in loop (new <> rest) chart' callerCache Just (EmitNonTerminal name) -> let - eI = IntMap.findWithDefault IntMap.empty i chart + (indexed, callerCache') = callerEntries i chart callerCache + callerRows = Map.findWithDefault [] name indexed completions = [ (d, i') - | (t, os) <- IntMap.toList eI - , Just (TransitionNonTerminal n' ds) <- [IntMap.lookup t (transducerRelations et)] - , n' == name + | (os, ds) <- callerRows , i' <- IntSet.toList os , d <- IntSet.toList ds ] (chart', new) = addEarleyItems completions chart - in loop (new <> rest) chart' - _ -> loop rest chart + in loop (new <> rest) chart' callerCache' + _ -> loop rest chart callerCache + + callerEntries i chart callerCache + -- E_j mutates during closure, so do not cache index for i == j. + | i == j = (buildCallerIndex (IntMap.findWithDefault IntMap.empty i chart), callerCache) + | otherwise = case IntMap.lookup i callerCache of + Just indexed -> (indexed, callerCache) + Nothing -> + let indexed = buildCallerIndex (IntMap.findWithDefault IntMap.empty i chart) + in (indexed, IntMap.insert i indexed callerCache) + + buildCallerIndex eI = IntMap.foldrWithKey step Map.empty eI + where + step t os acc = case IntMap.lookup t (transducerRelations et) of + Just (TransitionNonTerminal n ds) -> + Map.insertWith (++) n [(os, ds)] acc + _ -> acc addEarleyItems items chart = foldl' ins (chart, []) items where From 1bfe148bd73aa79f569ebc4fd1f382c54766c29f Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Fri, 24 Apr 2026 16:06:29 -0700 Subject: [PATCH 23/31] Update Machine.hs --- src/Control/Lens/Grammar/Machine.hs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Control/Lens/Grammar/Machine.hs b/src/Control/Lens/Grammar/Machine.hs index ea14d1ac..f31e8cf9 100644 --- a/src/Control/Lens/Grammar/Machine.hs +++ b/src/Control/Lens/Grammar/Machine.hs @@ -401,8 +401,8 @@ closeChartAt et j initialChart0 = loop initialWork initialChart0 IntMap.empty in loop (new <> rest) chart' callerCache Just (EmitNonTerminal name) -> let - (indexed, callerCache') = callerEntries i chart callerCache - callerRows = Map.findWithDefault [] name indexed + (ixed, callerCache') = callerEntries i chart callerCache + callerRows = Map.findWithDefault [] name ixed completions = [ (d, i') | (os, ds) <- callerRows @@ -417,10 +417,10 @@ closeChartAt et j initialChart0 = loop initialWork initialChart0 IntMap.empty -- E_j mutates during closure, so do not cache index for i == j. | i == j = (buildCallerIndex (IntMap.findWithDefault IntMap.empty i chart), callerCache) | otherwise = case IntMap.lookup i callerCache of - Just indexed -> (indexed, callerCache) + Just ixed -> (ixed, callerCache) Nothing -> - let indexed = buildCallerIndex (IntMap.findWithDefault IntMap.empty i chart) - in (indexed, IntMap.insert i indexed callerCache) + let ixed = buildCallerIndex (IntMap.findWithDefault IntMap.empty i chart) + in (ixed, IntMap.insert i ixed callerCache) buildCallerIndex eI = IntMap.foldrWithKey step Map.empty eI where From d6b8d76ff0aaa3dc333f393ff1ae14b3696ce1fb Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Fri, 24 Apr 2026 16:35:53 -0700 Subject: [PATCH 24/31] parseForestRun --- src/Control/Lens/Grammar/Machine.hs | 116 +++++++++++++++++++++++++++- test/Main.hs | 24 ++++++ 2 files changed, 136 insertions(+), 4 deletions(-) diff --git a/src/Control/Lens/Grammar/Machine.hs b/src/Control/Lens/Grammar/Machine.hs index f31e8cf9..89fe0e2d 100644 --- a/src/Control/Lens/Grammar/Machine.hs +++ b/src/Control/Lens/Grammar/Machine.hs @@ -13,6 +13,7 @@ module Control.Lens.Grammar.Machine Matching (..) -- * Transducer , transducer + , parseForestRun , languageRun , expectedRun , unreachableRun @@ -35,6 +36,7 @@ import qualified Data.Map.Strict as Map import Data.Map.Strict (Map) import qualified Data.Set as Set import Data.Set (Set) +import Data.Tree (Tree (..)) -- | Does a word match a pattern? class Matching word pattern | pattern -> word where @@ -45,7 +47,7 @@ instance Categorized token => Matching [token] (Transducer token) where word =~ et = acceptsChart n chart where - (n, chart) = prefixRun et word + (n, chart) = prefixGen et word instance Categorized token => Matching [token] (Bnf (RegEx token)) where word =~ bnf = word =~ transducer bnf @@ -227,12 +229,118 @@ transducer (Bnf start rules) = Transducer , bypass0 || bypass1 ) -prefixRun +parseForestRun + :: Categorized token + => Transducer token + -> [token] + -> ([Tree (String, Int, Int, [token])], [token]) -- ^ parse forest spans & remaining unparsed tokens +parseForestRun et word = (concat (itemForests Set.empty Nothing 0 acceptedLen 0), drop acceptedLen word) + where + (n, chart) = prefixGen et word + relations = transducerRelations et + acceptedLen = maximum [j | j <- [0 .. n], acceptsChart j chart] + + acceptedWord = take acceptedLen word + sliceAt start end = take (end - start) (drop start acceptedWord) + itemsAt j = IntMap.findWithDefault IntMap.empty j chart + ruleInfo name = Map.findWithDefault (IntSet.empty, False) name (transducerRules et) + + edgesAt :: IntMap (IntMap [edge]) -> Int -> Int -> [edge] + edgesAt table pos stateId = + IntMap.findWithDefault [] stateId (IntMap.findWithDefault IntMap.empty pos table) + + insertEdges :: edge -> IntSet -> IntMap [edge] -> IntMap [edge] + insertEdges edge dests acc = IntSet.foldr + (\stateId m -> IntMap.insertWith (++) stateId [edge] m) + acc + dests + + scanBack = IntMap.fromList + [ (end, backRow (end - 1) input) + | (end, input) <- zip [1 .. acceptedLen] acceptedWord + ] + where + backRow prev input = IntMap.foldrWithKey step IntMap.empty (itemsAt prev) + where + step prevState origins acc = case IntMap.lookup prevState relations of + Just (TransitionTokenClass cls dests) | tokenClass cls input -> + insertEdges (prevState, origins) dests acc + _ -> acc + + completeBack = IntMap.fromList + [ (split, IntMap.foldrWithKey step IntMap.empty (itemsAt split)) + | split <- [0 .. acceptedLen] + ] + where + step caller origins acc = case IntMap.lookup caller relations of + Just (TransitionNonTerminal name dests) -> + insertEdges (caller, origins, name) dests acc + _ -> acc + + ruleFinals = IntMap.foldrWithKey finalStates Map.empty relations + finalStates stateId step acc = case step of + EmitNonTerminal name -> Map.insert name stateId acc + _ -> acc + + entryStates Nothing = transducerStarts et + entryStates (Just name) = fst (ruleInfo name) + + ruleNullable = snd . ruleInfo + + itemForests guards entry origin end stateId + | Set.member itemKey guards = [] + | otherwise = baseForests <> scannedForests <> completedForests + where + itemKey = Left (entry, origin, end, stateId) + guards' = Set.insert itemKey guards + + baseForests + | end == origin && IntSet.member stateId (entryStates entry) = [[]] + | otherwise = [] + + scannedForests + | end <= origin = [] + | otherwise = + [ forest + | (prevState, origins) <- edgesAt scanBack end stateId + , IntSet.member origin origins + , let prev = end - 1 + , forest <- itemForests guards' entry origin prev prevState + ] + + completedForests = + [ prefix <> [subtree] + | split <- [origin .. end] + , (caller, origins, name) <- edgesAt completeBack split stateId + , IntSet.member origin origins + , prefix <- itemForests guards' entry origin split caller + , subtree <- ruleTrees guards' name split end + ] + + ruleTrees guards name start end + | Set.member ruleKey guards = [] + | otherwise = nullableTrees <> derivedTrees + where + ruleKey = Right (name, start, end) + guards' = Set.insert ruleKey guards + + nullableTrees + | start == end && ruleNullable name = [Node (name, start, end, []) []] + | otherwise = [] + + derivedTrees = case Map.lookup name ruleFinals of + Nothing -> [] + Just finalState -> + [ Node (name, start, end, sliceAt start end) subtrees + | subtrees <- itemForests guards' (Just name) start end finalState + ] + +prefixGen :: Categorized token => Transducer token -> [token] -> (Int, IntMap (IntMap IntSet)) -prefixRun et word = go 0 (initialChart et) word +prefixGen et word = go 0 (initialChart et) word where go j chart [] = (j, chart) go j chart (x : xs) = @@ -260,7 +368,7 @@ expectedRun => Transducer token -> [token] {- ^ prefix -} -> TokenClass token expectedRun et word = anyB fst (scanClassOptions et n chart) where - (n, chart) = prefixRun et word + (n, chart) = prefixGen et word {- | Rule names that can never be entered from the start diff --git a/test/Main.hs b/test/Main.hs index 610e5363..cdb2ca76 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -8,6 +8,7 @@ import Data.Function (fix) import Data.List (genericLength) import Data.Maybe (isJust) import Data.Profunctor.Types (Star (..)) +import Data.Tree (Tree (..)) import System.Environment (lookupEnv) import Test.DocTest import Test.Hspec @@ -40,10 +41,33 @@ main = do describe "lambdaGrammar" $ testCfg True lambdaExamples lambdaGrammar describe "lenvecGrammar" $ testCsg True lenvecExamples lenvecGrammar describe "chainGrammar" $ testCfg True chainExamples chainGrammar + describe "parseForestRun" parseForestRunTests describe "Parsector try rollback" tryRollbackTests describe "Kleene" kleeneProperties describe "meander" meanderProperties +parseForestRunTests :: Spec +parseForestRunTests = do + it "returns the nested rule forest for a full parse" $ do + let (actualForest, actualRest) = parseForestRun (transducerG arithGrammar) "2*3+4;;;" + actualForest `shouldBe` + [ Node ("arith", 0, 5, "2*3+4") + [ Node ("sum", 0, 5, "2*3+4") + [ Node ("product", 0, 3, "2*3") + [ Node ("factor", 0, 1, "2") + [Node ("number", 0, 1, "2") []] + , Node ("factor", 2, 3, "3") + [Node ("number", 2, 3, "3") []] + ] + , Node ("product", 4, 5, "4") + [ Node ("factor", 4, 5, "4") + [Node ("number", 4, 5, "4") []] + ] + ] + ] + ] + actualRest `shouldBe` ";;;" + tryRollbackTests :: Spec tryRollbackTests = do it "rolls back parse stream/offset on failed try" $ do From 2bc2aa89b4e9c32c83fa34e02d6b01437fec5b52 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Fri, 24 Apr 2026 17:06:40 -0700 Subject: [PATCH 25/31] Update Machine.hs --- src/Control/Lens/Grammar/Machine.hs | 34 +++++++++++++++-------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/Control/Lens/Grammar/Machine.hs b/src/Control/Lens/Grammar/Machine.hs index 89fe0e2d..20251421 100644 --- a/src/Control/Lens/Grammar/Machine.hs +++ b/src/Control/Lens/Grammar/Machine.hs @@ -13,10 +13,10 @@ module Control.Lens.Grammar.Machine Matching (..) -- * Transducer , transducer - , parseForestRun - , languageRun - , expectedRun - , unreachableRun + , parseForestGen + , languageGen + , expectedGen + , unreachableGen , Transducer (..) , TransducerStep (..) ) where @@ -94,7 +94,8 @@ and McIlroy, [Enumerating the strings of regular languages] (https://www.cs.dartmouth.edu/~doug/nfa.pdf). A transducer is a form of finite state machine -that can be run in various ways like `=~`, `expectedRun`, `languageRun` & `unreachableRun`. +that can be run in various ways like +`=~`, `expectedGen`, `languageGen`, `parseForestGen` & `unreachableGen`. -} transducer :: Bnf (RegEx token) -> Transducer token transducer (Bnf start rules) = Transducer @@ -229,12 +230,13 @@ transducer (Bnf start rules) = Transducer , bypass0 || bypass1 ) -parseForestRun +parseForestGen :: Categorized token => Transducer token -> [token] - -> ([Tree (String, Int, Int, [token])], [token]) -- ^ parse forest spans & remaining unparsed tokens -parseForestRun et word = (concat (itemForests Set.empty Nothing 0 acceptedLen 0), drop acceptedLen word) + -> ([Tree (String, Int, Int, [token])], [token]) + {- ^ parse forest & remaining unparsed tokens -} +parseForestGen et word = (concat (itemForests Set.empty Nothing 0 acceptedLen 0), drop acceptedLen word) where (n, chart) = prefixGen et word relations = transducerRelations et @@ -358,15 +360,15 @@ prefixGen et word = go 0 (initialChart et) word _ -> acc {- | What token is expected next? -The scanner frontier, `expectedRun` returns the `TokenClass` +The scanner frontier, `expectedGen` returns the `TokenClass` that can be scanned next after the given input prefix. A `falseB` result means the current chart has no scanner transitions, i.e. the prefix is a dead end for recognition. -} -expectedRun +expectedGen :: Categorized token => Transducer token -> [token] {- ^ prefix -} -> TokenClass token -expectedRun et word = anyB fst (scanClassOptions et n chart) +expectedGen et word = anyB fst (scanClassOptions et n chart) where (n, chart) = prefixGen et word @@ -375,8 +377,8 @@ Rule names that can never be entered from the start expression — dead productions. A non-empty result is a grammar-hygiene warning: those rules can be deleted without changing the recognized language. -} -unreachableRun :: Transducer token -> Set String -unreachableRun et = +unreachableGen :: Transducer token -> Set String +unreachableGen et = Map.keysSet (transducerRules et) `Set.difference` called where called = bfs (transducerStarts et) IntSet.empty Set.empty @@ -397,16 +399,16 @@ unreachableRun et = Nothing -> (acc, cs) {- | -`languageRun` lazily produces all words in a language from shortest to longest. +`languageGen` lazily produces all words in a language from shortest to longest. However since `TokenClass`es can resolve to infinite sets of tokens, and the relevant case of `Char` tokens while not infinite is huge, it samples tokens in an `Applicative` `TokenAlgebra`. -} -languageRun +languageGen :: (Applicative f, TokenAlgebra token (f token)) => Transducer token -- ^ transducer -> f [[token]] -languageRun et = sequenceA (fmap sampleWord classWords) +languageGen et = sequenceA (fmap sampleWord classWords) where classWords = enumerateByLength [(0, [], initialChart et)] Set.empty From 9a18679685d8832040a6aa28ac190e25a2c4cd8c Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Fri, 24 Apr 2026 17:07:03 -0700 Subject: [PATCH 26/31] Update Main.hs --- test/Main.hs | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/test/Main.hs b/test/Main.hs index cdb2ca76..57abd412 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -41,15 +41,15 @@ main = do describe "lambdaGrammar" $ testCfg True lambdaExamples lambdaGrammar describe "lenvecGrammar" $ testCsg True lenvecExamples lenvecGrammar describe "chainGrammar" $ testCfg True chainExamples chainGrammar - describe "parseForestRun" parseForestRunTests + describe "parseForestGen" parseForestGenTests describe "Parsector try rollback" tryRollbackTests describe "Kleene" kleeneProperties describe "meander" meanderProperties -parseForestRunTests :: Spec -parseForestRunTests = do +parseForestGenTests :: Spec +parseForestGenTests = do it "returns the nested rule forest for a full parse" $ do - let (actualForest, actualRest) = parseForestRun (transducerG arithGrammar) "2*3+4;;;" + let (actualForest, actualRest) = parseForestGen (transducerG arithGrammar) "2*3+4;;;" actualForest `shouldBe` [ Node ("arith", 0, 5, "2*3+4") [ Node ("sum", 0, 5, "2*3+4") @@ -85,9 +85,22 @@ tryRollbackTests = do doctests :: IO () doctests = do + stackExe <- lookupEnv "STACK_EXE" + ghcEnvironment <- lookupEnv "GHC_ENVIRONMENT" let modulePaths = [ "src/Control/Lens/Grammar.hs" ] + sourceDirs = + [ "-isrc" + , "-itest" + ] + packageEnvFlags = case ghcEnvironment of + Just "-" -> [] + Just path -> ["-package-env=" <> path] + Nothing -> [] + runnerFlags + | isJust stackExe = [] + | otherwise = sourceDirs <> packageEnvFlags languageExtensions = [ "-XAllowAmbiguousTypes" , "-XArrows" @@ -129,7 +142,7 @@ doctests = do for_ modulePaths $ \modulePath -> do putStr "Testing module documentation in " putStrLn modulePath - doctest (modulePath : languageExtensions) + doctest (modulePath : runnerFlags <> languageExtensions) meanderProperties :: Spec meanderProperties = From 8dbaf4e612316d5b2e3b9ba6f2f69103b343e974 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Fri, 24 Apr 2026 17:40:07 -0700 Subject: [PATCH 27/31] docs --- src/Control/Lens/Grammar.hs | 78 ++++++++++++++++++++--------- src/Control/Lens/Grammar/Machine.hs | 26 +++++----- test/Main.hs | 2 +- 3 files changed, 69 insertions(+), 37 deletions(-) diff --git a/src/Control/Lens/Grammar.hs b/src/Control/Lens/Grammar.hs index 11b2ad0a..b442dc90 100644 --- a/src/Control/Lens/Grammar.hs +++ b/src/Control/Lens/Grammar.hs @@ -25,7 +25,6 @@ module Control.Lens.Grammar , regbnfGrammar , applicativeG , transducerG - , languageG -- * Context-sensitive grammar , CtxGrammar , printG @@ -795,35 +794,68 @@ regbnfG bnf = runGrammor bnf {- | Compile a `Grammar` into a `Transducer`. -A transducer is a form of finite state machine -that can be run in various ways like `=~`, `expectedRun`, `languageRun` & `unreachableRun`. --} -transducerG :: Categorized token => Grammar token a -> Transducer token -transducerG bnf = transducer (runGrammor bnf) +>>> let regexMachine = transducerG @Char regexGrammar -{- | -`languageG` lazily produces all words in a language from shortest to longest. -However since `TokenClass`es can resolve to infinite sets of tokens, -and the relevant case of `Char` tokens while not infinite is huge, -it samples tokens in an `Applicative` `TokenAlgebra`. +A transducer is a form of finite state machine, +usable as an intermediary for further generators like +`=~`, `expectNext`, `languageSample`, `parseForestGen` & `unreachableRules`. +>>> import Test.QuickCheck +>>> let regexLang = languageSample @Char regexMachine +>>> words100 <- generate (take 100 <$> regexLang) +>>> quickCheck (property (all (=~ regexMachine) words100)) ++++ OK, passed 1 test. >>> import Control.Monad.State >>> import System.Random >>> let gen = mkStdGen 69 ->>> evalState (take 10 <$> languageG @Char regexGrammar) gen -["","|","\776269","()","[]","\\[","||","|\249908","\770923*","\1008821+"] - -This is useful for generating valid language examples for property tests. - ->>> import Test.QuickCheck ->>> let rg = regbnfG regexGrammar ->>> words100 <- generate (take 100 <$> languageG @Char regexGrammar) ->>> quickCheckWith stdArgs {maxSuccess = 1} (property (all (=~ rg) words100)) -+++ OK, passed 1 test. +>>> evalState (take 15 <$> regexLang) gen +["","|","\776269","()","[]","\\[","||","|\249908","\770923*","\1008821+","\318904?","\845807|","\477898\1026934","()*","()+"] + +>>> import Data.Tree (drawForest) + +@>>> let (forest, _) = parseForestGen regexMachine "xy|z" in putStr (drawForest (map (fmap show) forest)) +("regex",0,4,"xy|z") +| +`- ("alternate",0,4,"xy|z") + | + +- ("sequence",0,2,"xy") + | | + | +- ("expression",0,1,"x") + | | | + | | `- ("atom",0,1,"x") + | | | + | | `- ("class",0,1,"x") + | | | + | | `- ("class-one-of",0,1,"x") + | | | + | | `- ("char",0,1,"x") + | | + | `- ("expression",1,2,"y") + | | + | `- ("atom",1,2,"y") + | | + | `- ("class",1,2,"y") + | | + | `- ("class-one-of",1,2,"y") + | | + | `- ("char",1,2,"y") + | + `- ("sequence",3,4,"z") + | + `- ("expression",3,4,"z") + | + `- ("atom",3,4,"z") + | + `- ("class",3,4,"z") + | + `- ("class-one-of",3,4,"z") + | + `- ("char",3,4,"z") +@ -} -languageG :: (TokenAlgebra token (f token), Applicative f) => Grammar token a -> f [[token]] -languageG bnf = languageRun (transducerG bnf) +transducerG :: Categorized token => Grammar token a -> Transducer token +transducerG bnf = transducer (runGrammor bnf) {- | `printG` generates a printer from a `CtxGrammar`. Since both `RegGrammar`s and context-free `Grammar`s are `CtxGrammar`s, diff --git a/src/Control/Lens/Grammar/Machine.hs b/src/Control/Lens/Grammar/Machine.hs index 20251421..f3aa9989 100644 --- a/src/Control/Lens/Grammar/Machine.hs +++ b/src/Control/Lens/Grammar/Machine.hs @@ -14,9 +14,9 @@ module Control.Lens.Grammar.Machine -- * Transducer , transducer , parseForestGen - , languageGen - , expectedGen - , unreachableGen + , languageSample + , expectNext + , unreachableRules , Transducer (..) , TransducerStep (..) ) where @@ -95,7 +95,7 @@ and McIlroy, [Enumerating the strings of regular languages] A transducer is a form of finite state machine that can be run in various ways like -`=~`, `expectedGen`, `languageGen`, `parseForestGen` & `unreachableGen`. +`=~`, `expectNext`, `languageSample`, `parseForestGen` & `unreachableRules`. -} transducer :: Bnf (RegEx token) -> Transducer token transducer (Bnf start rules) = Transducer @@ -360,15 +360,15 @@ prefixGen et word = go 0 (initialChart et) word _ -> acc {- | What token is expected next? -The scanner frontier, `expectedGen` returns the `TokenClass` +The scanner frontier, `expectNext` returns the `TokenClass` that can be scanned next after the given input prefix. A `falseB` result means the current chart has no scanner transitions, i.e. the prefix is a dead end for recognition. -} -expectedGen +expectNext :: Categorized token => Transducer token -> [token] {- ^ prefix -} -> TokenClass token -expectedGen et word = anyB fst (scanClassOptions et n chart) +expectNext et word = anyB fst (scanClassOptions et n chart) where (n, chart) = prefixGen et word @@ -377,8 +377,8 @@ Rule names that can never be entered from the start expression — dead productions. A non-empty result is a grammar-hygiene warning: those rules can be deleted without changing the recognized language. -} -unreachableGen :: Transducer token -> Set String -unreachableGen et = +unreachableRules :: Transducer token -> Set String +unreachableRules et = Map.keysSet (transducerRules et) `Set.difference` called where called = bfs (transducerStarts et) IntSet.empty Set.empty @@ -399,16 +399,16 @@ unreachableGen et = Nothing -> (acc, cs) {- | -`languageGen` lazily produces all words in a language from shortest to longest. +`languageSample` lazily produces all words in a language from shortest to longest. However since `TokenClass`es can resolve to infinite sets of tokens, and the relevant case of `Char` tokens while not infinite is huge, it samples tokens in an `Applicative` `TokenAlgebra`. -} -languageGen - :: (Applicative f, TokenAlgebra token (f token)) +languageSample + :: (TokenAlgebra token (f token), Applicative f) => Transducer token -- ^ transducer -> f [[token]] -languageGen et = sequenceA (fmap sampleWord classWords) +languageSample et = sequenceA (fmap sampleWord classWords) where classWords = enumerateByLength [(0, [], initialChart et)] Set.empty diff --git a/test/Main.hs b/test/Main.hs index 57abd412..61526fba 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -164,7 +164,7 @@ testCfg isLL1 examples grammar = do actualMatch `shouldBe` True describe "generated languageG" $ do it "should parses with exactly one full parse" $ do - generated <- generate (take 100 <$> languageG grammar) + generated <- generate (take 100 <$> languageSample (transducerG grammar)) for_ generated $ \word -> do let fullParses = [() | (_, "") <- parseG grammar word] fullParses `shouldBe` [()] From 726ae7b93d848613be4d93fde1f482b9cc54e4c3 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Fri, 24 Apr 2026 17:51:21 -0700 Subject: [PATCH 28/31] kleaning --- src/Control/Lens/Grammar.hs | 4 ++-- src/Control/Lens/Grammar/Machine.hs | 11 ++++++----- test/Main.hs | 8 ++++---- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/Control/Lens/Grammar.hs b/src/Control/Lens/Grammar.hs index b442dc90..8ac3779e 100644 --- a/src/Control/Lens/Grammar.hs +++ b/src/Control/Lens/Grammar.hs @@ -798,7 +798,7 @@ regbnfG bnf = runGrammor bnf A transducer is a form of finite state machine, usable as an intermediary for further generators like -`=~`, `expectNext`, `languageSample`, `parseForestGen` & `unreachableRules`. +`=~`, `expectNext`, `languageSample`, `parseForest` & `unreachableRules`. >>> import Test.QuickCheck >>> let regexLang = languageSample @Char regexMachine @@ -813,7 +813,7 @@ usable as an intermediary for further generators like >>> import Data.Tree (drawForest) -@>>> let (forest, _) = parseForestGen regexMachine "xy|z" in putStr (drawForest (map (fmap show) forest)) +@>>> let (forest, _) = parseForest regexMachine "xy|z" in putStr (drawForest (map (fmap show) forest)) ("regex",0,4,"xy|z") | `- ("alternate",0,4,"xy|z") diff --git a/src/Control/Lens/Grammar/Machine.hs b/src/Control/Lens/Grammar/Machine.hs index f3aa9989..369af27e 100644 --- a/src/Control/Lens/Grammar/Machine.hs +++ b/src/Control/Lens/Grammar/Machine.hs @@ -13,7 +13,7 @@ module Control.Lens.Grammar.Machine Matching (..) -- * Transducer , transducer - , parseForestGen + , parseForest , languageSample , expectNext , unreachableRules @@ -95,7 +95,7 @@ and McIlroy, [Enumerating the strings of regular languages] A transducer is a form of finite state machine that can be run in various ways like -`=~`, `expectNext`, `languageSample`, `parseForestGen` & `unreachableRules`. +`=~`, `expectNext`, `languageSample`, `parseForest` & `unreachableRules`. -} transducer :: Bnf (RegEx token) -> Transducer token transducer (Bnf start rules) = Transducer @@ -230,13 +230,14 @@ transducer (Bnf start rules) = Transducer , bypass0 || bypass1 ) -parseForestGen +{- | The parse forest of a string of tokens. -} +parseForest :: Categorized token => Transducer token - -> [token] + -> [token] -- ^ string -> ([Tree (String, Int, Int, [token])], [token]) {- ^ parse forest & remaining unparsed tokens -} -parseForestGen et word = (concat (itemForests Set.empty Nothing 0 acceptedLen 0), drop acceptedLen word) +parseForest et word = (concat (itemForests Set.empty Nothing 0 acceptedLen 0), drop acceptedLen word) where (n, chart) = prefixGen et word relations = transducerRelations et diff --git a/test/Main.hs b/test/Main.hs index 61526fba..02d4dbbf 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -41,15 +41,15 @@ main = do describe "lambdaGrammar" $ testCfg True lambdaExamples lambdaGrammar describe "lenvecGrammar" $ testCsg True lenvecExamples lenvecGrammar describe "chainGrammar" $ testCfg True chainExamples chainGrammar - describe "parseForestGen" parseForestGenTests + describe "parseForest" parseForestTests describe "Parsector try rollback" tryRollbackTests describe "Kleene" kleeneProperties describe "meander" meanderProperties -parseForestGenTests :: Spec -parseForestGenTests = do +parseForestTests :: Spec +parseForestTests = do it "returns the nested rule forest for a full parse" $ do - let (actualForest, actualRest) = parseForestGen (transducerG arithGrammar) "2*3+4;;;" + let (actualForest, actualRest) = parseForest (transducerG arithGrammar) "2*3+4;;;" actualForest `shouldBe` [ Node ("arith", 0, 5, "2*3+4") [ Node ("sum", 0, 5, "2*3+4") From 90fab479cc14525a97a20b0051b80037493729c8 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Fri, 24 Apr 2026 18:08:57 -0700 Subject: [PATCH 29/31] Update Kleene.hs --- src/Control/Lens/Grammar/Kleene.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Control/Lens/Grammar/Kleene.hs b/src/Control/Lens/Grammar/Kleene.hs index 85c54c1f..fe8018a5 100644 --- a/src/Control/Lens/Grammar/Kleene.hs +++ b/src/Control/Lens/Grammar/Kleene.hs @@ -233,7 +233,7 @@ instance TokenAlgebra Char (ReadP Char) where tokenClass = ReadP.satisfy . tokenClass instance (Categorized token, Arbitrary token) => TokenAlgebra token (Gen token) where tokenClass (TokenClass exam) = case exam of - OneOf xs -> Gen.elements (toList xs) + OneOf xs -> oneOf xs NotOneOf xs (AndAsIn cat) -> arbitrary `Gen.suchThat` (\x -> x `notElem` xs && categorize x == cat) NotOneOf xs (AndNotAsIn cats) -> arbitrary `Gen.suchThat` From 49beb1863a0d7ed928f08eadf65d157e862612c7 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Fri, 24 Apr 2026 18:18:36 -0700 Subject: [PATCH 30/31] update --- CHANGELOG.md | 33 +++++++++++++++++++++++++++++++++ distributors.cabal | 2 +- package.yaml | 2 +- test/Main.hs | 4 ++-- 4 files changed, 37 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e610530f..83402d2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,38 @@ # Changelog for `distributors` +## 0.6.0.0 - 2026-04-24 + +### New Module + +- Added `Control.Lens.Grammar.Machine` as the transducer/matching runtime layer. + +### New Types + +- Added `Transducer` and `TransducerStep` as the finite-state representation for + compiled grammar machines. + +### New APIs (Machine Runtime) + +- Added `transducer` to compile `Bnf (RegEx token)` into `Transducer`. +- Added `parseForest` to reconstruct parse forests with rule labels and token spans/slices, + returning the remaining unparsed suffix. +- Added `expectNext` to compute scanner-frontier expected token classes after a prefix. +- Added `languageSample` to lazily enumerate sampled language words from shortest length upward. +- Added `unreachableRules` to report dead nonterminals unreachable from the start expression. + +### Internal Machinery + +- Implemented Thompson-style transducer construction over `RegEx`-extended BNF. +- Implemented Earley-style chart runtime (`initialChart`, `closeChartAt`, `scanClassOptions`, + `prefixGen`) with predict/complete closure and scanner grouping. +- Added completion-time caller indexing/cache optimizations and precomputed rule nullability/first-state + indexing to speed machine execution. + +### Grammar Integration + +- `Control.Lens.Grammar` now exposes machine-backed generators: + `transducerG` and the parse-forest examples/docs built on the Machine runtime. + ## 0.5.0.0 - 2026-04-16 ### Changes diff --git a/distributors.cabal b/distributors.cabal index c620f938..2794cd48 100644 --- a/distributors.cabal +++ b/distributors.cabal @@ -5,7 +5,7 @@ cabal-version: 2.2 -- see: https://github.com/sol/hpack name: distributors -version: 0.5.0.0 +version: 0.6.0.0 synopsis: Unifying Parsers, Printers & Grammars description: Distributors provides mathematically inspired abstractions for coders to write parsers that can also be inverted to printers. category: Profunctors, Optics, Parsing diff --git a/package.yaml b/package.yaml index 1e652244..4319745c 100644 --- a/package.yaml +++ b/package.yaml @@ -1,5 +1,5 @@ name: distributors -version: 0.5.0.0 +version: 0.6.0.0 github: "morphismtech/distributors" license: BSD-3-Clause author: "Eitan Chatav" diff --git a/test/Main.hs b/test/Main.hs index 02d4dbbf..f73df07f 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -162,8 +162,8 @@ testCfg isLL1 examples grammar = do it ("should match " <> expectedString <> " correctly") $ do let actualMatch = expectedString =~ regbnfG grammar actualMatch `shouldBe` True - describe "generated languageG" $ do - it "should parses with exactly one full parse" $ do + describe "transducerG" $ do + it "should generate the hundred shorted valid words in a language" $ do generated <- generate (take 100 <$> languageSample (transducerG grammar)) for_ generated $ \word -> do let fullParses = [() | (_, "") <- parseG grammar word] From c5813b5cecc1fea9a8f677052fd28d6f72d7c7d0 Mon Sep 17 00:00:00 2001 From: Eitan Chatav Date: Fri, 24 Apr 2026 18:38:15 -0700 Subject: [PATCH 31/31] Update Machine.hs --- src/Control/Lens/Grammar/Machine.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Control/Lens/Grammar/Machine.hs b/src/Control/Lens/Grammar/Machine.hs index 369af27e..4f778b95 100644 --- a/src/Control/Lens/Grammar/Machine.hs +++ b/src/Control/Lens/Grammar/Machine.hs @@ -77,6 +77,7 @@ data Transducer token = Transducer , transducerRules :: Map String (IntSet, Bool) -- ^ an index into `transducerRelations` for nonterminals with precomputed nullability , transducerStarts :: IntSet + -- ^ an index into `transducerRelations` for the starting rule } -- | A `TransducerStep` in a `Transducer`.