From db25c26238862bd74be2755a9bf8ede934051bc0 Mon Sep 17 00:00:00 2001 From: Martin Di Paola Date: Fri, 20 Mar 2026 13:09:13 -0300 Subject: [PATCH 1/7] build: hide build/ --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 591d6c5..7ce35ac 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ prof-traces test/ds/good.args test/autocomplete_byexample.sh .check-secrets-impl.sh +build/ From 47cb9843dd5c0e9b377f50012b41e185ab8800cd Mon Sep 17 00:00:00 2001 From: Martin Di Paola Date: Fri, 20 Mar 2026 13:32:37 -0300 Subject: [PATCH 2/7] feat: impl #275 ignoring empty lines at begin of got string --- byexample/parser.py | 14 ++++----- byexample/parser_sm.py | 71 +++++++++++++++++++++++++++++++----------- 2 files changed, 59 insertions(+), 26 deletions(-) diff --git a/byexample/parser.py b/byexample/parser.py index 36dcba0..caa8e31 100644 --- a/byexample/parser.py +++ b/byexample/parser.py @@ -262,8 +262,8 @@ def expected_as_regexs( We return the regexs - >>> regexs - ('\\A', 'a', '(?P.*?)', 'b', '(?P.*?)', 'c', '\\n*\\Z') + >>> regexs # byexample: +norm-ws + ('\\A(?:[ \\t]*\\n)*?', 'a', '(?P.*?)', 'b', '(?P.*?)', 'c', '\\n*\\Z') >>> m = re.compile(''.join(regexs), re.MULTILINE | re.DOTALL) >>> m.match('axxbyyyc').groups() @@ -300,7 +300,7 @@ def expected_as_regexs( >>> regexs, _, _, tags_by_idx, _ = _as_regexs(expected, normalize_whitespace=True) >>> regexs # byexample: +norm-ws - ('\\A', 'a', '(?:.*?)(?.*?)', 'c', '\\s*\\Z') + ('\\A\\s*?', 'a', '(?:.*?)(?.*?)', 'c', '\\s*\\Z') >>> tags_by_idx {2: None, 4: 'foo-bar'} @@ -315,7 +315,7 @@ def expected_as_regexs( >>> regexs, _, _, tags_by_idx, _ = _as_regexs(expected) >>> regexs - ('\\A', 'abc', '\\n*\\Z') + ('\\A(?:[ \\t]*\\n)*?', 'abc', '\\n*\\Z') >>> tags_by_idx {} @@ -324,7 +324,7 @@ def expected_as_regexs( >>> regexs, _, _, tags_by_idx, _ = _as_regexs(expected) >>> regexs - ('\\A', 'a', '(?:.*?)', 'bc', '\\n*\\Z') + ('\\A(?:[ \\t]*\\n)*?', 'a', '(?:.*?)', 'bc', '\\n*\\Z') 
>>> tags_by_idx {2: None} @@ -433,7 +433,7 @@ def _extend_parser_and_parse_options_strictly_and_cache(self, optlist): >>> regexs, _, _, _, _ = _as_regexs(expected, normalize_whitespace=True) >>> regexs -('\\A', +('\\A\\s*?', 'ex', '\\s', '(?:\\s*(?!\\s)(?:.+)(?>> regexs, _, _, _, _ = _as_regexs(expected, normalize_whitespace=True) >>> regexs -('\\A', +('\\A\\s*?', 'ex', '\\s', '(?:\\s*(?!\\s)(?P.+?)(?>> r, p, c, _, _ = _as_regexs('a \n b \t\vc') >>> r - ('\\A', 'a', '\\s+(?!\\s)', 'b', '\\s+(?!\\s)', 'c', '\\s*\\Z') + ('\\A\\s*?', 'a', '\\s+(?!\\s)', 'b', '\\s+(?!\\s)', 'c', '\\s*\\Z') >>> match(r, 'a b c') is not None True @@ -845,7 +866,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs - ('\\A', 'a', '(?P.*?)', 'b', '\\s*\\Z') + ('\\A\\s*?', 'a', '(?P.*?)', 'b', '\\s*\\Z') >>> p (0, 0, 1, 6, 7) @@ -861,7 +882,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', 'a', '\\s+(?!\\s)', '(?P.*?)', 'b', '\\s*\\Z') + ('\\A\\s*?', 'a', '\\s+(?!\\s)', '(?P.*?)', 'b', '\\s*\\Z') >>> p (0, 0, 1, 2, 7, 8) @@ -873,7 +894,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', 'a', '(?P.*?)(?.*?)(?>> p (0, 0, 1, 6, 7, 8) @@ -888,7 +909,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: +norm-ws -tags - ('\\A', 'a', '\\s', '(?:\\s*(?!\\s)(?P.+?)(?.+?)(?>> p (0, 0, 1, 2, 7, 8, 9) @@ -918,7 +939,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '(?P.*?)(?.*?)(?>> p (0, 0, 5) @@ -930,7 +951,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '\\s', 
'(?:\\s*(?!\\s)(?P.+?)(?.+?)(?>> p (0, 0, 1, 6) @@ -942,7 +963,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '\\s', '(?:\\s*(?!\\s)(?P.+?)(?.+?)(?>> p (0, 0, 1, 6) @@ -954,7 +975,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '(?P.*?)(?.*?)(?>> p (0, 0, 5) @@ -966,7 +987,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '\\s*\\Z') + ('\\A\\s*?', '\\s*\\Z') >>> p (0, 0) @@ -975,7 +996,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, p, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '\\s*\\Z') + ('\\A\\s*?', '\\s*\\Z') >>> p (0, 0) @@ -989,7 +1010,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, charnos, rcounts, _, input_list = _as_regexs(expected) >>> regexs # byexample: +norm-ws - ('\\A', 'username', '\\s+(?!\\s)', '\\[john\\]', '\\s+(?!\\s)', + ('\\A\\s*?', 'username', '\\s+(?!\\s)', '\\[john\\]', '\\s+(?!\\s)', 'pass', '\\s+(?!\\s)', '\\[admin\\]', '\\s+(?!\\s)', 'comment', '\\s+(?!\\s)', '\\[', '\\s+(?!\\s)', 'none', '\\s+(?!\\s)', '\\]', '\\s*\\Z') @@ -1021,6 +1042,18 @@ def __init__( def trailing_newlines_regex(self): return re.compile(r'\n*\Z', re.MULTILINE | re.DOTALL) + def _begin_of_string_regex(self): + r''' + Skip any leading empty or whitespace-only lines in the got. + + A non-greedy *? is used to avoid consuming lines that the + expected regex (e.g. a tag) may need to match itself. + This is safe and non-pathological: each iteration of the + group consumes at least one \n, so the total work is linear + in the number of leading blank lines. + ''' + return r'\A(?:[ \t]*\n)*?' 
+ def emit_tag(self, ctx, endline): assert ctx in ('n', '0') return SM.emit_tag(self, ctx, endline) @@ -1105,7 +1138,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, charnos, rcounts, tags_by_idx, input_list = _as_regexs(expected) >>> regexs # byexample: -tags +norm-ws - ('\\A', 'a', '(?P.*?)', 'b', '(?P.*?)', 'c', '(?:.*?)', 'd', '\\n*\\Z') + ('\\A(?:[ \\t]*\\n)*?', 'a', '(?P.*?)', 'b', '(?P.*?)', 'c', '(?:.*?)', 'd', '\\n*\\Z') >>> match(regexs, 'axxbyyyczzd').groups() ('xx', 'yyy') @@ -1152,7 +1185,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, _, rcounts, _, _ = _as_regexs(expected) >>> regexs # byexample: +norm-ws -tags - ('\\A', + ('\\A(?:[ \\t]*\\n)*?', 'a', '\\\n', '(?P.*?)', @@ -1200,7 +1233,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, _, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '(?:(?P.+?)(?.+?)(?>> match(regexs, ' 123 \n\n\n\n').groups() (' 123 ',) @@ -1209,7 +1242,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, _, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '(?:(?P.+?)(?.+?)(?>> match(regexs, '123\n\n\n\n').groups() ('123',) @@ -1218,7 +1251,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, _, _, _, _ = _as_regexs(expected) >>> regexs # byexample: -tags - ('\\A', '\\\n', '(?:(?P.+?)(?.+?)(?>> match(regexs, '\n123\n\n\n\n').groups() ('123',) @@ -1235,7 +1268,7 @@ def parse(self, expected, tags_enabled, input_enabled): >>> regexs, charnos, rcounts, _, input_list = _as_regexs(expected) >>> regexs # byexample: +norm-ws - ('\\A', 'username', '\\ ', '\\[john\\]', '\\\n', + ('\\A(?:[ \\t]*\\n)*?', 'username', '\\ ', '\\[john\\]', '\\\n', 'pass', '\\ ', '\\[admin\\]', '\\ \\ ', '\\\n', 'comment', '\\ ', '\\[', '\\ ', 'none', '\\ ', '\\]', '\\n*\\Z') From 0d74009cde5bd2db81efd4c9e108b42e8c096c02 Mon Sep 17 00:00:00 2001 From: Martin Di Paola Date: Fri, 20 Mar 2026 13:32:56 -0300 
Subject: [PATCH 3/7] docs: doc+test about empty lines being ignored --- docs/basic/normalize-whitespace.md | 76 ++++++++++++++++++++++++++++++ test/bad-empty-line.md | 8 ++++ 2 files changed, 84 insertions(+) create mode 100644 test/bad-empty-line.md diff --git a/docs/basic/normalize-whitespace.md b/docs/basic/normalize-whitespace.md index d512dfc..50d7527 100644 --- a/docs/basic/normalize-whitespace.md +++ b/docs/basic/normalize-whitespace.md @@ -1,3 +1,11 @@ + # Normalize Whitespace Replace any sequence of whitespace by a single one. @@ -35,3 +43,71 @@ Here is another example, this time written in ``Ruby``: 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] ``` +## Empty lines at the begin are ignored by default + + +Consider the following `"\n \nSome line"` output. The following three +examples match because by default `byexample` discards any empty line +at the beginning of the output. + +```python +>>> someline = "\n \nSome line" + +>>> print(someline) # OK: <...> captures the empty lines +<...> +Some line + +>>> print(someline) # OK too: the same reason above +<...>Some line + +>>> print(someline) # OK: byexample ignores the empty lines "as if" a <...> was there +Some line +``` + +`byexample` understands as "empty lines" lines made entirely of spaces +and tabs ended with a new line. It is subtle but such a definition does +not include indentation. 
+ +Consider the following `"\n \n Some indented line"`: + +```python +>>> someindented = "\n \n Some indented line" + +>>> print(someindented) # FAIL: the example is not expecting indentation # byexample: +pass +<...> +Some indented line + +>>> print(someindented) # OK: <...> captures all including the indentation +<...>Some indented line + +>>> print(someindented) # FAIL: byexample ignores the empty lines but not the indentation # byexample: +pass +Some indented line +``` + +When `+norm-ws` is enabled, those two `FAIL` examples will work because +`byexample` relaxes the definition of empty lines and replaces it with +"any whitespace", so the indentation gets included: + +```python +>>> print(someindented) # byexample: +norm-ws +<...> +Some indented line + +>>> print(someindented) # byexample: +norm-ws +Some indented line +``` + + diff --git a/test/bad-empty-line.md b/test/bad-empty-line.md new file mode 100644 index 0000000..6f1efd3 --- /dev/null +++ b/test/bad-empty-line.md @@ -0,0 +1,8 @@ +```python +>>> print("\n \n Some line") # should fail (missing indentation) +<...> +Some line + +>>> print("\n \n Some line") # should fail (missing indentation) +Some line +``` From 20b7922693070141d14ff81f8af268732a283ff7 Mon Sep 17 00:00:00 2001 From: Martin Di Paola Date: Fri, 20 Mar 2026 22:27:43 -0300 Subject: [PATCH 4/7] fix: fix a quadratic case --- byexample/parser_sm.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/byexample/parser_sm.py b/byexample/parser_sm.py index 342a0ec..2de7727 100644 --- a/byexample/parser_sm.py +++ b/byexample/parser_sm.py @@ -689,6 +689,11 @@ def _begin_of_string_regex(self): spaces and newlines can be freely skipped. Non-greedy *? is used so that the next part of the regex (e.g. a tag's \s+ lookahead) can still match the whitespace. + + Note: when the first content token is itself a whitespace transition + (\s+(?!\s)), emit_ws() folds \A\s*? 
+ \s+(?!\s) into the single + equivalent \A\s+(?!\s) to avoid a quadratic interaction between the + two overlapping whitespace quantifiers. ''' return r'\A\s*?' @@ -698,6 +703,15 @@ def emit_ws(self, just_one=False): rx = r'\s' else: rx = r'\s+(?!\s)' + # \A\s*? + \s+(?!\s) is quadratic: both parts match whitespace + # and the engine explores O(n^2) splits on whitespace-only strings. + # \A\s*?\s+(?!\s) is semantically equivalent to \A\s+(?!\s) + # (both match one-or-more whitespace anchored at start, stopping + # before non-whitespace), so fold them when this is the first + # content emit (only the \A\s*? anchor is in results so far). + if len(self.results) == 1: + self.results.pop() + rx = r'\A' + rx rc = 1 self.record_input_event(charno, 'prefix', ' ', rx, rc) @@ -983,6 +997,32 @@ def parse(self, expected, tags_enabled, input_enabled): >>> match(regexs, ' 123 \n\n\n\n').groups() (' 123',) + When the expected starts with whitespace, \A\s*? + \s+(?!\s) would + be quadratic on whitespace-only got strings. They are folded into + the single equivalent \A\s+(?!\s) which is linear (greedy, no + overlap). + + >>> expected = ' foo' + >>> regexs, p, _, _, _ = _as_regexs(expected) + + >>> regexs # byexample: -tags + ('\\A\\s+(?!\\s)', 'foo', '\\s*\\Z') + + >>> p + (0, 2, 5) + + This still skips any extra leading whitespace in the got, just + like \A\s*?\s+(?!\s) would, because \s+ is greedy from \A. 
+ + >>> match(regexs, '\n\n foo').groups() + () + + >>> match(regexs, ' foo').groups() + () + + >>> match(regexs, 'foo') is None + True + >>> expected = ' ' >>> regexs, p, _, _, _ = _as_regexs(expected) From f46a3ca47cb9da651b084c012afd956924dc1f62 Mon Sep 17 00:00:00 2001 From: Martin Di Paola Date: Fri, 20 Mar 2026 23:52:53 -0300 Subject: [PATCH 5/7] feat: impl +ignore-first-empty-lines (default true) --- byexample/example.py | 1 + byexample/expected.py | 4 +-- byexample/finder.py | 3 +- byexample/init.py | 6 ++++ byexample/parser.py | 30 ++++++++++++++--- byexample/parser_sm.py | 73 +++++++++++++++++++++++++++++------------- 6 files changed, 87 insertions(+), 30 deletions(-) diff --git a/byexample/example.py b/byexample/example.py index 39494a2..5dc32c8 100644 --- a/byexample/example.py +++ b/byexample/example.py @@ -82,6 +82,7 @@ class Example(object): >>> example.options {'capture': True, + 'ignore_first_empty_lines': True, 'input_prefix_range': (6, 12), 'norm_ws': False, 'rm': [], diff --git a/byexample/expected.py b/byexample/expected.py index 5544d36..161093f 100644 --- a/byexample/expected.py +++ b/byexample/expected.py @@ -32,7 +32,7 @@ class _LinearExpected(Expected): >>> from byexample.options import Options >>> from byexample.finder import _build_fake_example as build_example - >>> opts = {'norm_ws': False, 'tags': True, 'capture': True, 'rm': [], 'type': False, 'input_prefix_range': (6,12)} + >>> opts = {'norm_ws': False, 'tags': True, 'capture': True, 'rm': [], 'type': False, 'input_prefix_range': (6,12), 'ignore_first_empty_lines': True} Consider the following example with a named capture in the expected: @@ -145,7 +145,7 @@ class _LinearExpected(Expected): (See byexample.parser docs) - >>> opts = {'norm_ws': True, 'tags': True, 'capture': True, 'rm': [], 'type': False, 'input_prefix_range': (6, 12)} + >>> opts = {'norm_ws': True, 'tags': True, 'capture': True, 'rm': [], 'type': False, 'input_prefix_range': (6, 12), 'ignore_first_empty_lines': True} 
>>> ex = build_example('f()', '\n A \n\nB C\n', opts=opts) >>> exp = ex.expected diff --git a/byexample/finder.py b/byexample/finder.py index 33b2c41..89469e1 100644 --- a/byexample/finder.py +++ b/byexample/finder.py @@ -51,7 +51,8 @@ class F: 'capture': True, 'rm': [], 'type': False, - 'input_prefix_range': (6, 12) + 'input_prefix_range': (6, 12), + 'ignore_first_empty_lines': True, } ) parser.extract_options = lambda x: opts diff --git a/byexample/init.py b/byexample/init.py index bb5cc13..e3dd679 100644 --- a/byexample/init.py +++ b/byexample/init.py @@ -386,6 +386,12 @@ def get_default_options_parser(cmdline_args): options_parser.add_flag( "norm-ws", default=False, help="ignore the amount of whitespaces." ) + options_parser.add_flag( + "ignore-first-empty-lines", + default=True, + help= + "ignore any empty or whitespace-only lines at the begin of the got string." + ) options_parser.add_flag( "pass", default=False, diff --git a/byexample/parser.py b/byexample/parser.py index caa8e31..b57e16e 100644 --- a/byexample/parser.py +++ b/byexample/parser.py @@ -193,7 +193,8 @@ def parse(self, example, concerns): input_prefix_len_range = options['input_prefix_range'] expected_regexs, charnos, rcounts, tags_by_idx, input_list = self.expected_as_regexs( example.expected_str, options['tags'], options['capture'], - options['type'], options['norm_ws'], input_prefix_len_range + options['type'], options['norm_ws'], input_prefix_len_range, + options['ignore_first_empty_lines'] ) ExpectedClass = _LinearExpected @@ -230,8 +231,14 @@ def parse(self, example, concerns): @profile def expected_as_regexs( - self, expected, tags_enabled, capture_enabled, input_enabled, - normalize_whitespace, input_prefix_len_range + self, + expected, + tags_enabled, + capture_enabled, + input_enabled, + normalize_whitespace, + input_prefix_len_range, + ignore_first_empty_lines=True ): r''' From the expected string create a list of regular expressions that @@ -329,6 +336,19 @@ def expected_as_regexs( 
>>> tags_by_idx {2: None} + When ignore_first_empty_lines is False the begin anchor is a plain \\A, + so the got string must start exactly at the first expected character. + + >>> regexs, _, _, _, _ = _as_regexs('foo', ignore_first_empty_lines=False) + + >>> regexs + ('\\A', 'foo', '\\n*\\Z') + + >>> regexs, _, _, _, _ = _as_regexs('foo', normalize_whitespace=True, ignore_first_empty_lines=False) + + >>> regexs + ('\\A', 'foo', '\\s*\\Z') + ''' if capture_enabled: tag_regexs = self.tag_regexs() @@ -338,12 +358,12 @@ def expected_as_regexs( if normalize_whitespace: sm = SM_NormWS( tag_regexs, self.input_regexs(), self.ellipsis_marker(), - input_prefix_len_range + input_prefix_len_range, ignore_first_empty_lines ) else: sm = SM_NotNormWS( tag_regexs, self.input_regexs(), self.ellipsis_marker(), - input_prefix_len_range + input_prefix_len_range, ignore_first_empty_lines ) return sm.parse(expected, tags_enabled, input_enabled) diff --git a/byexample/parser_sm.py b/byexample/parser_sm.py index 2de7727..ccb64e9 100644 --- a/byexample/parser_sm.py +++ b/byexample/parser_sm.py @@ -37,7 +37,12 @@ class SM(object): def __init__( - self, tag_regexs, input_regexs, ellipsis_marker, input_prefix_len_range + self, + tag_regexs, + input_regexs, + ellipsis_marker, + input_prefix_len_range, + ignore_first_empty_lines=True ): self.tag_regex = tag_regexs.for_capture self.tag_split_regex = tag_regexs.for_split @@ -49,6 +54,8 @@ def __init__( self.input_prefix_min_len, self.input_prefix_max_len = input_prefix_len_range assert self.input_prefix_min_len <= self.input_prefix_max_len + self.ignore_first_empty_lines = ignore_first_empty_lines + self.reset() def reset(self): @@ -670,11 +677,16 @@ def build_prefix(self, partial_prefixes): class SM_NormWS(SM): def __init__( - self, tag_regexs, input_regexs, ellipsis_marker, input_prefix_len_range + self, + tag_regexs, + input_regexs, + ellipsis_marker, + input_prefix_len_range, + ignore_first_empty_lines=True ): SM.__init__( self, tag_regexs, 
input_regexs, ellipsis_marker, - input_prefix_len_range + input_prefix_len_range, ignore_first_empty_lines ) @constant @@ -683,19 +695,25 @@ def trailing_whitespace_regex(self): def _begin_of_string_regex(self): r''' - Skip any leading whitespace (including empty lines) in the got. + If ignore_first_empty_lines is True (the default), skip any leading + whitespace (including empty lines) in the got before matching content. + + In norm-ws mode all whitespace is equivalent, so leading spaces and + newlines can be freely skipped. Non-greedy *? is used so that the + next part of the regex (e.g. a tag's \s+ lookahead) can still match + the whitespace. - In norm-ws mode all whitespace is equivalent, so leading - spaces and newlines can be freely skipped. - Non-greedy *? is used so that the next part of the regex - (e.g. a tag's \s+ lookahead) can still match the whitespace. + If ignore_first_empty_lines is False, use a plain \A anchor so that + the got string must start exactly where the expected content begins. Note: when the first content token is itself a whitespace transition - (\s+(?!\s)), emit_ws() folds \A\s*? + \s+(?!\s) into the single - equivalent \A\s+(?!\s) to avoid a quadratic interaction between the + (\s+(?!\s)), emit_ws() folds the begin anchor + \s+(?!\s) into the + single equivalent \A\s+(?!\s) to avoid a quadratic interaction between two overlapping whitespace quantifiers. ''' - return r'\A\s*?' + if self.ignore_first_empty_lines: + return r'\A\s*?' + return r'\A' def emit_ws(self, just_one=False): charno, _ = self.pull() @@ -703,13 +721,14 @@ def emit_ws(self, just_one=False): rx = r'\s' else: rx = r'\s+(?!\s)' - # \A\s*? + \s+(?!\s) is quadratic: both parts match whitespace + # If self.ignore_first_empty_lines is True, + # the \A\s*? + \s+(?!\s) is quadratic: both parts match whitespace # and the engine explores O(n^2) splits on whitespace-only strings. 
# \A\s*?\s+(?!\s) is semantically equivalent to \A\s+(?!\s) # (both match one-or-more whitespace anchored at start, stopping # before non-whitespace), so fold them when this is the first # content emit (only the \A\s*? anchor is in results so far). - if len(self.results) == 1: + if len(self.results) == 1 and self.ignore_first_empty_lines: self.results.pop() rx = r'\A' + rx rc = 1 @@ -1071,11 +1090,16 @@ def parse(self, expected, tags_enabled, input_enabled): class SM_NotNormWS(SM): def __init__( - self, tag_regexs, input_regexs, ellipsis_marker, input_prefix_len_range + self, + tag_regexs, + input_regexs, + ellipsis_marker, + input_prefix_len_range, + ignore_first_empty_lines=True ): SM.__init__( self, tag_regexs, input_regexs, ellipsis_marker, - input_prefix_len_range + input_prefix_len_range, ignore_first_empty_lines ) @constant @@ -1084,15 +1108,20 @@ def trailing_newlines_regex(self): def _begin_of_string_regex(self): r''' - Skip any leading empty or whitespace-only lines in the got. + If ignore_first_empty_lines is True (the default), skip any leading + empty or whitespace-only lines in the got before matching content. - A non-greedy *? is used to avoid consuming lines that the - expected regex (e.g. a tag) may need to match itself. - This is safe and non-pathological: each iteration of the - group consumes at least one \n, so the total work is linear - in the number of leading blank lines. + A non-greedy *? is used to avoid consuming lines that the expected + regex (e.g. a tag) may need to match itself. This is safe and + non-pathological: each iteration of the group consumes at least one + \n, so the total work is linear in the number of leading blank lines. + + If ignore_first_empty_lines is False, use a plain \A anchor so that + the got string must start exactly where the expected content begins. ''' - return r'\A(?:[ \t]*\n)*?' + if self.ignore_first_empty_lines: + return r'\A(?:[ \t]*\n)*?' 
+ return r'\A' def emit_tag(self, ctx, endline): assert ctx in ('n', '0') From 6013ca0f16948b2962cabefa0b4f39ddfc695949 Mon Sep 17 00:00:00 2001 From: Martin Di Paola Date: Fri, 20 Mar 2026 23:56:34 -0300 Subject: [PATCH 6/7] fix: broken test --- docs/contrib/how-to-support-new-finders-and-languages.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contrib/how-to-support-new-finders-and-languages.md b/docs/contrib/how-to-support-new-finders-and-languages.md index ceab692..ea23a1a 100644 --- a/docs/contrib/how-to-support-new-finders-and-languages.md +++ b/docs/contrib/how-to-support-new-finders-and-languages.md @@ -281,7 +281,7 @@ the scenes so you do not to be worry about the details): ```python >>> from byexample.options import Options, OptionParser ->>> parser = ArnoldCParser(cfg=Config(verbosity=0, encoding='utf-8', options=Options(rm=[], norm_ws=False, tags=True, capture=True, type=False, input_prefix_range=(6,12), optparser=OptionParser(add_help=False)))) +>>> parser = ArnoldCParser(cfg=Config(verbosity=0, encoding='utf-8', options=Options(rm=[], norm_ws=False, tags=True, capture=True, type=False, input_prefix_range=(6,12), ignore_first_empty_lines=True, optparser=OptionParser(add_help=False)))) >>> from byexample.finder import Example >>> runner = None # not yet From 7cf3ff82b7817b44453c20dd456be1f1d51300fc Mon Sep 17 00:00:00 2001 From: Martin Di Paola Date: Fri, 20 Mar 2026 23:56:50 -0300 Subject: [PATCH 7/7] docs: doc -ignore-first-empty-lines option (+test) --- docs/basic/normalize-whitespace.md | 8 +++++++- test/bad-empty-line.md | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/basic/normalize-whitespace.md b/docs/basic/normalize-whitespace.md index 50d7527..d888fc8 100644 --- a/docs/basic/normalize-whitespace.md +++ b/docs/basic/normalize-whitespace.md @@ -97,6 +97,12 @@ Some indented line Some indented line ``` +> *New* in `byexample 11.0.0`: before `11.0.0` it was up to the user to +> put a <...> or 
similar to ignore the empty lines at the begin (or use +> `+rm=~` combined with `+norm-ws`). +> Since `11.0.0` this is the default. If you want the old behavior you +> can use the flag `-ignore-first-empty-lines` + diff --git a/test/bad-empty-line.md b/test/bad-empty-line.md index 6f1efd3..4eb9f14 100644 --- a/test/bad-empty-line.md +++ b/test/bad-empty-line.md @@ -5,4 +5,7 @@ Some line >>> print("\n \n Some line") # should fail (missing indentation) Some line + +>>> print("\n \nSome line") # should fail because we are using pre-11.0.0 behavior # byexample: -ignore-first-empty-lines +Some line ```