Skip to content

Commit e7d9767

Browse files
committed
Add Boyer-Moore-Horspool string-search algorithm
1 parent 791deb4 commit e7d9767

2 files changed

Lines changed: 132 additions & 0 deletions

File tree

DIRECTORY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1327,6 +1327,7 @@
13271327
* [Autocomplete Using Trie](strings/autocomplete_using_trie.py)
13281328
* [Barcode Validator](strings/barcode_validator.py)
13291329
* [Bitap String Match](strings/bitap_string_match.py)
1330+
* [Boyer Moore Horspool](strings/boyer_moore_horspool.py)
13301331
* [Boyer Moore Search](strings/boyer_moore_search.py)
13311332
* [Camel Case To Snake Case](strings/camel_case_to_snake_case.py)
13321333
* [Can String Be Rearranged As Palindrome](strings/can_string_be_rearranged_as_palindrome.py)

strings/boyer_moore_horspool.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
"""
2+
Boyer-Moore-Horspool string-search algorithm.
3+
4+
A simplification of the Boyer-Moore algorithm that keeps only the
5+
bad-character shift table (Horspool's variant). It still runs in
6+
sub-linear time on average (roughly O(n / m) for random text) while
7+
worst case is O(n * m). Memory is O(sigma) where sigma is the size
8+
of the alphabet that appears in the pattern.
9+
10+
Reference: https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore%E2%80%93Horspool_algorithm
11+
"""
12+
13+
from __future__ import annotations
14+
15+
16+
def _build_shift_table(pattern: str) -> dict[str, int]:
17+
"""
18+
Build the bad-character shift table for ``pattern``.
19+
20+
For every character in the pattern except the last one, the table
21+
stores the distance from that character to the end of the pattern.
22+
Characters that do not appear in the pattern fall back to ``len(pattern)``
23+
at lookup time.
24+
25+
>>> _build_shift_table("abcab")
26+
{'a': 1, 'b': 3, 'c': 2}
27+
>>> _build_shift_table("a")
28+
{}
29+
>>> _build_shift_table("")
30+
{}
31+
>>> _build_shift_table("aaaa")
32+
{'a': 1}
33+
"""
34+
pattern_length = len(pattern)
35+
table: dict[str, int] = {}
36+
for index in range(pattern_length - 1):
37+
table[pattern[index]] = pattern_length - 1 - index
38+
return table
39+
40+
41+
def boyer_moore_horspool_search(text: str, pattern: str) -> int:
    """
    Locate the first occurrence of ``pattern`` inside ``text``.

    Returns the starting index of the leftmost match, or ``-1`` when
    ``pattern`` does not occur. Following :py:meth:`str.find`, an empty
    pattern is considered to match at index ``0``.

    >>> boyer_moore_horspool_search("ABAAABCD", "ABC")
    4
    >>> boyer_moore_horspool_search("hello world", "world")
    6
    >>> boyer_moore_horspool_search("hello world", "Python")
    -1
    >>> boyer_moore_horspool_search("aaaaa", "aa")
    0
    >>> boyer_moore_horspool_search("anything", "")
    0
    >>> boyer_moore_horspool_search("", "x")
    -1
    >>> sample = "the quick brown fox jumps over the lazy dog"
    >>> boyer_moore_horspool_search(sample, "fox") == sample.find("fox")
    True
    >>> boyer_moore_horspool_search(sample, "cat") == sample.find("cat")
    True
    """
    needle_size, haystack_size = len(pattern), len(text)
    if not needle_size:
        return 0
    if haystack_size < needle_size:
        return -1

    shifts = _build_shift_table(pattern)
    final_start = haystack_size - needle_size  # last window that still fits
    start = 0
    while start <= final_start:
        # Compare the window against the pattern from right to left.
        for offset in range(needle_size - 1, -1, -1):
            if text[start + offset] != pattern[offset]:
                break
        else:
            return start
        # Advance by the shift of the text character aligned with the
        # pattern's last position; unknown characters skip the whole pattern.
        start += shifts.get(text[start + needle_size - 1], needle_size)
    return -1
84+
85+
86+
def boyer_moore_horspool_search_all(text: str, pattern: str) -> list[int]:
    """
    Return every starting index where ``pattern`` occurs in ``text``.

    Overlapping matches are reported (e.g. ``"aaa"`` contains ``"aa"``
    at indices ``0`` and ``1``). An empty pattern matches at every
    position from ``0`` to ``len(text)`` inclusive, mirroring
    :py:meth:`str.find` and :py:func:`re.finditer` conventions.

    The window is advanced by the bad-character shift after every
    comparison, including successful ones. The shift only depends on the
    text character aligned with the pattern's last position, so it can
    never jump over a valid (possibly overlapping) match, and it keeps the
    skipping behaviour of the algorithm on texts that contain many matches.

    >>> boyer_moore_horspool_search_all("ababcabab", "ab")
    [0, 2, 5, 7]
    >>> boyer_moore_horspool_search_all("aaaa", "aa")
    [0, 1, 2]
    >>> boyer_moore_horspool_search_all("abcabcabc", "abc")
    [0, 3, 6]
    >>> boyer_moore_horspool_search_all("abababa", "aba")
    [0, 2, 4]
    >>> boyer_moore_horspool_search_all("abcdef", "gh")
    []
    >>> boyer_moore_horspool_search_all("abc", "")
    [0, 1, 2, 3]
    >>> boyer_moore_horspool_search_all("", "abc")
    []
    """
    pattern_length = len(pattern)
    text_length = len(text)
    if pattern_length == 0:
        return list(range(text_length + 1))
    if pattern_length > text_length:
        return []

    shift_table = _build_shift_table(pattern)
    last_index = pattern_length - 1
    matches: list[int] = []
    skip = 0
    while text_length - skip >= pattern_length:
        # Compare the current window against the pattern from right to left.
        index = last_index
        while index >= 0 and pattern[index] == text[skip + index]:
            index -= 1
        if index < 0:
            matches.append(skip)
        # Any smaller shift would align the window's last text character
        # with a pattern position that holds a different character, so this
        # jump is safe whether or not the window matched.
        skip += shift_table.get(text[skip + last_index], pattern_length)
    return matches
126+
127+
128+
if __name__ == "__main__":
    # Executing the module directly runs the doctests in its docstrings.
    from doctest import testmod

    testmod()

0 commit comments

Comments
 (0)