From ce35b95f256f479d888338fe066e5a3196950194 Mon Sep 17 00:00:00 2001 From: Sam Leonard Date: Thu, 12 Mar 2026 20:23:17 +0000 Subject: [PATCH] Switch to ahocorasick-rs pyahocorasick v1.4.0 doesn't build with modern compilers which prevents stringcheese from being run; depending on ahocorasick-rs from PyPI allows it to build again. It's also a bit faster :) Note: this bumps the version to 2.1 as well --- requirements.txt | 1 + setup.py | 83 +- stringcheese/pyahocorasick-1.4.0/.gitignore | 22 - stringcheese/pyahocorasick-1.4.0/.travis.yml | 15 - stringcheese/pyahocorasick-1.4.0/Automaton.c | 1298 -------------- stringcheese/pyahocorasick-1.4.0/Automaton.h | 163 -- .../pyahocorasick-1.4.0/AutomatonItemsIter.c | 332 ---- .../pyahocorasick-1.4.0/AutomatonItemsIter.h | 67 - .../pyahocorasick-1.4.0/AutomatonSearchIter.c | 424 ----- .../pyahocorasick-1.4.0/AutomatonSearchIter.h | 55 - .../AutomatonSearchIterLong.c | 267 --- .../AutomatonSearchIterLong.h | 43 - .../pyahocorasick-1.4.0/Automaton_pickle.c | 490 ----- .../pyahocorasick-1.4.0/CHANGELOG.rst | 155 -- stringcheese/pyahocorasick-1.4.0/LICENSE | 33 - stringcheese/pyahocorasick-1.4.0/MANIFEST.in | 20 - stringcheese/pyahocorasick-1.4.0/Makefile | 67 - stringcheese/pyahocorasick-1.4.0/README.rst | 313 ---- stringcheese/pyahocorasick-1.4.0/allsources.c | 6 - stringcheese/pyahocorasick-1.4.0/appveyor.yml | 28 - .../benchmarks/benchmark.py | 117 -- .../benchmarks/benchmark3.py | 117 -- .../benchmarks/results/python2-westmere.txt | 9 - .../results/python3-broadwell-u.txt | 8 - stringcheese/pyahocorasick-1.4.0/common.h | 97 - stringcheese/pyahocorasick-1.4.0/cygwin.h | 17 - .../pyahocorasick-1.4.0/docs/.gitignore | 1 - .../pyahocorasick-1.4.0/docs/Makefile | 225 --- .../docs/automaton___reduce__.rst | 4 - .../docs/automaton___sizeof__.rst | 3 - .../docs/automaton_add_word.rst | 68 - .../docs/automaton_clear.rst | 23 - .../docs/automaton_constructor.rst | 36 - .../docs/automaton_dump.rst | 13 - 
.../docs/automaton_exists.rst | 22 - .../docs/automaton_find_all.rst | 15 - .../docs/automaton_get.rst | 27 - .../docs/automaton_get_stats.rst | 28 - .../docs/automaton_items.rst | 6 - .../docs/automaton_iter.rst | 17 - .../docs/automaton_iter_long.rst | 44 - .../docs/automaton_keys.rst | 20 - .../docs/automaton_len.rst | 22 - .../docs/automaton_longest_prefix.rst | 22 - .../docs/automaton_make_automaton.rst | 6 - .../docs/automaton_match.rst | 36 - .../docs/automaton_pop.rst | 29 - .../docs/automaton_remove_word.rst | 25 - .../docs/automaton_save.rst | 8 - .../docs/automaton_search_iter.rst | 3 - .../docs/automaton_search_iter_set.rst | 7 - .../docs/automaton_values.rst | 6 - stringcheese/pyahocorasick-1.4.0/docs/conf.py | 345 ---- .../pyahocorasick-1.4.0/docs/index.rst | 375 ---- .../pyahocorasick-1.4.0/docs/module.rst | 3 - .../pyahocorasick-1.4.0/docs/module_load.rst | 7 - stringcheese/pyahocorasick-1.4.0/dump2dot.py | 87 - .../pyahocorasick-1.4.0/msinttypes/inttypes.h | 306 ---- .../pyahocorasick-1.4.0/msinttypes/stdint.h | 259 --- stringcheese/pyahocorasick-1.4.0/posix.h | 17 - .../pyahocorasick-1.4.0/py/README.rst | 2 - .../pyahocorasick-1.4.0/py/exportdot.py | 81 - .../pyahocorasick-1.4.0/py/issue_21.py | 54 - .../pyahocorasick-1.4.0/py/pyahocorasick.py | 345 ---- .../pyahocorasick-1.4.0/py/unittests.py | 209 --- .../pyahocorasick-1.4.0/pyahocorasick.c | 137 -- .../regression/issue_10.py | 34 - .../regression/issue_19.py | 15 - .../regression/issue_26.py | 17 - .../pyahocorasick-1.4.0/regression/issue_5.py | 18 - .../regression/issue_50-part1.py | 10 - .../regression/issue_50-part2.py | 5 - .../regression/issue_53.py | 11 - .../regression/issue_56.py | 41 - .../pyahocorasick-1.4.0/regression/issue_8.py | 55 - .../pyahocorasick-1.4.0/regression/issue_9.py | 57 - stringcheese/pyahocorasick-1.4.0/setup.cfg | 7 - stringcheese/pyahocorasick-1.4.0/setup.py | 126 -- stringcheese/pyahocorasick-1.4.0/slist.c | 114 -- stringcheese/pyahocorasick-1.4.0/slist.h | 70 - 
.../src/custompickle/custompickle.c | 52 - .../src/custompickle/custompickle.h | 29 - .../src/custompickle/load/loadbuffer.c | 152 -- .../src/custompickle/load/loadbuffer.h | 43 - .../custompickle/load/module_automaton_load.c | 280 --- .../custompickle/load/module_automaton_load.h | 7 - .../src/custompickle/pyhelpers.c | 61 - .../src/custompickle/pyhelpers.h | 10 - .../src/custompickle/save/automaton_save.c | 138 -- .../src/custompickle/save/automaton_save.h | 7 - .../src/custompickle/save/savebuffer.c | 114 -- .../src/custompickle/save/savebuffer.h | 34 - .../pyahocorasick-1.4.0/src/inline_doc.h | 282 --- .../pyahocorasick-1.4.0/src/pickle/pickle.h | 9 - .../src/pickle/pickle_data.c | 126 -- .../src/pickle/pickle_data.h | 29 - .../src/pycallfault/pycallfault.c | 49 - .../src/pycallfault/pycallfault.h | 59 - .../pyahocorasick-1.4.0/stamp/.gitignore | 1 - stringcheese/pyahocorasick-1.4.0/test.py | 55 - .../tests/generate_random_words.py | 72 - .../tests/memdump_check.py | 83 - .../tests/memdump_maxalloc.py | 32 - .../tests/memdump_maxrealloc.py | 32 - .../tests/pickle_stresstest.py | 287 --- .../tests/pyfault_check.py | 43 - .../tests/removeword_stresstest.py | 183 -- .../tests/unittestlog_check.py | 53 - .../tests/valgrind_check.py | 72 - stringcheese/pyahocorasick-1.4.0/trie.c | 231 --- stringcheese/pyahocorasick-1.4.0/trie.h | 50 - stringcheese/pyahocorasick-1.4.0/trienode.c | 200 --- stringcheese/pyahocorasick-1.4.0/trienode.h | 89 - stringcheese/pyahocorasick-1.4.0/unittests.py | 1596 ----------------- .../pyahocorasick-1.4.0/unpickle_test.py | 457 ----- .../unresolved_bugs/.gitignore | 1 - .../unresolved_bugs/bug_81.py | 60 - .../pyahocorasick-1.4.0/update_inlinedoc.py | 151 -- stringcheese/pyahocorasick-1.4.0/utils.c | 409 ----- stringcheese/pyahocorasick-1.4.0/windows.bat | 43 - stringcheese/pyahocorasick-1.4.0/windows.h | 18 - stringcheese/stringcheese.py | 159 +- 122 files changed, 104 insertions(+), 13524 deletions(-) delete mode 100644 
stringcheese/pyahocorasick-1.4.0/.gitignore delete mode 100644 stringcheese/pyahocorasick-1.4.0/.travis.yml delete mode 100644 stringcheese/pyahocorasick-1.4.0/Automaton.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/Automaton.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/AutomatonItemsIter.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/AutomatonItemsIter.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/AutomatonSearchIter.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/AutomatonSearchIter.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/AutomatonSearchIterLong.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/AutomatonSearchIterLong.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/Automaton_pickle.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/CHANGELOG.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/LICENSE delete mode 100644 stringcheese/pyahocorasick-1.4.0/MANIFEST.in delete mode 100644 stringcheese/pyahocorasick-1.4.0/Makefile delete mode 100644 stringcheese/pyahocorasick-1.4.0/README.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/allsources.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/appveyor.yml delete mode 100644 stringcheese/pyahocorasick-1.4.0/benchmarks/benchmark.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/benchmarks/benchmark3.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/benchmarks/results/python2-westmere.txt delete mode 100644 stringcheese/pyahocorasick-1.4.0/benchmarks/results/python3-broadwell-u.txt delete mode 100644 stringcheese/pyahocorasick-1.4.0/common.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/cygwin.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/.gitignore delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/Makefile delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton___reduce__.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton___sizeof__.rst delete mode 100644 
stringcheese/pyahocorasick-1.4.0/docs/automaton_add_word.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_clear.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_constructor.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_dump.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_exists.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_find_all.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_get.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_get_stats.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_items.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_iter.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_iter_long.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_keys.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_len.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_longest_prefix.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_make_automaton.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_match.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_pop.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_remove_word.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_save.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_search_iter.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_search_iter_set.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/automaton_values.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/conf.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/index.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/module.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/docs/module_load.rst 
delete mode 100644 stringcheese/pyahocorasick-1.4.0/dump2dot.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/msinttypes/inttypes.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/msinttypes/stdint.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/posix.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/py/README.rst delete mode 100644 stringcheese/pyahocorasick-1.4.0/py/exportdot.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/py/issue_21.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/py/pyahocorasick.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/py/unittests.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/pyahocorasick.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/regression/issue_10.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/regression/issue_19.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/regression/issue_26.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/regression/issue_5.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/regression/issue_50-part1.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/regression/issue_50-part2.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/regression/issue_53.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/regression/issue_56.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/regression/issue_8.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/regression/issue_9.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/setup.cfg delete mode 100644 stringcheese/pyahocorasick-1.4.0/setup.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/slist.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/slist.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/custompickle/custompickle.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/custompickle/custompickle.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/custompickle/load/loadbuffer.c delete mode 100644 
stringcheese/pyahocorasick-1.4.0/src/custompickle/load/loadbuffer.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/custompickle/load/module_automaton_load.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/custompickle/load/module_automaton_load.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/custompickle/pyhelpers.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/custompickle/pyhelpers.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/custompickle/save/automaton_save.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/custompickle/save/automaton_save.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/custompickle/save/savebuffer.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/custompickle/save/savebuffer.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/inline_doc.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/pickle/pickle.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/pickle/pickle_data.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/pickle/pickle_data.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/pycallfault/pycallfault.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/src/pycallfault/pycallfault.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/stamp/.gitignore delete mode 100644 stringcheese/pyahocorasick-1.4.0/test.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/tests/generate_random_words.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/tests/memdump_check.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/tests/memdump_maxalloc.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/tests/memdump_maxrealloc.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/tests/pickle_stresstest.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/tests/pyfault_check.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/tests/removeword_stresstest.py delete mode 100644 
stringcheese/pyahocorasick-1.4.0/tests/unittestlog_check.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/tests/valgrind_check.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/trie.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/trie.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/trienode.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/trienode.h delete mode 100644 stringcheese/pyahocorasick-1.4.0/unittests.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/unpickle_test.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/unresolved_bugs/.gitignore delete mode 100644 stringcheese/pyahocorasick-1.4.0/unresolved_bugs/bug_81.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/update_inlinedoc.py delete mode 100644 stringcheese/pyahocorasick-1.4.0/utils.c delete mode 100644 stringcheese/pyahocorasick-1.4.0/windows.bat delete mode 100644 stringcheese/pyahocorasick-1.4.0/windows.h diff --git a/requirements.txt b/requirements.txt index 78620c4..39e3fee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ tqdm +ahocorasick-rs diff --git a/setup.py b/setup.py index 9d2ec84..2672d9b 100644 --- a/setup.py +++ b/setup.py @@ -11,79 +11,26 @@ from sys import version_info as python_version if python_version.major not in [3]: - raise ValueError('Python %s is not supported' % python_version) + raise ValueError("Python %s is not supported" % python_version) ## Reading long_description this_directory = path.abspath(path.dirname(__file__)) -with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: +with open(path.join(this_directory, "README.md"), encoding="utf-8") as f: long_description = f.read() -## Install pyahocorasick -pyahocorasick_dir = "./stringcheese/pyahocorasick-1.4.0/" - -module = setuptools.Extension( - 'stringcheese.ahocorasick', - sources=[ - pyahocorasick_dir+'pyahocorasick.c', - ], - define_macros=[], #[('AHOCORASICK_UNICODE', '')], - depends=[pyahocorasick_dir+d for d in [ - 'common.h', - 
'Automaton.c', - 'Automaton.h', - 'Automaton_pickle.c', - 'AutomatonItemsIter.c', - 'AutomatonItemsIter.h', - 'AutomatonSearchIter.c', - 'AutomatonSearchIter.h', - 'AutomatonSearchIterLong.c', - 'AutomatonSearchIterLong.h', - 'trie.c', - 'trie.h', - 'slist.c', - 'utils.c', - 'trienode.c', - 'trienode.h', - 'msinttypes/stdint.h', - 'src/inline_doc.h', - 'src/pickle/pickle.h', - 'src/pickle/pickle_data.h', - 'src/pickle/pickle_data.c', - 'src/custompickle/custompickle.h', - 'src/custompickle/custompickle.c', - 'src/custompickle/pyhelpers.h', - 'src/custompickle/pyhelpers.c', - 'src/custompickle/save/automaton_save.h', - 'src/custompickle/save/automaton_save.c', - 'src/custompickle/save/savebuffer.h', - 'src/custompickle/save/savebuffer.c', - 'src/custompickle/load/module_automaton_load.h', - 'src/custompickle/load/module_automaton_load.c', - 'src/custompickle/load/loadbuffer.h', - 'src/custompickle/load/loadbuffer.c', - 'src/pycallfault/pycallfault.h', - 'src/pycallfault/pycallfault.c', - ]], -) - setuptools.setup( - name = 'stringcheese', - version = '2.0', - description = 'StringCheese is a tool to get easy CTF flags automatically.', - long_description = long_description, - long_description_content_type = 'text/markdown', - url = 'https://github.com/MathisHammel/stringcheese', - author = 'MathisHammel', - author_email = 'mathis@h25.io', - license = 'GPL2', - ext_modules = [module], - packages = ['stringcheese'], + name="stringcheese", + version="2.1", + description="StringCheese is a tool to get easy CTF flags automatically.", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/MathisHammel/stringcheese", + author="MathisHammel", + author_email="mathis@h25.io", + license="GPL2", + packages=["stringcheese"], include_package_data=True, - entry_points = { - 'console_scripts' : ['stringcheese=stringcheese.stringcheese:main'] - }, - install_requires = [ - 'tqdm' - ], - zip_safe = False + 
entry_points={"console_scripts": ["stringcheese=stringcheese.stringcheese:main"]}, + install_requires=["tqdm", "ahocorasick-rs"], + zip_safe=False, ) diff --git a/stringcheese/pyahocorasick-1.4.0/.gitignore b/stringcheese/pyahocorasick-1.4.0/.gitignore deleted file mode 100644 index 6c540fb..0000000 --- a/stringcheese/pyahocorasick-1.4.0/.gitignore +++ /dev/null @@ -1,22 +0,0 @@ -# patterns -*.pyc -*.pyd -*.dot -*.swp -*.so -*.sh -*.o - -# files -MANIFEST -runtest.sh -tags -release_checklist.txt -.gdb_history - -# dirs -build/ -dist/ -/pyahocorasick.egg-info/ -/tmp/ -coverage/ diff --git a/stringcheese/pyahocorasick-1.4.0/.travis.yml b/stringcheese/pyahocorasick-1.4.0/.travis.yml deleted file mode 100644 index 368d9da..0000000 --- a/stringcheese/pyahocorasick-1.4.0/.travis.yml +++ /dev/null @@ -1,15 +0,0 @@ -language: c -addons: - apt: - packages: - - python3 - - python3-dev - -matrix: - include: - - compiler: gcc - - compiler: clang - -script: - - make - diff --git a/stringcheese/pyahocorasick-1.4.0/Automaton.c b/stringcheese/pyahocorasick-1.4.0/Automaton.c deleted file mode 100644 index 86783a3..0000000 --- a/stringcheese/pyahocorasick-1.4.0/Automaton.c +++ /dev/null @@ -1,1298 +0,0 @@ -/* - This is part of pyahocorasick Python module. - - Automaton class implementation. 
- (this file includes Automaton_pickle.c) - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : BSD-3-Clause (see LICENSE) -*/ - -#include "Automaton.h" -#include "slist.h" -#include "src/inline_doc.h" -#include "src/custompickle/save/automaton_save.h" - -static PyTypeObject automaton_type; - - -static bool -check_store(const int store) { - switch (store) { - case STORE_LENGTH: - case STORE_INTS: - case STORE_ANY: - return true; - - default: - PyErr_SetString( - PyExc_ValueError, - "store value must be one of ahocorasick.STORE_LENGTH, STORE_INTS or STORE_ANY" - ); - return false; - } // switch -} - - -static bool -check_kind(const int kind) { - switch (kind) { - case EMPTY: - case TRIE: - case AHOCORASICK: - return true; - - default: - PyErr_SetString( - PyExc_ValueError, - "kind value must be one of ahocorasick.EMPTY, TRIE or AHOCORASICK" - ); - return false; - } -} - - -static bool -check_key_type(const int store) { - switch (store) { - case KEY_STRING: - case KEY_SEQUENCE: - return true; - - default: - PyErr_SetString( - PyExc_ValueError, - "key_type must have value KEY_STRING or KEY_SEQUENCE" - ); - return false; - } // switch -} - -static PyObject* -automaton_create() { - - Automaton* automaton; - - automaton = (Automaton*)F(PyObject_New)(Automaton, &automaton_type); - if (UNLIKELY(automaton == NULL)) { - return NULL; - } - - automaton->kind = EMPTY; - automaton->store = STORE_ANY; - automaton->key_type = KEY_STRING; - automaton->count = 0; - automaton->longest_word = 0; - - automaton->version = 0; - automaton->stats.version = -1; - - automaton->root = NULL; - - return (PyObject*)automaton; -} - -static PyObject* -automaton_new(PyTypeObject* self, PyObject* args, PyObject* kwargs) { - Automaton* automaton; - int key_type; - int store; - - automaton = (Automaton*)automaton_create(); - if (UNLIKELY(automaton == NULL)) - return NULL; - - - if (UNLIKELY(PyTuple_Size(args) == 7)) { - - int word_count; - int longest_word; - 
AutomatonKind kind; - KeysStore store; - KeyType key_type; - PyObject* bytes_list = NULL; - PyObject* values = NULL; - - const char* fmt = "OiiiiiO"; - - if (!F(PyArg_ParseTuple)(args, fmt, &bytes_list, &kind, &store, &key_type, &word_count, &longest_word, &values)) { - PyErr_SetString(PyExc_ValueError, "Unable to load from pickle."); - goto error; - } - - if (!check_store(store) || !check_kind(kind) || !check_key_type(key_type)) { - goto error; - } - - if (!PyList_CheckExact(bytes_list)) { - PyErr_SetString(PyExc_TypeError, "Expected list"); - goto error; - } - - if (kind != EMPTY) { - if (values == Py_None) { - Py_XDECREF(values); - values = NULL; - } - - if (automaton_unpickle(automaton, bytes_list, values)) { - automaton->kind = kind; - automaton->store = store; - automaton->key_type = key_type; - automaton->count = word_count; - automaton->longest_word = longest_word; - } - else - goto error; - } - } - else { - store = STORE_ANY; - key_type = KEY_STRING; - - // construct new object - if (F(PyArg_ParseTuple)(args, "ii", &store, &key_type)) { - if (not check_store(store)) { - goto error; - } - - if (not check_key_type(key_type)) { - goto error; - } - } - else if (F(PyArg_ParseTuple)(args, "i", &store)) { - if (not check_store(store)) { - goto error; - } - } - - PyErr_Clear(); - automaton->store = store; - automaton->key_type = key_type; - } - -//ok: - return (PyObject*)automaton; - -error: - Py_XDECREF(automaton); - return NULL; -} - - -static void -automaton_del(PyObject* self) { -#define automaton ((Automaton*)self) - automaton_clear(self, NULL); - PyObject_Del(self); -#undef automaton -} - - -static ssize_t -automaton_len(PyObject* self) { -#define automaton ((Automaton*)self) - return automaton->count; -#undef automaton -} - - -static PyObject* -automaton_add_word(PyObject* self, PyObject* args) { -#define automaton ((Automaton*)self) - // argument - PyObject* py_value = NULL; - struct Input input; - - Py_ssize_t integer = 0; - TrieNode* node; - bool 
new_word; - - if (!prepare_input_from_tuple(self, args, 0, &input)) { - return NULL; - } - - switch (automaton->store) { - case STORE_ANY: - py_value = F(PyTuple_GetItem)(args, 1); - if (not py_value) { - PyErr_SetString(PyExc_ValueError, "A value object is required as second argument."); - goto py_exception; - } - break; - - case STORE_INTS: - py_value = F(PyTuple_GetItem)(args, 1); - if (py_value) { - if (F(PyNumber_Check)(py_value)) { - integer = F(PyNumber_AsSsize_t)(py_value, PyExc_ValueError); - if (integer == -1 and PyErr_Occurred()) - goto py_exception; - } - else { - PyErr_SetString(PyExc_TypeError, "An integer value is required as second argument."); - goto py_exception; - } - } - else { - // default - PyErr_Clear(); - integer = automaton->count + 1; - } - break; - - case STORE_LENGTH: - integer = input.wordlen; - break; - - default: - PyErr_SetString(PyExc_SystemError, "Invalid value for this key: see documentation for supported values."); - goto py_exception; - } - - node = NULL; - new_word = false; - - if (input.wordlen > 0) { - node = trie_add_word(automaton, input.word, input.wordlen, &new_word); - - if (node == NULL) { - PyErr_NoMemory(); - goto py_exception; - } - } - - destroy_input(&input); - - if (node) { - switch (automaton->store) { - case STORE_ANY: - if (not new_word and node->eow) - // replace - Py_DECREF(node->output.object); - - Py_INCREF(py_value); - node->output.object = py_value; - break; - - default: - node->output.integer = integer; - } // switch - - if (new_word) { - automaton->version += 1; // change version only when new word appeared - if (input.wordlen > automaton->longest_word) - automaton->longest_word = (int)input.wordlen; - - Py_RETURN_TRUE; - } - else { - Py_RETURN_FALSE; - } - } - - Py_RETURN_FALSE; - -py_exception: - destroy_input(&input); - return NULL; -} - -static TristateResult -automaton_remove_word_aux(PyObject* self, PyObject* args, PyObject** value) { -#define automaton ((Automaton*)self) - struct Input input; - - 
if (!prepare_input_from_tuple(self, args, 0, &input)) { - return MEMORY_ERROR; - } - - if (input.wordlen == 0) { - destroy_input(&input); - return FALSE; - } - - *value = trie_remove_word(automaton, input.word, input.wordlen); - destroy_input(&input); - - if (UNLIKELY(PyErr_Occurred() != NULL)) { - return MEMORY_ERROR; - } else { - return (*value != NULL) ? TRUE : FALSE; - } -} - - -static PyObject* -automaton_remove_word(PyObject* self, PyObject* args) { - PyObject* value; - - switch (automaton_remove_word_aux(self, args, &value)) { - case FALSE: - Py_RETURN_FALSE; - break; - - case TRUE: - if (automaton->store == STORE_ANY) { - // value is meaningful - Py_DECREF(value); - } - - automaton->version += 1; - automaton->count -= 1; - Py_RETURN_TRUE; - break; - - case MEMORY_ERROR: - default: - return NULL; - } -} - - -static PyObject* -automaton_pop(PyObject* self, PyObject* args) { - PyObject* value; - - switch (automaton_remove_word_aux(self, args, &value)) { - case FALSE: - PyErr_SetNone(PyExc_KeyError); - return NULL; - - case TRUE: - automaton->version += 1; - automaton->count -= 1; - return value; // there's no need to increase refcount, the value was removed - - case MEMORY_ERROR: - default: - return NULL; - } -} - - -static void -clear_aux(TrieNode* node, KeysStore store) { - - unsigned i; - - if (node) { - switch (store) { - case STORE_INTS: - case STORE_LENGTH: - // nop - break; - - case STORE_ANY: - if (node->eow && node->output.object) - Py_DECREF(node->output.object); - break; - } - - for (i=0; i < node->n; i++) { - TrieNode* child = trienode_get_ith_unsafe(node, i); - if (child != node) // avoid self-loops! 
- clear_aux(child, store); - } - - trienode_free(node); - } -#undef automaton -} - - -static PyObject* -automaton_clear(PyObject* self, PyObject* args) { -#define automaton ((Automaton*)self) - clear_aux(automaton->root, automaton->store); - automaton->count = 0; - automaton->longest_word = 0; - automaton->kind = EMPTY; - automaton->root = NULL; - automaton->version += 1; - - Py_RETURN_NONE; -#undef automaton -} - - -static int -automaton_contains(PyObject* self, PyObject* args) { -#define automaton ((Automaton*)self) - TrieNode* node; - struct Input input; - - if (!prepare_input(self, args, &input)) { - return -1; - } - - node = trie_find(automaton->root, input.word, input.wordlen); - - destroy_input(&input); - - return (node and node->eow); -#undef automaton -} - - -static PyObject* -automaton_exists(PyObject* self, PyObject* args) { - PyObject* word; - - word = F(PyTuple_GetItem)(args, 0); - if (word) - switch (automaton_contains(self, word)) { - case 1: - Py_RETURN_TRUE; - - case 0: - Py_RETURN_FALSE; - - default: - return NULL; - } - else - return NULL; -} - - -static PyObject* -automaton_match(PyObject* self, PyObject* args) { -#define automaton ((Automaton*)self) - TrieNode* node; - struct Input input; - - if (!prepare_input_from_tuple(self, args, 0, &input)) { - return NULL; - } - - node = trie_find(automaton->root, input.word, input.wordlen);; - - destroy_input(&input); - - if (node) - Py_RETURN_TRUE; - else - Py_RETURN_FALSE; -#undef automaton -} - - -static PyObject* -automaton_longest_prefix(PyObject* self, PyObject* args) { -#define automaton ((Automaton*)self) - int len; - struct Input input; - - if (!prepare_input_from_tuple(self, args, 0, &input)) { - return NULL; - } - - len = trie_longest(automaton->root, input.word, input.wordlen); - - destroy_input(&input); - - return F(Py_BuildValue)("i", len); -#undef automaton -} - - -static PyObject* -automaton_get(PyObject* self, PyObject* args) { -#define automaton ((Automaton*)self) - struct Input input; 
- PyObject* py_def; - Py_ssize_t k; - TrieNode* node; - - k = PyTuple_GET_SIZE(args); - - if (k < 1 || k > 2) { - PyErr_Format(PyExc_TypeError, "get() takes one or two arguments (%ld given)", k); - return NULL; - } - - if (!prepare_input_from_tuple(self, args, 0, &input)) { - return NULL; - } - - node = trie_find(automaton->root, input.word, input.wordlen); - - destroy_input(&input); - - if (node and node->eow) { - switch (automaton->store) { - case STORE_INTS: - case STORE_LENGTH: - return F(Py_BuildValue)("i", node->output.integer); - - case STORE_ANY: - Py_INCREF(node->output.object); - return node->output.object; - - default: - PyErr_SetNone(PyExc_ValueError); - return NULL; - } - } - else { - py_def = F(PyTuple_GetItem)(args, 1); - if (py_def) { - Py_INCREF(py_def); - return py_def; - } - else { - PyErr_Clear(); - PyErr_SetNone(PyExc_KeyError); - return NULL; - } - } -#undef automaton -} - -typedef struct AutomatonQueueItem { - LISTITEM_data; - TrieNode* node; -} AutomatonQueueItem; - - -static PyObject* -automaton_make_automaton(PyObject* self, PyObject* args) { -#define automaton ((Automaton*)self) - - AutomatonQueueItem* item; - List queue; - unsigned i; - - TrieNode* node; - TrieNode* child; - TrieNode* state; - TRIE_LETTER_TYPE letter; - - - if (automaton->kind != TRIE) - Py_RETURN_FALSE; - - list_init(&queue); - - // 1. setup nodes at first level: they fail back to the root - ASSERT(automaton->root); - - for (i=0; i < automaton->root->n; i++) { - TrieNode* child = trienode_get_ith_unsafe(automaton->root, i); - ASSERT(child); - // fail edges go to the root - // every other letters loop on root - implicit (see automaton_next) - child->fail = automaton->root; - - item = (AutomatonQueueItem*)list_item_new(sizeof(AutomatonQueueItem)); - if (item) { - item->node = child; - list_append(&queue, (ListItem*)item); - } - else - goto no_mem; - } - - // 2. 
make links - while (true) { - AutomatonQueueItem* item = (AutomatonQueueItem*)list_pop_first(&queue); - if (item == NULL) - break; - else { - node = item->node; - memory_free(item); - } - - for (i=0; i < node->n; i++) { - child = trienode_get_ith_unsafe(node, i); - letter = trieletter_get_ith_unsafe(node, i); - ASSERT(child); - - item = (AutomatonQueueItem*)list_item_new(sizeof(AutomatonQueueItem)); - if (item) { - item->node = child; - list_append(&queue, (ListItem*)item); - } - else - goto no_mem; - - state = node->fail; - ASSERT(state); - ASSERT(child); - while (state != automaton->root and\ - not trienode_get_next(state, letter)) { - - state = state->fail; - ASSERT(state); - } - - child->fail = trienode_get_next(state, letter); - if (child->fail == NULL) - child->fail = automaton->root; - - ASSERT(child->fail); - } - } - - automaton->kind = AHOCORASICK; - automaton->version += 1; - list_delete(&queue); - Py_RETURN_NONE; -#undef automaton - -no_mem: - list_delete(&queue); - PyErr_NoMemory(); - return NULL; -} - - -static PyObject* -automaton_find_all(PyObject* self, PyObject* args) { -#define automaton ((Automaton*)self) - - struct Input input; - ssize_t start; - ssize_t end; - PyObject* callback; - PyObject* callback_ret; - - ssize_t i; - TrieNode* state; - TrieNode* tmp; - - if (automaton->kind != AHOCORASICK) - Py_RETURN_NONE; - - // arg 1 - if (!prepare_input_from_tuple(self, args, 0, &input)) { - return NULL; - } - - // arg 2 - callback = F(PyTuple_GetItem)(args, 1); - if (callback == NULL) { - destroy_input(&input); - return NULL; - } - else - if (not F(PyCallable_Check)(callback)) { - PyErr_SetString(PyExc_TypeError, "The callback argument must be a callable such as a function."); - destroy_input(&input); - return NULL; - } - - // parse start/end - if (pymod_parse_start_end(args, 2, 3, 0, input.wordlen, &start, &end)) { - destroy_input(&input); - return NULL; - } - - state = automaton->root; - for (i=start; i < end; i++) { - state = tmp = 
ahocorasick_next(state, automaton->root, input.word[i]); - - // return output - while (tmp) { - if (tmp->eow) { - if (automaton->store == STORE_ANY) - callback_ret = F(PyObject_CallFunction)(callback, "iO", i, tmp->output.object); - else - callback_ret = F(PyObject_CallFunction)(callback, "ii", i, tmp->output.integer); - - if (callback_ret == NULL) { - destroy_input(&input); - return NULL; - } else - Py_DECREF(callback_ret); - } - - tmp = tmp->fail; - } - } -#undef automaton - - destroy_input(&input); - Py_RETURN_NONE; -} - -static PyObject* -automaton_items_create(PyObject* self, PyObject* args, const ItemsType type) { -#define automaton ((Automaton*)self) - PyObject* arg1 = NULL; - PyObject* arg2 = NULL; - PyObject* arg3 = NULL; - TRIE_LETTER_TYPE* word = NULL; - TRIE_LETTER_TYPE* tmp = NULL; - ssize_t wordlen = 0; - - TRIE_LETTER_TYPE wildcard; - bool use_wildcard = false; - PatternMatchType matchtype = MATCH_AT_LEAST_PREFIX; - - AutomatonItemsIter* iter; - - bool word_is_copy = false; - bool tmp_is_copy = false; - - // arg 1: prefix/prefix pattern - if (args) - arg1 = F(PyTuple_GetItem)(args, 0); - else - arg1 = NULL; - - if (arg1) { - arg1 = pymod_get_string(arg1, &word, &wordlen, &word_is_copy); - if (arg1 == NULL) - goto error; - } - else { - PyErr_Clear(); - word = NULL; - wordlen = 0; - } - - // arg 2: wildcard - if (args) - arg2 = F(PyTuple_GetItem)(args, 1); - else - arg2 = NULL; - - if (arg2) { - ssize_t len = 0; - - arg2 = pymod_get_string(arg2, &tmp, &len, &tmp_is_copy); - if (arg2 == NULL) { - goto error; - } else { - if (len == 1) { - wildcard = tmp[0]; - use_wildcard = true; - } - else { - PyErr_SetString(PyExc_ValueError, "Wildcard must be a single character."); - goto error; - } - } - } - else { - PyErr_Clear(); - wildcard = 0; - use_wildcard = false; - } - - // arg3: matchtype - matchtype = MATCH_AT_LEAST_PREFIX; - if (args) { - arg3 = F(PyTuple_GetItem)(args, 2); - if (arg3) { - Py_ssize_t val = F(PyNumber_AsSsize_t)(arg3, PyExc_OverflowError); 
- if (val == -1 and PyErr_Occurred()) - goto error; - - switch ((PatternMatchType)val) { - case MATCH_AT_LEAST_PREFIX: - case MATCH_AT_MOST_PREFIX: - case MATCH_EXACT_LENGTH: - matchtype = (PatternMatchType)val; - break; - - default: - PyErr_SetString(PyExc_ValueError, - "The optional how third argument must be one of: " - "MATCH_EXACT_LENGTH, MATCH_AT_LEAST_PREFIX or MATCH_AT_LEAST_PREFIX" - ); - goto error; - } - } - else { - PyErr_Clear(); - if (use_wildcard) - matchtype = MATCH_EXACT_LENGTH; - else - matchtype = MATCH_AT_LEAST_PREFIX; - } - } - - // - iter = (AutomatonItemsIter*)automaton_items_iter_new( - automaton, - word, - wordlen, - use_wildcard, - wildcard, - matchtype); - - maybe_decref(word_is_copy, arg1) - maybe_decref(tmp_is_copy, arg2) - maybe_free(word_is_copy, word) - maybe_free(tmp_is_copy, tmp) - - if (iter) { - iter->type = type; - return (PyObject*)iter; - } - else - return NULL; - - -error: - maybe_decref(word_is_copy, arg1) - maybe_decref(tmp_is_copy, arg2) - maybe_free(word_is_copy, word) - maybe_free(tmp_is_copy, tmp) - return NULL; -#undef automaton -} - - -static PyObject* -automaton_keys(PyObject* self, PyObject* args) { - return automaton_items_create(self, args, ITER_KEYS); -} - - -static PyObject* -automaton_iterate(PyObject* self) { - return automaton_items_create(self, NULL, ITER_KEYS); -} - - -static PyObject* -automaton_values(PyObject* self, PyObject* args) { - return automaton_items_create(self, args, ITER_VALUES); -} - - -static PyObject* -automaton_items(PyObject* self, PyObject* args) { - return automaton_items_create(self, args, ITER_ITEMS); -} - - -static PyObject* -automaton_iter(PyObject* self, PyObject* args, PyObject* keywds) { -#define automaton ((Automaton*)self) - static char *kwlist[] = {"string", "start", "end", "ignore_white_space", NULL}; - - PyObject* object; - ssize_t start, start_tmp = -1; - ssize_t end, end_tmp = -1; - int ignore_white_space_tmp = -1; - bool ignore_white_space = false; - - if (automaton->kind 
!= AHOCORASICK) { - PyErr_SetString(PyExc_AttributeError,"Not an Aho-Corasick automaton yet: " - "call add_word to add some keys and call make_automaton to " - "convert the trie to an automaton."); - return NULL; - } - - if (!F(PyArg_ParseTupleAndKeywords)(args, keywds, "O|iii", kwlist, &object, &start_tmp, &end_tmp, &ignore_white_space_tmp)) { - return NULL; - } - - if (ignore_white_space_tmp == 1) { - ignore_white_space = true; - } - - if (object) { - if (automaton->key_type == KEY_STRING) { -#ifdef PY3K - #ifdef AHOCORASICK_UNICODE - if (F(PyUnicode_Check)(object)) { - start = 0; - #if PY_MINOR_VERSION >= 3 - end = PyUnicode_GET_LENGTH(object); - #else - end = PyUnicode_GET_SIZE(object); - #endif - } - else { - PyErr_SetString(PyExc_TypeError, "string required"); - return NULL; - } - #else - if (F(PyBytes_Check)(object)) { - start = 0; - end = PyBytes_GET_SIZE(object); - } - else { - PyErr_SetString(PyExc_TypeError, "bytes required"); - return NULL; - } - #endif -#else - if (F(PyString_Check)(object)) { - start = 0; - end = PyString_GET_SIZE(object); - } else { - PyErr_SetString(PyExc_TypeError, "string required"); - return NULL; - } -#endif - } - else { - if (F(PyTuple_Check)(object)) { - start = 0; - end = PyTuple_GET_SIZE(object); - } else { - PyErr_SetString(PyExc_TypeError, "tuple required"); - return NULL; - } - } - } - else - return NULL; - - if (start_tmp != -1) { - start = start_tmp; - } - - if (end_tmp != -1) { - end = end_tmp; - } - - return automaton_search_iter_new( - automaton, - object, - (int)start, - (int)end, - ignore_white_space - ); -#undef automaton -} - - -static PyObject* -automaton_iter_long(PyObject* self, PyObject* args) { -#define automaton ((Automaton*)self) - - PyObject* object; - ssize_t start; - ssize_t end; - - if (automaton->kind != AHOCORASICK) { - PyErr_SetString(PyExc_AttributeError, "not an automaton yet; add some words and call make_automaton"); - return NULL; - } - - object = PyTuple_GetItem(args, 0); - if (object == NULL) 
- return NULL; - - if (automaton->key_type == KEY_STRING) { -#ifdef PY3K - #ifdef AHOCORASICK_UNICODE - if (F(PyUnicode_Check)(object)) { - start = 0; - #if PY_MINOR_VERSION >= 3 - end = PyUnicode_GET_LENGTH(object); - #else - end = PyUnicode_GET_SIZE(object); - #endif - } - else { - PyErr_SetString(PyExc_TypeError, "string required"); - return NULL; - } - #else - if (F(PyBytes_Check)(object)) { - start = 0; - end = PyBytes_GET_SIZE(object); - } - else { - PyErr_SetString(PyExc_TypeError, "bytes required"); - return NULL; - } - #endif -#else - if (F(PyString_Check)(object)) { - start = 0; - end = PyString_GET_SIZE(object); - } else { - PyErr_SetString(PyExc_TypeError, "string required"); - return NULL; - } -#endif - } - else { - if (F(PyTuple_Check)(object)) { - start = 0; - end = PyTuple_GET_SIZE(object); - } else { - PyErr_SetString(PyExc_TypeError, "tuple required"); - return NULL; - } - } - - if (pymod_parse_start_end(args, 1, 2, start, end, &start, &end)) - return NULL; - - return automaton_search_iter_long_new( - automaton, - object, - start, - end - ); -#undef automaton -} - - -static void -get_stats_aux(TrieNode* node, AutomatonStatistics* stats, int depth) { - - unsigned i; - - stats->nodes_count += 1; - stats->words_count += (int)(node->eow); - stats->links_count += node->n; - stats->total_size += trienode_get_size(node); - - if (depth > stats->longest_word) - stats->longest_word = depth; - - for (i=0; i < node->n; i++) - get_stats_aux(trienode_get_ith_unsafe(node, i), stats, depth + 1); -} - -static void -get_stats(Automaton* automaton) { - automaton->stats.nodes_count = 0; - automaton->stats.words_count = 0; - automaton->stats.longest_word = 0; - automaton->stats.links_count = 0; - automaton->stats.sizeof_node = sizeof(TrieNode); - automaton->stats.total_size = 0; - - if (automaton->kind != EMPTY) - get_stats_aux(automaton->root, &automaton->stats, 0); - - automaton->stats.version = automaton->version; -} - - -static PyObject* 
-automaton_get_stats(PyObject* self, PyObject* args) { -#define automaton ((Automaton*)self) - - PyObject* dict; - - if (automaton->stats.version != automaton->version) - get_stats(automaton); - - dict = F(Py_BuildValue)( - "{s:k,s:k,s:k,s:k,s:i,s:k}", - "nodes_count", automaton->stats.nodes_count, - "words_count", automaton->stats.words_count, - "longest_word", automaton->stats.longest_word, - "links_count", automaton->stats.links_count, - "sizeof_node", automaton->stats.sizeof_node, - "total_size", automaton->stats.total_size - ); - return dict; -#undef automaton -} - - -typedef struct DumpAux { - PyObject* nodes; - PyObject* edges; - PyObject* fail; - char error; -} DumpAux; - -static int -dump_aux(TrieNode* node, const int depth, void* extra) { -#define Dump ((DumpAux*)extra) - PyObject* tuple; - TrieNode* child; - unsigned i; - -#define append_tuple(list) \ - if (tuple == NULL) { \ - Dump->error = 1; \ - return 0; \ - } \ - else if (PyList_Append(list, tuple) < 0) { \ - Dump->error = 1; \ - return 0; \ - } - - - // 1. - tuple = F(Py_BuildValue)("ii", node, (int)(node->eow)); - append_tuple(Dump->nodes) - - // 2. - for (i=0; i < node->n; i++) { - child = trienode_get_ith_unsafe(node, i); - tuple = F(Py_BuildValue)("ici", node, trieletter_get_ith_unsafe(node, i), child); - append_tuple(Dump->edges) - } - - // 3. 
- if (node->fail) { - tuple = F(Py_BuildValue)("ii", node, node->fail); - append_tuple(Dump->fail); - } - - return 1; -#undef append_tuple -#undef Dump -} - - -static PyObject* -automaton_dump(PyObject* self, PyObject* args) { -#define automaton ((Automaton*)self) - DumpAux dump; - - if (automaton->kind == EMPTY) - Py_RETURN_NONE; - - dump.nodes = 0; - dump.edges = 0; - dump.fail = 0; - dump.error = 0; - - dump.nodes = F(PyList_New)(0); - dump.edges = F(PyList_New)(0); - dump.fail = F(PyList_New)(0); - if (dump.edges == NULL or dump.fail == NULL or dump.nodes == NULL) - goto error; - - trie_traverse(automaton->root, dump_aux, &dump); - if (dump.error) - goto error; - else - return F(Py_BuildValue)("OOO", dump.nodes, dump.edges, dump.fail); - -error: - Py_XDECREF(dump.nodes); - Py_XDECREF(dump.edges); - Py_XDECREF(dump.fail); - return NULL; - -#undef automaton -} - - -static PyObject* -automaton___sizeof__(PyObject* self, PyObject* args) { -#define automaton ((Automaton*)self) - Py_ssize_t size = sizeof(Automaton); - - if (automaton->kind != EMPTY) { - if (automaton->stats.version != automaton->version) { - get_stats(automaton); - } - - size += automaton->stats.total_size; - } - - return Py_BuildValue("i", size); -#undef automaton -} - - -#include "Automaton_pickle.c" - - -#define method(name, kind) {#name, (PyCFunction)automaton_##name, kind, automaton_##name##_doc} -static -PyMethodDef automaton_methods[] = { - method(add_word, METH_VARARGS), - method(remove_word, METH_VARARGS), - method(pop, METH_VARARGS), - method(clear, METH_NOARGS), - method(exists, METH_VARARGS), - method(match, METH_VARARGS), - method(longest_prefix, METH_VARARGS), - method(get, METH_VARARGS), - method(make_automaton, METH_NOARGS), - method(find_all, METH_VARARGS), - method(iter, METH_VARARGS|METH_KEYWORDS), - method(iter_long, METH_VARARGS), - method(keys, METH_VARARGS), - method(values, METH_VARARGS), - method(items, METH_VARARGS), - method(get_stats, METH_NOARGS), - method(dump, 
METH_NOARGS), - method(__reduce__, METH_VARARGS), - method(__sizeof__, METH_VARARGS), - method(save, METH_VARARGS), - - {NULL, NULL, 0, NULL} -}; -#undef method - - -static -PySequenceMethods automaton_as_sequence; - - -static -PyMemberDef automaton_members[] = { - { - "kind", - T_INT, - offsetof(Automaton, kind), - READONLY, - "Read-only attribute maintained automatically.\nKind for this Automaton instance.\nOne of ahocorasick.EMPTY, TRIE or AHOCORASICK." - }, - - { - "store", - T_INT, - offsetof(Automaton, store), - READONLY, - "Read-only attribute set when creating an Automaton().\nType of values accepted by this Automaton.\nOne of ahocorasick.STORE_ANY, STORE_INTS or STORE_LEN." - }, - - {NULL} -}; - -static PyTypeObject automaton_type = { - PY_OBJECT_HEAD_INIT - "ahocorasick.Automaton", /* tp_name */ - sizeof(Automaton), /* tp_size */ - 0, /* tp_itemsize? */ - (destructor)automaton_del, /* tp_dealloc */ - 0, /* tp_print */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_reserved */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - PyObject_GenericGetAttr, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - automaton_constructor_doc, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - automaton_iterate, /* tp_iter */ - 0, /* tp_iternext */ - automaton_methods, /* tp_methods */ - automaton_members, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - 0, /* tp_init */ - 0, /* tp_alloc */ - automaton_new, /* tp_new */ -}; - diff --git a/stringcheese/pyahocorasick-1.4.0/Automaton.h b/stringcheese/pyahocorasick-1.4.0/Automaton.h deleted file mode 100644 index e1088b5..0000000 --- a/stringcheese/pyahocorasick-1.4.0/Automaton.h +++ /dev/null @@ -1,163 
+0,0 @@ -/* - This is part of pyahocorasick Python module. - - Automaton class methods - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : BSD-3-Clause (see LICENSE) -*/ -#ifndef ahocorasick_Automaton_h_included -#define ahocorasick_Automaton_h_included - -#include "common.h" -#include "trie.h" - -typedef enum { - EMPTY = 0, - TRIE = 1, - AHOCORASICK = 2 -} AutomatonKind; - - -static bool -check_kind(const int kind); - - -typedef enum { - STORE_INTS = 10, - STORE_LENGTH = 20, - STORE_ANY = 30 -} KeysStore; - - -static bool -check_store(const int store); - - -typedef enum { - KEY_STRING = 100, - KEY_SEQUENCE = 200 -} KeyType; - - -static bool -check_key_type(const int key_type); - - -struct Input { - Py_ssize_t wordlen; - TRIE_LETTER_TYPE* word; - PyObject* py_word; - bool is_copy; -}; - - -typedef struct AutomatonStatistics { - int version; - - ssize_t nodes_count; ///< total number of nodes - ssize_t words_count; ///< len(automaton) - ssize_t longest_word; ///< longest word - ssize_t links_count; ///< links count - ssize_t sizeof_node; ///< size of single node (a C structure) - ssize_t total_size; ///< total size in bytes -} AutomatonStatistics; - - -typedef struct Automaton { - PyObject_HEAD - - AutomatonKind kind; ///< current kind of automaton - KeysStore store; ///< type of values: copy of string, bare integer, python object - KeyType key_type; ///< type of keys: strings or integer sequences - int count; ///< number of distinct words - int longest_word; ///< length of the longest word - TrieNode* root; ///< root of a trie - - int version; ///< current version of automaton, incremented by add_word, clean and make_automaton; used to lazy invalidate iterators - - AutomatonStatistics stats; ///< statistics -} Automaton; - -/*------------------------------------------------------------------------*/ - -static bool -automaton_unpickle( - Automaton* automaton, - PyObject* bytes_list, - PyObject* values -); - -static PyObject* 
-automaton_create(void); - -/* __init__ */ -static PyObject* -automaton_new(PyTypeObject* self, PyObject* args, PyObject* kwargs); - -/* clear() */ -static PyObject* -automaton_clear(PyObject* self, PyObject* args); - -/* len() */ -static ssize_t -automaton_len(PyObject* self); - -/* add_word */ -static PyObject* -automaton_add_word(PyObject* self, PyObject* args); - -/* clear() */ -static PyObject* -automaton_clear(PyObject* self, PyObject* args); - -/* __contains__ */ -static int -automaton_contains(PyObject* self, PyObject* args); - -/* exists() */ -static PyObject* -automaton_exists(PyObject* self, PyObject* args); - -/* match() */ -static PyObject* -automaton_match(PyObject* self, PyObject* args); - -/* get() */ -static PyObject* -automaton_get(PyObject* self, PyObject* args); - -/* make_automaton() */ -static PyObject* -automaton_make_automaton(PyObject* self, PyObject* args); - -/* find_all() */ -static PyObject* -automaton_find_all(PyObject* self, PyObject* args); - -/* keys() */ -static PyObject* -automaton_keys(PyObject* self, PyObject* args); - -/* values() */ -static PyObject* -automaton_values(PyObject* self, PyObject* args); - -/* items() */ -static PyObject* -automaton_items(PyObject* self, PyObject* args); - -/* iter() */ -static PyObject* -automaton_iter(PyObject* self, PyObject* args, PyObject* keywds); - -/* iter_long() */ -static PyObject* -automaton_iter_long(PyObject* self, PyObject* args); - -/* get_stats() */ -static PyObject* -automaton_get_stats(PyObject* self, PyObject* args); - -#endif diff --git a/stringcheese/pyahocorasick-1.4.0/AutomatonItemsIter.c b/stringcheese/pyahocorasick-1.4.0/AutomatonItemsIter.c deleted file mode 100644 index 614fc40..0000000 --- a/stringcheese/pyahocorasick-1.4.0/AutomatonItemsIter.c +++ /dev/null @@ -1,332 +0,0 @@ -/* - This is part of pyahocorasick Python module. 
- - AutomatonItemsIter implementation - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : BSD-3-Clause (see LICENSE) -*/ -#include "AutomatonItemsIter.h" - -static PyTypeObject automaton_items_iter_type; - - -typedef struct AutomatonItemsStackItem { - LISTITEM_data; - - struct TrieNode* node; - TRIE_LETTER_TYPE letter; - size_t depth; -} AutomatonItemsStackItem; - -#define StackItem AutomatonItemsStackItem - -static PyObject* -automaton_items_iter_new( - Automaton* automaton, - const TRIE_LETTER_TYPE* word, - const ssize_t wordlen, - - const bool use_wildcard, - const TRIE_LETTER_TYPE wildcard, - const PatternMatchType matchtype -) { - AutomatonItemsIter* iter; - StackItem* new_item; - - iter = (AutomatonItemsIter*)F(PyObject_New)(AutomatonItemsIter, &automaton_items_iter_type); - if (iter == NULL) - return NULL; - - iter->automaton = automaton; - iter->version = automaton->version; - iter->state = NULL; - iter->type = ITER_KEYS; - iter->buffer = NULL; -#ifndef AHOCORASICK_UNICODE - iter->char_buffer = NULL; -#endif - iter->pattern = NULL; - iter->use_wildcard = use_wildcard; - iter->wildcard = wildcard; - iter->matchtype = matchtype; - list_init(&iter->stack); - - Py_INCREF((PyObject*)iter->automaton); - - iter->buffer = memory_alloc((automaton->longest_word + 1) * TRIE_LETTER_SIZE); - if (iter->buffer == NULL) { - goto no_memory; - } - -#ifndef AHOCORASICK_UNICODE - iter->char_buffer = memory_alloc(automaton->longest_word + 1); - if (iter->char_buffer == NULL) { - goto no_memory; - } -#endif - - if (word) { - iter->pattern = (TRIE_LETTER_TYPE*)memory_alloc(wordlen * TRIE_LETTER_SIZE); - if (UNLIKELY(iter->pattern == NULL)) { - goto no_memory; - } - else { - iter->pattern_length = wordlen; - memcpy(iter->pattern, word, wordlen * TRIE_LETTER_SIZE); - } - } - else - iter->pattern_length = 0; - - new_item = (StackItem*)list_item_new(sizeof(StackItem)); - if (UNLIKELY(new_item == NULL)) { - goto no_memory; - } - - new_item->node 
= automaton->root; - new_item->depth = 0; - list_push_front(&iter->stack, (ListItem*)new_item); - - return (PyObject*)iter; - -no_memory: - Py_DECREF((PyObject*)iter); - PyErr_NoMemory(); - return NULL; -} - - -#define iter ((AutomatonItemsIter*)self) - -static void -automaton_items_iter_del(PyObject* self) { - memory_safefree(iter->buffer); - memory_safefree(iter->pattern); -#ifndef AHOCORASICK_UNICODE - memory_safefree(iter->char_buffer); -#endif - - list_delete(&iter->stack); - Py_DECREF(iter->automaton); - - PyObject_Del(self); -} - - -static PyObject* -automaton_items_iter_iter(PyObject* self) { - Py_INCREF(self); - return self; -} - - -static PyObject* -automaton_items_iter_next(PyObject* self) { - - bool output; - TrieNode* node; - TRIE_LETTER_TYPE letter; - size_t depth; - - if (UNLIKELY(iter->version != iter->automaton->version)) { - PyErr_SetString(PyExc_ValueError, "The underlying automaton has changed: this iterator is no longer valid."); - return NULL; - } - - while (true) { - StackItem* top = (StackItem*)list_pop_first(&iter->stack); - if (top == NULL) - return NULL; /* Stop iteration */ - - if (top->node == NULL) { - memory_free(top); - return NULL; /* Stop iteration */ - } - - node = top->node; - letter = top->letter; - depth = top->depth; - memory_free(top); - - if (iter->matchtype != MATCH_AT_LEAST_PREFIX and depth > iter->pattern_length) - continue; - - switch (iter->matchtype) { - case MATCH_EXACT_LENGTH: - output = (depth == iter->pattern_length); - break; - - case MATCH_AT_MOST_PREFIX: - output = (depth <= iter->pattern_length); - break; - - case MATCH_AT_LEAST_PREFIX: - default: - output = (depth >= iter->pattern_length); - break; - - } - - iter->state = node; - iter->letter = letter; - if ((depth >= iter->pattern_length) or - (iter->use_wildcard and iter->pattern[depth] == iter->wildcard)) { - - // process all - const int n = iter->state->n; - int i; - for (i=0; i < n; i++) { - StackItem* new_item = 
(StackItem*)list_item_new(sizeof(StackItem)); - if (UNLIKELY(new_item == NULL)) { - PyErr_NoMemory(); - return NULL; - } - - new_item->node = trienode_get_ith_unsafe(iter->state, i); - new_item->letter = trieletter_get_ith_unsafe(iter->state, i); - new_item->depth = depth + 1; - list_push_front(&iter->stack, (ListItem*)new_item); - } - } - else { - // process single letter - TrieNode* node = trienode_get_next(iter->state, iter->pattern[depth]); - - if (node) { - StackItem* new_item = (StackItem*)list_item_new(sizeof(StackItem)); - if (UNLIKELY(new_item == NULL)) { - PyErr_NoMemory(); - return NULL; - } - - new_item->node = node; - new_item->letter = iter->pattern[depth]; - new_item->depth = depth + 1; - list_push_front(&iter->stack, (ListItem*)new_item); - } - } - - if (iter->type != ITER_VALUES) { - // update keys when needed - iter->buffer[depth] = iter->letter; -#ifndef AHOCORASICK_UNICODE - iter->char_buffer[depth] = (char)iter->letter; -#endif - } - - if (output and iter->state->eow) { - PyObject* val; - - switch (iter->type) { - case ITER_KEYS: -#if defined PEP393_UNICODE - return F(PyUnicode_FromKindAndData)(PyUnicode_4BYTE_KIND, (void*)(iter->buffer + 1), depth); -#elif defined AHOCORASICK_UNICODE - return PyUnicode_FromUnicode((Py_UNICODE*)(iter->buffer + 1), depth); -#else - return PyBytes_FromStringAndSize(iter->char_buffer + 1, depth); -#endif - - case ITER_VALUES: - switch (iter->automaton->store) { - case STORE_ANY: - val = iter->state->output.object; - Py_INCREF(val); - break; - - case STORE_LENGTH: - case STORE_INTS: - return F(Py_BuildValue)("i", iter->state->output.integer); - - default: - PyErr_SetString(PyExc_SystemError, "Incorrect 'store' attribute."); - return NULL; - } - - return val; - - case ITER_ITEMS: - switch (iter->automaton->store) { - case STORE_ANY: - return F(Py_BuildValue)( -#ifdef PY3K - #ifdef AHOCORASICK_UNICODE - "(u#O)", /*key*/ iter->buffer + 1, depth, - #else - "(y#O)", /*key*/ iter->buffer + 1, depth, - #endif -#else - 
"(s#O)", /*key*/ iter->char_buffer + 1, depth, -#endif - /*val*/ iter->state->output.object - ); - - case STORE_LENGTH: - case STORE_INTS: - return F(Py_BuildValue)( -#ifdef PY3K - #ifdef AHOCORASICK_UNICODE - "(u#i)", /*key*/ iter->buffer + 1, depth, - #else - "(y#i)", /*key*/ iter->buffer + 1, depth, - #endif -#else - "(s#i)", /*key*/ iter->char_buffer + 1, depth, -#endif - /*val*/ iter->state->output.integer - ); - - default: - PyErr_SetString(PyExc_SystemError, "Incorrect 'store' attribute."); - return NULL; - } // switch - } - } - } -} - -#undef StackItem -#undef iter - -static PyTypeObject automaton_items_iter_type = { - PY_OBJECT_HEAD_INIT - "AutomatonItemsIter", /* tp_name */ - sizeof(AutomatonItemsIter), /* tp_size */ - 0, /* tp_itemsize? */ - (destructor)automaton_items_iter_del, /* tp_dealloc */ - 0, /* tp_print */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_reserved */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - PyObject_GenericGetAttr, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - 0, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - automaton_items_iter_iter, /* tp_iter */ - automaton_items_iter_next, /* tp_iternext */ - 0, /* tp_methods */ - 0, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - 0, /* tp_init */ - 0, /* tp_alloc */ - 0, /* tp_new */ -}; diff --git a/stringcheese/pyahocorasick-1.4.0/AutomatonItemsIter.h b/stringcheese/pyahocorasick-1.4.0/AutomatonItemsIter.h deleted file mode 100644 index cbc197e..0000000 --- a/stringcheese/pyahocorasick-1.4.0/AutomatonItemsIter.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - This is part of pyahocorasick Python module. 
- - AutomatonItemsIter const, struct & methods declarations. - This class implements iterator walk over trie, that returns - words and associated values. Object of this class is - returned by 'keys'/'values'/'items' methods of Automaton class. - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : BSD-3-Clause (see LICENSE) -*/ -#ifndef ahocorasick_AutomatonItemsIter_h_included -#define ahocorasick_AutomatonItemsIter_h_included - -#include "common.h" -#include "Automaton.h" - -typedef enum { - ITER_KEYS, - ITER_VALUES, - ITER_ITEMS -} ItemsType; - -typedef enum { - MATCH_EXACT_LENGTH, - MATCH_AT_MOST_PREFIX, - MATCH_AT_LEAST_PREFIX -} PatternMatchType; - - -typedef struct AutomatonItemsIter { - PyObject_HEAD - - Automaton* automaton; - int version; ///< automaton version - TrieNode* state; ///< current automaton node - TRIE_LETTER_TYPE letter; ///< current letter - List stack; ///< stack - ItemsType type; ///< type of iterator (KEYS/VALUES/ITEMS) - TRIE_LETTER_TYPE* buffer; ///< buffer to construct key representation -#ifndef AHOCORASICK_UNICODE - char *char_buffer; -#endif - - size_t pattern_length; - TRIE_LETTER_TYPE* pattern; ///< pattern - bool use_wildcard; - TRIE_LETTER_TYPE wildcard; ///< wildcard char - PatternMatchType matchtype; ///< how pattern have to be handled -} AutomatonItemsIter; - - -/* new() */ -static PyObject* -automaton_items_iter_new( - Automaton* automaton, - const TRIE_LETTER_TYPE* word, - const ssize_t wordlen, - - const bool use_wildcard, - const TRIE_LETTER_TYPE wildcard, - - const PatternMatchType matchtype -); - -#endif diff --git a/stringcheese/pyahocorasick-1.4.0/AutomatonSearchIter.c b/stringcheese/pyahocorasick-1.4.0/AutomatonSearchIter.c deleted file mode 100644 index b6b7871..0000000 --- a/stringcheese/pyahocorasick-1.4.0/AutomatonSearchIter.c +++ /dev/null @@ -1,424 +0,0 @@ -/* - This is part of pyahocorasick Python module. 
- - AutomatonSearchIter implementation - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : BSD-3-Clause (see LICENSE) -*/ - -#include "AutomatonSearchIter.h" -#include - -static PyTypeObject automaton_search_iter_type; - - -#ifdef VARIABLE_LEN_CHARCODES -static int -automaton_search_iter_substring_index(struct Input* input, int position) { - - TRIE_LETTER_TYPE letter; - - int index = 0; - int i; - - for (i=0; i < position; i++) { - - letter = input->word[index]; - if (UNLIKELY(Py_UNICODE_IS_SURROGATE(letter))) { - if (UNLIKELY(!Py_UNICODE_IS_HIGH_SURROGATE(letter))) { - PyErr_Format(PyExc_ValueError, - "Malformed UCS-2 string: expected a high surrogate at %d, got %04x", - index, letter); - return -1; - } - - index += 1; - - if (index >= input->wordlen) { - PyErr_Format(PyExc_ValueError, - "Malformed UCS-2 string: unexpected end of string"); - return -1; - } - - letter = input->word[index]; - if (UNLIKELY(!Py_UNICODE_IS_LOW_SURROGATE(letter))) { - PyErr_Format(PyExc_ValueError, - "Malformed UCS-2 string: expected a low surrogate at %d, got %04x", - index, letter); - return -1; - } - - index += 1; - - } else { - index += 1; - } - } - - return index; -} -#endif // VARIABLE_LEN_CHARCODES - - -static PyObject* -automaton_search_iter_new( - Automaton* automaton, - PyObject* object, - int start, - int end, - bool ignore_white_space -) { - AutomatonSearchIter* iter; -#ifdef VARIABLE_LEN_CHARCODES - int tmp; -#endif - - iter = (AutomatonSearchIter*)F(PyObject_New)(AutomatonSearchIter, &automaton_search_iter_type); - if (iter == NULL) - return NULL; - - - iter->automaton = automaton; - iter->version = automaton->version; - - iter->state = automaton->root; - iter->output= NULL; - iter->shift = 0; - iter->ignore_white_space = ignore_white_space; - - init_input(&iter->input); - - Py_INCREF(iter->automaton); - - if (!prepare_input((PyObject*)automaton, object, &iter->input)) { - goto error; - } - -#ifdef VARIABLE_LEN_CHARCODES - if 
(automaton->key_type == KEY_STRING) { - tmp = automaton_search_iter_substring_index(&iter->input, start); - if (tmp >= 0) { - iter->index = tmp - 1; - iter->position = start - 1; - } else { - goto error; - } - - tmp = automaton_search_iter_substring_index(&iter->input, end); - if (tmp >= 0) { - iter->end = end; - } else { - goto error; - } - - iter->expected = pyaho_UCS2_Any; - } else { - iter->index = start - 1; - iter->end = end; - } -#else - // -1 because the first instruction in next() increments index - iter->index = start - 1; - iter->end = end; -#endif - return (PyObject*)iter; - -error: - Py_DECREF(iter); - return NULL; -} - -#define iter ((AutomatonSearchIter*)self) - -static void -automaton_search_iter_del(PyObject* self) { - Py_DECREF(iter->automaton); - destroy_input(&iter->input); - PyObject_Del(self); -} - - -static PyObject* -automaton_search_iter_iter(PyObject* self) { - Py_INCREF(self); - return self; -} - - -enum { - OutputValue, - OutputNone, - OutputError -}; - - -static int -automaton_build_output(PyObject* self, PyObject** result) { - TrieNode* node; - Py_ssize_t idx = 0; - - while (iter->output && !iter->output->eow) { - iter->output = iter->output->fail; - } - - if (iter->output) { - node = iter->output; - iter->output = iter->output->fail; - -#ifdef VARIABLE_LEN_CHARCODES - idx = iter->shift; - if (iter->automaton->key_type == KEY_STRING) { - idx += iter->position; - } else { - idx += iter->index; - } -#else - idx = iter->index + iter->shift; -#endif - switch (iter->automaton->store) { - case STORE_LENGTH: - case STORE_INTS: - *result = F(Py_BuildValue)("ii", idx, node->output.integer); - return OutputValue; - - case STORE_ANY: - *result = F(Py_BuildValue)("iO", idx, node->output.object); - return OutputValue; - - default: - PyErr_SetString(PyExc_ValueError, "inconsistent internal state!"); - return OutputError; - } - } - - return OutputNone; -} - - - -#ifdef VARIABLE_LEN_CHARCODES -static bool -automaton_search_iter_advance_index(PyObject* 
self) { - - TRIE_LETTER_TYPE letter; - - iter->index += 1; - if (iter->automaton->key_type == KEY_SEQUENCE) { - return true; - } - - letter = iter->input.word[iter->index]; - if (iter->expected == pyaho_UCS2_Any) { - if (UNLIKELY(Py_UNICODE_IS_SURROGATE(letter))) { - if (LIKELY(Py_UNICODE_IS_HIGH_SURROGATE(letter))) { - iter->expected = pyaho_UCS2_LowSurrogate; - } else { - PyErr_Format(PyExc_ValueError, - "Malformed UCS-2 string: expected a high surrogate at %d, got %04x", - iter->index, letter); - return false; - } - } else { - iter->position += 1; - } - } else { - assert(iter->expected == pyaho_UCS2_LowSurrogate); - if (LIKELY(Py_UNICODE_IS_LOW_SURROGATE(letter))) { - iter->expected = pyaho_UCS2_Any; - iter->position += 1; - } else { - PyErr_Format(PyExc_ValueError, - "Malformed UCS-2 string: expected a low surrogate at %d, got %04x", - iter->index, letter); - return false; - } - } - - return true; -} -#endif - -static PyObject* -automaton_search_iter_next(PyObject* self) { - PyObject* output; - - if (iter->version != iter->automaton->version) { - PyErr_SetString(PyExc_ValueError, "underlaying automaton has changed, iterator is not valid anymore"); - return NULL; - } - -return_output: - switch (automaton_build_output(self, &output)) { - case OutputValue: - return output; - - case OutputNone: - break; - - case OutputError: - return NULL; - } - -#ifdef VARIABLE_LEN_CHARCODES - if (!automaton_search_iter_advance_index(self)) { - return NULL; - } -#else - iter->index += 1; - if (iter->ignore_white_space) { - while ((iter->index < iter->end) and iswspace(iter->input.word[iter->index])) { - iter->index += 1; - } - } -#endif - while (iter->index < iter->end) { - // process single char - iter->state = ahocorasick_next( - iter->state, - iter->automaton->root, - iter->input.word[iter->index] - ); - - ASSERT(iter->state); - - iter->output = iter->state; - goto return_output; - -#ifdef VARIABLE_LEN_CHARCODES - if (!automaton_search_iter_advance_index(self)) { - return NULL; 
- } -#else - iter->index += 1; -#endif - - } // while - - return NULL; // StopIteration -} - - -static PyObject* -automaton_search_iter_set(PyObject* self, PyObject* args) { - PyObject* object; - PyObject* flag; - Py_ssize_t position; - bool reset; - struct Input new_input; - - // first argument - required string or buffer - object = F(PyTuple_GetItem)(args, 0); - if (object) { - init_input(&new_input); - if (!prepare_input((PyObject*)iter->automaton, object, &new_input)) { - return NULL; - } - } - else - return NULL; - - // second argument - optional bool - flag = F(PyTuple_GetItem)(args, 1); - if (flag) { - switch (PyObject_IsTrue(flag)) { - case 0: - reset = false; - break; - case 1: - reset = true; - break; - default: - return NULL; - } - } - else { - PyErr_Clear(); - reset = false; - } - - destroy_input(&iter->input); - assign_input(&iter->input, &new_input); - - if (!reset) { - position = iter->index; -#ifdef VARIABLE_LEN_CHARCODES - if (iter->automaton->key_type == KEY_STRING) { - position = iter->position; - } -#endif - iter->shift += (position >= 0) ? position : 0; - } - - iter->index = -1; - iter->end = new_input.wordlen; - - if (reset) { - iter->state = iter->automaton->root; - iter->shift = 0; - iter->output = NULL; -#ifdef VARIABLE_LEN_CHARCODES - iter->position = -1; - iter->expected = pyaho_UCS2_Any; -#endif - } - - Py_RETURN_NONE; -} - - -#undef iter - -#define method(name, kind) {#name, automaton_search_iter_##name, kind, automaton_search_iter_##name##_doc} - -static -PyMethodDef automaton_search_iter_methods[] = { - method(set, METH_VARARGS), - - {NULL, NULL, 0, NULL} -}; -#undef method - - -static PyTypeObject automaton_search_iter_type = { - PY_OBJECT_HEAD_INIT - "ahocorasick.AutomatonSearchIter", /* tp_name */ - sizeof(AutomatonSearchIter), /* tp_size */ - 0, /* tp_itemsize? 
*/ - (destructor)automaton_search_iter_del, /* tp_dealloc */ - 0, /* tp_print */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_reserved */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - PyObject_GenericGetAttr, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - automaton_search_iter_doc, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - automaton_search_iter_iter, /* tp_iter */ - automaton_search_iter_next, /* tp_iternext */ - automaton_search_iter_methods, /* tp_methods */ - 0, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - 0, /* tp_init */ - 0, /* tp_alloc */ - 0, /* tp_new */ -}; - diff --git a/stringcheese/pyahocorasick-1.4.0/AutomatonSearchIter.h b/stringcheese/pyahocorasick-1.4.0/AutomatonSearchIter.h deleted file mode 100644 index a98b647..0000000 --- a/stringcheese/pyahocorasick-1.4.0/AutomatonSearchIter.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - This is part of pyahocorasick Python module. - - AutomatonSearchIter const, struct & methods declarations. - This class implements iterator walk over Aho-Corasick - automaton. Object of this class is returned by 'iter' method - of Automaton class. 
- - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : BSD-3-Clause (see LICENSE) -*/ -#ifndef ahocorasick_AutomatonSearchIter_h_included -#define ahocorasick_AutomatonSearchIter_h_included - -#include "common.h" -#include "Automaton.h" - -#ifdef VARIABLE_LEN_CHARCODES -typedef enum { - pyaho_UCS2_Any, - pyaho_UCS2_LowSurrogate -} UCS2ExpectedChar; -#endif - -typedef struct AutomatonSearchIter { - PyObject_HEAD - - Automaton* automaton; - int version; ///< automaton version - struct Input input; ///< input string - TrieNode* state; ///< current state of automaton - TrieNode* output; ///< current node, i.e. yielded value - - Py_ssize_t index; ///< current index in data - Py_ssize_t shift; ///< shift + index => output index - Py_ssize_t end; ///< end index - bool ignore_white_space; ///< ignore input string white spaces using iswspace() function -#ifdef VARIABLE_LEN_CHARCODES - int position; ///< position in string - UCS2ExpectedChar expected; -#endif -} AutomatonSearchIter; - - -static PyObject* -automaton_search_iter_new( - Automaton* automaton, - PyObject* object, - int start, - int end, - bool ignore_white_space -); - -#endif diff --git a/stringcheese/pyahocorasick-1.4.0/AutomatonSearchIterLong.c b/stringcheese/pyahocorasick-1.4.0/AutomatonSearchIterLong.c deleted file mode 100644 index 31dd39f..0000000 --- a/stringcheese/pyahocorasick-1.4.0/AutomatonSearchIterLong.c +++ /dev/null @@ -1,267 +0,0 @@ -/* - This is part of pyahocorasick Python module. 
- - AutomatonSearchIterLong implementation - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - License : 3-clauses BSD (see LICENSE) -*/ - -#include "AutomatonSearchIterLong.h" - -static PyTypeObject automaton_search_iter_long_type; - -static PyObject* -automaton_search_iter_long_new( - Automaton* automaton, - PyObject* object, - int start, - int end -) { - AutomatonSearchIterLong* iter; - - iter = (AutomatonSearchIterLong*)PyObject_New(AutomatonSearchIterLong, &automaton_search_iter_long_type); - if (iter == NULL) - return NULL; - - iter->automaton = automaton; - iter->version = automaton->version; - iter->object = object; - - iter->state = automaton->root; - iter->shift = 0; - iter->index = start - 1; // -1 because first instruction in next() increments index - iter->end = end; - - iter->last_index = -1; - iter->last_node = NULL; - - Py_INCREF(iter->automaton); - Py_INCREF(iter->object); - - init_input(&iter->input); - if (!prepare_input((PyObject*)automaton, object, &iter->input)) { - goto error; - } - - return (PyObject*)iter; -error: - Py_DECREF(iter); - return NULL; -} - -#define iter ((AutomatonSearchIterLong*)self) - -static void -automaton_search_iter_long_del(PyObject* self) { - Py_DECREF(iter->automaton); - Py_DECREF(iter->object); - destroy_input(&iter->input); - PyObject_Del(self); -} - - -static PyObject* -automaton_search_iter_long_iter(PyObject* self) { - Py_INCREF(self); - return self; -} - - -static PyObject* -automaton_build_output_iter_long(PyObject* self) { - - switch (iter->automaton->store) { - case STORE_LENGTH: - case STORE_INTS: - return Py_BuildValue("ii", iter->shift + iter->last_index, iter->last_node->output.integer); - - case STORE_ANY: - return Py_BuildValue("iO", iter->shift + iter->last_index, iter->last_node->output.object); - - default: - PyErr_SetString(PyExc_ValueError, "inconsistent internal state!"); - return NULL; - } -} - - -static PyObject* -automaton_search_iter_long_next(PyObject* self) { - PyObject* output; - 
TrieNode* next; - - if (iter->version != iter->automaton->version) { - PyErr_SetString(PyExc_ValueError, "underlaying automaton has changed, iterator is not valid anymore"); - return NULL; - } - -return_output: - if (iter->last_node) { - output = automaton_build_output_iter_long(self); - - // start over, as we don't want overlapped results - // Note: this leads to quadratic complexity in the worst case - iter->state = iter->automaton->root; - iter->index = iter->last_index; - - iter->last_node = NULL; - iter->last_index = -1; - - return output; - } - - iter->index += 1; - while (iter->index < iter->end) { - next = trienode_get_next(iter->state, iter->input.word[iter->index]); - if (next) { - if (next->eow) { - // save the last node on the path - iter->last_node = next; - iter->last_index = iter->index; - } - - iter->state = next; - iter->index += 1; - } else { - if (iter->last_node) { - goto return_output; - } else { - while (true) { - iter->state = iter->state->fail; - if (iter->state == NULL) { - iter->state = iter->automaton->root; - iter->index += 1; - break; - } else if (trienode_get_next(iter->state, iter->input.word[iter->index])) { - break; - } - } - } - } - } // while - - if (iter->last_node) { - goto return_output; - } - - return NULL; // StopIteration -} - - -static PyObject* -automaton_search_iter_long_set(PyObject* self, PyObject* args) { - PyObject* object; - PyObject* flag; - bool reset; - struct Input new_input; - - // first argument - required string or buffer - object = PyTuple_GetItem(args, 0); - if (object) { - init_input(&new_input); - if (!prepare_input((PyObject*)iter->automaton, object, &new_input)) { - return NULL; - } - } - else - return NULL; - - // second argument - optional bool - flag = PyTuple_GetItem(args, 1); - if (flag) { - switch (PyObject_IsTrue(flag)) { - case 0: - reset = false; - break; - case 1: - reset = true; - break; - default: - return NULL; - } - } - else { - PyErr_Clear(); - reset = false; - } - - // update internal 
state - Py_XDECREF(iter->object); - Py_INCREF(object); - iter->object = object; - - destroy_input(&iter->input); - assign_input(&iter->input, &new_input); - - if (!reset) - iter->shift += (iter->index >= 0) ? iter->index : 0; - - iter->index = -1; - iter->end = new_input.wordlen; - - if (reset) { - iter->state = iter->automaton->root; - iter->shift = 0; - - iter->last_node = NULL; - iter->last_index = -1; - } - - Py_RETURN_NONE; -} - - -#undef iter - -#define method(name, kind) {#name, automaton_search_iter_long_##name, kind, ""} - -static -PyMethodDef automaton_search_iter_long_methods[] = { - method(set, METH_VARARGS), - - {NULL, NULL, 0, NULL} -}; -#undef method - - -static PyTypeObject automaton_search_iter_long_type = { - PY_OBJECT_HEAD_INIT - "ahocorasick.AutomatonSearchIterLong", /* tp_name */ - sizeof(AutomatonSearchIterLong), /* tp_size */ - 0, /* tp_itemsize? */ - (destructor)automaton_search_iter_long_del, /* tp_dealloc */ - 0, /* tp_print */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_reserved */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - PyObject_GenericGetAttr, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - 0, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - automaton_search_iter_long_iter, /* tp_iter */ - automaton_search_iter_long_next, /* tp_iternext */ - automaton_search_iter_long_methods, /* tp_methods */ - 0, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - 0, /* tp_init */ - 0, /* tp_alloc */ - 0, /* tp_new */ -}; diff --git a/stringcheese/pyahocorasick-1.4.0/AutomatonSearchIterLong.h b/stringcheese/pyahocorasick-1.4.0/AutomatonSearchIterLong.h deleted file mode 100644 index 0600752..0000000 --- 
a/stringcheese/pyahocorasick-1.4.0/AutomatonSearchIterLong.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - This is part of pyahocorasick Python module. - - AutomatonSearchIterLong const, struct & methods declarations. - This class implements iterator walk over Aho-Corasick - automaton. Object of this class is returnd by 'iter' method - of Automaton class. - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - License : 3-clauses BSD (see LICENSE) -*/ -#ifndef ahocorasick_AutomatonSearchIterLong_h_included -#define ahocorasick_AutomatonSearchIterLong_h_included - -#include "common.h" -#include "Automaton.h" - -typedef struct AutomatonSearchIterLong { - PyObject_HEAD - - Automaton* automaton; - int version; ///< automaton version - PyObject* object; ///< unicode or buffer - struct Input input; ///< input string - TrieNode* state; ///< current state of automaton - TrieNode* last_node; ///< last node on trie path - int last_index; - - int index; ///< current index in data - int shift; ///< shift + index => output index - int end; ///< end index -} AutomatonSearchIterLong; - - -static PyObject* -automaton_search_iter_long_new( - Automaton* automaton, - PyObject* object, - int start, - int end -); - -#endif diff --git a/stringcheese/pyahocorasick-1.4.0/Automaton_pickle.c b/stringcheese/pyahocorasick-1.4.0/Automaton_pickle.c deleted file mode 100644 index 39d2034..0000000 --- a/stringcheese/pyahocorasick-1.4.0/Automaton_pickle.c +++ /dev/null @@ -1,490 +0,0 @@ -/* - This is part of pyahocorasick Python module. - - Implementation of pickling/unpickling routines for Automaton class - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : BSD-3-Clause (see LICENSE) -*/ - -/* -Pickling (automaton___reduce__): - -1. assign sequential numbers to nodes in order to replace - address with these numbers - (pickle_dump_replace_fail_with_id) -2. 
save in array all nodes data in the same order as numbers, - also replace fail and next links with numbers; collect on - a list all values (python objects) stored in a trie - (pickle_dump_save); - - Before we start, all nodes of trie are visited and total - size of pickled data is calculated. If it is small enough - (less than given threshold), all data is saved in a single - byte array. Otherwise, data is saved in several byte arrays. - - In either case, the format of byte array is the same: - * 8 first bytes is number of nodes stored in this - chunk of memory - * the number if followed by some raw data. - - When there is just one byte array, it's size is fit to - needs. If data is split, then each array has exactly the - same size of bytes, but not all might be used (only the - last array is fit). - -3. clean up - (pickle_dump_undo_replace or pickle_dump_revert_replace) - -Unpickling (automaton_unpickle, called in Automaton constructor) -1. load all nodes from array -2. make number->node lookup table -3. 
replace numbers stored in fail and next pointers with - real pointers, reassign python objects as values -*/ - - -#include -#include "src/pickle/pickle_data.c" - -typedef struct NodeID { - TrieNode* fail; ///< original fail value - Py_uintptr_t id; ///< id -} NodeID; - -typedef struct DumpState { - Py_uintptr_t id; ///< next id - size_t total_size; ///< number of nodes - TrieNode* failed_on; ///< if fail while numerating, save node in order - /// to revert changes made in trie -} DumpState; - - -static size_t -get_pickled_size(TrieNode* node) { - ASSERT(node != NULL); - return PICKLE_TRIENODE_SIZE + node->n * sizeof(Pair); -} - -// replace fail with pairs (fail, id) -static int -pickle_dump_replace_fail_with_id(TrieNode* node, const int depth, void* extra) { - - NodeID* repl; - - ASSERT(sizeof(NodeID*) <= sizeof(TrieNode*)); -#define state ((DumpState*)extra) - repl = (NodeID*)memory_alloc(sizeof(NodeID)); - if (LIKELY(repl != NULL)) { - state->id += 1; - state->total_size += get_pickled_size(node); - - repl->id = state->id; - repl->fail = node->fail; - - node->fail = (TrieNode*)repl; - return 1; - } - else { - // error, revert is needed! 
- state->failed_on = node; - return 0; - } -#undef state -} - - -// revert changes in trie (in case of error) -static int -pickle_dump_revert_replace(TrieNode* node, const int depth, void* extra) { -#define state ((DumpState*)extra) - if (state->failed_on != node) { - NodeID* repl = (NodeID*)(node->fail); - node->fail = repl->fail; - memory_free(repl); - - return 1; - } - else - return 0; -#undef state -} - - -// revert changes in trie -static int -pickle_dump_undo_replace(TrieNode* node, const int depth, void* extra) { -#define state ((DumpState*)extra) - NodeID* repl = (NodeID*)(node->fail); - node->fail = repl->fail; - memory_free(repl); - - return 1; -#undef state -} - - -static int -pickle_dump_save(TrieNode* node, const int depth, void* extra) { -#define self ((PickleData*)extra) -#define NODEID(object) ((NodeID*)((TrieNode*)object)->fail) - - TrieNode* dump; - TrieNode* tmp; - Pair* arr; - unsigned i; - size_t size; - - size = get_pickled_size(node); - if (UNLIKELY(self->top + size > self->size)) { - if (UNLIKELY(!pickle_data__add_next_buffer(self))) { - self->error = true; - return 0; - } - } - - dump = (TrieNode*)(self->data + self->top); - - // we do not save the last pointer in array - arr = (Pair*)(self->data + self->top + PICKLE_TRIENODE_SIZE); - - // append the python object to the list - if (node->eow and self->values) { - if (PyList_Append(self->values, node->output.object) == -1) { - self->error = true; - return 0; - } - } - - // save node data - if (self->values) - dump->output.integer = 0; - else - dump->output.integer = node->output.integer; - - dump->n = node->n; - dump->eow = node->eow; - - tmp = NODEID(node)->fail; - if (tmp) - dump->fail = (TrieNode*)(NODEID(tmp)->id); - else - dump->fail = NULL; - - // save array of pointers - for (i=0; i < node->n; i++) { - TrieNode* child = trienode_get_ith_unsafe(node, i); - ASSERT(child); - arr[i].child = (TrieNode*)(NODEID(child)->id); // save the id of child node - arr[i].letter = 
trieletter_get_ith_unsafe(node, i); - } - - self->top += size; - (*self->count) += 1; - return 1; -#undef NODEID -#undef self -} - - -static PyObject* -automaton___reduce__(PyObject* self, PyObject* args) { -#define automaton ((Automaton*)self) - -#define MB ((size_t)(1024*1024)) - - const size_t array_size = 16*MB; - - DumpState state; - PickleData data; - PyObject* tuple; - - // 0. for an empty automaton do nothing - if (automaton->count == 0) { - // the class constructor feed with an empty argument build an empty automaton - return F(Py_BuildValue)("O()", Py_TYPE(self)); - } - - // 1. numerate nodes - state.id = 0; - state.failed_on = NULL; - state.total_size = 0; - - trie_traverse(automaton->root, pickle_dump_replace_fail_with_id, &state); - if (state.failed_on) { - // revert changes (partial) - trie_traverse(automaton->root, pickle_dump_revert_replace, &state); - - // and set error - PyErr_NoMemory(); - return NULL; - } - - // 2. gather data - if (!pickle_data__init(&data, automaton->store, state.total_size, array_size)) - goto exception; - - trie_traverse(automaton->root, pickle_dump_save, &data); - if (UNLIKELY(data.error)) { - goto exception; - } - - if (UNLIKELY(!pickle_data__shrink_last_buffer(&data))) { - goto exception; - } - - if (automaton->store != STORE_ANY) { // always pickle a Python object - data.values = Py_None; - Py_INCREF(data.values); - } - - /* 3: save tuple: - * binary data - * automaton->kind - * automaton->store - * automaton->key_type - * automaton->count - * automaton->longest_word - * list of values - */ - - tuple = F(Py_BuildValue)( - "O(OiiiiiO)", - Py_TYPE(self), - data.bytes_list, - automaton->kind, - automaton->store, - automaton->key_type, - automaton->count, - automaton->longest_word, - data.values - ); - - if (data.values == Py_None) { - data.values = NULL; - } - - if (UNLIKELY(tuple == NULL)) { - goto exception; - } - - // revert all changes - trie_traverse(automaton->root, pickle_dump_undo_replace, NULL); - - return tuple; - 
-exception: - // revert all changes - trie_traverse(automaton->root, pickle_dump_undo_replace, NULL); - - // and free memory - pickle_data__cleanup(&data); - return NULL; -#undef automaton -} - - -static bool -automaton_unpickle__validate_bytes_list(PyObject* bytes_list, size_t* result) { - - PyObject* bytes; - Py_ssize_t k; - Py_ssize_t nodes_count; - const uint8_t* data; - - size_t count = 0; - - // calculate the total number of nodes (and do validate data at the same time) - for (k=0; k < PyList_GET_SIZE(bytes_list); k++) { - bytes = PyList_GET_ITEM(bytes_list, k); - if (UNLIKELY(!F(PyBytes_CheckExact)(bytes))) { - PyErr_Format(PyExc_ValueError, - "Item #%d on the bytes list is not a bytes object", - k); - return false; - } - - data = (const uint8_t*)PyBytes_AS_STRING(bytes); - - nodes_count = *((Py_ssize_t*)data); - if (UNLIKELY(nodes_count <= 0)) { - PyErr_Format(PyExc_ValueError, - "Nodes count for item #%d on the bytes list is not positive (%d)", - k, nodes_count); - return false; - } - - count += nodes_count; - } - - *result = count; - return true; -} - - -static bool -automaton_unpickle( - Automaton* automaton, - PyObject* bytes_list, - PyObject* values -) { - TrieNode** id2node = NULL; - - TrieNode* node; - TrieNode* dump; - Pair* next; - PyObject* bytes; - PyObject* value; - Py_ssize_t nodes_count; - Py_ssize_t i; - - size_t id; - const uint8_t* data; - const uint8_t* ptr; - const uint8_t* end; - size_t k; - size_t j; - size_t object_idx = 0; - size_t index; - size_t count; - - if (!automaton_unpickle__validate_bytes_list(bytes_list, &count)) { - goto exception; - } - - id2node = (TrieNode**)memory_alloc((count+1) * sizeof(TrieNode*)); - if (UNLIKELY(id2node == NULL)) { - goto no_mem; - } - - // 1. 
make nodes - id = 1; - for (k=0; k < PyList_GET_SIZE(bytes_list); k++) { - bytes = PyList_GET_ITEM(bytes_list, k); - data = (const uint8_t*)PyBytes_AS_STRING(bytes); - - nodes_count = *((Py_ssize_t*)data); - - ptr = data + PICKLE_CHUNK_COUNTER_SIZE; - end = ptr + PyBytes_GET_SIZE(bytes) - PICKLE_CHUNK_COUNTER_SIZE; - for (i=0; i < nodes_count; i++) { - if (UNLIKELY(ptr + PICKLE_TRIENODE_SIZE > end)) { - PyErr_Format(PyExc_ValueError, - "Data truncated [parsing header of node #%d]: " - "chunk #%d @ offset %lu, expected at least %lu bytes", - i, k, ptr - data, PICKLE_TRIENODE_SIZE); - goto exception; - } - - dump = (TrieNode*)(ptr); - node = (TrieNode*)memory_alloc(sizeof(TrieNode)); - if (LIKELY(node != NULL)) { - node->output = dump->output; - node->fail = dump->fail; - node->n = dump->n; - node->eow = dump->eow; - node->next = NULL; - } - else - goto no_mem; - - ptr += PICKLE_TRIENODE_SIZE; - - id2node[id++] = node; - - if (node->n > 0) { - if (UNLIKELY(ptr + node->n * sizeof(Pair) > end)) { - PyErr_Format(PyExc_ValueError, - "Data truncated [parsing children of node #%d]: " - "chunk #%d @ offset %lu, expected at least %ld bytes", - i, k, ptr - data + i, node->n * sizeof(Pair)); - - goto exception; - } - - node->next = (Pair*)memory_alloc(node->n * sizeof(Pair)); - if (UNLIKELY(node->next == NULL)) { - goto no_mem; - } - - next = (Pair*)(ptr); - for (j=0; j < node->n; j++) { - node->next[j] = next[j]; - } - - ptr += node->n * sizeof(Pair); - } - } - } - - // 2. 
restore pointers and references to pyobjects - for (i=1; i < id; i++) { - node = id2node[i]; - - // references - if (values and node->eow) { - value = F(PyList_GetItem)(values, object_idx); - if (value) { - Py_INCREF(value); - node->output.object = value; - object_idx += 1; - } - else - goto exception; - } - - // pointers - if (node->fail) { - index = (size_t)(node->fail); - if (LIKELY(index < count + 1)) { - node->fail = id2node[index]; - } else { - PyErr_Format(PyExc_ValueError, - "Node #%lu malformed: the fail link points to node #%lu, while there are %lu nodes", - i - 1, index, count); - goto exception; - } - } - - for (j=0; j < node->n; j++) { - index = (size_t)(node->next[j].child); - if (LIKELY(index < count + 1)) { - node->next[j].child = id2node[index]; - } else { - PyErr_Format(PyExc_ValueError, - "Node #%lu malformed: next link #%lu points to node #%lu, while there are %lu nodes", - i - 1, j, index, count); - goto exception; - } - } - } - - automaton->root = id2node[1]; - - memory_free(id2node); - return 1; - -no_mem: - PyErr_NoMemory(); -exception: - // free memory - if (id2node) { - for (i=1; i < id; i++) { - trienode_free(id2node[i]); - } - - memory_free(id2node); - } - - // If there is value list and some of its items were already - // referenced, release them - if (values) { - for (i=0; i < object_idx; i++) { - Py_XDECREF(F(PyList_GetItem)(values, i)); - } - } - - return 0; -} - diff --git a/stringcheese/pyahocorasick-1.4.0/CHANGELOG.rst b/stringcheese/pyahocorasick-1.4.0/CHANGELOG.rst deleted file mode 100644 index 7182cc1..0000000 --- a/stringcheese/pyahocorasick-1.4.0/CHANGELOG.rst +++ /dev/null @@ -1,155 +0,0 @@ -1.4.0 (2020-01-26) --------------------------------------------------- - -- Add method ``iter_long``, that performs the modified - Aho-Corasick search procedure matching the longest - words from set. 
- -1.4.0 (2019-01-24) --------------------------------------------------- - -- Change internal trie representation thanks to that performance - of common operation is 1.5 - 2.5 times faster. Details are - presented in https://github.com/WojciechMula/pyahocorasick/pull/107 - Warning: this change breaks compatibility of pickle and ``save()`` - format, this won't be possible to load files created in the - previous version. - -1.3.0 (2018-12-20) --------------------------------------------------- - -- Add alternative pickling mechanism ``save()``/``load``, which - requires less memory than the standard pickle solution (issue #102) - -1.2.0 (2018-12-13) --------------------------------------------------- - -- Add methods ``remove_word()``/``pop()`` (issue #79) - -1.1.13.1 (2018-12-11) --------------------------------------------------- - -- Fix manifest file - -1.1.13 (2018-12-11) --------------------------------------------------- - -- Fix pickling of large automatons (issue #50); - The fix wouldn't be possible without great help and - patience of all people involved: - - * **Emil Stenström** (@EmilStenstrom) - * **David Woakes** (@woakesd) - * **@Dobatymo** - * **Philippe Ombredanne** (@pombredanne) - - The fix wouldn't also be possible without **Daniel Lemire** (@lemire), - who gave me access to decent machines and I was able to test fixes - on large data. 
- -1.1.12 (2018-12-03) --------------------------------------------------- - -- Add support for tuples of ints to ``iter()`` (by **Frankie Robertson**) - -1.1.11 (2018-12-02) --------------------------------------------------- - -- Reworked pickling code -- Fix pickling crash (issue #68) -- Fix pickling memory leak (issue #62) -- Fix documentation (by **Philippe Ombredanne**) -- Fix several latent bugs and problems - -1.1.10 (2018-10-25) --------------------------------------------------- - -- Fix handling of unicode in Python 3 (by **Frankie Robertson**) - -1.1.9 (2018-10-25) --------------------------------------------------- - -- Fix documentation typos (by **Sylvain Zimmer**) -- Add ability to skip white spaces in the input strings (by **@gladtosee**; issue #84) - -1.1.8 (2018-04-25) --------------------------------------------------- - -- Fix memory leak (issue #81) -- Add link to Python implementation from Abusix (by **Frederik Petersen**) -- Fix unit tests (by **Renat Nasyrov**) - -1.1.7 (2018-02-23) --------------------------------------------------- - -- Minor documentation fixes (by **Edward Betts**) -- Some internal improvements - -1.1.6 (2017-11-27) --------------------------------------------------- - -- Fix PyPI building (by **Philippe Ombredanne**; issue #71) - -1.1.5 (2017-11-22) --------------------------------------------------- - -- Fix handling of UCS2-encoded string (issue #53) -- Fix pickling error -- Several minor fixes and corrections to documentation - and infrastructure (thanks to: **Jan Fan**, **@blackelk**, - **David Woakes** and **Xiaopeng Xu**) - -1.1.4 (2016-08-08) --------------------------------------------------- - -- Fix URL in documentation (by **Philippe Ombredanne**) - -1.1.3 (2016-08-07) --------------------------------------------------- - -- Rewrite documentation and fix PyPI presentation (by **Philippe Ombredanne**) - -1.1.2 (2016-08-06) --------------------------------------------------- - -- Rewrite documentation 
continued (by **Philippe Ombredanne**) - -1.1.1 (2016-05-29) --------------------------------------------------- - -- Rewrite documentation, setup readthedocs.io__ page (by **Philippe Ombredanne**) -- Make the module compilable in Windows using MSVC compiler (issue #11) -- Fix ``get()`` method that crashed when trie was empty (issue #22) -- Fix pickling problem (issue #26) -- Add ``__sizeof__()`` method (issue #25) - -__ https://pyahocorasick.readthedocs.io/en/latest/ - -1.1.0 (2016-04-26) --------------------------------------------------- - -- Support for Python 2 (with help from **Philippe Ombredanne**; issue #12) - -1.0.3 (2016-04-24) --------------------------------------------------- - -- Fix memory leak (by **Jonathan Grs**; issue #9) - -1.0.2 (2016-04-23) --------------------------------------------------- - -- Fix range parsing (by **Jonathan Grs**; issue #10) -- Fix pickling on 64-bit machines (issue #20) -- Update documentation regarding wildcards - -1.0.1 (2016-04-19) --------------------------------------------------- - -- Fix Unicode handling during automaton build (issue #8) -- Fix some 64-bit code issues (issue #5) -- Fix documentation (thanks to **Pastafarianist**) - -1.0.0 (2014-11-25) --------------------------------------------------- - -- The first version available through PyPi diff --git a/stringcheese/pyahocorasick-1.4.0/LICENSE b/stringcheese/pyahocorasick-1.4.0/LICENSE deleted file mode 100644 index d3de68d..0000000 --- a/stringcheese/pyahocorasick-1.4.0/LICENSE +++ /dev/null @@ -1,33 +0,0 @@ - -Copyright (c) 2011-2016 Wojciech Muła -All rights reserved. - -Redistribution and use in source and binary forms, with or -without modification, are permitted provided that the following -conditions are met: - -* Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
-* Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials - provided with the distribution. -* Neither the name of the Wojciech Muła nor the names of its - contributors may be used to endorse or promote products - derived from this software without specific prior written - permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND -CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, -INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF -USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING -IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -THE POSSIBILITY OF SUCH DAMAGE. 
- diff --git a/stringcheese/pyahocorasick-1.4.0/MANIFEST.in b/stringcheese/pyahocorasick-1.4.0/MANIFEST.in deleted file mode 100644 index b993ce3..0000000 --- a/stringcheese/pyahocorasick-1.4.0/MANIFEST.in +++ /dev/null @@ -1,20 +0,0 @@ -graft benchmarks -graft docs -graft msinttypes -graft py -graft docs -graft regression -graft stamp -graft unresolved_bugs -graft src -include README.rst -include LICENSE -include *.py -include *.h -include *.c -include *.cfg -include .gitignore -include MANIFEST.in -include travis.yml -include appveyor.yml -include Makefile diff --git a/stringcheese/pyahocorasick-1.4.0/Makefile b/stringcheese/pyahocorasick-1.4.0/Makefile deleted file mode 100644 index 2486a4d..0000000 --- a/stringcheese/pyahocorasick-1.4.0/Makefile +++ /dev/null @@ -1,67 +0,0 @@ -.SUFFIXES: -.PHONY: test clean valgrind - -export PYTHONPATH := .:$(PYTHONPATH):$(PATH) - -DEPS=*.c \ - *.h \ - setup.py \ - unittests.py - -test: stamp/regression_py2 stamp/regression_py3 - -stamp/build_py2: $(DEPS) - python2 setup.py build_ext --inplace - touch $@ - -stamp/unittests_py2: stamp/build_py2 - python2 unittests.py - touch $@ - -stamp/regression_py2: stamp/unittests_py2 - python2 regression/issue_5.py - python2 regression/issue_8.py - python2 regression/issue_9.py - python2 regression/issue_10.py - python2 regression/issue_26.py - python2 regression/issue_56.py - touch $@ - - -stamp/build_py3: $(DEPS) - python3 setup.py build_ext --inplace - touch $@ - -stamp/unittests_py3: stamp/build_py3 - python3 unittests.py - touch $@ - -stamp/regression_py3: stamp/unittests_py3 - python3 regression/issue_5.py - python3 regression/issue_8.py - python3 regression/issue_9.py - python3 regression/issue_10.py - python3 regression/issue_26.py - python3 regression/issue_56.py - touch $@ - - -benchmark: benchmarks/benchmark.py stamp/build_py2 - python2 $^ - -devbuild2: - python2 setup.py build_ext --inplace - -devbuild3: - python3 setup.py build_ext --inplace - -valgrind: - python -c "import 
sys;print(sys.version)" - valgrind --leak-check=full --track-origins=yes --log-file=valgrind.log python unittests.py - -pip-release: - python setup.py sdist upload - -clean: - rm -f stamp/* - rm -rf dist build diff --git a/stringcheese/pyahocorasick-1.4.0/README.rst b/stringcheese/pyahocorasick-1.4.0/README.rst deleted file mode 100644 index 0ea4736..0000000 --- a/stringcheese/pyahocorasick-1.4.0/README.rst +++ /dev/null @@ -1,313 +0,0 @@ -======================================================================== - pyahocorasick -======================================================================== - -.. image:: https://travis-ci.org/WojciechMula/pyahocorasick.svg?branch=master - :target: https://travis-ci.org/WojciechMula/pyahocorasick - :alt: Linux Master branch tests status - -.. image:: https://ci.appveyor.com/api/projects/status/github/WojciechMula/pyahocorasick?branch=master&svg=true - :target: https://ci.appveyor.com/project/WojciechMula/pyahocorasick - :alt: Windows Master branch tests status - -**pyahocorasick** is a fast and memory efficient library for exact or approximate -multi-pattern string search meaning that you can find multiple key strings -occurrences at once in some input text. The library provides an `ahocorasick` Python -module that you can use as a plain dict-like Trie or convert a Trie to an automaton -for efficient Aho-Corasick search. - -It is implemented in C and tested on Python 2.7 and 3.4+. It works on Linux, Mac and -Windows. - -The license_ is BSD-3-clause. Some utilities, such as tests and the pure Python -automaton are dedicated to the Public Domain. - - -Download and source code -======================== - -You can fetch **pyahocorasick** from: - - GitHub https://github.com/WojciechMula/pyahocorasick/ - - Pypi https://pypi.python.org/pypi/pyahocorasick/ - - Conda-Forge https://github.com/conda-forge/pyahocorasick-feedstock/ - - -Quick start -=========== - -This module is written in C. 
You need a C compiler installed to compile native -CPython extensions. To install:: - - pip install pyahocorasick - -Then create an Automaton:: - - >>> import ahocorasick - >>> A = ahocorasick.Automaton() - -You can use the Automaton class as a trie. Add some string keys and their associated -value to this trie. Here we associate a tuple of (insertion index, original string) -as a value to each key string we add to the trie:: - - >>> for idx, key in enumerate('he her hers she'.split()): - ... A.add_word(key, (idx, key)) - -Then check if some string exists in the trie:: - - >>> 'he' in A - True - >>> 'HER' in A - False - -And play with the ``get()`` dict-like method:: - - >>> A.get('he') - (0, 'he') - >>> A.get('she') - (3, 'she') - >>> A.get('cat', 'not exists') - 'not exists' - >>> A.get('dog') - Traceback (most recent call last): - File "", line 1, in - KeyError - -Now convert the trie to an Aho-Corasick automaton to enable Aho-Corasick search:: - - >>> A.make_automaton() - -Then search all occurrences of the keys (the needles) in an input string (our haystack). - -Here we print the results and just check that they are correct. The -`Automaton.iter()` method return the results as two-tuples of the `end index` where a -trie key was found in the input string and the associated `value` for this key. Here -we had stored as values a tuple with the original string and its trie insertion -order:: - - >>> for end_index, (insert_order, original_value) in A.iter(haystack): - ... start_index = end_index - len(original_value) + 1 - ... print((start_index, end_index, (insert_order, original_value))) - ... assert haystack[start_index:start_index + len(original_value)] == original_value - ... - (1, 2, (0, 'he')) - (1, 3, (1, 'her')) - (1, 4, (2, 'hers')) - (4, 6, (3, 'she')) - (5, 6, (0, 'he')) - -You can also create an eventually large automaton ahead of time and `pickle` it to -re-load later. Here we just pickle to a string. 
You would typically pickle to a -file instead:: - - >>> import cPickle - >>> pickled = cPickle.dumps(A) - >>> B = cPickle.loads(pickled) - >>> B.get('he') - (0, 'he') - - -See also: - - FAQ and Who is using pyahocorasick? https://github.com/WojciechMula/pyahocorasick/wiki/FAQ#who-is-using-pyahocorasick - - -Documentation -============= - -The full documentation including the API overview and reference is published on -`readthedocs `_. - - -Overview - -With an `Aho-Corasick automaton `_ -you can efficiently search all occurrences of multiple strings (the needles) in an -input string (the haystack) making a single pass over the input string. With -pyahocorasick you can eventually build large automatons and pickle them to reuse -them over and over as an indexed structure for fast multi pattern string matching. - -One of the advantages of an Aho-Corasick automaton is that the typical worst-case -and best-case **runtimes** are about the same and depends primarily on the size -of the input string and secondarily on the number of matches returned. While -this may not be the fastest string search algorithm in all cases, it can search -for multiple strings at once and its runtime guarantees make it rather unique. -Because pyahocorasick is based on a Trie, it stores redundant keys prefixes only -once using memory efficiently. - -A drawback is that it needs to be constructed and "finalized" ahead of time -before you can search strings. In several applications where you search for several -pre-defined "needles" in a variable "haystacks" this is actually an advantage. - -**Aho-Corasick automatons** are commonly used for fast multi-pattern matching -in intrusion detection systems (such as snort), anti-viruses and many other -applications that need fast matching against a pre-defined set of string keys. - -Internally an Aho-Corasick automaton is typically based on a Trie with extra -data for failure links and an implementation of the Aho-Corasick search -procedure. 
- -Behind the scenes the **pyahocorasick** Python library implements these two data -structures: a `Trie `_ and an Aho-Corasick string -matching automaton. Both are exposed through the `Automaton` class. - -In addition to Trie-like and Aho-Corasick methods and data structures, -**pyahocorasick** also implements dict-like methods: The pyahocorasick -**Automaton** is a **Trie** a dict-like structure indexed by string keys each -associated with a value object. You can use this to retrieve an associated value -in a time proportional to a string key length. - -pyahocorasick is available in two flavors: - -* a CPython **C-based extension**, compatible with Python 2 and 3. - -* a simpler pure Python module, compatible with Python 2 and 3. This is only - available in the source repository (not on Pypi) under the py/ directory and - has a slightly different API. - - -Unicode and bytes ------------------ - -The type of strings accepted and returned by ``Automaton`` methods are either -**unicode** or **bytes**, depending on a compile time settings (preprocessor -definition of ``AHOCORASICK_UNICODE`` as set in `setup.py`). - -The ``Automaton.unicode`` attributes can tell you how the library was built. -On Python 3, unicode is the default. On Python 2, bytes is the default and only value. - - -.. warning:: - - When the library is built with unicode support on Python 3, an Automaton will - store 2 or 4 bytes per letter, depending on your Python installation. When built - for bytes, only one byte per letter is needed. - - Unicode is **NOT supported** on Python 2 for now. - - -Build and install from PyPi -=========================== - -To install for common operating systems, use pip. Pre-built wheels should be -available on Pypi at some point in the future:: - - pip install pyahocorasick - -To build from sources you need to have a C compiler installed and configured which -should be standard on Linux and easy to get on MacOSX. 
- -On Windows and Python 2.7 you need the `Microsoft Visual C++ Compiler for Python 2.7 -`_ (aka. Visual -Studio 2008). There have been reports that `pyahocorasick` does not build yet with -MinGW. It may build with cygwin but this has not been tested. If you get this working -with these platforms, please report in a ticket! - -To build from sources, clone the git repository or download and extract the source -archive. - -Install `pip` (and its `setuptools` companion) and then run (in a `virtualenv` of -course!):: - - pip install . - -If compilation succeeds, the module is ready to use. - - -Support -======= - -Support is available through the `GitHub issue tracker -`_ to report bugs or ask -questions. - - -Contributing -============ - -You can submit contributions through `GitHub pull requests -`_. - - -Authors -======= - -The initial author and maintainer is Wojciech Muła. `Philippe Ombredanne -`_, the current co-owner, rewrote -documentation, setup CI servers and did a whole lot of work to make this module -better accessible to end users. - -Alphabetic list of authors: - -* **Andrew Grigorev** -* **Bogdan** -* **David Woakes** -* **Edward Betts** -* **Frankie Robertson** -* **Frederik Petersen** -* **gladtosee** -* **INADA Naoki** -* **Jan Fan** -* **Pastafarianist** -* **Philippe Ombredanne** -* **Renat Nasyrov** -* **Sylvain Zimmer** -* **Xiaopeng Xu** - -This library would not be possible without help of many people, who contributed in -various ways. -They created `pull requests `_, -reported bugs as `GitHub issues `_ -or via direct messages, proposed fixes, or spent their valuable time on testing. - -Thank you. - - -License -======= - -This library is licensed under very liberal -`BSD-3-Clause `_ license. Some portions of -the code are dedicated to the public domain such as the pure Python automaton and test -code. - -Full text of license is available in LICENSE file. 
- - -Other Aho-Corasick implementations for Python you can consider -============================================================== - -While **pyahocorasick** tries to be the finest and fastest Aho Corasick library -for Python you may consider these other libraries: - - -* `py_aho_corasick `_ by Jan - - * Written in pure Python. - * Poor performance. - -* `ahocorapy `_ by abusix - - * Written in pure Python. - * Better performance than py-aho-corasick. - * Using pypy, ahocorapy's search performance is only slightly worse than pyahocorasick's. - * Performs additional suffix shortcutting (more setup overhead, less search overhead for suffix lookups). - * Includes visualization tool for resulting automaton (using pygraphviz). - * MIT-licensed, 100% test coverage, tested on all major python versions (+ pypy) - -* `noaho `_ by Jeff Donner - - * Written in C. Does not return overlapping matches. - * Does not compile on Windows (July 2016). - * No support for the pickle protocol. - -* `acora `_ by Stefan Behnel - - * Written in Cython. - * Large automaton may take a long time to build (July 2016) - * No support for a dict-like protocol to associate a value to a string key. - -* `ahocorasick `_ by Danny Yoo - - * Written in C. - * seems unmaintained (last update in 2005). - * GPL-licensed. 
- diff --git a/stringcheese/pyahocorasick-1.4.0/allsources.c b/stringcheese/pyahocorasick-1.4.0/allsources.c deleted file mode 100644 index d97e2fa..0000000 --- a/stringcheese/pyahocorasick-1.4.0/allsources.c +++ /dev/null @@ -1,6 +0,0 @@ -#include "src/custompickle/custompickle.c" -#include "src/custompickle/pyhelpers.c" -#include "src/custompickle/save/savebuffer.c" -#include "src/custompickle/save/automaton_save.c" -#include "src/custompickle/load/loadbuffer.c" -#include "src/custompickle/load/module_automaton_load.c" diff --git a/stringcheese/pyahocorasick-1.4.0/appveyor.yml b/stringcheese/pyahocorasick-1.4.0/appveyor.yml deleted file mode 100644 index 6871d54..0000000 --- a/stringcheese/pyahocorasick-1.4.0/appveyor.yml +++ /dev/null @@ -1,28 +0,0 @@ -version: 1.0.{build} - -environment: - matrix: - - PYTHON: "C:\\Python27" - - PYTHON: "C:\\Python27-x64" - - PYTHON: "C:\\Python36" - - PYTHON: "C:\\Python36-x64" - - PYTHON: "C:\\Python37" - - PYTHON: "C:\\Python37-x64" - - PYTHON: "C:\\Python38" - - PYTHON: "C:\\Python38-x64" - -install: - - cmd: "%PYTHON%\\python.exe -m pip install --upgrade pip wheel" - -build: off - -test_script: - - cmd: "%PYTHON%\\python.exe -m pip install ." - - cmd: "%PYTHON%\\python.exe setup.py test" - - dir - - mkdir wheels - - cmd: "%PYTHON%\\python.exe -m pip wheel . 
--wheel-dir=wheels" - - dir wheels - -artifacts: - - path: wheels\* diff --git a/stringcheese/pyahocorasick-1.4.0/benchmarks/benchmark.py b/stringcheese/pyahocorasick-1.4.0/benchmarks/benchmark.py deleted file mode 100644 index 796fb47..0000000 --- a/stringcheese/pyahocorasick-1.4.0/benchmarks/benchmark.py +++ /dev/null @@ -1,117 +0,0 @@ -from time import clock -from random import choice, randint, seed -from sys import stdout - -import ahocorasick - - -def write(str): - stdout.write(str) - stdout.flush() - - -def writeln(str): - stdout.write(str) - stdout.write('\n') - - -class ElapsedTime: - def __init__(self, msg): - self.msg = msg - - def __enter__(self): - write("%-40s: " % self.msg) - self.start = clock() - - def __exit__(self, a1, a2, a3): - self.stop = clock() - writeln("%0.3f s" % self.get_time()) - - def get_time(self): - return self.stop - self.start - - -class Test: - - def __init__(self, max_word_length, count): - self.min_word_length = 3 - self.max_word_length = max_word_length - self.count = count - self.words = [] - self.inexisting = [] - self.input = "" - - self.automaton = None - seed(0) # make sure that tests will be repeatable - - def init_data(self): - - def random_word(length): - chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" - return ''.join(choice(chars) for _ in xrange(length)) - - for i in xrange(self.count): - length = randint(self.min_word_length, self.max_word_length) - self.words.append(random_word(length)) - - length = randint(self.min_word_length, self.max_word_length) - self.inexisting.append(random_word(length)) - - - self.input = random_word(self.count) - - assert(len(self.words) == len(self.inexisting)) - - def add_words(self): - - self.automaton = ahocorasick.Automaton() - A = self.automaton - for word in self.words: - A.add_word(word, word) - - def build(self): - - self.automaton.make_automaton() - - def lookup(self): - - n = len(self.words) - - A = self.automaton - for i in xrange(n): - 
A.get(self.words[i]) - A.get(self.inexisting[i], "unknown") - - - def search(self): - - A = self.automaton - n = 0 - for item in A.iter(self.input): - n += 1 - - - def run(self): - - with ElapsedTime("Generating data (%d words)" % self.count): - self.init_data() - - with ElapsedTime("Add words"): - self.add_words() - - with ElapsedTime("Building automaton"): - self.build() - - with ElapsedTime("Look up"): - self.lookup() - - with ElapsedTime("Search"): - self.search() - -def main(): - - test = Test(32, 1000000) - test.run() - -if __name__ == '__main__': - main() diff --git a/stringcheese/pyahocorasick-1.4.0/benchmarks/benchmark3.py b/stringcheese/pyahocorasick-1.4.0/benchmarks/benchmark3.py deleted file mode 100644 index 5a49a47..0000000 --- a/stringcheese/pyahocorasick-1.4.0/benchmarks/benchmark3.py +++ /dev/null @@ -1,117 +0,0 @@ -from time import clock -from random import choice, randint, seed -from sys import stdout - -import ahocorasick - - -def write(str): - stdout.write(str) - stdout.flush() - - -def writeln(str): - stdout.write(str) - stdout.write('\n') - - -class ElapsedTime: - def __init__(self, msg): - self.msg = msg - - def __enter__(self): - write("%-40s: " % self.msg) - self.start = clock() - - def __exit__(self, a1, a2, a3): - self.stop = clock() - writeln("%0.3f s" % self.get_time()) - - def get_time(self): - return self.stop - self.start - - -class Test: - - def __init__(self, max_word_length, count): - self.min_word_length = 3 - self.max_word_length = max_word_length - self.count = count - self.words = [] - self.inexisting = [] - self.input = "" - - self.automaton = None - seed(0) # make sure that tests will be repeatable - - def init_data(self): - - def random_word(length): - chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" - return ''.join(choice(chars) for _ in range(length)) - - for i in range(self.count): - length = randint(self.min_word_length, self.max_word_length) - self.words.append(random_word(length)) - - length 
= randint(self.min_word_length, self.max_word_length) - self.inexisting.append(random_word(length)) - - - self.input = random_word(self.count) - - assert(len(self.words) == len(self.inexisting)) - - def add_words(self): - - self.automaton = ahocorasick.Automaton() - A = self.automaton - for word in self.words: - A.add_word(word, word) - - def build(self): - - self.automaton.make_automaton() - - def lookup(self): - - n = len(self.words) - - A = self.automaton - for i in range(n): - A.get(self.words[i]) - A.get(self.inexisting[i], "unknown") - - - def search(self): - - A = self.automaton - n = 0 - for item in A.iter(self.input): - n += 1 - - - def run(self): - - with ElapsedTime("Generating data (%d words)" % self.count): - self.init_data() - - with ElapsedTime("Add words"): - self.add_words() - - with ElapsedTime("Building automaton"): - self.build() - - with ElapsedTime("Look up"): - self.lookup() - - with ElapsedTime("Search"): - self.search() - -def main(): - - test = Test(32, 1000000) - test.run() - -if __name__ == '__main__': - main() diff --git a/stringcheese/pyahocorasick-1.4.0/benchmarks/results/python2-westmere.txt b/stringcheese/pyahocorasick-1.4.0/benchmarks/results/python2-westmere.txt deleted file mode 100644 index 48bbad5..0000000 --- a/stringcheese/pyahocorasick-1.4.0/benchmarks/results/python2-westmere.txt +++ /dev/null @@ -1,9 +0,0 @@ -CPU: Intel(R) Core(TM) i5 CPU M 540 @ 2.53GHz - -$ make benchmark -python2 benchmarks/benchmark.py stamp/build_py2 -Generating data (1000000 words) : 24.886 s -Add words : 4.627 s -Building automaton : 33.362 s -Look up : 5.946 s -Search : 1.762 s diff --git a/stringcheese/pyahocorasick-1.4.0/benchmarks/results/python3-broadwell-u.txt b/stringcheese/pyahocorasick-1.4.0/benchmarks/results/python3-broadwell-u.txt deleted file mode 100644 index 601bffc..0000000 --- a/stringcheese/pyahocorasick-1.4.0/benchmarks/results/python3-broadwell-u.txt +++ /dev/null @@ -1,8 +0,0 @@ -CPU: Intel(R) Core(TM) i7 CPU 5600U @ 2.60GHz 
-Python 3.5.2 64 bit - -Generating data (1000000 words) : 35.752 s -Add words : 2.833 s -Building automaton : 15.418 s -Look up : 2.667 s -Search : 0.740 s diff --git a/stringcheese/pyahocorasick-1.4.0/common.h b/stringcheese/pyahocorasick-1.4.0/common.h deleted file mode 100644 index 262ff29..0000000 --- a/stringcheese/pyahocorasick-1.4.0/common.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - This is part of pyahocorasick Python module. - - common definitions and includes - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -*/ - -#ifndef ahocorasick_common_h_included__ -#define ahocorasick_common_h_included__ - -#define PY_SSIZE_T_CLEAN -#include -#include // PyMemberDef - -#include - -#define DEBUG - -#if defined(_MSC_VER) // Visual Studio compiler -# include "windows.h" -#else -# if defined(__CYGWIN__) -# include "cygwin.h" -# else -# include "posix.h" -# endif -#endif - -#if PY_MAJOR_VERSION >= 3 - #define PY3K - #if PY_MINOR_VERSION >= 3 || PY_MAJOR_VERSION > 3 - #define PEP393 - #ifdef AHOCORASICK_UNICODE - #define PEP393_UNICODE - #endif - #endif -#else - #ifdef AHOCORASICK_UNICODE - #warning "No support for unicode in version for Python2" - #endif - #undef AHOCORASICK_UNICODE -#endif - -// setup supported character set -#ifdef AHOCORASICK_UNICODE -# if defined PEP393_UNICODE || defined Py_UNICODE_WIDE - // Either Python uses UCS-4 or we don't know what Python uses, - // but we use UCS-4 -# define TRIE_LETTER_TYPE uint32_t -# define TRIE_LETTER_SIZE 4 -# else - // Python use UCS-2 -# define TRIE_LETTER_TYPE uint16_t -# define TRIE_LETTER_SIZE 2 -# define VARIABLE_LEN_CHARCODES 1 -# endif -#else - // only bytes are supported -# define TRIE_LETTER_TYPE uint16_t -# define TRIE_LETTER_SIZE 2 -#endif - -#ifdef __GNUC__ -# define LIKELY(x) __builtin_expect(x, 1) -# define UNLIKELY(x) __builtin_expect(x, 0) -# define ALWAYS_INLINE __attribute__((always_inline)) -# define PURE __attribute__((pure)) -# define UNUSED 
__attribute__((unused)) -#else -# define LIKELY(x) x -# define UNLIKELY(x) x -# define ALWAYS_INLINE -# define PURE -# define UNUSED -#endif - -#ifdef DEBUG -# include -# define ASSERT(expr) do {if (!(expr)) {fprintf(stderr, "%s:%s:%d - %s failed!\n", __FILE__, __FUNCTION__, __LINE__, #expr); fflush(stderr); exit(1);} }while(0) -#else -# define ASSERT(expr) -#endif - -#if defined(PYCALLS_INJECT_FAULTS) && defined(PY3K) -# include "src/pycallfault/pycallfault.h" -#else -# define F(name) name -#endif - -typedef char bool; -#define true 1 -#define false 0 - -#endif diff --git a/stringcheese/pyahocorasick-1.4.0/cygwin.h b/stringcheese/pyahocorasick-1.4.0/cygwin.h deleted file mode 100644 index 22d404c..0000000 --- a/stringcheese/pyahocorasick-1.4.0/cygwin.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - This is part of pyahocorasick Python module. - - CYGWIN declarations. - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : BSD-3-Clause (see LICENSE) -*/ - -#ifndef PYAHCORASICK_CYGWIN_H__ -#define PYAHCORASICK_CYGWIN_H__ - -#define PY_OBJECT_HEAD_INIT PyVarObject_HEAD_INIT(NULL, 0) - -#endif - diff --git a/stringcheese/pyahocorasick-1.4.0/docs/.gitignore b/stringcheese/pyahocorasick-1.4.0/docs/.gitignore deleted file mode 100644 index ba65b13..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/_build/ diff --git a/stringcheese/pyahocorasick-1.4.0/docs/Makefile b/stringcheese/pyahocorasick-1.4.0/docs/Makefile deleted file mode 100644 index a81744e..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/Makefile +++ /dev/null @@ -1,225 +0,0 @@ -# Makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -PAPER = -BUILDDIR = _build - -# Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
-# the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . - -.PHONY: help -help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " singlehtml to make a single large HTML file" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " applehelp to make an Apple Help Book" - @echo " devhelp to make HTML files and a Devhelp project" - @echo " epub to make an epub" - @echo " epub3 to make an epub3" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " latexpdf to make LaTeX files and run them through pdflatex" - @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" - @echo " text to make text files" - @echo " man to make manual pages" - @echo " texinfo to make Texinfo files" - @echo " info to make Texinfo files and run them through makeinfo" - @echo " gettext to make PO message catalogs" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " xml to make Docutils-native XML files" - @echo " pseudoxml to make pseudoxml-XML files for display purposes" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - @echo " coverage to run coverage check of the documentation (if enabled)" - @echo " dummy to check syntax errors of document sources" - -.PHONY: clean -clean: - rm -rf $(BUILDDIR)/* - -.PHONY: html -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
- -.PHONY: dirhtml -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." - -.PHONY: singlehtml -singlehtml: - $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml - @echo - @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." - -.PHONY: pickle -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -.PHONY: json -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -.PHONY: htmlhelp -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." - -.PHONY: qthelp -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pyahocorasick.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pyahocorasick.qhc" - -.PHONY: applehelp -applehelp: - $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp - @echo - @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." - @echo "N.B. You won't be able to view it unless you put it in" \ - "~/Library/Documentation/Help or install it in your application" \ - "bundle." - -.PHONY: devhelp -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." 
- @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/pyahocorasick" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pyahocorasick" - @echo "# devhelp" - -.PHONY: epub -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." - -.PHONY: epub3 -epub3: - $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 - @echo - @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." - -.PHONY: latex -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -.PHONY: latexpdf -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -.PHONY: latexpdfja -latexpdfja: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through platex and dvipdfmx..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -.PHONY: text -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -.PHONY: man -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -.PHONY: texinfo -texinfo: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo - @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." - @echo "Run \`make' in that directory to run these through makeinfo" \ - "(use \`make info' here to do that automatically)." 
- -.PHONY: info -info: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo "Running Texinfo files through makeinfo..." - make -C $(BUILDDIR)/texinfo info - @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." - -.PHONY: gettext -gettext: - $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale - @echo - @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." - -.PHONY: changes -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." - -.PHONY: linkcheck -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -.PHONY: doctest -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." - -.PHONY: coverage -coverage: - $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage - @echo "Testing of coverage in the sources finished, look at the " \ - "results in $(BUILDDIR)/coverage/python.txt." - -.PHONY: xml -xml: - $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml - @echo - @echo "Build finished. The XML files are in $(BUILDDIR)/xml." - -.PHONY: pseudoxml -pseudoxml: - $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml - @echo - @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." - -.PHONY: dummy -dummy: - $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy - @echo - @echo "Build finished. Dummy builder generates no files." 
diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton___reduce__.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton___reduce__.rst deleted file mode 100644 index 67e954d..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton___reduce__.rst +++ /dev/null @@ -1,4 +0,0 @@ -__reduce__() ----------------------------------------------------------------------- - -Return pickle-able data for this automaton instance. diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton___sizeof__.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton___sizeof__.rst deleted file mode 100644 index dd907db..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton___sizeof__.rst +++ /dev/null @@ -1,3 +0,0 @@ -Return the approximate size in bytes occupied by the Automaton instance in -memory excluding the size of associated objects when the Automaton is created -with Automaton() or Automaton(ahocorasick.STORE_ANY). diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_add_word.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_add_word.rst deleted file mode 100644 index 3084066..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_add_word.rst +++ /dev/null @@ -1,68 +0,0 @@ -add_word(key, [value]) -> boolean --------------------------------------------------------------------------------- - -Add a key string to the dict-like trie and associate this key with a value. -value is optional or mandatory depending how the ``Automaton`` instance was -created. Return True if the word key is inserted and did not exists in the -trie or False otherwise. The value associated with an existing word is replaced. - -The value is either mandatory or optional: - -- If the Automaton was created without argument (the default) as ``Automaton()`` - or with ``Automaton(ahocorasik.STORE_ANY)`` then the value is required and can - be any Python object. -- If the Automaton was created with ``Automaton(ahocorasik.STORE_INTS)`` then the - value is optional. 
If provided it must be an integer, otherwise it defaults to - ``len(automaton)`` which is therefore the order index in which keys are added - to the trie. -- If the Automaton was created with ``Automaton(ahocorasik.STORE_LENGTH)`` then - associating a value is not allowed - ``len(word)`` is saved automatically as - a value instead. - -Calling add_word() invalidates all iterators only if the new key did not exist -in the trie so far (i.e. the method returned True). - -Examples -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - >>> import ahocorasick - >>> A = ahocorasick.Automaton() - >>> A.add_word("pyahocorasick") - Traceback (most recent call last): - File "", line 1, in - ValueError: A value object is required as second argument. - >>> A.add_word("pyahocorasick", (42, 'text')) - True - >>> A.get("pyhocorasick") - (42, 'text') - >>> A.add_word("pyahocorasick", 12) - False - >>> A.get("pyhocorasick") - 12 - -.. code:: python - - >>> import ahocorasick - >>> B = ahocorasick.Automaton(ahocorasick.STORE_INTS) - >>> B.add_word("cat") - True - >>> B.get() - Traceback (most recent call last): - File "", line 1, in - IndexError: tuple index out of range - >>> B.get("cat") - 1 - >>> B.add_word("dog") - True - >>> B.get("dog") - 2 - >>> B.add_word("tree", 42) - True - >>> B.get("tree") - 42 - >>> B.add_word("cat", 43) - False - >>> B.get("cat") - 43 diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_clear.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_clear.rst deleted file mode 100644 index 7ce3ffa..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_clear.rst +++ /dev/null @@ -1,23 +0,0 @@ -clear() ----------------------------------------------------------------------- - -Remove all keys from the trie. This method invalidates all iterators. - -Examples -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
code:: python - - >>> import ahocorasick - >>> A = ahocorasick.Automaton() - >>> A.add_word("cat", 1) - True - >>> A.add_word("dog", 2) - True - >>> A.add_word("elephant", 3) - True - >>> len(A) - 3 - >>> A.clear() - >>> len(A) - 0 diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_constructor.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_constructor.rst deleted file mode 100644 index 552dfd4..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_constructor.rst +++ /dev/null @@ -1,36 +0,0 @@ -Automaton(value_type=ahocorasick.STORE_ANY, [key_type]) --------------------------------------------------------------------------------- - -Create a new empty Automaton. Both ``value_type`` and ``key_type`` are optional. - -``value_type`` is one of these constants: - -- ahocorasick.STORE_ANY [*default*] : The associated value can be any Python object. -- ahocorasick.STORE_LENGTH : The length of an added string key is automatically - used as the associated value stored in the trie for that key. -- ahocorasick.STORE_INTS : The associated value must be a 32-bit integer. - -``key_type`` defines the type of data that can be stored in an automaton; it is one of -these constants and defines type of data might be stored: - -- ahocorasick.KEY_STRING [*default*] : string -- ahocorasick.KEY_SEQUENCE : sequences of integers; The size of integer depends - the version and platform Python, but for versions of Python >= 3.3, it is - guaranteed to be 32-bits. - - -Examples -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
code:: python - - >>> import ahocorasick - >>> A = ahocorasick.Automaton() - >>> A - - >>> B = ahocorasick.Automaton(ahocorasick.STORE_ANY) - >>> B - - >>> C = ahocorasick.Automaton(ahocorasick.STORE_INTS, ahocorasick.KEY_STRING) - >>> C - diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_dump.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_dump.rst deleted file mode 100644 index 90dd3e6..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_dump.rst +++ /dev/null @@ -1,13 +0,0 @@ -dump() ----------------------------------------------------------------------- - -Return a three-tuple of lists describing the Automaton as a graph -of **nodes**, **edges**, **failure links**. - -- nodes: each item is a pair (node id, end of word marker) -- edges: each item is a triple (node id, label char, child node id) -- failure links: each item is a pair (source node id, node if connected - by fail node) - -For each of these, the node id is a unique number and a label is -a number. diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_exists.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_exists.rst deleted file mode 100644 index 0e6e0f5..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_exists.rst +++ /dev/null @@ -1,22 +0,0 @@ -exists(key) -> boolean ----------------------------------------------------------------------- - -Return True if the ``key`` is present in the trie. Same as using the 'in' keyword. - -Examples -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
code:: python - - >>> import ahocorasick - >>> A = ahocorasick.Automaton() - >>> A.add_word("cat", 1) - True - >>> A.exists("cat") - True - >>> A.exists("dog") - False - >>> 'elephant' in A - False - >>> 'cat' in A - True diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_find_all.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_find_all.rst deleted file mode 100644 index 3b8c7a0..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_find_all.rst +++ /dev/null @@ -1,15 +0,0 @@ -find_all(string, callback, [start, [end]]) ----------------------------------------------------------------------- - -Perform the Aho-Corasick search procedure using the provided input ``string`` -and iterate over the matching tuples (``end_index``, ``value``) for keys found -in string. Invoke the ``callback`` callable for each matching tuple. - -The callback callable must accept two positional arguments: -- end_index is the end index in the input string where a trie key string was found. -- value is the value associated with the found key string. - -The start and end optional arguments can be used to limit the search to an -input string slice as in string[start:end]. - -Equivalent to a loop on iter() calling a callable at each iteration. diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_get.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_get.rst deleted file mode 100644 index 797b485..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_get.rst +++ /dev/null @@ -1,27 +0,0 @@ -get(key[, default]) ----------------------------------------------------------------------- - -Return the value associated with the key string. - -Raise a ``KeyError`` exception if the key is not in the trie and no default is provided. - -Return the optional default value if provided and the key is not in the trie. - -Example -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
code:: python - - >>> import ahocorasick - >>> A = ahocorasick.Automaton() - >>> A.add_word("cat", 42) - True - >>> A.get("cat") - 42 - >>> A.get("dog") - Traceback (most recent call last): - File "", line 1, in - KeyError - >>> A.get("dog", "good dog") - 'good dog' - diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_get_stats.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_get_stats.rst deleted file mode 100644 index 4443905..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_get_stats.rst +++ /dev/null @@ -1,28 +0,0 @@ -get_stats() -> dict ----------------------------------------------------------------------- - -Return a dictionary containing Automaton statistics. - -- *nodes_count* - total number of nodes -- *words_count* - number of distinct words (same as ``len(automaton)``) -- *longest_word* - length of the longest word -- *links_count* - number of edges -- *sizeof_node* - size of single node in bytes -- *total_size* - total size of trie in bytes (about - nodes_count * size_of node + links_count * size of pointer). - -Examples -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - >>> import ahocorasick - >>> A = ahocorasick.Automaton() - >>> A.add_word("he", None) - True - >>> A.add_word("her", None) - True - >>> A.add_word("hers", None) - True - >>> A.get_stats() - {'nodes_count': 5, 'words_count': 3, 'longest_word': 4, 'links_count': 4, 'sizeof_node': 40, 'total_size': 232} diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_items.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_items.rst deleted file mode 100644 index 93d0f81..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_items.rst +++ /dev/null @@ -1,6 +0,0 @@ -items([prefix, [wildcard, [how]]]) ----------------------------------------------------------------------- - -Return an iterator on tuples of (key, value). -Keys are matched optionally to the prefix using the same logic -and arguments as in the keys() method. 
diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_iter.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_iter.rst deleted file mode 100644 index 150a9f2..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_iter.rst +++ /dev/null @@ -1,17 +0,0 @@ -iter(string, [start, [end]], ignore_white_space=False) ----------------------------------------------------------------------- - -Perform the Aho-Corasick search procedure using the provided input string. - -Return an iterator of tuples (``end_index``, ``value``) for keys found in -string where: - -- ``end_index`` is the end index in the input string where a trie key - string was found. -- ``value`` is the value associated with the found key string. - -The ``start`` and ``end`` optional arguments can be used to limit the search -to an input string slice as in ``string[start:end]``. - -The ``ignore_white_space`` optional arguments can be used to ignore white -spaces from input string. diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_iter_long.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_iter_long.rst deleted file mode 100644 index 93d3494..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_iter_long.rst +++ /dev/null @@ -1,44 +0,0 @@ -iter_long(string, [start, [end]]) ----------------------------------------------------------------------- - -Perform the modified Aho-Corasick search procedure which matches -the longest words from set. - -Return an iterator of tuples (``end_index``, ``value``) for keys found in -string where: - -- ``end_index`` is the end index in the input string where a trie key - string was found. -- ``value`` is the value associated with the found key string. - -The ``start`` and ``end`` optional arguments can be used to limit the search -to an input string slice as in ``string[start:end]``. 
- - -Example -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The default Aho-Corasick algorithm returns all occurrences of words stored -in the automaton, including substring of other words from string. Method -``iter_long`` reports only the longest match. - -For set of words {"he", "her", "here"} and a needle "he here her" the -default algorithm finds following words: "he", "he", "her", "here", "he", -"her", while the modified one yields only: "he", "here", "her". - -.. code:: python - - >>> import ahocorasick - >>> A = ahocorasick.Automaton() - >>> A.add_word("he", "he") - True - >>> A.add_word("her", "her") - True - >>> A.add_word("here", "here") - True - >>> A.make_automaton() - >>> needle = "he here her" - >>> list(A.iter_long(needle)) - [(1, 'he'), (6, 'here'), (10, 'her')] - >>> list(A.iter(needle)) - [(1, 'he'), (4, 'he'), (5, 'her'), (6, 'here'), (9, 'he'), (10, 'her')] diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_keys.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_keys.rst deleted file mode 100644 index ca2598d..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_keys.rst +++ /dev/null @@ -1,20 +0,0 @@ -keys([prefix, [wildcard, [how]]]) ----------------------------------------------------------------------- - -Return an iterator on keys. -If the optional ``prefix`` string is provided, only yield keys starting -with this prefix. - -If the optional ``wildcard`` is provided as a single character string, -then the prefix is treated as a simple pattern using this character -as a wildcard. - -The optional ``how`` argument is used to control how strings are matched -using one of these possible values: - -- **ahocorasick.MATCH_EXACT_LENGTH** (default) - Yield matches that have the same exact length as the prefix length. -- **ahocorasick.MATCH_AT_LEAST_PREFIX** - Yield matches that have a length greater or equal to the prefix length. 
-- **ahocorasick.MATCH_AT_MOST_PREFIX** - Yield matches that have a length lesser or equal to the prefix length. diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_len.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_len.rst deleted file mode 100644 index ab86757..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_len.rst +++ /dev/null @@ -1,22 +0,0 @@ -len() -> integer ----------------------------------------------------------------------- - -Return the number of distinct keys added to the trie. - -Examples -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - >>> import ahocorasick - >>> A = ahocorasick.Automaton() - >>> len(A) - 0 - >>> A.add_word("python", 1) - True - >>> len(A) - 1 - >>> A.add_word("elephant", True) - True - >>> len(A) - 2 diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_longest_prefix.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_longest_prefix.rst deleted file mode 100644 index 14c10d4..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_longest_prefix.rst +++ /dev/null @@ -1,22 +0,0 @@ -longest_prefix(string) => integer ----------------------------------------------------------------------- - -Return the length of the longest prefix of string that exists in the trie. - -Examples -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
code:: python - - >>> import ahocorasick - >>> A = ahocorasick.Automaton() - >>> A.add_word("he", True) - True - >>> A.add_word("her", True) - True - >>> A.add_word("hers", True) - True - >>> A.longest_prefix("she") - 0 - >>> A.longest_prefix("herself") - 4 diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_make_automaton.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_make_automaton.rst deleted file mode 100644 index 95beb71..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_make_automaton.rst +++ /dev/null @@ -1,6 +0,0 @@ -make_automaton() ----------------------------------------------------------------------- - -Finalize and create the Aho-Corasick automaton based on the keys already added -to the trie. This does not require additional memory. After successful creation -the ``Automaton.kind`` attribute is set to ``ahocorasick.AHOCORASICK``. diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_match.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_match.rst deleted file mode 100644 index d938018..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_match.rst +++ /dev/null @@ -1,36 +0,0 @@ -match(key) -> bool ----------------------------------------------------------------------- - -Return True if there is a prefix (or key) equal to key present in the trie. - -For example if the key 'example' has been added to the trie, then calls to -match('e'), match('ex'), ..., match('exampl') or match('example') all return -True. But exists() is True only when calling exists('example'). - -Examples -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
code:: python - - >>> import ahocorasick - >>> A = ahocorasick.Automaton() - >>> A.add_word("example", True) - True - >>> A.match("e") - True - >>> A.match("ex") - True - >>> A.match("exa") - True - >>> A.match("exam") - True - >>> A.match("examp") - True - >>> A.match("exampl") - True - >>> A.match("example") - True - >>> A.match("examples") - False - >>> A.match("python") - False diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_pop.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_pop.rst deleted file mode 100644 index 09f0cb9..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_pop.rst +++ /dev/null @@ -1,29 +0,0 @@ -pop(word) --------------------------------------------------------------------------------- - -Remove given word from a trie and return associated values. Raise a ``KeyError`` -if the word was not found. - -Examples -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: python - - >>> import ahocorasick - >>> A = ahocorasick.Automaton() - >>> A.add_word("cat", 1) - True - >>> A.add_word("dog", 2) - True - >>> A.pop("elephant") - Traceback (most recent call last): - File "", line 1, in - KeyError - >>> A.pop("cat") - 1 - >>> A.pop("dog") - 2 - >>> A.pop("cat") - Traceback (most recent call last): - File "", line 1, in - KeyError diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_remove_word.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_remove_word.rst deleted file mode 100644 index 45e2c1f..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_remove_word.rst +++ /dev/null @@ -1,25 +0,0 @@ -remove_word(word) -> bool --------------------------------------------------------------------------------- - -Remove given word from a trie. Return True if words was found, False otherwise. - -Examples -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
code:: python - - >>> import ahocorasick - >>> A = ahocorasick.Automaton() - >>> A.add_word("cat", 1) - True - >>> A.add_word("dog", 2) - True - >>> A.remove_word("cat") - True - >>> A.remove_word("cat") - False - >>> A.remove_word("dog") - True - >>> A.remove_word("dog") - False - >>> diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_save.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_save.rst deleted file mode 100644 index 29eaa43..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_save.rst +++ /dev/null @@ -1,8 +0,0 @@ -save(path, serializer) ----------------------------------------------------------------------- - -Save content of automaton in an on-disc file. - -``Serializer`` is a callable object that is used when automaton store -type is ``STORE_ANY``. This method converts a python object into -bytes; it can be ``pickle.dumps``. diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_search_iter.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_search_iter.rst deleted file mode 100644 index 027f1a9..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_search_iter.rst +++ /dev/null @@ -1,3 +0,0 @@ -This class is not available directly but instances of AutomatonSearchIter -are returned by the iter() method of an Automaton. This iterator can be -manipulated through its set() method. diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_search_iter_set.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_search_iter_set.rst deleted file mode 100644 index 8af5b08..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_search_iter_set.rst +++ /dev/null @@ -1,7 +0,0 @@ -set(string, reset=False) ----------------------------------------------------------------------- - -Set a new string to search. When the reset argument is False (default) -then the Aho-Corasick procedure is continued and the internal state of the -Automaton and end index of the string being searched are not reset. 
This allow -to search for large strings in multiple smaller chunks. diff --git a/stringcheese/pyahocorasick-1.4.0/docs/automaton_values.rst b/stringcheese/pyahocorasick-1.4.0/docs/automaton_values.rst deleted file mode 100644 index 862c37a..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/automaton_values.rst +++ /dev/null @@ -1,6 +0,0 @@ -values([prefix, [wildcard, [how]]]) ----------------------------------------------------------------------- - -Return an iterator on values associated with each keys. -Keys are matched optionally to the prefix using the same logic -and arguments as in the keys() method. diff --git a/stringcheese/pyahocorasick-1.4.0/docs/conf.py b/stringcheese/pyahocorasick-1.4.0/docs/conf.py deleted file mode 100644 index 47a8cbe..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/conf.py +++ /dev/null @@ -1,345 +0,0 @@ -# -*- coding: utf-8 -*- -# -# pyahocorasick documentation build configuration file, created by -# sphinx-quickstart on Fri Jul 29 14:38:56 2016. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -import os -import sys -sys.path.insert(0, os.path.abspath('../') ) - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. 
-extensions = [ - 'sphinx.ext.autodoc', - # see https://sphinxcontrib-napoleon.readthedocs.io/en/latest/ - # we use this for better docstrings - 'sphinx.ext.napoleon' -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = '.rst' - -# The encoding of source files. -# -source_encoding = 'utf-8' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'ahocorasick' -copyright = u'2019, Wojciech Muła' -author = u'Wojciech Muła' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -#version = u'1.1.0' -# The full version, including alpha/beta/rc tags. -#release = version - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -# -# today = '' -# -# Else, today_fmt is used as the format for a strftime call. -# -# today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'bin', 'include', 'lib'] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -# -# default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. 
-# -# add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -# -# add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -# -# show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -# modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -# keep_warnings = False - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -#html_theme = 'alabaster' # for unknown reasons, this theme has empty "Navigation" bar, makes documentation useless -html_theme = 'classic' -# html_theme = 'sphinx_rtd_theme' - - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -# html_theme_path = [] - -# The name for this set of Sphinx documents. -# " v documentation" by default. -# -# html_title = u'pyahocorasick v1.1.0' - -# A shorter title for the navigation bar. Default is the same as html_title. -# -# html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -# -# html_logo = None - -# The name of an image file (relative to this directory) to use as a favicon of -# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. 
-# -# html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -# html_static_path = ['_static'] - -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -# -# html_extra_path = [] - -# If not None, a 'Last updated on:' timestamp is inserted at every page -# bottom, using the given strftime format. -# The empty string is equivalent to '%b %d, %Y'. -# -# html_last_updated_fmt = None - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -# -# html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -# -# html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -# -# html_additional_pages = {} - -# If false, no module index is generated. -# -# html_domain_indices = True - -# If false, no index is generated. -# -html_use_index = False - -# If true, the index is split into individual pages for each letter. -# -# html_split_index = False - -# If true, links to the reST sources are added to the pages. -html_show_sourcelink = False - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -# -# html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -# -# html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -# -# html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). 
-# html_file_suffix = None - -# Language to be used for generating the HTML full-text search index. -# Sphinx supports the following languages: -# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' -# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' -# -# html_search_language = 'en' - -# A dictionary with options for the search language support, empty by default. -# 'ja' uses this config value. -# 'zh' user can custom change `jieba` dictionary path. -# -# html_search_options = {'type': 'default'} - -# The name of a javascript file (relative to the configuration directory) that -# implements a search results scorer. If empty, the default will be used. -# -# html_search_scorer = 'scorer.js' - -# Output file base name for HTML help builder. -htmlhelp_basename = 'pyahocorasickdoc' - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # - # 'preamble': '', - - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, 'pyahocorasick.tex', u'pyahocorasick Documentation', - u'Wojciech Muła', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -# -# latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -# -# latex_use_parts = False - -# If true, show page references after internal links. -# -# latex_show_pagerefs = False - -# If true, show URL addresses after external links. -# -# latex_show_urls = False - -# Documents to append as an appendix to all manuals. 
-# -# latex_appendices = [] - -# It false, will not define \strong, \code, itleref, \crossref ... but only -# \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added -# packages. -# -# latex_keep_old_macro_names = True - -# If false, no module index is generated. -# -# latex_domain_indices = True - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'pyahocorasick', u'pyahocorasick Documentation', - [author], 1) -] - -# If true, show URL addresses after external links. -# -# man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'pyahocorasick', u'pyahocorasick Documentation', - author, 'pyahocorasick', 'One line description of project.', - 'Miscellaneous'), -] - -# Documents to append as an appendix to all manuals. -# -# texinfo_appendices = [] - -# If false, no module index is generated. -# -# texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -# -# texinfo_show_urls = 'footnote' - -# If true, do not generate a @detailmenu in the "Top" node's menu. -# -# texinfo_no_detailmenu = False diff --git a/stringcheese/pyahocorasick-1.4.0/docs/index.rst b/stringcheese/pyahocorasick-1.4.0/docs/index.rst deleted file mode 100644 index 4be4ff2..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/index.rst +++ /dev/null @@ -1,375 +0,0 @@ - -.. include:: ../README.rst - - -API Overview -============ - -This is a quick tour of the API for the C **ahocorasick** module. -See the full API doc for more details. The pure Python module has a slightly different interface. 
- -The module ``ahocorasick`` contains a few constants and the main ``Automaton`` class. - - -Module constants ----------------- - - - ``ahocorasick.unicode`` --- see `Unicode and bytes`_ - - - ``ahocorasick.STORE_ANY``, ``ahocorasick.STORE_INTS``, - ``ahocorasick.STORE_LENGTH`` --- see `Automaton class`_ - - - ``ahocorasick.KEY_STRING`` ``ahocorasick.KEY_SEQUENCE`` - --- see `Automaton class`_ - - - ``ahocorasick.EMPTY``, ``ahocorasick.TRIE``, ``ahocorasick.AHOCORASICK`` - --- see `Automaton Attributes`_ - - - ``ahocorasick.MATCH_EXACT_LENGTH``, ``ahocorasick.MATCH_AT_MOST_PREFIX``, - ``ahocorasick.MATCH_AT_LEAST_PREFIX`` --- see description of the keys method - - -Automaton class ---------------- - -Note: ``Automaton`` instances are `pickle-able `_ -meaning that you can create ahead of time an eventually large automaton then save it to disk -and re-load it later to reuse it over and over as a persistent multi-string search index. -Internally, Automaton implements the ``__reduce__() magic method``. - - -``Automaton([value_type], [key_type])`` - - Create a new empty Automaton optionally passing a `value_type` to indicate - what is the type of associated values (default to any Python object type). - It can be one of ``ahocorasick.STORE_ANY``, ``ahocorasick.STORE_INTS`` or - ``ahocorasick.STORE_LENGTH``. In the last case the length of the key will - be stored in the automaton. The optional argument `key_type` can be - ``ahocorasick.KEY_STRING`` or ``ahocorasick.KEY_SEQUENCE``. In the latter - case keys will be tuples of integers. The size of integer depends on the - version and platform Python is running on, but for versions of Python >= - 3.3, it is guaranteed to be 32-bits. - -Automaton Trie methods ----------------------- - -The Automaton class has the following main trie-like methods: - -``add_word(key, [value]) => bool`` - Add a ``key`` string to the dict-like trie and associate this key with a ``value``. 
- -``remove_word(key) => bool`` - Remove a ``key`` string from the dict-like trie. - -``pop(key) => value`` - Remove a ``key`` string from the dict-like trie and return the associated ``value``. - -``exists(key) => bool`` or ``key in ...`` - Return True if the key is present in the trie. - -``match(key) => bool`` - Return True if there is a prefix (or key) equal to ``key`` present in the trie. - - -Automaton Dictionary-like methods ---------------------------------- - -A pyahocorasick Automaton trie behaves more or less like a Python dictionary and -implements a subset of dict-like methods. Some of them are: - -``get(key[, default])`` - Return the value associated with the ``key`` string. Similar to `dict.get()`. - -``keys([prefix, [wildcard, [how]]]) => yield strings`` - Return an iterator on keys. - -``values([prefix, [wildcard, [how]]]) => yield object`` - Return an iterator on values associated with each keys. - -``items([prefix, [wildcard, [how]]]) => yield tuple (string, object)`` - Return an iterator on tuples of (key, value). - -Wildcard search -~~~~~~~~~~~~~~~ - -The methods ``keys``, ``values`` and ``items`` can be called with an optional -**wildcard**. A wildcard character is equivalent to a question mark used in glob -patterns (?) or a dot (.) in regular expressions. You can use any character you -like as a wildcard. - -Note that it is not possible to escape a wildcard to match it exactly. -You need instead to select another wildcard character not present in the -provided prefix. For example:: - - automaton.keys("hi?", "?") # would match "him", "his" - automaton.keys("XX?", "X") # would match "me?", "he?" or "it?" - -Aho-Corasick methods --------------------- - -The Automaton class has the following main Aho-Corasick methods: - -``make_automaton()`` - Finalize and create the Aho-Corasick automaton. - -``iter(string, [start, [end]])`` - Perform the Aho-Corasick search procedure using the provided input ``string``. 
- Return an iterator of tuples (end_index, value) for keys found in string. - -``iter_long(string, [start, [end]])`` - Returns iterator (object of class AutomatonSearchIterLong) that - searches for longest, non-overlapping matches. - -AutomatonSearchIter class -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Instances of this class are returned by the ``iter`` method of an ``Automaton``. -This iterator can be manipulated through its `set()` method. - -``set(string, [reset]) => None`` - Set a new string to search eventually keeping the current Automaton state to - continue searching for the next chunk of a string. - - For example:: - - >>> it = A.iter(b"") - >>> while True: - ... buffer = receive(server_address, 4096) - ... if not buffer: - ... break - ... it.set(buffer) - ... for index, value in it: - ... print(index, '=>', value) - - When ``reset`` is ``True`` then processing is restarted. For example this code:: - - >>> for string in string_set: - ... for index, value in A.iter(string) - ... print(index, '=>', value) - - does the same job as:: - - >>> it = A.iter(b"") - >>> for string in string_set: - ... it.set(it, True) - ... for index, value in it: - ... print(index, '=>', value) - - -Automaton Attributes --------------------- - -The Automaton class has the following attributes: - -``kind`` [readonly] - Return the state of the ``Automaton`` instance. - -``store`` [readonly] - Return the type of values stored in the Automaton as specified at creation. - - -Saving and loading automaton ----------------------------- - -There is support for two method of saving and loading an automaton: - -* the standard ``pickle`` protocol, -* custom ``save`` and ``load`` methods. - -While pickling is more convenient to use, it has quite high memory -requirements. The ``save``/``load`` method try to overcome this -problem. - -.. warning:: - - Neither format of pickle nor save are safe. Although there are - a few sanity checks, they are not sufficient to detect all - possible input errors. 
- - -Pickle -~~~~~~ - -.. code:: python - - import ahocorasick - import pickle - - # build automaton - - A = ahocorasick.Automaton() - # ... A.add_data, A.make_automaton - - # save current state - with open(path, 'wb') as f: - pickle.dump(A, f) - - # load saved state - with open(path, 'rb') as f: - B = pickle.load(f) - - -Save/load methods -~~~~~~~~~~~~~~~~~ - -.. code:: python - - import ahocorasick - import pickle - - # build automaton - - A = ahocorasick.Automaton() - # ... A.add_data, A.make_automaton - - # save current state - A.save(path, pickle.dumps) - - # load saved state - B = ahocorasick.load(path, pickle.loads) - - -Automaton method ``save`` requires ``path`` to the file which will store data. -If the automaton type is ``STORE_ANY``, i.e. values associated with words are -any python objects, then ``save`` requires also another argument, a callable. -The callable serializes python object into bytes; in the example above we -use standard pickle ``dumps`` function. - -Module method ``load`` also requires ``path`` to file that has data previously -saved. Because at the moment of loading data we don't know what is the store -attribute of automaton, the second argument - a callable - is required. The -callable must convert back given bytes object into python value, that will be -stored in automaton. Similarly, standard ``pickle.loads`` function can be passed. - - -Other Automaton methods ------------------------ - -The Automaton class has a few other interesting methods: - -``dump() => (list of nodes, list of edges, list of fail links)`` - Return a three-tuple of lists describing the Automaton as a graph of - (nodes, edges, failure links). - The source repository and source package also contains the ``dump2dot.py`` - script that converts ``dump()`` results to a `graphviz `_ dot - format for convenient visualization of the trie and Automaton data structure. - -``get_stats() => dict`` - Return a dictionary containing Automaton statistics. 
- Note that the real size occupied by the data structure could be larger because - of `internal memory fragmentation `_ - that can occur in a memory manager. - -``__sizeof__() => int`` - Return the approximate size in bytes occupied by the Automaton instance. - Also available by calling sys.getsizeof(automaton instance). - - -Examples -======== - -:: - - >>> import ahocorasick - >>> A = ahocorasick.Automaton() - - >>> # add some key words to trie - >>> for index, word in enumerate('he her hers she'.split()): - ... A.add_word(word, (index, word)) - - >>> # test that these key words exists in the trie all right - >>> 'he' in A - True - >>> 'HER' in A - False - >>> A.get('he') - (0, 'he') - >>> A.get('she') - (3, 'she') - >>> A.get('cat', '') - '' - >>> A.get('dog') - Traceback (most recent call last): - File "", line 1, in - KeyError - >>> A.remove_word('he') - True - >>> A.remove_word('he') - False - >>> A.pop('she') - (3, 'she') - >>> 'she' in A - False - - >>> # convert the trie in an Aho-Corasick automaton - >>> A = ahocorasick.Automaton() - >>> for index, word in enumerate('he her hers she'.split()): - ... A.add_word(word, (index, word)) - >>> A.make_automaton() - - >>> # then find all occurrences of the stored keys in a string - >>> for item in A.iter('_hershe_'): - ... print(item) - ... - (2, (0, 'he')) - (3, (1, 'her')) - (4, (2, 'hers')) - (6, (3, 'she')) - (6, (0, 'he')) - - -Example of the keys method behavior ------------------------------------ - -:: - - >>> import ahocorasick - >>> A = ahocorasick.Automaton() - - >>> # add some key words to trie - >>> for index, word in enumerate('cat catastropha rat rate bat'.split()): - ... A.add_word(word, (index, word)) - - >>> # Search some prefix - >>> list(A.keys('cat')) - ['cat', 'catastropha'] - - >>> # Search with a wildcard: here '?' is used as a wildcard. You can use any character you like. 
- >>> list(A.keys('?at', '?', ahocorasick.MATCH_EXACT_LENGTH)) - ['bat', 'cat', 'rat'] - - >>> list(A.keys('?at?', '?', ahocorasick.MATCH_AT_MOST_PREFIX)) - ['bat', 'cat', 'rat', 'rate'] - - >>> list(A.keys('?at?', '?', ahocorasick.MATCH_AT_LEAST_PREFIX)) - ['rate'] - - -API Reference -============= - -.. include:: automaton_constructor.rst -.. include:: automaton_add_word.rst -.. include:: automaton_exists.rst -.. include:: automaton_get.rst -.. include:: automaton_longest_prefix.rst -.. include:: automaton_match.rst -.. include:: automaton_len.rst -.. include:: automaton_remove_word.rst -.. include:: automaton_pop.rst -.. include:: automaton_clear.rst -.. include:: automaton_keys.rst -.. include:: automaton_items.rst -.. include:: automaton_values.rst -.. include:: automaton_make_automaton.rst -.. include:: automaton_iter.rst -.. include:: automaton_iter_long.rst -.. include:: automaton_find_all.rst -.. include:: automaton___reduce__.rst -.. include:: automaton_save.rst -.. include:: module_load.rst -.. include:: automaton___sizeof__.rst -.. include:: automaton_get_stats.rst -.. include:: automaton_dump.rst -.. include:: automaton_search_iter_set.rst - diff --git a/stringcheese/pyahocorasick-1.4.0/docs/module.rst b/stringcheese/pyahocorasick-1.4.0/docs/module.rst deleted file mode 100644 index 438ced4..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/module.rst +++ /dev/null @@ -1,3 +0,0 @@ -**pyahocorasick** is a fast and memory efficient library for exact or -approximate multi-pattern string search meaning that you can find multiple -key strings occurrences at once in some input text. 
diff --git a/stringcheese/pyahocorasick-1.4.0/docs/module_load.rst b/stringcheese/pyahocorasick-1.4.0/docs/module_load.rst deleted file mode 100644 index d87f041..0000000 --- a/stringcheese/pyahocorasick-1.4.0/docs/module_load.rst +++ /dev/null @@ -1,7 +0,0 @@ -load(path, deserializer) => Automaton ----------------------------------------------------------------------- - -Load automaton previously stored on disc using ``save`` method. - -``Deserializer`` is a callable object which converts bytes back into -python object; it can be ``pickle.loads``. diff --git a/stringcheese/pyahocorasick-1.4.0/dump2dot.py b/stringcheese/pyahocorasick-1.4.0/dump2dot.py deleted file mode 100644 index 092eb1d..0000000 --- a/stringcheese/pyahocorasick-1.4.0/dump2dot.py +++ /dev/null @@ -1,87 +0,0 @@ -""" - Aho-Corasick string search algorithm. - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -""" - -import ahocorasick -import os - -from ahocorasick import EMPTY, TRIE, AHOCORASICK; - - -def dump2dot(automaton, file): - - def writeln(text=""): - file.write(text + "\n") - - def nodename(nodeid): - return 'node%x' % (nodeid & 0xffffffff) - - if automaton.kind == EMPTY: - writeln("digraph empty {}") - return - - if automaton.kind == TRIE: - name = "trie" - else: - name = "ahocorasick" - - - writeln("digraph %s {" % name) - - nodes, edges, fail = automaton.dump() - - # nodes - for nodeid, end in nodes: - if end: - attr = '[shape=doublecircle, label=""]' - else: - attr = '[shape=circle, label=""]' - - writeln("\t%s %s" % (nodename(nodeid), attr)) - - def format_label(label): - label = str(label, 'ascii') - label = label.replace('"', r'\"') - return '"%s"' % label - - # trie edges - for nodeid, label, destid in edges: - writeln("\t%s -> %s [label=%s]" % (nodename(nodeid), nodename(destid), format_label(label))) - - # fail links - for nodeid, failid in fail: - writeln("\t%s -> %s [color=blue]" % (nodename(nodeid), nodename(failid))) - - 
writeln("}") - - -def show(automaton): - path = '/dev/shm/%s.dot' % os.getpid() - with open(path, 'wt') as f: - dump2dot(automaton, f) - - os.system("xdot %s" % path) - #os.system("dotty %s" % path) - os.unlink(path) - - -if __name__ == '__main__': - A = ahocorasick.Automaton(ahocorasick.STORE_LENGTH) - - A.add_word("he") - A.add_word("her") - A.add_word("hers") - A.add_word("she") - A.add_word("cat") - A.add_word("shield") - - with open('trie.dot', 'wt') as f: - dump2dot(A, f) - - A.make_automaton() - with open('ahocorasick.dot', 'wt') as f: - dump2dot(A, f) diff --git a/stringcheese/pyahocorasick-1.4.0/msinttypes/inttypes.h b/stringcheese/pyahocorasick-1.4.0/msinttypes/inttypes.h deleted file mode 100644 index ac7e32b..0000000 --- a/stringcheese/pyahocorasick-1.4.0/msinttypes/inttypes.h +++ /dev/null @@ -1,306 +0,0 @@ -// ISO C9x compliant inttypes.h for Microsoft Visual Studio -// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 -// -// Copyright (c) 2006-2013 Alexander Chemeris -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the product nor the names of its contributors may -// be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO -// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -/////////////////////////////////////////////////////////////////////////////// - -#ifndef _MSC_VER // [ -#error "Use this header only with Microsoft Visual C++ compilers!" -#endif // _MSC_VER ] - -#ifndef _MSC_INTTYPES_H_ // [ -#define _MSC_INTTYPES_H_ - -#if _MSC_VER > 1000 -#pragma once -#endif - -#include "stdint.h" - -// 7.8 Format conversion of integer types - -typedef struct { - intmax_t quot; - intmax_t rem; -} imaxdiv_t; - -// 7.8.1 Macros for format specifiers - -#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198 - -// The fprintf macros for signed integers are: -#define PRId8 "d" -#define PRIi8 "i" -#define PRIdLEAST8 "d" -#define PRIiLEAST8 "i" -#define PRIdFAST8 "d" -#define PRIiFAST8 "i" - -#define PRId16 "hd" -#define PRIi16 "hi" -#define PRIdLEAST16 "hd" -#define PRIiLEAST16 "hi" -#define PRIdFAST16 "hd" -#define PRIiFAST16 "hi" - -#define PRId32 "I32d" -#define PRIi32 "I32i" -#define PRIdLEAST32 "I32d" -#define PRIiLEAST32 "I32i" -#define PRIdFAST32 "I32d" -#define PRIiFAST32 "I32i" - -#define PRId64 "I64d" -#define PRIi64 "I64i" -#define PRIdLEAST64 "I64d" -#define PRIiLEAST64 "I64i" -#define PRIdFAST64 "I64d" -#define PRIiFAST64 "I64i" - -#define PRIdMAX "I64d" -#define PRIiMAX "I64i" - -#define PRIdPTR "Id" -#define PRIiPTR "Ii" - -// The fprintf macros for unsigned integers are: -#define PRIo8 "o" -#define PRIu8 "u" -#define PRIx8 "x" -#define PRIX8 "X" -#define PRIoLEAST8 "o" -#define 
PRIuLEAST8 "u" -#define PRIxLEAST8 "x" -#define PRIXLEAST8 "X" -#define PRIoFAST8 "o" -#define PRIuFAST8 "u" -#define PRIxFAST8 "x" -#define PRIXFAST8 "X" - -#define PRIo16 "ho" -#define PRIu16 "hu" -#define PRIx16 "hx" -#define PRIX16 "hX" -#define PRIoLEAST16 "ho" -#define PRIuLEAST16 "hu" -#define PRIxLEAST16 "hx" -#define PRIXLEAST16 "hX" -#define PRIoFAST16 "ho" -#define PRIuFAST16 "hu" -#define PRIxFAST16 "hx" -#define PRIXFAST16 "hX" - -#define PRIo32 "I32o" -#define PRIu32 "I32u" -#define PRIx32 "I32x" -#define PRIX32 "I32X" -#define PRIoLEAST32 "I32o" -#define PRIuLEAST32 "I32u" -#define PRIxLEAST32 "I32x" -#define PRIXLEAST32 "I32X" -#define PRIoFAST32 "I32o" -#define PRIuFAST32 "I32u" -#define PRIxFAST32 "I32x" -#define PRIXFAST32 "I32X" - -#define PRIo64 "I64o" -#define PRIu64 "I64u" -#define PRIx64 "I64x" -#define PRIX64 "I64X" -#define PRIoLEAST64 "I64o" -#define PRIuLEAST64 "I64u" -#define PRIxLEAST64 "I64x" -#define PRIXLEAST64 "I64X" -#define PRIoFAST64 "I64o" -#define PRIuFAST64 "I64u" -#define PRIxFAST64 "I64x" -#define PRIXFAST64 "I64X" - -#define PRIoMAX "I64o" -#define PRIuMAX "I64u" -#define PRIxMAX "I64x" -#define PRIXMAX "I64X" - -#define PRIoPTR "Io" -#define PRIuPTR "Iu" -#define PRIxPTR "Ix" -#define PRIXPTR "IX" - -// The fscanf macros for signed integers are: -#define SCNd8 "d" -#define SCNi8 "i" -#define SCNdLEAST8 "d" -#define SCNiLEAST8 "i" -#define SCNdFAST8 "d" -#define SCNiFAST8 "i" - -#define SCNd16 "hd" -#define SCNi16 "hi" -#define SCNdLEAST16 "hd" -#define SCNiLEAST16 "hi" -#define SCNdFAST16 "hd" -#define SCNiFAST16 "hi" - -#define SCNd32 "ld" -#define SCNi32 "li" -#define SCNdLEAST32 "ld" -#define SCNiLEAST32 "li" -#define SCNdFAST32 "ld" -#define SCNiFAST32 "li" - -#define SCNd64 "I64d" -#define SCNi64 "I64i" -#define SCNdLEAST64 "I64d" -#define SCNiLEAST64 "I64i" -#define SCNdFAST64 "I64d" -#define SCNiFAST64 "I64i" - -#define SCNdMAX "I64d" -#define SCNiMAX "I64i" - -#ifdef _WIN64 // [ -# define SCNdPTR "I64d" -# define 
SCNiPTR "I64i" -#else // _WIN64 ][ -# define SCNdPTR "ld" -# define SCNiPTR "li" -#endif // _WIN64 ] - -// The fscanf macros for unsigned integers are: -#define SCNo8 "o" -#define SCNu8 "u" -#define SCNx8 "x" -#define SCNX8 "X" -#define SCNoLEAST8 "o" -#define SCNuLEAST8 "u" -#define SCNxLEAST8 "x" -#define SCNXLEAST8 "X" -#define SCNoFAST8 "o" -#define SCNuFAST8 "u" -#define SCNxFAST8 "x" -#define SCNXFAST8 "X" - -#define SCNo16 "ho" -#define SCNu16 "hu" -#define SCNx16 "hx" -#define SCNX16 "hX" -#define SCNoLEAST16 "ho" -#define SCNuLEAST16 "hu" -#define SCNxLEAST16 "hx" -#define SCNXLEAST16 "hX" -#define SCNoFAST16 "ho" -#define SCNuFAST16 "hu" -#define SCNxFAST16 "hx" -#define SCNXFAST16 "hX" - -#define SCNo32 "lo" -#define SCNu32 "lu" -#define SCNx32 "lx" -#define SCNX32 "lX" -#define SCNoLEAST32 "lo" -#define SCNuLEAST32 "lu" -#define SCNxLEAST32 "lx" -#define SCNXLEAST32 "lX" -#define SCNoFAST32 "lo" -#define SCNuFAST32 "lu" -#define SCNxFAST32 "lx" -#define SCNXFAST32 "lX" - -#define SCNo64 "I64o" -#define SCNu64 "I64u" -#define SCNx64 "I64x" -#define SCNX64 "I64X" -#define SCNoLEAST64 "I64o" -#define SCNuLEAST64 "I64u" -#define SCNxLEAST64 "I64x" -#define SCNXLEAST64 "I64X" -#define SCNoFAST64 "I64o" -#define SCNuFAST64 "I64u" -#define SCNxFAST64 "I64x" -#define SCNXFAST64 "I64X" - -#define SCNoMAX "I64o" -#define SCNuMAX "I64u" -#define SCNxMAX "I64x" -#define SCNXMAX "I64X" - -#ifdef _WIN64 // [ -# define SCNoPTR "I64o" -# define SCNuPTR "I64u" -# define SCNxPTR "I64x" -# define SCNXPTR "I64X" -#else // _WIN64 ][ -# define SCNoPTR "lo" -# define SCNuPTR "lu" -# define SCNxPTR "lx" -# define SCNXPTR "lX" -#endif // _WIN64 ] - -#endif // __STDC_FORMAT_MACROS ] - -// 7.8.2 Functions for greatest-width integer types - -// 7.8.2.1 The imaxabs function -#define imaxabs _abs64 - -// 7.8.2.2 The imaxdiv function - -// This is modified version of div() function from Microsoft's div.c found -// in %MSVC.NET%\crt\src\div.c -#ifdef STATIC_IMAXDIV // [ -static -#else 
// STATIC_IMAXDIV ][ -_inline -#endif // STATIC_IMAXDIV ] -imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom) -{ - imaxdiv_t result; - - result.quot = numer / denom; - result.rem = numer % denom; - - if (numer < 0 && result.rem > 0) { - // did division wrong; must fix up - ++result.quot; - result.rem -= denom; - } - - return result; -} - -// 7.8.2.3 The strtoimax and strtoumax functions -#define strtoimax _strtoi64 -#define strtoumax _strtoui64 - -// 7.8.2.4 The wcstoimax and wcstoumax functions -#define wcstoimax _wcstoi64 -#define wcstoumax _wcstoui64 - - -#endif // _MSC_INTTYPES_H_ ] diff --git a/stringcheese/pyahocorasick-1.4.0/msinttypes/stdint.h b/stringcheese/pyahocorasick-1.4.0/msinttypes/stdint.h deleted file mode 100644 index 4fe0ef9..0000000 --- a/stringcheese/pyahocorasick-1.4.0/msinttypes/stdint.h +++ /dev/null @@ -1,259 +0,0 @@ -// ISO C9x compliant stdint.h for Microsoft Visual Studio -// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 -// -// Copyright (c) 2006-2013 Alexander Chemeris -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the product nor the names of its contributors may -// be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED -// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO -// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -/////////////////////////////////////////////////////////////////////////////// - -#ifndef _MSC_VER // [ -#error "Use this header only with Microsoft Visual C++ compilers!" -#endif // _MSC_VER ] - -#ifndef _MSC_STDINT_H_ // [ -#define _MSC_STDINT_H_ - -#if _MSC_VER > 1000 -#pragma once -#endif - -#if _MSC_VER >= 1600 // [ -#include -#else // ] _MSC_VER >= 1600 [ - -#include - -// For Visual Studio 6 in C++ mode and for many Visual Studio versions when -// compiling for ARM we should wrap include with 'extern "C++" {}' -// or compiler give many errors like this: -// error C2733: second C linkage of overloaded function 'wmemchr' not allowed -#ifdef __cplusplus -extern "C" { -#endif -# include -#ifdef __cplusplus -} -#endif - -// Define _W64 macros to mark types changing their size, like intptr_t. -#ifndef _W64 -# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 -# define _W64 __w64 -# else -# define _W64 -# endif -#endif - - -// 7.18.1 Integer types - -// 7.18.1.1 Exact-width integer types - -// Visual Studio 6 and Embedded Visual C++ 4 doesn't -// realize that, e.g. char has the same size as __int8 -// so we give up on __intX for them. 
-#if (_MSC_VER < 1300) - typedef signed char int8_t; - typedef signed short int16_t; - typedef signed int int32_t; - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; -#else - typedef signed __int8 int8_t; - typedef signed __int16 int16_t; - typedef signed __int32 int32_t; - typedef unsigned __int8 uint8_t; - typedef unsigned __int16 uint16_t; - typedef unsigned __int32 uint32_t; -#endif -typedef signed __int64 int64_t; -typedef unsigned __int64 uint64_t; - - -// 7.18.1.2 Minimum-width integer types -typedef int8_t int_least8_t; -typedef int16_t int_least16_t; -typedef int32_t int_least32_t; -typedef int64_t int_least64_t; -typedef uint8_t uint_least8_t; -typedef uint16_t uint_least16_t; -typedef uint32_t uint_least32_t; -typedef uint64_t uint_least64_t; - -// 7.18.1.3 Fastest minimum-width integer types -typedef int8_t int_fast8_t; -typedef int16_t int_fast16_t; -typedef int32_t int_fast32_t; -typedef int64_t int_fast64_t; -typedef uint8_t uint_fast8_t; -typedef uint16_t uint_fast16_t; -typedef uint32_t uint_fast32_t; -typedef uint64_t uint_fast64_t; - -// 7.18.1.4 Integer types capable of holding object pointers -#ifdef _WIN64 // [ - typedef signed __int64 intptr_t; - typedef unsigned __int64 uintptr_t; -#else // _WIN64 ][ - typedef _W64 signed int intptr_t; - typedef _W64 unsigned int uintptr_t; -#endif // _WIN64 ] - -// 7.18.1.5 Greatest-width integer types -typedef int64_t intmax_t; -typedef uint64_t uintmax_t; - - -// 7.18.2 Limits of specified-width integer types - -#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 - -// 7.18.2.1 Limits of exact-width integer types -#define INT8_MIN ((int8_t)_I8_MIN) -#define INT8_MAX _I8_MAX -#define INT16_MIN ((int16_t)_I16_MIN) -#define INT16_MAX _I16_MAX -#define INT32_MIN ((int32_t)_I32_MIN) -#define INT32_MAX _I32_MAX -#define INT64_MIN ((int64_t)_I64_MIN) -#define INT64_MAX _I64_MAX -#define 
UINT8_MAX _UI8_MAX -#define UINT16_MAX _UI16_MAX -#define UINT32_MAX _UI32_MAX -#define UINT64_MAX _UI64_MAX - -// 7.18.2.2 Limits of minimum-width integer types -#define INT_LEAST8_MIN INT8_MIN -#define INT_LEAST8_MAX INT8_MAX -#define INT_LEAST16_MIN INT16_MIN -#define INT_LEAST16_MAX INT16_MAX -#define INT_LEAST32_MIN INT32_MIN -#define INT_LEAST32_MAX INT32_MAX -#define INT_LEAST64_MIN INT64_MIN -#define INT_LEAST64_MAX INT64_MAX -#define UINT_LEAST8_MAX UINT8_MAX -#define UINT_LEAST16_MAX UINT16_MAX -#define UINT_LEAST32_MAX UINT32_MAX -#define UINT_LEAST64_MAX UINT64_MAX - -// 7.18.2.3 Limits of fastest minimum-width integer types -#define INT_FAST8_MIN INT8_MIN -#define INT_FAST8_MAX INT8_MAX -#define INT_FAST16_MIN INT16_MIN -#define INT_FAST16_MAX INT16_MAX -#define INT_FAST32_MIN INT32_MIN -#define INT_FAST32_MAX INT32_MAX -#define INT_FAST64_MIN INT64_MIN -#define INT_FAST64_MAX INT64_MAX -#define UINT_FAST8_MAX UINT8_MAX -#define UINT_FAST16_MAX UINT16_MAX -#define UINT_FAST32_MAX UINT32_MAX -#define UINT_FAST64_MAX UINT64_MAX - -// 7.18.2.4 Limits of integer types capable of holding object pointers -#ifdef _WIN64 // [ -# define INTPTR_MIN INT64_MIN -# define INTPTR_MAX INT64_MAX -# define UINTPTR_MAX UINT64_MAX -#else // _WIN64 ][ -# define INTPTR_MIN INT32_MIN -# define INTPTR_MAX INT32_MAX -# define UINTPTR_MAX UINT32_MAX -#endif // _WIN64 ] - -// 7.18.2.5 Limits of greatest-width integer types -#define INTMAX_MIN INT64_MIN -#define INTMAX_MAX INT64_MAX -#define UINTMAX_MAX UINT64_MAX - -// 7.18.3 Limits of other integer types - -#ifdef _WIN64 // [ -# define PTRDIFF_MIN _I64_MIN -# define PTRDIFF_MAX _I64_MAX -#else // _WIN64 ][ -# define PTRDIFF_MIN _I32_MIN -# define PTRDIFF_MAX _I32_MAX -#endif // _WIN64 ] - -#define SIG_ATOMIC_MIN INT_MIN -#define SIG_ATOMIC_MAX INT_MAX - -#ifndef SIZE_MAX // [ -# ifdef _WIN64 // [ -# define SIZE_MAX _UI64_MAX -# else // _WIN64 ][ -# define SIZE_MAX _UI32_MAX -# endif // _WIN64 ] -#endif // SIZE_MAX ] - -// 
WCHAR_MIN and WCHAR_MAX are also defined in -#ifndef WCHAR_MIN // [ -# define WCHAR_MIN 0 -#endif // WCHAR_MIN ] -#ifndef WCHAR_MAX // [ -# define WCHAR_MAX _UI16_MAX -#endif // WCHAR_MAX ] - -#define WINT_MIN 0 -#define WINT_MAX _UI16_MAX - -#endif // __STDC_LIMIT_MACROS ] - - -// 7.18.4 Limits of other integer types - -#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 - -// 7.18.4.1 Macros for minimum-width integer constants - -#define INT8_C(val) val##i8 -#define INT16_C(val) val##i16 -#define INT32_C(val) val##i32 -#define INT64_C(val) val##i64 - -#define UINT8_C(val) val##ui8 -#define UINT16_C(val) val##ui16 -#define UINT32_C(val) val##ui32 -#define UINT64_C(val) val##ui64 - -// 7.18.4.2 Macros for greatest-width integer constants -// These #ifndef's are needed to prevent collisions with . -// Check out Issue 9 for the details. -#ifndef INTMAX_C // [ -# define INTMAX_C INT64_C -#endif // INTMAX_C ] -#ifndef UINTMAX_C // [ -# define UINTMAX_C UINT64_C -#endif // UINTMAX_C ] - -#endif // __STDC_CONSTANT_MACROS ] - -#endif // _MSC_VER >= 1600 ] - -#endif // _MSC_STDINT_H_ ] diff --git a/stringcheese/pyahocorasick-1.4.0/posix.h b/stringcheese/pyahocorasick-1.4.0/posix.h deleted file mode 100644 index cde9051..0000000 --- a/stringcheese/pyahocorasick-1.4.0/posix.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - This is part of pyahocorasick Python module. - - POSIX declarations. 
- - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : BSD-3-Clause (see LICENSE) -*/ - -#ifndef PYAHCORASICK_POSIX_H__ -#define PYAHCORASICK_POSIX_H__ - -#define PY_OBJECT_HEAD_INIT PyVarObject_HEAD_INIT(&PyType_Type, 0) - -#endif - diff --git a/stringcheese/pyahocorasick-1.4.0/py/README.rst b/stringcheese/pyahocorasick-1.4.0/py/README.rst deleted file mode 100644 index f78971a..0000000 --- a/stringcheese/pyahocorasick-1.4.0/py/README.rst +++ /dev/null @@ -1,2 +0,0 @@ -This directory contains a simpler pure python module, compatible with Python 2 -and 3. It has a slightly different API. It may fail at pickling for long keys. diff --git a/stringcheese/pyahocorasick-1.4.0/py/exportdot.py b/stringcheese/pyahocorasick-1.4.0/py/exportdot.py deleted file mode 100644 index 8b71c54..0000000 --- a/stringcheese/pyahocorasick-1.4.0/py/exportdot.py +++ /dev/null @@ -1,81 +0,0 @@ -""" - Aho-Corasick string search algorithm. - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -""" - -import pyahocorasick - - -def exportdot(trie, file): - - def writeln(text=""): - file.write(text + "\n") - - writeln("digraph ahocorasick {") - - def walk(node): - queue = [node] - while queue: - node = queue.pop() - yield node - - for child in node.children.itervalues(): - if child != node: - queue.append(child) - - nodes = list(walk(trie.root)) - - # nodes - for node in nodes: - if node.output != pyahocorasick.nil: - writeln("\tnode%d [shape=doublecircle, label=\"\"]" % id(node)) - else: - writeln("\tnode%d [shape=circle, label=\"\"]" % id(node)) - - # trie edges - for node in nodes: - for letter, child in node.children.iteritems(): - nodeid = id(node) - destid = id(child) - if destid == id(trie.root): - # do not show self-links of root node created during make_automaton - continue - - if letter.isalnum(): - label = letter - else: - label = '%02x' % ord(letter) - - writeln("\tnode%d -> node%d 
[label=\"%s\"]" % (nodeid, destid, label)) - - # fail links - for node in nodes: - nodeid = id(node) - failid = id(node.fail) - - if failid != pyahocorasick.nil: - writeln("\tnode%d -> node%d [color=blue]" % (nodeid, failid)) - - writeln("}") - - -if __name__ == '__main__': - A = pyahocorasick.Trie() - - A.add_word("he", 0) - A.add_word("her", 1) - A.add_word("hers", 2) - A.add_word("she", 3) - A.add_word("cat", 4) - A.add_word("shield", 5) - - with open('trie.dot', 'wt') as f: - exportdot(A, f) - - A.make_automaton() - - with open('ahocorasick.dot', 'wt') as f: - exportdot(A, f) diff --git a/stringcheese/pyahocorasick-1.4.0/py/issue_21.py b/stringcheese/pyahocorasick-1.4.0/py/issue_21.py deleted file mode 100644 index 11fe50e..0000000 --- a/stringcheese/pyahocorasick-1.4.0/py/issue_21.py +++ /dev/null @@ -1,54 +0,0 @@ -""" - Aho-Corasick string search algorithm. - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -""" - -import pyahocorasick - -test_cases = [ - # example provided by @Ulitochka - { - 'words' : ["alpha", "alpha beta", "gamma", "gamma alpha"], - 'input' : "I went to alpha beta the alpha other day gamma alpha to pick up some spam", - 'expected' : [("alpha beta", 19), ("alpha", 29), ("gamma alpha", 51)] - }, - - { - 'words' : ["alpha", "alpha beta", "beta gamma", "gamma"], - 'input' : "Cats have not idea what alpha beta gamma means", - 'expected' : [("alpha beta", 33), ("gamma", 39)] - }, - - { - 'words' : ["alpha", "alpha beta", "beta gamma", "gamma"], - 'input' : "Cats have not idea what alpha beta gamma", - 'expected' : [("alpha beta", 33), ("gamma", 39)] - }, -] - - -def test(case): - - tree = pyahocorasick.Trie() - for word in case['words']: - tree.add_word(word, word) - - tree.make_automaton() - - actual = [item for item in tree.iter_long(case['input'])] - - if actual != case['expected']: - print("ERROR:") - print(actual) - print(case['expected']) - assert(False) - - -if __name__ == 
'__main__': - for data in test_cases: - test(data) - - print("OK") diff --git a/stringcheese/pyahocorasick-1.4.0/py/pyahocorasick.py b/stringcheese/pyahocorasick-1.4.0/py/pyahocorasick.py deleted file mode 100644 index b51548f..0000000 --- a/stringcheese/pyahocorasick-1.4.0/py/pyahocorasick.py +++ /dev/null @@ -1,345 +0,0 @@ -# -*- coding: utf-8 -*- -""" - Aho-Corasick string search algorithm. - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -""" - -from collections import deque - -nil = object() # used to distinguish from None - -class TrieNode(object): - """ - Node of trie/Aho-Corasick automaton - """ - - __slots__ = ['char', 'output', 'fail', 'children'] - - def __init__(self, char): - """ - Constructs an empty node - """ - - self.char = char # character - self.output = nil # an output function for this node - self.fail = nil # fail link used by Aho-Corasick automaton - self.children = {} # children - - def __repr__(self): - """ - Textual representation of node. - """ - - if self.output is not nil: - return "" % (self.char, self.output) - else: - return "" % self.char - - -class Trie(object): - """ - Trie/Aho-Corasick automaton. - """ - - def __init__(self): - """ - Construct an empty trie - """ - - self.root = TrieNode('') - - - def __get_node(self, word): - """ - Private function retrieving a final node of trie - for given word - - Returns node or None, if the trie doesn't contain the word. - """ - - node = self.root - for c in word: - try: - node = node.children[c] - except KeyError: - return None - - return node - - - def get(self, word, default=nil): - """ - Retrieves output value associated with word. - - If there is no word returns default value, - and if default is not given rises KeyError. 
- """ - - node = self.__get_node(word) - output = nil - if node: - output = node.output - - if output is nil: - if default is nil: - raise KeyError("no key '%s'" % word) - else: - return default - else: - return output - - - def keys(self): - """ - Generator returning all keys (i.e. word) stored in trie - """ - - for key, _ in self.items(): - yield key - - - def values(self): - """ - Generator returning all values associated with words stored in a trie. - """ - - for _, value in self.items(): - yield value - - - def items(self): - """ - Generator returning all keys and values stored in a trie. - """ - - L = [] - def aux(node, s): - s = s + node.char - if node.output is not nil: - L.append((s, node.output)) - - for child in node.children.values(): - if child is not node: - aux(child, s) - - aux(self.root, '') - return iter(L) - - - def __len__(self): - """ - Calculates number of words in a trie. - """ - - stack = deque() - stack.append(self.root) - n = 0 - while stack: - node = stack.pop() - if node.output is not nil: - n += 1 - - for child in node.children.values(): - stack.append(child) - - return n - - - def add_word(self, word, value): - """ - Adds word and associated value. - - If word already exists, its value is replaced. - """ - if not word: - return - - node = self.root - for c in word: - try: - node = node.children[c] - except KeyError: - n = TrieNode(c) - node.children[c] = n - node = n - - node.output = value - - - def clear(self): - """ - Clears trie. - """ - - self.root = TrieNode('') - - - def exists(self, word): - """ - Checks if whole word is present in the trie. - """ - - node = self.__get_node(word) - if node: - return bool(node.output != nil) - else: - return False - - - def match(self, word): - """ - Checks if word is a prefix of any existing word in the trie. - """ - - return (self.__get_node(word) is not None) - - - def make_automaton(self): - """ - Converts trie to Aho-Corasick automaton. - """ - - queue = deque() - - # 1. 
- for i in range(256): - c = chr(i) - if c in self.root.children: - node = self.root.children[c] - node.fail = self.root # f(s) = 0 - queue.append(node) - else: - self.root.children[c] = self.root - - # 2. - while queue: - r = queue.popleft() - for node in r.children.values(): - queue.append(node) - state = r.fail - while node.char not in state.children: - state = state.fail - - node.fail = state.children.get(node.char, self.root) - - - def iter(self, string): - """ - Generator performs Aho-Corasick search string algorithm, yielding - tuples containing two values: - - position in string - - outputs associated with matched strings - """ - state = self.root - for index, c in enumerate(string): - while c not in state.children: - state = state.fail - - state = state.children.get(c, self.root) - - tmp = state - output = [] - while tmp is not nil: - if tmp.output is not nil: - output.append(tmp.output) - - tmp = tmp.fail - - if output: - yield (index, output) - - def iter_long(self, string): - """ - Generator performs a modified Aho-Corasick search string algorithm, - which maches only the longest word. 
- - """ - state = self.root - last = None - - index = 0 - while index < len(string): - c = string[index] - - if c in state.children: - state = state.children[c] - - if state.output is not nil: - # save the last node on the path - last = (state.output, index) - - index += 1 - else: - if last: - # return the saved match - yield last - - # and start over, as we don't want overlapped results - # Note: this leads to quadratic complexity in the worst case - index = last[1] + 1 - state = self.root - last = None - else: - # if no output, perform classic Aho-Corasick algorithm - while c not in state.children: - state = state.fail - - # corner case - if last: - yield last - - def find_all(self, string, callback): - """ - Wrapper on iter method, callback gets an iterator result - """ - for index, output in self.iter(string): - callback(index, output) - - - -if __name__ == '__main__': - - def demo(): - words = "he hers his she hi him man".split() - - t = Trie(); - for w in words: - t.add_word(w, w) - - s = "he rshershidamanza " - - t.make_automaton() - for res in t.items(): - print(res) - - for res in t.iter(s): - print - print('%s' % s) - pos, matches = res - for fragment in matches: - print('%s%s' % ((pos - len(fragment) + 1)*' ', fragment)) - - demo() - - - def bug(): - patterns = ['GT-C3303','SAMSUNG-GT-C3303K/'] - text = 'SAMSUNG-GT-C3303i/1.0 NetFront/3.5 Profile/MIDP-2.0 Configuration/CLDC-1.1' - - t = Trie() - for pattern in patterns: - ret = t.add_word(pattern, (0, pattern)) - - t.make_automaton() - - res = list(t.iter(text)) - - assert len(res) == 1, 'failed' - - bug() - -# vim: ts=4 sw=4 nowrap - diff --git a/stringcheese/pyahocorasick-1.4.0/py/unittests.py b/stringcheese/pyahocorasick-1.4.0/py/unittests.py deleted file mode 100644 index b1617e1..0000000 --- a/stringcheese/pyahocorasick-1.4.0/py/unittests.py +++ /dev/null @@ -1,209 +0,0 @@ -""" - Aho-Corasick string search algorithm. 
- - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -""" - -import unittest -from pyahocorasick import Trie - -class TestTrie(unittest.TestCase): - def testEmptyTrieShouldNotContainsAnyWords(self): - t = Trie() - self.assertEqual(len(t), 0) - - - def testAddedWordShouldBeCountedAndAvailableForRetrieval(self): - t = Trie() - t.add_word('python', 'value') - self.assertEqual(len(t), 1) - self.assertEqual(t.get('python'), 'value') - - - def testAddingExistingWordShouldReplaceAssociatedValue(self): - t = Trie() - t.add_word('python', 'value') - self.assertEqual(len(t), 1) - self.assertEqual(t.get('python'), 'value') - - t.add_word('python', 'other') - self.assertEqual(len(t), 1) - self.assertEqual(t.get('python'), 'other') - - def testGetUnknowWordWithoutDefaultValueShouldRaiseException(self): - t = Trie() - with self.assertRaises(KeyError): - t.get('python') - - - def testGetUnknowWordWithDefaultValueShouldReturnDefault(self): - t = Trie() - self.assertEqual(t.get('python', 'default'), 'default') - - - def testExistShouldDetectAddedWords(self): - t = Trie() - t.add_word('python', 'value') - t.add_word('ada', 'value') - - self.assertTrue(t.exists('python')) - self.assertTrue(t.exists('ada')) - - - def testExistShouldReturnFailOnUnknownWord(self): - t = Trie() - t.add_word('python', 'value') - - self.assertFalse(t.exists('ada')) - - - def testMatchShouldDetecAllPrefixesIncludingWord(self): - t = Trie() - t.add_word('python', 'value') - t.add_word('ada', 'value') - - self.assertTrue(t.match('a')) - self.assertTrue(t.match('ad')) - self.assertTrue(t.match('ada')) - - self.assertTrue(t.match('p')) - self.assertTrue(t.match('py')) - self.assertTrue(t.match('pyt')) - self.assertTrue(t.match('pyth')) - self.assertTrue(t.match('pytho')) - self.assertTrue(t.match('python')) - - - def testItemsShouldReturnAllItemsAlreadyAddedToTheTrie(self): - t = Trie() - - t.add_word('python', 1) - t.add_word('ada', 2) - t.add_word('perl', 
3) - t.add_word('pascal', 4) - t.add_word('php', 5) - - result = list(t.items()) - self.assertEquals(len(result), 5) - self.assertIn(('python', 1), result) - self.assertIn(('ada', 2), result) - self.assertIn(('perl', 3), result) - self.assertIn(('pascal', 4), result) - self.assertIn(('php', 5), result) - - - def testKeysShouldReturnAllKeysAlreadyAddedToTheTrie(self): - t = Trie() - - t.add_word('python', 1) - t.add_word('ada', 2) - t.add_word('perl', 3) - t.add_word('pascal', 4) - t.add_word('php', 5) - - result = list(t.keys()) - self.assertEquals(len(result), 5) - self.assertIn('python',result) - self.assertIn('ada', result) - self.assertIn('perl', result) - self.assertIn('pascal',result) - self.assertIn('php', result) - - - def testValuesShouldReturnAllValuesAlreadyAddedToTheTrie(self): - t = Trie() - - t.add_word('python', 1) - t.add_word('ada', 2) - t.add_word('perl', 3) - t.add_word('pascal', 4) - t.add_word('php', 5) - - result = list(t.values()) - self.assertEquals(len(result), 5) - self.assertIn(1, result) - self.assertIn(2, result) - self.assertIn(3, result) - self.assertIn(4, result) - self.assertIn(5, result) - - - def testClearShouldRemoveEveryting(self): - t = Trie() - - t.add_word('python', 1) - t.add_word('ada', 2) - t.add_word('perl', 3) - t.add_word('pascal', 4) - t.add_word('php', 5) - - self.assertEqual(len(t), 5) - self.assertEqual(len(list(t.items())), 5) - - t.clear() - - self.assertEqual(len(t), 0) - self.assertEqual(len(list(t.items())), 0) - - - def testIterShouldMatchAllStrings(self): - - def get_test_automaton(): - words = "he her hers his she hi him man himan".split() - - t = Trie(); - for w in words: - t.add_word(w, w) - - t.make_automaton() - - return t - - - test_string = "he she himan" - - t = get_test_automaton() - result = list(t.iter(test_string)) - - # there are 5 matching positions - self.assertEquals(len(result), 5) - - # result should have be valid, i.e. 
returned position and substring - # must match substring from test string - for end_index, strings in result: - for s in strings: - n = len(s) - self.assertEqual(s, test_string[end_index - n + 1 : end_index + 1]) - - - def testFindAllShouldGetTheSameDataAsIter(self): - - def get_test_automaton(): - words = "he her hers his she hi him man himan".split() - - t = Trie(); - for w in words: - t.add_word(w, w) - - t.make_automaton() - - return t - - find_all_arguments = [] - - def find_all_callback(end_index, strings): - find_all_arguments.append((end_index, strings)) - - t = get_test_automaton() - test_string = "he she himan" - - t.find_all(test_string, find_all_callback) - - result_items = list(t.iter(test_string)) - self.assertEquals(find_all_arguments, result_items) - - -if __name__ == '__main__': - unittest.main() diff --git a/stringcheese/pyahocorasick-1.4.0/pyahocorasick.c b/stringcheese/pyahocorasick-1.4.0/pyahocorasick.c deleted file mode 100644 index 6b68a0a..0000000 --- a/stringcheese/pyahocorasick-1.4.0/pyahocorasick.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - This is part of pyahocorasick Python module. - - Python module. - - This file include all code from *.c files. 
- - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : BSD-3-Clause (see LICENSE) -*/ - -#include "common.h" -#include "slist.h" -#include "trienode.h" -#include "trie.h" -#include "Automaton.h" -#include "AutomatonSearchIter.h" -#include "AutomatonSearchIterLong.h" -#include "AutomatonItemsIter.h" -#include "src/inline_doc.h" -#include "src/custompickle/load/module_automaton_load.h" - -/* code */ -#include "utils.c" -#include "trienode.c" -#include "trie.c" -#include "slist.c" -#include "Automaton.c" -#include "AutomatonItemsIter.c" -#include "AutomatonSearchIter.c" -#include "AutomatonSearchIterLong.c" -#ifdef PYCALLS_INJECT_FAULTS -#include "src/pycallfault/pycallfault.c" -#endif -#include "allsources.c" - - -static -PyMethodDef -ahocorasick_module_methods[] = { - {"load", module_automaton_load, METH_VARARGS, module_load_doc}, - - {NULL, NULL, 0, NULL} -}; - - -#ifdef PY3K -static -PyModuleDef ahocorasick_module = { - PyModuleDef_HEAD_INIT, - "ahocorasick", - module_doc, - -1, - ahocorasick_module_methods -}; -#endif - -#ifdef PY3K -#define init_function PyInit_ahocorasick -#define init_return(value) return (value) -#else -#define init_function initahocorasick -#define init_return(unused) return -#endif - -PyMODINIT_FUNC -init_function(void) { - PyObject* module; - -#ifdef MEMORY_DEBUG - PyErr_WarnEx(PyExc_RuntimeWarning, - "This is a developer version of pyahcorosick. " - "The module was compiled with flag MEMORY_DEBUG.", 1); - initialize_memory_debug(); -#endif - -#ifdef PYCALLS_INJECT_FAULTS - PyErr_WarnEx(PyExc_RuntimeWarning, - "This is a developer version of pyahcorosick. " - "The module was compiled with flag PYCALLS_INJECT_FAULTS.", 1); - initialize_pycallfault(); -#endif - -#if DEBUG_LAYOUT - PyErr_WarnEx(PyExc_RuntimeWarning, - "This is a developer version of pyahcorosick. 
" - "The module was compiled with flag DEBUG_LAYOUT.", 1); - trienode_dump_layout(); -#endif - - automaton_as_sequence.sq_length = automaton_len; - automaton_as_sequence.sq_contains = automaton_contains; - - automaton_type.tp_as_sequence = &automaton_as_sequence; - -#ifdef PY3K - module = PyModule_Create(&ahocorasick_module); -#else - module = Py_InitModule3("ahocorasick", ahocorasick_module_methods, module_doc); -#endif - if (module == NULL) - init_return(NULL); - - - if (PyType_Ready(&automaton_type) < 0) { - Py_DECREF(module); - init_return(NULL); - } - else - PyModule_AddObject(module, "Automaton", (PyObject*)&automaton_type); - -#define add_enum_const(name) PyModule_AddIntConstant(module, #name, name) - add_enum_const(TRIE); - add_enum_const(AHOCORASICK); - add_enum_const(EMPTY); - - add_enum_const(STORE_LENGTH); - add_enum_const(STORE_INTS); - add_enum_const(STORE_ANY); - - add_enum_const(KEY_STRING); - add_enum_const(KEY_SEQUENCE); - - add_enum_const(MATCH_EXACT_LENGTH); - add_enum_const(MATCH_AT_MOST_PREFIX); - add_enum_const(MATCH_AT_LEAST_PREFIX); -#undef add_enum_const - -#ifdef AHOCORASICK_UNICODE - PyModule_AddIntConstant(module, "unicode", 1); -#else - PyModule_AddIntConstant(module, "unicode", 0); -#endif - - init_return(module); -} diff --git a/stringcheese/pyahocorasick-1.4.0/regression/issue_10.py b/stringcheese/pyahocorasick-1.4.0/regression/issue_10.py deleted file mode 100644 index 1e01ffe..0000000 --- a/stringcheese/pyahocorasick-1.4.0/regression/issue_10.py +++ /dev/null @@ -1,34 +0,0 @@ -# -*- coding: utf-8 -*- -""" - Aho-Corasick string search algorithm. 
- - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -""" - -import ahocorasick - -ac = ahocorasick.Automaton() -ac.add_word('S', 1) -ac.make_automaton() -buffer = 'SSS' - -def case_1(): - count = 0 - for item in ac.iter(buffer, 0, 3): # this causes an error - print(item) - count += 1 - - assert(count == 3) - -def case_2(): - count = 0 - for item in ac.iter(buffer, 0, 2): # no error, but it misses the last 'S' in the buffer - print(item) - count += 1 - - assert(count == 2) - -case_1() -case_2() diff --git a/stringcheese/pyahocorasick-1.4.0/regression/issue_19.py b/stringcheese/pyahocorasick-1.4.0/regression/issue_19.py deleted file mode 100644 index 1df22ce..0000000 --- a/stringcheese/pyahocorasick-1.4.0/regression/issue_19.py +++ /dev/null @@ -1,15 +0,0 @@ -# -*- coding: utf-8 -*- -""" - Aho-Corasick string search algorithm. - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -""" - -import ahocorasick - -A = ahocorasick.Automaton() -for index, word in enumerate("he her hers she".split()): - A.add_word(word, (index, word)) - A.clear() diff --git a/stringcheese/pyahocorasick-1.4.0/regression/issue_26.py b/stringcheese/pyahocorasick-1.4.0/regression/issue_26.py deleted file mode 100644 index cbc4d2a..0000000 --- a/stringcheese/pyahocorasick-1.4.0/regression/issue_26.py +++ /dev/null @@ -1,17 +0,0 @@ -# -*- coding: utf-8 -*- -""" - Aho-Corasick string search algorithm. 
- - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -""" - -import pickle - -import ahocorasick as aho - -a = aho.Automaton(aho.STORE_INTS) -a.add_word('abc', 12) -a.make_automaton() -p = pickle.dumps(a) diff --git a/stringcheese/pyahocorasick-1.4.0/regression/issue_5.py b/stringcheese/pyahocorasick-1.4.0/regression/issue_5.py deleted file mode 100644 index a668f9c..0000000 --- a/stringcheese/pyahocorasick-1.4.0/regression/issue_5.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- -""" - Aho-Corasick string search algorithm. - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -""" - -import ahocorasick - -A = ahocorasick.Automaton() - -# add some words to trie -for index, word in enumerate("he her hers she".split()): - A.add_word(word, (index, word)) - -A = None #### segfault here diff --git a/stringcheese/pyahocorasick-1.4.0/regression/issue_50-part1.py b/stringcheese/pyahocorasick-1.4.0/regression/issue_50-part1.py deleted file mode 100644 index 2894e8b..0000000 --- a/stringcheese/pyahocorasick-1.4.0/regression/issue_50-part1.py +++ /dev/null @@ -1,10 +0,0 @@ -from ahocorasick import Automaton -from pickle import load, dump - -auto = Automaton() -auto.add_word('abc', 'abc') - -auto.add_word('def', 'def') - -with open('automaton-wee.pickle', 'wb') as dest: - dump(auto, dest) diff --git a/stringcheese/pyahocorasick-1.4.0/regression/issue_50-part2.py b/stringcheese/pyahocorasick-1.4.0/regression/issue_50-part2.py deleted file mode 100644 index d49463b..0000000 --- a/stringcheese/pyahocorasick-1.4.0/regression/issue_50-part2.py +++ /dev/null @@ -1,5 +0,0 @@ -from ahocorasick import Automaton -from pickle import load, dump - -with open('automaton-wee.pickle', 'rb') as src: - auto = load(src) diff --git a/stringcheese/pyahocorasick-1.4.0/regression/issue_53.py b/stringcheese/pyahocorasick-1.4.0/regression/issue_53.py deleted file mode 100644 index 
e78accd..0000000 --- a/stringcheese/pyahocorasick-1.4.0/regression/issue_53.py +++ /dev/null @@ -1,11 +0,0 @@ -from ahocorasick import Automaton -auto = Automaton() -auto.add_word('wounded', 'wounded') - -auto.make_automaton() - -for item in auto.iter('Winning \U0001F629 so gutted, can\'t do anything for 4 weeks... Myth. #wounded'): - print(item) - -for item in auto.iter('Winning so gutted, can\'t do anything for 4 weeks... Myth. #wounded'): - print(item) diff --git a/stringcheese/pyahocorasick-1.4.0/regression/issue_56.py b/stringcheese/pyahocorasick-1.4.0/regression/issue_56.py deleted file mode 100644 index 4a05942..0000000 --- a/stringcheese/pyahocorasick-1.4.0/regression/issue_56.py +++ /dev/null @@ -1,41 +0,0 @@ -import ahocorasick - - - - -def iter_results(s): - r = [] - for x in A.iter(teststr): - r.append(x) - - return r - - -def find_all_results(s): - - r = [] - - def append(x, s): - r.append((x, s)) - - A.find_all(s, append) - - return r - - -A = ahocorasick.Automaton() - -for word in ("poke", "go", "pokegois", "egoist"): - A.add_word(word, word) - -A.make_automaton() - -teststr = 'pokego pokego pokegoist' -expected = iter_results(teststr) -findall = find_all_results(teststr) - -if findall != expected: - print("expected: %s" % expected) - print("findall : %s" % findall) - assert findall == expected - diff --git a/stringcheese/pyahocorasick-1.4.0/regression/issue_8.py b/stringcheese/pyahocorasick-1.4.0/regression/issue_8.py deleted file mode 100644 index 26bb8dc..0000000 --- a/stringcheese/pyahocorasick-1.4.0/regression/issue_8.py +++ /dev/null @@ -1,55 +0,0 @@ -# -*- coding: utf-8 -*- - -""" - Aho-Corasick string search algorithm. 
- - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -""" -import ahocorasick - -test_sentences_rus = ["!ASM Print", -"!ASM Print, tyre компания er", -"!ASM Print, рекламно-производственная компания rr", -"!Action Pact!", -"!T.O.O.H.!", -"!YES, лингвистический центр", -"!ts, магазин", -"!ФЕСТ", -'"100-th" department store', -'"1000 мелочей"', -'"1001 мелочь"', -'"19 отряд Федеральной противопожарной службы по Ленинградской области"', -'"У Друзей"', -'"ШТОРЫ и не только..."'] - -test_sentences_pl = [ - "wąż", # a snake - "mąż", # a husband - why so similar :) - "żółć", - "aż", - "waży" -] - -def create_sutomata_rus(): - A = ahocorasick.Automaton() - for sentences in test_sentences_rus[-7:]: - for index, word in enumerate(sentences.split(' ')): - A.add_word(word, (index, word)) - - A.make_automaton() - - -def create_and_iter_sutomata_pl(): - A = ahocorasick.Automaton() - for index, word in enumerate(test_sentences_pl): - A.add_word(word, (index, word)) - - A.make_automaton() - for item in A.iter("wyważyć"): - print(item) - -if __name__ == '__main__': - create_sutomata_rus() - create_and_iter_sutomata_pl() diff --git a/stringcheese/pyahocorasick-1.4.0/regression/issue_9.py b/stringcheese/pyahocorasick-1.4.0/regression/issue_9.py deleted file mode 100644 index 1650b33..0000000 --- a/stringcheese/pyahocorasick-1.4.0/regression/issue_9.py +++ /dev/null @@ -1,57 +0,0 @@ -# -*- coding: utf-8 -*- -""" - Aho-Corasick string search algorithm. 
- - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -""" - -import os -import sys - -import ahocorasick - -ac = ahocorasick.Automaton() -ac.add_word('SSSSS', 1) -ac.make_automaton() - -try: - range = xrange # for Py2 -except NameError: - pass - -def get_memory_usage(): - # Linux only - pid = os.getpid() - - lines = [] - try: - with open('/proc/%d/status' % pid, 'rt') as f: - lines = f.readlines() - except: - pass - - for line in lines: - if line.startswith('VmSize'): - return float(line.split()[1]) - - return 0 - -def test(): - with open('README.rst', 'r') as f: - data = f.read()[:1024 * 2] - - for loop in range(1000): - for start in range(0, len(data) - 20): - ac.iter(data, start) - - -if __name__ == '__main__': - - before = get_memory_usage() - test() - after = get_memory_usage() - - print("Memory's usage growth: %s (before = %s, after = %s)" % (after - before, before, after)) - assert(before == after) diff --git a/stringcheese/pyahocorasick-1.4.0/setup.cfg b/stringcheese/pyahocorasick-1.4.0/setup.cfg deleted file mode 100644 index 60a473b..0000000 --- a/stringcheese/pyahocorasick-1.4.0/setup.cfg +++ /dev/null @@ -1,7 +0,0 @@ -[metadata] -license_file = LICENSE - -[build_sphinx] -source-dir = docs/ -build-dir = docs/_build -all_files = 1 \ No newline at end of file diff --git a/stringcheese/pyahocorasick-1.4.0/setup.py b/stringcheese/pyahocorasick-1.4.0/setup.py deleted file mode 100644 index 7e84043..0000000 --- a/stringcheese/pyahocorasick-1.4.0/setup.py +++ /dev/null @@ -1,126 +0,0 @@ -# -*- coding: utf-8 -*- - -""" - Aho-Corasick string search algorithm. 
- - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : BSD-3-Clause (see LICENSE) -""" - -try: - from setuptools import setup, Extension -except ImportError: - from distutils.core import setup, Extension - -from sys import version_info as python_version - - -def get_long_description(): - """ - Strip the content index from the long description. - """ - import codecs - with codecs.open('README.rst', encoding='UTF-8') as f: - readme = [line for line in f if not line.startswith('.. contents::')] - return ''.join(readme) - - -if python_version.major not in [2, 3]: - raise ValueError('Python %s is not supported' % python_version) - - -if python_version.major == 3: - macros = [ - # when defined unicode strings are supported - # ('AHOCORASICK_UNICODE', ''), # set to bytes only - ] -else: - # On Python 2, unicode strings are not supported (yet). - macros = [] - - -module = Extension( - 'ahocorasick', - sources=[ - 'pyahocorasick.c', - ], - define_macros=macros, - depends=[ - 'common.h', - 'Automaton.c', - 'Automaton.h', - 'Automaton_pickle.c', - 'AutomatonItemsIter.c', - 'AutomatonItemsIter.h', - 'AutomatonSearchIter.c', - 'AutomatonSearchIter.h', - 'AutomatonSearchIterLong.c', - 'AutomatonSearchIterLong.h', - 'trie.c', - 'trie.h', - 'slist.c', - 'utils.c', - 'trienode.c', - 'trienode.h', - 'msinttypes/stdint.h', - 'src/inline_doc.h', - 'src/pickle/pickle.h', - 'src/pickle/pickle_data.h', - 'src/pickle/pickle_data.c', - 'src/custompickle/custompickle.h', - 'src/custompickle/custompickle.c', - 'src/custompickle/pyhelpers.h', - 'src/custompickle/pyhelpers.c', - 'src/custompickle/save/automaton_save.h', - 'src/custompickle/save/automaton_save.c', - 'src/custompickle/save/savebuffer.h', - 'src/custompickle/save/savebuffer.c', - 'src/custompickle/load/module_automaton_load.h', - 'src/custompickle/load/module_automaton_load.c', - 'src/custompickle/load/loadbuffer.h', - 'src/custompickle/load/loadbuffer.c', - 'src/pycallfault/pycallfault.h', - 
'src/pycallfault/pycallfault.c', - ], -) - - -setup( - name='pyahocorasick', - version='1.4.2dev1', - ext_modules=[module], - - description=( - 'pyahocorasick is a fast and memory efficient library for exact or ' - 'approximate multi-pattern string search. With the ahocorasick.Automaton ' - 'class, you can find multiple key strings occurrences at once in some input ' - 'text. You can use it as a plain dict-like Trie or convert a Trie to an ' - 'automaton for efficient Aho-Corasick search. Implemented in C and tested ' - 'on Python 2.7 and 3.4+. Works on Linux, Mac and Windows. BSD-3-clause license.' - ), - author='Wojciech Muła', - author_email='wojciech_mula@poczta.onet.pl', - maintainer='Wojciech Muła', - maintainer_email='wojciech_mula@poczta.onet.pl', - url='http://github.com/WojciechMula/pyahocorasick', - platforms=['Linux', 'MacOSX', 'Windows'], - license=' BSD-3-Clause and Public-Domain', - long_description=get_long_description(), - long_description_content_type="text/x-rst", - keywords=[ - 'aho-corasick', - 'trie', - 'automaton', - 'dictionary', - ], - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'License :: OSI Approved :: BSD License', - 'Programming Language :: C', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 3', - 'Topic :: Software Development :: Libraries', - 'Topic :: Text Editors :: Text Processing', - ], -) diff --git a/stringcheese/pyahocorasick-1.4.0/slist.c b/stringcheese/pyahocorasick-1.4.0/slist.c deleted file mode 100644 index 42aee39..0000000 --- a/stringcheese/pyahocorasick-1.4.0/slist.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - This is part of pyahocorasick Python module. - - Linked list implementation. 
- - Const time of: - * append - * prepend - * pop first - * get first/last - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -*/ -#include "slist.h" - -ListItem* -list_item_new(const size_t size) { - ListItem* item = (ListItem*)memory_alloc(size); - if (item) { - item->__next = 0; - } - - return item; -} - - -void -list_item_delete(ListItem* item) { - memory_free(item); -} - - -void -list_init(List* list) { - if (list) { - list->head = 0; - list->last = 0; - } -} - - -int -list_delete(List* list) { - - ListItem* item; - ListItem* tmp; - - ASSERT(list); - - item = list->head; - while (item) { - tmp = item; - item = item->__next; - memory_free(tmp); - } - - list->head = list->last = NULL; - return 0; -} - - -ListItem* -list_append(List* list, ListItem* item) { - ASSERT(list); - - if (item) { - if (list->last) { - list->last->__next = item; // append - list->last = item; // set as last node - } - else - list->head = list->last = item; - } - - return item; -} - - -ListItem* -list_push_front(List* list, ListItem* item) { - ASSERT(list); - - if (list->head) { - item->__next = list->head; - list->head = item; - } - else - list->head = list->last = item; - - return item; -} - - -ListItem* -list_pop_first(List* list) { - ListItem* item; - - ASSERT(list); - - if (list->head) { - item = list->head; - list->head = item->__next; - - if (!list->head) - list->last = 0; - - return item; - } - else - return NULL; -} - diff --git a/stringcheese/pyahocorasick-1.4.0/slist.h b/stringcheese/pyahocorasick-1.4.0/slist.h deleted file mode 100644 index 60b9b5f..0000000 --- a/stringcheese/pyahocorasick-1.4.0/slist.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - This is part of pyahocorasick Python module. - - Linked list declarations. 
- - Const time of: - * append - * prepend - * pop first - * get first/last - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -*/ -#ifndef ahocorasick_slist_h_included -#define ahocorasick_slist_h_included - -#include "common.h" - -/** base structure for list */ -#define LISTITEM_data struct ListItem* __next - -/** list item node */ -typedef struct ListItem { - LISTITEM_data; -} ListItem; - -/** Create new item */ -ListItem* list_item_new(const size_t size); - -/** Deallocate list item. */ -void list_item_delete(ListItem* item); - -/** Returns pointer to next item */ -#define list_item_next(item) (((ListItem*)(item))->__next) - -/** Set new pointer to next item */ -#define list_item_setnext(item, next) list_item_next(item) = (ListItem*)(next) - - -/** List. - -*/ -typedef struct { - ListItem* head; ///< first node - ListItem* last; ///< last node -} List; - - -/** Initialize list. */ -void list_init(List* list); - -/** Deallocate all elements of list. */ -int list_delete(List* list); - -/** Append item at the end of list. */ -ListItem* list_append(List* list, ListItem* item); - -/** Prepend item at front of list. */ -ListItem* list_push_front(List* list, ListItem* item); - -/** Unlink first item from list. */ -ListItem* list_pop_first(List* list); - -/** Test if list is empty. 
*/ -#define list_empty(list) ((list)->head == NULL) - - -#endif diff --git a/stringcheese/pyahocorasick-1.4.0/src/custompickle/custompickle.c b/stringcheese/pyahocorasick-1.4.0/src/custompickle/custompickle.c deleted file mode 100644 index d086c46..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/custompickle/custompickle.c +++ /dev/null @@ -1,52 +0,0 @@ -#include "custompickle.h" -#include "../../Automaton.h" - - -static const char CUSTOMPICKLE_MAGICK[16] = { - 'p', 'y', 'a', 'h', 'o', 'c', 'o', 'r', 'a', 's', 'i', 'c', 'k', // signature - '0', '0', '2' // format version -}; - - -void custompickle_initialize_header(CustompickleHeader* header, Automaton* automaton) { - - ASSERT(header != NULL); - ASSERT(automaton != NULL); - - memcpy(header->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK)); - header->data.kind = automaton->kind; - header->data.store = automaton->store; - header->data.key_type = automaton->key_type; - header->data.words_count = automaton->count; - header->data.longest_word = automaton->longest_word; -} - - -void custompickle_initialize_footer(CustompickleFooter* footer, size_t nodes_count) { - - ASSERT(footer != NULL); - - memcpy(footer->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK)); - footer->nodes_count = nodes_count; -} - -int custompickle_validate_header(CustompickleHeader* header) { - if (memcmp(header->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK)) != 0) - return false; - - if (!check_store(header->data.store)) - return false; - - if (!check_kind(header->data.kind)) - return false; - - if (!check_key_type(header->data.key_type)) - return false; - - return true; -} - - -int custompickle_validate_footer(CustompickleFooter* footer) { - return (memcmp(footer->magick, CUSTOMPICKLE_MAGICK, sizeof(CUSTOMPICKLE_MAGICK)) == 0); -} diff --git a/stringcheese/pyahocorasick-1.4.0/src/custompickle/custompickle.h b/stringcheese/pyahocorasick-1.4.0/src/custompickle/custompickle.h deleted file mode 100644 index 
da53fcf..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/custompickle/custompickle.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once - -#include "../../Automaton.h" - -typedef struct AutomatonData { - AutomatonKind kind; - KeysStore store; - KeyType key_type; - size_t words_count; - int longest_word; -} AutomatonData; - - -typedef struct CustompickleHeader { - char magick[16]; // CUSTOMPICKLE_MAGICK - AutomatonData data; -} CustompickleHeader; - - -typedef struct CustompickleFooter { - size_t nodes_count; - char magick[16]; // CUSTOMPICKLE_MAGICK -} CustompickleFooter; - - -void custompickle_initialize_header(CustompickleHeader* header, Automaton* automaton); -void custompickle_initialize_footer(CustompickleFooter* footer, size_t nodescount); -int custompickle_validate_header(CustompickleHeader* header); -int custompickle_validate_footer(CustompickleFooter* footer); diff --git a/stringcheese/pyahocorasick-1.4.0/src/custompickle/load/loadbuffer.c b/stringcheese/pyahocorasick-1.4.0/src/custompickle/load/loadbuffer.c deleted file mode 100644 index 6753b52..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/custompickle/load/loadbuffer.c +++ /dev/null @@ -1,152 +0,0 @@ -#include "loadbuffer.h" - - -int -loadbuffer_open(LoadBuffer* input, const char* path, PyObject* deserializer) { - - ASSERT(input != NULL); - ASSERT(path != NULL); - - input->file = NULL; - input->lookup = NULL; - input->size = 0; - input->capacity = 0; - input->deserializer = deserializer; - - input->file = fopen(path, "rb"); - if (UNLIKELY(input->file == NULL)) { - PyErr_SetFromErrno(PyExc_IOError); - return 0; - } - - return 1; -} - -int -loadbuffer_load(LoadBuffer* input, char* buffer, size_t size) { - - size_t read; - - ASSERT(input != NULL); - ASSERT(buffer != NULL); - - if (UNLIKELY(size == 0)) { - PyErr_SetString(PyExc_ValueError, "logic error: tried to read 0 bytes"); - return 0; - } - - read = fread(buffer, 1, size, input->file); - if (read != size) { - PyErr_SetFromErrno(PyExc_IOError); - 
return 0; - } - - return 1; -} - -int -loadbuffer_init(LoadBuffer* input, CustompickleHeader* header, CustompickleFooter* footer) { - - long pos; - int ret; - - ASSERT(input != NULL); - ASSERT(header != NULL); - ASSERT(footer != NULL); - - ret = loadbuffer_loadinto(input, header, CustompickleHeader); - if (UNLIKELY(!ret)) { - return 0; - } - - pos = ftell(input->file); - if (UNLIKELY(pos < 0)) { - PyErr_SetFromErrno(PyExc_IOError); - return 0; - } - - ret = fseek(input->file, -sizeof(CustompickleFooter), SEEK_END); - if (UNLIKELY(ret < 0)) { - PyErr_SetFromErrno(PyExc_IOError); - return 0; - } - - ret = loadbuffer_loadinto(input, footer, CustompickleFooter); - if (UNLIKELY(!ret)) { - return 0; - } - - ret = fseek(input->file, pos, SEEK_SET); - if (UNLIKELY(ret < 0)) { - PyErr_SetFromErrno(PyExc_IOError); - return 0; - } - - if (UNLIKELY(!custompickle_validate_header(header))) { - PyErr_Format(PyExc_ValueError, "invalid header"); - return 0; - } - - if (UNLIKELY(!custompickle_validate_footer(footer))) { - PyErr_Format(PyExc_ValueError, "invalid footer"); - return 0; - } - - input->store = header->data.store; - input->kind = header->data.kind; - input->size = 0; - input->capacity = footer->nodes_count; - input->lookup = (AddressPair*)memory_alloc(sizeof(AddressPair) * input->capacity); - if (UNLIKELY(input->lookup == NULL)) { - PyErr_NoMemory(); - return 0; - } - - return 1; -} - -void -loadbuffer_invalidate(LoadBuffer* input) { - - ASSERT(input != NULL); - - input->size = 0; -} - -void -loadbuffer_close(LoadBuffer* input) { - - TrieNode* node; - size_t i; - - if (input->file != NULL) { - fclose(input->file); - } - - if (input->lookup) { - for (i=0; i < input->size; i++) { - node = input->lookup[i].current; - - if (node->eow && input->store == STORE_ANY) { - Py_DECREF(node->output.object); - } - - trienode_free(node); - } - - memory_free(input->lookup); - } -} - - -void -loadbuffer_dump(LoadBuffer* input, FILE* out) { - - AddressPair* pair; - size_t i; - - for (i=0; 
i < input->size; i++) { - pair = &(input->lookup[i]); - fprintf(out, "%p -> %p\n", pair->original, pair->current); - } -} diff --git a/stringcheese/pyahocorasick-1.4.0/src/custompickle/load/loadbuffer.h b/stringcheese/pyahocorasick-1.4.0/src/custompickle/load/loadbuffer.h deleted file mode 100644 index 15d4ae6..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/custompickle/load/loadbuffer.h +++ /dev/null @@ -1,43 +0,0 @@ -#pragma once - -#include - -#include "../../../trienode.h" -#include "../custompickle.h" - -typedef struct AddressPair { - TrieNode* original; - TrieNode* current; -} AddressPair; - - -typedef struct LoadBuffer { - PyObject* deserializer; - FILE* file; - KeysStore store; - AutomatonKind kind; - AddressPair* lookup; - size_t size; - size_t capacity; -} LoadBuffer; - -int -loadbuffer_open(LoadBuffer* input, const char* path, PyObject* deserializer); - -int -loadbuffer_load(LoadBuffer* input, char* output, size_t size); - -#define loadbuffer_loadinto(input, variable, type) \ - loadbuffer_load(input, (char*)(variable), sizeof(type)) - -int -loadbuffer_init(LoadBuffer* input, CustompickleHeader* header, CustompickleFooter* footer); - -void -loadbuffer_invalidate(LoadBuffer* input); - -void -loadbuffer_close(LoadBuffer* input); - -void -loadbuffer_dump(LoadBuffer* input, FILE* out); diff --git a/stringcheese/pyahocorasick-1.4.0/src/custompickle/load/module_automaton_load.c b/stringcheese/pyahocorasick-1.4.0/src/custompickle/load/module_automaton_load.c deleted file mode 100644 index 60d8a15..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/custompickle/load/module_automaton_load.c +++ /dev/null @@ -1,280 +0,0 @@ -#include "module_automaton_load.h" - -#include "../../../Automaton.h" -#include "loadbuffer.h" - - -// --- public ----------------------------------------------------------- - -static bool -automaton_load_impl(Automaton* automaton, const char* path, PyObject* deserializer); - -PyObject* -module_automaton_load(PyObject* module, PyObject* 
args) { - - SaveLoadParameters params; - Automaton* automaton; - int ret; - - automaton = (Automaton*)automaton_create(); - if (UNLIKELY(automaton == NULL)) { - return NULL; - } - - if (UNLIKELY(!automaton_save_load_parse_args(automaton->store, args, ¶ms))) { - Py_DECREF(automaton); - return NULL; - } - - ret = automaton_load_impl(automaton, PyBytes_AsString(params.path), params.callback); - Py_DECREF(params.path); - - if (LIKELY(ret)) - return (PyObject*)automaton; - else - return NULL; -} - -// ----private ---------------------------------------------------------- - -static bool -automaton_load_node(LoadBuffer* input); - -static TrieNode* -automaton_load_fixup_pointers(LoadBuffer* input); - -static bool -automaton_load_impl(Automaton* automaton, const char* path, PyObject* deserializer) { - - TrieNode* root; - LoadBuffer input; - CustompickleHeader header; - CustompickleFooter footer; - size_t i; - - if (!loadbuffer_open(&input, path, deserializer)) { - return false; - } - - if (!loadbuffer_init(&input, &header, &footer)) { - goto exception; - } - - if (header.data.kind == TRIE || header.data.kind == AHOCORASICK) { - for (i=0; i < input.capacity; i++) { - if (UNLIKELY(!automaton_load_node(&input))) { - goto exception; - } - } - - root = automaton_load_fixup_pointers(&input); - if (UNLIKELY(root == NULL)) { - goto exception; - } - } else if (header.data.kind == EMPTY) { - - root = NULL; - - } else { - PyErr_SetString(PyExc_ValueError, "automaton kind save in file is invalid"); - goto exception; - } - - loadbuffer_close(&input); - - // setup object - automaton->kind = header.data.kind; - automaton->store = header.data.store; - automaton->key_type = header.data.key_type; - automaton->count = header.data.words_count; - automaton->longest_word = header.data.longest_word; - automaton->version = 0; - automaton->stats.version = -1; - automaton->root = root; - - return true; - -exception: - loadbuffer_close(&input); - return false; -} - -static bool 
-automaton_load_node(LoadBuffer* input) { - - PyObject* bytes; // XXX: it might be reused (i.e. be part of input) - PyObject* object; - TrieNode* original; - TrieNode* node; - size_t size; - int ret; - - // 1. get original address of upcoming node - ret = loadbuffer_loadinto(input, &original, TrieNode*); - if (UNLIKELY(!ret)) { - return false; - } - - // 2. load node data - node = (TrieNode*)memory_alloc(sizeof(TrieNode)); - if (UNLIKELY(node == NULL)) { - PyErr_NoMemory(); - return false; - } - - ret = loadbuffer_load(input, (char*)node, PICKLE_TRIENODE_SIZE); - if (UNLIKELY(!ret)) { - memory_free(node); - return false; - } - - node->next = NULL; - - // 3. load next pointers - if (node->n > 0) { - size = sizeof(Pair) * node->n; - node->next = (Pair*)memory_alloc(size); - if (UNLIKELY(node->next == NULL)) { - PyErr_NoMemory(); - goto exception; - } - - ret = loadbuffer_load(input, (char*)(node->next), size); - if (UNLIKELY(!ret)) { - goto exception; - } - } - - // 4. load custom python object - if (node->eow && input->store == STORE_ANY) { - size = (size_t)(node->output.integer); - bytes = F(PyBytes_FromStringAndSize)(NULL, size); - if (UNLIKELY(bytes == NULL)) { - goto exception; - } - - ret = loadbuffer_load(input, PyBytes_AS_STRING(bytes), size); - if (UNLIKELY(!ret)) { - Py_DECREF(bytes); - goto exception; - } - - object = F(PyObject_CallFunction)(input->deserializer, "O", bytes); - if (UNLIKELY(object == NULL)) { - Py_DECREF(bytes); - goto exception; - } - - node->output.object = object; - Py_DECREF(bytes); - } - - input->lookup[input->size].original = original; - input->lookup[input->size].current = node; - input->size += 1; - - return true; - -exception: - memory_safefree(node->next); - memory_free(node); - - return false; -} - - -static int -addresspair_cmp(const void* a, const void *b) { - const TrieNode* Aptr; - const TrieNode* Bptr; - uintptr_t A; - uintptr_t B; - - Aptr = ((AddressPair*)a)->original; - Bptr = ((AddressPair*)b)->original; - - A = 
(uintptr_t)Aptr; - B = (uintptr_t)Bptr; - - if (A < B) { - return -1; - } else if (A > B) { - return +1; - } else { - return 0; - } -} - - -static TrieNode* -lookup_address(LoadBuffer* input, TrieNode* original) { - - AddressPair* pair; - - pair = (AddressPair*)bsearch(&original, - input->lookup, - input->size, - sizeof(AddressPair), - addresspair_cmp); - - if (LIKELY(pair != NULL)) { - return pair->current; - } else { - return NULL; - } -} - - -static bool -automaton_load_fixup_node(LoadBuffer* input, TrieNode* node) { - - size_t i; - - if (input->kind == AHOCORASICK && node->fail != NULL) { - node->fail = lookup_address(input, node->fail); - if (UNLIKELY(node->fail == NULL)) { - return false; - } - } - - if (node->n > 0) { - for (i=0; i < node->n; i++) { - node->next[i].child = lookup_address(input, node->next[i].child); - if (UNLIKELY(node->next[i].child == NULL)) { - return false; - } - } - } - - return true; -} - - -static TrieNode* -automaton_load_fixup_pointers(LoadBuffer* input) { - - TrieNode* root; - TrieNode* node; - size_t i; - - ASSERT(input != NULL); - - // 1. root is the first node stored in the array - root = input->lookup[0].current; - - // 2. sort array to make it bsearch-able - qsort(input->lookup, input->size, sizeof(AddressPair), addresspair_cmp); - - // 3. 
convert all next and fail pointers to current pointers - for (i=0; i < input->size; i++) { - node = input->lookup[i].current; - if (UNLIKELY(!automaton_load_fixup_node(input, node))) { - PyErr_Format(PyExc_ValueError, "Detected malformed pointer during unpickling node %lu", i); - return NULL; - } - } - - loadbuffer_invalidate(input); - - return root; -} diff --git a/stringcheese/pyahocorasick-1.4.0/src/custompickle/load/module_automaton_load.h b/stringcheese/pyahocorasick-1.4.0/src/custompickle/load/module_automaton_load.h deleted file mode 100644 index 2d7bd0c..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/custompickle/load/module_automaton_load.h +++ /dev/null @@ -1,7 +0,0 @@ -#pragma once - -#define module_automaton_load_doc \ - "Load automaton from a file" - -PyObject* -module_automaton_load(PyObject* module, PyObject* args); diff --git a/stringcheese/pyahocorasick-1.4.0/src/custompickle/pyhelpers.c b/stringcheese/pyahocorasick-1.4.0/src/custompickle/pyhelpers.c deleted file mode 100644 index 32a74fd..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/custompickle/pyhelpers.c +++ /dev/null @@ -1,61 +0,0 @@ -#include "pyhelpers.h" - -bool -automaton_save_load_parse_args(KeysStore store, PyObject* args, SaveLoadParameters* result) { - - PyObject* string; - - if (store == STORE_ANY) { - if (PyTuple_GET_SIZE(args) != 2) { - PyErr_SetString(PyExc_ValueError, "expected exactly two arguments"); - return false; - } - } else { - if (PyTuple_GET_SIZE(args) != 1) { - PyErr_SetString(PyExc_ValueError, "expected exactly one argument"); - return false; - } - } - - string = F(PyTuple_GetItem)(args, 0); - if (UNLIKELY(string == NULL)) { - return false; - } - -#if defined(PY3K) - if (UNLIKELY(!F(PyUnicode_Check)(string))) { - PyErr_SetString(PyExc_TypeError, "the first argument must be a string"); - return false; - } -#else - if (UNLIKELY(!F(PyString_Check)(string))) { - PyErr_SetString(PyExc_TypeError, "the first argument must be a string"); - return false; - } -#endif - 
- if (store == STORE_ANY) { - result->callback = F(PyTuple_GetItem)(args, 1); - if (UNLIKELY(result->callback == NULL)) { - return false; - } - - if (UNLIKELY(!F(PyCallable_Check)(result->callback))) { - PyErr_SetString(PyExc_TypeError, "the second argument must be a callable object"); - return false; - } - } - -#if defined(PY3K) - result->path = F(PyUnicode_AsUTF8String)(string); -#else - result->path = string; - Py_INCREF(string); -#endif - if (UNLIKELY(result->path == NULL)) { - return false; - } - - return true; -} - diff --git a/stringcheese/pyahocorasick-1.4.0/src/custompickle/pyhelpers.h b/stringcheese/pyahocorasick-1.4.0/src/custompickle/pyhelpers.h deleted file mode 100644 index d79ac8b..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/custompickle/pyhelpers.h +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -typedef struct SaveLoadParameters { - PyObject* path; - PyObject* callback; -} SaveLoadParameters; - -bool -automaton_save_load_parse_args(KeysStore store, PyObject* args, SaveLoadParameters* result); - diff --git a/stringcheese/pyahocorasick-1.4.0/src/custompickle/save/automaton_save.c b/stringcheese/pyahocorasick-1.4.0/src/custompickle/save/automaton_save.c deleted file mode 100644 index eaa6a49..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/custompickle/save/automaton_save.c +++ /dev/null @@ -1,138 +0,0 @@ -#include "automaton_save.h" - -#include "../custompickle.h" -#include "../pyhelpers.h" -#include "savebuffer.h" - - -// --- public ----------------------------------------------------------- - -static bool -automaton_save_impl(Automaton* automaton, const char* path, PyObject* serializer); - -PyObject* -automaton_save(PyObject* self, PyObject* args) { - - SaveLoadParameters params; - Automaton* automaton; - int ret; - - automaton = (Automaton*)self; - - if (UNLIKELY(!automaton_save_load_parse_args(automaton->store, args, ¶ms))) { - return NULL; - } - - ret = automaton_save_impl(automaton, PyBytes_AsString(params.path), params.callback); - 
Py_DECREF(params.path); - - if (LIKELY(ret)) - Py_RETURN_NONE; - else - return NULL; -} - -// --- private ---------------------------------------------------------- - -static int -automaton_save_node(TrieNode* node, const int depth, void* extra); - -static bool -automaton_save_impl(Automaton* automaton, const char* path, PyObject* serializer) { - - CustompickleHeader header; - CustompickleFooter footer; - SaveBuffer output; - int ret; - - ret = savebuffer_init(&output, - serializer, - automaton->store, - path, - SAVEBUFFER_DEFAULT_SIZE); - if (!ret) - return false; - - custompickle_initialize_header(&header, automaton); - - // 1. save header - savebuffer_store(&output, (const char*)&header, sizeof(header)); - - // 2. save nodes - if (automaton->kind != EMPTY) { - trie_traverse(automaton->root, automaton_save_node, &output); - if (UNLIKELY(PyErr_Occurred() != NULL)) { - goto exception; - } - } - - // 3. save footer - custompickle_initialize_footer(&footer, output.nodes_count); - savebuffer_store(&output, (const char*)&footer, sizeof(footer)); - - savebuffer_finalize(&output); - - return true; - -exception: - savebuffer_finalize(&output); - - return false; -} - - -static int -automaton_save_node(TrieNode* node, const int depth, void* extra) { - - SaveBuffer* output; - TrieNode* dump; - PyObject* bytes; - - output = (SaveBuffer*)extra; - - // 1. save actual address of node - savebuffer_store_pointer(output, (void*)node); - - // 2. obtain buffer - dump = (TrieNode*)savebuffer_acquire(output, PICKLE_TRIENODE_SIZE); - - if (output->store != STORE_ANY) - dump->output.integer = node->output.integer; - - dump->n = node->n; - dump->eow = node->eow; - dump->fail = node->fail; - - // 3. 
pickle python value associated with word - if (node->eow && output->store == STORE_ANY) { - bytes = F(PyObject_CallFunctionObjArgs)(output->serializer, node->output.object, NULL); - if (UNLIKELY(bytes == NULL)) { - return 0; - } - - if (UNLIKELY(!F(PyBytes_CheckExact)(bytes))) { - PyErr_SetString(PyExc_TypeError, "serializer must return bytes object"); - return 0; - } - - // store the size of buffer in trie node [which is not saved yet in the file] - *(size_t*)(&dump->output.integer) = PyBytes_GET_SIZE(bytes); - } else { - bytes = NULL; - } - - // 4. save array of pointers - if (node->n > 0) { - savebuffer_store(output, (const char*)node->next, node->n * sizeof(Pair)); - } - - // 5. save pickled data, if any - if (bytes) { - savebuffer_store(output, PyBytes_AS_STRING(bytes), PyBytes_GET_SIZE(bytes)); - Py_DECREF(bytes); - } - - output->nodes_count += 1; - - return 1; -} diff --git a/stringcheese/pyahocorasick-1.4.0/src/custompickle/save/automaton_save.h b/stringcheese/pyahocorasick-1.4.0/src/custompickle/save/automaton_save.h deleted file mode 100644 index 1213403..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/custompickle/save/automaton_save.h +++ /dev/null @@ -1,7 +0,0 @@ -#pragma once - -#include "../../../common.h" - -PyObject* -automaton_save(PyObject* self, PyObject* args); - diff --git a/stringcheese/pyahocorasick-1.4.0/src/custompickle/save/savebuffer.c b/stringcheese/pyahocorasick-1.4.0/src/custompickle/save/savebuffer.c deleted file mode 100644 index e79a098..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/custompickle/save/savebuffer.c +++ /dev/null @@ -1,114 +0,0 @@ -#include "savebuffer.h" - -bool -savebuffer_init(SaveBuffer* output, PyObject* serializer, KeysStore store, const char* path, size_t capacity) { - - output->store = store; - output->file = NULL; - output->buffer = NULL; - output->size = 0; - output->capacity = capacity; - output->serializer = serializer; - output->nodes_count = 0; - - if (PICKLE_SIZE_T_SIZE < sizeof(PyObject*)) { - 
// XXX: this must be reworked, likely moved to module level - PyErr_SetString(PyExc_SystemError, "unable to save data due to technical reasons"); - return false; - } - - if (UNLIKELY(store == STORE_ANY && serializer == NULL)) { - PyErr_SetString(PyExc_ValueError, "for automatons with STORE_ANY serializer must be given"); - return false; - } - - output->buffer = (char*)memory_alloc(capacity); - if (UNLIKELY(output->buffer == NULL)) { - PyErr_NoMemory(); - return false; - } - - output->file = fopen(path, "wb"); - if (output->file == NULL) { - memory_free(output->buffer); - output->buffer = NULL; - PyErr_SetFromErrno(PyExc_IOError); - return false; - } - - return true; -} - - -void -savebuffer_flush(SaveBuffer* output) { - if (output->size != fwrite(output->buffer, 1, output->size, output->file)) { - PyErr_SetFromErrno(PyExc_IOError); - } - - output->size = 0; -} - - -char* -savebuffer_acquire(SaveBuffer* output, size_t request) { - - char* ptr; - - if (UNLIKELY(request > output->capacity)) { - return NULL; - } - - if (UNLIKELY(output->size + request > output->capacity)) { - savebuffer_flush(output); - } - - ptr = output->buffer + output->size; - output->size += request; - - return ptr; -} - - -void -savebuffer_store(SaveBuffer* output, const char* data, size_t size) { - - if (UNLIKELY(size > output->capacity)) { - savebuffer_flush(output); - if (fwrite(data, 1, size, output->file) != size) { - PyErr_SetFromErrno(PyExc_IOError); - } - return; - } - - if (UNLIKELY(output->size + size >= output->capacity)) { - savebuffer_flush(output); - } - - memcpy(output->buffer + output->size, data, size); - output->size += size; -} - - -void -savebuffer_store_pointer(SaveBuffer* save, void* ptr) { - char* buf; - - buf = savebuffer_acquire(save, sizeof(void*)); - *((void**)buf) = ptr; -} - - -void -savebuffer_finalize(SaveBuffer* output) { - - if (output->buffer != NULL && output->file != NULL && output->size > 0) { - savebuffer_flush(output); - } - - 
memory_safefree(output->buffer); - - if (output->file != NULL) { - fclose(output->file); - } -} diff --git a/stringcheese/pyahocorasick-1.4.0/src/custompickle/save/savebuffer.h b/stringcheese/pyahocorasick-1.4.0/src/custompickle/save/savebuffer.h deleted file mode 100644 index 4d1ce6d..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/custompickle/save/savebuffer.h +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once - -#include "../../../Automaton.h" - -#define SAVEBUFFER_DEFAULT_SIZE (32 * 1024lu) - -typedef struct SaveBuffer { - KeysStore store; - FILE* file; - char* buffer; - size_t size; - size_t capacity; - - PyObject* serializer; - size_t nodes_count; ///< the total number of stored nodes -} SaveBuffer; - -bool -savebuffer_init(SaveBuffer* save, PyObject* serializer, KeysStore store, const char* path, size_t capacity); - -void -savebuffer_flush(SaveBuffer* save); - -char* -savebuffer_acquire(SaveBuffer* save, size_t request); - -void -savebuffer_store(SaveBuffer* save, const char* data, size_t size); - -void -savebuffer_store_pointer(SaveBuffer* save, void* ptr); - -void -savebuffer_finalize(SaveBuffer* save); diff --git a/stringcheese/pyahocorasick-1.4.0/src/inline_doc.h b/stringcheese/pyahocorasick-1.4.0/src/inline_doc.h deleted file mode 100644 index 8f4c5c8..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/inline_doc.h +++ /dev/null @@ -1,282 +0,0 @@ -#pragma once -// DO NOT EDIT. File generated by script update_inlinedoc.py. - -#define automaton___reduce___doc \ - "__reduce__()\n" \ - "\n" \ - "Return pickle-able data for this automaton instance." - -#define automaton___sizeof___doc \ - "Return the approximate size in bytes occupied by the\n" \ - "Automaton instance in memory excluding the size of\n" \ - "associated objects when the Automaton is created with\n" \ - "Automaton() or Automaton(ahocorasick.STORE_ANY)." 
- -#define automaton_add_word_doc \ - "add_word(key, [value]) -> boolean\n" \ - "\n" \ - "Add a key string to the dict-like trie and associate this\n" \ - "key with a value. value is optional or mandatory depending\n" \ - "how the Automaton instance was created. Return True if the\n" \ - "word key is inserted and did not exists in the trie or False\n" \ - "otherwise. The value associated with an existing word is\n" \ - "replaced.\n" \ - "\n" \ - "The value is either mandatory or optional:\n" \ - "- If the Automaton was created without argument (the\n" \ - " default) as Automaton() or with\n" \ - " Automaton(ahocorasik.STORE_ANY) then the value is required\n" \ - " and can be any Python object.\n" \ - "- If the Automaton was created with\n" \ - " Automaton(ahocorasik.STORE_INTS) then the value is\n" \ - " optional. If provided it must be an integer, otherwise it\n" \ - " defaults to len(automaton) which is therefore the order\n" \ - " index in which keys are added to the trie.\n" \ - "- If the Automaton was created with\n" \ - " Automaton(ahocorasik.STORE_LENGTH) then associating a\n" \ - " value is not allowed - len(word) is saved automatically as\n" \ - " a value instead.\n" \ - "\n" \ - "Calling add_word() invalidates all iterators only if the new\n" \ - "key did not exist in the trie so far (i.e. the method\n" \ - "returned True)." - -#define automaton_clear_doc \ - "clear()\n" \ - "\n" \ - "Remove all keys from the trie. This method invalidates all\n" \ - "iterators." - -#define automaton_constructor_doc \ - "Automaton(value_type=ahocorasick.STORE_ANY, [key_type])\n" \ - "\n" \ - "Create a new empty Automaton. 
Both value_type and key_type\n" \ - "are optional.\n" \ - "\n" \ - "value_type is one of these constants:\n" \ - "- ahocorasick.STORE_ANY [default] : The associated value can\n" \ - " be any Python object.\n" \ - "- ahocorasick.STORE_LENGTH : The length of an added string\n" \ - " key is automatically used as the associated value stored\n" \ - " in the trie for that key.\n" \ - "- ahocorasick.STORE_INTS : The associated value must be a\n" \ - " 32-bit integer.\n" \ - "\n" \ - "key_type defines the type of data that can be stored in an\n" \ - "automaton; it is one of these constants and defines type of\n" \ - "data might be stored:\n" \ - "- ahocorasick.KEY_STRING [default] : string\n" \ - "- ahocorasick.KEY_SEQUENCE : sequences of integers; The size\n" \ - " of integer depends the version and platform Python, but\n" \ - " for versions of Python >= 3.3, it is guaranteed to be\n" \ - " 32-bits." - -#define automaton_dump_doc \ - "dump()\n" \ - "\n" \ - "Return a three-tuple of lists describing the Automaton as a\n" \ - "graph of nodes, edges, failure links.\n" \ - "- nodes: each item is a pair (node id, end of word marker)\n" \ - "- edges: each item is a triple (node id, label char, child\n" \ - " node id)\n" \ - "- failure links: each item is a pair (source node id, node\n" \ - " if connected by fail node)\n" \ - "\n" \ - "For each of these, the node id is a unique number and a\n" \ - "label is a number." - -#define automaton_exists_doc \ - "exists(key) -> boolean\n" \ - "\n" \ - "Return True if the key is present in the trie. Same as using\n" \ - "the 'in' keyword." - -#define automaton_find_all_doc \ - "find_all(string, callback, [start, [end]])\n" \ - "\n" \ - "Perform the Aho-Corasick search procedure using the provided\n" \ - "input string and iterate over the matching tuples\n" \ - "(end_index, value) for keys found in string. 
Invoke the\n" \ - "callback callable for each matching tuple.\n" \ - "\n" \ - "The callback callable must accept two positional arguments:\n" \ - "- end_index is the end index in the input string where a\n" \ - "trie key string was found. - value is the value associated\n" \ - "with the found key string.\n" \ - "\n" \ - "The start and end optional arguments can be used to limit\n" \ - "the search to an input string slice as in string[start:end].\n" \ - "\n" \ - "Equivalent to a loop on iter() calling a callable at each\n" \ - "iteration." - -#define automaton_get_doc \ - "get(key[, default])\n" \ - "\n" \ - "Return the value associated with the key string.\n" \ - "\n" \ - "Raise a KeyError exception if the key is not in the trie and\n" \ - "no default is provided.\n" \ - "\n" \ - "Return the optional default value if provided and the key is\n" \ - "not in the trie." - -#define automaton_get_stats_doc \ - "get_stats() -> dict\n" \ - "\n" \ - "Return a dictionary containing Automaton statistics.\n" \ - "- nodes_count - total number of nodes\n" \ - "- words_count - number of distinct words (same as\n" \ - " len(automaton))\n" \ - "- longest_word - length of the longest word\n" \ - "- links_count - number of edges\n" \ - "- sizeof_node - size of single node in bytes\n" \ - "- total_size - total size of trie in bytes (about\n" \ - " nodes_count * size_of node + links_count * size of\n" \ - " pointer)." - -#define automaton_items_doc \ - "items([prefix, [wildcard, [how]]])\n" \ - "\n" \ - "Return an iterator on tuples of (key, value). Keys are\n" \ - "matched optionally to the prefix using the same logic and\n" \ - "arguments as in the keys() method." 
- -#define automaton_iter_doc \ - "iter(string, [start, [end]], ignore_white_space=False)\n" \ - "\n" \ - "Perform the Aho-Corasick search procedure using the provided\n" \ - "input string.\n" \ - "\n" \ - "Return an iterator of tuples (end_index, value) for keys\n" \ - "found in string where:\n" \ - "- end_index is the end index in the input string where a\n" \ - " trie key string was found.\n" \ - "- value is the value associated with the found key string.\n" \ - "\n" \ - "The start and end optional arguments can be used to limit\n" \ - "the search to an input string slice as in string[start:end].\n" \ - "\n" \ - "The ignore_white_space optional arguments can be used to\n" \ - "ignore white spaces from input string." - -#define automaton_iter_long_doc \ - "iter_long(string, [start, [end]])\n" \ - "\n" \ - "Perform the modified Aho-Corasick search procedure which\n" \ - "matches the longest words from set.\n" \ - "\n" \ - "Return an iterator of tuples (end_index, value) for keys\n" \ - "found in string where:\n" \ - "- end_index is the end index in the input string where a\n" \ - " trie key string was found.\n" \ - "- value is the value associated with the found key string.\n" \ - "\n" \ - "The start and end optional arguments can be used to limit\n" \ - "the search to an input string slice as in string[start:end]." - -#define automaton_keys_doc \ - "keys([prefix, [wildcard, [how]]])\n" \ - "\n" \ - "Return an iterator on keys. 
If the optional prefix string is\n" \ - "provided, only yield keys starting with this prefix.\n" \ - "\n" \ - "If the optional wildcard is provided as a single character\n" \ - "string, then the prefix is treated as a simple pattern using\n" \ - "this character as a wildcard.\n" \ - "\n" \ - "The optional how argument is used to control how strings are\n" \ - "matched using one of these possible values:\n" \ - "- ahocorasick.MATCH_EXACT_LENGTH (default) Yield matches\n" \ - " that have the same exact length as the prefix length.\n" \ - "- ahocorasick.MATCH_AT_LEAST_PREFIX Yield matches that have\n" \ - " a length greater or equal to the prefix length.\n" \ - "- ahocorasick.MATCH_AT_MOST_PREFIX Yield matches that have a\n" \ - " length lesser or equal to the prefix length." - -#define automaton_len_doc \ - "len() -> integer\n" \ - "\n" \ - "Return the number of distinct keys added to the trie." - -#define automaton_longest_prefix_doc \ - "longest_prefix(string) => integer\n" \ - "\n" \ - "Return the length of the longest prefix of string that\n" \ - "exists in the trie." - -#define automaton_make_automaton_doc \ - "make_automaton()\n" \ - "\n" \ - "Finalize and create the Aho-Corasick automaton based on the\n" \ - "keys already added to the trie. This does not require\n" \ - "additional memory. After successful creation the\n" \ - "Automaton.kind attribute is set to ahocorasick.AHOCORASICK." - -#define automaton_match_doc \ - "match(key) -> bool\n" \ - "\n" \ - "Return True if there is a prefix (or key) equal to key\n" \ - "present in the trie.\n" \ - "\n" \ - "For example if the key 'example' has been added to the trie,\n" \ - "then calls to match('e'), match('ex'), ..., match('exampl')\n" \ - "or match('example') all return True. But exists() is True\n" \ - "only when calling exists('example')." 
- -#define automaton_pop_doc \ - "pop(word)\n" \ - "\n" \ - "Remove given word from a trie and return associated values.\n" \ - "Raise a KeyError if the word was not found." - -#define automaton_remove_word_doc \ - "remove_word(word) -> bool\n" \ - "\n" \ - "Remove given word from a trie. Return True if words was\n" \ - "found, False otherwise." - -#define automaton_save_doc \ - "save(path, serializer)\n" \ - "\n" \ - "Save content of automaton in an on-disc file.\n" \ - "\n" \ - "Serializer is a callable object that is used when automaton\n" \ - "store type is STORE_ANY. This method converts a python\n" \ - "object into bytes; it can be pickle.dumps." - -#define automaton_search_iter_doc \ - "This class is not available directly but instances of\n" \ - "AutomatonSearchIter are returned by the iter() method of an\n" \ - "Automaton. This iterator can be manipulated through its\n" \ - "set() method." - -#define automaton_search_iter_set_doc \ - "set(string, reset=False)\n" \ - "\n" \ - "Set a new string to search. When the reset argument is False\n" \ - "(default) then the Aho-Corasick procedure is continued and\n" \ - "the internal state of the Automaton and end index of the\n" \ - "string being searched are not reset. This allow to search\n" \ - "for large strings in multiple smaller chunks." - -#define automaton_values_doc \ - "values([prefix, [wildcard, [how]]])\n" \ - "\n" \ - "Return an iterator on values associated with each keys. Keys\n" \ - "are matched optionally to the prefix using the same logic\n" \ - "and arguments as in the keys() method." - -#define module_doc \ - "pyahocorasick is a fast and memory efficient library for\n" \ - "exact or approximate multi-pattern string search meaning\n" \ - "that you can find multiple key strings occurrences at once\n" \ - "in some input text." 
- -#define module_load_doc \ - "load(path, deserializer) => Automaton\n" \ - "\n" \ - "Load automaton previously stored on disc using save method.\n" \ - "\n" \ - "Deserializer is a callable object which converts bytes back\n" \ - "into python object; it can be pickle.loads." diff --git a/stringcheese/pyahocorasick-1.4.0/src/pickle/pickle.h b/stringcheese/pyahocorasick-1.4.0/src/pickle/pickle.h deleted file mode 100644 index 7a520d4..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/pickle/pickle.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -#include "../../trienode.h" - -// We save all TrieNode's fields except the last one, which is a pointer to array, -// as we're store that array just after the node -#define PICKLE_TRIENODE_SIZE (sizeof(TrieNode) - sizeof(Pair*)) -#define PICKLE_SIZE_T_SIZE (sizeof(size_t)) -#define PICKLE_CHUNK_COUNTER_SIZE (sizeof(Py_ssize_t)) diff --git a/stringcheese/pyahocorasick-1.4.0/src/pickle/pickle_data.c b/stringcheese/pyahocorasick-1.4.0/src/pickle/pickle_data.c deleted file mode 100644 index f0e1308..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/pickle/pickle_data.c +++ /dev/null @@ -1,126 +0,0 @@ -#include "pickle.h" -#include "pickle_data.h" - - -static void -pickle_data__init_default(PickleData* data) { - ASSERT(data != NULL); - - data->bytes_list = NULL; - data->chunked = false; - data->size = 0; - data->data = NULL; - data->count = NULL; - data->top = 0; - data->values = 0; - data->error = false; -} - - -static void -pickle_data__cleanup(PickleData* data) { - ASSERT(data != NULL); - - Py_XDECREF(data->bytes_list); - Py_XDECREF(data->values); -} - - -static bool -pickle_data__add_next_buffer(PickleData* data) { - - PyObject* bytes; - void* raw; - - ASSERT(data != NULL); - - bytes = F(PyBytes_FromStringAndSize)(NULL, data->size); - if (UNLIKELY(bytes == NULL)) { - return false; - } - - if (UNLIKELY(F(PyList_Append)(data->bytes_list, bytes) < 0)) { - Py_DECREF(bytes); - return false; - } - - raw = 
PyBytes_AS_STRING(bytes); - - data->count = (Py_ssize_t*)raw; - (*data->count) = 0; - - data->data = (uint8_t*)raw; - data->top = PICKLE_CHUNK_COUNTER_SIZE; - - return true; -} - - -static bool -pickle_data__shrink_last_buffer(PickleData* data) { - - PyObject* bytes; - PyObject* new; - Py_ssize_t last_idx; - - ASSERT(data != NULL); - - if (data->top >= data->size) { - return true; - } - - ASSERT(data->bytes_list); - - last_idx = PyList_GET_SIZE(data->bytes_list) - 1; - - bytes = F(PyList_GetItem)(data->bytes_list, last_idx); - if (UNLIKELY(bytes == NULL)) { - return false; - } - - new = F(PyBytes_FromStringAndSize)(PyBytes_AS_STRING(bytes), data->top); - if (UNLIKELY(new == NULL)) { - return false; - } - - if (F(PyList_SetItem)(data->bytes_list, last_idx, new) < 0) { - return false; - } - - return true; -} - - -static int -pickle_data__init(PickleData* data, KeysStore store, size_t total_size, size_t max_array_size) { - - pickle_data__init_default(data); - - ASSERT(total_size > 0); - ASSERT(max_array_size > PICKLE_TRIENODE_SIZE * 1024); - - data->bytes_list = F(PyList_New)(0); - if (UNLIKELY(data->bytes_list == NULL)) { - return false; - } - - if (store == STORE_ANY) { - data->values = F(PyList_New)(0); - if (UNLIKELY(data->values == NULL)) { - Py_DECREF(data->bytes_list); - return false; - } - } - - if (total_size <= max_array_size) { - data->size = total_size + PICKLE_CHUNK_COUNTER_SIZE; - data->chunked = false; - } else { - // TODO: more heuristic here: what if total_size > 100MB? what if > 1GB, > 10GB? 
- data->size = max_array_size; - data->chunked = true; - } - - return pickle_data__add_next_buffer(data); -} - diff --git a/stringcheese/pyahocorasick-1.4.0/src/pickle/pickle_data.h b/stringcheese/pyahocorasick-1.4.0/src/pickle/pickle_data.h deleted file mode 100644 index 42398ed..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/pickle/pickle_data.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once - -typedef struct PickleData { - PyObject* bytes_list; ///< PyList of PyBytes - bool chunked; ///< bytes_list has more than one element - size_t size; ///< size of single array - uint8_t* data; ///< current array - Py_ssize_t* count; ///< ptr to number of nodes stored in the current array - size_t top; ///< first free address in the current array - - PyObject* values; ///< a list (if store == STORE_ANY) - bool error; ///< error occurred during pickling -} PickleData; - - -static void -pickle_data__init_default(PickleData* data); - -static void -pickle_data__cleanup(PickleData* data); - -static bool -pickle_data__add_next_buffer(PickleData* data); - -static bool -pickle_data__shrink_last_buffer(PickleData* data); - -static int -pickle_data__init(PickleData* data, KeysStore store, size_t total_size, size_t max_array_size); diff --git a/stringcheese/pyahocorasick-1.4.0/src/pycallfault/pycallfault.c b/stringcheese/pyahocorasick-1.4.0/src/pycallfault/pycallfault.c deleted file mode 100644 index c137afb..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/pycallfault/pycallfault.c +++ /dev/null @@ -1,49 +0,0 @@ -#include "pycallfault.h" - -#include - - -static int pycall = -1; -static int pycall_fail = -1; -static int pycall_trap = 0; - -void initialize_pycallfault(void) { - const char* fail = getenv("PYCALL_FAIL"); - const char* trap = getenv("PYCALL_TRAP"); - - if (fail != NULL) { - pycall_fail = atoi(fail); - } - - if (trap != NULL) { - pycall_trap = 1; - } -} - - -int check(void) { - pycall += 1; - printf("Fail ID: %d\n", pycall); - - if (pycall == pycall_fail) { - if 
(pycall_trap) { - __builtin_trap(); - } - - printf("Failed pycall #%d\n", pycall); - return 1; - } - - return 0; -} - - -int check_and_set_error(void) { - if (check()) { - PyErr_NoMemory(); - return 1; - } - - return 0; -} - diff --git a/stringcheese/pyahocorasick-1.4.0/src/pycallfault/pycallfault.h b/stringcheese/pyahocorasick-1.4.0/src/pycallfault/pycallfault.h deleted file mode 100644 index 1a9901a..0000000 --- a/stringcheese/pyahocorasick-1.4.0/src/pycallfault/pycallfault.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef PYCALLFAULT_H_ -#define PYCALLFAULT_H_ - -#define F(name) name##_custom - -void initialize_pycallfault(void); - -// --- python function wrappers ----------------------------------------- - -int check(void); -int check_and_set_error(void); - -#define PyObject_New_custom(...) (check_and_set_error() ? NULL : PyObject_New(__VA_ARGS__)) - -#define PyArg_ParseTuple_custom(...) (check() ? 0 : PyArg_ParseTuple(__VA_ARGS__)) - -#define PyTuple_GetItem_custom(...) (check_and_set_error() ? NULL : PyTuple_GetItem(__VA_ARGS__)) - -#define PyList_New_custom(arg) (check_and_set_error() ? NULL : PyList_New(arg)) - -#define PyList_GetItem_custom(...) (check_and_set_error() ? NULL : PyList_GetItem(__VA_ARGS__)) - -#define PyList_SetItem_custom(...) (check_and_set_error() ? -1 : PyList_SetItem(__VA_ARGS__)) - -#define PyList_Append_custom(...) (check_and_set_error() ? -1 : PyList_Append(__VA_ARGS__)) - -#define PyNumber_AsSsize_t_custom(...) (check_and_set_error() ? -1 : PyNumber_AsSsize_t(__VA_ARGS__)) - -#define Py_BuildValue_custom(...) (check_and_set_error() ? NULL : Py_BuildValue(__VA_ARGS__)) - -#define PyCallable_Check_custom(arg) (check() ? 0 : PyCallable_Check(arg)) - -#define PyString_Check_custom(arg) (check() ? 0 : PyString_Check(arg)) - -#define PyUnicode_Check_custom(arg) (check() ? 0 : PyUnicode_Check(arg)) - -#define PyBytes_Check_custom(arg) (check() ? 0 : PyBytes_Check(arg)) - -#define PyBytes_CheckExact_custom(arg) (check() ? 
0 : PyBytes_CheckExact(arg)) - -#define PyNumber_Check_custom(arg) (check() ? 0 : PyNumber_Check(arg)) - -#define PyTuple_Check_custom(arg) (check() ? 0 : PyTuple_Check(arg)) - -#define PyObject_CallFunction_custom(...) (check_and_set_error() ? NULL : PyObject_CallFunction(__VA_ARGS__)) - -#define PyObject_CallFunctionObjArgs_custom(...) (check_and_set_error() ? NULL : PyObject_CallFunctionObjArgs(__VA_ARGS__)) - -#define PyArg_ParseTupleAndKeywords_custom(...) (check_and_set_error() ? 0 : PyArg_ParseTupleAndKeywords(__VA_ARGS__)) - -#define PyNumber_Index_custom(arg) (check_and_set_error() ? NULL : PyNumber_Index(arg)) - -#define PyUnicode_FromKindAndData_custom(...) (check_and_set_error() ? NULL : PyUnicode_FromKindAndData(__VA_ARGS__)) - -#define PyUnicode_AsUTF8String_custom(...) (check_and_set_error() ? NULL : PyUnicode_AsUTF8String(__VA_ARGS__)) - -#define PyBytes_FromStringAndSize_custom(...) (check_and_set_error() ? NULL : PyBytes_FromStringAndSize(__VA_ARGS__)) - -#endif // PYCALLFAULT_H_ diff --git a/stringcheese/pyahocorasick-1.4.0/stamp/.gitignore b/stringcheese/pyahocorasick-1.4.0/stamp/.gitignore deleted file mode 100644 index 92df83e..0000000 --- a/stringcheese/pyahocorasick-1.4.0/stamp/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*_py? diff --git a/stringcheese/pyahocorasick-1.4.0/test.py b/stringcheese/pyahocorasick-1.4.0/test.py deleted file mode 100644 index 7b608aa..0000000 --- a/stringcheese/pyahocorasick-1.4.0/test.py +++ /dev/null @@ -1,55 +0,0 @@ -# -*- coding: utf-8 -*- - -""" - Aho-Corasick string search algorithm. 
- - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -""" - -import ahocorasick -import sys - -print(dir(ahocorasick)) - - -def is_python_2(): - return sys.version_info.major == 2 - - -a = ahocorasick.Automaton() -words = b"he e hers his she hi him man he" -if not is_python_2(): - words = words.decode('utf8') -for i,w in enumerate(words.split()): - a.add_word(w, (i, w)) - -#print(len(a), len(set(words))) -#print(a.get(b"hea", 1)) - -""" -for w in a.keys(): - print(w) - -for w in a.values(): - print(w) - -for w in a.items(): - print(w) -""" - -s = b"he rshershidamanza " -if not is_python_2(): - s = s.decode('utf8') - -a.make_automaton() -for item in a.iter(s, 2, 8): - print(item) - -print("==") - -def callback(index, item): - print(index, item) - -a.find_all(s, callback, 2, 11) diff --git a/stringcheese/pyahocorasick-1.4.0/tests/generate_random_words.py b/stringcheese/pyahocorasick-1.4.0/tests/generate_random_words.py deleted file mode 100644 index 541d9a6..0000000 --- a/stringcheese/pyahocorasick-1.4.0/tests/generate_random_words.py +++ /dev/null @@ -1,72 +0,0 @@ -import sys -import os -import random -import gzip -import pickle -import optparse -import time - -from optparse import OptionParser - - -def main(): - options = parse_args() - app = TestApplication(options) - app.run() - - -chars = 'abcdefghijklmnopqestuvwxyzABCDEFGHIJKLMNOPQESTUVWXYZ0123456789.,;:-' - -class TestApplication(object): - def __init__(self, options): - self.options = options - - random.seed(options.seed) - - - def run(self): - n = self.options.words - - for i in range(n): - print(self.generate_random_word()) - - - def generate_random_word(self): - n = random.randint(1, self.options.maxlength + 1) - s = '' - for i in range(n): - s += random.choice(chars) - - return s - - -def parse_args(): - - parser = OptionParser() - parser.add_option( - "--max-words", dest='words', type=int, default=50000, metavar='N', - help="maximum number of words 
generated/loaded" - ) - - parser.add_option( - "--random", dest='random', action='store_true', default=False, - help="generate random words" - ) - - parser.add_option( - "--seed", dest='seed', type=int, default=0, metavar='INT', - help="random seed" - ) - - parser.add_option( - "--random-max-len", dest='maxlength', type=int, default=100, metavar='K', - help="maximum count of characters in a word" - ) - - (options, rest) = parser.parse_args() - - return options - - -if __name__ == '__main__': - main() diff --git a/stringcheese/pyahocorasick-1.4.0/tests/memdump_check.py b/stringcheese/pyahocorasick-1.4.0/tests/memdump_check.py deleted file mode 100644 index 7bc9fbc..0000000 --- a/stringcheese/pyahocorasick-1.4.0/tests/memdump_check.py +++ /dev/null @@ -1,83 +0,0 @@ -import sys - -def main(): - try: - path = sys.argv[1] - except IndexError: - path = 'memory.dump' - - app = Application(path) - if app.run(): - sys.exit(0) - else: - sys.exit(1) - - -class Application(object): - def __init__(self, path): - self.path = path - self.memory = {} - - - def run(self): - with open(self.path, 'rt') as f: - self.analyze(f) - - self.print_leaks() - - return len(self.memory) == 0 - - - def analyze(self, file): - self.memory = {} - - for i, line in enumerate(file): - fields = line.split() - action = fields[0] - if action == 'A': - id = fields[1] - addr = fields[2] - size = int(fields[3]) - - assert addr not in self.memory - self.memory[addr] = (id, size) - - elif action == 'R': - - id = fields[1] - oldaddr = fields[2] - newaddr = fields[3] - size = int(fields[4]) - - try: - key = int(oldaddr, 16) - del self.memory[oldaddr] - except ValueError: - pass - - assert newaddr not in self.memory - self.memory[newaddr] = (id, size) - - elif action == 'F': - - addr = fields[1] - if addr in self.memory: - del self.memory[addr] - - - def print_leaks(self): - n = len(self.memory) - if n == 0: - return - - print('There are %d leaks:' % n) - tmp = [(int(id), addr, size) for addr, (id, size) in 
self.memory.items()] - tmp.sort(key=lambda item: item[0]) - - for id, addr, size in tmp: - print('#%s: %s %d' % (id, addr, size)) - - -if __name__ == '__main__': - main() - diff --git a/stringcheese/pyahocorasick-1.4.0/tests/memdump_maxalloc.py b/stringcheese/pyahocorasick-1.4.0/tests/memdump_maxalloc.py deleted file mode 100644 index 3431ef4..0000000 --- a/stringcheese/pyahocorasick-1.4.0/tests/memdump_maxalloc.py +++ /dev/null @@ -1,32 +0,0 @@ -import sys - -def main(): - try: - path = sys.argv[1] - except IndexError: - path = 'memory.dump' - - app = Application(path) - app.run() - - -class Application(object): - def __init__(self, path): - self.path = path - - - def run(self): - with open(self.path, 'rt') as f: - print(max(self.ids(f))) - - - def ids(self, file): - for i, line in enumerate(file): - fields = line.split() - if fields[0] == 'A': - yield int(fields[1]) - - -if __name__ == '__main__': - main() - diff --git a/stringcheese/pyahocorasick-1.4.0/tests/memdump_maxrealloc.py b/stringcheese/pyahocorasick-1.4.0/tests/memdump_maxrealloc.py deleted file mode 100644 index 82d2209..0000000 --- a/stringcheese/pyahocorasick-1.4.0/tests/memdump_maxrealloc.py +++ /dev/null @@ -1,32 +0,0 @@ -import sys - -def main(): - try: - path = sys.argv[1] - except IndexError: - path = 'memory.dump' - - app = Application(path) - app.run() - - -class Application(object): - def __init__(self, path): - self.path = path - - - def run(self): - with open(self.path, 'rt') as f: - print(max(self.ids(f))) - - - def ids(self, file): - for i, line in enumerate(file): - fields = line.split() - if fields[0] == 'R': - yield int(fields[1]) - - -if __name__ == '__main__': - main() - diff --git a/stringcheese/pyahocorasick-1.4.0/tests/pickle_stresstest.py b/stringcheese/pyahocorasick-1.4.0/tests/pickle_stresstest.py deleted file mode 100644 index c46367b..0000000 --- a/stringcheese/pyahocorasick-1.4.0/tests/pickle_stresstest.py +++ /dev/null @@ -1,287 +0,0 @@ -import sys -import os -import random 
-import gzip -import pickle -import optparse -import time - -import ahocorasick - -from optparse import OptionParser - - -def main(): - options = parse_args() - app = TestApplication(options) - app.run() - - -chars = 'abcdefghijklmnopqestuvwxyzABCDEFGHIJKLMNOPQESTUVWXYZ0123456789.,;:-' - -class TestApplication(object): - def __init__(self, options): - self.options = options - self.words = set() - - random.seed(options.seed) - - - def run(self): - self.A = ahocorasick.Automaton() - - if self.options.compare and (not self.options.pickle and not self.options.save): - self.generate_words() - - if self.options.pickle or self.options.save: - self.add_words() - - if self.options.pickle: - t1 = time.time() - self.pickle() - t2 = time.time() - print(" time: %0.2fs" % (t2 - t1)) - self.A.clear() - - if self.options.save: - t1 = time.time() - self.save() - t2 = time.time() - print(" time: %0.2fs" % (t2 - t1)) - self.A.clear() - - if self.options.unpickle: - t1 = time.time() - self.unpickle() - t2 = time.time() - print(" time: %0.2fs" % (t2 - t1)) - - if self.options.load: - t1 = time.time() - self.load() - t2 = time.time() - print(" time: %0.2fs" % (t2 - t1)) - - if self.options.compare: - self.compare() - - - def add_words(self): - if self.options.random: - self.__add_random_words() - else: - self.__add_from_file() - - print("Automaton statistics:") - d = self.A.get_stats() - print("- nodes_count : %d" % d['nodes_count']) - print("- words_count : %d" % d['words_count']) - print("- links_count : %d" % d['links_count']) - print("- longest_word : %d" % d['longest_word']) - print("- sizeof_node : %d" % d['sizeof_node']) - print("- total_size : %d" % d['total_size']) - - - def __add_random_words(self): - n = self.options.words - - print("Adding %d words" % n) - while n > 0: - word = self.generate_random_word() - if self.options.compare: - self.words.add(word) - - if self.A.add_word(word, True): - n -= 1 - - - def __add_from_file(self): - n = self.options.words - - print("Adding 
%d words from %s" % (n, self.options.file_gz)) - for i, word in enumerate(self.read()): - if i > n: - return - - if self.options.compare: - self.words.add(word) - - self.A.add_word(word, True) - - - def generate_words(self): - if self.options.random: - self.__generate_random_words() - else: - self.__load_words() - - - def __generate_random_words(self): - n = self.options.words - - print("Generating %d words" % n) - while len(self.words) < n: - word = self.generate_random_word() - self.words.add(word) - - - def __load_words(self): - n = self.options.words - print ("Loading %d words from %s" % (n, self.options.file_gz)) - for i, word in enumerate(self.read()): - if i < n: - self.words.add(word) - else: - return - - - def read(self): - with gzip.open(self.options.file_gz, "rt", encoding="utf-8") as f: - for line in f: - yield line.strip() - - - def pickle(self): - path = self.options.picklepath - - print("Pickling automaton in %s" % path) - - with open(path, 'wb') as f: - pickle.dump(self.A, f) - - size = os.path.getsize(path) - print(" file size is %s" % format_size(size)) - - - def unpickle(self): - path = self.options.picklepath - - print("Unpickling automaton from %s" % path) - with open(path, 'rb') as f: - self.A = pickle.load(f) - - - def save(self): - path = self.options.picklepath - - print("Saving automaton in %s" % path) - - self.A.save(path, pickle.dumps); - - size = os.path.getsize(path) - print(" file size is %s" % format_size(size)) - - - def load(self): - path = self.options.picklepath - - print("Loading automaton from %s" % path) - - self.A = ahocorasick.load(path, pickle.loads) - - - def compare(self): - print("Comparing added words with restored automaton") - - for word in self.A: - self.words.remove(word) - - if self.words: - print("Not all words were restored (%d missing)" % len(self.words)) - - - def generate_random_word(self): - n = random.randint(1, self.options.maxlength + 1) - s = '' - for i in range(n): - s += random.choice(chars) - - return 
s - - -def format_size(size): - units = [ - ('GB', 1024**3), - ('MB', 1024**2), - ('kB', 1024), - ] - - for suffix, threshold in units: - if size > threshold: - return '%0.2f %s (%d bytes)' % (float(size)/threshold, suffix, size) - - return '%d bytes' % size - - -def parse_args(): - - parser = OptionParser() - parser.add_option( - "--pickle-path", dest='picklepath', default='pickle_stresstest.pickle', - help="path used in pickling/unpickling" - ) - - parser.add_option( - "-p", "--pickle", dest='pickle', action='store_true', default=False, - help="perform pickle operation on generated/loaded words" - ) - - parser.add_option( - "-u", "--unpickle", dest='unpickle', action='store_true', default=False, - help="perform unpickle operation on previously pickled data" - ) - - parser.add_option( - "-s", "--save", dest='save', action='store_true', default=False, - help="perform save operation on generated/loaded words" - ) - - parser.add_option( - "-l", "--load", dest='load', action='store_true', default=False, - help="perform load operation on previously saved data" - ) - - - parser.add_option( - "-c", "--compare", action='store_true', default=False, - help="compare generated/loaded words with unpickled data" - ) - - parser.add_option( - "--max-words", dest='words', type=int, default=50000, metavar='N', - help="maximum number of words generated/loaded" - ) - - parser.add_option( - "--random", dest='random', action='store_true', default=False, - help="generate random words" - ) - - parser.add_option( - "--seed", dest='seed', type=int, default=0, metavar='INT', - help="random seed" - ) - - parser.add_option( - "--random-max-len", dest='maxlength', type=int, default=100, metavar='K', - help="maximum count of characters in a word" - ) - - parser.add_option( - "--file-gz", metavar='FILE', - help="load words from utf8-encoded gz file" - ) - - (options, rest) = parser.parse_args() - - if not (options.file_gz or options.random): - raise parser.error("pass --random or --file-gz 
option") - - if (options.pickle or options.unpickle) and (options.save or options.load): - raise parser.error("use separately --pickle/--unpickle and --save/--load") - - return options - - -if __name__ == '__main__': - main() diff --git a/stringcheese/pyahocorasick-1.4.0/tests/pyfault_check.py b/stringcheese/pyahocorasick-1.4.0/tests/pyfault_check.py deleted file mode 100644 index a65511b..0000000 --- a/stringcheese/pyahocorasick-1.4.0/tests/pyfault_check.py +++ /dev/null @@ -1,43 +0,0 @@ -import sys - -def main(): - path = sys.argv[1] - - app = Application(path) - app.run() - - -class Application(object): - def __init__(self, path): - self.path = path - - def run(self): - with open(self.path, 'rt') as f: - lines = [line.rstrip() for line in f if line.rstrip()] - self.analyze(lines) - - - def analyze(self, lines): - error_sep = '======================================================================' - traceback_sep = '----------------------------------------------------------------------' - - index = 0 - while True: - try: - index = lines.index(error_sep, index) - except ValueError: - break - - index += 1 - function = lines[index] - index += 1 - start = lines.index(traceback_sep, index) - end = lines.index(traceback_sep, start + 1) - index = end + 1 - - error = lines[end - 1] - print('%s: %s' % (function, error)) - - -if __name__ == '__main__': - main() diff --git a/stringcheese/pyahocorasick-1.4.0/tests/removeword_stresstest.py b/stringcheese/pyahocorasick-1.4.0/tests/removeword_stresstest.py deleted file mode 100644 index 2954889..0000000 --- a/stringcheese/pyahocorasick-1.4.0/tests/removeword_stresstest.py +++ /dev/null @@ -1,183 +0,0 @@ -import sys -import os -import random -import gzip -import pickle -import optparse - -import ahocorasick - -from optparse import OptionParser - - -def main(): - options = parse_args() - app = TestApplication(options) - app.run() - - -chars = 'abcdefghijklmnopqestuvwxyzABCDEFGHIJKLMNOPQESTUVWXYZ0123456789.,;:-' - -class 
TestApplication(object): - def __init__(self, options): - self.options = options - self.words = [] - - random.seed(options.seed) - - - def run(self): - self.A = ahocorasick.Automaton() - - self.add_words() - self.remove() - - - def add_words(self): - if self.options.random: - self.__add_random_words() - else: - self.__add_from_file() - - print("Automaton statistics:") - d = self.A.get_stats() - print("- nodes_count : %d" % d['nodes_count']) - print("- words_count : %d" % d['words_count']) - print("- links_count : %d" % d['links_count']) - print("- longest_word : %d" % d['longest_word']) - print("- sizeof_node : %d" % d['sizeof_node']) - print("- total_size : %d" % d['total_size']) - - - def remove(self): - print("Removing %d words" % len(self.words)) - random.shuffle(self.words) - for word in self.words: - self.A.remove_word(word) - - - print("Automaton statistics:") - d = self.A.get_stats() - print("- nodes_count : %d" % d['nodes_count']) - print("- words_count : %d" % d['words_count']) - print("- links_count : %d" % d['links_count']) - print("- longest_word : %d" % d['longest_word']) - print("- sizeof_node : %d" % d['sizeof_node']) - print("- total_size : %d" % d['total_size']) - - def __add_random_words(self): - n = self.options.words - - print("Adding %d words" % n) - while n > 0: - word = self.generate_random_word() - if self.A.add_word(word, True): - n -= 1 - self.words.append(word) - - - def __add_from_file(self): - n = self.options.words - - print("Adding %d words from %s" % (n, self.options.file_gz)) - for i, word in enumerate(self.read()): - if i > n: - return - - self.A.add_word(word, True) - self.words.append(word) - - - def generate_words(self): - if self.options.random: - self.__generate_random_words() - else: - self.__load_words() - - - def __generate_random_words(self): - n = self.options.words - - print("Generating %d words" % n) - while len(self.words) < n: - word = self.generate_random_word() - self.words.add(word) - - - def __load_words(self): - 
n = self.options.words - print ("Loading %d words from %s" % (n, self.options.file_gz)) - for i, word in enumerate(self.read()): - if i < n: - self.words.add(word) - else: - return - - - def read(self): - with gzip.open(self.options.file_gz, "rt", encoding="utf-8") as f: - for line in f: - yield line.strip() - - - def generate_random_word(self): - n = random.randint(1, self.options.maxlength + 1) - s = '' - for i in range(n): - s += random.choice(chars) - - return s - - -def format_size(size): - units = [ - ('GB', 1024**3), - ('MB', 1024**2), - ('kB', 1024), - ] - - for suffix, threshold in units: - if size > threshold: - return '%0.2f %s (%d bytes)' % (float(size)/threshold, suffix, size) - - return '%d bytes' % size - - -def parse_args(): - - parser = OptionParser() - parser.add_option( - "--max-words", dest='words', type=int, default=50000, metavar='N', - help="maximum number of words generated/loaded" - ) - - parser.add_option( - "--random", dest='random', action='store_true', default=False, - help="generate random words" - ) - - parser.add_option( - "--seed", dest='seed', type=int, default=0, metavar='INT', - help="random seed" - ) - - parser.add_option( - "--random-max-len", dest='maxlength', type=int, default=100, metavar='K', - help="maximum count of characters in a word" - ) - - parser.add_option( - "--file-gz", metavar='FILE', - help="load words from utf8-encoded gz file" - ) - - (options, rest) = parser.parse_args() - - if not (options.file_gz or options.random): - raise parser.error("pass --random or --file-gz option") - - return options - - -if __name__ == '__main__': - main() diff --git a/stringcheese/pyahocorasick-1.4.0/tests/unittestlog_check.py b/stringcheese/pyahocorasick-1.4.0/tests/unittestlog_check.py deleted file mode 100644 index eeaff23..0000000 --- a/stringcheese/pyahocorasick-1.4.0/tests/unittestlog_check.py +++ /dev/null @@ -1,53 +0,0 @@ -import sys - -def main(): - path = sys.argv[1] - - app = Application(path) - if app.run(): - 
sys.exit(0) - else: - sys.exit(1) - - -class Application(object): - def __init__(self, path): - self.path = path - - def run(self): - with open(self.path, 'rt') as f: - lines = [line.rstrip() for line in f if line.rstrip()] - errors = self.analyze(lines) - - return errors - - - def analyze(self, lines): - error_sep = '======================================================================' - traceback_sep = '----------------------------------------------------------------------' - - index = 0 - result = True - while True: - try: - index = lines.index(error_sep, index) - except ValueError: - break - - index += 1 - function = lines[index] - index += 1 - start = lines.index(traceback_sep, index) - end = lines.index(traceback_sep, start + 1) - index = end + 1 - - error = lines[end - 1] - if error != 'MemoryError': - print('%s: %s' % (function, error)) - result = False - - return result - - -if __name__ == '__main__': - main() diff --git a/stringcheese/pyahocorasick-1.4.0/tests/valgrind_check.py b/stringcheese/pyahocorasick-1.4.0/tests/valgrind_check.py deleted file mode 100644 index c606ad7..0000000 --- a/stringcheese/pyahocorasick-1.4.0/tests/valgrind_check.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -import sys - -def main(): - app = Application(sys.argv[1], sys.argv[2]) - if app.run(): - sys.exit(0) - else: - sys.exit(1) - - -class Application(object): - def __init__(self, srcdir, path): - self.srcdir = srcdir - self.path = path - self.sources = set() - self.dump = 0 - - - def run(self): - self.gather_sources() - - with open(self.path, 'rt') as f: - leaks = self.analyze(f) - - if not leaks: - return True - else: - print("Following references found in %s (file -> line no)" % self.path) - for name in sorted(leaks): - lines = ', '.join(map(str, leaks[name])) - print("- %s: %s" % (name, lines)) - - - def gather_sources(self): - for path in os.listdir(self.srcdir): - if path.endswith('.c'): - self.sources.add(path) - - - def analyze(self, file): - result = {} - for k, 
line in enumerate(file): - if 'by 0x' in line or 'at 0x' in line: - try: - # by 0xfffff: function (file.c:1234) - # ^^^^^^ - index = line.rindex('(') + 1 - name = line[index:] - if name.startswith('in '): - continue - - index = name.index(':') - name = name[:index] - - if self.dump: - print(name) - except ValueError: - continue - - if name in self.sources: - if name not in result: - result[name] = [] - - result[name].append(k) - - return result - - -if __name__ == '__main__': - main() - diff --git a/stringcheese/pyahocorasick-1.4.0/trie.c b/stringcheese/pyahocorasick-1.4.0/trie.c deleted file mode 100644 index 941d230..0000000 --- a/stringcheese/pyahocorasick-1.4.0/trie.c +++ /dev/null @@ -1,231 +0,0 @@ -/* - This is part of pyahocorasick Python module. - - Trie implementation - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : BSD-3-Clause (see LICENSE) -*/ - -#include "trie.h" - - -static TrieNode* -trie_add_word(Automaton* automaton, const TRIE_LETTER_TYPE* word, const size_t wordlen, bool* new_word) { - - TrieNode* node; - TrieNode* child; - unsigned i; - - if (automaton->kind == EMPTY) { - ASSERT(automaton->root == NULL); - automaton->root = trienode_new(false); - if (automaton->root == NULL) - return NULL; - } - - node = automaton->root; - - for (i=0; i < wordlen; i++) { - const TRIE_LETTER_TYPE letter = word[i]; - - child = trienode_get_next(node, letter); - if (child == NULL) { - child = trienode_new(false); - if (LIKELY(child != NULL)) { - if (UNLIKELY(trienode_set_next(node, letter, child) == NULL)) { - memory_free(child); - return NULL; - } - } else { - // Note: in case of memory error, the already allocate nodes - // are still reachable from the root and will be free - // upon automaton destruction. 
- return NULL; - } - } - - node = child; - } - - if (node->eow == false) { - node->eow = true; - *new_word = true; - automaton->count += 1; - } - else - *new_word = false; - - automaton->kind = TRIE; - - return node; -} - - -static PyObject* -trie_remove_word(Automaton* automaton, const TRIE_LETTER_TYPE* word, const size_t wordlen) { - - PyObject* object; - TrieNode* node; - TrieNode* tmp; - TrieNode* last_multiway; - unsigned last_multiway_index; - unsigned i; - - if (automaton->root == NULL) { - return NULL; - } - - node = automaton->root; - - last_multiway = node; - last_multiway_index = 0; - for (i=0; i < wordlen; i++) { - const TRIE_LETTER_TYPE letter = word[i]; - - node = trienode_get_next(node, letter); - if (node == NULL) { - return NULL; - } - - // Save the last node along path which has more children - // or is a terminating node. - if (node->n > 1 || (node->n == 1 && node->eow)) { - last_multiway = node; - last_multiway_index = i + 1; - } - } - - if (node->eow != true) { - return NULL; - } - - object = node->output.object; - - if (trienode_is_leaf(node)) { - // Remove a linear list that starts at the last_multiway node - // and ends at the last [found] one. - - // 1. Unlink the tail from the trie - node = trienode_get_next(last_multiway, word[last_multiway_index]); - ASSERT(node != NULL); - - if (UNLIKELY(trienode_unset_next_pointer(last_multiway, node) == MEMORY_ERROR)) { - PyErr_NoMemory(); - return NULL; - } - - // 2. 
Free the tail (reference to value from the last element was already saved) - for (i = last_multiway_index + 1; i < wordlen; i++) { - tmp = trienode_get_next(node, word[i]); - ASSERT(tmp->n <= 1); - trienode_free(node); - node = tmp; - } - - trienode_free(node); - - } else { - // just unmark the terminating node - node->eow = false; - } - - automaton->kind = TRIE; - return object; -} - - -static TrieNode* PURE -trie_find(TrieNode* root, const TRIE_LETTER_TYPE* word, const size_t wordlen) { - TrieNode* node; - size_t i; - - node = root; - - if (node != NULL) { - for (i=0; i < wordlen; i++) { - node = trienode_get_next(node, word[i]); - if (node == NULL) - return NULL; - } - } - - return node; -} - - -static int PURE -trie_longest(TrieNode* root, const TRIE_LETTER_TYPE* word, const size_t wordlen) { - TrieNode* node; - int len = 0; - size_t i; - - node = root; - for (i=0; i < wordlen; i++) { - node = trienode_get_next(node, word[i]); - if (node == NULL) - break; - else - len += 1; - } - - return len; -} - - -static TrieNode* PURE -ahocorasick_next(TrieNode* node, TrieNode* root, const TRIE_LETTER_TYPE letter) { - TrieNode* next = node; - TrieNode* tmp; - - while (next) { - tmp = trienode_get_next(next, letter); - if (tmp) - // found link - return tmp; - else - // or go back through fail edges - next = next->fail; - } - - // or return root node - return root; -} - -static int -trie_traverse_aux( - TrieNode* node, - const int depth, - trie_traverse_callback callback, - void *extra -) { - unsigned i; - if (callback(node, depth, extra) == 0) - return 0; - - for (i=0; i < node->n; i++) { - if (trie_traverse_aux(trienode_get_ith_unsafe(node, i), depth + 1, callback, extra) == 0) - return 0; - } - - return 1; -} - - -static void -trie_traverse( - TrieNode* root, - trie_traverse_callback callback, - void *extra -) { - ASSERT(root); - ASSERT(callback); - trie_traverse_aux(root, 0, callback, extra); -} - - -size_t PURE -trienode_get_size(const TrieNode* node) { - return 
sizeof(TrieNode) + node->n * sizeof(TrieNode*); -} diff --git a/stringcheese/pyahocorasick-1.4.0/trie.h b/stringcheese/pyahocorasick-1.4.0/trie.h deleted file mode 100644 index cc33520..0000000 --- a/stringcheese/pyahocorasick-1.4.0/trie.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - This is part of pyahocorasick Python module. - - Trie declarations - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : BSD-3-Clause (see LICENSE) -*/ - -#ifndef ahocorasick_trie_h_included -#define ahocorasick_trie_h_included - -#include "common.h" -#include "trienode.h" -#include "Automaton.h" - -/* add new word to a trie, returns last node on a path for that word */ -static TrieNode* -trie_add_word(Automaton* automaton, const TRIE_LETTER_TYPE* word, const size_t wordlen, bool* new_word); - -/* remove word from a trie, returns associated object if was any */ -static PyObject* -trie_remove_word(Automaton* automaton, const TRIE_LETTER_TYPE* word, const size_t wordlen); - -/* returns last node on a path for given word */ -static TrieNode* PURE -trie_find(TrieNode* root, const TRIE_LETTER_TYPE* word, const size_t wordlen); - -/* returns node linked by edge labeled with letter including paths going - through fail links */ -static TrieNode* PURE -ahocorasick_next(TrieNode* node, TrieNode* root, const TRIE_LETTER_TYPE letter); - -typedef int (*trie_traverse_callback)(TrieNode* node, const int depth, void* extra); - -/* traverse trie in DFS order, for each node callback is called - if callback returns false, then traversing stop */ -static void -trie_traverse( - TrieNode* root, - trie_traverse_callback callback, - void *extra -); - -/* returns total size of node and it's internal structures */ -size_t PURE -trienode_get_size(const TrieNode* node); - -#endif diff --git a/stringcheese/pyahocorasick-1.4.0/trienode.c b/stringcheese/pyahocorasick-1.4.0/trienode.c deleted file mode 100644 index b1ca7c2..0000000 --- a/stringcheese/pyahocorasick-1.4.0/trienode.c +++ 
/dev/null @@ -1,200 +0,0 @@ -/* - This is part of pyahocorasick Python module. - - Trie implementation - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : BSD-3-Clause (see LICENSE) -*/ - -#include "trienode.h" - -static TrieNode* -trienode_new(const char eow) { - TrieNode* node = (TrieNode*)memory_alloc(sizeof(TrieNode)); - if (node) { - node->output.integer = 0; - node->output.object = NULL; - node->fail = NULL; - - node->n = 0; - node->eow = eow; - node->next = NULL; - } - - return node; -} - -static void -trienode_free(TrieNode* node) { - - ASSERT(node); - - if (node->n > 0) { - memory_free(node->next); - } - - memory_free(node); -} - - -static TrieNode* PURE -trienode_get_next(TrieNode* node, const TRIE_LETTER_TYPE letter) { - - unsigned i; - Pair* next; - - ASSERT(node); - next = (Pair*)node->next; - - for (i=0; i < node->n; i++) - if (next[i].letter == letter) { - return next[i].child; - } - - return NULL; -} - - -static TristateResult -trienode_unset_next_pointer(TrieNode* node, TrieNode* child) { - - unsigned i; - unsigned index; - Pair* next; - - ASSERT(node); - for (i=0; i < node->n; i++) { - if (node->next[i].child == child) { - index = i; - goto found; - } - } - - return FALSE; - -found: - if (node->n == 1) { - // there is just one node - node->n = 0; - memory_free(node->next); - node->next = NULL; - return TRUE; - } - - // there are more nodes, reallocation is needed - - next = (Pair*)memory_alloc((node->n - 1) * sizeof(Pair)); - if (UNLIKELY(next == NULL)) { - return MEMORY_ERROR; - } - - for (i=0; i < index; i++) { - next[i] = node->next[i]; - } - - for (i=index + 1; i < node->n; i++) { - next[i - 1] = node->next[i]; - } - - memory_free(node->next); - node->next = next; - node->n -= 1; - return TRUE; -} - - -static TrieNode* PURE -trienode_get_ith_unsafe(TrieNode* node, size_t index) { - ASSERT(node); - - return node->next[index].child; -} - - -static TRIE_LETTER_TYPE PURE -trieletter_get_ith_unsafe(TrieNode* 
node, size_t index) { - ASSERT(node); - - return node->next[index].letter; -} - - -static TrieNode* -trienode_set_next(TrieNode* node, const TRIE_LETTER_TYPE letter, TrieNode* child) { - - int n; - void* next; - - ASSERT(node); - ASSERT(child); - ASSERT(trienode_get_next(node, letter) == NULL); - - n = node->n; - next = (TrieNode**)memory_realloc(node->next, (n + 1) * (sizeof(Pair))); - if (next) { - - node->next = next; - node->next[n].letter = letter; - node->next[n].child = child; - node->n += 1; - - return child; - } - else - return NULL; -} - - -#ifdef DEBUG_LAYOUT -void trienode_dump_layout() { -#define field_size(TYPE, name) sizeof(((TYPE*)NULL)->name) -#define field_ofs(TYPE, name) offsetof(TYPE, name) -#define field_dump(TYPE, name) printf("- %-12s: %d %d\n", #name, field_size(TYPE, name), field_ofs(TYPE, name)); - - printf("TrieNode (size=%lu):\n", sizeof(TrieNode)); - field_dump(TrieNode, output); - field_dump(TrieNode, fail); - field_dump(TrieNode, n); - field_dump(TrieNode, eow); - field_dump(TrieNode, next); - - printf("Pair (size=%lu):\n", sizeof(Pair)); - field_dump(Pair, letter); - field_dump(Pair, child); - -#undef field_dump -#undef field_size -#undef field_ofs -} -#endif - - -UNUSED static void -trienode_dump_to_file(TrieNode* node, FILE* f) { - unsigned i; - - ASSERT(node != NULL); - ASSERT(f != NULL); - - if (node->n == 0) - fprintf(f, "leaf "); - - fprintf(f, "node %p\n", node); - if (node->eow) - fprintf(f, "- eow [%p]\n", node->output.object); - - fprintf(f, "- fail: %p\n", node->fail); - if (node->n > 0) { - if (node->next == NULL) { - fprintf(f, "- %d next: %p\n", node->n, node->next); - } else { - fprintf(f, "- %d next: [(%d; %p)", node->n, node->next[0].letter, node->next[0].child); - for (i=1; i < node->n; i++) - fprintf(f, ", (%d; %p)", node->next[i].letter, node->next[i].child); - fprintf(f, "]\n"); - } - } -} - diff --git a/stringcheese/pyahocorasick-1.4.0/trienode.h b/stringcheese/pyahocorasick-1.4.0/trienode.h deleted file mode 
100644 index 210a13c..0000000 --- a/stringcheese/pyahocorasick-1.4.0/trienode.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - This is part of pyahocorasick Python module. - - Trie node declarations - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : BSD-3-Clause (see LICENSE) -*/ - -#ifndef ahocorasick_trienode_h_included -#define ahocorasick_trienode_h_included - -#include "common.h" - -struct TrieNode; - - -#pragma pack(push) -#pragma pack(1) -typedef struct Pair { - TRIE_LETTER_TYPE letter; ///< edge label - struct TrieNode* child; ///< next pointer -} Pair; -#pragma pack(pop) - -/* links to children nodes are stored in dynamic table */ -typedef struct TrieNode { - union { - PyObject* object; ///< valid when kind = STORE_ANY - Py_uintptr_t integer; ///< valid when kind in [STORE_LENGTH, STORE_INTS] - } output; ///< output function, valid when eow is true - struct TrieNode* fail; ///< fail node - -#if TRIE_LETTER_SIZE == 1 - uint16_t n; ///< length of next -#else - uint32_t n; ///< length of next -#endif - uint8_t eow; ///< end of word marker - Pair* next; ///< table of letters and associated next pointers -} TrieNode; - - -typedef enum { - MEMORY_ERROR, - TRUE, - FALSE -} TristateResult; - - -/* allocate new node */ -static TrieNode* -trienode_new(const char eow); - -/* free node */ -static void -trienode_free(TrieNode* node); - -/* returns child node linked by edge labelled with letter */ -static TrieNode* PURE -trienode_get_next(TrieNode* node, const TRIE_LETTER_TYPE letter); - -/* link with child node by edge labelled with letter */ -static TrieNode* -trienode_set_next(TrieNode* node, const TRIE_LETTER_TYPE letter, TrieNode* child); - -/* remove link to given children */ -static TristateResult -trienode_unset_next_pointer(TrieNode* node, TrieNode* child); - -static TrieNode* PURE -trienode_get_ith_unsafe(TrieNode* node, size_t letter); - -static TRIE_LETTER_TYPE PURE -trieletter_get_ith_unsafe(TrieNode* node, size_t letter); - 
-#define trienode_is_leaf(node) ((node)->n == 0) - -static void -trienode_dump_to_file(TrieNode* node, FILE* f); - -#define trienode_dump(node) trienode_dump_to_file(node, stdout) - -#ifdef DEBUG_LAYOUT -void trienode_dump_layout(); -#endif - -#endif diff --git a/stringcheese/pyahocorasick-1.4.0/unittests.py b/stringcheese/pyahocorasick-1.4.0/unittests.py deleted file mode 100644 index 5519b7a..0000000 --- a/stringcheese/pyahocorasick-1.4.0/unittests.py +++ /dev/null @@ -1,1596 +0,0 @@ -# -*- coding: utf-8 -*- -""" - This is part of pyahocorasick Python module. - - Unit tests for the C-based ahocorasick module. - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl/proj/pyahocorasick/ - License : public domain -""" - -import sys -import os -import unittest -import ahocorasick - -try: - import _pickle -except ImportError: - _pickle = None - - -if ahocorasick.unicode: - conv = lambda x: x -else: - if sys.version_info.major >= 3: - conv = lambda x: bytes(x, 'ascii') - else: - conv = lambda x: x - - -class TestCase(unittest.TestCase): - def __init__(self, *args): - super(TestCase, self).__init__(*args) - - if not hasattr(self, 'assertRaisesRegex'): - # fixup for Py2 - self.assertRaisesRegex = self.assertRaisesRegexp - - - def assertEmpty(self, collection): - self.assertEqual(0, len(collection)) - - - def assertNotEmpty(self, collection): - self.assertGreater(len(collection), 0) - - -class TestConstructor(TestCase): - def test_constructor_wrong_store(self): - with self.assertRaisesRegex(ValueError, "store value must be one of.*"): - ahocorasick.Automaton(-42) - - - def test_constructor_wrong_key_type(self): - with self.assertRaisesRegex(ValueError, "key_type must have value.*"): - ahocorasick.Automaton(ahocorasick.STORE_ANY, -42) - - -class TestTrieStorePyObjectsBase(TestCase): - def setUp(self): - self.A = ahocorasick.Automaton(); - self.words = "word python aho corasick \x00\x00\x00".split() - self.inexisting = "test foo bar dword".split() - - 
-class TestTrieMethods(TestTrieStorePyObjectsBase): - "Test basic methods related to trie structure" - - def test_empty(self): - A = self.A - self.assertTrue(A.kind == ahocorasick.EMPTY) - self.assertTrue(len(A) == 0) - - - def test_add_word(self): - A = self.A - self.assertTrue(A.kind == ahocorasick.EMPTY) - - n = 0 - for word in self.words: - n += 1 - A.add_word(conv(word), None) - self.assertEqual(A.kind, ahocorasick.TRIE) - self.assertEqual(len(A), n) - - # dupliacted entry - A.add_word(conv(self.words[0]), None) - self.assertTrue(A.kind == ahocorasick.TRIE) - self.assertTrue(len(A) == n) - - - def test_add_empty_word(self): - if ahocorasick.unicode: - self.assertFalse(self.A.add_word("", None)) - else: - self.assertFalse(self.A.add_word(b"", None)) - - self.assertEqual(len(self.A), 0) - self.assertEqual(self.A.kind, ahocorasick.EMPTY) - - - def test_clear(self): - A = self.A - self.assertTrue(A.kind == ahocorasick.EMPTY) - - for w in self.words: - A.add_word(conv(w), w) - - self.assertEqual(len(A), len(self.words)) - - A.clear() - self.assertEqual(A.kind, ahocorasick.EMPTY) - self.assertEqual(len(A), 0) - - - def test_exists(self): - A = self.A - - for w in self.words: - A.add_word(conv(w), w) - - for w in self.words: - self.assertTrue(A.exists(conv(w))) - - for w in self.inexisting: - self.assertFalse(A.exists(conv(w))) - - - def test_contains(self): - A = self.A - for w in self.words: - A.add_word(conv(w), w) - - for w in self.words: - self.assertTrue(conv(w) in A) - - for w in self.inexisting: - self.assertTrue(conv(w) not in A) - - - def test_match(self): - A = self.A - for word in self.words: - A.add_word(conv(word), word) - - prefixes = "w wo wor word p py pyt pyth pytho python \x00 \x00\x00 \x00\x00\x00".split() - - for word in prefixes: - self.assertTrue(A.match(conv(word))) - - inexisting = "wa apple pyTon \x00\x00\x00\x00".split() - for word in inexisting: - self.assertFalse(A.match(conv(word))) - - - def test_get1(self): - A = self.A - for i, w in 
enumerate(self.words): - A.add_word(conv(w), i + 1) - - for i, w in enumerate(self.words): - self.assertEqual(A.get(conv(w)), i + 1) - - - def test_get2(self): - A = self.A - for i, w in enumerate(self.words): - A.add_word(conv(w), i + 1) - - for w in self.inexisting: - self.assertEqual(A.get(conv(w), None), None) - - - def test_get3(self): - A = self.A - for i, w in enumerate(self.words): - A.add_word(conv(w), i + 1) - - for w in self.inexisting: - with self.assertRaises(KeyError): - A.get(conv(w)) - - - def test_get_from_an_empty_automaton(self): - A = ahocorasick.Automaton() - - r = A.get('foo', None) - self.assertEqual(r, None) - - - def test_longest_prefix(self): - A = self.A - for i, w in enumerate(self.words): - A.add_word(conv(w), i + 1) - - # there is "word" - self.assertEqual(A.longest_prefix(conv("wo")), 2) - self.assertEqual(A.longest_prefix(conv("working")), 3) - self.assertEqual(A.longest_prefix(conv("word")), 4) - self.assertEqual(A.longest_prefix(conv("wordbook")), 4) - self.assertEqual(A.longest_prefix(conv("void")), 0) - self.assertEqual(A.longest_prefix(conv("")), 0) - - - def test_stats_have_valid_structure(self): - A = self.A - for i, w in enumerate(self.words): - A.add_word(conv(w), i + 1) - - platform_dependent = None - reference = { - 'longest_word': 8, - 'total_size': platform_dependent, - 'sizeof_node': platform_dependent, - 'nodes_count': 25, - 'words_count': 5, - 'links_count': 24 - } - - s = A.get_stats() - - self.assertEqual(len(s), len(reference)) - - for key in reference: - self.assertIn(key, s) - - for key in (key for key in reference if reference[key] != platform_dependent): - self.assertEqual(reference[key], s[key]) - - - def test_stats_for_empty_tire_are_empty(self): - s = self.A.get_stats() - self.assertTrue(len(s) > 0) - for key in s: - if key != "sizeof_node": - self.assertEqual(s[key], 0) - - -class TestTrieRemoveWord(TestTrieStorePyObjectsBase): - - def test_remove_word_from_empty_trie(self): - 
self.assertFalse(self.A.remove_word("test")) - - - def test_remove_existing_word(self): - A = self.A - - words = ["he", "her", "hi", "him", "his"] - for w in words: - A.add_word(conv(w), w) - - expected_len = len(A) - for w in words: - self.assertTrue(self.A.remove_word(w)) - self.assertFalse(self.A.exists(w)) - expected_len -= 1 - self.assertEqual(expected_len, len(A)) - - - def test_remove_inexisting_word(self): - A = self.A - - words = ["he", "her", "hi", "him", "his"] - for w in words: - A.add_word(conv(w), w) - - expected_len = len(A) - for w in ["cat", "dog", "tree"]: - self.assertFalse(self.A.exists(w)) - self.assertFalse(self.A.remove_word(w)) - self.assertEqual(expected_len, len(A)) - - - def test_remove__case1(self): - words = ["k", "ki", "kit", "kitt", "kitte", "kitten" - , "kitc", "kitch", "kitche", "kitchen"] - - A = self.A - for w in words: - A.add_word(conv(w), w) - - expected_set = set(words) - for w in words: - self.assertTrue(self.A.remove_word(w)) - expected_set.discard(w) - current_set = set(A.keys()) - self.assertEqual(expected_set, current_set) - self.assertEqual(len(expected_set), len(A)) - - - def test_remove__case2(self): - words = ["k", "ki", "kit", "kitt", "kitte", "kitten" - , "kitc", "kitch", "kitche", "kitchen"] - - A = self.A - for w in words: - A.add_word(conv(w), w) - - expected_set = set(words) - for w in reversed(words): - self.assertTrue(self.A.remove_word(w)) - expected_set.discard(w) - current_set = set(A.keys()) - self.assertEqual(expected_set, current_set) - self.assertEqual(len(expected_set), len(A)) - - - def test_remove_word_changes_type_of_automaton(self): - A = self.A - - words = ["he", "her", "hi", "him", "his"] - for w in words: - A.add_word(conv(w), w) - - A.make_automaton() - self.assertEqual(ahocorasick.AHOCORASICK, A.kind) - - self.assertFalse(A.remove_word("inexisting")) - self.assertEqual(ahocorasick.AHOCORASICK, A.kind) - - self.assertTrue(A.remove_word("hi")) - self.assertEqual(ahocorasick.TRIE, A.kind) - - 
-class TestTriePop(TestTrieStorePyObjectsBase): - - def test_pop_from_empty_trie(self): - with self.assertRaises(KeyError): - self.A.pop("test") - - - def test_pop_existing_word(self): - A = self.A - - words = ["he", "her", "hi", "him", "his"] - for w in words: - A.add_word(conv(w), w) - - expected_len = len(A) - for w in words: - self.assertEqual(w, self.A.pop(w)) - self.assertFalse(self.A.exists(w)) - expected_len -= 1 - self.assertEqual(expected_len, len(A)) - - - def test_pop_inexisting_word(self): - A = self.A - - words = ["he", "her", "hi", "him", "his"] - for w in words: - A.add_word(conv(w), w) - - expected_len = len(A) - for w in ["cat", "dog", "tree"]: - with self.assertRaises(KeyError): - self.A.pop(w) - - self.assertEqual(expected_len, len(A)) - - - def test_pop__case1(self): - words = ["k", "ki", "kit", "kitt", "kitte", "kitten" - , "kitc", "kitch", "kitche", "kitchen"] - - A = self.A - for w in words: - A.add_word(conv(w), w) - - expected_set = set(words) - for w in words: - self.assertEqual(w, self.A.pop(w)) - expected_set.discard(w) - current_set = set(A.keys()) - self.assertEqual(expected_set, current_set) - self.assertEqual(len(expected_set), len(A)) - - - def test_pop__case2(self): - words = ["k", "ki", "kit", "kitt", "kitte", "kitten" - , "kitc", "kitch", "kitche", "kitchen"] - - A = self.A - for w in words: - A.add_word(conv(w), w) - - expected_set = set(words) - for w in reversed(words): - self.assertEqual(w, self.A.pop(w)) - expected_set.discard(w) - current_set = set(A.keys()) - self.assertEqual(expected_set, current_set) - self.assertEqual(len(expected_set), len(A)) - - - def test_pop_changes_type_of_automaton(self): - A = self.A - - words = ["he", "her", "hi", "him", "his"] - for w in words: - A.add_word(conv(w), w) - - A.make_automaton() - self.assertEqual(ahocorasick.AHOCORASICK, A.kind) - - with self.assertRaises(KeyError): - A.pop("inexisting") - - self.assertEqual(ahocorasick.AHOCORASICK, A.kind) - - self.assertEqual("hi", 
A.pop("hi")) - self.assertEqual(ahocorasick.TRIE, A.kind) - - -class TestTrieIterators(TestTrieStorePyObjectsBase): - "Test iterators walking over trie" - - - def test_iter(self): - A = self.A - for i, w in enumerate(self.words): - A.add_word(conv(w), i + 1) - - L = [word for word in A] - K = list(map(conv, self.words)) - self.assertEqual(len(L), len(K)) - self.assertEqual(set(L), set(K)) - - - def test_keys(self): - A = self.A - for i, w in enumerate(self.words): - A.add_word(conv(w), i + 1) - - L = [word for word in A.keys()] - K = [conv(word) for word in self.words] - self.assertEqual(len(L), len(K)) - self.assertEqual(set(L), set(K)) - - - def test_values(self): - A = self.A - for i, w in enumerate(self.words): - A.add_word(conv(w), i + 1) - - L = [x for x in A.values()] - V = list(range(1, len(self.words) + 1)) - self.assertEqual(len(L), len(V)) - self.assertEqual(set(L), set(V)) - - - def test_items(self): - A = self.A - I = [] - for i, w in enumerate(self.words): - A.add_word(conv(w), i + 1) - I.append((conv(w), i + 1)) - - L = [x for x in A.items()] - self.assertEqual(len(L), len(I)) - self.assertEqual(set(L), set(I)) - - - def test_items_with_prefix_valid(self): - A = self.A - words = "he she her hers star ham".split() - for word in words: - A.add_word(conv(word), word) - - I = list(map(conv, "he her hers".split())) - L = [x for x in A.keys(conv("he"))] - self.assertEqual(len(L), len(I)) - self.assertEqual(set(L), set(I)) - - - def test_items_with_prefix_invalid(self): - A = self.A - words = "he she her hers star ham".split() - for word in words: - A.add_word(conv(word), word) - - I = [] - L = [x for x in A.keys(conv("cat"))] - self.assertEqual(len(L), len(I)) - self.assertEqual(set(L), set(I)) - - - def test_items_with_valid_pattern(self): - A = self.A - words = "abcde aXcd aZcdef aYc Xbcdefgh".split() - for word in words: - A.add_word(conv(word), word) - - I = ["aXcd"] - L = [x for x in A.keys(conv("a?cd"), conv("?"))] - self.assertEqual(set(I), set(L)) 
- - - def test_items_with_valid_pattern2(self): - A = self.A - words = "abcde aXcde aZcdef aYc Xbcdefgh".split() - for word in words: - A.add_word(conv(word), word) - - L = [x for x in A.keys(conv("a?c??"), conv("?"), ahocorasick.MATCH_EXACT_LENGTH)] - I = ["abcde", "aXcde"] - self.assertEqual(set(I), set(L)) - - L = [x for x in A.keys(conv("a?c??"), conv("?"), ahocorasick.MATCH_AT_MOST_PREFIX)] - I = ["aYc", "abcde", "aXcde"] - self.assertEqual(set(I), set(L)) - - L = [x for x in A.keys(conv("a?c??"), conv("?"), ahocorasick.MATCH_AT_LEAST_PREFIX)] - I = ["abcde", "aXcde", "aZcdef"] - self.assertEqual(set(I), set(L)) - - - def test_items_wrong_wildcrard(self): - with self.assertRaisesRegex(ValueError, "Wildcard must be a single character.*"): - self.A.keys(conv("anything"), conv("??")) - - - def test_items_wrong_match_enum(self): - with self.assertRaisesRegex(ValueError, "The optional how third argument must be one of"): - self.A.keys(conv("anything"), conv("?"), -42) - - -class TestTrieIteratorsInvalidate(TestTrieStorePyObjectsBase): - "Test invalidating iterator when trie is changed" - - def helper(self, method): - A = self.A - for i, w in enumerate(self.words): - A.add_word(conv(w), i + 1) - - it = method() - w = next(it) - # word already exists, just change associated value - # iterator is still valid - A.add_word(conv(self.words[0]), 2) - w = next(it) - - # new word, iterator is invalidated - A.add_word(conv("should fail"), 1) - with self.assertRaises(ValueError): - w = next(it) - - - def test_keys(self): - self.helper(self.A.keys) - - - def test_values(self): - self.helper(self.A.values) - - - def test_items(self): - self.helper(self.A.items) - - -class TestAutomatonBase(TestCase): - def setUp(self): - self.A = ahocorasick.Automaton(); - self.words = "he her hers she".split() - self.string = "_sherhershe_" - self.correct_positons = [ - (3, "she"), - (3, "he"), - (4, "her"), - (6, "he"), - (7, "her"), - (8, "hers"), - (10, "she"), - (10, "he") - ] - - - def 
add_words(self): - for word in self.words: - self.A.add_word(conv(word), word) - - return self.A - - - def add_words_and_make_automaton(self): - self.add_words() - self.A.make_automaton() - return self.A - - -class TestAutomatonConstruction(TestAutomatonBase): - "Test converting trie to Aho-Corasick automaton" - - def test_make_automaton1(self): - A = self.A - self.assertEqual(A.kind, ahocorasick.EMPTY) - A.make_automaton() - # empty trie is never converted to automaton - self.assertEqual(A.kind, ahocorasick.EMPTY) - - - def test_make_automaton2(self): - A = self.A - self.assertEqual(A.kind, ahocorasick.EMPTY) - - self.add_words() - self.assertEqual(A.kind, ahocorasick.TRIE) - - A.make_automaton() - self.assertEqual(A.kind, ahocorasick.AHOCORASICK) - - - def test_make_automaton3(self): - A = self.A - self.assertEqual(A.kind, ahocorasick.EMPTY) - - self.add_words() - self.assertEqual(A.kind, ahocorasick.TRIE) - - A.make_automaton() - self.assertEqual(A.kind, ahocorasick.AHOCORASICK) - - A.add_word(conv("rollback?"), True) - self.assertEqual(A.kind, ahocorasick.TRIE) - - -class TestAutomatonSearch(TestAutomatonBase): - "Test searching using constructed automaton (method find_all)" - - def test_find_all1(self): - "no action is performed until automaton is constructed" - A = self.A - self.assertEqual(A.kind, ahocorasick.EMPTY) - - self.assertEqual(A.find_all(self.string, conv("any arg")), None) - - A.add_word(conv("word"), None) - self.assertEqual(A.kind, ahocorasick.TRIE) - self.assertEqual(A.find_all(self.string, conv("any arg")), None) - - - def test_find_all2(self): - A = self.add_words_and_make_automaton() - - L = [] - def callback(index, word): - L.append((index, word)) - - A.find_all(conv(self.string), callback) - - C = self.correct_positons - self.assertEqual(L, C) - - - def test_find_all3(self): - A = self.add_words_and_make_automaton() - - L = [] - def callback(index, word): - L.append((index, word)) - - start = 4 - end = 9 - - L = [] - 
A.find_all(conv(self.string[start:end]), callback) - C = [(pos + start, word) for pos, word in L] - - L = [] - A.find_all(conv(self.string), callback, start, end) - - self.assertEqual(L, C) - - - def test_find_all__not_a_callable_object(self): - A = self.add_words_and_make_automaton() - - with self.assertRaisesRegex(TypeError, "The callback argument must be a callable such as a function."): - A.find_all(conv(self.string), None) - - - def test_find_all__wrong_range__case_1(self): - A = self.add_words_and_make_automaton() - - L = [] - def callback(index, word): - L.append((index, word)) - - with self.assertRaisesRegex(IndexError, "end index not in range 0..12"): - A.find_all(conv(self.string), callback, 0, len(self.string) + 5) - - - def test_find_all__wrong_range__case_2(self): - A = self.add_words_and_make_automaton() - - L = [] - def callback(index, word): - L.append((index, word)) - - with self.assertRaisesRegex(IndexError, "start index not in range 0..12"): - A.find_all(conv(self.string), callback, -len(self.string) - 1, 3) - - - def test_find_all__end_index_not_given(self): - A = self.add_words_and_make_automaton() - - L = [] - def callback(index, word): - L.append((index, word)) - - A.find_all(conv(self.string), callback, 0) - - - def test_find_all__start_is_negative(self): - A = self.add_words_and_make_automaton() - - L = [] - def callback(index, word): - L.append((index, word)) - - A.find_all(conv(self.string), callback, -3, 4) - - - def test_find_all__end_is_negative(self): - A = self.add_words_and_make_automaton() - - L = [] - def callback(index, word): - L.append((index, word)) - - A.find_all(conv(self.string), callback, 0, -1) - - -class TestAutomatonIterSearch(TestAutomatonBase): - "Test searching using constructed automaton (iterator)" - - def test_iter1(self): - A = self.A - self.assertEqual(A.kind, ahocorasick.EMPTY) - with self.assertRaises(AttributeError): - A.iter(conv(self.string)) - - A.add_word(conv("word"), None) - self.assertEqual(A.kind, 
ahocorasick.TRIE) - with self.assertRaises(AttributeError): - A.iter(conv(self.string)) - - - def test_iter2(self): - A = self.add_words_and_make_automaton() - - L = [] - for index, word in A.iter(conv(self.string)): - L.append((index, word)) - - C = self.correct_positons - self.assertEqual(L, C) - - - def test_iter3(self): - A = self.add_words_and_make_automaton() - - start = 4 - end = 9 - - C = [] - for index, word in A.iter(conv(self.string[start:end])): - C.append((index + start, word)) - - L = [] - for index, word in A.iter(conv(self.string), start, end): - L.append((index, word)) - - self.assertEqual(L, C) - - - def test_iter_set(self): - A = self.add_words_and_make_automaton() - parts = "_sh erhe rshe _".split() - - expected = { - '_sh' : [], - 'erhe' : [(3, 'she'), - (3, 'he'), - (4, 'her'), - (6, 'he')], - 'rshe' : [(7, 'her'), - (8, 'hers'), - (10, 'she'), - (10, 'he')], - '_' : [] - } - - it = A.iter(conv("")) - result = {} - for part in parts: - it.set(conv(part)) - result[part] = [] - for item in it: - result[part].append(item) - - self.assertEqual(expected, result) - - def test_iter_set__with_reset(self): - A = self.add_words_and_make_automaton() - - expected = { - 'he' : [(1, 'he')], - 'she' : [(2, 'she'), (2, 'he')], - } - - it = A.iter(conv("")) - result = {} - for part in ["he", "she"]: - it.set(conv(part), True) - result[part] = [] - for item in it: - result[part].append(item) - - self.assertEqual(expected, result) - - - def test_iter_compare_with_find_all(self): - A = self.add_words_and_make_automaton() - - # results from find_all - L = [] - def callback(index, word): - L.append((index, word)) - - A.find_all(conv(self.string), callback) - - # results from iterator - C = [] - for index, word in A.iter(conv(self.string)): - C.append((index, word)) - - self.assertEqual(L, C) - - - def test_iter_wrong_argument_type(self): - A = self.add_words_and_make_automaton() - - with self.assertRaisesRegex(TypeError, "string required"): - A.iter(None) - - 
-class TestAutomatonIterSearchWithIgnoreWhiteSpace(TestAutomatonBase): - "Test searching using constructed automaton (iterator)" - - def setUp(self): - self.A = ahocorasick.Automaton() - self.words = "he her hers she".split() - self.string = "_sh e rher she_" - self.correct_positons = [ - (4, "she"), - (4, "he"), - (6, "her"), - (8, "he"), - (9, "her"), - (11, "hers"), - (13, "she"), - (13, "he") - ] - self.correct_positons_start_12 = [ - (13, "he") - ] - - - def test_iter1(self): - self.add_words_and_make_automaton() - A = self.A - self.assertEqual(A.kind, ahocorasick.AHOCORASICK) - - L = [] - for index, word in A.iter(conv(self.string), ignore_white_space=True): - L.append((index, word)) - self.assertEqual(L, self.correct_positons) - - - def test_iter2(self): - self.add_words_and_make_automaton() - A = self.A - self.assertEqual(A.kind, ahocorasick.AHOCORASICK) - - L = [] - for index, word in A.iter(conv(self.string), ignore_white_space=True, start=12): - L.append((index, word)) - self.assertEqual(L, self.correct_positons_start_12) - - - def test_wrong_keyword(self): - self.add_words_and_make_automaton() - A = self.A - self.assertEqual(A.kind, ahocorasick.AHOCORASICK) - - with self.assertRaises(TypeError): - A.iter(conv(self.string), ignore_white_space2=True) - - -class TestAutomatonIterInvalidate(TestAutomatonBase): - "Test if searching iterator is invalidated when trie/automaton change" - - def test_iter1(self): - A = self.add_words_and_make_automaton() - - it = A.iter(conv(self.string)) - w = next(it) - A.add_word(conv("should fail"), 1) - with self.assertRaises(ValueError): - w = next(it) - - - def test_iter2(self): - A = self.add_words_and_make_automaton() - - it = A.iter(conv(self.string)) - w = next(it) - A.clear() - with self.assertRaises(ValueError): - w = next(it) - - -print_dumps = False - -class TestPickle(TestAutomatonBase): - "Test pickling/unpickling" - - def test_pickle(self): - import pickle - - A = self.add_words_and_make_automaton(); - reduced = 
A.__reduce__() - self.assertEqual(len(reduced), 2) - if print_dumps: - print(pickle.dumps(A)) - - - def test_unpickle(self): - import pickle - A = self.add_words_and_make_automaton(); - dump = pickle.dumps(A) - B = pickle.loads(dump) - - self.compare_automatons(A, B) - - - def test_unicode(self): - # sample Russian words from issue #8 - import pickle - - test_sentences_rus = ["!ASM Print", - "!ASM Print, tyre компания er", - "!ASM Print, рекламно-производственная компания rr", - "!Action Pact!", - "!T.O.O.H.!", - "!YES, лингвистический центр", - "!ts, магазин", - "!ФЕСТ", - '"100-th" department store', - '"1000 мелочей"', - '"1001 мелочь"', - '"19 отряд Федеральной противопожарной службы по Ленинградской области"', - '"У Друзей"', - '"ШТОРЫ и не только..."'] - - A = ahocorasick.Automaton() - for sentences in test_sentences_rus[-7:]: - for index, word in enumerate(sentences.split(' ')): - A.add_word(word, (index, word)) - - dump = pickle.dumps(A) - B = pickle.loads(dump) - - self.compare_automatons(A, B) - - - def test_empty(self): - import pickle - - A = ahocorasick.Automaton() - dump = pickle.dumps(A) - B = pickle.loads(dump) - - self.compare_automatons(A, B) - - - def compare_automatons(self, A, B): - if print_dumps: - print([x for x in B.items()]) - print([x for x in A.items()]) - - self.assertEqual(len(A), len(B)) - - for item in zip(A.items(), B.items()): - (AK, AV), (BK, BV) = item - - self.assertEqual(AK, BK) - self.assertEqual(AV, BV) - - -class TestPickleStoreInts(TestCase): - "Test pickling/unpickling for automaton of kind STORE_INTS/STORE_LEN" - - - def add_words_and_make_automaton(self): - A = ahocorasick.Automaton(ahocorasick.STORE_INTS) - words = "tree trie bark branch barrier brag".split() - - for index, word in enumerate(words): - A.add_word(word, index) - - A.make_automaton() - - return A - - - def test_pickle_and_unpickle(self): - import pickle - A = self.add_words_and_make_automaton(); - dump = pickle.dumps(A) - B = pickle.loads(dump) - - 
self.compare_automatons(A, B) - - - def compare_automatons(self, A, B): - if print_dumps: - print([x for x in B.items()]) - print([x for x in A.items()]) - - self.assertEqual(len(A), len(B)) - - for item in zip(A.items(), B.items()): - (AK, AV), (BK, BV) = item - - self.assertEqual(AK, BK) - self.assertEqual(AV, BV) - - -class TestTrieStoreInts(TestCase): - "Test storing plain ints as values (instead of python objects)" - - def setUp(self): - self.A = ahocorasick.Automaton(ahocorasick.STORE_INTS); - self.words = "word python aho corasick \x00\x00\x00".split() - - - def test_add_word1(self): - A = self.A - - # by default next values are stored - for word in self.words: - A.add_word(conv(word)) - - I = list(range(1, len(self.words) + 1)) - L = [A.get(conv(word)) for word in self.words] - self.assertEqual(I, L) - - - def test_add_word2(self): - A = self.A - - # store arbitrary ints - for i, word in enumerate(self.words): - A.add_word(conv(word), i + 123) - - I = list(range(123, 123 + len(self.words))) - L = [A.get(conv(word)) for word in self.words] - self.assertEqual(I, L) - - - def test_add_word3(self): - # not a number - with self.assertRaises(TypeError): - self.A.add_word(conv("xyz"), None) - - - def test_iter(self): - A = self.A - for word in self.words: - A.add_word(conv(word)); - - I = set(range(1, len(A) + 1)) - L1 = [val for val in A.values()] - L2 = [val for key, val in A.items()] - - self.assertEqual(L1, L2) - self.assertEqual(set(L1), I) - - - def test_find_all_and_iter(self): - words = "he her hers she".split() - string = "_sherhershe_" - - A = self.A - for word in words: - A.add_word(conv(word)) - - A.make_automaton() - - # find_all() - C = [] - def callback(index, value): - C.append((index, value)) - - A.find_all(conv(string), callback); - - # iter() - L = [(index, value) for index, value in A.iter(conv(string))] - - # - self.assertEqual(C, L) - - -class TestTrieStoreLengths(TestCase): - """Test storing plain ints -- length of words --- as values - 
(instead of python objects)""" - - def setUp(self): - self.A = ahocorasick.Automaton(ahocorasick.STORE_LENGTH); - self.words = "word python aho corasick \x00\x00\x00".split() - - - def test_add_word1(self): - A = self.A - - # by default next values are stored - for word in self.words: - A.add_word(conv(word)) - - for key, value in A.items(): - self.assertEqual(len(key), value) - - -class TestSizeOf(TestCase): - def setUp(self): - self.A = ahocorasick.Automaton(); - words = "word python aho corasick tree bark branch root".split() - for word in words: - self.A.add_word(conv(word), 1) - - - def test_sizeof(self): - size1 = sys.getsizeof(self.A) - - # grow memory - self.A.add_word("kitten", "fluffy") - - size2 = sys.getsizeof(self.A) - - # just change the assigned value, no changes to the trie structure - self.A.add_word("word", "other value") - - size3 = sys.getsizeof(self.A) - - self.assertTrue(size2 > size1) - self.assertTrue(size3 == size2) - - -class TestBugAutomatonSearch(TestAutomatonBase): - """Bug in search""" - - def setUp(self): - self.A = ahocorasick.Automaton() - self.words = ['GT-C3303', 'SAMSUNG-GT-C3303K/'] - - - def test_bug(self): - self.add_words_and_make_automaton() - text = 'SAMSUNG-GT-C3303i/1.0 NetFront/3.5 Profile/MIDP-2.0 Configuration/CLDC-1.1' - - res = list(self.A.iter(conv(text))) - - self.assertEqual([(15, 'GT-C3303')], res) - - -class TestIntSequenceBase(TestCase): - def setUp(self): - self.A = ahocorasick.Automaton(ahocorasick.STORE_ANY, ahocorasick.KEY_SEQUENCE); - - -class TestIntSequence__TrieMethods(TestIntSequenceBase): - - def test_add__case_1(self): - A = self.A - - ret = A.add_word((1, 2, 3), "foo") - self.assertTrue(ret) - self.assertTrue(A.kind == ahocorasick.TRIE) - - self.assertEqual(len(A), 1) - self.assertTrue((1, 2, 3) in A) - - - def test_add__case_2(self): - A = self.A - - A.add_word((1, 2, 3), "foo") - ret = A.add_word((1, 2, 3), "bar") - self.assertFalse(ret) - - - def test_add__case_3(self): - A = self.A - - 
A.add_word((1, 2, 3), "foo") - A.add_word((1, 2, 3, 4, 5), "bar") - A.add_word((1, 3, 4, 5), "baz") - - self.assertEqual(len(A), 3); - self.assertEqual(A.get((1, 2, 3)), "foo"); - self.assertEqual(A.get((1, 2, 3, 4, 5)), "bar"); - self.assertEqual(A.get((1, 3, 4, 5)), "baz"); - - - def test_add__case_4(self): - A = self.A - - ret = A.add_word((), "foo") - self.assertFalse(ret) - - - def test_add__case_5__wrong_argument_type(self): - A = self.A - - with self.assertRaises(TypeError) as e: - A.add_word("hello!", "foo") - - self.assertEqual(str(e.exception), "argument is not a supported sequence type") - - - def test_add__case_6__wrong_item_type(self): - A = self.A - - with self.assertRaises(ValueError) as e: - A.add_word((1, 2, "hello!"), "foo") - - self.assertEqual(str(e.exception), "item #2 is not a number") - - - def test_add__case_7__wrong_value(self): - A = self.A - - with self.assertRaises(ValueError) as e: - A.add_word((1, -1, 12), "foo") - - errmsg = str(e.exception) - msgs = [ - "item #1: value -1 outside range [0..65535]", - "item #1: value -1 outside range [0..4294967295]", - ] - - self.assertIn(errmsg, msgs) - - - def test_add__case_8__wrong_value(self): - A = self.A - - with self.assertRaises(ValueError) as e: - A.add_word((2**42, 0, 12), "foo") - - # Depending on python's version the message might be different, - # but the type remains the same. 
- - errmsg = str(e.exception) - msgs = [ - "item #0: value 4398046511104 outside range [0..65535]", - "item #0: value 4398046511104 outside range [0..4294967295]", - "item #0 is not a number", - ] - - self.assertIn(errmsg, msgs) - - - def test_match(self): - A = self.A - - ret = A.add_word((1, 2, 3), "foo") - self.assertTrue(A.match((1,))) - self.assertTrue(A.match((1, 2))) - self.assertTrue(A.match((1, 2, 3))) - - - def test_longest_prefix(self): - A = self.A - - ret = A.add_word((1, 2, 3, 4, 5, 6), "foo") - self.assertEqual(A.longest_prefix((1, 2, 3, 111, 1111, 11111)), 3); - self.assertEqual(A.longest_prefix((111, 1111, 11111)), 0); - - def test_iter1(self): - A = self.A - - A.add_word((1, 2, 3), "foo") - A.add_word((2, 3, 4, 5), "bar") - A.add_word((2, 3, 5), "baz") - A.make_automaton() - - L = [(index, value) for index, value in A.iter((1, 2, 3, 5))] - - self.assertEqual(L, [ - (2, "foo"), - (3, "baz"), - ]) - - def test_iter2(self): - A = self.A - - A.add_word((43, 89), (43, 89)) - A.add_word((43, 89, 64), (43, 89, 64)) - A.add_word((89, 64), (89, 64)) - A.add_word((89, 100), (89, 100)) - A.make_automaton() - - L = [ - (index, value) - for index, value in - A.iter((80, 80, 43, 89, 90, 89, 64, 100, 43, 89, 100)) - ] - - self.assertEqual(L, [ - (3, (43, 89)), - (6, (89, 64)), - (9, (43, 89)), - (10, (89, 100)), - ]) - - - def test_iter_wrong_argument_type(self): - A = self.A - A.add_word((89, 100), (89, 100)) - A.make_automaton() - - with self.assertRaisesRegex(TypeError, "tuple required"): - self.A.iter(None) - - -class TestDump(TestAutomatonBase): - def test_dump_empty(self): - self.assertIsNone(self.A.dump()) - - - def test_dump_trie(self): - self.add_words() - ret = self.A.dump() - - self.assertEqual(3, len(ret)) - self.assertNotEmpty(ret[0]) # list of nodes - self.assertNotEmpty(ret[1]) # list of edges - self.assertEmpty(ret[2]) # list of fail links -- empty, if not an automaton - - - def test_dump_automaton(self): - self.add_words_and_make_automaton() - 
ret = self.A.dump() - - self.assertEqual(3, len(ret)) - self.assertNotEmpty(ret[0]) # list of nodes - self.assertNotEmpty(ret[1]) # list of edges - self.assertNotEmpty(ret[2]) # list of fail links - - -class TestIssue53(TestCase): - """ - Problems with handling of UCS-2 encoding - """ - - def test_case1(self): - # test contributed by @woakesd (David Woakes) - - a = ahocorasick.Automaton() - a.add_word('test', 'test') - - a.make_automaton() - - test_string = 'test 🙈 test?!' - - # wrongly calculated matching position - for item in a.iter(test_string): - start = item[0] - len(item[1]) + 1 - match = test_string[start:item[0] + 1] - self.assertEqual(match, "test") - - - def test_case2(self): - a = ahocorasick.Automaton() - a.add_word('test', 'test') - - a.make_automaton() - - test_string = '🙈' * 1000 - - # wrongly calculated the input's length - for item in a.iter(test_string): - pass - - -class TestIssue68(TestCase): - """ - Test problems with pickling - """ - - def test_case1(self): - if _pickle is None: - print("module _pickle not available") - return - - A = ahocorasick.Automaton() - for i in range(0, 65): - A.add_word(str(i), (i, i)) - - path = 'TestIssue68.test_case1' - with open(path, 'wb') as f: - _pickle.dump(A, f) - - with open(path, 'rb') as f: - _pickle.load(f) - - try: - os.unlink(path) - except: - pass - - -class TestLoadSave(TestAutomatonBase): - - def __init__(self, *args): - super(TestAutomatonBase, self).__init__(*args) - - if os.path.isdir("/dev/shm"): - tmp = "/dev/shm" - else: - tmp = "/tmp" - - self.path = conv(os.path.join(tmp, "test.dat")) - - - def test_save__invalid_number_of_arguments(self): - A = self.add_words_and_make_automaton(); - with self.assertRaisesRegex(ValueError, "expected exactly two arguments"): - A.save() - - - def test_save__invalid_argument_1(self): - A = self.add_words_and_make_automaton(); - with self.assertRaisesRegex(TypeError, "the first argument must be a string"): - A.save(None, None) - - - def 
test_save__invalid_argument_2(self): - A = self.add_words_and_make_automaton(); - with self.assertRaisesRegex(TypeError, "the second argument must be a callable object"): - A.save(self.path, None) - - - def test_load__invalid_number_of_arguments(self): - with self.assertRaisesRegex(ValueError, "expected exactly two arguments"): - ahocorasick.load() - - - def test_load__invalid_argument_1(self): - with self.assertRaisesRegex(TypeError, "the first argument must be a string"): - ahocorasick.load(None, None) - - - def test_load__invalid_argument_2(self): - with self.assertRaisesRegex(TypeError, "the second argument must be a callable object"): - ahocorasick.load("/dev/shm/test.dump", None) - - - def test_save(self): - import pickle - - A = self.add_words_and_make_automaton(); - - A.save(self.path, pickle.dumps) - - - def test_save_and_load_empty(self): - import pickle - - A = ahocorasick.Automaton() - - A.save(self.path, pickle.dumps) - B = ahocorasick.load(self.path, pickle.loads) - - self.compare_automatons(A, B) - - - def test_save_and_load_trie(self): - import pickle - - A = self.add_words() - - A.save(self.path, pickle.dumps) - B = ahocorasick.load(self.path, pickle.loads) - - self.compare_automatons(A, B) - - - def test_save_and_load_automaton(self): - import pickle - - A = self.add_words_and_make_automaton(); - - A.save(self.path, pickle.dumps) - B = ahocorasick.load(self.path, pickle.loads) - - self.compare_automatons(A, B) - - - def test_save_ints(self): - A = ahocorasick.Automaton(ahocorasick.STORE_INTS) - with self.assertRaisesRegex(ValueError, "expected exactly one argument"): - A.save(self.path, None) - - - def test_save_and_load_ints(self): - import pickle - - A = ahocorasick.Automaton(ahocorasick.STORE_INTS) - for i, word in enumerate(conv("he she her cat car carriage zoo")): - A.add_word(word, i) - - A.save(self.path) - B = ahocorasick.load(self.path, pickle.loads) - - self.compare_automatons(A, B) - - - def test_save_and_load_tuples(self): - import 
pickle - - A = ahocorasick.Automaton(ahocorasick.STORE_ANY) - for i, word in enumerate(conv("he she her cat car carriage zoo")): - A.add_word(word, (i, word)) - - A.save(self.path, pickle.dumps) - B = ahocorasick.load(self.path, pickle.loads) - - self.compare_automatons(A, B) - - - - def compare_automatons(self, A, B): - if print_dumps: - print([x for x in B.items()]) - print([x for x in A.items()]) - - self.assertEqual(len(A), len(B)) - - A = list(A.items()) - B = list(B.items()) - - for item in zip(A, B): - (AK, AV), (BK, BV) = item - - self.assertEqual(AK, BK) - self.assertEqual(AV, BV) - - -class TestLongIterString(TestAutomatonBase): - def test_match(self): - A = ahocorasick.Automaton(); - for word in "he here her".split(): - A.add_word(word, word) - - A.make_automaton() - - result = list(A.iter_long("he here her")) - self.assertEqual(result[0], (1, "he")) - self.assertEqual(result[1], (6, "here")) - self.assertEqual(result[2], (10, "her")) - - -class TestLongIterSequence(TestAutomatonBase): - def test_match(self): - A = ahocorasick.Automaton(ahocorasick.STORE_ANY, ahocorasick.KEY_SEQUENCE); - for word in [(1, 2), (1, 2, 3), (1, 2, 3, 4)]: - A.add_word(word, word) - - A.make_automaton() - - result = list(A.iter_long((0, 1, 2, 3, 4, 0, 0, 1, 2, 0, 1, 3, 1, 2, 3, 0))) - # ^^^^^^^^^^ ^^^^ ^^^^^^^ - # index 4 8 14 - self.assertEqual(result[0], (4, (1, 2, 3, 4))) - self.assertEqual(result[1], (8, (1, 2))) - self.assertEqual(result[2], (14, (1, 2, 3))) - - -if __name__ == '__main__': - unittest.main() - diff --git a/stringcheese/pyahocorasick-1.4.0/unpickle_test.py b/stringcheese/pyahocorasick-1.4.0/unpickle_test.py deleted file mode 100644 index 41d075d..0000000 --- a/stringcheese/pyahocorasick-1.4.0/unpickle_test.py +++ /dev/null @@ -1,457 +0,0 @@ -# -*- coding: utf-8 -*- - -import ahocorasick -import unittest -import struct -import sys - - -class TreeNodeBuilderBase(object): - def __init__(self): - self.integer = 0 - self.fail = 0 - self.n = 0 - self.eow = 0 - 
self.next = [] - - - def dump(self): - - assert self.n == len(self.next) - - next = b'' - for letter, node in self.next: - next += self.dump_edge(letter, node) - - return self.dump_node() + next - -if sys.version_info.major == 3: - - class TreeNodeBuilderPy3(TreeNodeBuilderBase): - def dump_node(self): - """ - On Debian 64-bit, GCC 7.3 - - python3: - - integer : size 8, offset 0 - fail : size 8, offset 8 - n : size 4, offset 16 - eow : size 1, offset 20 - padding : size 3 - next : size 8, offset 24 -- omitted in dump - - python2: - - integer : size 8, offset 0 - fail : size 8, offset 8 - n : size 4, offset 16 - eow : size 1, offset 20 - padding : size 1 - next : size 8, offset 22 -- omitted in dump - """ - node = struct.pack('=QQIBxxx', - self.integer, - self.fail, - self.n, - self.eow) - - assert len(node) == 24 - - return node - - - def dump_edge(self, letter, node): - assert ord(letter) < 256 - - b = struct.pack('=IQ', ord(letter), node) - assert len(b) == 12 - - return b - - - TreeNodeBuilder = TreeNodeBuilderPy3 - -elif sys.version_info.major == 2: - - class TreeNodeBuilderPy2(TreeNodeBuilderBase): - def dump_node(self): - """ - On Debian 64-bit, GCC 7.3 - - python2: - - integer : size 8, offset 0 - fail : size 8, offset 8 - n : size 4, offset 16 - eow : size 1, offset 20 - padding : size 3 - next : size 8, offset 24 -- omitted in dump - """ - node = struct.pack('QQIBxxx', - self.integer, - self.fail, - self.n, - self.eow) - - assert len(node) == 24 - - return node - - - def dump_edge(self, letter, node): - assert ord(letter) < 256 - - b = struct.pack('=HQ', ord(letter), node) - assert len(b) == 10 - - return b - - - TreeNodeBuilder = TreeNodeBuilderPy2 - - -USE_EXACT_RAW = True - - -class TestUnpickleRaw(unittest.TestCase): - - def __init__(self, *args): - super(TestUnpickleRaw, self).__init__(*args) - - if not hasattr(self, 'assertRaisesRegex'): - # fixup for Py2 - self.assertRaisesRegex = self.assertRaisesRegexp - - - # raw constructor get 7-tuple (see 
Automaton.c): - # 1. serialized nodes (as list of bytes or strings) - # 2. kind - # 3. store - # 4. key type - # 5. word count - # 6. length of the longest word - # 7. python values saved in a trie (if store == ahocorasick.STORE_ANY) - - def setUp(self): - self.count = 0 - self.raw = b'' - self.kind = ahocorasick.EMPTY - self.store = ahocorasick.STORE_ANY - self.key_type = ahocorasick.KEY_STRING - self.word_count = 0 - self.longest = 0 - self.values = [] - - - def create_automaton(self, use_exact_raw=False): - # alter values that were set in setUp - if use_exact_raw: - raw = self.raw - else: - raw = [self.create_raw_count(self.count) + self.raw] - - args = (raw, self.kind, self.store, self.key_type, - self.word_count, self.longest, self.values); - - return ahocorasick.Automaton(*args) - - - def create_node_builder(self, eow, children): - builder = TreeNodeBuilder() - builder.next = [(letter, i + 1) for letter, i in children] # starts from 1 - builder.n = len(children) - builder.eow = eow - - return builder - - - def create_raw_count(self, n): - return struct.pack('Q', n) - - - def create_raw_node(self, eow, children): - return self.create_node_builder(eow, children).dump() - - - # -------------------------------------------------- - - - def test__construct_empty(self): - - A = self.create_automaton() - - self.assertTrue(A.kind == ahocorasick.EMPTY) - self.assertTrue(len(A) == 0) - - - def test__construct_simple_trie(self): - - r""" - trie for set {he, her, his, him, it} - - #0 -> [h #1 ] -> [e #2*] -> [r #3*] - | \-> [i #4 ] -> [s #5*] - | \-> [m #6*] - | - +--> [i #7 ] -> [t #8 ] - """ - values = ["HE", "HER", "HIS", "HIM", "IT"] - - node0 = self.create_raw_node(0, [('h', 1), ('i', 7)]) - node1 = self.create_raw_node(0, [('e', 2), ('i', 4)]) - node2 = self.create_raw_node(1, [('r', 3)]) # HE - node3 = self.create_raw_node(1, []) # HER - node4 = self.create_raw_node(0, [('s', 5), ('m', 6)]) - node5 = self.create_raw_node(1, []) # HIS - node6 = 
self.create_raw_node(1, []) # HIM - node7 = self.create_raw_node(0, [('t', 8)]) - node8 = self.create_raw_node(1, []) # IT - - self.count = 9 - self.raw = node0 + node1 + node2 + node3 + node4 + node5 + node6 + node7 + node8 - self.kind = ahocorasick.TRIE - self.values = values - self.word_count = 5 - - A = self.create_automaton() - self.assertEqual(len(A), 5) - self.assertEqual(A.get("he"), "HE") - self.assertEqual(A.get("her"), "HER") - self.assertEqual(A.get("him"), "HIM") - self.assertEqual(A.get("his"), "HIS") - self.assertEqual(A.get("it"), "IT") - - - def test__construct_simple_trie__split_across_a_few_chunks(self): - - r""" - trie for set {he, her, his, him, it} - - #0 -> [h #1 ] -> [e #2*] -> [r #3*] - | \-> [i #4 ] -> [s #5*] - | \-> [m #6*] - | - +--> [i #7 ] -> [t #8 ] - """ - values = ["HE", "HER", "HIS", "HIM", "IT"] - - node0 = self.create_raw_node(0, [('h', 1), ('i', 7)]) - node1 = self.create_raw_node(0, [('e', 2), ('i', 4)]) - node2 = self.create_raw_node(1, [('r', 3)]) # HE - node3 = self.create_raw_node(1, []) # HER - node4 = self.create_raw_node(0, [('s', 5), ('m', 6)]) - node5 = self.create_raw_node(1, []) # HIS - node6 = self.create_raw_node(1, []) # HIM - node7 = self.create_raw_node(0, [('t', 8)]) - node8 = self.create_raw_node(1, []) # IT - - self.count = 9 - self.raw = [ - self.create_raw_count(2) + node0 + node1, - self.create_raw_count(3) + node2 + node3 + node4, - self.create_raw_count(1) + node5, - self.create_raw_count(3) + node6 + node7 + node8 - ] - self.kind = ahocorasick.TRIE - self.values = values - self.word_count = 5 - - A = self.create_automaton(USE_EXACT_RAW) - self.assertEqual(len(A), 5) - self.assertEqual(A.get("he"), "HE") - self.assertEqual(A.get("her"), "HER") - self.assertEqual(A.get("him"), "HIM") - self.assertEqual(A.get("his"), "HIS") - self.assertEqual(A.get("it"), "IT") - - - def test__construct_wrong_kind(self): - - self.kind = 10000 - - with self.assertRaisesRegex(ValueError, "kind value.*"): - 
self.create_automaton() - - - def test__construct_wrong_store(self): - - self.store = 10000 - - with self.assertRaisesRegex(ValueError, "store value.*"): - self.create_automaton() - - - def test__construct_wrong_key_type(self): - - self.key_type = 10000 - - with self.assertRaisesRegex(ValueError, "key_type must have.*"): - self.create_automaton() - - - def test__construct_simple_trie__wrong_index(self): - """ - trie for set {he} - - #0 -> [h #1*] -> [e #2*] - """ - - node0 = self.create_raw_node(0, [('h', 1)]) - node1 = self.create_raw_node(1, [('e', 2)]) # expect python value - node2 = self.create_raw_node(1, []) # also python value - - self.count = 3 - self.raw = node0 + node1 + node2 - self.kind = ahocorasick.TRIE - self.values = ["HE"] # but we provide a too short collection - self.word_count = 2 - - with self.assertRaises(IndexError): - self.create_automaton() - - - def test__truncated_raw__case_1(self): - - self.count = 1 # we're saying this is a non-empty trie, but given empty data - self.raw = b'' - self.kind = ahocorasick.TRIE - - with self.assertRaisesRegex(ValueError, r"Data truncated \[parsing header of node #0\].*"): - self.create_automaton() - - - def test__truncated_raw__case_2(self): - """ - trie for set {he} - - #0 -> [h #1 ] -> [e #2*] - """ - - node0 = self.create_raw_node(0, [('h', 1)]) - node1 = self.create_raw_node(0, [('e', 2)]) - node2 = self.create_raw_node(1, []) - raw = node0 + node1 + node2 - - self.count = 3 - self.kind = ahocorasick.TRIE - - for length in range(len(raw)): - self.raw = raw[:length] # truncate data and expect fail - with self.assertRaisesRegex(ValueError, "Data truncated.*"): - self.create_automaton() - - - def test__malicious_next_pointer(self): - """ - #0 -> [? 
#1 ] - """ - - node0 = self.create_raw_node(0, [('?', 1)]) - node1 = self.create_raw_node(0, [('x', 16)]) # the second node point to non-existent node - - self.count = 2 - self.raw = node0 + node1 - self.kind = ahocorasick.TRIE - - with self.assertRaisesRegex(ValueError, "Node #1 malformed: next link #0 points to.*"): - self.create_automaton() - - - def test__malicious_fail_pointer(self): - """ - trie with just one node - """ - - builder = self.create_node_builder(0, []) - builder.fail = 42 - - self.count = 1 - self.raw = builder.dump() - self.kind = ahocorasick.TRIE - - with self.assertRaisesRegex(ValueError, "Node #0 malformed: the fail link points to.*"): - self.create_automaton() - - - def test__values_leaks(self): - - # create not connected nodes, but each hold a value - good_nodes = 1000 - raw = b'' - values = [] - for i in range(good_nodes): - raw += self.create_raw_node(1, []) - values.append(tuple("node %d" % i)) - - # create the last node that will cause error -- malformed next pointer - raw += self.create_raw_node(1, [('_', 10000)]) - values.append(tuple("never reached")) - - self.count = good_nodes + 1 - self.raw = raw - self.kind = ahocorasick.TRIE - self.values = values - - with self.assertRaises(ValueError): - self.create_automaton() - - - def test__wrong_type_of_chunk_container(self): - - self.count = 9 - self.raw = () # this should be a list - self.kind = ahocorasick.TRIE - self.values = None - self.word_count = 5 - - with self.assertRaisesRegex(TypeError, "Expected list"): - A = self.create_automaton(USE_EXACT_RAW) - - - def test__wrong_type_of_chunk(self): - - self.count = 9 - self.raw = [42] # list items must be strings/bytes - self.kind = ahocorasick.TRIE - self.values = None - self.word_count = 5 - - with self.assertRaisesRegex(ValueError, "Item #0 on the bytes list is not a bytes object"): - A = self.create_automaton(USE_EXACT_RAW) - - - def test__wrong_count_of_nodes_in_chunk__case1(self): - - self.count = 9 - self.raw = [ - 
self.create_raw_count(0) # count must be greater than 0 - ] - self.kind = ahocorasick.TRIE - self.values = None - self.word_count = 5 - - with self.assertRaisesRegex(ValueError, r"Nodes count for item #0 on the bytes list is not positive \(0\)"): - A = self.create_automaton(USE_EXACT_RAW) - - - def test__wrong_count_of_nodes_in_chunk__case2(self): - - self.count = 9 - self.raw = [ - self.create_raw_count(-12 & 0xffffffffffffffff) # count must be greater than 0 - ] - self.kind = ahocorasick.TRIE - self.values = None - self.word_count = 5 - - with self.assertRaisesRegex(ValueError, r"Nodes count for item #0 on the bytes list is not positive \(-12\)"): - A = self.create_automaton(USE_EXACT_RAW) - - - -if __name__ == '__main__': - print("WARNING: these tests deal with in-memory representation (see TreeNodeBuilder),") - print(" they were meant to test low-level implementation of pickling.") - print(" Might segfault on your machine which is not necessary a bug in pyahocorasick.") - unittest.main() diff --git a/stringcheese/pyahocorasick-1.4.0/unresolved_bugs/.gitignore b/stringcheese/pyahocorasick-1.4.0/unresolved_bugs/.gitignore deleted file mode 100644 index 2211df6..0000000 --- a/stringcheese/pyahocorasick-1.4.0/unresolved_bugs/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.txt diff --git a/stringcheese/pyahocorasick-1.4.0/unresolved_bugs/bug_81.py b/stringcheese/pyahocorasick-1.4.0/unresolved_bugs/bug_81.py deleted file mode 100644 index f80c2ca..0000000 --- a/stringcheese/pyahocorasick-1.4.0/unresolved_bugs/bug_81.py +++ /dev/null @@ -1,60 +0,0 @@ -# -*- coding: utf-8 -*- -""" - Aho-Corasick string search algorithm. 
- - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -""" - -import os -import sys - -import ahocorasick - - -try: - range = xrange # for Py2 -except NameError: - pass - -def get_memory_usage(): - # Linux only - pid = os.getpid() - - lines = [] - try: - with open('/proc/%d/status' % pid, 'rt') as f: - lines = f.readlines() - except: - pass - - for line in lines: - if line.startswith('VmSize'): - return float(line.split()[1]) - - return 0 - -def test(): - with open('README.rst', 'r') as f: - data = f.read().split() - - ac = ahocorasick.Automaton() - for i, word in enumerate(data): - ac.add_word(word, i) - - ac.make_automaton() - - for i in range(1024): - s = list(ac.keys()) - - -if __name__ == '__main__': - - before = get_memory_usage() - test() - after = get_memory_usage() - - print("Memory's usage growth: %s (before = %s, after = %s)" % (after - before, before, after)) - assert(before == after) - diff --git a/stringcheese/pyahocorasick-1.4.0/update_inlinedoc.py b/stringcheese/pyahocorasick-1.4.0/update_inlinedoc.py deleted file mode 100644 index ed2c02c..0000000 --- a/stringcheese/pyahocorasick-1.4.0/update_inlinedoc.py +++ /dev/null @@ -1,151 +0,0 @@ -from pathlib import Path -import sys -import os -import textwrap -import xml.etree.ElementTree as ET - - -def main(): - dstpath = Path('src/inline_doc.h') - app = Application(dstpath) - app.run() - - -HEADER = """#pragma once -// DO NOT EDIT. File generated by script update_inlinedoc.py. 
-""" - - -class Application(object): - def __init__(self, dstpath): - self.dstpath = dstpath - - - def run(self): - content = HEADER - for path, name in self.__get_files(): - content += '\n' + self.__format_file(path, name) - - oldcontent = None - if self.dstpath.exists(): - oldcontent = self.dstpath.read_text() - - if content != oldcontent: - print("Creating %s" % self.dstpath) - self.dstpath.write_text(content) - - - def __format_file(self, path, name): - print("Parsing %s" % path) - cmd = 'rst2xml %s' % path - xml = os.popen(cmd).read() - f = Formatter(xml, name) - return f.format() - - - def __get_files(self): - rootdir = Path('docs') - for path in sorted(rootdir.glob("*.rst")): - if path.name != 'index.rst': - name = path.stem + '_doc' - yield (path, name) - - -WIDTH = 60 - - -class Formatter(object): - def __init__(self, xml_string, name): - self.xml = ET.fromstring(xml_string) - self.name = name - - - def format(self): - self.lines = [] - for node in next(self.xml.iter('document')): - if node.tag == 'title': - self.format_title(node) - elif node.tag == 'paragraph': - self.format_paragraph(node) - elif node.tag == 'bullet_list': - self.format_bullet_list(node) - elif node.tag == 'section': - break # do not add extra sections - else: - raise ValueError("tag '%s' not supported" % node.tag) - - - return self.format_c_define() - - - def format_title(self, node): - self.lines.append(node.text) - - - def format_paragraph(self, node): - self.lines.append('') - self.lines.extend(textwrap.wrap(self.normalize(node), width=WIDTH)) - - - def format_bullet_list(self, node): - for list_item in node.iter('list_item'): - for paragraph in list_item.iter('paragraph'): - - text = self.normalize(paragraph) - lines = textwrap.wrap(text, width=(WIDTH - 2)) - for i, line in enumerate(lines): - if i == 0: - prefix = '- ' - else: - prefix = ' ' - - self.lines.append(prefix + line) - - - def normalize(self, node): - t = ET.tostring(node, method='text', encoding='unicode') - t = 
t.split() - return ' '.join(t) - - - def format_c_define(self): - lines = [] - prevline = '' - - # 1. do preformatting - for line in self.lines: - line = line.rstrip() - if line == '' and prevline == '': - continue # compress multiple empty lines - - prevline = line - - line = line.replace(r'\\', r'\\\\') - line = line.replace('"', r'\"') - - lines.append(line) - - # 2. remove empty lines from the end - while lines: - if lines[-1] == '': - del lines[-1] - else: - break - - - # 3. add qutations - n = len(lines) - indent = '\t' - result = '#define %s \\\n' % self.name - for i, line in enumerate(lines): - result += indent - if i < n - 1: - result += '"%s\\n" \\\n' % line - else: - result += '"%s"\n' % line - - return result - - -if __name__ == '__main__': - main() diff --git a/stringcheese/pyahocorasick-1.4.0/utils.c b/stringcheese/pyahocorasick-1.4.0/utils.c deleted file mode 100644 index 9e846f0..0000000 --- a/stringcheese/pyahocorasick-1.4.0/utils.c +++ /dev/null @@ -1,409 +0,0 @@ -/* - This is part of pyahocorasick Python module. - - Helpers functions. - This file is included directly. 
- - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : public domain -*/ - -//#define MEMORY_DEBUG -#ifdef MEMORY_DEBUG -#ifndef MEMORY_DUMP_PATH -# define MEMORY_DUMP_PATH "memory.dump" -#endif -const char* debug_path = MEMORY_DUMP_PATH; -FILE* debug_file; -int memory_dump = 1; // dump to file -int alloc_num = 0; // id of allocation -int alloc_fail = -1; // id of allocation that will fail -int alloc_trap_on_fail = 0; // rather failing, execute trap (for gdb use) -int realloc_num = 0; // id of allocation -int realloc_fail = -1; // id of allocation that will fail -int realloc_trap_on_fail = 0; // rather failing, execute trap (for gdb use) - -static int -env_getint(const char* name, int def) { - const char* val = getenv(name); - if (val != NULL) - return atoi(val); - else - return def; -} - -static int -env_exists(const char* name) { - return (getenv(name) != NULL); -} - -static -void initialize_memory_debug(void) { - if (env_exists("ALLOC_NODUMP")) { - memory_dump = 0; - } - - alloc_fail = env_getint("ALLOC_FAIL", alloc_fail); - realloc_fail = env_getint("REALLOC_FAIL", realloc_fail); - - alloc_trap_on_fail = env_exists("ALLOC_TRAP"); - realloc_trap_on_fail = env_exists("REALLOC_TRAP"); - - if (memory_dump) { - debug_file = fopen(debug_path, "wt"); - if (debug_file == NULL) { - PyErr_WarnEx(PyExc_RuntimeWarning, "Cannot open file, logging on stderr", 1); - debug_file = stderr; - } - } -} -#endif - -void* memory_alloc(ssize_t size) { -#ifdef MEMORY_DEBUG - if (alloc_num == alloc_fail) { - if (alloc_trap_on_fail) { - __builtin_trap(); - } - - printf("DEBUG: allocation #%d failed\n", alloc_num); - alloc_num += 1; - return NULL; - } -#endif - void* res = PyMem_Malloc(size); - -#ifdef MEMORY_DEBUG - alloc_num += 1; - if (memory_dump) - fprintf(debug_file, "A %d %p %ld\n", alloc_num, res, size); -#endif - - return res; -} - - -void* memory_realloc(void* ptr, size_t size) { -#ifdef MEMORY_DEBUG - if (realloc_num == realloc_fail) { - if 
(realloc_trap_on_fail) { - __builtin_trap(); - } - - printf("DEBUG: reallocation #%d failed\n", realloc_num); - realloc_num += 1; - return NULL; - } -#endif - void* res = PyMem_Realloc(ptr, size); - -#ifdef MEMORY_DEBUG - realloc_num += 1; - if (memory_dump) { - fprintf(debug_file, "R %d %p %p %ld\n", realloc_num, ptr, res, size); - } -#endif - - return res; -} - - -void memory_free(void* ptr) { -#ifdef MEMORY_DEBUG - if (memory_dump) - fprintf(debug_file, "F %p\n", ptr); -#endif - PyMem_Free(ptr); -} - - -void memory_safefree(void* ptr) { - if (ptr != NULL) { - memory_free(ptr); - } -} - - -#if !defined(PY3K) || !defined(AHOCORASICK_UNICODE) -// define when pymod_get_string makes a copy of string -# define INPUT_KEEPS_COPY -#endif - -#if defined INPUT_KEEPS_COPY -# define maybe_free(flag, word) memory_free(word); -# define maybe_decref(flag, ref) -#elif defined PEP393_UNICODE -# define maybe_free(flag, word) if (flag) { memory_free(word); } -# define maybe_decref(flag, ref) if (ref && !flag) { Py_DECREF(ref); } -#else -# define maybe_free(flag, word) -# define maybe_decref(flag, ref) if (ref) { Py_DECREF(ref); } -#endif - -/* returns bytes or unicode internal buffer */ -static PyObject* -pymod_get_string(PyObject* obj, TRIE_LETTER_TYPE** word, ssize_t* wordlen, bool* is_copy) { - -#ifdef INPUT_KEEPS_COPY - ssize_t i; - char* bytes; -#endif - -#if defined PEP393_UNICODE - if (F(PyUnicode_Check)(obj)) { - PyUnicode_READY(obj); - if (PyUnicode_KIND(obj) == PyUnicode_4BYTE_KIND) { - *word = (TRIE_LETTER_TYPE*)(PyUnicode_4BYTE_DATA(obj)); - *wordlen = PyUnicode_GET_LENGTH(obj); - *is_copy = false; - Py_INCREF(obj); - - return obj; - } else { - *word = PyUnicode_AsUCS4Copy(obj); - *wordlen = PyUnicode_GET_LENGTH(obj); - *is_copy = true; - // No INCREF - we have our copy - return obj; - } - } - else { - PyErr_SetString(PyExc_TypeError, "string expected"); - return NULL; - } -#elif defined PY3K -# ifdef AHOCORASICK_UNICODE - if (F(PyUnicode_Check)(obj)) { - *word = 
(TRIE_LETTER_TYPE*)(PyUnicode_AS_UNICODE(obj)); - *wordlen = PyUnicode_GET_SIZE(obj); - Py_INCREF(obj); - return obj; - } - else { - PyErr_SetString(PyExc_TypeError, "string expected"); - return NULL; - } -# else -# ifndef INPUT_KEEPS_COPY -# error "defines inconsistency" -# endif - if (F(PyBytes_Check)(obj)) { - *wordlen = PyBytes_GET_SIZE(obj); - *word = (TRIE_LETTER_TYPE*)memory_alloc(*wordlen * TRIE_LETTER_SIZE); - if (*word == NULL) { - PyErr_NoMemory(); - return NULL; - } - - bytes = PyBytes_AS_STRING(obj); - for (i=0; i < *wordlen; i++) { - (*word)[i] = bytes[i]; - } - // Note: there is no INCREF - return obj; - } - else { - PyErr_SetString(PyExc_TypeError, "bytes expected"); - return NULL; - } -# endif -#else // PY_MAJOR_VERSION == 3 -# ifndef INPUT_KEEPS_COPY -# error "defines inconsistency" -# endif - if (F(PyString_Check)(obj)) { - *wordlen = PyString_GET_SIZE(obj); - *word = (TRIE_LETTER_TYPE*)memory_alloc(*wordlen * TRIE_LETTER_SIZE); - if (*word == NULL) { - PyErr_NoMemory(); - return NULL; - } - - - bytes = PyString_AS_STRING(obj); - for (i=0; i < *wordlen; i++) { - (*word)[i] = bytes[i]; - }; - - Py_INCREF(obj); - return obj; - } else { - PyErr_SetString(PyExc_TypeError, "string required"); - return NULL; - } -#endif -} - -static bool -__read_sequence__from_tuple(PyObject* obj, TRIE_LETTER_TYPE** word, ssize_t* wordlen) { - Py_ssize_t i; - Py_ssize_t size = PyTuple_GET_SIZE(obj); - TRIE_LETTER_TYPE* tmpword; - - tmpword = (TRIE_LETTER_TYPE*)memory_alloc(size * TRIE_LETTER_SIZE); - if (UNLIKELY(tmpword == NULL)) { - PyErr_NoMemory(); - return false; - } - - for (i=0; i < size; i++) { - Py_ssize_t value = F(PyNumber_AsSsize_t)(F(PyTuple_GetItem)(obj, i), PyExc_ValueError); - if (value == -1 && PyErr_Occurred()) { - PyErr_Format(PyExc_ValueError, "item #%zd is not a number", i); - memory_free(tmpword); - return false; - } - - - // TODO: both min and max values should be configured -#if TRIE_LETTER_SIZE == 4 - #define MAX_VAL 4294967295l -#else - 
#define MAX_VAL 65535ul -#endif - if (value < 0 || value > MAX_VAL) { - PyErr_Format(PyExc_ValueError, "item #%zd: value %zd outside range [%d..%lu]", i, value, 0, MAX_VAL); - memory_free(tmpword); - return false; - } - - tmpword[i] = (TRIE_LETTER_TYPE)value; - } - - *word = tmpword; - *wordlen = size; - - return true; -} - - -static bool -pymod_get_sequence(PyObject* obj, TRIE_LETTER_TYPE** word, ssize_t* wordlen) { - if (LIKELY(F(PyTuple_Check)(obj))) { - return __read_sequence__from_tuple(obj, word, wordlen); - } else { - PyErr_Format(PyExc_TypeError, "argument is not a supported sequence type"); - return false; - } -} - - -/* parse optional indexes used in few functions [start, [end]] */ -static int -pymod_parse_start_end( - PyObject* args, - int idx_start, int idx_end, - const ssize_t min, const ssize_t max, - ssize_t* Start, ssize_t* End -) { - PyObject* obj; -#define start (*Start) -#define end (*End) - - start = min; - end = max; - - // first argument - obj = F(PyTuple_GetItem)(args, idx_start); - if (obj == NULL) { - PyErr_Clear(); - return 0; - } - - obj = F(PyNumber_Index)(obj); - if (obj == NULL) - return -1; - - start = F(PyNumber_AsSsize_t)(obj, PyExc_IndexError); - Py_DECREF(obj); - if (start == -1 and PyErr_Occurred()) - return -1; - - if (start < 0) - start = max + start; - - if (start < min or start >= max) { - PyErr_Format(PyExc_IndexError, "start index not in range %zd..%zd", min, max); - return -1; - } - - // second argument - obj = F(PyTuple_GetItem)(args, idx_end); - if (obj == NULL) { - PyErr_Clear(); - return 0; - } - - obj = F(PyNumber_Index)(obj); - if (obj == NULL) - return -1; - - end = F(PyNumber_AsSsize_t)(obj, PyExc_IndexError); - Py_DECREF(obj); - if (end == -1 and PyErr_Occurred()) - return -1; - - if (end < 0) - end = max - 1 + end; - - if (end < min or end > max) { - PyErr_Format(PyExc_IndexError, "end index not in range %zd..%zd", min, max); - return -1; - } - - return 0; - -#undef start -#undef end -} - - -void 
init_input(struct Input* input) { - input->word = NULL; - input->py_word = NULL; -} - - -bool prepare_input(PyObject* self, PyObject* tuple, struct Input* input) { -#define automaton ((Automaton*)self) - if (automaton->key_type == KEY_STRING) { - input->py_word = pymod_get_string(tuple, &input->word, &input->wordlen, &input->is_copy); - if (not input->py_word) - return false; - } else { - input->is_copy = true; // we always create a copy of sequence - input->py_word = NULL; - if (not pymod_get_sequence(tuple, &input->word, &input->wordlen)) { - return false; - } - } -#undef automaton - - return true; -} - - -bool prepare_input_from_tuple(PyObject* self, PyObject* args, int index, struct Input* input) { - PyObject* tuple; - - tuple = F(PyTuple_GetItem)(args, index); - if (tuple) - return prepare_input(self, tuple, input); - else - return false; -} - - -void destroy_input(struct Input* input) { - maybe_decref(input->is_copy, input->py_word) - maybe_free(input->is_copy, input->word) -} - - -void assign_input(struct Input* dst, struct Input* src) { - - dst->wordlen = src->wordlen; - dst->word = src->word; - dst->py_word = src->py_word; // Note: there is no INCREF -} diff --git a/stringcheese/pyahocorasick-1.4.0/windows.bat b/stringcheese/pyahocorasick-1.4.0/windows.bat deleted file mode 100644 index 7d0d044..0000000 --- a/stringcheese/pyahocorasick-1.4.0/windows.bat +++ /dev/null @@ -1,43 +0,0 @@ -@echo off -@rem A python interperter must be available through PATH. - -SET PYTHONPATH=. 
- -IF [%1]==[clean] ( - del /Q stamp\*_pyW - exit /B -) - -IF NOT EXIST stamp\build_pyW ( - python setup.py build_ext --inplace - IF %ERRORLEVEL% NEQ 0 EXIT /B - type nul > stamp\build_pyW -) ELSE echo the extension was built - -IF NOT EXIST stamp\unittests_pyW ( - python unittests.py - IF %ERRORLEVEL% NEQ 0 EXIT /B - type nul > stamp\unittests_pyW -) ELSE echo unittests were run - -IF NOT EXIST stamp\regression_pyW ( - python regression/issue_5.py - IF %ERRORLEVEL% NEQ 0 EXIT /B - - python regression/issue_8.py - IF %ERRORLEVEL% NEQ 0 EXIT /B - - python regression/issue_9.py - IF %ERRORLEVEL% NEQ 0 EXIT /B - - python regression/issue_10.py - IF %ERRORLEVEL% NEQ 0 EXIT /B - - python regression/issue_26.py - IF %ERRORLEVEL% NEQ 0 EXIT /B - - python regression/issue_56.py - IF %ERRORLEVEL% NEQ 0 EXIT /B - - type nul > stamp\regression_pyW -) ELSE echo regression tests were run diff --git a/stringcheese/pyahocorasick-1.4.0/windows.h b/stringcheese/pyahocorasick-1.4.0/windows.h deleted file mode 100644 index 119cde9..0000000 --- a/stringcheese/pyahocorasick-1.4.0/windows.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - This is part of pyahocorasick Python module. 
- - Windows declarations - - Author : Wojciech Muła, wojciech_mula@poczta.onet.pl - WWW : http://0x80.pl - License : BSD-3-Clause (see LICENSE) -*/ - -#ifndef PYAHCORASICK_WINDOWS_H__ -#define PYAHCORASICK_WINDOWS_H__ - -#include "msinttypes/stdint.h" - -#define PY_OBJECT_HEAD_INIT PyVarObject_HEAD_INIT(NULL, 0) - -#endif diff --git a/stringcheese/stringcheese.py b/stringcheese/stringcheese.py index 1b88892..7fc6f7b 100644 --- a/stringcheese/stringcheese.py +++ b/stringcheese/stringcheese.py @@ -2,40 +2,40 @@ import base64 import binascii +from collections.abc import Callable import gc import sys from argparse import ArgumentParser +from typing import Any -from stringcheese.ahocorasick import * +from ahocorasick_rs import BytesAhoCorasick from tqdm import tqdm MAX_FLAG_LENGTH = 2000 -CLOSING_CHAR = b'}' +CLOSING_CHAR = b"}" def setup_parser(): - parser = ArgumentParser(description='Find flags automatically in ' - 'CTF challenges. This looks for flags ' - 'in the provided files using searches similar ' - 'to strings+grep, but works even if the flag is ' - 'transformed, e.g. encoded or xor-encrypted.', - add_help=False) + parser = ArgumentParser( + description="Find flags automatically in " + "CTF challenges. This looks for flags " + "in the provided files using searches similar " + "to strings+grep, but works even if the flag is " + "transformed, e.g. encoded or xor-encrypted.", + add_help=False, + ) - parser.add_argument('--help', '-h', action='help', help='show this help ' - 'message and exit') + parser.add_argument("--help", "-h", action="help", help="show this help " "message and exit") - parser.add_argument('pattern', type=str, help='the pattern you want to ' - 'search, e.g. FLAG{') + parser.add_argument("pattern", type=str, help="the pattern you want to " "search, e.g. 
FLAG{") - parser.add_argument('--file', '-f', type=str, help='the file in which ' - 'to search for flags, stdin by default', default='-') + parser.add_argument("--file", "-f", type=str, help="the file in which " "to search for flags, stdin by default", default="-") - parser.add_argument('--fast', help='skip the slow checks. Useful ' - 'on larger files but you may miss matches', - action='store_true') + parser.add_argument( + "--fast", help="skip the slow checks. Useful " "on larger files but you may miss matches", action="store_true" + ) - parser.add_argument('-v', '--verbose', help='increase output verbosity', - action='store_true') + parser.add_argument("-v", "--verbose", help="increase output verbosity", action="store_true") return parser @@ -59,7 +59,7 @@ def b64_decoder(match): # Add correct padding while len(trim_match) % 4: - trim_match += b'=' + trim_match += b"=" try: return base64.b64decode(trim_match) @@ -68,6 +68,7 @@ def b64_decoder(match): return None # All decodes failed + def b64_urlsafedecoder(match): try: # If padding is present, b64decode ignores leftover data and works @@ -83,7 +84,7 @@ def b64_urlsafedecoder(match): # Add correct padding while len(trim_match) % 4: - trim_match += b'=' + trim_match += b"=" try: return base64.urlsafe_b64decode(trim_match) @@ -92,6 +93,7 @@ def b64_urlsafedecoder(match): return None # All decodes failed + def b32_decoder(match): try: # If padding is present, b32decode ignores leftover data and works @@ -107,7 +109,7 @@ def b32_decoder(match): # Add correct padding while len(trim_match) % 8: - trim_match += b'=' + trim_match += b"=" try: return base64.b32decode(trim_match) @@ -143,13 +145,14 @@ def codec_decoder(match): pass match = match[:-1] return None + return codec_decoder def hex_decoder(match): # Ensure all bytes in the match are hex for i in range(len(match)): - if match[i] not in b'0123456789abcdef': + if match[i] not in b"0123456789abcdef": match = match[:i] break if len(match) % 2: @@ -159,21 +162,21 @@ 
def hex_decoder(match): def hex_bytes_decoder(match): for i in range(len(match)): - if match[i] > 0xf: + if match[i] > 0xF: match = match[:i] break if len(match) % 2: match = match[:-1] - return bytes(match[i] << 4 | match[i+1] for i in range(0, len(match), 2)) + return bytes(match[i] << 4 | match[i + 1] for i in range(0, len(match), 2)) def bitstring_to_bytes(bitstring): - return int(bitstring, 2).to_bytes(len(bitstring) // 8, byteorder='big') + return int(bitstring, 2).to_bytes(len(bitstring) // 8, byteorder="big") def binary_decoder(match): for i in range(len(match)): - if match[i] not in b'01': + if match[i] not in b"01": match = match[:i] break while len(match) % 8: @@ -188,7 +191,7 @@ def binary_bytes_decoder(match): break while len(match) % 8: match = match[:-1] - bin_converted = bytes(ord('0')+x for x in match) + bin_converted = bytes(ord("0") + x for x in match) return bitstring_to_bytes(bin_converted) @@ -207,75 +210,88 @@ def b10_ascii_decoder(match): return decoded_ascii.encode() -def build_automaton(pattern): +class Automaton: + def __init__(self): + self._automaton: BytesAhoCorasick | None = None + self._patterns = [] + + def add_word(self, pattern: bytes, meta: tuple[bytes, str, Any]): + self._patterns.append((pattern, meta)) + + def make_automaton(self): + self._automaton = BytesAhoCorasick(patterns=(pattern for pattern, _ in self._patterns)) + + def iter(self, haystack: bytes) -> list[tuple[int, Any]]: + assert self._automaton is not None + + return [ + (end_index, self._patterns[pattern_index][1]) + for pattern_index, _, end_index in self._automaton.find_matches_as_indexes(haystack) + ] + + +def build_automaton(pattern: bytes) -> Automaton: automaton = Automaton() # identity match - automaton.add_word(pattern, (pattern, 'ASCII', identity_decoder)) + automaton.add_word(pattern, (pattern, "ASCII", identity_decoder)) # base64 match - b64pattern = base64.b64encode(pattern).rstrip(b'=') + b64pattern = base64.b64encode(pattern).rstrip(b"=") if 
len(b64pattern) % 3: b64pattern = b64pattern[:-1] - automaton.add_word(b64pattern, (b64pattern, 'base64', b64_decoder)) + automaton.add_word(b64pattern, (b64pattern, "base64", b64_decoder)) # base64 Url Safe Mode match - b64pattern_urlsafe = base64.urlsafe_b64encode(pattern).rstrip(b'=') + b64pattern_urlsafe = base64.urlsafe_b64encode(pattern).rstrip(b"=") if len(b64pattern_urlsafe) % 3: b64pattern_urlsafe = b64pattern_urlsafe[:-1] - automaton.add_word(b64pattern_urlsafe, (b64pattern_urlsafe, 'base64 urlsafe mode', b64_urlsafedecoder)) + automaton.add_word(b64pattern_urlsafe, (b64pattern_urlsafe, "base64 urlsafe mode", b64_urlsafedecoder)) - b32pattern = base64.b32encode(pattern).rstrip(b'=') + b32pattern = base64.b32encode(pattern).rstrip(b"=") if len(b32pattern) % 7: b32pattern = b32pattern[:-1] - automaton.add_word(b32pattern, (b32pattern, 'base32', b32_decoder)) + automaton.add_word(b32pattern, (b32pattern, "base32", b32_decoder)) # codec match - for codec in ('utf-16', 'utf-16-be', 'utf-16-le', - 'utf-32', 'utf-32-be', 'utf-32-le'): + for codec in ("utf-16", "utf-16-be", "utf-16-le", "utf-32", "utf-32-be", "utf-32-le"): codec_pattern = pattern.decode().encode(codec) codec_decoder = codec_decoder_generator(codec) automaton.add_word(codec_pattern, (codec_pattern, codec, codec_decoder)) # base 10 ascii match - b10_ascii_pattern = ''.join(str(x) for x in pattern).encode() - automaton.add_word(b10_ascii_pattern, - (b10_ascii_pattern, 'base10_ascii', b10_ascii_decoder)) + b10_ascii_pattern = "".join(str(x) for x in pattern).encode() + automaton.add_word(b10_ascii_pattern, (b10_ascii_pattern, "base10_ascii", b10_ascii_decoder)) # xor match for xorval in range(1, 256): xor_pattern = bytes(xorval ^ x for x in pattern) xor_decoder = lambda s, xorval=xorval: bytes(xorval ^ x for x in s) - automaton.add_word(xor_pattern, - (xor_pattern, f'XOR_{xorval}', xor_decoder)) + automaton.add_word(xor_pattern, (xor_pattern, f"XOR_{xorval}", xor_decoder)) # hex match hex_pattern = 
binascii.hexlify(pattern) - automaton.add_word(hex_pattern, (hex_pattern, 'hex', hex_decoder)) + automaton.add_word(hex_pattern, (hex_pattern, "hex", hex_decoder)) # raw hex match (bytes are \x00 through \x0f) raw_hex_pattern = bytes(int(x) for x in hex_pattern) - automaton.add_word(raw_hex_pattern, - (raw_hex_pattern, 'raw_hex', hex_bytes_decoder)) + automaton.add_word(raw_hex_pattern, (raw_hex_pattern, "raw_hex", hex_bytes_decoder)) # binary match - bin_pattern = ''.join(f'{pb:08b}' for pb in pattern).encode() - automaton.add_word(bin_pattern, (bin_pattern, 'binary', binary_decoder)) + bin_pattern = "".join(f"{pb:08b}" for pb in pattern).encode() + automaton.add_word(bin_pattern, (bin_pattern, "binary", binary_decoder)) # rot13 match raw_rot13_pattern = crypt_rot13(pattern) - automaton.add_word(raw_rot13_pattern, - (raw_rot13_pattern, 'raw_rot13', crypt_rot13)) + automaton.add_word(raw_rot13_pattern, (raw_rot13_pattern, "raw_rot13", crypt_rot13)) # rot47 match raw_rot47_pattern = crypt_rot47(pattern) - automaton.add_word(raw_rot47_pattern, - (raw_rot47_pattern, 'raw_rot47', crypt_rot47)) + automaton.add_word(raw_rot47_pattern, (raw_rot47_pattern, "raw_rot47", crypt_rot47)) # raw binary match raw_bin_pattern = bytes(int(x) for x in bin_pattern) - automaton.add_word(raw_bin_pattern, - (raw_bin_pattern, 'raw_binary', binary_bytes_decoder)) + automaton.add_word(raw_bin_pattern, (raw_bin_pattern, "raw_binary", binary_bytes_decoder)) # TODO: various ciphers, etc @@ -297,36 +313,38 @@ def postprocess_match(raw_match): def generate_haystacks(base_haystack, fast): - yield base_haystack, 'stream' + yield base_haystack, "stream" nb_steps = 33 if not fast else 8 for step in range(2, nb_steps): for startpos in range(step): - yield base_haystack[startpos::step], f'stream[{startpos}::{step}]' + yield base_haystack[startpos::step], f"stream[{startpos}::{step}]" - yield base_haystack[::-1], 'reversed stream' + yield base_haystack[::-1], "reversed stream" # TODO : add local xor 
for simple crackme challs? but may be slow def extract_matches(automaton, filename, fast, verbose): - if filename == '-': - print('No filename provided, reading from stdin.') + if filename == "-": + print("No filename provided, reading from stdin.") file_contents = sys.stdin.buffer.read() else: try: - with open(filename, 'rb') as haystack_file: + with open(filename, "rb") as haystack_file: file_contents = haystack_file.read() except: - print('Error opening file.') + print("Error opening file.") sys.exit(0) if fast: - val = input("Warning, with --fast your files will be treated faster by ignoring some tests so you might miss " - "some flags. Do you wish to continue? (y/N) : ") - if val != 'y': + val = input( + "Warning, with --fast your files will be treated faster by ignoring some tests so you might miss " + "some flags. Do you wish to continue? (y/N) : " + ) + if val != "y": sys.exit(0) if len(file_contents) > 50000: val = input("This is a large file and may take a long time to be treated, do you wish to continue? (y/N) : ") - if val != 'y': + if val != "y": sys.exit(0) # TODO : decode file formats (zip, png pixels, etc) @@ -334,7 +352,7 @@ def extract_matches(automaton, filename, fast, verbose): match_found = False # Compute the number of haystacks by counting them on a fake base - n_haystacks = sum(1 for _ in generate_haystacks(b'fake haystack', fast)) + n_haystacks = sum(1 for _ in generate_haystacks(b"fake haystack", fast)) progress = tqdm(total=n_haystacks) for haystack, haystack_name in generate_haystacks(file_contents, fast): @@ -343,9 +361,8 @@ def extract_matches(automaton, filename, fast, verbose): for end_index, (pattern, enc_desc, decoder) in match_iter: match_found = True start_index = end_index - len(pattern) + 1 - raw_match = haystack[start_index:start_index+MAX_FLAG_LENGTH] - tqdm.write(f'MATCH FOUND! 
' - f'In {haystack_name}, using encoding {enc_desc}:') + raw_match = haystack[start_index : start_index + MAX_FLAG_LENGTH] + tqdm.write(f"MATCH FOUND! " f"In {haystack_name}, using encoding {enc_desc}:") if verbose: tqdm.write(binascii.hexlify(raw_match).decode()) decoded_flag = decoder(raw_match) @@ -362,7 +379,7 @@ def extract_matches(automaton, filename, fast, verbose): progress.close() if not match_found: - print('No match found.') + print("No match found.") def main(): @@ -376,5 +393,5 @@ def main(): extract_matches(automaton, filename, fast_mode, verbose_mode) -if __name__ == '__main__': +if __name__ == "__main__": main()