"""Heuristic text-vs-binary detection for raw byte chunks.

New module added by the "CM-60540: remove binaryornot dep" patch as an
in-repo replacement for ``binaryornot``'s ``is_binary_string``; it is
imported by ``cycode/cli/utils/path_utils.py`` and
``cycode/cli/utils/string_utils.py``.
"""

import codecs
from typing import Optional

# Control bytes that routinely appear in text (newline, CR, tab, FF, BS).
_CONTROL_CHARS = b'\n\r\t\f\b'
# Bytes counted as plain text: the control bytes above plus 0x20-0x7E.
_PRINTABLE_ASCII = _CONTROL_CHARS + bytes(range(32, 127))
# Every byte from 0x7F through 0xFF.
_PRINTABLE_HIGH_ASCII = bytes(range(127, 256))

# BOM signatures for encodings that legitimately contain null bytes.
# The 4-byte UTF-32 marks are listed first because the UTF-32-LE BOM starts
# with the same two bytes as the UTF-16-LE BOM.
_BOM_ENCODINGS = (
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
)


def _decodes_as(bytes_to_check: bytes, encoding: str) -> bool:
    """Return True if the bytes decode strictly as ``encoding``.

    The input is treated as a chunk that may stop in the middle of a
    character: an incomplete multi-byte sequence at the very end is
    tolerated (``final=False``), while any invalid sequence still fails.
    """
    try:
        decoder = codecs.getincrementaldecoder(encoding)(errors='strict')
        decoder.decode(bytes_to_check, final=False)
    except (UnicodeDecodeError, LookupError):
        return False
    return True


def _has_bom_encoding(bytes_to_check: bytes) -> bool:
    """Check if bytes start with a BOM and can be decoded as that encoding."""
    return any(
        bytes_to_check.startswith(bom) and _decodes_as(bytes_to_check, encoding)
        for bom, encoding in _BOM_ENCODINGS
    )


def _is_decodable_as_utf8(bytes_to_check: bytes) -> bool:
    """Try to decode bytes as UTF-8 (a truncated trailing character is allowed)."""
    return _decodes_as(bytes_to_check, 'utf-8')


def is_binary_string(bytes_to_check: Optional[bytes]) -> bool:
    """Check if a chunk of bytes appears to be binary content.

    Uses a simplified version of the Perl detection algorithm, matching
    the structure of binaryornot's is_binary_string.

    Args:
        bytes_to_check: The chunk to inspect. ``None`` and ``b''`` are
            treated as "not binary".

    Returns:
        True if the chunk looks binary, False if it looks like text.
    """
    if not bytes_to_check:
        return False

    # BOM-marked UTF-16/32 text legitimately contains null bytes, so it must
    # be recognised before any null-byte or ratio heuristic runs.
    if _has_bom_encoding(bytes_to_check):
        return False

    # Null bytes are valid UTF-8 but almost never appear in real text files,
    # whereas binary formats (e.g. .DS_Store) are full of them; 0xff can never
    # occur in valid UTF-8. Either one marks the chunk as binary outright.
    if b'\x00' in bytes_to_check or b'\xff' in bytes_to_check:
        return True

    total = len(bytes_to_check)
    # Share of bytes that are NOT printable ASCII (control bytes and >= 0x7F).
    non_printable_ratio = len(bytes_to_check.translate(None, _PRINTABLE_ASCII)) / total
    # Share of bytes BELOW 0x7F (translate deletes the 0x7F-0xFF range).
    below_0x7f_ratio = len(bytes_to_check.translate(None, _PRINTABLE_HIGH_ASCII)) / total

    # Suspicious when the chunk is >30% non-printable and almost entirely
    # high bytes, or is >80% non-printable while >80% of it sits below 0x7F
    # (i.e. it is dominated by ASCII control characters).
    is_likely_binary = (non_printable_ratio > 0.3 and below_0x7f_ratio < 0.05) or (
        non_printable_ratio > 0.8 and below_0x7f_ratio > 0.8
    )

    # Valid UTF-8 rescues suspicious-looking data (e.g. CJK or emoji text).
    return is_likely_binary and not _is_decodable_as_utf8(bytes_to_check)
["main"] +markers = "python_version >= \"3.10\"" files = [ {file = "anyio-4.11.0-py3-none-any.whl", hash = "sha256:0287e96f4d26d4149305414d4e3bc32f0dcd0862365a4bddea19d7a1ec38c4fc"}, {file = "anyio-4.11.0.tar.gz", hash = "sha256:82a8d0b81e318cc5ce71a5f1f8b5c4e63619620b63141ef8c995fa0db95a57c4"}, @@ -79,21 +80,6 @@ files = [ {file = "attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11"}, ] -[[package]] -name = "binaryornot" -version = "0.4.4" -description = "Ultra-lightweight pure Python package to check if a file is binary or text." -optional = false -python-versions = "*" -groups = ["main"] -files = [ - {file = "binaryornot-0.4.4-py2.py3-none-any.whl", hash = "sha256:b8b71173c917bddcd2c16070412e369c3ed7f0528926f70cac18a6c97fd563e4"}, - {file = "binaryornot-0.4.4.tar.gz", hash = "sha256:359501dfc9d40632edc9fac890e19542db1a287bbcfa58175b66658392018061"}, -] - -[package.dependencies] -chardet = ">=3.0.2" - [[package]] name = "certifi" version = "2025.10.5" @@ -204,18 +190,6 @@ files = [ [package.dependencies] pycparser = {version = "*", markers = "implementation_name != \"PyPy\""} -[[package]] -name = "chardet" -version = "5.2.0" -description = "Universal encoding detector for Python 3" -optional = false -python-versions = ">=3.7" -groups = ["main"] -files = [ - {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, - {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, -] - [[package]] name = "charset-normalizer" version = "3.4.4" @@ -534,12 +508,12 @@ version = "1.3.0" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" -groups = ["main", "dev", "test"] -markers = "python_version < \"3.11\"" +groups = ["main", "test"] files = [ {file = "exceptiongroup-1.3.0-py3-none-any.whl", hash = 
"sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10"}, {file = "exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88"}, ] +markers = {main = "python_version == \"3.10\"", test = "python_version < \"3.11\""} [package.dependencies] typing-extensions = {version = ">=4.6.0", markers = "python_version < \"3.13\""} @@ -663,7 +637,7 @@ version = "3.11" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.8" -groups = ["main", "dev", "test"] +groups = ["main", "test"] files = [ {file = "idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea"}, {file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"}, @@ -1785,7 +1759,8 @@ version = "1.3.1" description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" -groups = ["main", "dev"] +groups = ["main"] +markers = "python_version >= \"3.10\"" files = [ {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, @@ -1819,7 +1794,8 @@ version = "0.49.1" description = "The little ASGI library that shines." 
import pytest

from cycode.cli.utils.binary_utils import is_binary_string

# Inputs the detector must classify as text.
_TEXT_SAMPLES = (
    # Empty / None-ish
    b'',
    None,
    # Plain ASCII text
    b'Hello, world!',
    b'print("hello")\nfor i in range(10):\n pass\n',
    # Whitespace-heavy text (tabs, newlines) is not binary
    b'\t\t\n\n\r\n some text\n',
    # UTF-8 multibyte text (accented, CJK, emoji)
    'café résumé naïve'.encode(),
    '日本語テキスト'.encode(),
    '🎉🚀💻'.encode(),
    # BOM-marked UTF-16/32 text is not binary
    '\ufeffHello UTF-16'.encode('utf-16-le'),
    '\ufeffHello UTF-16'.encode('utf-16-be'),
    '\ufeffHello UTF-32'.encode('utf-32-le'),
    '\ufeffHello UTF-32'.encode('utf-32-be'),
)

# Inputs the detector must classify as binary.
_BINARY_SAMPLES = (
    # Null bytes
    b'\x00',
    b'hello\x00world',
    b'\x00\x01\x02\x03',
    # 0xff in otherwise normal data
    b'hello\xffworld',
    # Mostly control chars + invalid UTF-8
    b'\x01\x02\x03\x04\x05\x06\x07\x0e\x0f\x10' * 10 + b'\x80',
    # Real binary format headers
    b'\x89PNG\r\n\x1a\n' + b'\x00' * 100,
    b'\x7fELF' + b'\x00' * 100,
    # DS_Store-like: null-byte-heavy valid UTF-8 is still binary
    b'\x00\x00\x00\x01Bud1' + b'\x00' * 100,
)


@pytest.mark.parametrize(
    ('data', 'expected'),
    [(sample, False) for sample in _TEXT_SAMPLES] + [(sample, True) for sample in _BINARY_SAMPLES],
)
def test_is_binary_string(data: bytes, expected: bool) -> None:
    """Each sample must be classified exactly as text (False) or binary (True)."""
    outcome = is_binary_string(data)
    assert outcome is expected