From c78e56626eea500d232ee32dd7787d84bd47d6e7 Mon Sep 17 00:00:00 2001
From: gotbadger
Date: Thu, 5 Mar 2026 13:51:56 +0000
Subject: [PATCH] CM-60540: remove binaryornot dep
---
cycode/cli/utils/binary_utils.py | 72 ++++++++++++++++++++++++++++++++
cycode/cli/utils/path_utils.py | 2 +-
cycode/cli/utils/string_utils.py | 3 +-
cycode/logger.py | 2 -
poetry.lock | 48 ++++++---------------
pyproject.toml | 1 -
tests/utils/test_binary_utils.py | 42 +++++++++++++++++++
7 files changed, 128 insertions(+), 42 deletions(-)
create mode 100644 cycode/cli/utils/binary_utils.py
create mode 100644 tests/utils/test_binary_utils.py
diff --git a/cycode/cli/utils/binary_utils.py b/cycode/cli/utils/binary_utils.py
new file mode 100644
index 00000000..e61b7ddc
--- /dev/null
+++ b/cycode/cli/utils/binary_utils.py
@@ -0,0 +1,72 @@
+_CONTROL_CHARS = b'\n\r\t\f\b'  # control bytes that are counted as text
+_PRINTABLE_ASCII = _CONTROL_CHARS + bytes(range(32, 127))  # delete-table: bytes treated as plain text
+_PRINTABLE_HIGH_ASCII = bytes(range(127, 256))  # delete-table: 0x7f (DEL) through 0xff
+
+# BOM signatures for encodings that legitimately contain null bytes
+_BOM_ENCODINGS = (
+ (b'\xff\xfe\x00\x00', 'utf-32-le'),  # listed before utf-16-le: this BOM starts with b'\xff\xfe'
+ (b'\x00\x00\xfe\xff', 'utf-32-be'),
+ (b'\xff\xfe', 'utf-16-le'),
+ (b'\xfe\xff', 'utf-16-be'),
+)
+
+
+def _has_bom_encoding(bytes_to_check: bytes) -> bool:
+ """Return True if the bytes start with a UTF-16/32 BOM and the whole chunk decodes as that encoding."""
+ for bom, encoding in _BOM_ENCODINGS:
+ if bytes_to_check.startswith(bom):
+ try:
+ bytes_to_check.decode(encoding)  # strict decode; only success/failure matters, the text is discarded
+ return True
+ except (UnicodeDecodeError, LookupError):
+ pass  # deliberate: not valid for this encoding, keep trying the remaining BOMs
+ return False
+
+
+def _is_decodable_as_utf8(bytes_to_check: bytes) -> bool:
+ """Return True if the bytes decode as strict UTF-8, False if decoding raises UnicodeDecodeError."""
+ try:
+ bytes_to_check.decode('utf-8')
+ return True
+ except UnicodeDecodeError:
+ return False
+
+
+def is_binary_string(bytes_to_check: bytes) -> bool:
+ """Check if a chunk of bytes appears to be binary content.
+
+ Heuristic modelled on binaryornot's is_binary_string (a simplified version of the Perl
+ detection algorithm). Empty or otherwise falsy input is reported as not binary.
+ """
+ if not bytes_to_check:
+ return False
+
+ # translate() deletes the printable-ASCII bytes, so ratio1 is the fraction of bytes that are NOT printable ASCII
+ low_chars = bytes_to_check.translate(None, _PRINTABLE_ASCII)
+ nontext_ratio1 = len(low_chars) / len(bytes_to_check)
+
+ # NOTE: translate() deletes the 0x7f-0xff bytes, so ratio2 is the fraction of bytes OUTSIDE that range
+ high_chars = bytes_to_check.translate(None, _PRINTABLE_HIGH_ASCII)
+ nontext_ratio2 = len(high_chars) / len(bytes_to_check)
+
+ is_likely_binary = (nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05) or (
+ nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8
+ )
+
+ # BOM-marked UTF-16/32 files legitimately contain null bytes.
+ # Checked before the null/0xff tests below so they aren't misdetected as binary.
+ if _has_bom_encoding(bytes_to_check):
+ return False
+
+ has_null_or_xff = b'\x00' in bytes_to_check or b'\xff' in bytes_to_check
+
+ if is_likely_binary:
+ # Only let UTF-8 rescue data that doesn't contain null (or 0xff) bytes.
+ # Null bytes are valid UTF-8 but almost never appear in real text files,
+ # whereas binary formats (e.g. .DS_Store) are full of them.
+ if has_null_or_xff:
+ return True
+ return not _is_decodable_as_utf8(bytes_to_check)
+
+ # Null bytes or 0xff in otherwise normal-looking data indicate binary
+ return bool(has_null_or_xff)
diff --git a/cycode/cli/utils/path_utils.py b/cycode/cli/utils/path_utils.py
index ce60b0da..c2d59805 100644
--- a/cycode/cli/utils/path_utils.py
+++ b/cycode/cli/utils/path_utils.py
@@ -4,9 +4,9 @@
from typing import TYPE_CHECKING, AnyStr, Optional, Union
import typer
-from binaryornot.helpers import is_binary_string
from cycode.cli.logger import logger
+from cycode.cli.utils.binary_utils import is_binary_string
if TYPE_CHECKING:
from os import PathLike
diff --git a/cycode/cli/utils/string_utils.py b/cycode/cli/utils/string_utils.py
index 06d3a51c..43931239 100644
--- a/cycode/cli/utils/string_utils.py
+++ b/cycode/cli/utils/string_utils.py
@@ -5,9 +5,8 @@
import string
from sys import getsizeof
-from binaryornot.check import is_binary_string
-
from cycode.cli.consts import SCA_SHORTCUT_DEPENDENCY_PATHS
+from cycode.cli.utils.binary_utils import is_binary_string
def obfuscate_text(text: str) -> str:
diff --git a/cycode/logger.py b/cycode/logger.py
index 2fd44e4f..c5cdebcf 100644
--- a/cycode/logger.py
+++ b/cycode/logger.py
@@ -31,8 +31,6 @@ def _set_io_encodings() -> None:
logging.getLogger('werkzeug').setLevel(logging.WARNING)
logging.getLogger('schedule').setLevel(logging.WARNING)
logging.getLogger('kubernetes').setLevel(logging.WARNING)
-logging.getLogger('binaryornot').setLevel(logging.WARNING)
-logging.getLogger('chardet').setLevel(logging.WARNING)
logging.getLogger('git.cmd').setLevel(logging.WARNING)
logging.getLogger('git.util').setLevel(logging.WARNING)
diff --git a/poetry.lock b/poetry.lock
index 9a11262a..30e77a12 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -31,7 +31,8 @@ version = "4.11.0"
description = "High-level concurrency and networking framework on top of asyncio or Trio"
optional = false
python-versions = ">=3.9"
-groups = ["main", "dev"]
+groups = ["main"]
+markers = "python_version >= \"3.10\""
files = [
{file = "anyio-4.11.0-py3-none-any.whl", hash = "sha256:0287e96f4d26d4149305414d4e3bc32f0dcd0862365a4bddea19d7a1ec38c4fc"},
{file = "anyio-4.11.0.tar.gz", hash = "sha256:82a8d0b81e318cc5ce71a5f1f8b5c4e63619620b63141ef8c995fa0db95a57c4"},
@@ -79,21 +80,6 @@ files = [
{file = "attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11"},
]
-[[package]]
-name = "binaryornot"
-version = "0.4.4"
-description = "Ultra-lightweight pure Python package to check if a file is binary or text."
-optional = false
-python-versions = "*"
-groups = ["main"]
-files = [
- {file = "binaryornot-0.4.4-py2.py3-none-any.whl", hash = "sha256:b8b71173c917bddcd2c16070412e369c3ed7f0528926f70cac18a6c97fd563e4"},
- {file = "binaryornot-0.4.4.tar.gz", hash = "sha256:359501dfc9d40632edc9fac890e19542db1a287bbcfa58175b66658392018061"},
-]
-
-[package.dependencies]
-chardet = ">=3.0.2"
-
[[package]]
name = "certifi"
version = "2025.10.5"
@@ -204,18 +190,6 @@ files = [
[package.dependencies]
pycparser = {version = "*", markers = "implementation_name != \"PyPy\""}
-[[package]]
-name = "chardet"
-version = "5.2.0"
-description = "Universal encoding detector for Python 3"
-optional = false
-python-versions = ">=3.7"
-groups = ["main"]
-files = [
- {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"},
- {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"},
-]
-
[[package]]
name = "charset-normalizer"
version = "3.4.4"
@@ -534,12 +508,12 @@ version = "1.3.0"
description = "Backport of PEP 654 (exception groups)"
optional = false
python-versions = ">=3.7"
-groups = ["main", "dev", "test"]
-markers = "python_version < \"3.11\""
+groups = ["main", "test"]
files = [
{file = "exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10"},
{file = "exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88"},
]
+markers = {main = "python_version == \"3.10\"", test = "python_version < \"3.11\""}
[package.dependencies]
typing-extensions = {version = ">=4.6.0", markers = "python_version < \"3.13\""}
@@ -663,7 +637,7 @@ version = "3.11"
description = "Internationalized Domain Names in Applications (IDNA)"
optional = false
python-versions = ">=3.8"
-groups = ["main", "dev", "test"]
+groups = ["main", "test"]
files = [
{file = "idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea"},
{file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"},
@@ -1785,7 +1759,8 @@ version = "1.3.1"
description = "Sniff out which async library your code is running under"
optional = false
python-versions = ">=3.7"
-groups = ["main", "dev"]
+groups = ["main"]
+markers = "python_version >= \"3.10\""
files = [
{file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"},
{file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
@@ -1819,7 +1794,8 @@ version = "0.49.1"
description = "The little ASGI library that shines."
optional = false
python-versions = ">=3.9"
-groups = ["main", "dev"]
+groups = ["main"]
+markers = "python_version >= \"3.10\""
files = [
{file = "starlette-0.49.1-py3-none-any.whl", hash = "sha256:d92ce9f07e4a3caa3ac13a79523bd18e3bc0042bb8ff2d759a8e7dd0e1859875"},
{file = "starlette-0.49.1.tar.gz", hash = "sha256:481a43b71e24ed8c43b11ea02f5353d77840e01480881b8cb5a26b8cae64a8cb"},
@@ -1949,12 +1925,12 @@ version = "4.15.0"
description = "Backported and Experimental Type Hints for Python 3.9+"
optional = false
python-versions = ">=3.9"
-groups = ["main", "dev", "test"]
+groups = ["main", "test"]
files = [
{file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"},
{file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"},
]
-markers = {dev = "python_version < \"3.13\"", test = "python_version < \"3.11\""}
+markers = {test = "python_version < \"3.11\""}
[[package]]
name = "typing-inspection"
@@ -2034,4 +2010,4 @@ type = ["pytest-mypy"]
[metadata]
lock-version = "2.1"
python-versions = ">=3.9"
-content-hash = "593c613fcd6438e2133d90f3777c2050738bfa42bc7f5512e43c612b784a9870"
+content-hash = "4f1987623870103055d7f6d2bc359dae11c5fc3239b0e84ff337625bf7c1088d"
diff --git a/pyproject.toml b/pyproject.toml
index cc6297c9..98de72ea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,7 +39,6 @@ pyyaml = ">=6.0,<7.0"
marshmallow = ">=3.15.0,<4.0.0"
gitpython = ">=3.1.30,<3.2.0"
arrow = ">=1.0.0,<1.4.0"
-binaryornot = ">=0.4.4,<0.5.0"
requests = ">=2.32.4,<3.0"
urllib3 = ">=2.4.0,<3.0.0"
pyjwt = ">=2.8.0,<3.0"
diff --git a/tests/utils/test_binary_utils.py b/tests/utils/test_binary_utils.py
new file mode 100644
index 00000000..c8fa7e53
--- /dev/null
+++ b/tests/utils/test_binary_utils.py
@@ -0,0 +1,42 @@
+import pytest
+
+from cycode.cli.utils.binary_utils import is_binary_string
+
+
+@pytest.mark.parametrize(
+ ('data', 'expected'),
+ [
+ # Empty / None-ish (None is outside the annotated bytes type but hits the same falsy guard)
+ (b'', False),
+ (None, False),
+ # Plain ASCII text
+ (b'Hello, world!', False),
+ (b'print("hello")\nfor i in range(10):\n pass\n', False),
+ # Whitespace-heavy text (tabs, newlines) is not binary
+ (b'\t\t\n\n\r\n some text\n', False),
+ # UTF-8 multibyte text (accented, CJK, emoji)
+ ('café résumé naïve'.encode(), False),
+ ('日本語テキスト'.encode(), False),
+ ('🎉🚀💻'.encode(), False),
+ # BOM-marked UTF-16/32 text is not binary
+ ('\ufeffHello UTF-16'.encode('utf-16-le'), False),
+ ('\ufeffHello UTF-16'.encode('utf-16-be'), False),
+ ('\ufeffHello UTF-32'.encode('utf-32-le'), False),
+ ('\ufeffHello UTF-32'.encode('utf-32-be'), False),
+ # Null bytes → binary
+ (b'\x00', True),
+ (b'hello\x00world', True),
+ (b'\x00\x01\x02\x03', True),
+ # 0xff in otherwise normal data → binary
+ (b'hello\xffworld', True),
+ # Mostly control chars + invalid UTF-8 → binary
+ (b'\x01\x02\x03\x04\x05\x06\x07\x0e\x0f\x10' * 10 + b'\x80', True),
+ # Real binary format headers
+ (b'\x89PNG\r\n\x1a\n' + b'\x00' * 100, True),
+ (b'\x7fELF' + b'\x00' * 100, True),
+ # DS_Store-like: null-byte-heavy valid UTF-8 → still binary
+ (b'\x00\x00\x00\x01Bud1' + b'\x00' * 100, True),
+ ],
+)
+def test_is_binary_string(data: bytes, expected: bool) -> None:
+ assert is_binary_string(data) is expected  # `is` also requires an actual bool, not just a truthy/falsy value