Optimized utils.ngrams_counts

dhondta · dhondta · commit 67d4ca5c8615 · 2026-02-15T23:53:58.000+01:00
diff --git a/.coveragerc b/.coveragerc
@@ -21,3 +21,4 @@ exclude_lines =
     if self.type not in ["ELF", "MachO", "PE"]:
     if j <= i or start2 is None:
     if s != self.__size:
+    glob['_IMP'] = True
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -19,7 +19,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest]
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
     steps:
     - uses: actions/checkout@v3
     - name: Set up Python ${{ matrix.python-version }}
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,7 +22,7 @@ authors = [
 description = "Library for plotting executable samples supporting multiple formats"
 license = {file = "LICENSE"}
 keywords = ["python", "development", "programming", "executable-samples", "plot", "entropy", "cfg"]
-requires-python = ">=3.9,<4"
+requires-python = ">=3.10,<4"
 classifiers = [
   "Development Status :: 5 - Production/Stable",
   "Environment :: Console",
diff --git a/src/exeplot/VERSION.txt b/src/exeplot/VERSION.txt
@@ -1 +1 @@
-0.4.3
+0.5.2
diff --git a/src/exeplot/__conf__.py b/src/exeplot/__conf__.py
@@ -29,7 +29,7 @@
 numpy.int = numpy.int_  # dirty fix to "AttributeError: module 'numpy' has no attribute 'int'."
 
 
-def check_imports(*names):
+def check_imports(*names) -> None:
     import warnings
     from inspect import currentframe
     glob = currentframe().f_back.f_globals
@@ -42,7 +42,7 @@ def check_imports(*names):
             glob['_IMP'] = False
 
 
-def configure():  # pragma: no cover
+def configure() -> None:  # pragma: no cover
     from configparser import ConfigParser
     from os.path import exists, expanduser
     path = expanduser("~/.exeplot.conf")
@@ -58,7 +58,7 @@ def configure():  # pragma: no cover
     plt.rcParams['font.family'] = config['font_family']
 
 
-def configure_fonts(**kw):
+def configure_fonts(**kw) -> dict:
     import matplotlib
     matplotlib.rc('font', **{k.split("_")[1]: kw.pop(k, config[k]) for k in ['font_family', 'font_size']})
     kw['title-font'] = {'fontfamily': kw.pop('title_font_family', config['font_family']),
diff --git a/src/exeplot/__main__.py b/src/exeplot/__main__.py
@@ -1,19 +1,20 @@
 # -*- coding: UTF-8 -*-
+from argparse import ArgumentParser, Namespace, RawTextHelpFormatter
+
 from .__info__ import __author__, __copyright__, __email__, __license__, __source__, __version__
 from .__init__ import *
 from .__init__ import __all__ as _plots
 
 
-def _parser(name, description, examples):
-    from argparse import ArgumentParser, RawTextHelpFormatter
+def _parser(name: str, description: str, examples: list[str]) -> ArgumentParser:
     descr = f"{name} {__version__}\n\nAuthor   : {__author__} ({__email__})\nCopyright: {__copyright__}\nLicense  :" \
             f" {__license__}\nSource   : {__source__}\n\n{description}.\n\n"
     examples = [f"exeplot {e}" if not e.startswith("exeplot ") else e for e in examples]
     return ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False,
                           epilog="usage examples:\n  " + "\n  ".join(examples) if len(examples) > 0 else None)
 
 
-def _setup(parser):  # pragma: no cover
+def _setup(parser: ArgumentParser) -> Namespace:  # pragma: no cover
     args = parser.parse_args()
     if hasattr(args, "verbose"):
         import logging
@@ -22,7 +23,7 @@ def _setup(parser):  # pragma: no cover
     return args
 
 
-def main():
+def main() -> None:  # pragma: no cover
     from os import makedirs
     parser = _parser("Exeplot", "This tool allows to plot executable sample(s) in different ways",
                      ["byte binary.exe", "entropy binary1.exe binary2.exe --scale"])
diff --git a/src/exeplot/utils.py b/src/exeplot/utils.py
@@ -1,13 +1,15 @@
 # -*- coding: UTF-8 -*-
+import numpy as np
 from math import log2
+from typing import Optional
 
 
 __all__ = ["ensure_str", "human_readable_size", "ngrams_counts", "ngrams_distribution", "shannon_entropy"]
 
 shannon_entropy = lambda b: -sum([p*log2(p) for p in [float(ctr)/len(b) for ctr in [b.count(c) for c in set(b)]]]) or 0.
 
 
-def ensure_str(s, encoding='utf-8', errors='strict'):
+def ensure_str(s: str | bytes, encoding: str = "utf-8", errors: str = "strict") -> str:
     """ Ensure that an input string is decoded. """
     if isinstance(s, bytes):
         try:
@@ -19,7 +21,7 @@ def ensure_str(s, encoding='utf-8', errors='strict'):
     return s
 
 
-def human_readable_size(size, precision=0):
+def human_readable_size(size: int, precision: int = 0) -> str:
     """ Display bytes' size in a human-readable format given a precision. """
     i, units = 0, ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"]
     while size >= 1024 and i < len(units)-1:
@@ -28,27 +30,45 @@ def human_readable_size(size, precision=0):
     return "%.*f%s" % (precision, size, units[i])
 
 
-def ngrams_counts(byte_obj, n=1, step=1):
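+def ngrams_counts(byte_obj: bytes | object, n: int = 1, step: int = 1) -> dict[bytes, int]:
-    """ Output the Counter instance for an input byte sequence or byte object based on n-grams.
+    """ Output the dictionary of counts for an input byte sequence or byte object based on n-grams.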
          If the input is a byte object, cache the result.
     
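-    :param byte_obj:      byte sequence ('bytes') or byte object with "bytes" and "size" attributes (i.e. pathlib2.Path)
+    :param byte_obj:      byte sequence ('bytes') or byte object with a "bytes" attribute (e.g. pathlib2.Path)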
     :param n: n determining the size of n-grams, defaults to 1
     :param step:          step for sliding the n-grams
+    :return:              dictionary mapping each n-gram ('bytes') to its number of occurrences
     """
-    from collections import Counter
-    if isinstance(byte_obj, (str, bytes)):
-        return Counter(byte_obj[i:i+n] for i in range(0, len(byte_obj)-n+1, step))
-    elif hasattr(byte_obj, "bytes") and hasattr(byte_obj, "size"):
-        if not hasattr(byte_obj, "_ngram_counts_cache"):
-            byte_obj._ngram_counts_cache = {}
-        if n not in byte_obj._ngram_counts_cache.keys():
-            byte_obj._ngram_counts_cache[n] = Counter(byte_obj.bytes[i:i+n] for i in range(0, byte_obj.size-n+1, step))
-        return byte_obj._ngram_counts_cache[n]
+    if n not in (1, 2, 3):
+        raise ValueError("n must be 1, 2, or 3")
+    if step <= 0:
+        raise ValueError("step must be positive")
+    if isinstance(byte_obj, bytes) or hasattr(byte_obj, "bytes"):
+        a = np.frombuffer(data := byte_obj if isinstance(byte_obj, bytes) else byte_obj.bytes, dtype=np.uint8)
+        l = a.size
+        if l < n:
+            return {}
+        if n == 1:
+            counts = {b.to_bytes(1, "big"): int(c) for b, c in \
+                      enumerate(np.bincount(np.frombuffer(data, dtype=np.uint8)))}
+        else:
+            end = (m := (l - n) // step + 1) * step
+            grams = np.stack((a[0:end:step], a[1:1+end:step]), axis=1) if n == 2 else \
+                    np.stack((a[0:end:step], a[1:1+end:step], a[2:2+end:step]), axis=1)
+            counts = {bytes(row): int(c) for row, c in zip(*np.unique(grams, axis=0, return_counts=True))}
+        if isinstance(byte_obj, bytes):
+            return counts
+        elif hasattr(byte_obj, "bytes"):
+            if not hasattr(byte_obj, "_ngram_counts_cache"):
+                byte_obj._ngram_counts_cache = {}
+            if n not in byte_obj._ngram_counts_cache.keys():
+                byte_obj._ngram_counts_cache[n] = counts
+            return byte_obj._ngram_counts_cache[n]
     raise TypeError("Bad input type ; should be a byte sequence or object")
 
 
-def ngrams_distribution(byte_obj, n=1, step=1, n_most_common=None, n_exclude_top=0, exclude=None):
+def ngrams_distribution(byte_obj: bytes | object, n: int = 1, step: int = 1, n_most_common: Optional[int] = None,
+                        n_exclude_top: int = 0, exclude: Optional[list] = None) -> list[tuple[bytes, int]]:
     """ Compute the n-grams distribution of an input byte sequence or byte object given exclusions.
     
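-    :param byte_obj:      byte sequence ('bytes') or byte object with "bytes" and "size" attributes (i.e. pathlib2.Path)
+    :param byte_obj:      byte sequence ('bytes') or byte object with a "bytes" attribute (e.g. pathlib2.Path)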
@@ -60,7 +80,8 @@ def ngrams_distribution(byte_obj, n=1, step=1, n_most_common=None, n_exclude_top
     :return:              list of n_most_common (n-gram, count) pairs
     """
     c = ngrams_counts(byte_obj, n, step)
-    r = c.most_common(len(c) if n_most_common is None else n_most_common + n_exclude_top + len(exclude or []))
+    n = len(c) if n_most_common is None else n_most_common + n_exclude_top + len(exclude or [])
+    r = sorted(c.items(), key=lambda p: p[1], reverse=True)[:n]
     if exclude is not None:
         r = [(ngram, count) for ngram, count in r if ngram not in exclude]
     return r[n_exclude_top:n_exclude_top+(n_most_common or len(c))]
diff --git a/tests/test_others.py b/tests/test_others.py
@@ -2,7 +2,6 @@
 # -*- coding: UTF-8 -*-
 import matplotlib.pyplot as plt
 import os
-from collections import Counter
 from exeplot.plots.__common__ import Binary
 from exeplot.utils import *
 from unittest import TestCase
@@ -21,10 +20,14 @@ def test_miscellaneous(self):
 class TestUtils(TestCase):
     def test_ngrams_functions(self):
         self.assertRaises(TypeError, ngrams_counts, 123)
-        self.assertTrue(isinstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4), Counter))
+        for n in [0, 4]:
+            self.assertRaises(ValueError, ngrams_counts, b"abc", n=n)
+        self.assertRaises(ValueError, ngrams_counts, b"abc", step=-1)
+        self.assertEqual(ngrams_counts(b"a", n=2), {})
+        self.assertTrue(isinstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4), dict))
+        self.assertTrue(isinstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4, n=2), dict))
         class Test:
             bytes = seq
-            size = len(seq)
         histogram = ngrams_distribution(t := Test(), exclude=(b"\x00", b"\xff"))
         self.assertTrue(isinstance(histogram, list))
         self.assertNotIn(b"\x00", [b for b, c in histogram])