Optimized utils.ngrams_counts (2)

dhondta · dhondta · commit 36f39ab61db9 · 2026-02-16T08:49:59.000+01:00
diff --git a/src/exeplot/VERSION.txt b/src/exeplot/VERSION.txt
@@ -1 +1 @@
-0.5.2
+0.5.4
diff --git a/src/exeplot/__init__.py b/src/exeplot/__init__.py
@@ -1,5 +1,9 @@
 # -*- coding: UTF-8 -*-
+"""Exeplot package.
+
+"""
 from .__conf__ import *
+from .__info__ import __author__, __copyright__, __license__, __version__
 from .plots import *
 from .plots import __all__
 
diff --git a/src/exeplot/utils.py b/src/exeplot/utils.py
@@ -30,19 +30,22 @@ def human_readable_size(size: int, precision: int = 0) -> str:
     return "%.*f%s" % (precision, size, units[i])
 
 
-def ngrams_counts(byte_obj: bytes | object, n: int = 1, step: int = 1) -> dict[bytes, int]:
-    """ Output the Counter instance for an input byte sequence or byte object based on n-grams.
-         If the input is a byte object, cache the result.
+def ngrams_counts(byte_obj: bytes | object, n: int = 1, step: int = 1) -> list[tuple[bytes, int]]:
+    """ Output a list of (n-gram, count) tuples, sorted by descending count, for an input byte sequence or byte object.
+         If the input is a byte object, the result is cached.
     
-    :param byte_obj:      byte sequence ('bytes') or byte object with "bytes" and "size" attributes (i.e. pathlib2.Path)
-    :param n: n determining the size of n-grams, defaults to 1
-    :param step:          step for sliding the n-grams
-    :param start:         number of bytes to start from
+    :param byte_obj: byte sequence ('bytes') or byte object with "bytes" and "size" attributes (e.g. pathlib2.Path)
+    :param n:        n determining the size of n-grams, defaults to 1
+    :param step:     step for sliding the n-grams
     """
     if n not in (1, 2, 3):
         raise ValueError("n must be 1, 2, or 3")
     if step <= 0:
         raise ValueError("step must be positive")
+    try:
+        return byte_obj._ngram_counts_cache[n]
+    except (AttributeError, KeyError):
+        pass
     if isinstance(byte_obj, bytes) or hasattr(byte_obj, "bytes"):
         a = np.frombuffer(data := byte_obj if isinstance(byte_obj, bytes) else byte_obj.bytes, dtype=np.uint8)
         l = a.size
@@ -56,6 +59,7 @@ def ngrams_counts(byte_obj: bytes | object, n: int = 1, step: int = 1) -> dict[b
             grams = np.stack((a[0:end:step], a[1:1+end:step]), axis=1) if n == 2 else \
                     np.stack((a[0:end:step], a[1:1+end:step], a[2:2+end:step]), axis=1)
             counts = {bytes(row): int(c) for row, c in zip(*np.unique(grams, axis=0, return_counts=True))}
+        counts = sorted(counts.items(), key=lambda p: p[1], reverse=True)
         if isinstance(byte_obj, bytes):
             return counts
         elif hasattr(byte_obj, "bytes"):
@@ -80,8 +84,7 @@ def ngrams_distribution(byte_obj: bytes | object, n: int = 1, step: int = 1, n_m
     :return:              list of n_most_common (n-gram, count) pairs
     """
     c = ngrams_counts(byte_obj, n, step)
-    n = len(c) if n_most_common is None else n_most_common + n_exclude_top + len(exclude or [])
-    r = sorted(c.items(), key=lambda p: p[1], reverse=True)[:n]
+    r = c[:len(c) if n_most_common is None else n_most_common + n_exclude_top + len(exclude or [])]
     if exclude is not None:
         r = [(ngram, count) for ngram, count in r if ngram not in exclude]
     return r[n_exclude_top:n_exclude_top+(n_most_common or len(c))]
diff --git a/tests/test_others.py b/tests/test_others.py
@@ -24,12 +24,11 @@ def test_ngrams_functions(self):
             self.assertRaises(ValueError, ngrams_counts, b"abc", n=n)
         self.assertRaises(ValueError, ngrams_counts, b"abc", step=-1)
         self.assertEqual(ngrams_counts(b"a", n=2), {})
-        self.assertTrue(isinstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4), dict))
-        self.assertTrue(isinstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4, n=2), dict))
+        self.assertTrue(isinstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4), list))
+        self.assertTrue(isinstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4, n=2), list))
         class Test:
             bytes = seq
-        histogram = ngrams_distribution(t := Test(), exclude=(b"\x00", b"\xff"))
-        self.assertTrue(isinstance(histogram, list))
+        self.assertTrue(isinstance(histogram := ngrams_distribution(t := Test(), exclude=(b"\x00", b"\xff")), list))
         self.assertNotIn(b"\x00", [b for b, c in histogram])
         self.assertNotIn(b"\xff", [b for b, c in histogram])
         histogram2 = ngrams_distribution(t, n_most_common=300)