Skip to content

Commit 36f39ab

Browse files
committed
Optimized utils.ngrams_counts (2)
1 parent d9d4028 commit 36f39ab

4 files changed

Lines changed: 20 additions & 14 deletions

File tree

src/exeplot/VERSION.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.5.2
1+
0.5.4

src/exeplot/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# -*- coding: UTF-8 -*-
2+
"""Exeplot package.
3+
4+
"""
25
from .__conf__ import *
6+
from .__info__ import __author__, __copyright__, __license__, __version__
37
from .plots import *
48
from .plots import __all__
59

src/exeplot/utils.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -30,19 +30,22 @@ def human_readable_size(size: int, precision: int = 0) -> str:
3030
return "%.*f%s" % (precision, size, units[i])
3131

3232

33-
def ngrams_counts(byte_obj: bytes | object, n: int = 1, step: int = 1) -> dict[bytes, int]:
34-
""" Output the Counter instance for an input byte sequence or byte object based on n-grams.
35-
If the input is a byte object, cache the result.
33+
def ngrams_counts(byte_obj: bytes | object, n: int = 1, step: int = 1) -> list[tuple[bytes, int]]:
34+
""" Output a list of (n-gram, count) tuples, sorted by descending count, for an input byte sequence or byte object.
35+
If the input is a byte object, the result is cached.
3636
37-
:param byte_obj: byte sequence ('bytes') or byte object with "bytes" and "size" attributes (i.e. pathlib2.Path)
38-
:param n: n determining the size of n-grams, defaults to 1
39-
:param step: step for sliding the n-grams
40-
:param start: number of bytes to start from
37+
:param byte_obj: byte sequence ('bytes') or byte object with "bytes" and "size" attributes (e.g. pathlib2.Path)
38+
:param n: n determining the size of n-grams, defaults to 1
39+
:param step: step for sliding the n-grams
4140
"""
4241
if n not in (1, 2, 3):
4342
raise ValueError("n must be 1, 2, or 3")
4443
if step <= 0:
4544
raise ValueError("step must be positive")
45+
try:
46+
return byte_obj._ngram_counts_cache[n]
47+
except (AttributeError, KeyError):
48+
pass
4649
if isinstance(byte_obj, bytes) or hasattr(byte_obj, "bytes"):
4750
a = np.frombuffer(data := byte_obj if isinstance(byte_obj, bytes) else byte_obj.bytes, dtype=np.uint8)
4851
l = a.size
@@ -56,6 +59,7 @@ def ngrams_counts(byte_obj: bytes | object, n: int = 1, step: int = 1) -> dict[b
5659
grams = np.stack((a[0:end:step], a[1:1+end:step]), axis=1) if n == 2 else \
5760
np.stack((a[0:end:step], a[1:1+end:step], a[2:2+end:step]), axis=1)
5861
counts = {bytes(row): int(c) for row, c in zip(*np.unique(grams, axis=0, return_counts=True))}
62+
counts = sorted(counts.items(), key=lambda p: p[1], reverse=True)
5963
if isinstance(byte_obj, bytes):
6064
return counts
6165
elif hasattr(byte_obj, "bytes"):
@@ -80,8 +84,7 @@ def ngrams_distribution(byte_obj: bytes | object, n: int = 1, step: int = 1, n_m
8084
:return: list of n_most_common (n-gram, count) pairs
8185
"""
8286
c = ngrams_counts(byte_obj, n, step)
83-
n = len(c) if n_most_common is None else n_most_common + n_exclude_top + len(exclude or [])
84-
r = sorted(c.items(), key=lambda p: p[1], reverse=True)[:n]
87+
r = c[:len(c) if n_most_common is None else n_most_common + n_exclude_top + len(exclude or [])]
8588
if exclude is not None:
8689
r = [(ngram, count) for ngram, count in r if ngram not in exclude]
8790
return r[n_exclude_top:n_exclude_top+(n_most_common or len(c))]

tests/test_others.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,11 @@ def test_ngrams_functions(self):
2424
self.assertRaises(ValueError, ngrams_counts, b"abc", n=n)
2525
self.assertRaises(ValueError, ngrams_counts, b"abc", step=-1)
2626
self.assertEqual(ngrams_counts(b"a", n=2), {})
27-
self.assertTrue(isinstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4), dict))
28-
self.assertTrue(isinstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4, n=2), dict))
27+
self.assertTrue(isinstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4), list))
28+
self.assertTrue(isinstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4, n=2), list))
2929
class Test:
3030
bytes = seq
31-
histogram = ngrams_distribution(t := Test(), exclude=(b"\x00", b"\xff"))
32-
self.assertTrue(isinstance(histogram, list))
31+
self.assertTrue(isinstance(histogram := ngrams_distribution(t := Test(), exclude=(b"\x00", b"\xff")), list))
3332
self.assertNotIn(b"\x00", [b for b, c in histogram])
3433
self.assertNotIn(b"\xff", [b for b, c in histogram])
3534
histogram2 = ngrams_distribution(t, n_most_common=300)

0 commit comments

Comments
 (0)