Skip to content

Commit 67d4ca5

Browse files
committed
Optimized utils.ngrams_counts
1 parent 6134af6 commit 67d4ca5

8 files changed

Lines changed: 53 additions & 27 deletions

File tree

.coveragerc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ exclude_lines =
2121
if self.type not in ["ELF", "MachO", "PE"]:
2222
if j <= i or start2 is None:
2323
if s != self.__size:
24+
glob['_IMP'] = True

.github/workflows/python-package.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ jobs:
1919
fail-fast: false
2020
matrix:
2121
os: [ubuntu-latest]
22-
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
22+
python-version: ["3.10", "3.11", "3.12", "3.13"]
2323
steps:
2424
- uses: actions/checkout@v3
2525
- name: Set up Python ${{ matrix.python-version }}

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ authors = [
2222
description = "Library for plotting executable samples supporting multiple formats"
2323
license = {file = "LICENSE"}
2424
keywords = ["python", "development", "programming", "executable-samples", "plot", "entropy", "cfg"]
25-
requires-python = ">=3.9,<4"
25+
requires-python = ">=3.10,<4"
2626
classifiers = [
2727
"Development Status :: 5 - Production/Stable",
2828
"Environment :: Console",

src/exeplot/VERSION.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.4.3
1+
0.5.2

src/exeplot/__conf__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
numpy.int = numpy.int_ # dirty fix to "AttributeError: module 'numpy' has no attribute 'int'."
3030

3131

32-
def check_imports(*names):
32+
def check_imports(*names) -> None:
3333
import warnings
3434
from inspect import currentframe
3535
glob = currentframe().f_back.f_globals
@@ -42,7 +42,7 @@ def check_imports(*names):
4242
glob['_IMP'] = False
4343

4444

45-
def configure(): # pragma: no cover
45+
def configure() -> None: # pragma: no cover
4646
from configparser import ConfigParser
4747
from os.path import exists, expanduser
4848
path = expanduser("~/.exeplot.conf")
@@ -58,7 +58,7 @@ def configure(): # pragma: no cover
5858
plt.rcParams['font.family'] = config['font_family']
5959

6060

61-
def configure_fonts(**kw):
61+
def configure_fonts(**kw) -> dict:
6262
import matplotlib
6363
matplotlib.rc('font', **{k.split("_")[1]: kw.pop(k, config[k]) for k in ['font_family', 'font_size']})
6464
kw['title-font'] = {'fontfamily': kw.pop('title_font_family', config['font_family']),

src/exeplot/__main__.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,20 @@
11
# -*- coding: UTF-8 -*-
2+
from argparse import ArgumentParser, Namespace, RawTextHelpFormatter
3+
24
from .__info__ import __author__, __copyright__, __email__, __license__, __source__, __version__
35
from .__init__ import *
46
from .__init__ import __all__ as _plots
57

68

7-
def _parser(name, description, examples):
8-
from argparse import ArgumentParser, RawTextHelpFormatter
9+
def _parser(name: str, description: str, examples: list[str]) -> ArgumentParser:
910
descr = f"{name} {__version__}\n\nAuthor : {__author__} ({__email__})\nCopyright: {__copyright__}\nLicense :" \
1011
f" {__license__}\nSource : {__source__}\n\n{description}.\n\n"
1112
examples = [f"exeplot {e}" if not e.startswith("exeplot ") else e for e in examples]
1213
return ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False,
1314
epilog="usage examples:\n " + "\n ".join(examples) if len(examples) > 0 else None)
1415

1516

16-
def _setup(parser): # pragma: no cover
17+
def _setup(parser: ArgumentParser) -> Namespace: # pragma: no cover
1718
args = parser.parse_args()
1819
if hasattr(args, "verbose"):
1920
import logging
@@ -22,7 +23,7 @@ def _setup(parser): # pragma: no cover
2223
return args
2324

2425

25-
def main():
26+
def main() -> None: # pragma: no cover
2627
from os import makedirs
2728
parser = _parser("Exeplot", "This tool allows to plot executable sample(s) in different ways",
2829
["byte binary.exe", "entropy binary1.exe binary2.exe --scale"])

src/exeplot/utils.py

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
# -*- coding: UTF-8 -*-
2+
import numpy as np
23
from math import log2
4+
from typing import Optional
35

46

57
__all__ = ["ensure_str", "human_readable_size", "ngrams_counts", "ngrams_distribution", "shannon_entropy"]
68

79
def shannon_entropy(b):
    """ Compute the Shannon entropy (in bits per symbol) of a byte sequence or string.

    :param b: sized sequence of symbols ('bytes' or 'str')
    :return:  entropy as a float ; 0.0 for an empty or single-symbol input
    """
    from collections import Counter
    total = len(b)
    # single pass over the data instead of one b.count() scan per distinct symbol
    probabilities = [ctr / total for ctr in Counter(b).values()]
    # 'or 0.' turns the -0.0 obtained for empty/constant inputs into a clean 0.0
    return -sum(p * log2(p) for p in probabilities) or 0.
810

911

10-
def ensure_str(s, encoding='utf-8', errors='strict'):
12+
def ensure_str(s: str | bytes, encoding: str = "utf-8", errors: str = "strict") -> str:
1113
""" Ensure that an input string is decoded. """
1214
if isinstance(s, bytes):
1315
try:
@@ -19,7 +21,7 @@ def ensure_str(s, encoding='utf-8', errors='strict'):
1921
return s
2022

2123

22-
def human_readable_size(size, precision=0):
24+
def human_readable_size(size: int, precision: int = 0) -> str:
2325
""" Display bytes' size in a human-readable format given a precision. """
2426
i, units = 0, ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"]
2527
while size >= 1024 and i < len(units)-1:
@@ -28,27 +30,45 @@ def human_readable_size(size, precision=0):
2830
return "%.*f%s" % (precision, size, units[i])
2931

3032

31-
def ngrams_counts(byte_obj, n=1, step=1):
33+
def ngrams_counts(byte_obj: bytes | object, n: int = 1, step: int = 1) -> dict[bytes, int]:
3234
""" Output the Counter instance for an input byte sequence or byte object based on n-grams.
3335
If the input is a byte object, cache the result.
3436
3537
:param byte_obj: byte sequence ('bytes') or byte object with "bytes" and "size" attributes (i.e. pathlib2.Path)
3638
:param n: n determining the size of n-grams, defaults to 1
3739
:param step: step for sliding the n-grams
40+
:param start: number of bytes to start from
3841
"""
39-
from collections import Counter
40-
if isinstance(byte_obj, (str, bytes)):
41-
return Counter(byte_obj[i:i+n] for i in range(0, len(byte_obj)-n+1, step))
42-
elif hasattr(byte_obj, "bytes") and hasattr(byte_obj, "size"):
43-
if not hasattr(byte_obj, "_ngram_counts_cache"):
44-
byte_obj._ngram_counts_cache = {}
45-
if n not in byte_obj._ngram_counts_cache.keys():
46-
byte_obj._ngram_counts_cache[n] = Counter(byte_obj.bytes[i:i+n] for i in range(0, byte_obj.size-n+1, step))
47-
return byte_obj._ngram_counts_cache[n]
42+
if n not in (1, 2, 3):
43+
raise ValueError("n must be 1, 2, or 3")
44+
if step <= 0:
45+
raise ValueError("step must be positive")
46+
if isinstance(byte_obj, bytes) or hasattr(byte_obj, "bytes"):
47+
a = np.frombuffer(data := byte_obj if isinstance(byte_obj, bytes) else byte_obj.bytes, dtype=np.uint8)
48+
l = a.size
49+
if l < n:
50+
return {}
51+
if n == 1:
52+
counts = {b.to_bytes(1, "big"): int(c) for b, c in \
53+
enumerate(np.bincount(np.frombuffer(data, dtype=np.uint8)))}
54+
else:
55+
end = (m := (l - n) // step + 1) * step
56+
grams = np.stack((a[0:end:step], a[1:1+end:step]), axis=1) if n == 2 else \
57+
np.stack((a[0:end:step], a[1:1+end:step], a[2:2+end:step]), axis=1)
58+
counts = {bytes(row): int(c) for row, c in zip(*np.unique(grams, axis=0, return_counts=True))}
59+
if isinstance(byte_obj, bytes):
60+
return counts
61+
elif hasattr(byte_obj, "bytes"):
62+
if not hasattr(byte_obj, "_ngram_counts_cache"):
63+
byte_obj._ngram_counts_cache = {}
64+
if n not in byte_obj._ngram_counts_cache.keys():
65+
byte_obj._ngram_counts_cache[n] = counts
66+
return byte_obj._ngram_counts_cache[n]
4867
raise TypeError("Bad input type ; should be a byte sequence or object")
4968

5069

51-
def ngrams_distribution(byte_obj, n=1, step=1, n_most_common=None, n_exclude_top=0, exclude=None):
70+
def ngrams_distribution(byte_obj: bytes | object, n: int = 1, step: int = 1, n_most_common: Optional[int] = None,
71+
n_exclude_top: int = 0, exclude: Optional[list] = None) -> list[tuple[bytes, int]]:
5272
""" Compute the n-grams distribution of an input byte sequence or byte object given exclusions.
5373
5474
:param byte_obj: byte sequence ('bytes') or byte object with "bytes" and "size" attributes (i.e. pathlib2.Path)
@@ -60,7 +80,8 @@ def ngrams_distribution(byte_obj, n=1, step=1, n_most_common=None, n_exclude_top
6080
:return: list of n_most_common (n-gram, count) pairs
6181
"""
6282
c = ngrams_counts(byte_obj, n, step)
63-
r = c.most_common(len(c) if n_most_common is None else n_most_common + n_exclude_top + len(exclude or []))
83+
n = len(c) if n_most_common is None else n_most_common + n_exclude_top + len(exclude or [])
84+
r = sorted(c.items(), key=lambda p: p[1], reverse=True)[:n]
6485
if exclude is not None:
6586
r = [(ngram, count) for ngram, count in r if ngram not in exclude]
6687
return r[n_exclude_top:n_exclude_top+(n_most_common or len(c))]

tests/test_others.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
# -*- coding: UTF-8 -*-
33
import matplotlib.pyplot as plt
44
import os
5-
from collections import Counter
65
from exeplot.plots.__common__ import Binary
76
from exeplot.utils import *
87
from unittest import TestCase
@@ -21,10 +20,14 @@ def test_miscellaneous(self):
2120
class TestUtils(TestCase):
2221
def test_ngrams_functions(self):
2322
self.assertRaises(TypeError, ngrams_counts, 123)
24-
self.assertTrue(isinstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4), Counter))
23+
for n in [0, 4]:
24+
self.assertRaises(ValueError, ngrams_counts, b"abc", n=n)
25+
self.assertRaises(ValueError, ngrams_counts, b"abc", step=-1)
26+
self.assertEqual(ngrams_counts(b"a", n=2), {})
27+
self.assertTrue(isinstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4), dict))
28+
self.assertTrue(isinstance(ngrams_counts(seq := b"\x00" * 4 + os.urandom(120) + b"\xff" * 4, n=2), dict))
2529
class Test:
2630
bytes = seq
27-
size = len(seq)
2831
histogram = ngrams_distribution(t := Test(), exclude=(b"\x00", b"\xff"))
2932
self.assertTrue(isinstance(histogram, list))
3033
self.assertNotIn(b"\x00", [b for b, c in histogram])

0 commit comments

Comments
 (0)