@@ -30,19 +30,22 @@ def human_readable_size(size: int, precision: int = 0) -> str:
3030 return "%.*f%s" % (precision , size , units [i ])
3131
3232
33- def ngrams_counts (byte_obj : bytes | object , n : int = 1 , step : int = 1 ) -> dict [ bytes , int ]:
34- """ Output the Counter instance for an input byte sequence or byte object based on n-grams .
35- If the input is a byte object, cache the result.
33+ def ngrams_counts (byte_obj : bytes | object , n : int = 1 , step : int = 1 ) -> list [ tuple [ bytes , int ] ]:
34+ """ Output a sorted list of tuples (n-gram, counts) for an input byte sequence or byte object.
35+ If the input is a byte object, the result is cached .
3636
37- :param byte_obj: byte sequence ('bytes') or byte object with "bytes" and "size" attributes (i.e. pathlib2.Path)
38- :param n: n determining the size of n-grams, defaults to 1
39- :param step: step for sliding the n-grams
40- :param start: number of bytes to start from
37+ :param byte_obj: byte sequence ('bytes') or byte object with "bytes" and "size" attributes (i.e. pathlib2.Path)
38+ :param n: n determining the size of n-grams, defaults to 1
39+ :param step: step for sliding the n-grams
4140 """
4241 if n not in (1 , 2 , 3 ):
4342 raise ValueError ("n must be 1, 2, or 3" )
4443 if step <= 0 :
4544 raise ValueError ("step must be positive" )
45+ try :
46+ return byte_obj ._ngram_counts_cache [n ]
47+ except (AttributeError , KeyError ):
48+ pass
4649 if isinstance (byte_obj , bytes ) or hasattr (byte_obj , "bytes" ):
4750 a = np .frombuffer (data := byte_obj if isinstance (byte_obj , bytes ) else byte_obj .bytes , dtype = np .uint8 )
4851 l = a .size
@@ -56,6 +59,7 @@ def ngrams_counts(byte_obj: bytes | object, n: int = 1, step: int = 1) -> dict[b
5659 grams = np .stack ((a [0 :end :step ], a [1 :1 + end :step ]), axis = 1 ) if n == 2 else \
5760 np .stack ((a [0 :end :step ], a [1 :1 + end :step ], a [2 :2 + end :step ]), axis = 1 )
5861 counts = {bytes (row ): int (c ) for row , c in zip (* np .unique (grams , axis = 0 , return_counts = True ))}
62+ counts = sorted (counts .items (), key = lambda p : p [1 ], reverse = True )
5963 if isinstance (byte_obj , bytes ):
6064 return counts
6165 elif hasattr (byte_obj , "bytes" ):
@@ -80,8 +84,7 @@ def ngrams_distribution(byte_obj: bytes | object, n: int = 1, step: int = 1, n_m
8084 :return: list of n_most_common (n-gram, count) pairs
8185 """
8286 c = ngrams_counts (byte_obj , n , step )
83- n = len (c ) if n_most_common is None else n_most_common + n_exclude_top + len (exclude or [])
84- r = sorted (c .items (), key = lambda p : p [1 ], reverse = True )[:n ]
87+ r = c [:len (c ) if n_most_common is None else n_most_common + n_exclude_top + len (exclude or [])]
8588 if exclude is not None :
8689 r = [(ngram , count ) for ngram , count in r if ngram not in exclude ]
8790 return r [n_exclude_top :n_exclude_top + (n_most_common or len (c ))]
0 commit comments