From 7cf4c3c1b41cd968513cf46be470d152cb7d1859 Mon Sep 17 00:00:00 2001
From: 3em0 <79475432@qq.com>
Date: Fri, 3 Apr 2026 16:32:27 +0800
Subject: [PATCH] [BugFix] fix multimodal hasher hash collision risk when
 ndarray shape or dtype differs

numpy tobytes() only serializes raw element bytes without encoding shape
or dtype metadata. This means arrays with identical raw bytes but
different shapes (e.g. (6,4) vs (4,6)) or different dtypes (e.g. float32
vs uint8 reinterpretation of same memory) produce the same SHA-256
digest, leading to silent cache collisions in ProcessorCacheManager /
EncoderCacheManager / PrefixCacheManager.

Prepend a "{shape}|{dtype}|" header to the byte payload before hashing
so that shape and dtype participate in the digest.

Added test cases for shape and dtype sensitivity. The dtype case compares
float32 with int32: both have the same itemsize, so the zero-filled
buffers are byte-identical and only the header can tell them apart (a
float32 vs float64 pair differs in byte length and would pass even
without this fix).
---
Note: the post-image blob id on the tests/multimodal/test_hasher.py
"index" line predates the dtype-test adjustment; it is not checked by
git apply / git am, but regenerate the patch if exact ids matter.

 fastdeploy/multimodal/hasher.py |  6 +++++-
 tests/multimodal/test_hasher.py | 16 +++++++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/fastdeploy/multimodal/hasher.py b/fastdeploy/multimodal/hasher.py
index 6d2fc4f9b91..8793107d3de 100644
--- a/fastdeploy/multimodal/hasher.py
+++ b/fastdeploy/multimodal/hasher.py
@@ -25,5 +25,9 @@ class MultimodalHasher:
     @classmethod
     def hash_features(cls, obj: object) -> str:
         if isinstance(obj, np.ndarray):
-            return hashlib.sha256((obj.tobytes())).hexdigest()
+            # Encode shape and dtype into the hash to avoid collisions between
+            # arrays that share the same raw bytes but differ in layout, e.g.
+            # a (6,4) vs (4,6) array, or float32 vs uint8 reinterpretation.
+            header = f"{obj.shape}|{obj.dtype}|".encode()
+            return hashlib.sha256(header + obj.tobytes()).hexdigest()
         return hashlib.sha256((pickle.dumps(obj))).hexdigest()
diff --git a/tests/multimodal/test_hasher.py b/tests/multimodal/test_hasher.py
index a89ff2cf13a..ea6368449dc 100644
--- a/tests/multimodal/test_hasher.py
+++ b/tests/multimodal/test_hasher.py
@@ -26,9 +26,23 @@ def test_hash_features_ndarray(self):
         """Test hash features with numpy ndarray"""
         arr = np.random.randint(low=0, high=255, size=(28, 28), dtype=np.uint8)
         arr_hash = MultimodalHasher.hash_features(arr)
-        target_hash = hashlib.sha256((arr.tobytes())).hexdigest()
+        header = f"{arr.shape}|{arr.dtype}|".encode()
+        target_hash = hashlib.sha256(header + arr.tobytes()).hexdigest()
         assert arr_hash == target_hash, f"Ndarray hash mismatch: {arr_hash} != {target_hash}"
 
+    def test_hash_features_ndarray_shape_sensitivity(self):
+        """Arrays with same bytes but different shapes must produce different hashes"""
+        base = np.arange(24, dtype=np.float32)
+        a = base.reshape(6, 4)
+        b = base.reshape(4, 6)
+        assert MultimodalHasher.hash_features(a) != MultimodalHasher.hash_features(b)
+
+    def test_hash_features_ndarray_dtype_sensitivity(self):
+        """Same-shape arrays with identical raw bytes but different dtypes must hash differently"""
+        a = np.zeros((4, 4), dtype=np.float32)
+        b = np.zeros((4, 4), dtype=np.int32)
+        assert MultimodalHasher.hash_features(a) != MultimodalHasher.hash_features(b)
+
     def test_hash_features_object(self):
         """Test hash features with unsupported object type"""
         obj = {"key": "value"}