diff --git a/lightllm/server/audioserver/model_infer/model_rpc.py b/lightllm/server/audioserver/model_infer/model_rpc.py
index ad18ffd7f5..a8a2c39c3e 100644
--- a/lightllm/server/audioserver/model_infer/model_rpc.py
+++ b/lightllm/server/audioserver/model_infer/model_rpc.py
@@ -43,7 +43,10 @@ def exposed_init_model(self, kvargs):
             set_current_device_id(torch.cuda.current_device())
-            self.cpu_embed_cache_client = CpuEmbedCacheClient(create_meta_data=False, init_shm_data=False)
+            self.cpu_embed_cache_client = CpuEmbedCacheClient(
+                create_meta_data=False,
+                init_shm_data=False,
+            )
         except Exception as e:
             print("#" * 16)
             print("load model error:", str(e), e, type(e))
diff --git a/lightllm/server/embed_cache/embed_cache_client.py b/lightllm/server/embed_cache/embed_cache_client.py
index 8c5b7f71ee..6fcc2d3783 100644
--- a/lightllm/server/embed_cache/embed_cache_client.py
+++ b/lightllm/server/embed_cache/embed_cache_client.py
@@ -24,11 +24,11 @@ def __init__(self, create_meta_data: bool, init_shm_data: bool):
         if create_meta_data:
             self.token_index_manager = MemoryManager(total_size=self.token_num)
+
+        if init_shm_data:
+            self._create_shm_embed_kv_cache()
         else:
-            if init_shm_data:
-                self._create_shm_embed_kv_cache()
-            else:
-                self._attach_shm_cpu_embed_cache()
+            self._attach_shm_cpu_embed_cache()
         return

     def alloc_indexes(self, token_num: int) -> Optional["MemoryBlock"]:
@@ -64,21 +64,7 @@ def _create_shm_embed_kv_cache(self):
         shm_ptr = create_shm_kv_cache_ptr(
             key=self.args.multi_modal_cache_shm_id, size=self.embed_cache_tensor_meta.calcu_size()
         )
-        handle = register_shm_ptr_to_pin(shm_ptr=shm_ptr, size=self.embed_cache_tensor_meta.calcu_size())
-        handle.wait()
-        numpy_array = np.frombuffer(
-            memoryview((ctypes.c_uint8 * self.embed_cache_tensor_meta.calcu_size()).from_address(shm_ptr)),
-            dtype=np.uint8,
-        )
-        # 将 NumPy 数组转换为 PyTorch 张量
-        shape = (
-            self.embed_cache_tensor_meta.token_num,
-            self.embed_cache_tensor_meta.layer_num,
-            self.embed_cache_tensor_meta.hidden_size,
-        )
-        self.cpu_embed_cache_tensor = (
-            torch.from_numpy(numpy_array).view(dtype=self.embed_cache_tensor_meta.data_type).view(shape)
-        )
+        logger.info(f"create embed cache shm ptr: {shm_ptr}, size: {self.embed_cache_tensor_meta.calcu_size()}")
         return

     def _attach_shm_cpu_embed_cache(self):
diff --git a/lightllm/server/embed_cache/impl/naive_memory_cache.py b/lightllm/server/embed_cache/impl/naive_memory_cache.py
index 61ba46d7c6..fbce108762 100644
--- a/lightllm/server/embed_cache/impl/naive_memory_cache.py
+++ b/lightllm/server/embed_cache/impl/naive_memory_cache.py
@@ -45,7 +45,7 @@ def __init__(self, args) -> None:
         self.token_id_range_start = 0
         self.token_id_range_end = 0
         self.use_config_server = self.args.config_server_host and self.args.config_server_port
-        self.cpu_embed_cache_client = CpuEmbedCacheClient(create_meta_data=True, init_shm_data=False)
+        self.cpu_embed_cache_client = CpuEmbedCacheClient(create_meta_data=True, init_shm_data=True)

     def _check_and_set_new_id_range(self, alloced_token_num):
         need_update_range = self.token_id_range_start + alloced_token_num >= self.token_id_range_end
diff --git a/lightllm/server/visualserver/model_infer/model_rpc.py b/lightllm/server/visualserver/model_infer/model_rpc.py
index 3e97f4de3e..6355ac2dbf 100644
--- a/lightllm/server/visualserver/model_infer/model_rpc.py
+++ b/lightllm/server/visualserver/model_infer/model_rpc.py
@@ -95,7 +95,7 @@ def exposed_init_model(self, kvargs):
             self.model.load_model(weight_dir)
             self.model = self.model.cuda()
-            self.cpu_embed_cache_client = CpuEmbedCacheClient(create_meta_data=False, init_shm_data=True)
+            self.cpu_embed_cache_client = CpuEmbedCacheClient(create_meta_data=False, init_shm_data=False)
         except Exception as e:
             print("#" * 16)
             print("load model error:", str(e), e, type(e))
diff --git a/lightllm/utils/device_utils.py b/lightllm/utils/device_utils.py
index a1ed6ed950..43b10ec88b 100644
--- a/lightllm/utils/device_utils.py
+++ b/lightllm/utils/device_utils.py
@@ -88,6 +88,18 @@ def is_musa():
     return hasattr(torch.version, "musa") and torch.version.musa is not None


+@lru_cache(maxsize=1)
+def is_nvidia():
+    ans = (
+        torch.cuda.is_available()
+        and getattr(torch.version, "cuda", None) is not None
+        and getattr(torch.version, "hip", None) is None
+        and not is_musa()
+    )
+    logger.info(f"device is_nvidia : {ans}")
+    return ans
+
+
 @lru_cache(maxsize=None)
 def get_current_device_name():
     if torch.cuda.is_available() or is_musa():
@@ -260,20 +272,33 @@ def set_sm_limit(percent: int, gpu_index=0):
     return True


+@lru_cache(maxsize=None)
+def support_tma() -> bool:
+    # disable the TMA feature on the 5090; in practice, enabling it showed no real benefit
+    return is_nvidia() and torch.cuda.get_device_capability() >= (9, 0) and not is_5090_gpu()
+
+
 @lru_cache(maxsize=None)
 def triton_support_tensor_descriptor() -> bool:
+    if not support_tma():
+        logger.info(
+            "triton tensor_descriptor requires NVIDIA Hopper or newer GPU (compute capability >= 9.0) "
+            "and is not supported on 5090"
+        )
+        return False
+
     try:
         from triton.tools.tensor_descriptor import TensorDescriptor

-        support_tma = torch.cuda.get_device_capability() >= (9, 0)
-        if support_tma:
-            logger.info("triton support tensor_descriptor")
-            return True
-        else:
-            assert False
-    except:
-        logger.info("triton not support tensor_descriptor")
-        return False
+        _ = TensorDescriptor
+
+        logger.info("triton support tensor_descriptor")
+        return True
+    except Exception:
+        pass
+
+    logger.info("triton not support tensor_descriptor")
+    return False


 @lru_cache(maxsize=None)