Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion lightllm/server/audioserver/model_infer/model_rpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ def exposed_init_model(self, kvargs):

set_current_device_id(torch.cuda.current_device())

self.cpu_embed_cache_client = CpuEmbedCacheClient(create_meta_data=False, init_shm_data=False)
self.cpu_embed_cache_client = CpuEmbedCacheClient(
create_meta_data=False,
init_shm_data=False,
)
except Exception as e:
print("#" * 16)
print("load model error:", str(e), e, type(e))
Expand Down
24 changes: 5 additions & 19 deletions lightllm/server/embed_cache/embed_cache_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ def __init__(self, create_meta_data: bool, init_shm_data: bool):

if create_meta_data:
self.token_index_manager = MemoryManager(total_size=self.token_num)

if init_shm_data:
self._create_shm_embed_kv_cache()
else:
if init_shm_data:
self._create_shm_embed_kv_cache()
else:
self._attach_shm_cpu_embed_cache()
self._attach_shm_cpu_embed_cache()
return

def alloc_indexes(self, token_num: int) -> Optional["MemoryBlock"]:
Expand Down Expand Up @@ -64,21 +64,7 @@ def _create_shm_embed_kv_cache(self):
shm_ptr = create_shm_kv_cache_ptr(
key=self.args.multi_modal_cache_shm_id, size=self.embed_cache_tensor_meta.calcu_size()
)
handle = register_shm_ptr_to_pin(shm_ptr=shm_ptr, size=self.embed_cache_tensor_meta.calcu_size())
handle.wait()
numpy_array = np.frombuffer(
memoryview((ctypes.c_uint8 * self.embed_cache_tensor_meta.calcu_size()).from_address(shm_ptr)),
dtype=np.uint8,
)
# 将 NumPy 数组转换为 PyTorch 张量
shape = (
self.embed_cache_tensor_meta.token_num,
self.embed_cache_tensor_meta.layer_num,
self.embed_cache_tensor_meta.hidden_size,
)
self.cpu_embed_cache_tensor = (
torch.from_numpy(numpy_array).view(dtype=self.embed_cache_tensor_meta.data_type).view(shape)
)
logger.info(f"create embed cache shm ptr: {shm_ptr}, size: {self.embed_cache_tensor_meta.calcu_size()}")
return

def _attach_shm_cpu_embed_cache(self):
Expand Down
2 changes: 1 addition & 1 deletion lightllm/server/embed_cache/impl/naive_memory_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __init__(self, args) -> None:
self.token_id_range_start = 0
self.token_id_range_end = 0
self.use_config_server = self.args.config_server_host and self.args.config_server_port
self.cpu_embed_cache_client = CpuEmbedCacheClient(create_meta_data=True, init_shm_data=False)
self.cpu_embed_cache_client = CpuEmbedCacheClient(create_meta_data=True, init_shm_data=True)

def _check_and_set_new_id_range(self, alloced_token_num):
need_update_range = self.token_id_range_start + alloced_token_num >= self.token_id_range_end
Expand Down
2 changes: 1 addition & 1 deletion lightllm/server/visualserver/model_infer/model_rpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def exposed_init_model(self, kvargs):

self.model.load_model(weight_dir)
self.model = self.model.cuda()
self.cpu_embed_cache_client = CpuEmbedCacheClient(create_meta_data=False, init_shm_data=True)
self.cpu_embed_cache_client = CpuEmbedCacheClient(create_meta_data=False, init_shm_data=False)
except Exception as e:
print("#" * 16)
print("load model error:", str(e), e, type(e))
Expand Down
43 changes: 34 additions & 9 deletions lightllm/utils/device_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,18 @@ def is_musa():
return hasattr(torch.version, "musa") and torch.version.musa is not None


@lru_cache(maxsize=1)
def is_nvidia():
    """Return True when the active torch device is a genuine NVIDIA CUDA GPU.

    A CUDA build of torch exposes ``torch.version.cuda``; ROCm builds expose
    ``torch.version.hip`` instead, and MUSA devices are excluded via is_musa().
    Cached: the device type cannot change within a process.
    """
    has_cuda_runtime = getattr(torch.version, "cuda", None) is not None
    has_hip_runtime = getattr(torch.version, "hip", None) is not None
    result = torch.cuda.is_available() and has_cuda_runtime and not has_hip_runtime and not is_musa()
    logger.info(f"device is_nvidia : {result}")
    return result


@lru_cache(maxsize=None)
def get_current_device_name():
if torch.cuda.is_available() or is_musa():
Expand Down Expand Up @@ -260,20 +272,33 @@ def set_sm_limit(percent: int, gpu_index=0):
return True


@lru_cache(maxsize=None)
def support_tma() -> bool:
    """Whether the current GPU supports TMA: NVIDIA, compute capability >= 9.0, and not a 5090."""
    # TMA is deliberately disabled on the 5090 — real-world testing showed no benefit there.
    if not is_nvidia():
        return False
    hopper_or_newer = torch.cuda.get_device_capability() >= (9, 0)
    return hopper_or_newer and not is_5090_gpu()


@lru_cache(maxsize=None)
def triton_support_tensor_descriptor() -> bool:
    """Return True when both the GPU and the installed triton support tensor descriptors (TMA).

    Requires a TMA-capable device (see support_tma) and a triton build that
    ships ``triton.tools.tensor_descriptor``. Result is cached for the process
    lifetime since neither the device nor the triton install changes at runtime.
    """
    # NOTE: this span in SOURCE interleaved the deleted and added diff lines and
    # was not valid Python; this is the reconstructed post-change implementation.
    if not support_tma():
        logger.info(
            "triton tensor_descriptor requires NVIDIA Hopper or newer GPU (compute capability >= 9.0) "
            "and is not supported on 5090"
        )
        return False

    try:
        # Probe the triton installation: older triton versions do not ship this module.
        from triton.tools.tensor_descriptor import TensorDescriptor

        _ = TensorDescriptor  # reference only, to mark the import as intentional

        logger.info("triton support tensor_descriptor")
        return True
    except Exception:
        # ImportError (or anything raised by the probe) means the feature is absent.
        pass

    logger.info("triton not support tensor_descriptor")
    return False


@lru_cache(maxsize=None)
Expand Down
Loading