Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
210 changes: 129 additions & 81 deletions devops/scripts/benchmarks/benches/compute/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,90 +525,11 @@ def createTorchLinearKernelSizeBench(variant_name: str, **kwargs):
),
]

# Graph benchmarks segfault on pvc
device_arch = getattr(options, "device_architecture", "")
if not ("pvc" in device_arch):
# Add TorchGraphSingleQueue benchmarks
for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
for profiler_type, kernel_name in product(
list(PROFILERS), list(KERNEL_NAME)
):

def createTorchGraphSingleQueueBench(variant_name: str, **kwargs):
return TorchGraphSingleQueue(
self,
runtime,
variant_name,
profiler_type,
fixed_args={
"KernelWGCount": 512,
"KernelWGSize": 256,
"Profiling": 0,
"UseEvents": 0,
},
**kwargs,
)

benches += [
createTorchGraphSingleQueueBench(
"small",
KernelName=kernel_name.value,
KernelsPerQueue=10,
KernelBatchSize=10,
),
createTorchGraphSingleQueueBench(
"medium",
KernelName=kernel_name.value,
KernelsPerQueue=32,
KernelBatchSize=32,
),
createTorchGraphSingleQueueBench(
"large",
KernelName=kernel_name.value,
KernelsPerQueue=64,
KernelBatchSize=64,
),
]

# Add TorchGraphMultiQueue benchmarks
for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
for profiler_type in list(PROFILERS):

def createTorchGraphMultiQueueBench(variant_name: str, **kwargs):
return TorchGraphMultiQueue(
self,
runtime,
variant_name,
profiler_type,
fixed_args={
"KernelWGCount": 512,
"KernelWGSize": 256,
"Profiling": 0,
"UseEvents": 0,
},
**kwargs,
)

benches += [
createTorchGraphMultiQueueBench(
"small",
KernelsPerQueue=10,
),
createTorchGraphMultiQueueBench(
"medium",
KernelsPerQueue=32,
),
createTorchGraphMultiQueueBench(
"large",
KernelsPerQueue=64,
),
]

# Add TorchSubmitEventRecordWait benchmarks
# Add TorchEventRecordWait benchmarks
for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
for profiler_type in list(PROFILERS):
benches.append(
TorchSubmitEventRecordWait(
TorchEventRecordWait(
self,
runtime,
"medium",
Expand All @@ -619,6 +540,133 @@ def createTorchGraphMultiQueueBench(variant_name: str, **kwargs):
)
)

#
# Note: Graph benchmarks segfault on pvc on L0, so every graph benchmark loop
# that consults device_arch skips the (pvc, Level Zero) combination.
#
# The architecture string is normalised once here: a missing or None option
# becomes "", and the value is lowercased so that "PVC" and "pvc" are treated
# alike (the integration tests lowercase the architecture the same way).
device_arch = (getattr(options, "device_architecture", "") or "").lower()

# Add TorchGraphSingleQueue benchmarks
for runtime in (rt for rt in RUNTIMES if rt != RUNTIMES.UR):
    if runtime == RUNTIMES.LEVEL_ZERO and "pvc" in device_arch:
        continue

    for profiler_type, kernel_name in product(PROFILERS, KERNEL_NAME):

        def createTorchGraphSingleQueueBench(variant_name: str, **kwargs):
            """Build one TorchGraphSingleQueue for the current runtime/profiler.

            ``variant_name`` is passed through as the variant label and
            ``kwargs`` are forwarded unchanged to the constructor. The helper
            is called within the same iteration that defines it, so it always
            sees the ``runtime`` and ``profiler_type`` of that iteration.
            """
            return TorchGraphSingleQueue(
                self,
                runtime,
                variant_name,
                profiler_type,
                # A fresh dict per benchmark so instances never share state.
                fixed_args={
                    "KernelWGCount": 512,
                    "KernelWGSize": 256,
                    "Profiling": 0,
                    "UseEvents": 0,
                },
                **kwargs,
            )

        # Each variant uses the same number for KernelsPerQueue and
        # KernelBatchSize; the order small -> medium -> large is preserved.
        benches += [
            createTorchGraphSingleQueueBench(
                variant_name,
                KernelName=kernel_name.value,
                KernelsPerQueue=kernel_count,
                KernelBatchSize=kernel_count,
            )
            for variant_name, kernel_count in (
                ("small", 10),
                ("medium", 32),
                ("large", 64),
            )
        ]

# Add TorchGraphMultiQueue benchmarks
for runtime in RUNTIMES:
    # The UR runtime is excluded from this group.
    if runtime == RUNTIMES.UR:
        continue
    # Graph benchmarks are skipped for Level Zero when the architecture
    # string contains "pvc".
    if runtime == RUNTIMES.LEVEL_ZERO and "pvc" in device_arch:
        continue

    for profiler_type in PROFILERS:
        # One benchmark per (variant label, KernelsPerQueue) pair, appended in
        # the order small -> medium -> large.
        benches += [
            TorchGraphMultiQueue(
                self,
                runtime,
                variant_label,
                profiler_type,
                fixed_args={
                    "KernelWGCount": 512,
                    "KernelWGSize": 256,
                    "Profiling": 0,
                    "UseEvents": 0,
                },
                KernelsPerQueue=kernels_per_queue,
            )
            for variant_label, kernels_per_queue in (
                ("small", 10),
                ("medium", 32),
                ("large", 64),
            )
        ]

# Add TorchGraphVllmMock benchmarks
# (variant label, AllocCount, GraphScenario) for every benchmark in this
# group, listed in the order they are appended. Several entries share the
# "large" label and differ only in GraphScenario.
vllm_mock_variants = (
    ("small", 32, 0),
    ("large", 128, 0),
    ("large", 128, 1),
    ("large", 128, 2),
    ("large", 128, 3),
)

for runtime in RUNTIMES:
    is_ur = runtime == RUNTIMES.UR
    # Graph benchmarks are skipped for Level Zero when the architecture
    # string contains "pvc".
    is_pvc_level_zero = runtime == RUNTIMES.LEVEL_ZERO and "pvc" in device_arch
    if is_ur or is_pvc_level_zero:
        continue

    for profiler_type in PROFILERS:
        benches += [
            TorchGraphVllmMock(
                self,
                runtime,
                variant_label,
                profiler_type,
                fixed_args={
                    "KernelWGCount": 512,
                    "KernelWGSize": 256,
                    "Profiling": 0,
                    "UseEvents": 0,
                },
                AllocCount=alloc_count,
                GraphScenario=graph_scenario,
            )
            for variant_label, alloc_count, graph_scenario in vllm_mock_variants
        ]

# Add UR-specific benchmarks
benches += [
# TODO: multithread_benchmark_ur fails with segfault
Expand Down
23 changes: 22 additions & 1 deletion devops/scripts/benchmarks/benches/compute/compute_torch.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def __init__(
)


class TorchSubmitEventRecordWait(TorchBenchmark):
class TorchEventRecordWait(TorchBenchmark):
def __init__(
self,
suite,
Expand All @@ -249,3 +249,24 @@ def __init__(
profiler_type,
**kwargs,
)


class TorchGraphVllmMock(TorchBenchmark):
    """Torch benchmark bound to the ``KernelSubmitGraphVllmMock`` test name."""

    def __init__(
        self,
        suite,
        runtime: RUNTIMES,
        variant_name: str,
        profiler_type: PROFILERS,
        fixed_args: dict | None = None,
        **kwargs,
    ):
        """Initialise the benchmark by delegating to ``TorchBenchmark``.

        Args:
            suite: Passed through as the first argument of the base class.
            runtime: Runtime the benchmark is created for.
            variant_name: Label of this benchmark variant.
            profiler_type: Profiler the benchmark is created for.
            fixed_args: Optional dict forwarded as ``fixed_args``; ``None``
                is forwarded as-is.
            **kwargs: Additional keyword arguments forwarded unchanged to the
                base class.
        """
        super().__init__(
            suite,
            runtime,
            # Fixed test name; everything else comes from the caller.
            "KernelSubmitGraphVllmMock",
            variant_name,
            profiler_type,
            fixed_args=fixed_args,
            **kwargs,
        )
61 changes: 36 additions & 25 deletions devops/scripts/benchmarks/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ def test_torch_l0(self):
"KernelSubmitMemoryReuse Int32Large",
{"pytorch", "L0"},
)
# FIXME: Graph benchmarks segfault on pvc
# FIXME: Graph benchmarks segfault on pvc on L0
if not ("pvc" in self.device_arch.lower()):
self._checkCase(
"torch_benchmark_l0 KernelSubmitGraphSingleQueue KernelBatchSize 10, KernelName Add, KernelsPerQueue 10 CPU count",
Expand All @@ -289,6 +289,11 @@ def test_torch_l0(self):
"KernelSubmitGraphMultiQueue large, CPU count",
{"pytorch", "L0"},
)
self._checkCase(
"torch_benchmark_l0 KernelSubmitGraphVllmMock AllocCount 128, GraphScenario 3",
"KernelSubmitGraphVllmMock large",
{"pytorch", "L0"},
)

def test_torch_sycl(self):
self._checkCase(
Expand Down Expand Up @@ -326,18 +331,21 @@ def test_torch_sycl(self):
"KernelSubmitMemoryReuse FloatLarge",
{"pytorch", "SYCL"},
)
# FIXME: Graph benchmarks segfault on pvc
if not ("pvc" in self.device_arch.lower()):
self._checkCase(
"torch_benchmark_sycl KernelSubmitGraphSingleQueue KernelBatchSize 32, KernelName Add, KernelsPerQueue 32",
"KernelSubmitGraphSingleQueue medium",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_sycl KernelSubmitGraphMultiQueue KernelsPerQueue 32 CPU count",
"KernelSubmitGraphMultiQueue medium, CPU count",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_sycl KernelSubmitGraphSingleQueue KernelBatchSize 32, KernelName Add, KernelsPerQueue 32",
"KernelSubmitGraphSingleQueue medium",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_sycl KernelSubmitGraphMultiQueue KernelsPerQueue 32 CPU count",
"KernelSubmitGraphMultiQueue medium, CPU count",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_sycl KernelSubmitGraphVllmMock AllocCount 32, GraphScenario 0",
"KernelSubmitGraphVllmMock small",
{"pytorch", "SYCL"},
)

def test_torch_syclpreview(self):
self._checkCase(
Expand Down Expand Up @@ -380,18 +388,21 @@ def test_torch_syclpreview(self):
"KernelSubmitMemoryReuse FloatMedium, CPU count",
{"pytorch", "SYCL"},
)
# FIXME: Graph benchmarks segfault on pvc
if not ("pvc" in self.device_arch.lower()):
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitGraphSingleQueue KernelBatchSize 64, KernelName Add, KernelsPerQueue 64",
"KernelSubmitGraphSingleQueue large",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitGraphMultiQueue KernelsPerQueue 10",
"KernelSubmitGraphMultiQueue small",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitGraphSingleQueue KernelBatchSize 64, KernelName Add, KernelsPerQueue 64",
"KernelSubmitGraphSingleQueue large",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitGraphMultiQueue KernelsPerQueue 10",
"KernelSubmitGraphMultiQueue small",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitGraphVllmMock AllocCount 128, GraphScenario 1 CPU count",
"KernelSubmitGraphVllmMock large, CPU count",
{"pytorch", "SYCL"},
)


if __name__ == "__main__":
Expand Down
Loading