-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbench.py
More file actions
88 lines (75 loc) · 2.58 KB
/
bench.py
File metadata and controls
88 lines (75 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Benchmark CodeGPT model performance.
Measures throughput, latency, and MFU.
"""
import os
import time
import torch
from contextlib import nullcontext
from model import CodeGPT, CodeGPTConfig
# ---- benchmark configuration (module-level so configurator can override) ----
batch_size = 8     # sequences per forward/backward step
block_size = 1024  # context length in tokens
n_layer = 12       # transformer depth
n_head = 12        # attention heads per layer
n_embd = 768       # embedding / hidden width
bias = False       # whether linear/layernorm layers carry bias terms
seed = 1337        # RNG seed for reproducible synthetic batches
# Device preference: CUDA, then Apple MPS, else CPU.
device = 'cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu')
# Prefer bf16 where the GPU supports it (no loss scaling needed), else fp16.
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
# NOTE(review): `compile` shadows the builtin of the same name; kept as-is
# because the configurator presumably overrides these globals by name.
compile = True if torch.cuda.is_available() else False  # torch.compile only on CUDA
num_warmup = 10  # untimed iterations before measurement
num_iters = 50   # timed iterations
# NOTE(review): configure() presumably rewrites the globals above from
# CLI args / a config file — configurator module not visible here; verify.
from configurator import configure
configure()
torch.manual_seed(seed)
# Allow TF32 on Ampere+ matmuls/convs for speed (benchmark tolerates the precision).
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
device_type = 'cuda' if 'cuda' in device else ('mps' if 'mps' in device else 'cpu')
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
# Autocast context for mixed precision; no-op on CPU.
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
# Instantiate the model from the config knobs above.
config = CodeGPTConfig(
    block_size=block_size,
    n_layer=n_layer,
    n_head=n_head,
    n_embd=n_embd,
    bias=bias,
    dropout=0.0,  # dropout off: benchmarking, not training
)
model = CodeGPT(config)
model.to(device)
if compile:
    print("Compiling model...")
    model = torch.compile(model)
# Synthetic batches: random token ids — contents don't affect timing.
shape = (batch_size, block_size)
x = torch.randint(0, config.vocab_size, shape, device=device)
y = torch.randint(0, config.vocab_size, shape, device=device)
# Warm-up: run full fwd/bwd steps so one-time costs (lazy init, autotune,
# torch.compile tracing) don't leak into the timed section.
print(f"Warming up ({num_warmup} iters)...")
for _step in range(num_warmup):
    with ctx:
        logits, loss = model(x, y)
    loss.backward()
    model.zero_grad(set_to_none=True)
if device_type == 'cuda':
    # Drain all queued kernels before measurement begins.
    torch.cuda.synchronize()
# Timed section: measure num_iters full forward/backward steps.
print(f"Benchmarking ({num_iters} iters)...")
times = []
for _ in range(num_iters):
    # perf_counter is monotonic and high-resolution; time.time() is wall-clock
    # and can step backwards (NTP), corrupting per-iteration deltas.
    t0 = time.perf_counter()
    with ctx:
        logits, loss = model(x, y)
    loss.backward()
    model.zero_grad(set_to_none=True)
    if device_type == 'cuda':
        # CUDA launches are asynchronous; wait for the step to actually finish
        # so t1 - t0 covers real GPU work, not just kernel submission.
        torch.cuda.synchronize()
    t1 = time.perf_counter()
    times.append(t1 - t0)
avg_time = sum(times) / len(times)
tokens_per_sec = batch_size * block_size / avg_time
# estimate_mfu may be missing (e.g. a wrapper without attribute passthrough);
# fall back to 0 rather than crash the report.
mfu = model.estimate_mfu(batch_size, avg_time) if hasattr(model, 'estimate_mfu') else 0
print(f"\nResults:")
print(f"  Model: {config.n_layer}L/{config.n_head}H/{config.n_embd}E ({model.get_num_params()/1e6:.1f}M params)")
print(f"  Batch size: {batch_size}, Block size: {block_size}")
print(f"  Device: {device}, Dtype: {dtype}, Compile: {compile}")
print(f"  Avg iter time: {avg_time*1000:.2f} ms")
print(f"  Throughput: {tokens_per_sec:,.0f} tokens/sec")
print(f"  MFU: {mfu*100:.2f}%")