PyTorch-CUDA-Graph-Capture-Examples/common.py at main · leimao/PyTorch-CUDA-Graph-Capture-Examples · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
"""
Common utilities for CUDA graph examples.

This module contains shared model definitions and training functions used
across different CUDA graph demonstration scripts.
"""

import os
import torch
import torch.nn as nn
from torch.profiler import profile, ProfilerActivity, schedule, record_function


class MLPBlock(nn.Module):
    """Fully connected layer followed by a ReLU activation and dropout.

    Args:
        in_features: Input feature size passed to ``nn.Linear``.
        out_features: Output feature size passed to ``nn.Linear``.
        dropout_p: Probability passed to ``nn.Dropout`` (default ``0.1``).
    """

    def __init__(self, in_features, out_features, dropout_p=0.1):
        super().__init__()
        # Submodules are registered in the same order they are applied.
        self.linear = nn.Linear(in_features, out_features)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, x):
        """Return ``dropout(relu(linear(x)))``."""
        activated = self.relu(self.linear(x))
        return self.dropout(activated)


class MLPModel(nn.Module):
    """Three chained ``MLPBlock`` stages followed by a final ``nn.Linear``.

    Args:
        input_dim: Feature size expected by the first block.
        hidden_dim1: Output size of the first block.
        hidden_dim2: Output size of the second block.
        hidden_dim3: Output size of the third block.
        output_dim: Output size of the final linear layer.
        dropout_p1: Dropout probability of the first block.
        dropout_p2: Dropout probability of the second block.
        dropout_p3: Dropout probability of the third block.
    """

    def __init__(
        self,
        input_dim,
        hidden_dim1,
        hidden_dim2,
        hidden_dim3,
        output_dim,
        dropout_p1=0.2,
        dropout_p2=0.1,
        dropout_p3=0.1,
    ):
        super().__init__()
        # Each block's output width is the next block's input width.
        self.block1 = MLPBlock(input_dim, hidden_dim1, dropout_p=dropout_p1)
        self.block2 = MLPBlock(hidden_dim1, hidden_dim2, dropout_p=dropout_p2)
        self.block3 = MLPBlock(hidden_dim2, hidden_dim3, dropout_p=dropout_p3)
        self.output = nn.Linear(hidden_dim3, output_dim)

    def forward(self, x):
        """Pass ``x`` through block1, block2, block3 and the output layer."""
        for stage in (self.block1, self.block2, self.block3, self.output):
            x = stage(x)
        return x


def train_without_cuda_graph(model,
                             loss_fn,
                             optimizer,
                             inputs,
                             targets,
                             profiler=None):
    """Train without using CUDA graph (standard PyTorch training).

    Every iteration runs zero_grad, forward, loss, backward and optimizer
    step, each wrapped in its own ``record_function`` range so the phases
    show up as labelled regions in a profiler trace.

    Args:
        model: Callable invoked as ``model(data)`` to produce predictions.
        loss_fn: Callable invoked as ``loss_fn(y_pred, target)``; its result
            must provide ``backward()``.
        optimizer: Object providing ``zero_grad()`` and ``step()``.
        inputs: Iterable of input batches. Any iterable works, including
            generators that do not support ``len()``.
        targets: Iterable of target batches. It is paired with ``inputs``
            through ``zip``, so iteration stops at the shorter of the two.
        profiler: Optional object whose ``step()`` is called once at the end
            of every iteration.

    Returns:
        None. Progress is reported on stdout.
    """
    print("Training WITHOUT CUDA graph...")

    # Count the iterations that really ran: zip() truncates to the shorter
    # iterable and ``inputs`` may not support len(), so len(inputs) is not a
    # reliable figure for the completion message.
    completed = 0
    for data, target in zip(inputs, targets):
        with record_function("## optimizer.zero_grad ##"):
            optimizer.zero_grad()

        with record_function("## forward_pass ##"):
            y_pred = model(data)

        with record_function("## loss_computation ##"):
            loss = loss_fn(y_pred, target)

        with record_function("## backward_pass ##"):
            loss.backward()

        with record_function("## optimizer.step ##"):
            optimizer.step()

        if profiler is not None:
            profiler.step()

        # NOTE: The loss is deliberately not printed inside the loop. Calling
        # .item() triggers a device-to-host memory copy and CPU-GPU
        # synchronization, which damages performance.
        completed += 1

    print(f"  Completed {completed} iterations.")
    print()


def setup_model_and_data(device, num_iterations=10):
    """Setup model configuration and generate training data.

    Prints the fixed model configuration and creates random input and target
    batches with ``torch.randn`` on ``device``.

    Args:
        device: Device passed to ``torch.randn`` for every generated batch.
        num_iterations: Number of (input, target) batches to generate.
            Defaults to 10.

    Returns:
        A tuple ``(config, real_inputs, real_targets)``. ``config`` is a
        dict with keys ``'N'``, ``'D_in'``, ``'H1'``, ``'H2'``, ``'H3'`` and
        ``'D_out'``. ``real_inputs`` is a list of ``num_iterations`` tensors
        of shape ``(N, D_in)``. ``real_targets`` is a list of
        ``num_iterations`` tensors of shape ``(N, D_out)``.
    """
    # Model setup
    N, D_in, H1, H2, H3, D_out = 640, 4096, 2048, 1024, 512, 256
    print("Model configuration:")
    print(f"  Batch size: {N}")
    print(f"  Input dim: {D_in}")
    print(f"  Hidden dims: {H1} -> {H2} -> {H3}")
    print(f"  Output dim: {D_out}")
    print()

    # Generate training data. All inputs are drawn before any target so the
    # sequence of random draws does not depend on how the lists are consumed.
    real_inputs = [
        torch.randn(N, D_in, device=device) for _ in range(num_iterations)
    ]
    real_targets = [
        torch.randn(N, D_out, device=device) for _ in range(num_iterations)
    ]

    config = {
        'N': N,
        'D_in': D_in,
        'H1': H1,
        'H2': H2,
        'H3': H3,
        'D_out': D_out,
    }

    return config, real_inputs, real_targets


def create_model(config, device):
    """Build an ``MLPModel`` from ``config`` and move it to ``device``.

    Args:
        config: Mapping with the keys ``'D_in'``, ``'H1'``, ``'H2'``,
            ``'H3'`` and ``'D_out'`` giving the layer sizes.
        device: Target passed to ``.to()`` on the new model.

    Returns:
        The result of calling ``.to(device)`` on the new ``MLPModel``.
    """
    layer_sizes = {
        "input_dim": config['D_in'],
        "hidden_dim1": config['H1'],
        "hidden_dim2": config['H2'],
        "hidden_dim3": config['H3'],
        "output_dim": config['D_out'],
    }
    # Dropout probabilities are fixed here rather than read from config.
    dropout_rates = {"dropout_p1": 0.2, "dropout_p2": 0.1, "dropout_p3": 0.1}
    network = MLPModel(**layer_sizes, **dropout_rates)
    return network.to(device)


def create_profiler():
    """Return a ``torch.profiler.profile`` with this module's standard setup.

    The profiler is configured for CPU and CUDA activities, uses a
    ``schedule(wait=1, warmup=2, active=7, repeat=1)`` step schedule, and
    enables shape recording, memory profiling and stack capture. It is
    returned unstarted.
    """
    traced_activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
    step_schedule = schedule(wait=1, warmup=2, active=7, repeat=1)
    return profile(
        activities=traced_activities,
        schedule=step_schedule,
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
    )


def save_and_print_profile(prof, trace_file, scenario_name):
    """Save profiling trace and print summary.

    Args:
        prof: Profiler object providing ``export_chrome_trace(path)`` and
            ``key_averages().table(sort_by=..., row_limit=...)``.
        trace_file: Destination path of the Chrome trace. Its parent
            directory is created when the path contains one.
        scenario_name: Label inserted into the printed summary heading.

    Returns:
        None. The trace path and the summary table are printed to stdout.
    """
    # exist_ok=True replaces the former exists()-then-makedirs() sequence,
    # which could raise FileExistsError if another process created the
    # directory between the check and the call.
    trace_dir = os.path.dirname(trace_file)
    if trace_dir:
        os.makedirs(trace_dir, exist_ok=True)

    prof.export_chrome_trace(trace_file)
    print(f"Profiling trace saved to: {trace_file}")
    print()

    print(f"Top 10 operations by CUDA time ({scenario_name}):")
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
    print()