CROSS_Tutorial/tpu.html at main · EfficientPPML/CROSS_Tutorial · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
<!DOCTYPE html>
<html lang="en" data-theme="dark">

<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>TPU</title>
  <meta name="description" content="Guide to setting up Google Cloud TPU for Privacy-Preserving AI.">

  <!-- Google Fonts: Inter & Outfit for a more modern look -->
  <link rel="preconnect" href="https://fonts.googleapis.com">
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  <link
    href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Outfit:wght@400;500;700;800&display=swap"
    rel="stylesheet">

  <link rel="stylesheet" href="styles.css">
</head>

<body>

  <div class="app-layout">
    <!-- Sidebar Navigation -->
    <aside class="sidebar">
      <div class="sidebar-header">
        <div class="logo">CPA TUTORIAL</div>
        <button id="mobile-close-btn" class="icon-btn" aria-label="Close Menu">
          <svg width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
            <path d="M18 6L6 18M6 6l12 12" />
          </svg>
        </button>
      </div>

      <nav class="sidebar-nav">
        <!-- New Sections -->
        <div class="nav-section">
          <h3 class="nav-title">tutorial</h3>
          <ul class="nav-links">
            <li><a href="index.html">ASPLOS'26@Pittsburgh</a></li>
          </ul>
        </div>

        <div class="nav-section">
          <h3 class="nav-title">docs</h3>
          <ul class="nav-links">
            <li><a href="beginner.html">Easy HE Background</a></li>
            <li><a href="cross.html">CROSS</a></li>
            <li><a href="tpu.html" class="active">TPU Novice to Master</a></li>
            <li><a href="ntt.html">NTT Algorithms</a></li>
            <li><a href="open_challenge.html">Open Challenge</a></li>
          </ul>
        </div>
      </nav>
    </aside>

    <!-- Main Content Area -->
    <main class="main-content">
      <header class="top-bar">
        <button id="mobile-menu-btn" class="icon-btn" aria-label="Open Menu">
          <svg width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
            <path d="M3 12h18M3 6h18M3 18h18" />
          </svg>
        </button>

        <div class="header-title">TPU</div>

        <div class="top-actions">
          <button id="theme-toggle" class="icon-btn" aria-label="Toggle Theme">
            <!-- Sun Icon -->
            <svg class="sun-icon" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor"
              stroke-width="2">
              <circle cx="12" cy="12" r="5" />
              <path
                d="M12 1v2M12 21v2M4.22 4.22l1.42 1.42M18.36 18.36l1.42 1.42M1 12h2M21 12h2M4.22 19.78l1.42-1.42M18.36 5.64l1.42-1.42" />
            </svg>
            <!-- Moon Icon -->
            <svg class="moon-icon" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor"
              stroke-width="2">
              <path d="M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21 12.79z" />
            </svg>
          </button>
        </div>
      </header>

      <div class="content-wrapper">
        <article class="doc-content">

          <section id="tpu-setup">
            <h2>Chapter 1: TPU Setup</h2>

            <ul>
              <li>
                <strong>Step 1: Create a Google Project</strong> at <a
                  href="https://cloud.google.com/appengine/docs/standard/nodejs/building-app/creating-project">tutorial</a>.
                <br>
                Obtain the name of the project as <code>&lt;google_project_name&gt;</code> and <strong>Google Project
                  ID</strong>
                from the created project.
              </li>

              <li>
                <strong>Step 2: Apply for the free-tier TPU trial for 30 days</strong> at <a
                  href="https://sites.research.google/trc/about/">TRC</a>.
                <p>Once you have submitted the request, an email will be sent to you within one day containing a link
                  to a survey that asks for your <strong>Google project ID</strong>.</p>
              </li>

              <li>
                <strong>Step 3: Launch TPU VM.</strong>
                <p>You can create a TPU VM through the GUI or with the gcloud CLI (on your local machine). The gcloud
                  CLI commands are given below because they work for all generations (&gt;= v4) of TPUs.</p>

                <!-- Placeholder for Image mentioned in previous content -->
                <!-- [Image of Google Cloud TPU architecture and VM connection workflow] -->

                <p><strong>For TPUv4,</strong></p>
                <pre><code>gcloud config set project &lt;google_project_name&gt;
gcloud config set compute/zone us-central2-b
gcloud alpha compute tpus queued-resources create &lt;google_project_name&gt; --node-id=&lt;your_favoriate_node_name&gt; \
    --zone=us-central2-b \
    --accelerator-type=v4-8 \
    --runtime-version=v2-alpha-tpuv4</code></pre>

                <p><strong>For TPUv5e,</strong></p>
                <pre><code>gcloud config set project &lt;google_project_name&gt;
gcloud config set compute/zone us-west1-c
gcloud alpha compute tpus queued-resources create &lt;google_project_name&gt; --node-id=&lt;your_favoriate_node_name&gt; \
    --zone=us-west1-c \
    --accelerator-type=v5litepod-4 \
    --runtime-version=v2-alpha-tpuv5-lite \
    --provisioning-model=spot</code></pre>

                <p><strong>For TPUv5p,</strong></p>
                <pre><code>gcloud config set project &lt;google_project_name&gt;
gcloud config set compute/zone us-east5-a
gcloud alpha compute tpus queued-resources create &lt;google_project_name&gt; --node-id=&lt;your_favoriate_node_name&gt; \
    --zone=us-east5-a \
    --accelerator-type=v5p-8 \
    --runtime-version=v2-alpha-tpuv5 \
    --provisioning-model=spot</code></pre>

                <p><strong>For TPUv6e,</strong></p>
                <pre><code>gcloud config set project &lt;google_project_name&gt;
gcloud config set compute/zone us-east1-d
gcloud alpha compute tpus queued-resources create &lt;google_project_name&gt; --node-id=&lt;your_favoriate_node_name&gt; \
    --zone=us-east1-d \
    --accelerator-type=v6e-1 \
    --runtime-version=v2-alpha-tpuv6e \
    --provisioning-model=spot</code></pre>

                <p><em>Note:</em> TPUv5e, TPUv5p and TPUv6e might be easier to get with <code>provisioning-model</code>
                  set to spot or flex-start (with a specific time range) because they are popular resources; be aware
                  that Google Cloud can preempt such VMs if higher-priority tasks require these resources. In
                  contrast, you could get a long-term active TPUv4 VM, as TPUv4 is in lower demand from other tasks.
                </p>
              </li>

              <li>
                <strong>Step 4: Setup Remote SSH (VSCode or Cursor) to TPU VM</strong>
                <p>Once the requested TPU VM is up and running as shown in the Google Cloud console, you can use gcloud
                  to forward the SSH port of the remote machine to a port on your local machine and set up VSCode
                  Remote SSH.</p>
                <p>You first need to set up a local SSH key for Google Compute Engine by following <a
                    href="https://cloud.google.com/compute/docs/connect/create-ssh-keys#gcloud">this guide</a>. After
                  you follow the instructions on the page, the SSH key will be stored at
                  <code>&lt;path_to_local_user&gt;/.ssh/google_compute_engine</code>.
                </p>

                <pre><code>gcloud compute tpus tpu-vm ssh &lt;gcloud_user_name&gt;@&lt;your_favoriate_node_name&gt; -- -L 9009:localhost:22</code></pre>
                <p>Where <code>9009</code> is the port of local machine, while <code>22</code> is the SSH port of the
                  TPU vm.</p>

                <p>After you set it up, you can configure VSCode with the <a
                    href="https://code.visualstudio.com/docs/remote/ssh">Remote - SSH extension</a> to remotely access
                  the TPU VM, with an SSH config entry like the one below.</p>

                <pre><code>Host tpu-vm
  User &lt;gcloud_user_name&gt;
  HostName localhost
  Port 9009
  IdentityFile &lt;path_to_local_user&gt;/.ssh/google_compute_engine</code></pre>

                <p>After this, you should follow the steps on <a
                    href="https://code.visualstudio.com/docs/remote/ssh">link</a> to
                  log into TPU VM.</p>
              </li>
            </ul>
          </section>

          <section id="env-setup">
            <h2>Chapter 2: Environment Setup</h2>
            <p>Inside the TPU VM, complete the following setup to configure the environment.</p>

            <ul>
              <li>
                <strong>Step 1: install miniconda</strong>
                <pre><code>wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
chmod +x ./Miniconda3-latest-Linux-x86_64.sh
./Miniconda3-latest-Linux-x86_64.sh
# follow instructions and set up launch into .bashrc</code></pre>
              </li>

              <li>
                <strong>Step 2: create environment and install required packages</strong>
                <pre><code>git clone https://github.com/EfficientPPML/CROSS
source ~/.bashrc
conda create --name jaxite python=3.13
conda activate jaxite
pip install -U "jax[tpu]"
pip install xprof
pip install absl-py
pip install pandas
pip install gmpy2
# Try to run a simple test
python script.py </code></pre>
                <p>where <code>script.py</code> is the script you want to run in the jaxite_word folder.</p>
              </li>
            </ul>
          </section>

          <section id="tpu-architecture">
            <h2>Chapter 3: TPU Architecture</h2>

            <h3>Part 1: TPU micro-architecture (using TPUv4 as an example)</h3>
            <img src="fig/tpu_tip_image2.png" alt="TPU v4 Architecture Diagram">

            <p>AI accelerators such as TPU feature abundant memory and compute which show great potential for HE
              acceleration.</p>

            <ul>
              <li><strong>Large Compute Array (Parallelism):</strong> Each MXU in AI accelerators is 32x larger (e.g.,
                128x128 in TPUv4) than those in GPUs (typically 4x4x4 matrix multiplication). The larger size of MXUs
                increases on-chip data reuse within the two-dimensional computation arrays, enhancing throughput per
                watt. Moreover, a sea of 2048 SIMD ALUs shares the same VRegs with the MXUs, further increasing data
                reuse.</li>
              <li><strong>Large On-chip Memory:</strong> AI accelerators feature large on-chip memory, e.g. a single
                Google TPU v4 chip has 160 MB of on-chip memory, including 128 MB CMEM and 32 MB VMEM array, which is
                20x/4x larger than the AMD MI100 / NVIDIA A100. This substantial on-chip capacity can accommodate entire
                ciphertexts, avoiding repeated accesses to off-chip memory and alleviating the memory bottleneck
                inherent in HE workloads for better performance and efficiency.</li>
              <li><strong>On-chip Data Management Units:</strong> The TPU has a specialized Cross Lane Unit (XLU), which
                can (1) transpose data sitting in on-chip VMEM, (2) shuffle data among VMEMs, and (3) accumulate partial
                results from VMEMs in different lanes into final results.</li>
            </ul>

            <p>Some of the TPU's computational functionality, and how it is useful for ML, is introduced in this online
              book: <a href="https://jax-ml.github.io/scaling-book/">https://jax-ml.github.io/scaling-book/</a></p>

            <img src="fig/tpu_tip_image1.png" alt="TPU Components Diagram">

            <p>From a programmer's perspective, a TPU chip mainly consists of three powerful components working
              together:</p>

            <ol>
              <li><strong>MXU (Matrix Multiplication Unit):</strong>
                <ul>
                  <li><strong>What it does:</strong> Performs massive-scale matrix multiplications such as 8x128x128
                    matrix multiplication.</li>
                  <li>This is the most powerful part of the TPU. It’s like a giant calculator dedicated to matrix math.
                    A single MXU can be 32 times larger than those in a typical GPU, which serves as the key
                    computational power of TPUs.</li>
                </ul>
              </li>
              <li><strong>VPU (Vector Processing Unit):</strong>
                <ul>
                  <li><strong>What it does:</strong> Handles vector operations (like additions, subtractions, and other
                    general math) that the MXU doesn't handle.</li>
                  <li>It handles the simpler, non-matrix math tasks.</li>
                </ul>
              </li>
              <li><strong>VRegs (Vectorized Registers):</strong>
                <ul>
                  <li><strong>What they are:</strong> A set of coarse-grained on-chip memory locations (4 KB each)
                    shared by both the MXU and VPU.</li>
                  <li>This is the temporary 'workbench' for the MXU and VPU. Data must be loaded into VRegs before any
                    computation can begin. The most efficient way to use the TPU is to keep all the data here to avoid
                    slow access to off-chip memory. The VRegs organize data in chunks of (8, 128) 32-bit values.</li>
                </ul>
              </li>
            </ol>

            <p><strong>In short</strong>, a TPU can be viewed simply as a big "Matrix Multiplication Unit (MXU)" and a
              big "Vector Processing Unit (VPU)" working on the same set of (8, 128) 32-bit registers (VRegs, 4 KB
              each).</p>

            <p><strong>The Main Goal:</strong> The key to fast TPU code is to <strong>fully utilize the MXU and
                VPU</strong> by <strong>keeping data in the VRegs</strong> and minimizing slow data movement between
              memory locations.</p>

            <h3>Part 2: Tips for Writing Fast TPU Code</h3>
            <p>These tips focus on how to structure your code and data to get the most out of the TPU's architecture.
            </p>

            <table>
              <thead>
                <tr>
                  <th>Tip</th>
                  <th>What to Do</th>
                  <th>Why It Works</th>
                </tr>
              </thead>
              <tbody>
                <tr>
                  <td>1. Fully Utilize the VRegs</td>
                  <td>Make sure at least two dimensions of your tensor shapes are a multiple of 8 and 128.</td>
                  <td>The VRegs are optimized for data chunks of size (8, 128). If your data size isn't a multiple of
                    these numbers, the TPU has to waste resources (VReg space and instructions) to process only a
                    partial chunk. Matching the (8, 128) granularity ensures your data is perfectly aligned and can be
                    processed in the fewest number of instructions. A simple example: for an operation like Vectorized
                    Multiplication, if you use an unoptimized tensor shape of (4, 256), the data must be split,
                    requiring two VRegs and resulting in only 50% utilization of the VReg slots and requiring two
                    instructions to complete. By contrast, if you optimize your data shape to (8, 128), it perfectly
                    aligns with the VReg's internal block size, requiring only 1 VReg to hold all the data and allowing
                    the operation to complete in a single instruction, significantly boosting efficiency.</td>
                </tr>
                <tr>
                  <td>2. Minimize Layout Transformations</td>
                  <td>Avoid adding explicit <code>reshape</code> or layout conversion instructions in your code.</td>
                  <td>A layout conversion is when the TPU has to internally move or reorganize data inside its on-chip
                    memory. This is non-computational work (it doesn't do any math) and adds extra explicit latency.
                  </td>
                </tr>
                <tr>
                  <td>3. Offload VPU Work to the MXU</td>
                  <td>Try to reformulate any vector-based math (VPU work) as low-precision dense matrix multiplication
                    (MXU work).</td>
                  <td>The MXU is designed for matrix math and has a much higher throughput than the VPU. If you can
                    creatively turn a VPU task into an MXU task (often by using low-precision data types), you switch
                    the workload from the "Generalist" to the "Heavy Lifter," resulting in a massive speed boost.</td>
                </tr>
                <tr>
                  <td>4. Shuffle Data at a Coarse Granularity</td>
                  <td>If you <strong>must</strong> shuffle or reorder data, do it at the large (8, 128) VReg chunk
                    level.</td>
                  <td>Shuffling is a type of layout transformation. By moving data at the largest possible (8, 128)
                    granularity, you minimize the total number of instructions the TPU needs to execute for the
                    rearrangement, reducing the overall overhead.</td>
                </tr>
                <tr>
                  <td>5. Hide Off-chip Memory Access Latency</td>
                  <td>Choose the granularity of data to be loaded such that the off-chip access latency can be
                    hidden by the computation.</td>
                  <td>Off-chip memory access latency could be hidden by carefully selecting a data-loading granularity
                    that ensures the memory access time is less than the computation time. This optimization aims at
                    reducing or eliminating the explicit off-chip data accessing latency among consecutive operations to
                    maximize average compute utilization.</td>
                </tr>
              </tbody>
            </table>

            <h3>Part 3: Useful Resources for TPU programming</h3>
            <ul>
              <li><strong>Multi-Chip Programming with JAX:</strong> Learn how to run a job on multiple TPU chips (less
                than 8 on a single host VM) by specifying the topology and sharding strategy: <a
                  href="https://docs.jax.dev/en/latest/jax.sharding.html">JAX Sharding Documentation</a>.</li>
              <li><strong>Custom Kernel Development with Pallas:</strong> TPU programming can be done using
                <strong>JAX</strong> (similar to NumPy) for high-level operations, or <strong>Pallas</strong> (Google’s
                equivalent of Triton) to create customized kernels with explicit control over data loading and
                computation granularity: <a href="https://docs.jax.dev/en/latest/pallas/index.html">Pallas
                  Documentation</a>.
              </li>
            </ul>
          </section>

          <!-- Citation Section -->
          <section id="citation" class="citation-section">
            <h2>Citation</h2>
            <p>If you find this tutorial helpful, feel free to:</p>
            <ul>
              <li>Star CROSS repo at <a
                  href="https://github.com/EfficientPPML/CROSS">https://github.com/EfficientPPML/CROSS</a></li>
              <li>Cite our paper with biblatex below:</li>
            </ul>
            <pre><code>@inproceedings{tong2025CROSS,
author = {Jianming Tong and Tianhao Huang and Jingtian Dang and Leo de Castro and Anirudh Itagi and Anupam
Golder and Asra Ali and Jevin Jiang and Jeremy Kun and Arvind and G. Edward Suh and Tushar Krishna},
title = {Leveraging ASIC AI Chips for Homomorphic Encryption},
year = {2026},
booktitle = {2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)},
address = {Australia},
keywords = {AI ASICs, TPU, Fully Homomorphic Encryption},
location = {Australia},
series = {HPCA'26} }</code></pre>
          </section>

        </article>

        <aside class="on-this-page">
          <h4>On this page</h4>
          <ul>
            <li><a href="#tpu-setup">TPU Setup</a></li>
            <li><a href="#env-setup">Environment Setup</a></li>
            <li><a href="#tpu-architecture">TPU Architecture</a></li>
            <li><a href="#citation">Citation</a></li>
          </ul>
        </aside>
      </div>

      <footer class="site-footer">
        <p>&copy; 2026 Cryptography Primitives Acceleration Tutorial.</p>
      </footer>
    </main>
  </div>

  <script src="script.js"></script>
</body>

</html>