pulp-platform · viv-eth · Jun 17, 2025 · May 20, 2026
@@ -0,0 +1,25 @@
+# SoCDAML Part III - Student skeletons for `iLeakyReLU`
+
+These files are your starting points for the Part III lab. Each one
+contains the surrounding boilerplate; the conceptually interesting
+parts are marked with `TODO(student)` comments and short hints.
+
+| File | What's in it | What to do |
+|------|--------------|------------|
+| `generate.py` | Complete ONNX + golden-value generator | Run it (Step 1) |
+| `iLeakyReLU.h` | Complete kernel header | Copy to `TargetLibraries/PULPOpen/inc/kernel/` (Step 3) |
+| `iLeakyReLU.c` | Multi-core chunking provided; inner loop TODO | Fill the TODO, copy to `TargetLibraries/PULPOpen/src/` (Step 3) |
+| `iLeakyReLU_simd.c` | SIMD chunking + vector load provided; the shift and packed-max/store lines are TODO | Fill in at Step 6b, once the scalar kernel works |
+| `iLeakyReLUParser.py` | `parseNode` and `parseNodeCtxt` are TODO | Fill in, paste class into `Deeploy/Targets/Generic/Parsers.py` (Step 2) |
+| `iLeakyReLUTemplate.py` | Mako template body is TODO | Fill in, copy to `Deeploy/Targets/PULPOpen/Templates/` (Step 4) |
+| `iLeakyReLUTileConstraint.py` | Inherits `UnaryTileConstraint`; performance constraint TODO | Fill in (Step 5 + Step 6a), copy to `Deeploy/Targets/PULPOpen/TileConstraints/` |
+
+The **Binding** (in `Bindings.py`), **Mapper + PULPMapping entry** (in `Platform.py`), the **`TilingReadyNodeBindings` registration** (in `Tiler.py`), and the **aggregator include** in `DeeployPULPMath.h` are *not* shipped as paste-in snippets — you'll write them yourself with the markdown's guidance and `<details>` solutions.
+
+The companion `Deeploy/Tutorials/SoCDAML.md` (Part III) walks through
+the six steps in order and includes collapsed solutions to peek at
+when you're stuck.
+
+If you really need the answer key, look in
+`Deeploy/Tutorials/PartIII_solution/iLeakyReLU/` -- but try the lab
+first; you'll learn far more.
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+# ----------------------------------------------------------------------
+# File: generate.py  (SoCDAML Part III - Step 1, provided complete)
+#
+# Builds the single-node ONNX graph + golden tensors that DeeployTest's
+# harness will use to validate your iLeakyReLU implementation.
+#
+# Run from this directory:
+#     python generate.py
+#
+# Outputs:
+#     network.onnx, inputs.npz, outputs.npz
+#
+# Quantization-friendly LeakyReLU formula used here:
+#     out[i] = x[i]                   if x[i] >= 0
+#              (mul * x[i]) >> shift  otherwise
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+import onnx
+from onnx import TensorProto, helper
+
+# Tensor shape used for both ONNX value infos and for the random input draw.
+SHAPE = (1, 16, 64, 64)
+# `mul` / `shift` attributes attached to the iLeakyReLU node and passed to golden().
+MUL   = 1
+SHIFT = 3
+# Seed handed to np.random.default_rng() so the generated tensors are reproducible.
+SEED  = 0xC0FFEE
+
+def golden(x, mul, shift):
+    pos = x.astype(np.int32)
+    neg = (mul * pos) >> shift
+    out = np.where(pos >= 0, pos, neg)
+    return np.clip(out, -128, 127).astype(np.int8)
+
+def build_onnx():
+    in_value  = helper.make_tensor_value_info('data_in',  TensorProto.INT8, SHAPE)
+    out_value = helper.make_tensor_value_info('data_out', TensorProto.INT8, SHAPE)
+    node  = helper.make_node('iLeakyReLU', ['data_in'], ['data_out'],
+                             name='iLeakyReLU_0', mul=MUL, shift=SHIFT)
+    graph = helper.make_graph([node], 'iLeakyReLU_single_node',
+                              [in_value], [out_value])
+    model = helper.make_model(graph, producer_name='SoCDAML-PartIII')
+    model.opset_import[0].version = 13
+    model.ir_version = 7
+    return model
+
+def main():
+    rng = np.random.default_rng(SEED)
+    x   = rng.integers(low=-128, high=127, size=SHAPE, dtype=np.int8)
+    y   = golden(x, MUL, SHIFT)
+    onnx.save(build_onnx(), 'network.onnx')
+    np.savez('inputs.npz',  data_in=x)
+    np.savez('outputs.npz', data_out=y)
+    print(f"OK: network.onnx, inputs.npz, outputs.npz "
+          f"(shape={SHAPE}, mul={MUL}, shift={SHIFT})")
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,33 @@
+/* =====================================================================
+ * Title:        iLeakyReLU.c  (SoCDAML Part III - Step 3 skeleton)
+ *
+ * Plain-C int8 LeakyReLU. The per-core chunking boilerplate is provided.
+ * Fill in the inner loop body marked `TODO(student)`.
+ *
+ *   Goal: out[i] = (in[i] >= 0) ? in[i] : ((mul * in[i]) >> shift)
+ *
+ * Hints:
+ *   - Cast in[i] to int32_t before the multiply to avoid 8-bit overflow.
+ *   - Cast the final result back to int8_t before storing.
+ *
+ * Drop into: TargetLibraries/PULPOpen/src/iLeakyReLU.c
+ * ===================================================================== */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+#include "DeeployPULPMath.h"
+#include "pmsis.h"
+
+/**
+ * Int8 LeakyReLU kernel skeleton: each core walks one contiguous chunk.
+ *
+ * The index range [0, size) is cut into NUM_CORES chunks of
+ * ceil(size / NUM_CORES) elements; the calling core (pi_core_id()) visits
+ * only its own chunk, clamped to `size`. Until the TODO below is filled
+ * in, every visited output element is simply set to 0.
+ *
+ * @param pIn   input buffer (not read by the placeholder body)
+ * @param pOut  output buffer, written at the indices of this core's chunk
+ * @param size  upper bound (exclusive) of the element indices processed
+ * @param mul   multiplier for negative inputs (unused by the placeholder)
+ * @param shift right shift for negative inputs (unused by the placeholder)
+ */
+void PULPiLeakyReLU_i8_i8(int8_t *pIn, int8_t *pOut, uint32_t size,
+                          int32_t mul, int32_t shift) {
+  const uint32_t coreId   = pi_core_id();
+  const uint32_t numCores = NUM_CORES;
+  const uint32_t chunkLen = (size + numCores - 1) / numCores;
+  const uint32_t first    = coreId * chunkLen;
+
+  // Clamp the chunk end so the last core(s) never run past the buffer.
+  uint32_t last = first + chunkLen;
+  if (last > size) {
+    last = size;
+  }
+
+  for (uint32_t idx = first; idx < last; idx++) {
+    // TODO(student): compute pOut[idx] from pIn[idx], mul, shift.
+    // Replace the following line:
+    pOut[idx] = 0;
+  }
+}
@@ -0,0 +1,18 @@
+/* =====================================================================
+ * Title:        iLeakyReLU.h  (SoCDAML Part III - Step 3, provided)
+ *
+ * Header for the iLeakyReLU PULP kernel.
+ * Drop into: TargetLibraries/PULPOpen/inc/kernel/iLeakyReLU.h
+ * and add `#include "kernel/iLeakyReLU.h"` to DeeployPULPMath.h.
+ * ===================================================================== */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+#ifndef __DEEPLOY_KERNEL_ILEAKYRELU_H_
+#define __DEEPLOY_KERNEL_ILEAKYRELU_H_
+
+#include "DeeployPULPMath.h"
+
+/**
+ * Int8 LeakyReLU kernel entry point (defined in iLeakyReLU.c).
+ *
+ * Intended semantics, per the kernel source header:
+ *   out[i] = (in[i] >= 0) ? in[i] : ((mul * in[i]) >> shift)
+ *
+ * @param pIn   input int8 buffer
+ * @param pOut  output int8 buffer
+ * @param size  number of elements to process
+ * @param mul   multiplier applied to negative inputs
+ * @param shift right shift applied to the multiplied negative inputs
+ */
+void PULPiLeakyReLU_i8_i8(int8_t *pIn, int8_t *pOut, uint32_t size,
+                          int32_t mul, int32_t shift);
+
+#endif // __DEEPLOY_KERNEL_ILEAKYRELU_H_
@@ -0,0 +1,36 @@
+# ----------------------------------------------------------------------
+# File: iLeakyReLUParser.py  (SoCDAML Part III - Step 2 skeleton)
+#
+# Paste this class into:
+#   Deeploy/Targets/Generic/Parsers.py
+#
+# Everything this class needs is already imported in that file:
+# math, numpy as np, onnx_graphsurgeon as gs, NodeParser, NetworkContext.
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+
+
+class iLeakyReLUParser(NodeParser):
+    """Skeleton parser for `iLeakyReLU` nodes.
+
+    Both parse hooks are still stubs that report failure; the TODO comments
+    describe what each one has to do.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def parseNode(self, node: gs.Node) -> bool:
+        """Stub: currently rejects every node by returning False."""
+        # TODO(student): return False if the node doesn't have exactly
+        # one input, exactly one output, and both 'mul' and 'shift'
+        # attributes. On success, store them into
+        # self.operatorRepresentation as ints and return True.
+        return False
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True):
+        """Stub: currently returns ``(ctxt, False)`` without touching ``ctxt``."""
+        # TODO(student): look up the input and output tensors from ctxt
+        # using node.inputs[0].name / node.outputs[0].name, and populate
+        # self.operatorRepresentation with:
+        #     'data_in'  -> input tensor name
+        #     'data_out' -> output tensor name
+        #     'size'     -> int(np.prod(input_shape))
+        # Return (ctxt, True).
+        return ctxt, False
@@ -0,0 +1,28 @@
+# ----------------------------------------------------------------------
+# File: iLeakyReLUTemplate.py  (SoCDAML Part III - Step 4 skeleton)
+#
+# Drop this file into:
+#   Deeploy/Targets/PULPOpen/Templates/iLeakyReLUTemplate.py
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+
+from Deeploy.DeeployTypes import NodeTemplate
+
+
+class _iLeakyReLUTemplate(NodeTemplate):
+    """NodeTemplate subclass that forwards its template string unchanged to the base class."""
+
+    def __init__(self, templateStr):
+        super().__init__(templateStr)
+
+
+# TODO(student): fill in the Mako template body so it emits a single
+# call to your C kernel:
+#
+#     PULPiLeakyReLU_i8_i8(<data_in>, <data_out>, <size>, <mul>, <shift>);
+#
+# All five `${...}` substitutions correspond to keys you populated in
+# the parser (or that Deeploy fills automatically for tensor names).
+# So far the template body only emits two C line comments; the kernel
+# call itself is still missing.
+referenceTemplate = _iLeakyReLUTemplate("""
+// iLeakyReLU (Name: ${nodeName}, Op: ${nodeOp})
+// TODO(student): emit the kernel call here.
+""")
@@ -0,0 +1,33 @@
+# ----------------------------------------------------------------------
+# File: iLeakyReLUTileConstraint.py  (SoCDAML Part III - Step 5+6a skeleton)
+#
+# Drop this file into:
+#   Deeploy/Targets/PULPOpen/TileConstraints/iLeakyReLUTileConstraint.py
+#
+# UnaryTileConstraint already implements the geometry and serializer
+# you need for an elementwise op. You only have to subclass it. In
+# Step 6a you'll add a performance constraint on top.
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict
+
+from Deeploy.DeeployTypes import NetworkContext
+from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint
+from Deeploy.TilingExtension.TilerModel import TilerModel
+
+
+class iLeakyReLUTileConstraint(UnaryTileConstraint):
+    """Tile constraint for iLeakyReLU.
+
+    As shipped, it adds nothing on top of UnaryTileConstraint's geometrical
+    constraints; the performance constraint is left as a TODO.
+    """
+
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        """Apply the parent's geometrical constraints and return the tiler model."""
+        # Delegate to the parent implementation first; anything added below
+        # operates on the model it returns.
+        tilerModel = UnaryTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt)
+
+        # TODO(student, Step 6a): add a performance constraint so the
+        # innermost tile dim is a multiple of 16. Helpful API:
+        #     tilerModel.addMinTileSizeConstraint(parseDict, name,
+        #                                         tensorDimVar, modulo)
+        # See: Deeploy/Targets/Generic/TileConstraints/ConvTileConstraint.py
+        # for a usage example.
+
+        return tilerModel
@@ -0,0 +1,49 @@
+/* =====================================================================
+ * Title:        iLeakyReLU_simd.c  (SoCDAML Part III - Step 6b skeleton)
+ *
+ * SIMD version of iLeakyReLU using XPULP packed 4x8b operations.
+ * The per-core chunking is provided. Fill in the inner SIMD body.
+ *
+ * Key identity (worth deriving on paper before reading hints below):
+ *     LeakyReLU(x) = (x >= 0) ? x : (x >> shift)
+ *                  = max(x, x >> shift)
+ * because an arithmetic right shift rounds toward -infinity: for x < 0 it
+ * yields a LESS negative value (never above -1), so (x >> shift) >= x; for
+ * x >= 0 it yields a value in [0, x], so (x >> shift) <= x.
+ *
+ * Strategy hint (one path, two intrinsic-level operations per 4 lanes):
+ *   - load v4s lane:        v4s x = vIn[i];
+ *   - per-lane signed shift: v4s s = x >> shift;       (GCC vector ext)
+ *   - signed packed max:    __builtin_pulp_max4(x, s);
+ *
+ * For the lab we assume `mul == 1` (the generator picks mul=1, shift=3).
+ *
+ * Drop into: TargetLibraries/PULPOpen/src/iLeakyReLU.c (overwrite scalar)
+ * ===================================================================== */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+#include "DeeployPULPMath.h"
+#include "pmsis.h"
+
+/**
+ * SIMD int8 LeakyReLU kernel skeleton (four int8 lanes per v4s vector).
+ *
+ * The index range [0, size) is split into per-core chunks whose length is
+ * rounded UP to a multiple of 4. Every chunk therefore starts on a whole
+ * vector and the chunks together cover all `size` elements. Only the chunk
+ * that reaches the end of the buffer can hold a remainder of 1-3 elements;
+ * that remainder is handled by the scalar tail loop.
+ *
+ * @param pIn   input buffer; accessed through a v4s pointer, so it is
+ *              presumably expected to be 4-byte aligned (confirm with caller)
+ * @param pOut  output buffer; same access pattern as pIn
+ * @param size  number of int8 elements to process
+ * @param mul   ignored: the SIMD path assumes mul == 1
+ * @param shift right shift applied to negative inputs
+ */
+void PULPiLeakyReLU_i8_i8(int8_t *pIn, int8_t *pOut, uint32_t size,
+                          int32_t mul, int32_t shift) {
+  (void)mul;  // SIMD path assumes mul == 1
+
+  uint32_t cid   = pi_core_id();
+  uint32_t nC    = NUM_CORES;
+  uint32_t per   = (size + nC - 1) / nC;
+  // Round UP to whole vectors. Rounding down would leave the trailing
+  // elements unprocessed whenever size is not a multiple of 4 * nC.
+  per = (per + 3u) & ~0x3u;
+  uint32_t start = cid * per;
+  if (start > size) {
+    // Surplus cores get an empty range; without the clamp `end - start`
+    // would underflow and the vector loop would run out of bounds.
+    start = size;
+  }
+  uint32_t end   = (size - start < per) ? size : (start + per);
+
+  v4s *vIn  = (v4s *)(pIn  + start);
+  v4s *vOut = (v4s *)(pOut + start);
+  uint32_t nVec = (end - start) >> 2;
+
+  for (uint32_t i = 0; i < nVec; i++) {
+    v4s x = vIn[i];
+    // TODO(student): one line to compute `s` from `x` and `shift`,
+    //                one line to blend `x` and `s` with the packed
+    //                signed max intrinsic and store it.
+    vOut[i] = x;  // <- placeholder, replace
+  }
+
+  // Scalar tail: the 0-3 elements that do not fill a whole vector
+  // (mul == 1). Relies on `>>` being an arithmetic shift for negative
+  // values, as the vector path does.
+  for (uint32_t i = start + (nVec << 2); i < end; i++) {
+    int8_t x = pIn[i];
+    pOut[i] = (x >= 0) ? x : (int8_t)(x >> shift);
+  }
+}
@@ -0,0 +1,99 @@
+# SoCDAML Part III - TA reference solution for `iLeakyReLU`
+
+The complete working `iLeakyReLU` operator (parser, template, binding,
+mapper, tile constraint, scalar kernel, SIMD kernel, ONNX + golden
+artifacts, and a one-shot deploy script). Use it to demo the lab
+end-to-end, and unblock students who get stuck.
+
+## What's in here
+
+| File | Purpose |
+|------|---------|
+| `generate.py` | Builds `network.onnx`, `inputs.npz`, `outputs.npz` for the single-node test |
+| `network.onnx` | Single-node ONNX with op_type `iLeakyReLU` (`mul=1`, `shift=3`), shape `(1, 16, 64, 64)` |
+| `inputs.npz`  | Int64 input tensor named `input` |
+| `outputs.npz` | Int64 golden output tensor named `output` |
+| `iLeakyReLU.h` | Kernel header |
+| `iLeakyReLU.c` | Scalar baseline kernel (Step 3) |
+| `iLeakyReLU_simd.c` | XPULP SIMD kernel (Step 6b) |
+| `iLeakyReLUParser.py` | Full parser class for `Deeploy/Targets/Generic/Parsers.py` |
+| `iLeakyReLUTemplate.py` | Full Mako template for `Deeploy/Targets/PULPOpen/Templates/` |
+| `iLeakyReLUTileConstraint.py` | Full tile + perf constraint for `Deeploy/Targets/PULPOpen/TileConstraints/` |
+| `deploy.sh` | One-shot script that copies the kernel/template/constraint into the live tree AND patches `Parsers.py`, `Bindings.py`, `Tiler.py`, `Platform.py`, `DeeployPULPMath.h` to wire everything up |
+
+## Quick start (TA workflow)
+
+From this directory, inside the Singularity shell:
+
+```bash
+# 1) (Re)generate the test artifacts
+python generate.py
+
+# 2) Apply the SCALAR solution into the live source tree
+./deploy.sh
+
+# 3) Verify (all four runs should report 0 errors and the cycle counts
+#    in the table below). See "Verification" section for the commands.
+
+# 4) Swap to the SIMD kernel for Step 6b
+./deploy.sh simd
+
+# 5) Roll back the file copies if you ever need to clean up
+./deploy.sh undo
+# (note: the script-applied patches into Parsers.py / Bindings.py /
+#  Platform.py / Tiler.py / DeeployPULPMath.h are NOT auto-reverted;
+#  use `git checkout -- <path>` for those if needed)
+```
+
+`deploy.sh` is idempotent, i.e. running it a second time is a no-op for
+the source patches. Re-running `./deploy.sh` after `./deploy.sh simd`
+will overwrite the kernel back to scalar (and vice versa), so you can
+flip between the two with one command.
+
+## Verification
+
+Reproduce every number in the lab's "Stacked speedup" table from
+`DeeployTest/`:
+
+```bash
+cd /app/Deeploy/Tutorials/PartIII_solution/iLeakyReLU
+./deploy.sh
+cd /app/Deeploy/DeeployTest
+
+echo "=== Baseline (1 core, scalar, untiled) ===";  python testRunner_siracusa.py       -t Tests/iLeakyReLU --cores=1 2>&1 | grep -E "Runtime|Errors"
+echo "=== Step 4   (8 cores, scalar, untiled) ==="; python testRunner_siracusa.py       -t Tests/iLeakyReLU --cores=8 2>&1 | grep -E "Runtime|Errors"
+echo "=== Step 5   (8 cores, scalar, tiled)   ==="; python testRunner_tiled_siracusa.py -t Tests/iLeakyReLU --cores=8 --l1=32768 --defaultMemLevel=L2 2>&1 | grep -E "Runtime|Errors"
+
+cd /app/Deeploy/Tutorials/PartIII_solution/iLeakyReLU
+./deploy.sh simd
+cd /app/Deeploy/DeeployTest
+
+echo "=== Step 6   (8 cores, SIMD,   tiled)   ==="; python testRunner_tiled_siracusa.py -t Tests/iLeakyReLU --cores=8 --l1=32768 --defaultMemLevel=L2 2>&1 | grep -E "Runtime|Errors"
+```
+
+### Expected output
+
+Every run reports `Errors: 0 out of 65536`. Cycle counts:
+
+| Step | Configuration | Cycles | vs baseline |
+|------|---|---|---|
+| baseline | 1 core, scalar, untiled | **2 492 970** | 1.00× |
+| Step 4   | 8 cores, scalar, untiled | **313 541** | 7.95× |
+| Step 5   | 8 cores, scalar, tiled (`--l1=32768`) | **108 090** | 23.06× |
+| Step 6   | 8 cores, SIMD, tiled (`--l1=32768`) | **43 005** | 57.97× |
+
+If any count drifts by more than a few percent or a run reports any
+errors, something in the deploy is off. Try `./deploy.sh undo` plus
+`git checkout --` on the patched source files, then re-deploy from
+scratch.
+
+## Files NOT in this directory (live-tree edits applied by deploy.sh)
+
+`deploy.sh` modifies these files in the live tree. They are NOT
+duplicated here, i.e. `deploy.sh` is the source of truth.
+
+- `Deeploy/Targets/Generic/Parsers.py`: appends `iLeakyReLUParser`
+- `Deeploy/Targets/PULPOpen/Bindings.py`: appends `PULPiLeakyReLUBindings`
+- `Deeploy/Targets/PULPOpen/Tiler.py`: appends `PULPiLeakyReLUTilingReadyBindings`
+- `Deeploy/Targets/PULPOpen/Platform.py`: adds parser/layer imports, mapper, and `PULPMapping` entry
+- `TargetLibraries/PULPOpen/inc/DeeployPULPMath.h`: adds the kernel include