itest: add tranche-based parallel runner and clearer logs

ffranr · ffranr · commit b0f7ee265654 · 2025-12-09T01:20:30.000Z
- add tranche splitting/shuffling flags to the itest harness
- add itest-parallel target and scripts to run tranches concurrently
- write per-tranche logs under .logs/trancheN and tail failures for
  clarity
diff --git a/Makefile b/Makefile
@@ -239,6 +239,10 @@ build-itest:
 	CGO_ENABLED=0 $(GOBUILD) -tags="$(ITEST_TAGS)" -o itest/btcd-itest -ldflags "$(ITEST_LDFLAGS)" $(BTCD_PKG)
 	CGO_ENABLED=0 $(GOBUILD) -tags="$(ITEST_TAGS)" -o itest/lnd-itest -ldflags "$(ITEST_LDFLAGS)" $(LND_PKG)/cmd/lnd
 
+build-itest-binary:
+	@$(call print, "Building itest binary.")
+	CGO_ENABLED=0 $(GOTEST) -v ./itest -tags="$(DEV_TAGS) $(ITEST_TAGS)" -c -o itest/itest.test
+
 install-backward-compat-versions:
 	@$(call print, "Installing old versions of litd for backward compatibility tests.")
 	scripts/install-backward-compat-versions.sh '$(LITD_COMPAT_VERSIONS)'
@@ -258,6 +262,16 @@ itest: app-build build-itest itest-only
 
 itest-no-backward-compat: app-build build-itest build-itest run-itest-only
 
+itest-parallel: app-build build-itest install-backward-compat-versions build-itest-binary
+	@$(call print, "Running integration tests in parallel.")
+	rm -rf itest/*.log itest/.logs*; date
+	scripts/itest_parallel.sh $(ITEST_PARALLELISM) $(NUM_ITEST_TRANCHES) $(SHUFFLE_SEED) $(TEST_FLAGS) $(ITEST_FLAGS)
+
+itest-parallel-no-backward-compat: app-build build-itest build-itest-binary
+	@$(call print, "Running integration tests in parallel (no backward compat binaries).")
+	rm -rf itest/*.log itest/.logs*; date
+	scripts/itest_parallel.sh $(ITEST_PARALLELISM) $(NUM_ITEST_TRANCHES) $(SHUFFLE_SEED) $(TEST_FLAGS) $(ITEST_FLAGS)
+
 # =============
 # FLAKE HUNTING
 # =============
@@ -349,5 +363,6 @@ flakehunter-unit:
 .PHONY: default all yarn-install build install go-build go-build-noui \
 	go-install go-install-noui go-install-cli app-build release go-release \
 	docker-release docker-tools scratch check unit unit-cover unit-race \
-	clean-itest build-itest itest-only itest flake-unit fmt lint mod mod-check \
-	list rpc protos protos-check rpc-js-compile clean
+	clean-itest build-itest build-itest-binary itest-only itest \
+	itest-parallel itest-parallel-no-backward-compat flake-unit fmt lint \
+	mod mod-check list rpc protos protos-check rpc-js-compile clean
diff --git a/itest/litd_test.go b/itest/litd_test.go
@@ -1,6 +1,9 @@
 package itest
 
 import (
+	"flag"
+	"fmt"
+	"math/rand"
 	"os"
 	"strings"
 	"testing"
@@ -12,6 +15,43 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
+const (
+	// defaultSplitTranches is the default number of tranches to divide the
+	// test suite into when no override is provided.
+	defaultSplitTranches uint = 1
+
+	// defaultRunTranche is the default tranche index to execute when no
+	// explicit tranche is selected.
+	defaultRunTranche uint = 0
+)
+
+var (
+	// testCasesSplitTranches is the number of tranches the test cases are
+	// split into. The default of 1 means no splitting happens. For larger
+	// values the -runtranche flag picks the part that runs in the current
+	// invocation. A value of 0 is invalid: there is no tranche to run and
+	// the split would divide by zero.
+	testCasesSplitTranches = flag.Uint(
+		"splittranches", defaultSplitTranches,
+		"split the test cases in this many tranches and run the tranche "+
+			"at 0-based index specified by the -runtranche flag",
+	)
+
+	// shuffleSeedFlag enables deterministic shuffling of test cases to
+	// balance workload across tranches. A seed of 0 disables shuffling.
+	shuffleSeedFlag = flag.Uint64(
+		"shuffleseed", 0, "if set, shuffles the test cases using this "+
+			"as the source of randomness",
+	)
+
+	// testCasesRunTranche selects which tranche (0-based) to execute.
+	testCasesRunTranche = flag.Uint(
+		"runtranche", defaultRunTranche,
+		"run the tranche of the split test cases with the given (0-based) "+
+			"index",
+	)
+)
+
 // TestLightningTerminal performs a series of integration tests amongst a
 // programmatically driven network of lnd nodes.
 func TestLightningTerminal(t *testing.T) {
@@ -39,9 +79,18 @@ func TestLightningTerminal(t *testing.T) {
 		"--rpcmiddleware.enable",
 	}
 
+	testCases, trancheIndex, trancheOffset := selectTestTranche()
+	totalTestCases := len(allTestCases)
+
 	// Run the subset of the test cases selected in this tranche.
-	for _, testCase := range allTestCases {
-		success := t.Run(testCase.name, func(t1 *testing.T) {
+	for idx, testCase := range testCases {
+		testOrdinal := int(trancheOffset) + idx + 1
+		testName := fmt.Sprintf(
+			"tranche%02d/%02d-of-%d/%s", int(trancheIndex),
+			testOrdinal, totalTestCases, testCase.name,
+		)
+
+		success := t.Run(testName, func(t1 *testing.T) {
 			cleanTestCaseName := strings.ReplaceAll(
 				testCase.name, " ", "_",
 			)
@@ -107,6 +156,79 @@ func TestLightningTerminal(t *testing.T) {
 	}
 }
 
+// maybeShuffleTestCases shuffles allTestCases in place when the `shuffleseed`
+// flag is set to a non-zero value, seeding the random source with that value.
+// Parallel test runs use this to even out the work across tranches.
+func maybeShuffleTestCases() {
+	if shuffleSeedFlag == nil || *shuffleSeedFlag == 0 {
+		return
+	}
+
+	// Swapping through this alias reorders allTestCases itself.
+	cases := allTestCases
+
+	// #nosec G404 -- This is not for cryptographic purposes.
+	rng := rand.New(rand.NewSource(int64(*shuffleSeedFlag)))
+	rng.Shuffle(len(cases), func(a, b int) {
+		cases[a], cases[b] = cases[b], cases[a]
+	})
+}
+
+// createIndices splits numCases cases into numTranches [start, end) index
+// pairs; the first tranches absorb the remainder. Zero tranches yields nil.
+func createIndices(numCases, numTranches uint) [][2]uint {
+	if numTranches == 0 {
+		return nil
+	}
+
+	base, remainder := numCases/numTranches, numCases%numTranches
+	indices := make([][2]uint, numTranches)
+	var start uint
+	for i := range indices {
+		end := start + base
+		if uint(i) < remainder {
+			end++
+		}
+		indices[i] = [2]uint{start, end}
+		start = end
+	}
+	return indices
+}
+
+// selectTestTranche returns the sub slice of the test cases that should be run
+// as the current split tranche, the tranche index used to label the run and
+// the slice offset of the tranche. It panics on out-of-range tranche flags.
+func selectTestTranche() ([]*testCase, uint, uint) {
+	numTranches := defaultSplitTranches
+	if testCasesSplitTranches != nil {
+		numTranches = *testCasesSplitTranches
+	}
+	runTranche := defaultRunTranche
+	if testCasesRunTranche != nil {
+		runTranche = *testCasesRunTranche
+	}
+
+	// There's a special flake-hunt mode where we run the same test multiple
+	// times in parallel. In that case the tranche index is equal to the
+	// thread ID, but we need to actually run all tests for the regex
+	// selection to work.
+	threadID := runTranche
+	if numTranches == 1 {
+		runTranche = 0
+	}
+
+	// Reject flag values that would divide by zero or index out of range.
+	if runTranche >= numTranches {
+		panic(fmt.Sprintf("-runtranche=%d must be less than "+
+			"-splittranches=%d", runTranche, numTranches))
+	}
+
+	maybeShuffleTestCases()
+	index := createIndices(uint(len(allTestCases)), numTranches)[runTranche]
+
+	return allTestCases[index[0]:index[1]], threadID, index[0]
+}
+
 func init() {
 	logger := btclog.NewSLogger(btclog.NewDefaultHandler(os.Stdout))
 	UseLogger(logger.SubSystem(Subsystem))
diff --git a/make/testing_flags.mk b/make/testing_flags.mk
@@ -3,6 +3,26 @@ include make/compile_flags.mk
 TEST_FLAGS =
 DEV_TAGS = dev
 
+NUM_ITEST_TRANCHES = 8
+ITEST_PARALLELISM = $(NUM_ITEST_TRANCHES)
+SHUFFLE_SEED = 0
+
+# Scale the number of parallel running itest tranches.
+ifneq ($(tranches),)
+NUM_ITEST_TRANCHES = $(tranches)
+ITEST_PARALLELISM = $(NUM_ITEST_TRANCHES)
+endif
+
+# Give the ability to run the same tranche multiple times at the same time.
+ifneq ($(parallel),)
+ITEST_PARALLELISM = $(parallel)
+endif
+
+# Set the seed for shuffling the test cases.
+ifneq ($(shuffleseed),)
+SHUFFLE_SEED = $(shuffleseed)
+endif
+
 # Define the integration test.run filter if the icase argument was provided.
 ifneq ($(icase),)
 ITEST_FLAGS += -test.run="TestLightningTerminal/$(icase)"
diff --git a/scripts/itest_parallel.sh b/scripts/itest_parallel.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Usage: itest_parallel.sh <processes> <tranches> <shuffle_seed> [flags...]
+# Starts <processes> background runs of scripts/itest_part.sh, one per index.
+if [[ $# -lt 3 ]]; then
+	echo "usage: $0 <processes> <tranches> <shuffle_seed> [flags...]" >&2
+	exit 1
+fi
+
+PROCESSES=$1
+TRANCHES=$2
+SHUFFLE_SEED=$3
+
+# Drop the three positional settings so "$@" holds only the flags to forward.
+shift 3
+
+# Final exit code of this script: 0, or that of the first failing run.
+exit_code=0
+
+# Run commands in parallel and track their PIDs.
+pids=()
+for ((i=0; i<PROCESSES; i++)); do
+	scripts/itest_part.sh "$i" "$TRANCHES" "$SHUFFLE_SEED" "$@" &
+	pids+=("$!")
+done
+
+# Wait for every background run, in start order, and collect its exit code.
+for pid in "${pids[@]}"; do
+	wait "$pid"
+	current_exit_code=$?
+
+	# Only record the exit code of the first failing itest.
+	if [ "$current_exit_code" -ne 0 ] && [ "$exit_code" -eq 0 ]; then
+		exit_code=$current_exit_code
+	fi
+done
+
+exit "$exit_code"
diff --git a/scripts/itest_part.sh b/scripts/itest_part.sh
@@ -3,14 +3,45 @@
 # Let's work with absolute paths only, we run in the itest directory itself.
 WORKDIR=$(pwd)/itest
 
+TRANCHE=0
+NUM_TRANCHES=1
+SHUFFLE_SEED=0
+
+# If the first three arguments are integers, treat them as tranche settings.
+if [[ $# -ge 3 && "$1" =~ ^[0-9]+$ && "$2" =~ ^[0-9]+$ && "$3" =~ ^[0-9]+$ ]]; then
+	TRANCHE=$1
+	NUM_TRANCHES=$2
+	SHUFFLE_SEED=$3
+	shift 3
+fi
+
 # Windows insists on having the .exe suffix for an executable, we need to add
 # that here if necessary.
 EXEC="$WORKDIR"/itest.test
 LITD_EXEC="$WORKDIR"/litd-itest
 BTCD_EXEC="$WORKDIR"/btcd-itest
-echo $EXEC -test.v "$@" -logoutput -logdir=.logs -litdexec=$LITD_EXEC -btcdexec=$BTCD_EXEC
+LOG_DIR="$WORKDIR/.logs"
+if [[ $NUM_TRANCHES -gt 1 ]]; then
+	LOG_DIR="$WORKDIR/.logs/tranche$TRANCHE"
+fi
+
+mkdir -p "$LOG_DIR"
+LOG_FILE="$LOG_DIR/output.log"
+
+TRANCHE_FLAGS=(-splittranches="$NUM_TRANCHES" -runtranche="$TRANCHE" -shuffleseed="$SHUFFLE_SEED")
+
+echo "$EXEC" -test.v "${TRANCHE_FLAGS[@]}" "$@" -logoutput -logdir="$LOG_DIR" -litdexec=$LITD_EXEC -btcdexec=$BTCD_EXEC
 
 # Exit code 255 causes the parallel jobs to abort, so if one part fails the
 # other is aborted too.
 cd "$WORKDIR" || exit 255
-$EXEC -test.v "$@" -logoutput -logdir=.logs -litdexec=$LITD_EXEC -btcdexec=$BTCD_EXEC || exit 255
+# Run the tranche with all output captured in its log file. Path variables are
+# quoted so a checkout directory containing spaces still works.
+"$EXEC" -test.v "${TRANCHE_FLAGS[@]}" "$@" -logoutput -logdir="$LOG_DIR" -litdexec="$LITD_EXEC" -btcdexec="$BTCD_EXEC" >"$LOG_FILE" 2>&1
+exit_code=$?
+if [ "$exit_code" -ne 0 ]; then
+	echo "Tranche $TRANCHE failed with exit code $exit_code"
+	tail -n 100 "$LOG_FILE"
+	exit 255
+fi
+echo "Tranche $TRANCHE completed successfully"