From e1eebcc45d89c7ad8f59396bbf9f11f455f34525 Mon Sep 17 00:00:00 2001
From: bm1549 <brian.marks@datadoghq.com>
Date: Thu, 9 Apr 2026 14:31:39 -0400
Subject: [PATCH 1/4] Fix log injection smoke test flakiness from startup
 timeout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The `check raw file injection` test has been flaking across 11+ logging
backend variants for months. CI Visibility data shows 90% of failures are
`traceCount=0` at `waitForTraceCount(2)` after exactly 30s — the JVM +
agent bytecode instrumentation simply takes >30s on overloaded CI machines.

Changes:
- Add `startupPoll` with 120s timeout for the initial `waitForTraceCount(2)`
  that covers JVM startup + agent init, giving 4x headroom over the current
  30s `defaultPoll`
- Add `waitForTraceCountAlive` that checks process liveness on each poll
  iteration, turning silent 30-120s timeouts into instant, actionable errors
  when the process crashes
- Reorder `waitForTraceCount(4)` before `waitFor` to confirm all traces are
  delivered while the process is still alive
- Assert `waitFor` return value for a clear error if the process hangs

tag: no release note

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 .../smoketest/LogInjectionSmokeTest.groovy    | 43 ++++++++++++++++---
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy
index ae1a1de83f7..5f690d3db44 100644
--- a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy
+++ b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy
@@ -10,6 +10,7 @@ import java.nio.charset.StandardCharsets
 import java.util.zip.GZIPInputStream
 import spock.lang.AutoCleanup
 import spock.lang.Shared
+import spock.util.concurrent.PollingConditions
 
 import static datadog.trace.api.config.TraceInstrumentationConfig.TRACE_128_BIT_TRACEID_LOGGING_ENABLED
 import static datadog.trace.api.config.TracerConfig.TRACE_128_BIT_TRACEID_GENERATION_ENABLED
@@ -27,9 +28,15 @@ import static java.util.concurrent.TimeUnit.SECONDS
  * SECONDTRACEID TRACEID SPANID
  */
 abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
-  // Estimate for the amount of time instrumentation, plus request, plus some extra
+  // Timeout for the process to complete after all traces have been received
   static final int TIMEOUT_SECS = 30
 
+  // Agent initialization (bytecode instrumentation of ~200 integrations) can take >30s on
+  // overloaded CI machines. Use a longer timeout for the initial waitForTraceCount that covers
+  // JVM startup + agent init + first trace flush.
+  @Shared
+  protected final PollingConditions startupPoll = new PollingConditions(timeout: 120, initialDelay: 0, delay: 1, factor: 1)
+
   static final String LOG4J2_BACKEND = "Log4j2"
 
   @Shared
@@ -347,6 +354,27 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
     return logEvent[key]
   }
 
+  /**
+   * Like {@link AbstractSmokeTest#waitForTraceCount} but also checks that the process is still
+   * alive on each poll iteration. Without this, a crashed process causes a silent 30s (or longer)
+   * timeout instead of an immediate failure with a useful message.
+   */
+  int waitForTraceCountAlive(int count, spock.util.concurrent.PollingConditions conditions) {
+    conditions.eventually {
+      if (traceDecodingFailure != null) {
+        throw traceDecodingFailure
+      }
+      if (testedProcess != null && !testedProcess.isAlive()) {
+        // Process died — fail immediately instead of waiting for full timeout
+        throw new AssertionError(
+        "Process exited with code ${testedProcess.exitValue()} while waiting for ${count} traces (have ${traceCount.get()}). " +
+        "Check ${logFilePath} for errors.")
+      }
+      assert traceCount.get() >= count
+    }
+    traceCount.get()
+  }
+
   def parseTraceFromStdOut( String line ) {
     if (line == null) {
       throw new IllegalArgumentException("Line is null")
@@ -365,7 +393,9 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
   @Flaky(condition = () -> JavaVirtualMachine.isIbm8() || JavaVirtualMachine.isOracleJDK8())
   def "check raw file injection"() {
     when:
-    def count = waitForTraceCount(2)
+    // Use the longer startup timeout for the initial wait — agent initialization
+    // (bytecode instrumentation) can take >30s on overloaded CI machines
+    def count = waitForTraceCountAlive(2, startupPoll)
 
     def newConfig = """
         {"lib_config":
@@ -374,14 +404,15 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
      """.toString()
     setRemoteConfig("datadog/2/APM_TRACING/config_overrides/config", newConfig)
 
-    count = waitForTraceCount(3)
+    count = waitForTraceCountAlive(3, defaultPoll)
 
     setRemoteConfig("datadog/2/APM_TRACING/config_overrides/config", """{"lib_config":{}}""".toString())
 
-    testedProcess.waitFor(TIMEOUT_SECS, SECONDS)
-    def exitValue = testedProcess.exitValue()
+    // Wait for all 4 traces before waiting for process exit to ensure trace delivery is confirmed
+    count = waitForTraceCountAlive(4, defaultPoll)
 
-    count = waitForTraceCount(4)
+    assert testedProcess.waitFor(TIMEOUT_SECS, SECONDS) : "Process did not exit within ${TIMEOUT_SECS}s"
+    def exitValue = testedProcess.exitValue()
 
     def logLines = outputLogFile.readLines()
     println "log lines: " + logLines

From 9a554fcbf308c8a886b7b541f84221179e8137df Mon Sep 17 00:00:00 2001
From: bm1549 <brian.marks@datadoghq.com>
Date: Thu, 9 Apr 2026 15:58:49 -0400
Subject: [PATCH 2/4] Diagnose log injection smoke test flakiness instead of
 masking it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The `check raw file injection` test flakes across 11+ logging backend
variants. CI Visibility data shows the failure is bimodal — successful
runs complete in 3-9s, but failures sit at exactly 30s (the
PollingConditions timeout) with traceCount=0. Nothing in between. This
means the process either works or is totally broken — no amount of
timeout increase will help.

The current test is blind during the 30s wait — it just polls
traceCount with no diagnostics when the process crashes or hangs.

Changes:
- Add `waitForTraceCountAlive` that checks process liveness on every
  poll iteration. If the process dies, it fails immediately with the
  exit code, RC poll count, and last 20 lines of process output.
- On timeout, enrich the error with diagnostic state (process alive?,
  traceCount, RC polls received, last 30 lines of output) so the next
  CI failure tells us whether it's a crash, a hang, or a connectivity
  issue.
- Reorder `waitForTraceCount(4)` before `waitFor` to confirm all
  traces are delivered while the process is still alive.
- Assert `waitFor` return value for a clear error if the process hangs.

tag: no release notes

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 .../smoketest/LogInjectionSmokeTest.groovy    | 70 ++++++++++++-------
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy
index 5f690d3db44..a490ccc3d26 100644
--- a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy
+++ b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy
@@ -10,7 +10,6 @@ import java.nio.charset.StandardCharsets
 import java.util.zip.GZIPInputStream
 import spock.lang.AutoCleanup
 import spock.lang.Shared
-import spock.util.concurrent.PollingConditions
 
 import static datadog.trace.api.config.TraceInstrumentationConfig.TRACE_128_BIT_TRACEID_LOGGING_ENABLED
 import static datadog.trace.api.config.TracerConfig.TRACE_128_BIT_TRACEID_GENERATION_ENABLED
@@ -28,15 +27,9 @@ import static java.util.concurrent.TimeUnit.SECONDS
  * SECONDTRACEID TRACEID SPANID
  */
 abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
-  // Timeout for the process to complete after all traces have been received
+  // Estimate for the amount of time instrumentation, plus request, plus some extra
   static final int TIMEOUT_SECS = 30
 
-  // Agent initialization (bytecode instrumentation of ~200 integrations) can take >30s on
-  // overloaded CI machines. Use a longer timeout for the initial waitForTraceCount that covers
-  // JVM startup + agent init + first trace flush.
-  @Shared
-  protected final PollingConditions startupPoll = new PollingConditions(timeout: 120, initialDelay: 0, delay: 1, factor: 1)
-
   static final String LOG4J2_BACKEND = "Log4j2"
 
   @Shared
@@ -355,26 +348,53 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
   }
 
   /**
-   * Like {@link AbstractSmokeTest#waitForTraceCount} but also checks that the process is still
-   * alive on each poll iteration. Without this, a crashed process causes a silent 30s (or longer)
-   * timeout instead of an immediate failure with a useful message.
+   * Like {@link AbstractSmokeTest#waitForTraceCount} but checks process liveness on every poll
+   * iteration and dumps diagnostic state on failure, so CI failures produce actionable output
+   * instead of a bare "Condition not satisfied" after a 30s timeout.
    */
-  int waitForTraceCountAlive(int count, spock.util.concurrent.PollingConditions conditions) {
-    conditions.eventually {
-      if (traceDecodingFailure != null) {
-        throw traceDecodingFailure
+  int waitForTraceCountAlive(int count) {
+    try {
+      defaultPoll.eventually {
+        if (traceDecodingFailure != null) {
+          throw traceDecodingFailure
+        }
+        if (testedProcess != null && !testedProcess.isAlive()) {
+          def lastLines = tailProcessLog(20)
+          throw new AssertionError(
+            "Process exited with code ${testedProcess.exitValue()} while waiting for ${count} traces " +
+            "(received ${traceCount.get()}, RC polls: ${rcClientMessages.size()}).\n" +
+            "Last process output:\n${lastLines}")
+        }
+        assert traceCount.get() >= count
       }
-      if (testedProcess != null && !testedProcess.isAlive()) {
-        // Process died — fail immediately instead of waiting for full timeout
-        throw new AssertionError(
-        "Process exited with code ${testedProcess.exitValue()} while waiting for ${count} traces (have ${traceCount.get()}). " +
-        "Check ${logFilePath} for errors.")
+    } catch (Throwable e) {
+      if (e instanceof AssertionError && e.message?.startsWith("Process exited")) {
+        throw e // Already enriched — don't wrap again
       }
-      assert traceCount.get() >= count
+      // The default error ("Condition not satisfied after 30s") is useless — enrich with diagnostic state
+      def alive = testedProcess?.isAlive()
+      def lastLines = tailProcessLog(30)
+      throw new AssertionError(
+        "Timed out waiting for ${count} traces after ${defaultPoll.timeout}s. " +
+        "traceCount=${traceCount.get()}, process.alive=${alive}, " +
+        "RC polls received: ${rcClientMessages.size()}.\n" +
+        "Last process output:\n${lastLines}", e)
     }
     traceCount.get()
   }
 
+  private String tailProcessLog(int lines) {
+    try {
+      def logFile = new File(logFilePath)
+      if (!logFile.exists()) return "(log file does not exist: ${logFilePath})"
+      def allLines = logFile.readLines()
+      def tail = allLines.size() > lines ? allLines[-lines..-1] : allLines
+      return tail.join("\n")
+    } catch (Exception e) {
+      return "(failed to read log: ${e.message})"
+    }
+  }
+
   def parseTraceFromStdOut( String line ) {
     if (line == null) {
       throw new IllegalArgumentException("Line is null")
@@ -393,9 +413,7 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
   @Flaky(condition = () -> JavaVirtualMachine.isIbm8() || JavaVirtualMachine.isOracleJDK8())
   def "check raw file injection"() {
     when:
-    // Use the longer startup timeout for the initial wait — agent initialization
-    // (bytecode instrumentation) can take >30s on overloaded CI machines
-    def count = waitForTraceCountAlive(2, startupPoll)
+    def count = waitForTraceCountAlive(2)
 
     def newConfig = """
         {"lib_config":
@@ -404,12 +422,12 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
      """.toString()
     setRemoteConfig("datadog/2/APM_TRACING/config_overrides/config", newConfig)
 
-    count = waitForTraceCountAlive(3, defaultPoll)
+    count = waitForTraceCountAlive(3)
 
     setRemoteConfig("datadog/2/APM_TRACING/config_overrides/config", """{"lib_config":{}}""".toString())
 
     // Wait for all 4 traces before waiting for process exit to ensure trace delivery is confirmed
-    count = waitForTraceCountAlive(4, defaultPoll)
+    count = waitForTraceCountAlive(4)
 
     assert testedProcess.waitFor(TIMEOUT_SECS, SECONDS) : "Process did not exit within ${TIMEOUT_SECS}s"
     def exitValue = testedProcess.exitValue()

From b15254193c4669bf0c3eb2c40e355e1bb9f6609e Mon Sep 17 00:00:00 2001
From: bm1549 <brian.marks@datadoghq.com>
Date: Fri, 10 Apr 2026 11:58:08 -0400
Subject: [PATCH 3/4] Fix race in waitForTraceCountAlive: check count before
 liveness

The liveness check fired before the trace count check, so a normal
process exit after delivering all traces was treated as a failure.
Check traceCount >= count first and return early if satisfied.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 .../groovy/datadog/smoketest/LogInjectionSmokeTest.groovy    | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy
index a490ccc3d26..be4c8cfa88b 100644
--- a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy
+++ b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy
@@ -358,6 +358,11 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
         if (traceDecodingFailure != null) {
           throw traceDecodingFailure
         }
+        // Check the count BEFORE liveness — the process may have exited normally
+        // after delivering all traces, and we don't want to treat that as a failure.
+        if (traceCount.get() >= count) {
+          return
+        }
         if (testedProcess != null && !testedProcess.isAlive()) {
           def lastLines = tailProcessLog(20)
           throw new AssertionError(

From 9eb11aac61f292895687c3d678bbf76cb7a02d2c Mon Sep 17 00:00:00 2001
From: bm1549 <brian.marks@datadoghq.com>
Date: Fri, 10 Apr 2026 12:02:38 -0400
Subject: [PATCH 4/4] Use RuntimeException for liveness check to actually fail
 fast

PollingConditions.eventually only retries AssertionError. The liveness
check was throwing AssertionError, so a dead process still waited the
full 30s timeout. Switch to RuntimeException so it propagates
immediately. Also narrow the catch from Throwable to AssertionError.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 .../datadog/smoketest/LogInjectionSmokeTest.groovy       | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy
index be4c8cfa88b..8c4f3fcfe6c 100644
--- a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy
+++ b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy
@@ -365,17 +365,16 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
         }
         if (testedProcess != null && !testedProcess.isAlive()) {
           def lastLines = tailProcessLog(20)
-          throw new AssertionError(
+          // RuntimeException (not AssertionError) so PollingConditions propagates
+          // immediately instead of retrying for the full timeout.
+          throw new RuntimeException(
             "Process exited with code ${testedProcess.exitValue()} while waiting for ${count} traces " +
             "(received ${traceCount.get()}, RC polls: ${rcClientMessages.size()}).\n" +
             "Last process output:\n${lastLines}")
         }
         assert traceCount.get() >= count
       }
-    } catch (Throwable e) {
-      if (e instanceof AssertionError && e.message?.startsWith("Process exited")) {
-        throw e // Already enriched — don't wrap again
-      }
+    } catch (AssertionError e) {
       // The default error ("Condition not satisfied after 30s") is useless — enrich with diagnostic state
       def alive = testedProcess?.isAlive()
       def lastLines = tailProcessLog(30)