From e1eebcc45d89c7ad8f59396bbf9f11f455f34525 Mon Sep 17 00:00:00 2001 From: bm1549 Date: Thu, 9 Apr 2026 14:31:39 -0400 Subject: [PATCH 1/4] Fix log injection smoke test flakiness from startup timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `check raw file injection` test has been flaking across 11+ logging backend variants for months. CI Visibility data shows 90% of failures are `traceCount=0` at `waitForTraceCount(2)` after exactly 30s — the JVM + agent bytecode instrumentation simply takes >30s on overloaded CI machines. Changes: - Add `startupPoll` with 120s timeout for the initial `waitForTraceCount(2)` that covers JVM startup + agent init, giving 4x headroom over the current 30s `defaultPoll` - Add `waitForTraceCountAlive` that checks process liveness on each poll iteration, turning silent 30-120s timeouts into instant, actionable errors when the process crashes - Reorder `waitForTraceCount(4)` before `waitFor` to confirm all traces are delivered while the process is still alive - Assert `waitFor` return value for a clear error if the process hangs tag: no release note Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .../smoketest/LogInjectionSmokeTest.groovy | 43 ++++++++++++++++--- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy index ae1a1de83f7..5f690d3db44 100644 --- a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy +++ b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy @@ -10,6 +10,7 @@ import java.nio.charset.StandardCharsets import java.util.zip.GZIPInputStream import spock.lang.AutoCleanup import spock.lang.Shared +import spock.util.concurrent.PollingConditions import static datadog.trace.api.config.TraceInstrumentationConfig.TRACE_128_BIT_TRACEID_LOGGING_ENABLED import static datadog.trace.api.config.TracerConfig.TRACE_128_BIT_TRACEID_GENERATION_ENABLED @@ -27,9 +28,15 @@ import static java.util.concurrent.TimeUnit.SECONDS * SECONDTRACEID TRACEID SPANID */ abstract class LogInjectionSmokeTest extends AbstractSmokeTest { - // Estimate for the amount of time instrumentation, plus request, plus some extra + // Timeout for the process to complete after all traces have been received static final int TIMEOUT_SECS = 30 + // Agent initialization (bytecode instrumentation of ~200 integrations) can take >30s on + // overloaded CI machines. Use a longer timeout for the initial waitForTraceCount that covers + // JVM startup + agent init + first trace flush. + @Shared + protected final PollingConditions startupPoll = new PollingConditions(timeout: 120, initialDelay: 0, delay: 1, factor: 1) + static final String LOG4J2_BACKEND = "Log4j2" @Shared @@ -347,6 +354,27 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest { return logEvent[key] } + /** + * Like {@link AbstractSmokeTest#waitForTraceCount} but also checks that the process is still + * alive on each poll iteration. Without this, a crashed process causes a silent 30s (or longer) + * timeout instead of an immediate failure with a useful message. + */ + int waitForTraceCountAlive(int count, spock.util.concurrent.PollingConditions conditions) { + conditions.eventually { + if (traceDecodingFailure != null) { + throw traceDecodingFailure + } + if (testedProcess != null && !testedProcess.isAlive()) { + // Process died — fail immediately instead of waiting for full timeout + throw new AssertionError( + "Process exited with code ${testedProcess.exitValue()} while waiting for ${count} traces (have ${traceCount.get()}). " + + "Check ${logFilePath} for errors.") + } + assert traceCount.get() >= count + } + traceCount.get() + } + def parseTraceFromStdOut( String line ) { if (line == null) { throw new IllegalArgumentException("Line is null") @@ -365,7 +393,9 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest { @Flaky(condition = () -> JavaVirtualMachine.isIbm8() || JavaVirtualMachine.isOracleJDK8()) def "check raw file injection"() { when: - def count = waitForTraceCount(2) + // Use the longer startup timeout for the initial wait — agent initialization + // (bytecode instrumentation) can take >30s on overloaded CI machines + def count = waitForTraceCountAlive(2, startupPoll) def newConfig = """ {"lib_config": @@ -374,14 +404,15 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest { """.toString() setRemoteConfig("datadog/2/APM_TRACING/config_overrides/config", newConfig) - count = waitForTraceCount(3) + count = waitForTraceCountAlive(3, defaultPoll) setRemoteConfig("datadog/2/APM_TRACING/config_overrides/config", """{"lib_config":{}}""".toString()) - testedProcess.waitFor(TIMEOUT_SECS, SECONDS) - def exitValue = testedProcess.exitValue() + // Wait for all 4 traces before waiting for process exit to ensure trace delivery is confirmed + count = waitForTraceCountAlive(4, defaultPoll) - count = waitForTraceCount(4) + assert testedProcess.waitFor(TIMEOUT_SECS, SECONDS) : "Process did not exit within ${TIMEOUT_SECS}s" + def exitValue = testedProcess.exitValue() def logLines = outputLogFile.readLines() println "log lines: " + logLines From 9a554fcbf308c8a886b7b541f84221179e8137df Mon Sep 17 00:00:00 2001 From: bm1549 Date: Thu, 9 Apr 2026 15:58:49 -0400 Subject: [PATCH 2/4] Diagnose log injection smoke test flakiness instead of masking it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `check raw file injection` test flakes across 11+ logging backend variants. CI Visibility data shows the failure is bimodal — successful runs complete in 3-9s, but failures sit at exactly 30s (the PollingConditions timeout) with traceCount=0. Nothing in between. This means the process either works or is totally broken — no amount of timeout increase will help. The current test is blind during the 30s wait — it just polls traceCount with no diagnostics when the process crashes or hangs. Changes: - Add `waitForTraceCountAlive` that checks process liveness on every poll iteration. If the process dies, it fails immediately with the exit code, RC poll count, and last 20 lines of process output. - On timeout, enrich the error with diagnostic state (process alive?, traceCount, RC polls received, last 30 lines of output) so the next CI failure tells us whether it's a crash, a hang, or a connectivity issue. - Reorder `waitForTraceCount(4)` before `waitFor` to confirm all traces are delivered while the process is still alive. - Assert `waitFor` return value for a clear error if the process hangs. tag: no release notes Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .../smoketest/LogInjectionSmokeTest.groovy | 70 ++++++++++++------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy index 5f690d3db44..a490ccc3d26 100644 --- a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy +++ b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy @@ -10,7 +10,6 @@ import java.nio.charset.StandardCharsets import java.util.zip.GZIPInputStream import spock.lang.AutoCleanup import spock.lang.Shared -import spock.util.concurrent.PollingConditions import static datadog.trace.api.config.TraceInstrumentationConfig.TRACE_128_BIT_TRACEID_LOGGING_ENABLED import static datadog.trace.api.config.TracerConfig.TRACE_128_BIT_TRACEID_GENERATION_ENABLED @@ -28,15 +27,9 @@ import static java.util.concurrent.TimeUnit.SECONDS * SECONDTRACEID TRACEID SPANID */ abstract class LogInjectionSmokeTest extends AbstractSmokeTest { - // Timeout for the process to complete after all traces have been received + // Estimate for the amount of time instrumentation, plus request, plus some extra static final int TIMEOUT_SECS = 30 - // Agent initialization (bytecode instrumentation of ~200 integrations) can take >30s on - // overloaded CI machines. Use a longer timeout for the initial waitForTraceCount that covers - // JVM startup + agent init + first trace flush. - @Shared - protected final PollingConditions startupPoll = new PollingConditions(timeout: 120, initialDelay: 0, delay: 1, factor: 1) - static final String LOG4J2_BACKEND = "Log4j2" @Shared @@ -355,26 +348,53 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest { } /** - * Like {@link AbstractSmokeTest#waitForTraceCount} but also checks that the process is still - * alive on each poll iteration. Without this, a crashed process causes a silent 30s (or longer) - * timeout instead of an immediate failure with a useful message. + * Like {@link AbstractSmokeTest#waitForTraceCount} but checks process liveness on every poll + * iteration and dumps diagnostic state on failure, so CI failures produce actionable output + * instead of a bare "Condition not satisfied" after a 30s timeout. */ - int waitForTraceCountAlive(int count, spock.util.concurrent.PollingConditions conditions) { - conditions.eventually { - if (traceDecodingFailure != null) { - throw traceDecodingFailure + int waitForTraceCountAlive(int count) { + try { + defaultPoll.eventually { + if (traceDecodingFailure != null) { + throw traceDecodingFailure + } + if (testedProcess != null && !testedProcess.isAlive()) { + def lastLines = tailProcessLog(20) + throw new AssertionError( + "Process exited with code ${testedProcess.exitValue()} while waiting for ${count} traces " + + "(received ${traceCount.get()}, RC polls: ${rcClientMessages.size()}).\n" + + "Last process output:\n${lastLines}") + } + assert traceCount.get() >= count } - if (testedProcess != null && !testedProcess.isAlive()) { - // Process died — fail immediately instead of waiting for full timeout - throw new AssertionError( - "Process exited with code ${testedProcess.exitValue()} while waiting for ${count} traces (have ${traceCount.get()}). " + - "Check ${logFilePath} for errors.") + } catch (Throwable e) { + if (e instanceof AssertionError && e.message?.startsWith("Process exited")) { + throw e // Already enriched — don't wrap again } - assert traceCount.get() >= count + // The default error ("Condition not satisfied after 30s") is useless — enrich with diagnostic state + def alive = testedProcess?.isAlive() + def lastLines = tailProcessLog(30) + throw new AssertionError( + "Timed out waiting for ${count} traces after ${defaultPoll.timeout}s. " + + "traceCount=${traceCount.get()}, process.alive=${alive}, " + + "RC polls received: ${rcClientMessages.size()}.\n" + + "Last process output:\n${lastLines}", e) } traceCount.get() } + private String tailProcessLog(int lines) { + try { + def logFile = new File(logFilePath) + if (!logFile.exists()) return "(log file does not exist: ${logFilePath})" + def allLines = logFile.readLines() + def tail = allLines.size() > lines ? allLines[-lines..-1] : allLines + return tail.join("\n") + } catch (Exception e) { + return "(failed to read log: ${e.message})" + } + } + def parseTraceFromStdOut( String line ) { if (line == null) { throw new IllegalArgumentException("Line is null") @@ -393,9 +413,7 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest { @Flaky(condition = () -> JavaVirtualMachine.isIbm8() || JavaVirtualMachine.isOracleJDK8()) def "check raw file injection"() { when: - // Use the longer startup timeout for the initial wait — agent initialization - // (bytecode instrumentation) can take >30s on overloaded CI machines - def count = waitForTraceCountAlive(2, startupPoll) + def count = waitForTraceCountAlive(2) def newConfig = """ {"lib_config": @@ -404,12 +422,12 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest { """.toString() setRemoteConfig("datadog/2/APM_TRACING/config_overrides/config", newConfig) - count = waitForTraceCountAlive(3, defaultPoll) + count = waitForTraceCountAlive(3) setRemoteConfig("datadog/2/APM_TRACING/config_overrides/config", """{"lib_config":{}}""".toString()) // Wait for all 4 traces before waiting for process exit to ensure trace delivery is confirmed - count = waitForTraceCountAlive(4, defaultPoll) + count = waitForTraceCountAlive(4) assert testedProcess.waitFor(TIMEOUT_SECS, SECONDS) : "Process did not exit within ${TIMEOUT_SECS}s" def exitValue = testedProcess.exitValue() From b15254193c4669bf0c3eb2c40e355e1bb9f6609e Mon Sep 17 00:00:00 2001 From: bm1549 Date: Fri, 10 Apr 2026 11:58:08 -0400 Subject: [PATCH 3/4] Fix race in waitForTraceCountAlive: check count before liveness The liveness check fired before the trace count check, so a normal process exit after delivering all traces was treated as a failure. Check traceCount >= count first and return early if satisfied. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .../groovy/datadog/smoketest/LogInjectionSmokeTest.groovy | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy index a490ccc3d26..be4c8cfa88b 100644 --- a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy +++ b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy @@ -358,6 +358,11 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest { if (traceDecodingFailure != null) { throw traceDecodingFailure } + // Check the count BEFORE liveness — the process may have exited normally + // after delivering all traces, and we don't want to treat that as a failure. + if (traceCount.get() >= count) { + return + } if (testedProcess != null && !testedProcess.isAlive()) { def lastLines = tailProcessLog(20) throw new AssertionError( From 9eb11aac61f292895687c3d678bbf76cb7a02d2c Mon Sep 17 00:00:00 2001 From: bm1549 Date: Fri, 10 Apr 2026 12:02:38 -0400 Subject: [PATCH 4/4] Use RuntimeException for liveness check to actually fail fast PollingConditions.eventually only retries AssertionError. The liveness check was throwing AssertionError, so a dead process still waited the full 30s timeout. Switch to RuntimeException so it propagates immediately. Also narrow the catch from Throwable to AssertionError. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .../datadog/smoketest/LogInjectionSmokeTest.groovy | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy index be4c8cfa88b..8c4f3fcfe6c 100644 --- a/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy +++ b/dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy @@ -365,17 +365,16 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest { } if (testedProcess != null && !testedProcess.isAlive()) { def lastLines = tailProcessLog(20) - throw new AssertionError( + // RuntimeException (not AssertionError) so PollingConditions propagates + // immediately instead of retrying for the full timeout. + throw new RuntimeException( "Process exited with code ${testedProcess.exitValue()} while waiting for ${count} traces " + "(received ${traceCount.get()}, RC polls: ${rcClientMessages.size()}).\n" + "Last process output:\n${lastLines}") } assert traceCount.get() >= count } - } catch (Throwable e) { - if (e instanceof AssertionError && e.message?.startsWith("Process exited")) { - throw e // Already enriched — don't wrap again - } + } catch (AssertionError e) { // The default error ("Condition not satisfied after 30s") is useless — enrich with diagnostic state def alive = testedProcess?.isAlive() def lastLines = tailProcessLog(30)