Altinity · strtgbb · Mar 18, 2026 · Mar 18, 2026
diff --git a/ci/jobs/integration_test_job.py b/ci/jobs/integration_test_job.py
@@ -24,6 +24,48 @@
 MAX_CPUS_PER_WORKER = 5
 MAX_MEM_PER_WORKER = 11
 
+INFRASTRUCTURE_ERROR_PATTERNS = [  # substrings looked up in result.info
+    "timed out after",
+    "TimeoutExpired",
+    "Cannot connect to the Docker daemon",
+    "Error response from daemon",
+    "Name or service not known",
+    "Temporary failure in name resolution",
+    "Network is unreachable",
+    "Connection reset by peer",
+    "No space left on device",
+    "Cannot allocate memory",
+    "OCI runtime create failed",
+    "toomanyrequests",
+    "pull access denied",
+    "Got exception pulling images:",  # docker pull failure during cluster.start()
+]
+
+
+def _is_infrastructure_error(result: Result) -> bool:
+    """Tell whether an ERROR result's info contains a known infrastructure pattern."""
+    error_statuses = (Result.Status.ERROR, Result.StatusExtended.ERROR)
+    if result.status not in error_statuses or not result.info:
+        return False
+    info_text = result.info
+    return any(marker in info_text for marker in INFRASTRUCTURE_ERROR_PATTERNS)
+
+
+def _mark_infrastructure_errors(results: list) -> int:
+    """Relabel infrastructure errors in place: INFRA label plus SKIPPED status.
+
+    Prints a summary if any entry matched; returns the number of relabeled entries.
+    """
+    count = 0
+    # filter() is lazy, so each entry is checked right before it is relabeled.
+    for entry in filter(_is_infrastructure_error, results):
+        entry.set_label(Result.Label.INFRA)
+        entry.status = Result.StatusExtended.SKIPPED
+        count += 1
+    if count:
+        print(f"Marked {count} test result(s) as infrastructure errors")
+    return count
+
 
 def get_broken_tests_rules(broken_tests_file_path: str) -> dict:
     if (
@@ -520,6 +562,7 @@ def main():
                 )
                 break
         test_results.extend(test_result_parallel.results)
+        _mark_infrastructure_errors(test_result_parallel.results)
         failed_test_cases.extend(
             [t.name for t in test_result_parallel.results if t.is_failure()]
         )
@@ -551,6 +594,7 @@ def main():
                 )
                 break
         test_results.extend(test_result_sequential.results)
+        _mark_infrastructure_errors(test_result_sequential.results)
         failed_test_cases.extend(
             [t.name for t in test_result_sequential.results if t.is_failure()]
         )
@@ -660,6 +704,16 @@ def main():
 
     R = Result.create_from(results=test_results, stopwatch=sw, files=attached_files)
 
+    # If all non-OK results are infrastructure errors, do not treat as a real failure
+    if has_error:
+        non_ok = [r for r in test_results if not r.is_ok()]
+        if non_ok and all(r.has_label(Result.Label.INFRA) for r in non_ok):
+            print(
+                "All failures are infrastructure errors - clearing error flag"
+            )
+            has_error = False
+            force_ok_exit = True
+
     if has_error:
         R.set_error().set_info("\n".join(error_info))
 

diff --git a/ci/praktika/result.py b/ci/praktika/result.py
@@ -65,6 +65,7 @@ class Label:
         FAILED_ON_RETRY = "retry_failed"
         BLOCKER = "blocker"
         ISSUE = "issue"
+        INFRA = "infra"
 
     name: str
     status: str

diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py
@@ -3381,7 +3381,7 @@ def logging_pulling_images(**kwargs):
                         "Got exception pulling images: %s", kwargs["exception"]
                     )
 
-            retry(log_function=logging_pulling_images, retries=3, delay=8, jitter=8)(run_and_check, images_pull_cmd, nothrow=True, timeout=600)
+            retry(log_function=logging_pulling_images, retries=3, delay=8, jitter=8)(run_and_check, images_pull_cmd, timeout=180)
 
             if self.with_zookeeper_secure and self.base_zookeeper_cmd:
                 logging.debug("Setup ZooKeeper Secure")
@@ -3826,6 +3826,10 @@ def logging_azurite_initialization(exception, retry_number, sleep_time):
                 self.wait_ytsaurus_to_start()
 
             if self.with_letsencrypt_pebble and self.base_letsencrypt_pebble_cmd:
+                letsencrypt_pebble_pull_cmd = self.base_letsencrypt_pebble_cmd + ["pull"]
+                retry(log_function=logging_pulling_images, retries=3, delay=8, jitter=8)(
+                    run_and_check, letsencrypt_pebble_pull_cmd, timeout=180
+                )
                 letsencrypt_pebble_start_cmd = self.base_letsencrypt_pebble_cmd + common_opts
                 run_and_check(letsencrypt_pebble_start_cmd)
                 self.wait_letsencrypt_pebble_to_start()