diff --git a/ci/jobs/integration_test_job.py b/ci/jobs/integration_test_job.py
index dd94e5f187d4..bc4f8ed3fd01 100644
--- a/ci/jobs/integration_test_job.py
+++ b/ci/jobs/integration_test_job.py
@@ -24,6 +24,56 @@
 MAX_CPUS_PER_WORKER = 5
 MAX_MEM_PER_WORKER = 11
 
+# Substrings that, when found in the info text of an ERROR result, mark that
+# result as caused by infrastructure issues rather than by the test itself.
+INFRASTRUCTURE_ERROR_PATTERNS = [
+    "timed out after",
+    "TimeoutExpired",
+    "Cannot connect to the Docker daemon",
+    "Error response from daemon",
+    "Name or service not known",
+    "Temporary failure in name resolution",
+    "Network is unreachable",
+    "Connection reset by peer",
+    "No space left on device",
+    "Cannot allocate memory",
+    "OCI runtime create failed",
+    "toomanyrequests",
+    "pull access denied",
+    "Got exception pulling images:",  # docker pull failure during cluster.start()
+]
+
+
+def _is_infrastructure_error(result: Result) -> bool:
+    """Returns True if the result is an ERROR caused by infrastructure issues.
+
+    A result qualifies when its status is ERROR and its info text contains at
+    least one of INFRASTRUCTURE_ERROR_PATTERNS (plain substring match).
+    """
+    if result.status not in (Result.Status.ERROR, Result.StatusExtended.ERROR):
+        return False
+    if not result.info:
+        return False
+    return any(pattern in result.info for pattern in INFRASTRUCTURE_ERROR_PATTERNS)
+
+
+def _mark_infrastructure_errors(results: list) -> int:
+    """Scan results, label infrastructure errors with INFRA and change their status to SKIPPED.
+
+    Results are modified in place.
+
+    Returns the number of results that were relabeled.
+    """
+    count = 0
+    for r in results:
+        if _is_infrastructure_error(r):
+            r.set_label(Result.Label.INFRA)
+            r.status = Result.StatusExtended.SKIPPED
+            count += 1
+    if count:
+        print(f"Marked {count} test result(s) as infrastructure errors")
+    return count
+
 
 def get_broken_tests_rules(broken_tests_file_path: str) -> dict:
     if (
@@ -520,6 +570,7 @@ def main():
                 )
                 break
         test_results.extend(test_result_parallel.results)
+        _mark_infrastructure_errors(test_result_parallel.results)
         failed_test_cases.extend(
             [t.name for t in test_result_parallel.results if t.is_failure()]
         )
@@ -551,6 +602,7 @@ def main():
                 )
                 break
         test_results.extend(test_result_sequential.results)
+        _mark_infrastructure_errors(test_result_sequential.results)
         failed_test_cases.extend(
             [t.name for t in test_result_sequential.results if t.is_failure()]
         )
@@ -660,6 +712,23 @@ def main():
 
     R = Result.create_from(results=test_results, stopwatch=sw, files=attached_files)
 
+    # If every failure is an infrastructure error, do not treat the run as a real
+    # failure. Relabelled results already carry status SKIPPED (which is_ok() may
+    # count as OK), so they are selected by their INFRA label, not by is_ok().
+    if has_error:
+        infra_errors = [r for r in test_results if r.has_label(Result.Label.INFRA)]
+        real_failures = [
+            r
+            for r in test_results
+            if not r.is_ok() and not r.has_label(Result.Label.INFRA)
+        ]
+        if infra_errors and not real_failures:
+            print(
+                "All failures are infrastructure errors - clearing error flag"
+            )
+            has_error = False
+            force_ok_exit = True
+
     if has_error:
         R.set_error().set_info("\n".join(error_info))
 
diff --git a/ci/praktika/result.py b/ci/praktika/result.py
index 0ebde2ae40cb..e538d6d7e39f 100644
--- a/ci/praktika/result.py
+++ b/ci/praktika/result.py
@@ -65,6 +65,7 @@ class Label:
         FAILED_ON_RETRY = "retry_failed"
         BLOCKER = "blocker"
         ISSUE = "issue"
+        INFRA = "infra"
 
     name: str
     status: str
diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py
index 9d8065b12d7c..44dc22bc3ffc 100644
--- a/tests/integration/helpers/cluster.py
+++ b/tests/integration/helpers/cluster.py
@@ -3381,7 +3381,7 @@ def logging_pulling_images(**kwargs):
                         "Got exception pulling images: %s", kwargs["exception"]
                     )
 
-            retry(log_function=logging_pulling_images, retries=3, delay=8, jitter=8)(run_and_check, images_pull_cmd, nothrow=True, timeout=600)
+            retry(log_function=logging_pulling_images, retries=3, delay=8, jitter=8)(run_and_check, images_pull_cmd, timeout=180)
 
             if self.with_zookeeper_secure and self.base_zookeeper_cmd:
                 logging.debug("Setup ZooKeeper Secure")
@@ -3826,6 +3826,10 @@ def logging_azurite_initialization(exception, retry_number, sleep_time):
                 self.wait_ytsaurus_to_start()
 
             if self.with_letsencrypt_pebble and self.base_letsencrypt_pebble_cmd:
+                letsencrypt_pebble_pull_cmd = self.base_letsencrypt_pebble_cmd + ["pull"]
+                retry(log_function=logging_pulling_images, retries=3, delay=8, jitter=8)(
+                    run_and_check, letsencrypt_pebble_pull_cmd, timeout=180
+                )
                 letsencrypt_pebble_start_cmd = self.base_letsencrypt_pebble_cmd + common_opts
                 run_and_check(letsencrypt_pebble_start_cmd)
                 self.wait_letsencrypt_pebble_to_start()