Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions ci/jobs/integration_test_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,48 @@
MAX_CPUS_PER_WORKER = 5
MAX_MEM_PER_WORKER = 11

# Substrings searched for in the info text of an ERROR result; a match on any
# one of them classifies the error as infrastructure-related
# (see _is_infrastructure_error).
INFRASTRUCTURE_ERROR_PATTERNS = [
    # Timeouts
    "timed out after",
    "TimeoutExpired",
    # Docker daemon
    "Cannot connect to the Docker daemon",
    "Error response from daemon",
    # Name resolution / network
    "Name or service not known",
    "Temporary failure in name resolution",
    "Network is unreachable",
    "Connection reset by peer",
    # Host resources
    "No space left on device",
    "Cannot allocate memory",
    # Container runtime
    "OCI runtime create failed",
    # Image pulling
    "toomanyrequests",
    "pull access denied",
    "Got exception pulling images:",  # docker pull failure during cluster.start()
]


def _is_infrastructure_error(result: Result) -> bool:
    """Tell whether ``result`` is an ERROR that matches a known infrastructure pattern.

    A result qualifies only when all of the following hold:
      * its status is ``Result.Status.ERROR`` or ``Result.StatusExtended.ERROR``;
      * its ``info`` is non-empty;
      * at least one entry of ``INFRASTRUCTURE_ERROR_PATTERNS`` occurs as a
        substring of ``info``.

    Returns:
        True when every condition above is met, False otherwise.
    """
    error_statuses = (Result.Status.ERROR, Result.StatusExtended.ERROR)
    if result.status in error_statuses and result.info:
        # Plain substring search; the first matching pattern settles it.
        for pattern in INFRASTRUCTURE_ERROR_PATTERNS:
            if pattern in result.info:
                return True
    return False


def _mark_infrastructure_errors(results: list) -> int:
    """Relabel infrastructure errors found in ``results``, modifying them in place.

    Every result for which ``_is_infrastructure_error`` returns True gets the
    ``Result.Label.INFRA`` label and has its status replaced with
    ``Result.StatusExtended.SKIPPED``. A summary line is printed when at least
    one result was relabeled.

    Returns:
        The number of results that were relabeled.
    """
    relabeled = 0
    for entry in results:
        # Each entry is checked and updated before moving on to the next one.
        if not _is_infrastructure_error(entry):
            continue
        entry.set_label(Result.Label.INFRA)
        entry.status = Result.StatusExtended.SKIPPED
        relabeled += 1

    if relabeled > 0:
        print(f"Marked {relabeled} test result(s) as infrastructure errors")
    return relabeled


def get_broken_tests_rules(broken_tests_file_path: str) -> dict:
if (
Expand Down Expand Up @@ -520,6 +562,7 @@ def main():
)
break
test_results.extend(test_result_parallel.results)
_mark_infrastructure_errors(test_result_parallel.results)
failed_test_cases.extend(
[t.name for t in test_result_parallel.results if t.is_failure()]
)
Expand Down Expand Up @@ -551,6 +594,7 @@ def main():
)
break
test_results.extend(test_result_sequential.results)
_mark_infrastructure_errors(test_result_sequential.results)
failed_test_cases.extend(
[t.name for t in test_result_sequential.results if t.is_failure()]
)
Expand Down Expand Up @@ -660,6 +704,16 @@ def main():

R = Result.create_from(results=test_results, stopwatch=sw, files=attached_files)

# If all non-OK results are infrastructure errors, do not treat as a real failure
if has_error:
non_ok = [r for r in test_results if not r.is_ok()]
if non_ok and all(r.has_label(Result.Label.INFRA) for r in non_ok):
print(
"All failures are infrastructure errors - clearing error flag"
)
has_error = False
force_ok_exit = True

if has_error:
R.set_error().set_info("\n".join(error_info))

Expand Down
1 change: 1 addition & 0 deletions ci/praktika/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class Label:
FAILED_ON_RETRY = "retry_failed"
BLOCKER = "blocker"
ISSUE = "issue"
INFRA = "infra"

name: str
status: str
Expand Down
6 changes: 5 additions & 1 deletion tests/integration/helpers/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -3381,7 +3381,7 @@ def logging_pulling_images(**kwargs):
"Got exception pulling images: %s", kwargs["exception"]
)

retry(log_function=logging_pulling_images, retries=3, delay=8, jitter=8)(run_and_check, images_pull_cmd, nothrow=True, timeout=600)
retry(log_function=logging_pulling_images, retries=3, delay=8, jitter=8)(run_and_check, images_pull_cmd, timeout=180)

if self.with_zookeeper_secure and self.base_zookeeper_cmd:
logging.debug("Setup ZooKeeper Secure")
Expand Down Expand Up @@ -3826,6 +3826,10 @@ def logging_azurite_initialization(exception, retry_number, sleep_time):
self.wait_ytsaurus_to_start()

if self.with_letsencrypt_pebble and self.base_letsencrypt_pebble_cmd:
letsencrypt_pebble_pull_cmd = self.base_letsencrypt_pebble_cmd + ["pull"]
retry(log_function=logging_pulling_images, retries=3, delay=8, jitter=8)(
run_and_check, letsencrypt_pebble_pull_cmd, timeout=180
)
letsencrypt_pebble_start_cmd = self.base_letsencrypt_pebble_cmd + common_opts
run_and_check(letsencrypt_pebble_start_cmd)
self.wait_letsencrypt_pebble_to_start()
Expand Down
Loading