From 9e08162da076c5c80289a85786943f11ed3442b4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 2 Apr 2026 13:45:57 +0000 Subject: [PATCH 1/2] Initial plan From ecd20b082c0231901ac2c29ad476d38ddde265ce Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 2 Apr 2026 13:53:29 +0000 Subject: [PATCH 2/2] Fix CDR workflow flakiness: retry Fleet API, None guards, artifact checks, unique GCP names Agent-Logs-Url: https://github.com/elastic/cloudbeat/sessions/9d369ca2-51a6-4ff9-ad51-9f61348b182b Co-authored-by: romulets <5350001+romulets@users.noreply.github.com> --- .github/actions/cdr/action.yml | 18 +++-- tests/fleet_api/base_call_api.py | 72 ++++++++++++++++--- ...install_aws_asset_inventory_integration.py | 3 + ...stall_azure_asset_inventory_integration.py | 3 + ...install_gcp_asset_inventory_integration.py | 3 + 5 files changed, 86 insertions(+), 13 deletions(-) diff --git a/.github/actions/cdr/action.yml b/.github/actions/cdr/action.yml index 0167335a69..6fb56da1d3 100644 --- a/.github/actions/cdr/action.yml +++ b/.github/actions/cdr/action.yml @@ -466,12 +466,18 @@ runs: - name: Deploy Azure Asset Inventory agent id: azure-asset-inventory-agent working-directory: deploy/azure - if: ${{ !cancelled() && steps.asset-inventory-version-check.outputs.asset_inventory_supported == 'true' }} + if: ${{ !cancelled() && steps.azure-asset-inventory-integration.outcome == 'success' }} shell: bash env: AZURE_TAGS: ${{ inputs.azure-tags }} DEPLOYMENT_NAME: "${{ inputs.deployment-name }}-inventory" - run: DEPLOYMENT_NAME="$DEPLOYMENT_NAME-$(openssl rand -hex 3)" ./install_agent_az_cli.sh + STACK_VERSION: ${{ inputs.elk-stack-version }} + run: | + if [ ! -f arm_parameters.json ]; then + echo "ERROR: arm_parameters.json not found in $(pwd). Azure installer may have failed." >&2 + exit 1 + fi + DEPLOYMENT_NAME="$DEPLOYMENT_NAME-$(openssl rand -hex 3)" ./install_agent_az_cli.sh - name: Install AWS Asset Inventory integration id: aws-asset-inventory @@ -487,7 +493,7 @@ runs: poetry run python ./install_aws_asset_inventory_integration.py - name: Deploy AWS Asset Inventory agent - if: ${{ !cancelled() && steps.asset-inventory-version-check.outputs.asset_inventory_supported == 'true' }} + if: ${{ !cancelled() && steps.aws-asset-inventory.outcome == 'success' }} working-directory: deploy/test-environments/cdr shell: bash env: @@ -496,6 +502,10 @@ runs: run: | scriptname="aws-asset-inventory-linux.sh" src="../../../tests/integrations_setup/$scriptname" + if [ ! -f "$src" ]; then + echo "ERROR: $src not found. AWS asset inventory installer may have failed." >&2 + exit 1 + fi cmd="chmod +x $scriptname && ./$scriptname" ../remote_setup.sh -k "$EC2_ASSET_INV_KEY" -s "$src" -h "$ASSET_INV_PUBLIC_IP" -d "~/$scriptname" -c "$cmd" @@ -506,7 +516,7 @@ runs: working-directory: deploy/deployment-manager env: SERVICE_ACCOUNT_NAME: "${{ inputs.deployment-name }}-sa" - DEPLOYMENT_NAME: "${{ inputs.deployment-name }}-acc" + DEPLOYMENT_NAME: "${{ inputs.deployment-name }}-acc-${{ github.run_id }}" run: | # Deploys a GCP Service Account ./deploy_service_account.sh diff --git a/tests/fleet_api/base_call_api.py b/tests/fleet_api/base_call_api.py index 9bf4c746b9..fbeb5130da 100644 --- a/tests/fleet_api/base_call_api.py +++ b/tests/fleet_api/base_call_api.py @@ -10,7 +10,18 @@ - requests: Library for making HTTP requests """ +import random +import time + import requests +from loguru import logger + +TRANSIENT_STATUSES = {502, 503, 504} +TRANSIENT_EXCEPTIONS = ( + requests.exceptions.ConnectionError, + requests.exceptions.Timeout, + requests.exceptions.ChunkedEncodingError, +) class APICallException(Exception): @@ -42,6 +53,9 @@ def perform_api_call( auth=None, params=None, ok_statuses=None, + retries=8, + retry_backoff_sec=2.0, + retry_max_sleep_sec=60.0, ): """ Perform an API call using the provided parameters. @@ -59,6 +73,9 @@ def perform_api_call( params (dict, optional): The parameters to be included in the API request. Defaults to None. ok_statuses (tuple, optional): HTTP status codes treated as success. Defaults to (200,). + retries (int, optional): Number of retry attempts on transient failures. Defaults to 8. + retry_backoff_sec (float, optional): Base backoff seconds for exponential delay. Defaults to 2.0. + retry_max_sleep_sec (float, optional): Maximum sleep seconds between retries. Defaults to 60.0. Returns: dict or bytes: Parsed JSON (empty dict for 204 or empty body), or raw content. @@ -78,15 +95,52 @@ def perform_api_call( if ok_statuses is None: ok_statuses = (200,) - response = requests.request(method=method, url=url, headers=headers, auth=auth, **params) - if response.status_code not in ok_statuses: - raise APICallException(response.status_code, response.text) - - if not return_json: - return response.content - if response.status_code == 204 or not (response.content or b"").strip(): - return {} - return response.json() + attempt = 0 + while True: + attempt += 1 + try: + response = requests.request( + method=method, + url=url, + headers=headers, + auth=auth, + timeout=60, + **params, + ) + except TRANSIENT_EXCEPTIONS as exc: + if attempt <= retries: + sleep = _retry_sleep(attempt, retry_backoff_sec, retry_max_sleep_sec) + logger.warning( + f"Transient request error on attempt {attempt}/{retries} ({exc}). " + f"Retrying in {sleep:.1f}s..." + ) + time.sleep(sleep) + continue + raise + + if response.status_code not in ok_statuses: + if response.status_code in TRANSIENT_STATUSES and attempt <= retries: + sleep = _retry_sleep(attempt, retry_backoff_sec, retry_max_sleep_sec) + logger.warning( + f"Transient HTTP {response.status_code} on attempt {attempt}/{retries}. " + f"Retrying in {sleep:.1f}s... Response: {response.text[:200]}" + ) + time.sleep(sleep) + continue + raise APICallException(response.status_code, response.text) + + if not return_json: + return response.content + if response.status_code == 204 or not (response.content or b"").strip(): + return {} + return response.json() + + +def _retry_sleep(attempt: int, base: float, maximum: float) -> float: + """Return a jittered exponential backoff sleep duration, capped at maximum.""" + sleep = base * (2 ** (attempt - 1)) + sleep = sleep * (0.75 + random.random() * 0.5) + return min(maximum, sleep) def uses_new_fleet_api_response(version: str) -> bool: diff --git a/tests/integrations_setup/install_aws_asset_inventory_integration.py b/tests/integrations_setup/install_aws_asset_inventory_integration.py index c76e61412b..2e91738b38 100755 --- a/tests/integrations_setup/install_aws_asset_inventory_integration.py +++ b/tests/integrations_setup/install_aws_asset_inventory_integration.py @@ -53,6 +53,9 @@ if __name__ == "__main__": # pylint: disable=duplicate-code package_version = get_package_version(cfg=cnfg.elk_config, package_name="cloud_asset_inventory") + if not package_version: + logger.error(f"Could not resolve cloud_asset_inventory package version from Fleet (Kibana may be unavailable).") + raise SystemExit(1) logger.info(f"Package version: {package_version}") if not version_compatible( current_version=package_version, diff --git a/tests/integrations_setup/install_azure_asset_inventory_integration.py b/tests/integrations_setup/install_azure_asset_inventory_integration.py index d5a48384d7..2bb0a3f1aa 100755 --- a/tests/integrations_setup/install_azure_asset_inventory_integration.py +++ b/tests/integrations_setup/install_azure_asset_inventory_integration.py @@ -63,6 +63,9 @@ if __name__ == "__main__": # pylint: disable=duplicate-code package_version = get_package_version(cfg=cnfg.elk_config, package_name=PACKAGE_NAME) + if not package_version: + logger.error(f"Could not resolve {PACKAGE_NAME} package version from Fleet (Kibana may be unavailable).") + raise SystemExit(1) logger.info(f"Package version: {package_version}") if not version_compatible( current_version=package_version, diff --git a/tests/integrations_setup/install_gcp_asset_inventory_integration.py b/tests/integrations_setup/install_gcp_asset_inventory_integration.py index 4f6ce9bf75..085a240a8d 100755 --- a/tests/integrations_setup/install_gcp_asset_inventory_integration.py +++ b/tests/integrations_setup/install_gcp_asset_inventory_integration.py @@ -51,6 +51,9 @@ if __name__ == "__main__": # pylint: disable=duplicate-code package_version = get_package_version(cfg=cnfg.elk_config, package_name="cloud_asset_inventory") + if not package_version: + logger.error("Could not resolve cloud_asset_inventory package version from Fleet (Kibana may be unavailable).") + raise SystemExit(1) if not version_compatible( current_version=package_version, required_version=PKG_DEFAULT_VERSION,