diff --git a/alts/worker/runners/opennebula.py b/alts/worker/runners/opennebula.py index dc513ab..c95f940 100644 --- a/alts/worker/runners/opennebula.py +++ b/alts/worker/runners/opennebula.py @@ -422,6 +422,54 @@ def check_limit(used, limit, required=1): return True + def find_vm_by_name(self, vm_name: str) -> Optional[int]: + """ + Find VM ID by VM name using OpenNebula API. + + Parameters + ---------- + vm_name : str + Name of the VM to search for. + + Returns + ------- + Optional[int] + VM ID if found, None otherwise. + """ + try: + vmpool = self.opennebula_client.vmpool.info(-1, -1, -1, -1) + for vm in vmpool.VM: + if vm.NAME == vm_name: + self._logger.info('Found VM "%s" with ID %d', vm_name, vm.ID) + return vm.ID + self._logger.warning('VM with name "%s" not found in vmpool', vm_name) + except Exception as e: + self._logger.error('Failed to search for VM by name "%s": %s', vm_name, e) + return None + + def destroy_vm_by_name(self, vm_name: str) -> bool: + """ + Destroy VM by name. This method finds the VM by name and then destroys it. + + Note: OpenNebula API requires VM ID for deletion, so this method must + query for the ID first before deletion. + + Parameters + ---------- + vm_name : str + Name of the VM to destroy. + + Returns + ------- + bool + True if VM was found and destroyed, False otherwise. + """ + vm_id = self.find_vm_by_name(vm_name) + if vm_id is None: + return False + self.destroy_vm_via_api(vm_id) + return True + def destroy_vm_via_api(self, vm_id: int): def vm_info(): return self.opennebula_client.vm.info(vm_id) @@ -473,12 +521,11 @@ def _stop_env(self): self._logger.warning(err_msg) return 0, err_msg, '' if self.start_env_failed: - err_msg = ( - 'VM is not created because start environment step failed' + self._logger.warning( + 'Start environment step failed, but VM may have been ' + 'partially created. Attempting cleanup.', ) - self._logger.warning(err_msg) - return 0, err_msg, '' - if self.vm_alive: + elif self.vm_alive: return 0, "WARNING: VM won't be destroyed because vm_alive=True was given", "" stop_exit_code, stop_out, stop_err = super()._stop_env() if stop_exit_code == 0: @@ -496,7 +543,24 @@ def _stop_env(self): ) self._logger.debug('VM ID: %s', vm_id) if id_exit_code != 0 or not vm_id: - self._logger.warning('Cannot get VM ID: %s', id_stderr) - return id_exit_code, 'Cannot get VM ID', id_stderr - self.destroy_vm_via_api(int(vm_id.strip())) - return 0, f'{vm_id} is destroyed via API', '' + self._logger.warning( + 'Cannot get VM ID: %s. Attempting to destroy VM by name', + id_stderr + ) + if self.destroy_vm_by_name(self.env_name): + return 0, f'VM "{self.env_name}" is destroyed via API (found by name)', '' + return id_exit_code, 'Cannot get VM ID and VM not found by name', id_stderr + try: + parsed_vm_id = int(vm_id.strip()) + except ValueError: + # Terraform may return warning text (e.g. no outputs in state) + # instead of an integer VM id on partially initialized envs. + self._logger.warning( + 'Unexpected VM ID output: %s. Attempting to destroy VM by name', + vm_id + ) + if self.destroy_vm_by_name(self.env_name): + return 0, f'VM "{self.env_name}" is destroyed via API (found by name)', '' + return 1, 'Cannot parse VM ID and VM not found by name', str(vm_id) + self.destroy_vm_via_api(parsed_vm_id) + return 0, f'{parsed_vm_id} is destroyed via API', '' diff --git a/alts/worker/tasks.py b/alts/worker/tasks.py index 8bcee8a..d9cd7f2 100644 --- a/alts/worker/tasks.py +++ b/alts/worker/tasks.py @@ -58,6 +58,8 @@ ConnectTimeout, TimeoutError, OpenNebulaQuotaExceededError, + StartEnvironmentError, + TerraformInitializationError, ) @@ -196,6 +198,7 @@ def run_tests(self, task_params: dict): Result summary of a test execution. """ aborted = False + retry_exc = None def is_success(stage_data_: dict): tap_result = are_tap_tests_success(stage_data_.get('stdout', '')) @@ -296,9 +299,17 @@ def set_artifacts_when_stage_has_unexpected_exception( 'stderr': traceback.format_exc() } except TerraformInitializationError as exc: - logging.exception('Cannot initialize terraform: %s', exc) + logging.exception( + 'Cannot initialize terraform for task %s, scheduling retry: %s', + task_params['task_id'], exc, + ) + retry_exc = exc except StartEnvironmentError as exc: - logging.exception('Cannot start environment: %s', exc) + logging.exception( + 'Cannot start environment for task %s, scheduling retry: %s', + task_params['task_id'], exc, + ) + retry_exc = exc except ProvisionError as exc: logging.exception('Cannot run initial provision: %s', exc) except InstallPackageError as exc: @@ -320,9 +331,9 @@ def set_artifacts_when_stage_has_unexpected_exception( except OpenNebulaQuotaExceededError as exc: logging.warning( 'OpenNebula quota exceeded for task %s, scheduling retry: %s', - task_params['task_id'], exc + task_params['task_id'], exc, ) - raise + retry_exc = exc except Exception as exc: logging.exception('Unexpected exception: %s', exc) set_artifacts_when_stage_has_unexpected_exception( @@ -332,6 +343,9 @@ def set_artifacts_when_stage_has_unexpected_exception( ) finally: runner.teardown() + if retry_exc is not None and self.request.retries < self.max_retries: + if isinstance(runner, OpennebulaRunner): + raise retry_exc summary = defaultdict(dict) if aborted: summary['revoked'] = True