From 3e33171471000958fd711265df33b4b2d2648d8e Mon Sep 17 00:00:00 2001 From: Guilherme Gallo Date: Thu, 11 Apr 2024 18:47:51 -0300 Subject: [PATCH] ci/lava: Introduce unretriable exception handling This commit refactors the exception hierarchy to differentiate between retriable and fatal errors in the CI pipeline, specifically within the LAVA job submission process. A new base class, `MesaCIRetriableException`, is introduced for exceptions that should trigger a retry of the CI job, while `MesaCIFatalException` is added for non-recoverable errors that halt the process immediately. Additionally, the logic for deciding whether a job should be retried or not is updated to check for instances of `MesaCIRetriableException`, improving the robustness and reliability of the CI job execution strategy. Signed-off-by: Guilherme Gallo Part-of: --- .gitlab-ci/lava/exceptions.py | 20 ++++++++++++++++---- .gitlab-ci/lava/lava_job_submitter.py | 14 +++++++++----- .gitlab-ci/lava/utils/lava_job.py | 15 ++++++++++----- 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/.gitlab-ci/lava/exceptions.py b/.gitlab-ci/lava/exceptions.py index f877b024510..5066d337507 100644 --- a/.gitlab-ci/lava/exceptions.py +++ b/.gitlab-ci/lava/exceptions.py @@ -5,24 +5,36 @@ class MesaCIException(Exception): pass -class MesaCITimeoutError(MesaCIException): +class MesaCIRetriableException(MesaCIException): + pass + + +class MesaCITimeoutError(MesaCIRetriableException): def __init__(self, *args, timeout_duration: timedelta) -> None: super().__init__(*args) self.timeout_duration = timeout_duration -class MesaCIRetryError(MesaCIException): +class MesaCIRetryError(MesaCIRetriableException): def __init__(self, *args, retry_count: int, last_job: None) -> None: super().__init__(*args) self.retry_count = retry_count self.last_job = last_job -class MesaCIParseException(MesaCIException): +class MesaCIFatalException(MesaCIException): + """Exception raised when the Mesa CI script encounters a fatal error that + prevents the script from continuing.""" + + def __init__(self, *args) -> None: + super().__init__(*args) + + +class MesaCIParseException(MesaCIRetriableException): pass -class MesaCIKnownIssueException(MesaCIException): +class MesaCIKnownIssueException(MesaCIRetriableException): """Exception raised when the Mesa CI script finds something in the logs that is known to cause the LAVA job to eventually fail""" diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py index 82867f1a3bc..4bc0628433c 100755 --- a/.gitlab-ci/lava/lava_job_submitter.py +++ b/.gitlab-ci/lava/lava_job_submitter.py @@ -25,6 +25,8 @@ from lavacli.utils import flow_yaml as lava_yaml from lava.exceptions import ( MesaCIException, + MesaCIFatalException, + MesaCIRetriableException, MesaCIParseException, MesaCIRetryError, MesaCITimeoutError, @@ -85,14 +87,14 @@ NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int( def raise_exception_from_metadata(metadata: dict, job_id: int) -> None: """ Investigate infrastructure errors from the job metadata. - If it finds an error, raise it as MesaCIException. + If it finds an error, raise it as MesaCIRetriableException. """ if "result" not in metadata or metadata["result"] != "fail": return if "error_type" in metadata: error_type = metadata["error_type"] if error_type == "Infrastructure": - raise MesaCIException( + raise MesaCIRetriableException( f"LAVA job {job_id} failed with Infrastructure Error. Retry." ) if error_type == "Job": @@ -100,12 +102,12 @@ def raise_exception_from_metadata(metadata: dict, job_id: int) -> None: # with mal-formed job definitions. As we are always validating the # jobs, only the former is probable to happen. E.g.: When some LAVA # action timed out more times than expected in job definition. - raise MesaCIException( + raise MesaCIRetriableException( f"LAVA job {job_id} failed with JobError " "(possible LAVA timeout misconfiguration/bug). Retry." ) if "case" in metadata and metadata["case"] == "validate": - raise MesaCIException( + raise MesaCIRetriableException( f"LAVA job {job_id} failed validation (possible download error). Retry." ) @@ -214,7 +216,7 @@ def submit_job(job): try: job.submit() except Exception as mesa_ci_err: - raise MesaCIException( + raise MesaCIRetriableException( f"Could not submit LAVA job. Reason: {mesa_ci_err}" ) from mesa_ci_err @@ -316,6 +318,8 @@ def execute_job_with_retries( f"Finished executing LAVA job in the attempt #{attempt_no}" f"{CONSOLE_LOG['RESET']}" ) + if job.exception and not isinstance(job.exception, MesaCIRetriableException): + break return last_failed_job diff --git a/.gitlab-ci/lava/utils/lava_job.py b/.gitlab-ci/lava/utils/lava_job.py index b69f8b9fbb7..f05168dac2e 100644 --- a/.gitlab-ci/lava/utils/lava_job.py +++ b/.gitlab-ci/lava/utils/lava_job.py @@ -6,6 +6,7 @@ from typing import Any, Optional from lava.exceptions import ( MesaCIException, + MesaCIRetriableException, MesaCIKnownIssueException, MesaCIParseException, MesaCITimeoutError, @@ -34,7 +35,7 @@ class LAVAJob: self._is_finished = False self.log: dict[str, Any] = log self.status = "not_submitted" - self.__exception: Optional[str] = None + self.__exception: Optional[Exception] = None def heartbeat(self) -> None: self.last_log_time: datetime = datetime.now() @@ -63,13 +64,13 @@ class LAVAJob: return self._is_finished @property - def exception(self) -> str: + def exception(self) -> Optional[Exception]: return self.__exception @exception.setter def exception(self, exception: Exception) -> None: - self.__exception = repr(exception) - self.log["dut_job_fail_reason"] = self.__exception + self.__exception = exception + self.log["dut_job_fail_reason"] = repr(self.__exception) def validate(self) -> Optional[dict]: """Returns a dict with errors, if the validation fails. @@ -176,11 +177,15 @@ class LAVAJob: self.status = "canceled" elif isinstance(exception, MesaCITimeoutError): self.status = "hung" - elif isinstance(exception, MesaCIException): + elif isinstance(exception, MesaCIRetriableException): self.status = "failed" elif isinstance(exception, KeyboardInterrupt): self.status = "interrupted" print_log("LAVA job submitter was interrupted. Cancelling the job.") raise + elif isinstance(exception, MesaCIException): + self.status = "interrupted" + print_log("LAVA job submitter was interrupted. Cancelling the job.") + raise else: self.status = "job_submitter_error"