ci/lava: Introduce unretriable exception handling

This commit refactors the exception hierarchy to differentiate between
retriable and fatal errors in the CI pipeline, specifically within the
LAVA job submission process.

A new base class, `MesaCIRetriableException`, is introduced for
exceptions that should trigger a retry of the CI job, while
`MesaCIFatalException` is added for non-recoverable errors that halt the
process immediately. The existing timeout, retry, parse and known-issue
exceptions now derive from `MesaCIRetriableException`.

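Additionally, the logic for deciding whether a job should be retried is
updated: `execute_job_with_retries` now stops retrying as soon as a
job's exception is not an instance of `MesaCIRetriableException`. To
support that check, `LAVAJob.exception` keeps the exception object
itself instead of its `repr()` string. This improves the robustness and
reliability of the CI job execution strategy.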

Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28778>
This commit is contained in:
Guilherme Gallo
2024-04-11 18:47:51 -03:00
committed by Marge Bot
parent 5363874676
commit 3e33171471
3 changed files with 35 additions and 14 deletions

View File

@@ -5,24 +5,36 @@ class MesaCIException(Exception):
pass
class MesaCITimeoutError(MesaCIException):
class MesaCIRetriableException(MesaCIException):
pass
class MesaCITimeoutError(MesaCIRetriableException):
def __init__(self, *args, timeout_duration: timedelta) -> None:
super().__init__(*args)
self.timeout_duration = timeout_duration
class MesaCIRetryError(MesaCIException):
class MesaCIRetryError(MesaCIRetriableException):
def __init__(self, *args, retry_count: int, last_job: None) -> None:
super().__init__(*args)
self.retry_count = retry_count
self.last_job = last_job
class MesaCIParseException(MesaCIException):
class MesaCIFatalException(MesaCIException):
"""Exception raised when the Mesa CI script encounters a fatal error that
prevents the script from continuing."""
def __init__(self, *args) -> None:
super().__init__(*args)
class MesaCIParseException(MesaCIRetriableException):
pass
class MesaCIKnownIssueException(MesaCIException):
class MesaCIKnownIssueException(MesaCIRetriableException):
"""Exception raised when the Mesa CI script finds something in the logs that
is known to cause the LAVA job to eventually fail"""

View File

@@ -25,6 +25,8 @@ from lavacli.utils import flow_yaml as lava_yaml
from lava.exceptions import (
MesaCIException,
MesaCIFatalException,
MesaCIRetriableException,
MesaCIParseException,
MesaCIRetryError,
MesaCITimeoutError,
@@ -85,14 +87,14 @@ NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(
def raise_exception_from_metadata(metadata: dict, job_id: int) -> None:
"""
Investigate infrastructure errors from the job metadata.
If it finds an error, raise it as MesaCIException.
If it finds an error, raise it as MesaCIRetriableException.
"""
if "result" not in metadata or metadata["result"] != "fail":
return
if "error_type" in metadata:
error_type = metadata["error_type"]
if error_type == "Infrastructure":
raise MesaCIException(
raise MesaCIRetriableException(
f"LAVA job {job_id} failed with Infrastructure Error. Retry."
)
if error_type == "Job":
@@ -100,12 +102,12 @@ def raise_exception_from_metadata(metadata: dict, job_id: int) -> None:
# with mal-formed job definitions. As we are always validating the
# jobs, only the former is probable to happen. E.g.: When some LAVA
# action timed out more times than expected in job definition.
raise MesaCIException(
raise MesaCIRetriableException(
f"LAVA job {job_id} failed with JobError "
"(possible LAVA timeout misconfiguration/bug). Retry."
)
if "case" in metadata and metadata["case"] == "validate":
raise MesaCIException(
raise MesaCIRetriableException(
f"LAVA job {job_id} failed validation (possible download error). Retry."
)
@@ -214,7 +216,7 @@ def submit_job(job):
try:
job.submit()
except Exception as mesa_ci_err:
raise MesaCIException(
raise MesaCIRetriableException(
f"Could not submit LAVA job. Reason: {mesa_ci_err}"
) from mesa_ci_err
@@ -316,6 +318,8 @@ def execute_job_with_retries(
f"Finished executing LAVA job in the attempt #{attempt_no}"
f"{CONSOLE_LOG['RESET']}"
)
if job.exception and not isinstance(job.exception, MesaCIRetriableException):
break
return last_failed_job

View File

@@ -6,6 +6,7 @@ from typing import Any, Optional
from lava.exceptions import (
MesaCIException,
MesaCIRetriableException,
MesaCIKnownIssueException,
MesaCIParseException,
MesaCITimeoutError,
@@ -34,7 +35,7 @@ class LAVAJob:
self._is_finished = False
self.log: dict[str, Any] = log
self.status = "not_submitted"
self.__exception: Optional[str] = None
self.__exception: Optional[Exception] = None
def heartbeat(self) -> None:
self.last_log_time: datetime = datetime.now()
@@ -63,13 +64,13 @@ class LAVAJob:
return self._is_finished
@property
def exception(self) -> str:
def exception(self) -> Optional[Exception]:
return self.__exception
@exception.setter
def exception(self, exception: Exception) -> None:
self.__exception = repr(exception)
self.log["dut_job_fail_reason"] = self.__exception
self.__exception = exception
self.log["dut_job_fail_reason"] = repr(self.__exception)
def validate(self) -> Optional[dict]:
"""Returns a dict with errors, if the validation fails.
@@ -176,11 +177,15 @@ class LAVAJob:
self.status = "canceled"
elif isinstance(exception, MesaCITimeoutError):
self.status = "hung"
elif isinstance(exception, MesaCIException):
elif isinstance(exception, MesaCIRetriableException):
self.status = "failed"
elif isinstance(exception, KeyboardInterrupt):
self.status = "interrupted"
print_log("LAVA job submitter was interrupted. Cancelling the job.")
raise
elif isinstance(exception, MesaCIException):
self.status = "interrupted"
print_log("LAVA job submitter was interrupted. Cancelling the job.")
raise
else:
self.status = "job_submitter_error"