ci/lava: Introduce unretriable exception handling
This commit refactors the exception hierarchy to differentiate between retriable and fatal errors in the CI pipeline, specifically within the LAVA job submission process. A new base class, `MesaCIRetriableException`, is introduced for exceptions that should trigger a retry of the CI job, while `MesaCIFatalException` is added for non-recoverable errors that halt the process immediately. Additionally, the logic that decides whether a job should be retried now checks for instances of `MesaCIRetriableException`, improving the robustness and reliability of the CI job execution strategy.

Signed-off-by: Guilherme Gallo <guilherme.gallo@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28778>
This commit is contained in:
committed by
Marge Bot
parent
5363874676
commit
3e33171471
@@ -5,24 +5,36 @@ class MesaCIException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class MesaCITimeoutError(MesaCIException):
|
||||
class MesaCIRetriableException(MesaCIException):
    """Base class for Mesa CI errors after which the job may be retried.

    The retry loop checks a failed job's exception against this type:
    an exception that is not an instance of it stops further attempts.
    """
|
||||
|
||||
|
||||
class MesaCITimeoutError(MesaCIRetriableException):
    """Retriable error that records the timeout that was exceeded.

    Args:
        *args: Positional arguments forwarded unchanged to the parent
            exception (typically the error message).
        timeout_duration: Keyword-only; stored on the instance as
            ``timeout_duration``.
    """

    def __init__(self, *args, timeout_duration: timedelta) -> None:
        # The parent only receives the positional arguments; the timeout is
        # kept as a separate attribute for whoever handles the exception.
        super().__init__(*args)
        self.timeout_duration = timeout_duration
|
||||
|
||||
|
||||
class MesaCIRetryError(MesaCIException):
|
||||
class MesaCIRetryError(MesaCIRetriableException):
    """Retriable error that carries a retry counter and the last job.

    Args:
        *args: Positional arguments forwarded unchanged to the parent
            exception (typically the error message).
        retry_count: Keyword-only; stored on the instance as ``retry_count``.
        last_job: Keyword-only; stored on the instance as ``last_job``.
    """

    # NOTE(review): ``last_job`` is annotated as ``None``, which tells a type
    # checker that only ``None`` may be passed. Presumably callers pass a job
    # object here -- confirm against the call sites and widen the annotation.
    def __init__(self, *args, retry_count: int, last_job: None) -> None:
        super().__init__(*args)
        self.retry_count = retry_count
        self.last_job = last_job
|
||||
|
||||
|
||||
class MesaCIParseException(MesaCIException):
|
||||
class MesaCIFatalException(MesaCIException):
    """Fatal Mesa CI error that prevents the script from continuing.

    This class does not derive from ``MesaCIRetriableException``, so code
    that checks for the retriable base does not treat it as retriable.

    No ``__init__`` is defined: the previous override only forwarded
    ``*args`` to the parent constructor, which is what inheriting the
    constructor already does.
    """
|
||||
|
||||
|
||||
class MesaCIParseException(MesaCIRetriableException):
    """Retriable Mesa CI error type with no extra state of its own.

    Presumably raised when parsing job output fails -- confirm against the
    code that raises it.
    """
|
||||
|
||||
|
||||
class MesaCIKnownIssueException(MesaCIException):
|
||||
class MesaCIKnownIssueException(MesaCIRetriableException):
    """Retriable error for a known issue detected in the job logs.

    Raised when the Mesa CI script finds something in the logs that is
    known to cause the LAVA job to eventually fail.
    """
|
||||
|
||||
|
||||
@@ -25,6 +25,8 @@ from lavacli.utils import flow_yaml as lava_yaml
|
||||
|
||||
from lava.exceptions import (
|
||||
MesaCIException,
|
||||
MesaCIFatalException,
|
||||
MesaCIRetriableException,
|
||||
MesaCIParseException,
|
||||
MesaCIRetryError,
|
||||
MesaCITimeoutError,
|
||||
@@ -85,14 +87,14 @@ NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(
|
||||
def raise_exception_from_metadata(metadata: dict, job_id: int) -> None:
|
||||
"""
|
||||
Investigate infrastructure errors from the job metadata.
|
||||
If it finds an error, raise it as MesaCIException.
|
||||
If it finds an error, raise it as MesaCIRetriableException.
|
||||
"""
|
||||
if "result" not in metadata or metadata["result"] != "fail":
|
||||
return
|
||||
if "error_type" in metadata:
|
||||
error_type = metadata["error_type"]
|
||||
if error_type == "Infrastructure":
|
||||
raise MesaCIException(
|
||||
raise MesaCIRetriableException(
|
||||
f"LAVA job {job_id} failed with Infrastructure Error. Retry."
|
||||
)
|
||||
if error_type == "Job":
|
||||
@@ -100,12 +102,12 @@ def raise_exception_from_metadata(metadata: dict, job_id: int) -> None:
|
||||
# with mal-formed job definitions. As we are always validating the
|
||||
# jobs, only the former is probable to happen. E.g.: When some LAVA
|
||||
# action timed out more times than expected in job definition.
|
||||
raise MesaCIException(
|
||||
raise MesaCIRetriableException(
|
||||
f"LAVA job {job_id} failed with JobError "
|
||||
"(possible LAVA timeout misconfiguration/bug). Retry."
|
||||
)
|
||||
if "case" in metadata and metadata["case"] == "validate":
|
||||
raise MesaCIException(
|
||||
raise MesaCIRetriableException(
|
||||
f"LAVA job {job_id} failed validation (possible download error). Retry."
|
||||
)
|
||||
|
||||
@@ -214,7 +216,7 @@ def submit_job(job):
|
||||
try:
|
||||
job.submit()
|
||||
except Exception as mesa_ci_err:
|
||||
raise MesaCIException(
|
||||
raise MesaCIRetriableException(
|
||||
f"Could not submit LAVA job. Reason: {mesa_ci_err}"
|
||||
) from mesa_ci_err
|
||||
|
||||
@@ -316,6 +318,8 @@ def execute_job_with_retries(
|
||||
f"Finished executing LAVA job in the attempt #{attempt_no}"
|
||||
f"{CONSOLE_LOG['RESET']}"
|
||||
)
|
||||
if job.exception and not isinstance(job.exception, MesaCIRetriableException):
|
||||
break
|
||||
|
||||
return last_failed_job
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ from typing import Any, Optional
|
||||
|
||||
from lava.exceptions import (
|
||||
MesaCIException,
|
||||
MesaCIRetriableException,
|
||||
MesaCIKnownIssueException,
|
||||
MesaCIParseException,
|
||||
MesaCITimeoutError,
|
||||
@@ -34,7 +35,7 @@ class LAVAJob:
|
||||
self._is_finished = False
|
||||
self.log: dict[str, Any] = log
|
||||
self.status = "not_submitted"
|
||||
self.__exception: Optional[str] = None
|
||||
self.__exception: Optional[Exception] = None
|
||||
|
||||
def heartbeat(self) -> None:
|
||||
self.last_log_time: datetime = datetime.now()
|
||||
@@ -63,13 +64,13 @@ class LAVAJob:
|
||||
return self._is_finished
|
||||
|
||||
@property
|
||||
def exception(self) -> str:
|
||||
def exception(self) -> Optional[Exception]:
|
||||
return self.__exception
|
||||
|
||||
@exception.setter
|
||||
def exception(self, exception: Exception) -> None:
|
||||
self.__exception = repr(exception)
|
||||
self.log["dut_job_fail_reason"] = self.__exception
|
||||
self.__exception = exception
|
||||
self.log["dut_job_fail_reason"] = repr(self.__exception)
|
||||
|
||||
def validate(self) -> Optional[dict]:
|
||||
"""Returns a dict with errors, if the validation fails.
|
||||
@@ -176,11 +177,15 @@ class LAVAJob:
|
||||
self.status = "canceled"
|
||||
elif isinstance(exception, MesaCITimeoutError):
|
||||
self.status = "hung"
|
||||
elif isinstance(exception, MesaCIException):
|
||||
elif isinstance(exception, MesaCIRetriableException):
|
||||
self.status = "failed"
|
||||
elif isinstance(exception, KeyboardInterrupt):
|
||||
self.status = "interrupted"
|
||||
print_log("LAVA job submitter was interrupted. Cancelling the job.")
|
||||
raise
|
||||
elif isinstance(exception, MesaCIException):
|
||||
self.status = "interrupted"
|
||||
print_log("LAVA job submitter was interrupted. Cancelling the job.")
|
||||
raise
|
||||
else:
|
||||
self.status = "job_submitter_error"
|
||||
|
||||
Reference in New Issue
Block a user