diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index bdad395ab4f..71c0c77b559 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -395,8 +395,6 @@ sanity: set -eu image_tags=( ALPINE_X86_64_BUILD_TAG - ALPINE_X86_64_LAVA_SSH_TAG - ALPINE_X86_64_LAVA_TRIGGER_TAG DEBIAN_BUILD_BASE_TAG DEBIAN_BUILD_TAG DEBIAN_TEST_BASE_TAG diff --git a/.gitlab-ci/container/alpine/gitlab-ci.yml b/.gitlab-ci/container/alpine/gitlab-ci.yml index 1ea9408f6a0..b8475aaccf1 100644 --- a/.gitlab-ci/container/alpine/gitlab-ci.yml +++ b/.gitlab-ci/container/alpine/gitlab-ci.yml @@ -54,17 +54,3 @@ alpine/x86_64_build: optional: true - job: alpine/x86_64_build optional: true - -# Alpine based x86_64 image for LAVA SSH dockerized client -alpine/x86_64_lava_ssh_client: - extends: - - .alpine/x86_64_build-base - variables: - MESA_IMAGE_TAG: &alpine-x86_64_lava_ssh_client ${ALPINE_X86_64_LAVA_SSH_TAG} - -# Alpine based x86_64 image to run LAVA jobs -alpine/x86_64_lava-trigger: - extends: - - .alpine/x86_64_build-base - variables: - MESA_IMAGE_TAG: &alpine-x86_64_lava_trigger ${ALPINE_X86_64_LAVA_TRIGGER_TAG} diff --git a/.gitlab-ci/container/alpine/x86_64_lava-trigger.sh b/.gitlab-ci/container/alpine/x86_64_lava-trigger.sh deleted file mode 100644 index e625e76532e..00000000000 --- a/.gitlab-ci/container/alpine/x86_64_lava-trigger.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env bash - -# This is a ci-templates build script to generate a container for triggering LAVA jobs. - -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG - -# shellcheck disable=SC1091 -set -e - -. .gitlab-ci/setup-test-env.sh - -set -o xtrace - -uncollapsed_section_start alpine_setup "Base Alpine system setup" - -# Ephemeral packages (installed for this script and removed again at the end) -EPHEMERAL=( - git - py3-pip -) - -# We only need these very basic packages to run the LAVA jobs -DEPS=( - curl - python3 - tar - zstd -) - -apk --no-cache add "${DEPS[@]}" "${EPHEMERAL[@]}" - -pip3 install --break-system-packages -r bin/ci/requirements-lava.txt - -cp -Rp .gitlab-ci/lava / -cp -Rp .gitlab-ci/bin/structured_logger.py /lava - -. .gitlab-ci/container/container_pre_build.sh - -############### Uninstall the build software - -uncollapsed_section_switch alpine_cleanup "Cleaning up base Alpine system" - -apk del "${EPHEMERAL[@]}" - -. .gitlab-ci/container/container_post_build.sh - -section_end alpine_cleanup diff --git a/.gitlab-ci/container/alpine/x86_64_lava_ssh_client.sh b/.gitlab-ci/container/alpine/x86_64_lava_ssh_client.sh deleted file mode 100644 index 82b197b4d8a..00000000000 --- a/.gitlab-ci/container/alpine/x86_64_lava_ssh_client.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -# This is a ci-templates build script to generate a container for LAVA SSH client. - -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_SSH_TAG - -# shellcheck disable=SC1091 -set -e - -. .gitlab-ci/setup-test-env.sh - -set -o xtrace - -EPHEMERAL=( -) - -# We only need these very basic packages to run the tests. -DEPS=( - openssh-client # for ssh - iputils # for ping - bash - curl -) - - -apk --no-cache add "${DEPS[@]}" "${EPHEMERAL[@]}" - -. .gitlab-ci/container/container_pre_build.sh - -############### Uninstall the build software - -apk del "${EPHEMERAL[@]}" - -. .gitlab-ci/container/container_post_build.sh diff --git a/.gitlab-ci/image-tags.yml b/.gitlab-ci/image-tags.yml index d7fb17ad987..02627c30f8f 100644 --- a/.gitlab-ci/image-tags.yml +++ b/.gitlab-ci/image-tags.yml @@ -29,8 +29,6 @@ variables: DEBIAN_TEST_VK_TAG: "20250930-piglit-a7" ALPINE_X86_64_BUILD_TAG: "20250917-rust" - ALPINE_X86_64_LAVA_SSH_TAG: "20250423-rootfs" - ALPINE_X86_64_LAVA_TRIGGER_TAG: "20250814-trixie" FEDORA_X86_64_BUILD_TAG: "20250917-rust" diff --git a/.gitlab-ci/lava/__init__.py b/.gitlab-ci/lava/__init__.py deleted file mode 100644 index 449a711b384..00000000000 --- a/.gitlab-ci/lava/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG diff --git a/.gitlab-ci/lava/exceptions.py b/.gitlab-ci/lava/exceptions.py deleted file mode 100644 index 0a8fe0d0e8b..00000000000 --- a/.gitlab-ci/lava/exceptions.py +++ /dev/null @@ -1,45 +0,0 @@ -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG - -from datetime import timedelta - - -class MesaCIException(Exception): - pass - - -class MesaCIRetriableException(MesaCIException): - pass - - -class MesaCITimeoutError(MesaCIRetriableException): - def __init__(self, *args, timeout_duration: timedelta) -> None: - super().__init__(*args) - self.timeout_duration = timeout_duration - - -class MesaCIRetryError(MesaCIRetriableException): - def __init__(self, *args, retry_count: int, last_job: None) -> None: - super().__init__(*args) - self.retry_count = retry_count - self.last_job = last_job - - -class MesaCIFatalException(MesaCIException): - """Exception raised when the Mesa CI script encounters a fatal error that - prevents the script from continuing.""" - - def __init__(self, *args) -> None: - super().__init__(*args) - - -class MesaCIParseException(MesaCIRetriableException): - pass - - -class MesaCIKnownIssueException(MesaCIRetriableException): - """Exception raised when the Mesa CI script finds something in the logs that - is known to cause the LAVA job to eventually fail""" - - pass diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py deleted file mode 100755 index 8cbf5865c1a..00000000000 --- a/.gitlab-ci/lava/lava_job_submitter.py +++ /dev/null @@ -1,622 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright (C) 2020 - 2023 Collabora Limited -# Authors: -# Gustavo Padovan -# Guilherme Gallo -# -# SPDX-License-Identifier: MIT - -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG - -"""Send a job to LAVA, track it and collect log back""" - -import contextlib -import json -import pathlib -import sys -import time -from collections import defaultdict -from dataclasses import dataclass, field, fields -from datetime import datetime, timedelta, UTC -from os import environ, getenv -from typing import Any, Optional, Self - -import fire -from lavacli.utils import flow_yaml as lava_yaml - -from lava.exceptions import ( - MesaCIException, - MesaCIFatalException, - MesaCIRetriableException, - MesaCIParseException, - MesaCIRetryError, - MesaCITimeoutError, -) -from lava.utils import ( - CONSOLE_LOG, - GitlabSection, - LAVAJob, - LAVAJobDefinition, - LogFollower, - LogSectionType, - call_proxy, - fatal_err, - hide_sensitive_data, - print_log, - setup_lava_proxy, -) -from lava.utils import DEFAULT_GITLAB_SECTION_TIMEOUTS as GL_SECTION_TIMEOUTS - -# Initialize structural logging with a defaultdict, it can be changed for more -# sophisticated dict-like data abstractions. -STRUCTURAL_LOG = defaultdict(list) - -try: - from structured_logger import StructuredLogger -except ImportError as e: - print_log( - f"Could not import StructuredLogger library: {e}. " - "Falling back to defaultdict based structured logger." - ) - -# Timeout in seconds to decide if the device from the dispatched LAVA job has -# hung or not due to the lack of new log output. -DEVICE_HANGING_TIMEOUT_SEC = int(getenv("DEVICE_HANGING_TIMEOUT_SEC", 5 * 60)) - -# How many seconds the script should wait before try a new polling iteration to -# check if the dispatched LAVA job is running or waiting in the job queue. -WAIT_FOR_DEVICE_POLLING_TIME_SEC = int( - getenv("LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC", 1) -) - -# How many seconds the script will wait to let LAVA finalize the job and give -# the final details. -WAIT_FOR_LAVA_POST_PROCESSING_SEC = int(getenv("LAVA_WAIT_LAVA_POST_PROCESSING_SEC", 5)) -WAIT_FOR_LAVA_POST_PROCESSING_RETRIES = int( - getenv("LAVA_WAIT_LAVA_POST_PROCESSING_RETRIES", 6) -) - -# How many seconds to wait between log output LAVA RPC calls. -LOG_POLLING_TIME_SEC = int(getenv("LAVA_LOG_POLLING_TIME_SEC", 5)) - -# How many retries should be made when a timeout happen. -NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int( - getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2) -) - -CI_JOB_TIMEOUT_SEC = int(getenv("CI_JOB_TIMEOUT", 3600)) -# How many seconds the script will wait to let LAVA run the job and give the final details. -EXPECTED_JOB_DURATION_SEC = int(getenv("EXPECTED_JOB_DURATION_SEC", 60 * 10)) -# CI_JOB_STARTED is given by GitLab CI/CD in UTC timezone by default. -CI_JOB_STARTED_AT_RAW = getenv("CI_JOB_STARTED_AT", "") -CI_JOB_STARTED_AT: datetime = ( - datetime.fromisoformat(CI_JOB_STARTED_AT_RAW) - if CI_JOB_STARTED_AT_RAW - else datetime.now(tz=UTC) -) - - -def raise_exception_from_metadata(metadata: dict, job_id: int) -> None: - """ - Investigate infrastructure errors from the job metadata. - If it finds an error, raise it as MesaCIRetriableException. - """ - if "result" not in metadata or metadata["result"] != "fail": - return - if "error_type" in metadata: - error_type: str = metadata["error_type"] - error_msg: str = metadata.get("error_msg", "") - full_err_msg: str = error_type if not error_msg else f"{error_type}: {error_msg}" - if error_type == "Job": - # This happens when LAVA assumes that the job cannot terminate or - # with mal-formed job definitions. As we are always validating the - # jobs, only the former is probable to happen. E.g.: When some LAVA - # action timed out more times than expected in job definition. - raise MesaCIRetriableException( - f"LAVA job {job_id} failed with {full_err_msg}. Retry." - "(possible LAVA timeout misconfiguration/bug). Retry." - ) - if error_type: - raise MesaCIRetriableException( - f"LAVA job {job_id} failed with error type: {full_err_msg}. Retry." - ) - if "case" in metadata and metadata["case"] == "validate": - raise MesaCIRetriableException( - f"LAVA job {job_id} failed validation (possible download error). Retry." - ) - - -def raise_lava_error(job) -> None: - # Look for infrastructure errors, raise them, and retry if we see them. - results_yaml = call_proxy(job.proxy.results.get_testjob_results_yaml, job.job_id) - results = lava_yaml.load(results_yaml) - for res in results: - metadata = res["metadata"] - raise_exception_from_metadata(metadata, job.job_id) - - # If we reach this far, it means that the job ended without hwci script - # result and no LAVA infrastructure problem was found - job.status = "fail" - - - -def fetch_logs(job, max_idle_time, log_follower) -> None: - is_job_hanging(job, max_idle_time) - - time.sleep(LOG_POLLING_TIME_SEC) - new_log_lines = fetch_new_log_lines(job) - parsed_lines = parse_log_lines(job, log_follower, new_log_lines) - - for line in parsed_lines: - print_log(line) - - -def is_job_hanging(job, max_idle_time): - # Poll to check for new logs, assuming that a prolonged period of - # silence means that the device has died and we should try it again - if datetime.now(tz=UTC) - job.last_log_time > max_idle_time: - max_idle_time_min = max_idle_time.total_seconds() / 60 - - raise MesaCITimeoutError( - f"{CONSOLE_LOG['FG_BOLD_YELLOW']}" - f"LAVA job {job.job_id} unresponsive for {max_idle_time_min} " - "minutes; retrying the job." - f"{CONSOLE_LOG['RESET']}", - timeout_duration=max_idle_time, - ) - - -def parse_log_lines(job, log_follower, new_log_lines): - if log_follower.feed(new_log_lines): - # If we had non-empty log data, we can assure that the device is alive. - job.heartbeat() - parsed_lines = log_follower.flush() - - # Only parse job results when the script reaches the end of the logs. - # Depending on how much payload the RPC scheduler.jobs.logs get, it may - # reach the LAVA_POST_PROCESSING phase. - if log_follower.current_section.type in ( - LogSectionType.TEST_CASE, - LogSectionType.LAVA_POST_PROCESSING, - ): - parsed_lines = job.parse_job_result_from_log(parsed_lines) - return parsed_lines - - -def fetch_new_log_lines(job): - # The XMLRPC binary packet may be corrupted, causing a YAML scanner error. - # Retry the log fetching several times before exposing the error. - for _ in range(5): - with contextlib.suppress(MesaCIParseException): - new_log_lines = job.get_logs() - break - else: - raise MesaCIParseException - return new_log_lines - - -def submit_job(job): - try: - job.submit() - except Exception as mesa_ci_err: - raise MesaCIRetriableException( - f"Could not submit LAVA job. Reason: {mesa_ci_err}" - ) from mesa_ci_err - - -def wait_for_job_get_started(job, attempt_no): - print_log(f"Waiting for job {job.job_id} to start.") - while not job.is_started(): - current_job_duration_sec: int = int( - (datetime.now(tz=UTC) - CI_JOB_STARTED_AT).total_seconds() - ) - remaining_time_sec: int = max(0, CI_JOB_TIMEOUT_SEC - current_job_duration_sec) - if remaining_time_sec < EXPECTED_JOB_DURATION_SEC: - job.cancel() - raise MesaCIFatalException( - f"{CONSOLE_LOG['FG_BOLD_YELLOW']}" - f"Job {job.job_id} only has {remaining_time_sec} seconds " - "remaining to run, but it is expected to take at least " - f"{EXPECTED_JOB_DURATION_SEC} seconds." - f"{CONSOLE_LOG['RESET']}", - ) - time.sleep(WAIT_FOR_DEVICE_POLLING_TIME_SEC) - job.refresh_log() - print_log(f"Job {job.job_id} started.") - - -def bootstrap_log_follower(main_test_case, timestamp_relative_to) -> LogFollower: - start_section = GitlabSection( - id="dut_deploy", - header="Running LAVA deploy action", - type=LogSectionType.LAVA_DEPLOY, - start_collapsed=True, - timestamp_relative_to=timestamp_relative_to, - ) - print(start_section.start()) - return LogFollower( - starting_section=start_section, - main_test_case=main_test_case, - timestamp_relative_to=timestamp_relative_to - ) - - -def follow_job_execution(job, log_follower): - with log_follower: - max_idle_time = timedelta(seconds=DEVICE_HANGING_TIMEOUT_SEC) - # Start to check job's health - job.heartbeat() - while not job.is_finished: - fetch_logs(job, max_idle_time, log_follower) - structural_log_phases(job, log_follower) - - # Mesa Developers expect to have a simple pass/fail job result. - # If this does not happen, it probably means a LAVA infrastructure error - # happened. - if job.status not in ["pass", "fail"]: - raise_lava_error(job) - - # LogFollower does some cleanup after the early exit (trigger by - # `hwci: mesa: exit_code: \d+` regex), let's update the phases after the cleanup. - structural_log_phases(job, log_follower) - - -def structural_log_phases(job, log_follower): - phases: dict[str, Any] = { - s.header.split(" - ")[0]: { - k: str(getattr(s, k)) for k in ("start_time", "end_time") - } - for s in log_follower.section_history - } - job.log["dut_job_phases"] = phases - - -def print_job_final_status(job, timestamp_relative_to): - job.refresh_log() - if job.status == "running": - job.status = "hung" - - colour = LAVAJob.COLOR_STATUS_MAP.get(job.status, CONSOLE_LOG["FG_RED"]) - with GitlabSection( - "job_data", - f"Hardware job info for {job.status} job", - type=LogSectionType.LAVA_POST_PROCESSING, - start_collapsed=True, - colour=colour, - timestamp_relative_to=timestamp_relative_to, - ): - wait_post_processing_retries: int = WAIT_FOR_LAVA_POST_PROCESSING_RETRIES - while not job.is_post_processed() and wait_post_processing_retries > 0: - # Wait a little until LAVA finishes processing metadata - time.sleep(WAIT_FOR_LAVA_POST_PROCESSING_SEC) - wait_post_processing_retries -= 1 - - if not job.is_post_processed(): - waited_for_sec: int = ( - WAIT_FOR_LAVA_POST_PROCESSING_RETRIES - * WAIT_FOR_LAVA_POST_PROCESSING_SEC - ) - print_log( - "Timed out waiting for LAVA post-processing after " - f"{waited_for_sec} seconds. Printing incomplete information " - "anyway." - ) - - details: dict[str, str] = job.show() - for field, value in details.items(): - print(f"{field:<15}: {value}") - job.refresh_log() - - -def execute_job_with_retries( - proxy, job_definition, retry_count, jobs_log, main_test_case, - timestamp_relative_to -) -> Optional[LAVAJob]: - last_failed_job = None - for attempt_no in range(1, retry_count + 2): - # Need to get the logger value from its object to enable autosave - # features, if AutoSaveDict is enabled from StructuredLogging module - jobs_log.append({}) - job_log = jobs_log[-1] - job = LAVAJob(proxy, job_definition, job_log) - STRUCTURAL_LOG["dut_attempt_counter"] = attempt_no - try: - job_log["submitter_start_time"] = datetime.now(tz=UTC).isoformat() - submit_job(job) - queue_section = GitlabSection( - id="dut_queue", - header="Waiting for hardware device to become available", - type=LogSectionType.LAVA_QUEUE, - start_collapsed=False, - timestamp_relative_to=timestamp_relative_to - ) - with queue_section as section: - wait_for_job_get_started(job, attempt_no) - log_follower: LogFollower = bootstrap_log_follower( - main_test_case, timestamp_relative_to - ) - follow_job_execution(job, log_follower) - return job - - except (MesaCIException, KeyboardInterrupt) as exception: - job.handle_exception(exception) - - finally: - print_job_final_status(job, timestamp_relative_to) - # If LAVA takes too long to post process the job, the submitter - # gives up and proceeds. - job_log["submitter_end_time"] = datetime.now(tz=UTC).isoformat() - last_failed_job = job - print_log( - f"{CONSOLE_LOG['BOLD']}" - f"Finished executing LAVA job in the attempt #{attempt_no}" - f"{CONSOLE_LOG['RESET']}" - ) - if job.exception and not isinstance(job.exception, MesaCIRetriableException): - break - - return last_failed_job - - -def retriable_follow_job( - proxy, job_definition, main_test_case, timestamp_relative_to -) -> LAVAJob: - number_of_retries = NUMBER_OF_RETRIES_TIMEOUT_DETECTION - - last_attempted_job = execute_job_with_retries( - proxy, job_definition, number_of_retries, STRUCTURAL_LOG["dut_jobs"], - main_test_case, timestamp_relative_to - ) - - if last_attempted_job.exception is not None: - # Infra failed in all attempts - raise MesaCIRetryError( - f"{CONSOLE_LOG['BOLD']}" - f"{CONSOLE_LOG['FG_RED']}" - "Job failed after it exceeded the number of " - f"{number_of_retries} retries." - f"{CONSOLE_LOG['RESET']}", - retry_count=number_of_retries, - last_job=last_attempted_job, - ) - - return last_attempted_job - - -@dataclass -class PathResolver: - def __post_init__(self): - for field in fields(self): - value = getattr(self, field.name) - if not value: - continue - if field.type == pathlib.Path: - value = pathlib.Path(value) - setattr(self, field.name, value.resolve()) - - -@dataclass -class LAVAJobSubmitter(PathResolver): - boot_method: str - device_type: str - farm: str - job_timeout_min: int # The job timeout in minutes - shell_prompt: str = "lava-shell:" - dtb_filename: str = None - dump_yaml: bool = False # Whether to dump the YAML payload to stdout - env_file: pathlib.Path = None - jwt_file: pathlib.Path = None - kernel_image_name: str = None - kernel_image_type: str = "" - kernel_url_prefix: str = None - lava_tags: str | tuple[str, ...] = () # Comma-separated LAVA tags for the job - mesa_job_name: str = "mesa_ci_job" - pipeline_info: str = "" - rootfs_url: str = None - validate_only: bool = False # Whether to only validate the job, not execute it - visibility_group: str = None # Only affects LAVA farm maintainers - structured_log_file: pathlib.Path = None # Log file path with structured LAVA log - ssh_client_image: str = None # x86_64 SSH client image to follow the job's output - project_dir: str = None # Project directory to be used to find Mesa artifacts - project_name: str = None # Project name to be used in the job name - starting_section: str = None # GitLab section used to start - job_submitted_at: [str | datetime] = None - __structured_log_context = contextlib.nullcontext() # Structured Logger context - _overlays: dict = field(default_factory=dict, init=False) - - def __post_init__(self) -> Self: - super().__post_init__() - # Remove mesa job names with spaces, which breaks the lava-test-case command - self.mesa_job_name = self.mesa_job_name.split(" ")[0] - - if self.structured_log_file: - self.__structured_log_context = StructuredLoggerWrapper(self).logger_context() - - if self.job_submitted_at: - self.job_submitted_at = datetime.fromisoformat(self.job_submitted_at) - self.proxy = setup_lava_proxy() - - return self - - def append_overlay( - self, name: str, path: str, url: str, format: str = "tar", compression: str = "" - ) -> Self: - """ - Append an overlay to the LAVA job definition. - - Args: - name (str): The name of the overlay. - url (str): The URL from where the overlay can be downloaded. - path (str): The path where the overlay should be applied. - format (str, optional): The format of the overlay (default is "tar"). - compression (str, optional): The compression type of the overlay (e.g., "gz", "xz"). - - Returns: - Self: The instance of LAVAJobSubmitter with the overlay appended. - """ - self._overlays[name] = { - "url": url, - "path": path, - "format": format, - } - if compression: - self._overlays[name]["compression"] = compression - return self - - def print(self) -> Self: - """ - Prints the dictionary representation of the instance and returns the instance itself. - - Returns: - Self: The instance of the class. - """ - print(self.__dict__) - return self - - def __prepare_submission(self) -> str: - # Overwrite the timeout for the testcases with the value offered by the - # user. The testcase running time should be at least 4 times greater than - # the other sections (boot and setup), so we can safely ignore them. - # If LAVA fails to stop the job at this stage, it will fall back to the - # script section timeout with a reasonable delay. - GL_SECTION_TIMEOUTS[LogSectionType.TEST_CASE] = timedelta( - minutes=self.job_timeout_min - ) - - job_definition = LAVAJobDefinition(self).generate_lava_job_definition() - - if self.dump_yaml: - self.dump_job_definition(job_definition) - - validation_job = LAVAJob(self.proxy, job_definition) - if errors := validation_job.validate(): - fatal_err(f"Error in LAVA job definition: {errors}") - - return job_definition - - @classmethod - def is_under_ci(cls): - ci_envvar: str = getenv("CI", "false") - return ci_envvar.lower() == "true" - - def dump_job_definition(self, job_definition) -> None: - with GitlabSection( - "yaml_dump", - "LAVA job definition (YAML)", - type=LogSectionType.LAVA_BOOT, - start_collapsed=True, - ): - print(hide_sensitive_data(job_definition)) - - def submit(self) -> None: - """ - Prepares and submits the LAVA job. - If `validate_only` is True, it validates the job without submitting it. - If the job finishes with a non-pass status or encounters an exception, - the program exits with a non-zero return code. - """ - job_definition: str = self.__prepare_submission() - - if self.validate_only: - return - - if self.starting_section: - gl = GitlabSection( - id=self.starting_section, - header="Preparing to submit job for scheduling", - type=LogSectionType.LAVA_SUBMIT, - start_collapsed=True, - timestamp_relative_to=self.job_submitted_at, - ) - gl.start() - print(gl.end()) - - with self.__structured_log_context: - last_attempt_job = None - try: - last_attempt_job = retriable_follow_job( - self.proxy, job_definition, - f'{self.project_name}_{self.mesa_job_name}', - self.job_submitted_at) - - except MesaCIRetryError as retry_exception: - last_attempt_job = retry_exception.last_job - - except Exception as exception: - STRUCTURAL_LOG["job_combined_fail_reason"] = str(exception) - raise exception - - finally: - self.finish_script(last_attempt_job) - - def finish_script(self, last_attempt_job): - if not last_attempt_job: - # No job was run, something bad happened - STRUCTURAL_LOG["job_combined_status"] = "script_crash" - current_exception = str(sys.exc_info()[1]) - STRUCTURAL_LOG["job_combined_fail_reason"] = current_exception - print(f"Interrupting the script. Reason: {current_exception}") - raise SystemExit(1) - - STRUCTURAL_LOG["job_combined_status"] = last_attempt_job.status - STRUCTURAL_LOG["job_exit_code"] = last_attempt_job.exit_code - - if last_attempt_job.status != "pass": - raise SystemExit(last_attempt_job.exit_code) - - -class StructuredLoggerWrapper: - def __init__(self, submitter: LAVAJobSubmitter) -> None: - self.__submitter: LAVAJobSubmitter = submitter - - def _init_logger(self): - STRUCTURAL_LOG["fixed_tags"] = self.__submitter.lava_tags - STRUCTURAL_LOG["dut_job_type"] = self.__submitter.device_type - STRUCTURAL_LOG["farm"] = self.__submitter.farm - STRUCTURAL_LOG["job_combined_fail_reason"] = None - STRUCTURAL_LOG["job_combined_status"] = "not_submitted" - STRUCTURAL_LOG["job_exit_code"] = None - STRUCTURAL_LOG["dut_attempt_counter"] = 0 - - # Initialize dut_jobs list to enable appends - STRUCTURAL_LOG["dut_jobs"] = [] - - @contextlib.contextmanager - def _simple_logger_context(self): - log_file = pathlib.Path(self.__submitter.structured_log_file) - log_file.parent.mkdir(parents=True, exist_ok=True) - try: - # Truncate the file - log_file.write_text("") - yield - finally: - log_file.write_text(json.dumps(STRUCTURAL_LOG, indent=2)) - - def logger_context(self): - context = contextlib.nullcontext() - try: - global STRUCTURAL_LOG - STRUCTURAL_LOG = StructuredLogger( - self.__submitter.structured_log_file, truncate=True - ).data - except NameError: - context = self._simple_logger_context() - - self._init_logger() - return context - - -def main(): - # given that we proxy from DUT -> LAVA dispatcher -> LAVA primary -> us -> - # GitLab runner -> GitLab primary -> user, safe to say we don't need any - # more buffering - sys.stdout.reconfigure(line_buffering=True) - sys.stderr.reconfigure(line_buffering=True) - - fire.Fire(LAVAJobSubmitter) - -if __name__ == "__main__": - main() diff --git a/.gitlab-ci/lava/tests/__init__.py b/.gitlab-ci/lava/tests/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.gitlab-ci/lava/tests/conftest.py b/.gitlab-ci/lava/tests/conftest.py deleted file mode 100644 index 6098bf209d0..00000000000 --- a/.gitlab-ci/lava/tests/conftest.py +++ /dev/null @@ -1,74 +0,0 @@ -from collections import defaultdict -from unittest.mock import MagicMock, patch - -import pytest -import yaml -from freezegun import freeze_time - -from .lava.helpers import generate_testsuite_result, jobs_logs_response - - -def pytest_configure(config): - config.addinivalue_line( - "markers", "slow: marks tests as slow (deselect with '-m \"not slow\"')" - ) - -@pytest.fixture -def mock_sleep(): - """Mock time.sleep to make test faster""" - with patch("time.sleep", return_value=None): - yield - - -@pytest.fixture -def frozen_time(mock_sleep): - with freeze_time("2024-01-01") as frozen_time: - yield frozen_time - - -RESULT_GET_TESTJOB_RESULTS = [{"metadata": {"result": "test"}}] - - -@pytest.fixture -def mock_proxy(frozen_time): - def mock_job_state(jid) -> dict[str, str]: - frozen_time.tick(1) - return {"job_state": "Running"} - - def create_proxy_mock( - job_results=RESULT_GET_TESTJOB_RESULTS, - testsuite_results=[generate_testsuite_result()], - **kwargs - ): - proxy_mock = MagicMock() - proxy_submit_mock = proxy_mock.scheduler.jobs.submit - proxy_submit_mock.return_value = "1234" - - proxy_results_mock = proxy_mock.results.get_testjob_results_yaml - proxy_results_mock.return_value = yaml.safe_dump(job_results) - - proxy_test_suites_mock = proxy_mock.results.get_testsuite_results_yaml - proxy_test_suites_mock.return_value = yaml.safe_dump(testsuite_results) - - proxy_logs_mock = proxy_mock.scheduler.jobs.logs - proxy_logs_mock.return_value = jobs_logs_response() - - proxy_job_state = proxy_mock.scheduler.job_state - proxy_job_state.side_effect = mock_job_state - - proxy_show_mock = proxy_mock.scheduler.jobs.show - proxy_show_mock.return_value = defaultdict( - str, - { - "device_type": "test_device", - "device": "test_device-cbg-1", - "state": "created", - }, - ) - - for key, value in kwargs.items(): - setattr(proxy_logs_mock, key, value) - - return proxy_mock - - yield create_proxy_mock diff --git a/.gitlab-ci/lava/tests/data/FASTBOOT_force_uart=False_job_definition.yaml b/.gitlab-ci/lava/tests/data/FASTBOOT_force_uart=False_job_definition.yaml deleted file mode 100644 index 3d7900c9fae..00000000000 --- a/.gitlab-ci/lava/tests/data/FASTBOOT_force_uart=False_job_definition.yaml +++ /dev/null @@ -1,147 +0,0 @@ -job_name: 'test-project: my_pipeline_info' -device_type: my_fastboot_device_type -visibility: - group: - - my_visibility_group -priority: 75 -context: - extra_nfsroot_args: ' init=/init rootwait usbcore.quirks=0bda:8153:k' -timeouts: - job: - minutes: 10 - actions: - depthcharge-retry: - minutes: 6 - depthcharge-action: - minutes: 5 - uboot-action: - seconds: 200 -actions: -- deploy: - timeout: - minutes: 10 - to: nfs - nfsrootfs: - url: None - compression: zstd - format: tar - overlays: {} - namespace: dut -- deploy: - timeout: - minutes: 5 - to: downloads - os: oe - images: - kernel: - url: None/None - dtb: - url: None/my_dtb_filename.dtb - postprocess: - docker: - image: registry.gitlab.collabora.com/lava/health-check-docker - steps: - - cat Image.gz my_dtb_filename.dtb > Image.gz+dtb - - mkbootimg --kernel Image.gz+dtb --cmdline "root=/dev/nfs rw nfsroot=$NFS_SERVER_IP:$NFS_ROOTFS,tcp,hard,v3 - ip=dhcp init=/init rootwait usbcore.quirks=0bda:8153:k" --pagesize 4096 - --base 0x80000000 -o boot.img - namespace: dut -- deploy: - timeout: - minutes: 10 - to: fastboot - docker: - image: registry.gitlab.collabora.com/lava/health-check-docker - images: - boot: - url: downloads://boot.img - namespace: dut - failure_retry: 2 -- boot: - timeout: - minutes: 2 - docker: - image: registry.gitlab.collabora.com/lava/health-check-docker - failure_retry: 2 - method: fastboot - prompts: - - 'lava-shell:' - commands: - - set_active a - namespace: dut - auto_login: - login_commands: - - dropbear -R -B - - touch /dut_ready - login_prompt: 'ogin:' - username: '' -- test: - namespace: dut - definitions: - - from: inline - name: setup-ssh-server - path: inline-setup-ssh-server - repository: - metadata: - format: Lava-Test Test Definition 1.0 - name: dut-env-export - run: - steps: - - |- - FARM=test_farm . /test_dir/install/common/init-stage1.sh - echo RkFSTT10ZXN0X2Zhcm0gL3Rlc3RfZGlyL2luc3RhbGwvY29tbW9uL2luaXQtc3RhZ2UxLnNo | base64 -d >> /set-job-env-vars.sh - export CURRENT_SECTION=dut_boot - - export -p > /dut-env-vars.sh -- test: - namespace: container - timeout: - minutes: 10 - failure_retry: 3 - definitions: - - name: docker_ssh_client - from: inline - path: inline/docker_ssh_client.yaml - repository: - metadata: - name: mesa - description: Mesa test plan - format: Lava-Test Test Definition 1.0 - run: - steps: - - |- - set -ex - timeout 1m bash << EOF - while [ -z "$(lava-target-ip)" ]; do - echo Waiting for DUT to join LAN; - sleep 1; - done - EOF - - ping -c 5 -w 60 $(lava-target-ip) - - lava_ssh_test_case() { - set -ex - local test_case="${1}" - shift - lava-test-case "${test_case}" --shell \ - ssh ${SSH_PTY_ARGS:--T} \ - -o StrictHostKeyChecking=no \ - -o UserKnownHostsFile=/dev/null \ - -o ConnectTimeout=60 \ - root@$(lava-target-ip) "${@}" - } - - |- - lava_ssh_test_case 'wait_for_dut_login' << EOF - while [ ! -e /dut_ready ]; do sleep 1; done; - EOF - - |- - lava_ssh_test_case 'artifact_download' 'bash --' << EOF - source /dut-env-vars.sh - set -e - echo Could not find jwt file, disabling S3 requests... - sed -i '/S3_RESULTS_UPLOAD/d' /set-job-env-vars.sh - EOF - - export SSH_PTY_ARGS=-tt - - lava_ssh_test_case 'test-project_dut' 'cd / && /test_dir/install/common/init-stage2.sh' - docker: - image: diff --git a/.gitlab-ci/lava/tests/data/FASTBOOT_force_uart=True_job_definition.yaml b/.gitlab-ci/lava/tests/data/FASTBOOT_force_uart=True_job_definition.yaml deleted file mode 100644 index f1ca57cbb58..00000000000 --- a/.gitlab-ci/lava/tests/data/FASTBOOT_force_uart=True_job_definition.yaml +++ /dev/null @@ -1,95 +0,0 @@ -job_name: 'test-project: my_pipeline_info' -device_type: my_fastboot_device_type -visibility: - group: - - my_visibility_group -priority: 75 -context: - extra_nfsroot_args: ' init=/init rootwait usbcore.quirks=0bda:8153:k' -timeouts: - job: - minutes: 10 - actions: - depthcharge-retry: - minutes: 6 - depthcharge-action: - minutes: 5 - uboot-action: - seconds: 200 -actions: -- deploy: - timeout: - minutes: 10 - to: nfs - nfsrootfs: - url: None - compression: zstd - format: tar - overlays: {} -- deploy: - timeout: - minutes: 5 - to: downloads - os: oe - images: - kernel: - url: None/None - dtb: - url: None/my_dtb_filename.dtb - postprocess: - docker: - image: registry.gitlab.collabora.com/lava/health-check-docker - steps: - - cat Image.gz my_dtb_filename.dtb > Image.gz+dtb - - mkbootimg --kernel Image.gz+dtb --cmdline "root=/dev/nfs rw nfsroot=$NFS_SERVER_IP:$NFS_ROOTFS,tcp,hard,v3 - ip=dhcp init=/init rootwait usbcore.quirks=0bda:8153:k" --pagesize 4096 - --base 0x80000000 -o boot.img -- deploy: - timeout: - minutes: 2 - to: fastboot - docker: - image: registry.gitlab.collabora.com/lava/health-check-docker - images: - boot: - url: downloads://boot.img -- boot: - timeout: - minutes: 2 - docker: - image: registry.gitlab.collabora.com/lava/health-check-docker - failure_retry: 2 - method: fastboot - prompts: - - 'lava-shell:' - commands: - - set_active a -- test: - timeout: - minutes: 10 - failure_retry: 1 - definitions: - - name: mesa - from: inline - lava-signal: kmsg - path: inline/mesa.yaml - repository: - metadata: - name: mesa - description: Mesa test plan - os: - - oe - scope: - - functional - format: Lava-Test Test Definition 1.0 - run: - steps: - - FARM=test_farm . /test_dir/install/common/init-stage1.sh - - echo RkFSTT10ZXN0X2Zhcm0gL3Rlc3RfZGlyL2luc3RhbGwvY29tbW9uL2luaXQtc3RhZ2UxLnNo - | base64 -d >> /set-job-env-vars.sh - - export CURRENT_SECTION=dut_boot - - set -e - - echo Could not find jwt file, disabling S3 requests... - - sed -i '/S3_RESULTS_UPLOAD/d' /set-job-env-vars.sh - - sleep 1 - - lava-test-case 'test-project_dut' --shell /test_dir/install/common/init-stage2.sh diff --git a/.gitlab-ci/lava/tests/data/UBOOT_force_uart=False_job_definition.yaml b/.gitlab-ci/lava/tests/data/UBOOT_force_uart=False_job_definition.yaml deleted file mode 100644 index 6564eefc2f3..00000000000 --- a/.gitlab-ci/lava/tests/data/UBOOT_force_uart=False_job_definition.yaml +++ /dev/null @@ -1,118 +0,0 @@ -job_name: 'test-project: my_pipeline_info' -device_type: my_uboot_device_type -visibility: - group: - - my_visibility_group -priority: 75 -context: - extra_nfsroot_args: ' init=/init rootwait usbcore.quirks=0bda:8153:k' -timeouts: - job: - minutes: 10 - actions: - depthcharge-retry: - minutes: 6 - depthcharge-action: - minutes: 5 - uboot-action: - seconds: 200 -actions: -- deploy: - timeout: - minutes: 10 - to: tftp - os: oe - kernel: - url: None/None - nfsrootfs: - url: None - compression: zstd - format: tar - overlays: {} - dtb: - url: None/my_dtb_filename.dtb - namespace: dut - failure_retry: 2 -- boot: - failure_retry: 2 - method: u-boot - prompts: - - 'lava-shell:' - commands: nfs - namespace: dut - auto_login: - login_commands: - - dropbear -R -B - - touch /dut_ready - login_prompt: 'ogin:' - username: '' -- test: - namespace: dut - definitions: - - from: inline - name: setup-ssh-server - path: inline-setup-ssh-server - repository: - metadata: - format: Lava-Test Test Definition 1.0 - name: dut-env-export - run: - steps: - - |- - FARM=test_farm . /test_dir/install/common/init-stage1.sh - echo RkFSTT10ZXN0X2Zhcm0gL3Rlc3RfZGlyL2luc3RhbGwvY29tbW9uL2luaXQtc3RhZ2UxLnNo | base64 -d >> /set-job-env-vars.sh - export CURRENT_SECTION=dut_boot - - export -p > /dut-env-vars.sh -- test: - namespace: container - timeout: - minutes: 10 - failure_retry: 3 - definitions: - - name: docker_ssh_client - from: inline - path: inline/docker_ssh_client.yaml - repository: - metadata: - name: mesa - description: Mesa test plan - format: Lava-Test Test Definition 1.0 - run: - steps: - - |- - set -ex - timeout 1m bash << EOF - while [ -z "$(lava-target-ip)" ]; do - echo Waiting for DUT to join LAN; - sleep 1; - done - EOF - - ping -c 5 -w 60 $(lava-target-ip) - - lava_ssh_test_case() { - set -ex - local test_case="${1}" - shift - lava-test-case "${test_case}" --shell \ - ssh ${SSH_PTY_ARGS:--T} \ - -o StrictHostKeyChecking=no \ - -o UserKnownHostsFile=/dev/null \ - -o ConnectTimeout=60 \ - root@$(lava-target-ip) "${@}" - } - - |- - lava_ssh_test_case 'wait_for_dut_login' << EOF - while [ ! -e /dut_ready ]; do sleep 1; done; - EOF - - |- - lava_ssh_test_case 'artifact_download' 'bash --' << EOF - source /dut-env-vars.sh - set -e - echo Could not find jwt file, disabling S3 requests... - sed -i '/S3_RESULTS_UPLOAD/d' /set-job-env-vars.sh - EOF - - export SSH_PTY_ARGS=-tt - - lava_ssh_test_case 'test-project_dut' 'cd / && /test_dir/install/common/init-stage2.sh' - docker: - image: diff --git a/.gitlab-ci/lava/tests/data/UBOOT_force_uart=True_job_definition.yaml b/.gitlab-ci/lava/tests/data/UBOOT_force_uart=True_job_definition.yaml deleted file mode 100644 index f68b12f8909..00000000000 --- a/.gitlab-ci/lava/tests/data/UBOOT_force_uart=True_job_definition.yaml +++ /dev/null @@ -1,68 +0,0 @@ -job_name: 'test-project: my_pipeline_info' -device_type: my_uboot_device_type -visibility: - group: - - my_visibility_group -priority: 75 -context: - extra_nfsroot_args: ' init=/init rootwait usbcore.quirks=0bda:8153:k' -timeouts: - job: - minutes: 10 - actions: - depthcharge-retry: - minutes: 6 - depthcharge-action: - minutes: 5 - uboot-action: - seconds: 200 -actions: -- deploy: - timeout: - minutes: 5 - to: tftp - os: oe - kernel: - url: None/None - nfsrootfs: - url: None - compression: zstd - format: tar - overlays: {} - dtb: - url: None/my_dtb_filename.dtb -- boot: - failure_retry: 2 - method: u-boot - prompts: - - 'lava-shell:' - commands: nfs -- test: - timeout: - minutes: 10 - failure_retry: 1 - definitions: - - name: mesa - from: inline - lava-signal: kmsg - path: inline/mesa.yaml - repository: - metadata: - name: mesa - description: Mesa test plan - os: - - oe - scope: - - functional - format: Lava-Test Test Definition 1.0 - run: - steps: - - FARM=test_farm . /test_dir/install/common/init-stage1.sh - - echo RkFSTT10ZXN0X2Zhcm0gL3Rlc3RfZGlyL2luc3RhbGwvY29tbW9uL2luaXQtc3RhZ2UxLnNo - | base64 -d >> /set-job-env-vars.sh - - export CURRENT_SECTION=dut_boot - - set -e - - echo Could not find jwt file, disabling S3 requests... - - sed -i '/S3_RESULTS_UPLOAD/d' /set-job-env-vars.sh - - sleep 1 - - lava-test-case 'test-project_dut' --shell /test_dir/install/common/init-stage2.sh diff --git a/.gitlab-ci/lava/tests/lava/__init__.py b/.gitlab-ci/lava/tests/lava/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.gitlab-ci/lava/tests/lava/helpers.py b/.gitlab-ci/lava/tests/lava/helpers.py deleted file mode 100644 index 11ec209ab7f..00000000000 --- a/.gitlab-ci/lava/tests/lava/helpers.py +++ /dev/null @@ -1,150 +0,0 @@ -from contextlib import nullcontext as does_not_raise -from datetime import UTC, datetime -from io import StringIO -from itertools import cycle -from typing import Any, Callable, Generator, Iterable, Optional, Tuple, Union - -from freezegun import freeze_time -from lava.utils.log_section import ( - DEFAULT_GITLAB_SECTION_TIMEOUTS, - FALLBACK_GITLAB_SECTION_TIMEOUT, - LogSectionType, -) -from lavacli.utils import flow_yaml as lava_yaml - - -def yaml_dump(data: dict[str, Any]) -> str: - stream = StringIO() - lava_yaml.dump(data, stream) - return stream.getvalue() - - -def section_timeout(section_type: LogSectionType) -> int: - return int( - DEFAULT_GITLAB_SECTION_TIMEOUTS.get( - section_type, FALLBACK_GITLAB_SECTION_TIMEOUT - ).total_seconds() - ) - - -def create_lava_yaml_msg( - dt: Callable = datetime.now, msg="test", lvl="target" -) -> dict[str, str]: - return {"dt": str(dt()), "msg": msg, "lvl": lvl} - - -def generate_testsuite_result( - name="test-mesa-ci", result="pass", exit_code=0, metadata_extra=None, extra=None -): - if metadata_extra is None: - metadata_extra = {} - if extra is None: - extra = {} - return {"metadata": {"result": result, "exit_code": exit_code, **metadata_extra}, "name": name} - - -def jobs_logs_response( - finished=False, msg=None, lvl="target", result=None, exit_code=None -) -> Tuple[bool, str]: - timed_msg = {"dt": str(datetime.now(tz=UTC)), "msg": "New message", "lvl": lvl} - if result: - timed_msg["lvl"] = "target" - timed_msg["msg"] = f"hwci: mesa: exit_code: {exit_code}" - - logs = [timed_msg] if msg is None else msg - - return finished, yaml_dump(logs) - - -def section_aware_message_generator( - messages: dict[LogSectionType, - Iterable[int]], - result: Optional[str] = None, - exit_code: Optional[int] = None -) -> Iterable[tuple[dict, Iterable[int]]]: - default = [1] - - result_message_section = LogSectionType.TEST_CASE - - for section_type in LogSectionType: - delay = messages.get(section_type, default) - yield mock_lava_signal(section_type), delay - if result and section_type == result_message_section: - # To consider the job finished, the result `echo` should be produced - # in the correct section - yield create_lava_yaml_msg(msg=f"hwci: mesa: exit_code: {exit_code}"), delay - - -def message_generator(): - for section_type in LogSectionType: - yield mock_lava_signal(section_type) - - -def level_generator(): - # Tests all known levels by default - yield from cycle(("results", "feedback", "warning", "error", "debug", "target")) - - -def generate_n_logs( - n=1, - tick_fn: Union[Generator, Iterable[int], int] = 1, - level_fn=level_generator, - result="pass", - exit_code=0, -): - """Simulate a log partitionated in n components""" - level_gen = level_fn() - - if isinstance(tick_fn, Generator): - tick_gen = tick_fn - elif isinstance(tick_fn, Iterable): - tick_gen = cycle(tick_fn) - else: - tick_gen = cycle((tick_fn,)) - - with freeze_time(datetime.now(tz=UTC)) as time_travel: - tick_sec: int = next(tick_gen) - while True: - # Simulate a scenario where the target job is waiting for being started - for _ in range(n - 1): - level: str = next(level_gen) - - time_travel.tick(tick_sec) - yield jobs_logs_response(finished=False, msg=[], lvl=level) - - time_travel.tick(tick_sec) - yield jobs_logs_response(finished=True, result=result, exit_code=exit_code) - - -def to_iterable(tick_fn): - if isinstance(tick_fn, Generator): - return tick_fn - elif isinstance(tick_fn, Iterable): - return cycle(tick_fn) - else: - return cycle((tick_fn,)) - - -def mock_logs(messages=None, result=None, exit_code=None): - if messages is None: - messages = {} - with freeze_time(datetime.now(tz=UTC)) as time_travel: - # Simulate a complete run given by message_fn - for msg, tick_list in section_aware_message_generator(messages, result, exit_code): - for tick_sec in tick_list: - yield jobs_logs_response(finished=False, msg=[msg]) - time_travel.tick(tick_sec) - - -def mock_lava_signal(type: LogSectionType) -> dict[str, str]: - return { - LogSectionType.TEST_CASE: create_lava_yaml_msg( - msg=" case", lvl="debug" - ), - LogSectionType.TEST_SUITE: create_lava_yaml_msg( - msg=" suite", lvl="debug" - ), - LogSectionType.LAVA_POST_PROCESSING: create_lava_yaml_msg( - msg="", lvl="target" - ), - }.get(type, create_lava_yaml_msg()) diff --git a/.gitlab-ci/lava/tests/test_lava_job_submitter.py b/.gitlab-ci/lava/tests/test_lava_job_submitter.py deleted file mode 100644 index 4d37c07feca..00000000000 --- a/.gitlab-ci/lava/tests/test_lava_job_submitter.py +++ /dev/null @@ -1,557 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright (C) 2022 Collabora Limited -# Author: Guilherme Gallo -# -# SPDX-License-Identifier: MIT - -import os -import xmlrpc.client -from contextlib import nullcontext as does_not_raise -from datetime import UTC, datetime -from itertools import cycle, islice, repeat -from pathlib import Path -from typing import Generator -from unittest.mock import MagicMock, patch - -import pytest -from lava.exceptions import MesaCIException, MesaCIRetryError, MesaCIFatalException -from lava.lava_job_submitter import ( - DEVICE_HANGING_TIMEOUT_SEC, - NUMBER_OF_RETRIES_TIMEOUT_DETECTION, - LAVAJob, - LAVAJobSubmitter, - bootstrap_log_follower, - follow_job_execution, - retriable_follow_job, - wait_for_job_get_started, -) -from lava.utils import LogSectionType, LAVA_TEST_CASE_TIMEOUT - -from .lava.helpers import ( - generate_n_logs, - generate_testsuite_result, - jobs_logs_response, - mock_lava_signal, - mock_logs, - section_timeout, -) - -NUMBER_OF_MAX_ATTEMPTS = NUMBER_OF_RETRIES_TIMEOUT_DETECTION + 1 - - -def max_sec_before_timeout(): - return min(1000, LAVA_TEST_CASE_TIMEOUT * 60 - 1) - - -@pytest.fixture -def mock_proxy_waiting_time(mock_proxy): - def update_mock_proxy(frozen_time, **kwargs): - def mock_job_state(jid) -> dict[str, str]: - frozen_time.tick(wait_time) - return {"job_state": "Running"} - - wait_time = kwargs.pop("wait_time", 1) - proxy_mock = mock_proxy(**kwargs) - proxy_job_state = proxy_mock.scheduler.job_state - proxy_job_state.side_effect = mock_job_state - - return proxy_mock - - return update_mock_proxy - - -@pytest.fixture(params=[{"CI": "true"}, {"CI": "false"}], ids=["Under CI", "Local run"]) -def ci_environment(request): - with patch.dict(os.environ, request.param): - yield - - -@pytest.fixture -def lava_job_submitter( - ci_environment, - tmp_path, - mock_proxy, -): - os.chdir(tmp_path) - tmp_file = Path(tmp_path) / "log.json" - - with patch("lava.lava_job_submitter.setup_lava_proxy") as mock_setup_lava_proxy: - mock_setup_lava_proxy.return_value = mock_proxy() - yield LAVAJobSubmitter( - boot_method="test_boot", - shell_prompt="lava-shell:", - device_type="test_device", - farm="test_farm", - job_timeout_min=1, - structured_log_file=tmp_file, - ) - - -@pytest.mark.parametrize("exception", [RuntimeError, SystemError, KeyError]) -def test_submit_and_follow_respects_exceptions(mock_sleep, mock_proxy, exception): - with pytest.raises(MesaCIException): - proxy = mock_proxy(side_effect=exception) - job = LAVAJob(proxy, "") - log_follower = bootstrap_log_follower(main_test_case="", timestamp_relative_to=None) - follow_job_execution(job, log_follower) - - -NETWORK_EXCEPTION = xmlrpc.client.ProtocolError("", 0, "test", {}) -XMLRPC_FAULT = xmlrpc.client.Fault(0, "test") - -PROXY_SCENARIOS = { - "simple pass case": (mock_logs(result="pass", exit_code=0), does_not_raise(), "pass", 0, {}), - "simple fail case": (mock_logs(result="fail", exit_code=1), does_not_raise(), "fail", 1, {}), - "simple hung case": ( - mock_logs( - messages={ - LogSectionType.TEST_CASE: [ - section_timeout(LogSectionType.TEST_CASE) + 1 - ] - * max_sec_before_timeout() - }, - result="fail", - exit_code=1, - ), - pytest.raises(MesaCIRetryError), - "hung", - 1, - {}, - ), - "leftover dump from last job in boot section": ( - ( - mock_lava_signal(LogSectionType.LAVA_BOOT), - jobs_logs_response(finished=False, msg=None, result="fail", exit_code=1), - ), - pytest.raises(MesaCIRetryError), - "hung", - 1, - {}, - ), - "boot works at last retry": ( - mock_logs( - messages={ - LogSectionType.LAVA_BOOT: [ - section_timeout(LogSectionType.LAVA_BOOT) + 1 - ] - * NUMBER_OF_RETRIES_TIMEOUT_DETECTION - + [1] - }, - result="pass", - exit_code=0, - ), - does_not_raise(), - "pass", - 0, - {}, - ), - "test case took too long": pytest.param( - mock_logs( - messages={ - LogSectionType.TEST_CASE: [ - section_timeout(LogSectionType.TEST_CASE) + 1 - ] - * (NUMBER_OF_MAX_ATTEMPTS + 1) - }, - result="pass", - exit_code=0, - ), - pytest.raises(MesaCIRetryError), - "pass", - 0, - {}, - ), - "timed out more times than retry attempts": ( - generate_n_logs(n=4, tick_fn=9999999), - pytest.raises(MesaCIRetryError), - "fail", - 1, - {}, - ), - "long log case, no silence": ( - mock_logs( - messages={LogSectionType.TEST_CASE: [1] * (max_sec_before_timeout())}, - result="pass", - exit_code=0, - ), - does_not_raise(), - "pass", - 0, - {}, - ), - "no retries, testsuite succeed": ( - mock_logs(result="pass", exit_code=0), - does_not_raise(), - "pass", - 0, - {"testsuite_results": [generate_testsuite_result(result="pass", exit_code=0)]}, - ), - "no retries, but testsuite fails": ( - mock_logs(result="fail", exit_code=1), - does_not_raise(), - "fail", - 1, - {"testsuite_results": [generate_testsuite_result(result="fail", exit_code=1)]}, - ), - "no retries, one testsuite fails": ( - mock_logs(result="fail", exit_code=1), - does_not_raise(), - "fail", - 1, - { - "testsuite_results": [ - generate_testsuite_result(result="fail", exit_code=1), - generate_testsuite_result(result="pass", exit_code=0), - ] - }, - ), - "very long silence": ( - generate_n_logs(n=NUMBER_OF_MAX_ATTEMPTS + 1, tick_fn=100000), - pytest.raises(MesaCIRetryError), - "fail", - 1, - {}, - ), - # If a protocol error happens, _call_proxy will retry without affecting timeouts - "unstable connection, ProtocolError followed by final message": ( - (NETWORK_EXCEPTION, *list(mock_logs(result="pass", exit_code=0))), - does_not_raise(), - "pass", - 0, - {}, - ), - # After an arbitrary number of retries, _call_proxy should call sys.exit - "unreachable case, subsequent ProtocolErrors": ( - repeat(NETWORK_EXCEPTION), - pytest.raises(SystemExit), - "fail", - 1, - {}, - ), - "XMLRPC Fault": ([XMLRPC_FAULT], pytest.raises(MesaCIRetryError), False, 1, {}), -} - - -@pytest.mark.parametrize( - "test_log, expectation, job_result, exit_code, proxy_args", - PROXY_SCENARIOS.values(), - ids=PROXY_SCENARIOS.keys(), -) -def test_retriable_follow_job( - mock_sleep, - test_log, - expectation, - job_result, - exit_code, - proxy_args, - mock_proxy, -): - with expectation: - proxy = mock_proxy(side_effect=test_log, **proxy_args) - job: LAVAJob = retriable_follow_job(proxy, "", "", None) - assert job_result == job.status - assert exit_code == job.exit_code - - - -WAIT_FOR_JOB_SCENARIOS = {"one log run taking (sec):": (mock_logs(result="pass", exit_code=0))} - -@pytest.mark.parametrize("wait_time", (DEVICE_HANGING_TIMEOUT_SEC * 2,)) -@pytest.mark.parametrize( - "side_effect", - WAIT_FOR_JOB_SCENARIOS.values(), - ids=WAIT_FOR_JOB_SCENARIOS.keys(), -) -def test_simulate_a_long_wait_to_start_a_job( - frozen_time, - wait_time, - side_effect, - mock_proxy_waiting_time, -): - start_time = datetime.now(tz=UTC) - job: LAVAJob = retriable_follow_job( - mock_proxy_waiting_time( - frozen_time, side_effect=side_effect, wait_time=wait_time - ), - "", - "", - None - ) - - end_time = datetime.now(tz=UTC) - delta_time = end_time - start_time - - assert job.status == "pass" - assert job.exit_code == 0 - assert delta_time.total_seconds() >= wait_time - - -LONG_LAVA_QUEUE_SCENARIOS = { - "no_time_to_run": (0, pytest.raises(MesaCIFatalException)), - "enough_time_to_run": (9999999999, does_not_raise()), -} - - -@pytest.mark.parametrize( - "job_timeout, expectation", - LONG_LAVA_QUEUE_SCENARIOS.values(), - ids=LONG_LAVA_QUEUE_SCENARIOS.keys(), -) -def test_wait_for_job_get_started_no_time_to_run(monkeypatch, job_timeout, expectation): - monkeypatch.setattr("lava.lava_job_submitter.CI_JOB_TIMEOUT_SEC", job_timeout) - monkeypatch.setattr("lava.lava_job_submitter.CI_JOB_STARTED_AT", datetime.now(tz=UTC)) - job = MagicMock() - # Make it escape the loop - job.is_started.side_effect = (False, False, True) - with expectation as e: - wait_for_job_get_started(job, 1) - if e: - job.cancel.assert_called_with() - - -CORRUPTED_LOG_SCENARIOS = { - "too much subsequent corrupted data": ( - [(False, "{'msg': 'Incomplete}")] * 100 + [jobs_logs_response(True)], - pytest.raises((MesaCIRetryError)), - ), - "one subsequent corrupted data": ( - [(False, "{'msg': 'Incomplete}")] * 2 + [jobs_logs_response(True)], - does_not_raise(), - ), -} - - -@pytest.mark.parametrize( - "data_sequence, expected_exception", - CORRUPTED_LOG_SCENARIOS.values(), - ids=CORRUPTED_LOG_SCENARIOS.keys(), -) -def test_log_corruption(mock_sleep, data_sequence, expected_exception, mock_proxy): - proxy_mock = mock_proxy() - proxy_logs_mock = proxy_mock.scheduler.jobs.logs - proxy_logs_mock.side_effect = data_sequence - with expected_exception: - retriable_follow_job(proxy_mock, "", "", None) - - -LAVA_RESULT_LOG_SCENARIOS = { - # the submitter should accept xtrace logs - "Bash xtrace echo with kmsg interleaving": ( - "echo hwci: mesa: exit_code: 0[ 737.673352] ", - "pass", 0, - ), - # the submitter should accept xtrace logs - "kmsg result print": ( - "[ 737.673352] hwci: mesa: exit_code: 0", - "pass", 0, - ), - # if the job result echo has a very bad luck, it still can be interleaved - # with kmsg - "echo output with kmsg interleaving": ( - "hwci: mesa: exit_code: 0[ 737.673352] ", - "pass", 0, - ), - "fail case": ( - "hwci: mesa: exit_code: 1", - "fail", 1, - ), - # fail case with different exit code - "fail case (exit code 101)": ( - "hwci: mesa: exit_code: 101", - "fail", 101, - ), -} - - -@pytest.mark.parametrize( - "message, expected_status, expected_exit_code", - LAVA_RESULT_LOG_SCENARIOS.values(), - ids=LAVA_RESULT_LOG_SCENARIOS.keys(), -) -def test_parse_job_result_from_log(message, expected_status, expected_exit_code, mock_proxy): - job = LAVAJob(mock_proxy(), "") - job.parse_job_result_from_log([message]) - - assert job.status == expected_status - assert job.exit_code == expected_exit_code - - -@pytest.mark.slow( - reason="Slow and sketchy test. Needs a LAVA log raw file at /tmp/log.yaml" -) -@pytest.mark.skipif( - not Path("/tmp/log.yaml").is_file(), reason="Missing /tmp/log.yaml file." -) -def test_full_yaml_log(mock_proxy, frozen_time, lava_job_submitter): - import random - - from lavacli.utils import flow_yaml as lava_yaml - - def time_travel_from_log_chunk(data_chunk): - if not data_chunk: - return - - first_log = lava_yaml.load(data_chunk[0])[0] - first_log_time = first_log["dt"] - frozen_time.move_to(first_log_time) - yield - - last_log = lava_yaml.load(data_chunk[-1])[0] - last_log_time = last_log["dt"] - frozen_time.move_to(last_log_time) - yield - - def time_travel_to_test_time(): - # Suppose that the first message timestamp of the entire LAVA job log is - # the same of from the job submitter execution - with open("/tmp/log.yaml", "r") as f: - first_log = f.readline() - first_log_time = lava_yaml.load(first_log)[0]["dt"] - frozen_time.move_to(first_log_time) - - def load_lines() -> Generator[tuple[bool, str], None, None]: - with open("/tmp/log.yaml", "r") as f: - # data = yaml.safe_load(f) - log_lines = f.readlines() - serial_message: str = "" - chunk_start_line = 0 - chunk_end_line = 0 - chunk_max_size = 100 - try: - while True: - chunk_end_line = chunk_start_line + random.randint(1, chunk_max_size) - # split the log in chunks of random size - log_chunk = list(islice(log_lines, chunk_start_line, chunk_end_line)) - chunk_start_line = chunk_end_line + 1 - serial_message = "".join(log_chunk) - # time_traveller_gen will make the time trave according to the timestamp from - # the message - time_traveller_gen = time_travel_from_log_chunk(log_chunk) - # Suppose that the first message timestamp is the same of - # log fetch RPC call - next(time_traveller_gen) - yield False, "[]" - # Travel to the same datetime of the last fetched log line - # in the chunk - next(time_traveller_gen) - yield False, serial_message - except StopIteration: - yield True, serial_message - return - - proxy = mock_proxy() - - def reset_logs(*args): - proxy.scheduler.jobs.logs.side_effect = load_lines() - - proxy.scheduler.jobs.submit = reset_logs - try: - time_travel_to_test_time() - start_time = datetime.now(tz=UTC) - retriable_follow_job(proxy, "", "", None) - finally: - try: - # If the job fails, maybe there will be no structured log - print(lava_job_submitter.structured_log_file.read_text()) - finally: - end_time = datetime.now(tz=UTC) - print("---- Reproduction log stats ----") - print(f"Start time: {start_time}") - print(f"End time: {end_time}") - print(f"Duration: {end_time - start_time}") - - -@pytest.mark.parametrize( - "validate_only,finished_job_status,job_exit_code,expected_combined_status", - [ - (True, "pass", None, None,), - (False, "pass", 0, "pass",), - (False, "fail", 1, "fail",), - ], - ids=[ - "validate_only_no_job_submission", - "successful_job_submission", - "failed_job_submission", - ], -) -def test_job_combined_status( - mock_proxy, - lava_job_submitter, - validate_only, - finished_job_status, - job_exit_code, - expected_combined_status, -): - lava_job_submitter.validate_only = validate_only - - with patch( - "lava.lava_job_submitter.retriable_follow_job" - ) as mock_retriable_follow_job, patch( - "lava.lava_job_submitter.LAVAJobSubmitter._LAVAJobSubmitter__prepare_submission" - ) as mock_prepare_submission, patch("sys.exit"): - from lava.lava_job_submitter import STRUCTURAL_LOG - - mock_retriable_follow_job.return_value = MagicMock( - status=finished_job_status, exit_code=job_exit_code - ) - - mock_job_definition = MagicMock(spec=str) - mock_prepare_submission.return_value = mock_job_definition - original_status: str = STRUCTURAL_LOG.get("job_combined_status") - original_exit_code: int = STRUCTURAL_LOG.get("job_exit_code") - - if validate_only: - lava_job_submitter.submit() - mock_retriable_follow_job.assert_not_called() - assert STRUCTURAL_LOG.get("job_combined_status") == original_status - assert STRUCTURAL_LOG.get("job_exit_code") == original_exit_code - return - - try: - lava_job_submitter.submit() - - except SystemExit as e: - assert e.code == job_exit_code - - assert STRUCTURAL_LOG["job_combined_status"] == expected_combined_status - assert STRUCTURAL_LOG["job_exit_code"] == job_exit_code - - -SUBMIT_SCENARIOS = { - "submit job pass": (cycle(mock_logs(result="pass", exit_code=0)), does_not_raise(), 0), - "submit job fails": ( - cycle(mock_logs(result="fail", exit_code=1)), - pytest.raises(SystemExit), - 1, - ), - "user interrupts the script": ( - (jobs_logs_response(), KeyboardInterrupt, jobs_logs_response()), - pytest.raises(SystemExit), - 1, - ), - "job finishes without hwci response": ( - (jobs_logs_response(), jobs_logs_response()), - pytest.raises(SystemExit), - 1, - ), -} - - -@pytest.mark.parametrize( - "test_log, expectation, exit_code", - SUBMIT_SCENARIOS.values(), - ids=SUBMIT_SCENARIOS.keys(), -) -def test_submission_exit_code( - request, mock_proxy, lava_job_submitter, test_log, expectation, exit_code -): - lava_job_submitter._LAVAJobSubmitter__prepare_submission = MagicMock() - proxy = mock_proxy(side_effect=test_log) - lava_job_submitter.proxy = proxy - - with expectation as e: - lava_job_submitter.submit() - # If the job fails, there should be a SystemExit exception - if e: - assert e.value.code == exit_code diff --git a/.gitlab-ci/lava/tests/utils/__init__.py b/.gitlab-ci/lava/tests/utils/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/.gitlab-ci/lava/tests/utils/test_lava_farm.py b/.gitlab-ci/lava/tests/utils/test_lava_farm.py deleted file mode 100644 index 02f2cc8f8ef..00000000000 --- a/.gitlab-ci/lava/tests/utils/test_lava_farm.py +++ /dev/null @@ -1,6 +0,0 @@ -from lava.utils.lava_farm import get_lava_farm - - -def test_get_lava_farm_no_tag(monkeypatch): - monkeypatch.delenv("FARM", raising=False) - assert get_lava_farm() == "unknown" diff --git a/.gitlab-ci/lava/tests/utils/test_lava_job_definition.py b/.gitlab-ci/lava/tests/utils/test_lava_job_definition.py deleted file mode 100644 index 269bb2cae4c..00000000000 --- a/.gitlab-ci/lava/tests/utils/test_lava_job_definition.py +++ /dev/null @@ -1,322 +0,0 @@ -import importlib -import os -import re -import subprocess -from itertools import chain -from pathlib import Path -from typing import Any, Iterable, Literal -from unittest import mock - -import lava.utils.constants -import pytest -from lava.lava_job_submitter import LAVAJobSubmitter -from lava.utils.lava_job_definition import LAVAJobDefinition -from ruamel.yaml import YAML - - -def flatten(iterable: Iterable[Iterable[Any]]) -> list[Any]: - return list(chain.from_iterable(iterable)) - - -# mock shell file -@pytest.fixture(scope="session") -def shell_file(tmp_path_factory): - def create_shell_file(content: str = "# test"): - shell_file = tmp_path_factory.mktemp("data") / "shell_file.sh" - shell_file.write_text(content) - return shell_file - - return create_shell_file - - -# fn to load the data file from $CWD/data using pathlib -def load_data_file(filename): - return Path(__file__).parent.parent / "data" / filename - - -def load_yaml_file(filename) -> dict: - with open(load_data_file(filename)) as f: - return YAML().load(f) - - -def job_submitter_factory(mode: Literal["UBOOT", "FASTBOOT"], shell_file): - if mode == "UBOOT": - boot_method = "u-boot" - device_type = "my_uboot_device_type" - elif mode == "FASTBOOT": - boot_method = "fastboot" - device_type = "my_fastboot_device_type" - - shell_prompt = "lava-shell:" - job_timeout_min = 10 - mesa_job_name = "dut test" - pipeline_info = "my_pipeline_info" - project_name = "test-project" - visibility_group = "my_visibility_group" - - return LAVAJobSubmitter( - boot_method=boot_method, - shell_prompt=shell_prompt, - device_type=device_type, - farm="test_farm", - dtb_filename="my_dtb_filename", - env_file=shell_file, - job_timeout_min=job_timeout_min, - mesa_job_name=mesa_job_name, - pipeline_info=pipeline_info, - visibility_group=visibility_group, - project_dir="/test_dir", - project_name=project_name, - ) - - -@pytest.fixture -def clear_env_vars(autouse=True): - with mock.patch.dict(os.environ) as environ: - # Remove all LAVA-related environment variables to make the test more robust - # and deterministic, once a envvar is capable of overriding the default value - for key in environ: - if any(kw in key for kw in ("LAVA_", "CI_", "JOB_", "RUNNER_", "DEVICE_")): - del environ[key] - # reload lava.utils.constants to update the JOB_PRIORITY value - importlib.reload(lava.utils.constants) - importlib.reload(lava.utils.lava_job_definition) - yield - - -@pytest.fixture -def mock_collabora_farm(clear_env_vars, monkeypatch): - # Mock a Chromebook in the Collabora farm - monkeypatch.setenv("FARM", "collabora") - monkeypatch.setenv("BOOT_METHOD", "depthcharge") - - -@pytest.mark.parametrize("force_uart", [True, False], ids=["SSH", "UART"]) -@pytest.mark.parametrize("mode", ["UBOOT", "FASTBOOT"]) -@mock.patch("lava.lava_job_submitter.setup_lava_proxy") -def test_generate_lava_job_definition_sanity( - mock_lava_proxy, - force_uart, - mode, - shell_file, - mock_collabora_farm, - monkeypatch, - mock_proxy, -): - monkeypatch.setattr(lava.utils.lava_job_definition, "FORCE_UART", force_uart) - # Do not actually connect to the LAVA server - mock_lava_proxy.return_value = mock_proxy - - farm_env = "FARM=test_farm" - init_script = "/install/common/init-stage1.sh" - job_submitter = job_submitter_factory(mode, shell_file(init_script)) - job_definition = LAVAJobDefinition(job_submitter).generate_lava_job_definition() - - # Load the YAML output and check that it contains the expected keys and values - yaml = YAML() - job_dict = yaml.load(job_definition) - yaml.dump(job_dict, Path(f"/tmp/{mode}_force_uart={force_uart}_job_definition.yaml")) - assert job_dict["device_type"] == job_submitter.device_type - assert job_dict["visibility"]["group"] == [job_submitter.visibility_group] - assert job_dict["timeouts"]["job"]["minutes"] == job_submitter.job_timeout_min - assert job_dict["context"]["extra_nfsroot_args"] - assert job_dict["timeouts"]["actions"] - - assert len(job_dict["actions"]) == 3 if mode == "UART" else 5 - - last_test_action = job_dict["actions"][-1]["test"] - # TODO: Remove hardcoded "mesa" test name, as this submitter is being used by other projects - first_test_name = last_test_action["definitions"][0]["name"] - is_running_ssh = "ssh" in first_test_name - # if force_uart, is_ssh must be False. If is_ssh, force_uart must be False. Both can be False - assert not (is_running_ssh and force_uart) - assert last_test_action["failure_retry"] == 3 if is_running_ssh else 1 - - run_steps = "".join(last_test_action["definitions"][0]["repository"]["run"]["steps"]) - # Check for project name in lava-test-case - assert re.search(rf"lava.?\S*.test.case.*{job_submitter.project_name}", run_steps) - - action_names = flatten(j.keys() for j in job_dict["actions"]) - if is_running_ssh: - assert action_names == ( - [ - "deploy", - "boot", - "test", # DUT: SSH server - "test", # Docker: SSH client - ] - if mode == "UBOOT" - else [ - "deploy", # NFS - "deploy", # Image generation - "deploy", # Image deployment - "boot", - "test", # DUT: SSH server - "test", # Docker: SSH client - ] - ) - test_action_server = job_dict["actions"][-2]["test"] - # SSH server in the DUT - assert test_action_server["namespace"] == "dut" - # SSH client via docker - assert last_test_action["namespace"] == "container" - - boot_action = next(a["boot"] for a in job_dict["actions"] if "boot" in a) - assert boot_action["namespace"] == "dut" - - # SSH server bootstrapping - assert "dropbear" in "".join(boot_action["auto_login"]["login_commands"]) - return - - # ---- Not SSH job - assert action_names == ( - [ - "deploy", - "boot", - "test", - ] - if mode == "UBOOT" - else [ - "deploy", # NFS - "deploy", # Image generation - "deploy", # Image deployment - "boot", - "test", - ] - ) - assert farm_env in run_steps - assert init_script in run_steps - - -# use yaml files from tests/data/ to test the job definition generation -@pytest.mark.parametrize("force_uart", [False, True], ids=["SSH", "UART"]) -@pytest.mark.parametrize("mode", ["UBOOT", "FASTBOOT"]) -@mock.patch("lava.lava_job_submitter.setup_lava_proxy") -def test_lava_job_definition( - mock_lava_proxy, - mode, - force_uart, - shell_file, - mock_collabora_farm, - mock_proxy, - monkeypatch, -): - monkeypatch.setattr(lava.utils.lava_job_definition, "FORCE_UART", force_uart) - # Do not actually connect to the LAVA server - mock_lava_proxy.return_value = mock_proxy - - yaml = YAML() - yaml.default_flow_style = False - - # Load the YAML output and check that it contains the expected keys and values - expected_job_dict = load_yaml_file(f"{mode}_force_uart={force_uart}_job_definition.yaml") - - init_script = f"FARM=test_farm /test_dir/install/common/init-stage1.sh" - job_submitter = job_submitter_factory(mode, shell_file(init_script)) - job_definition = LAVAJobDefinition(job_submitter).generate_lava_job_definition() - - job_dict = yaml.load(job_definition) - - # Uncomment the following to update the expected YAML files - # yaml.dump(job_dict, load_data_file(f"{mode}_force_uart={force_uart}_job_definition.yaml")) - - # Check that the generated job definition matches the expected one - assert job_dict == expected_job_dict - - -@pytest.mark.parametrize( - "directive", - ["declare -x", "export"], -) -@pytest.mark.parametrize( - "original_env_output", - [ - # Test basic environment variables - "FOO=bar\nBAZ=qux", - # Test export statements - "{directive} FOO=bar", - # Test multiple exports - "{directive} FOO=bar\n{directive} BAZ=qux\nNORM=val", - # Test mixed content with export - "{directive} FOO=bar\nBAZ=qux\n{directive} HELLO=world", - # Test empty file - "", - # Test special characters that need shell quoting - "FOO='bar baz'\nQUOTE=\"hello world\"", - # Test variables with spaces and quotes - "{directive} VAR='val spaces'\nQUOTES=\"test\"", - # Test inline scripts with export - "{directive} FOO=bar\nBAZ=qux\n{directive} HELLO=world", - # Test single quote inside double quotes in variable - "{directive} FOO='Revert \"commit's error\"'", - # Test backticks in variable - "{directive} FOO=`echo 'test'`", - ], - ids=[ - "basic_vars", - "single_export", - "multiple_exports", - "mixed_exports", - "empty_file", - "special_chars", - "spaces_and_quotes", - "inline_scripts_with_export", - "single_quote_in_var", - "backticks", - ] -) -def test_encode_job_env_vars(directive, original_env_output, shell_file, clear_env_vars): - """Test the encode_job_env_vars function with various environment file contents.""" - import base64 - import shlex - - # Create environment file with test content - original_env_output = original_env_output.format(directive=directive) - env_file = shell_file(original_env_output) - - # Create job submitter with the environment file - job_submitter = mock.MagicMock(spec=LAVAJobSubmitter, env_file=env_file) - job_definition = LAVAJobDefinition(job_submitter) - - # Call the function under test - result = job_definition.encode_job_env_vars() - - # Verify the result is a list with exactly one element - assert isinstance(result, list) - assert len(result) == 1 - - # Extract the command from the result - command = result[0] - assert isinstance(command, str) - - # Extract the base64 encoded part - start_marker = 'echo ' - end_marker = ' | base64 -d' - - start_idx = command.find(start_marker) + len(start_marker) - end_idx = command.find(end_marker) - redirect_idx = command.find(">") - encoded_part = command[start_idx:end_idx] - - # Verify if the script is executed correctly - env_script_process = subprocess.run( - ["bash", "-c", command[:redirect_idx]], capture_output=True, text=True - ) - - if env_script_process.returncode != 0: - pytest.fail(f"Failed to execute script: {env_script_process.stderr}") - - generated_env_output = env_script_process.stdout.strip() - - # The encoded part should be shell-quoted, so we need to parse it - # Use shlex to unquote the encoded content - unquoted_encoded = shlex.split(encoded_part)[0] - - # Decode the base64 content - try: - decoded_content = base64.b64decode(unquoted_encoded).decode() - except Exception as e: - pytest.fail(f"Failed to decode base64 content: {e}. Encoded part: {encoded_part}") - - # Verify the decoded content matches the original file content - assert decoded_content == original_env_output == generated_env_output diff --git a/.gitlab-ci/lava/tests/utils/test_lava_log.py b/.gitlab-ci/lava/tests/utils/test_lava_log.py deleted file mode 100644 index 7554de6a7a3..00000000000 --- a/.gitlab-ci/lava/tests/utils/test_lava_log.py +++ /dev/null @@ -1,502 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright (C) 2022 Collabora Limited -# Author: Guilherme Gallo -# -# SPDX-License-Identifier: MIT - -from datetime import UTC, datetime, timedelta - -import pytest - -from lava.exceptions import MesaCIKnownIssueException, MesaCITimeoutError -from lava.utils import ( - GitlabSection, - LogFollower, - LogSectionType, - fix_lava_gitlab_section_log, - hide_sensitive_data, -) -from lava.utils.constants import ( - KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER, - A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN, - A6XX_GPU_RECOVERY_FAILURE_MESSAGE, - A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT, -) -from lava.utils.lava_log_hints import LAVALogHints -from ..lava.helpers import ( - create_lava_yaml_msg, - does_not_raise, - lava_yaml, - mock_lava_signal, - yaml_dump, -) - -GITLAB_SECTION_SCENARIOS = { - "start collapsed": ( - "start", - True, - f"\x1b[0Ksection_start:mock_date:my_first_section[collapsed=true]\r\x1b[0K" - f"{GitlabSection.colour}my_header\x1b[0m", - ), - "start non_collapsed": ( - "start", - False, - f"\x1b[0Ksection_start:mock_date:my_first_section\r\x1b[0K" - f"{GitlabSection.colour}my_header\x1b[0m", - ), - "end collapsed": ( - "end", - True, - "\x1b[0Ksection_end:mock_date:my_first_section\r\x1b[0K", - ), - "end non_collapsed": ( - "end", - False, - "\x1b[0Ksection_end:mock_date:my_first_section\r\x1b[0K", - ), -} - - -@pytest.mark.parametrize( - "method, collapsed, expectation", - GITLAB_SECTION_SCENARIOS.values(), - ids=GITLAB_SECTION_SCENARIOS.keys(), -) -def test_gitlab_section(method, collapsed, expectation): - gs = GitlabSection( - id="my_first_section", - header="my_header", - type=LogSectionType.TEST_CASE, - start_collapsed=collapsed, - ) - gs.get_timestamp = lambda mock_date: "mock_date" - gs.start() - result = getattr(gs, method)() - assert result == expectation - - -def test_gl_sections(): - lines = [ - { - "dt": datetime.now(tz=UTC), - "lvl": "debug", - "msg": "Received signal: 0_setup-ssh-server 10145749_1.3.2.3.1", - }, - { - "dt": datetime.now(tz=UTC), - "lvl": "debug", - "msg": "Received signal: 0_mesa 5971831_1.3.2.3.1", - }, - # Redundant log message which triggers the same Gitlab Section, it - # should be ignored, unless the id is different - { - "dt": datetime.now(tz=UTC), - "lvl": "target", - "msg": "[ 7.778836] ", - }, - { - "dt": datetime.now(tz=UTC), - "lvl": "debug", - "msg": "Received signal: mesa-ci_iris-kbl-traces", - }, - # Another redundant log message - { - "dt": datetime.now(tz=UTC), - "lvl": "target", - "msg": "[ 16.997829] ", - }, - { - "dt": datetime.now(tz=UTC), - "lvl": "target", - "msg": "", - }, - ] - lf = LogFollower() - with lf: - for line in lines: - lf.manage_gl_sections(line) - parsed_lines = lf.flush() - - section_types = [s.type for s in lf.section_history] - - assert "section_start" in parsed_lines[0] - assert "collapsed=true" in parsed_lines[0] - assert "section_end" in parsed_lines[1] - assert "section_start" in parsed_lines[2] - assert "collapsed=true" in parsed_lines[2] - assert "section_end" in parsed_lines[3] - assert "section_start" in parsed_lines[4] - assert "collapsed=true" in parsed_lines[4] - assert section_types == [ - # LogSectionType.LAVA_BOOT, True, if LogFollower started with Boot section - LogSectionType.TEST_SUITE, - LogSectionType.TEST_CASE, - LogSectionType.LAVA_POST_PROCESSING, - ] - - -def test_log_follower_flush(): - lines = [ - { - "dt": datetime.now(tz=UTC), - "lvl": "debug", - "msg": "Received signal: mesa-ci_iris-kbl-traces", - }, - { - "dt": datetime.now(tz=UTC), - "lvl": "target", - "msg": "", - }, - ] - lf = LogFollower() - lf.feed(lines) - parsed_lines = lf.flush() - empty = lf.flush() - lf.feed(lines) - repeated_parsed_lines = lf.flush() - - assert parsed_lines - assert not empty - assert repeated_parsed_lines - - -SENSITIVE_DATA_SCENARIOS = { - "no sensitive data tagged": ( - ["bla bla", "mytoken: asdkfjsde1341=="], - ["bla bla", "mytoken: asdkfjsde1341=="], - ["HIDEME"], - ), - "sensitive data tagged": ( - ["bla bla", "mytoken: asdkfjsde1341== # HIDEME"], - ["bla bla"], - ["HIDEME"], - ), - "sensitive data tagged with custom word": ( - ["bla bla", "mytoken: asdkfjsde1341== # DELETETHISLINE", "third line # NOTANYMORE"], - ["bla bla", "third line # NOTANYMORE"], - ["DELETETHISLINE", "NOTANYMORE"], - ), -} - - -@pytest.mark.parametrize( - "input, expectation, tags", - SENSITIVE_DATA_SCENARIOS.values(), - ids=SENSITIVE_DATA_SCENARIOS.keys(), -) -def test_hide_sensitive_data(input, expectation, tags): - yaml_data = yaml_dump(input) - yaml_result = hide_sensitive_data(yaml_data, *tags) - result = lava_yaml.load(yaml_result) - - assert result == expectation - - -GITLAB_SECTION_SPLIT_SCENARIOS = { - "Split section_start at target level": ( - "\x1b[0Ksection_start:1668454947:test_post_process[collapsed=true]\r\x1b[0K" - "post-processing test results", - ( - "\x1b[0Ksection_start:1668454947:test_post_process[collapsed=true]", - "\x1b[0Kpost-processing test results", - ), - ), - "Split section_end at target level": ( - "\x1b[0Ksection_end:1666309222:test_post_process\r\x1b[0K", - ("\x1b[0Ksection_end:1666309222:test_post_process", "\x1b[0K"), - ), - "Second line is not split from the first": ( - ("\x1b[0Ksection_end:1666309222:test_post_process", "Any message"), - ("\x1b[0Ksection_end:1666309222:test_post_process", "Any message"), - ), -} - - -@pytest.mark.parametrize( - "expected_message, messages", - GITLAB_SECTION_SPLIT_SCENARIOS.values(), - ids=GITLAB_SECTION_SPLIT_SCENARIOS.keys(), -) -def test_fix_lava_gitlab_section_log(expected_message, messages): - fixed_messages = [] - gen = fix_lava_gitlab_section_log() - next(gen) - - for message in messages: - lava_log = create_lava_yaml_msg(msg=message, lvl="target") - if recovered_line := gen.send(lava_log): - fixed_messages.append((recovered_line, lava_log["msg"])) - fixed_messages.append(lava_log["msg"]) - - assert expected_message in fixed_messages - - -@pytest.mark.parametrize( - "expected_message, messages", - GITLAB_SECTION_SPLIT_SCENARIOS.values(), - ids=GITLAB_SECTION_SPLIT_SCENARIOS.keys(), -) -def test_lava_gitlab_section_log_collabora(expected_message, messages, monkeypatch): - """Check if LogFollower does not change the message if we are running in Collabora farm.""" - monkeypatch.setenv("RUNNER_TAG", "mesa-ci-x86_64-lava-test") - lf = LogFollower() - for message in messages: - lf.feed([create_lava_yaml_msg(msg=message)]) - new_messages = lf.flush() - new_messages = tuple(new_messages) if len(new_messages) > 1 else new_messages[0] - assert new_messages == expected_message - - -CARRIAGE_RETURN_SCENARIOS = { - "Carriage return at the end of the previous line": ( - ( - "\x1b[0Ksection_start:1677609903:test_setup[collapsed=true]\r\x1b[0K\x1b[0;36m[303:44] " - "deqp: preparing test setup\x1b[0m", - ), - ( - "\x1b[0Ksection_start:1677609903:test_setup[collapsed=true]\r", - "\x1b[0K\x1b[0;36m[303:44] deqp: preparing test setup\x1b[0m\r\n", - ), - ), - "Newline at the end of the line": ( - ("\x1b[0K\x1b[0;36m[303:44] deqp: preparing test setup\x1b[0m", "log"), - ("\x1b[0K\x1b[0;36m[303:44] deqp: preparing test setup\x1b[0m\r\n", "log"), - ), -} - - -@pytest.mark.parametrize( - "expected_message, messages", - CARRIAGE_RETURN_SCENARIOS.values(), - ids=CARRIAGE_RETURN_SCENARIOS.keys(), -) -def test_lava_log_merge_carriage_return_lines(expected_message, messages): - lf = LogFollower() - for message in messages: - lf.feed([create_lava_yaml_msg(msg=message)]) - new_messages = tuple(lf.flush()) - assert new_messages == expected_message - - -WATCHDOG_SCENARIOS = { - "1 second before timeout": ({"seconds": -1}, does_not_raise()), - "1 second after timeout": ({"seconds": 1}, pytest.raises(MesaCITimeoutError)), -} - - -@pytest.mark.parametrize( - "timedelta_kwargs, exception", - WATCHDOG_SCENARIOS.values(), - ids=WATCHDOG_SCENARIOS.keys(), -) -def test_log_follower_watchdog(frozen_time, timedelta_kwargs, exception): - lines = [ - { - "dt": datetime.now(tz=UTC), - "lvl": "debug", - "msg": "Received signal: mesa-ci_iris-kbl-traces", - }, - ] - td = {LogSectionType.TEST_CASE: timedelta(minutes=1)} - lf = LogFollower(timeout_durations=td) - lf.feed(lines) - frozen_time.tick( - lf.timeout_durations[LogSectionType.TEST_CASE] + timedelta(**timedelta_kwargs) - ) - lines = [create_lava_yaml_msg()] - with exception: - lf.feed(lines) - - -GITLAB_SECTION_ID_SCENARIOS = [ - ("a-good_name", "a-good_name"), - ("spaces are not welcome", "spaces-are-not-welcome"), - ("abc:amd64 1/3", "abc-amd64-1-3"), -] - - -@pytest.mark.parametrize("case_name, expected_id", GITLAB_SECTION_ID_SCENARIOS) -def test_gitlab_section_id(case_name, expected_id): - gl = GitlabSection( - id=case_name, header=case_name, type=LogSectionType.LAVA_POST_PROCESSING - ) - - assert gl.id == expected_id - - -def a618_network_issue_logs(level: str = "target") -> list: - net_error = create_lava_yaml_msg( - msg="[ 1733.599402] r8152 2-1.3:1.0 eth0: Tx status -71", lvl=level) - - nfs_error = create_lava_yaml_msg( - msg="[ 1733.604506] nfs: server 192.168.201.1 not responding, still trying", - lvl=level, - ) - - return [ - *(KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER*[net_error]), - nfs_error - ] - - -TEST_PHASE_LAVA_SIGNAL = mock_lava_signal(LogSectionType.TEST_CASE) -A618_NET_ISSUE_BOOT = a618_network_issue_logs(level="feedback") -A618_NET_ISSUE_TEST = [TEST_PHASE_LAVA_SIGNAL, *a618_network_issue_logs(level="target")] - - -A618_NETWORK_ISSUE_SCENARIOS = { - "Fail - R8152 kmsg during boot phase": ( - A618_NET_ISSUE_BOOT, - pytest.raises(MesaCIKnownIssueException), - ), - "Fail - R8152 kmsg during test phase": ( - A618_NET_ISSUE_TEST, - pytest.raises(MesaCIKnownIssueException), - ), - "Pass - Partial (1) R8152 kmsg during test phase": ( - A618_NET_ISSUE_TEST[:1], - does_not_raise(), - ), - "Pass - Partial (2) R8152 kmsg during test phase": ( - A618_NET_ISSUE_TEST[:2], - does_not_raise(), - ), - "Pass - Partial (3) subsequent R8152 kmsg during test phase": ( - [ - TEST_PHASE_LAVA_SIGNAL, - A618_NET_ISSUE_TEST[1], - A618_NET_ISSUE_TEST[1], - ], - does_not_raise(), - ), - "Pass - Partial (4) subsequent nfs kmsg during test phase": ( - [ - TEST_PHASE_LAVA_SIGNAL, - A618_NET_ISSUE_TEST[-1], - A618_NET_ISSUE_TEST[-1], - ], - does_not_raise(), - ), -} - - -@pytest.mark.parametrize( - "messages, expectation", - A618_NETWORK_ISSUE_SCENARIOS.values(), - ids=A618_NETWORK_ISSUE_SCENARIOS.keys(), -) -def test_detect_failure(messages, expectation): - boot_section = GitlabSection( - id="dut_boot", - header="Booting hardware device", - type=LogSectionType.LAVA_BOOT, - start_collapsed=True, - ) - boot_section.start() - lf = LogFollower(starting_section=boot_section) - with expectation: - lf.feed(messages) - - -def test_detect_a6xx_gpu_recovery_failure(frozen_time): - log_follower = LogFollower() - lava_log_hints = LAVALogHints(log_follower=log_follower) - failure_message = { - "dt": datetime.now(tz=UTC).isoformat(), - "msg": A6XX_GPU_RECOVERY_FAILURE_MESSAGE[0], - "lvl": "feedback", - } - with pytest.raises(MesaCIKnownIssueException): - for _ in range(A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT): - lava_log_hints.detect_a6xx_gpu_recovery_failure(failure_message) - # Simulate the passage of time within the watch period - frozen_time.tick(1) - failure_message["dt"] = datetime.now(tz=UTC).isoformat() - - -def test_detect_a6xx_gpu_recovery_success(frozen_time): - log_follower = LogFollower() - lava_log_hints = LAVALogHints(log_follower=log_follower) - failure_message = { - "dt": datetime.now(tz=UTC).isoformat(), - "msg": A6XX_GPU_RECOVERY_FAILURE_MESSAGE[0], - "lvl": "feedback", - } - # Simulate sending a tolerable number of failure messages - for _ in range(A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT - 1): - lava_log_hints.detect_a6xx_gpu_recovery_failure(failure_message) - frozen_time.tick(1) - failure_message["dt"] = datetime.now(tz=UTC).isoformat() - - # Simulate the passage of time outside of the watch period - frozen_time.tick(60 * A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN + 1) - failure_message = { - "dt": datetime.now(tz=UTC).isoformat(), - "msg": A6XX_GPU_RECOVERY_FAILURE_MESSAGE[1], - "lvl": "feedback", - } - with does_not_raise(): - lava_log_hints.detect_a6xx_gpu_recovery_failure(failure_message) - assert lava_log_hints.a6xx_gpu_first_fail_time is None, ( - "a6xx_gpu_first_fail_time is not None" - ) - assert lava_log_hints.a6xx_gpu_recovery_fail_counter == 0, ( - "a6xx_gpu_recovery_fail_counter is not 0" - ) - - -@pytest.mark.parametrize( - "start_offset", - [ - timedelta(hours=0), - timedelta(hours=1), - ], - ids=["equal timestamps", "negative delta"], -) -def test_gitlab_section_relative_time_clamping(start_offset): - """Test that delta time is clamped to zero if start_time <= timestamp_relative_to.""" - now = datetime.now(tz=UTC) - timestamp_relative_to = now + start_offset - gs = GitlabSection( - id="clamp_section", - header=f"clamp_section header {start_offset}", - type=LogSectionType.TEST_CASE, - timestamp_relative_to=timestamp_relative_to, - ) - gs.start() - output = gs.print_start_section() - assert "[00:00]" in output, f"Expected clamped relative time, got: {output}" - - -@pytest.mark.parametrize( - "delta_seconds,expected_seconds", - [ - (-5, 0), # Negative delta should be clamped to 0 - (0, 0), # Zero delta should remain 0 - (5, 5), # Positive delta should remain unchanged - ], - ids=["negative delta", "zero delta", "positive delta"], -) -def test_gitlab_section_delta_time(frozen_time, delta_seconds, expected_seconds): - """Test that delta_time() properly clamps negative deltas to zero.""" - gs = GitlabSection( - id="delta_section", - header=f"delta_section header {delta_seconds}", - type=LogSectionType.TEST_CASE, - ) - - with gs: - frozen_time.tick(delta_seconds) - - # Test internal _delta_time() returns exact delta - internal_delta = gs._delta_time() - assert internal_delta == timedelta(seconds=delta_seconds), ( - f"_delta_time() returned {internal_delta}, expected {timedelta(seconds=delta_seconds)}" - ) - - # Test public delta_time() returns clamped delta - clamped_delta = gs.delta_time() - assert clamped_delta == timedelta(seconds=expected_seconds), ( - f"delta_time() returned {clamped_delta}, expected {timedelta(seconds=expected_seconds)}" - ) diff --git a/.gitlab-ci/lava/utils/__init__.py b/.gitlab-ci/lava/utils/__init__.py deleted file mode 100644 index 52b4bee4fbd..00000000000 --- a/.gitlab-ci/lava/utils/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG - -from .console_format import CONSOLE_LOG -from .gitlab_section import GitlabSection -from .lava_job import LAVAJob -from .lava_job_definition import LAVAJobDefinition -from .lava_proxy import call_proxy, setup_lava_proxy -from .log_follower import ( - LogFollower, - fatal_err, - fix_lava_gitlab_section_log, - hide_sensitive_data, - print_log, -) -from .log_section import ( - DEFAULT_GITLAB_SECTION_TIMEOUTS, - FALLBACK_GITLAB_SECTION_TIMEOUT, - LogSection, - LogSectionType, - CI_JOB_TIMEOUT_MIN, - LAVA_TEST_OVERHEAD_MIN, - LAVA_TEST_CASE_TIMEOUT, - LAVA_TEST_SUITE_TIMEOUT, -) diff --git a/.gitlab-ci/lava/utils/console_format.py b/.gitlab-ci/lava/utils/console_format.py deleted file mode 100644 index 9f7458f3fc3..00000000000 --- a/.gitlab-ci/lava/utils/console_format.py +++ /dev/null @@ -1,19 +0,0 @@ -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG - -CONSOLE_LOG = { - "FG_GREEN": "\x1b[0;32m", - "FG_BOLD_GREEN": "\x1b[0;1;32m", - "FG_RED": "\x1b[0;38;5;197m", - "FG_BOLD_RED": "\x1b[0;1;38;5;197m", - "FG_YELLOW": "\x1b[0;33m", - "FG_BOLD_YELLOW": "\x1b[0;1;33m", - "FG_MAGENTA": "\x1b[0;35m", - "FG_BOLD_MAGENTA": "\x1b[0;1;35m", - "FG_CYAN": "\x1b[0;36m", - "RESET": "\x1b[0m", - "UNDERLINED": "\x1b[3m", - "BOLD": "\x1b[1m", - "DIM": "\x1b[2m", -} diff --git a/.gitlab-ci/lava/utils/constants.py b/.gitlab-ci/lava/utils/constants.py deleted file mode 100644 index 8b31ca8cb7c..00000000000 --- a/.gitlab-ci/lava/utils/constants.py +++ /dev/null @@ -1,39 +0,0 @@ -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG - -from os import getenv - -# How many attempts should be made when a timeout happen during LAVA device boot. -NUMBER_OF_ATTEMPTS_LAVA_BOOT = int(getenv("LAVA_NUMBER_OF_ATTEMPTS_LAVA_BOOT", 2)) -# The deploy involves downloading and decompressing the kernel, modules, dtb and the overlays. -# We should retry, to overcome network issues. -NUMBER_OF_ATTEMPTS_LAVA_DEPLOY = int(getenv("LAVA_NUMBER_OF_ATTEMPTS_LAVA_DEPLOY", 2)) - - -# Supports any integers in [0, 100]. -# The scheduler considers the job priority when ordering the queue -# to consider which job should run next. -JOB_PRIORITY = int(getenv("JOB_PRIORITY", 75)) - -# Use UART over the default SSH mechanism to follow logs. -# Caution: this can lead to device silence in some devices in Mesa CI. -FORCE_UART = bool(getenv("LAVA_FORCE_UART", False)) - -# How many times the r8152 error may happen to consider it a known issue. -KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER: int = 10 -KNOWN_ISSUE_R8152_PATTERNS: tuple[str, ...] = ( - r"r8152 \S+ eth0: Tx status -71", - r"nfs: server \d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} not responding, still trying", -) - -# This is considered noise, since LAVA produces this log after receiving a package of feedback -# messages. -LOG_DEBUG_FEEDBACK_NOISE = "Listened to connection for namespace 'dut' done" - -A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN = 3 -A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT = 30 -A6XX_GPU_RECOVERY_FAILURE_MESSAGE = ( - "cx gdsc didn't collapse", - "Timeout waiting for GMU OOB", -) diff --git a/.gitlab-ci/lava/utils/gitlab_section.py b/.gitlab-ci/lava/utils/gitlab_section.py deleted file mode 100644 index f4fea438750..00000000000 --- a/.gitlab-ci/lava/utils/gitlab_section.py +++ /dev/null @@ -1,157 +0,0 @@ -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG - -from __future__ import annotations - -import re -from dataclasses import dataclass, field -from datetime import datetime, timedelta, UTC -from math import floor -from typing import TYPE_CHECKING, Any, Optional - -from lava.utils.console_format import CONSOLE_LOG - -if TYPE_CHECKING: - from lava.utils.log_section import LogSectionType - - -# TODO: Add section final status to assist with monitoring -@dataclass -class GitlabSection: - id: str - header: str - type: LogSectionType - start_collapsed: bool = False - suppress_end: bool = False - suppress_start: bool = False - timestamp_relative_to: Optional[datetime] = None - escape: str = "\x1b[0K" - colour: str = f"{CONSOLE_LOG['FG_CYAN']}" - __start_time: Optional[datetime] = field(default=None, init=False) - __end_time: Optional[datetime] = field(default=None, init=False) - - @classmethod - def section_id_filter(cls, value: str) -> str: - return str(re.sub(r"[^\w_-]+", "-", value)) - - def __post_init__(self) -> None: - self.id = self.section_id_filter(self.id) - - @property - def has_started(self) -> bool: - return self.__start_time is not None - - @property - def has_finished(self) -> bool: - return self.__end_time is not None - - @property - def start_time(self) -> Optional[datetime]: - return self.__start_time - - @property - def end_time(self) -> Optional[datetime]: - return self.__end_time - - def get_timestamp(self, time: datetime) -> str: - unix_ts = datetime.timestamp(time) - return str(int(unix_ts)) - - def section(self, marker: str, header: str, time: datetime) -> str: - preamble = f"{self.escape}section_{marker}" - collapse = marker == "start" and self.start_collapsed - collapsed = "[collapsed=true]" if collapse else "" - section_id = f"{self.id}{collapsed}" - - timestamp = self.get_timestamp(time) - before_header = ":".join([preamble, timestamp, section_id]) - if self.timestamp_relative_to and self.start_time is not None: - delta = self.start_time - self.timestamp_relative_to - # time drift can occur because we are dealing with timestamps from different sources - # clamp the delta to 0 if it's negative - delta = max(delta, timedelta(seconds=0)) - reltime = f"[{floor(delta.seconds / 60):02}:{(delta.seconds % 60):02}] " - else: - reltime = "" - colored_header = f"{self.colour}{reltime}{header}\x1b[0m" if header else "" - header_wrapper = "\r" + f"{self.escape}{colored_header}" - - return f"{before_header}{header_wrapper}" - - def __str__(self) -> str: - status = "NS" if not self.has_started else "F" if self.has_finished else "IP" - delta = self.delta_time() - elapsed_time = "N/A" if delta is None else str(delta) - return ( - f"GitlabSection({self.id}, {self.header}, {self.type}, " - f"SC={self.start_collapsed}, S={status}, ST={self.start_time}, " - f"ET={self.end_time}, ET={elapsed_time})" - ) - - def __enter__(self) -> "GitlabSection": - if start_log_line := self.start(): - print(start_log_line) - return self - - def __exit__( - self, - *args: list[Any], - **kwargs: dict[str, Any], - ) -> None: - if end_log_line := self.end(): - print(end_log_line) - - def start(self) -> str: - assert not self.has_finished, "Starting an already finished section" - self.__start_time = datetime.now(tz=UTC) - return self.print_start_section() - - def print_start_section(self) -> str: - if self.suppress_start: - return "" - if self.__start_time is None: - raise RuntimeError("Start time is not set.") - return self.section(marker="start", header=self.header, time=self.__start_time) - - def end(self) -> str: - assert self.__start_time is not None, "Ending an uninitialized section" - self.__end_time = datetime.now(tz=UTC) - if self.__end_time < self.__start_time: - print( - CONSOLE_LOG["FG_YELLOW"] - + f"Warning: Section {self.id} ended before it started, clamping the delta time to 0" - + CONSOLE_LOG["RESET"] - ) - return self.print_end_section() - - def print_end_section(self) -> str: - if self.suppress_end: - return "" - if self.__end_time is None: - raise RuntimeError("End time is not set.") - return self.section(marker="end", header="", time=self.__end_time) - - def _delta_time(self) -> Optional[timedelta]: - """ - Return the delta time between the start and end of the section. - If the section has not ended, return the delta time between the start and now. - If the section has not started and not ended, return None. - """ - if self.__start_time is None: - return None - - if self.__end_time is None: - return datetime.now(tz=UTC) - self.__start_time - - return self.__end_time - self.__start_time - - def delta_time(self) -> Optional[timedelta]: - """ - Clamp the delta time to zero if it's negative, time drift can occur since we have timestamps - coming from GitLab jobs, LAVA dispatcher and DUTs. - """ - delta = self._delta_time() - if delta is None: - return None - return max(delta, timedelta(seconds=0)) diff --git a/.gitlab-ci/lava/utils/lava_farm.py b/.gitlab-ci/lava/utils/lava_farm.py deleted file mode 100644 index a4c49c138c5..00000000000 --- a/.gitlab-ci/lava/utils/lava_farm.py +++ /dev/null @@ -1,33 +0,0 @@ -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG - -import logging -import os - - -def get_lava_farm() -> str: - """ - Returns the LAVA farm based on the FARM environment variable. - - :return: The LAVA farm - """ - farm: str = os.getenv("FARM", "unknown") - - if farm == "unknown": - logging.warning("FARM environment variable is not set, using unknown") - - return farm.lower() - -def get_lava_boot_method() -> str: - """ - Returns the LAVA boot method based on the BOOT_METHOD environment variable. - - :return: The LAVA boot method - """ - boot_method: str = os.getenv("BOOT_METHOD", "unknown") - - if boot_method == "unknown": - logging.warning("BOOT_METHOD environment variable is not set, using unknown") - - return boot_method.lower() diff --git a/.gitlab-ci/lava/utils/lava_job.py b/.gitlab-ci/lava/utils/lava_job.py deleted file mode 100644 index 6b16f1e7376..00000000000 --- a/.gitlab-ci/lava/utils/lava_job.py +++ /dev/null @@ -1,213 +0,0 @@ -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG - -import re -import xmlrpc -from collections import defaultdict -from datetime import datetime, UTC -from typing import Any, Optional - -from lava.exceptions import ( - MesaCIException, - MesaCIRetriableException, - MesaCIKnownIssueException, - MesaCIParseException, - MesaCITimeoutError, -) -from lava.utils import CONSOLE_LOG -from lava.utils.log_follower import print_log -from lavacli.utils import flow_yaml as lava_yaml - -from .lava_proxy import call_proxy - - -class LAVAJob: - COLOR_STATUS_MAP: dict[str, str] = { - "pass": CONSOLE_LOG["FG_GREEN"], - "hung": CONSOLE_LOG["FG_BOLD_YELLOW"], - "fail": CONSOLE_LOG["FG_BOLD_RED"], - "canceled": CONSOLE_LOG["FG_BOLD_MAGENTA"], - } - - def __init__(self, proxy, definition, log=defaultdict(str)) -> None: - self._job_id = None - self.proxy = proxy - self.definition = definition - self.last_log_line = 0 - self.last_log_time = None - self._is_finished = False - self.log: dict[str, Any] = log - self.status = "not_submitted" - # Set the default exit code to 1 because we should set it to 0 only if the job has passed. - # If it fails or if it is interrupted, the exit code should be set to a non-zero value to - # make the GitLab job fail. - self._exit_code: int = 1 - self.__exception: Optional[Exception] = None - - def heartbeat(self) -> None: - self.last_log_time: datetime = datetime.now(tz=UTC) - self.status = "running" - - @property - def status(self) -> str: - return self._status - - @status.setter - def status(self, new_status: str) -> None: - self._status = new_status - self.log["status"] = self._status - - @property - def exit_code(self) -> int: - return self._exit_code - - @exit_code.setter - def exit_code(self, code: int) -> None: - self._exit_code = code - self.log["exit_code"] = self._exit_code - - @property - def job_id(self) -> int: - return self._job_id - - @job_id.setter - def job_id(self, new_id: int) -> None: - self._job_id = new_id - self.log["lava_job_id"] = self._job_id - - @property - def is_finished(self) -> bool: - return self._is_finished - - @property - def exception(self) -> Optional[Exception]: - return self.__exception - - @exception.setter - def exception(self, exception: Exception) -> None: - self.__exception = exception - self.log["dut_job_fail_reason"] = repr(self.__exception) - - def validate(self) -> Optional[dict]: - """Returns a dict with errors, if the validation fails. - - Returns: - Optional[dict]: a dict with the validation errors, if any - """ - return call_proxy(self.proxy.scheduler.jobs.validate, self.definition, True) - - def show(self) -> dict[str, str]: - return call_proxy(self.proxy.scheduler.jobs.show, self._job_id) - - def get_lava_time(self, key, data) -> Optional[str]: - return data[key].value if data[key] else None - - def refresh_log(self) -> None: - details = self.show() - self.log["dut_start_time"] = self.get_lava_time("start_time", details) - self.log["dut_submit_time"] = self.get_lava_time("submit_time", details) - self.log["dut_end_time"] = self.get_lava_time("end_time", details) - self.log["dut_name"] = details.get("device") - self.log["dut_state"] = details.get("state") - - def submit(self) -> bool: - try: - self.job_id = call_proxy(self.proxy.scheduler.jobs.submit, self.definition) - self.status = "submitted" - self.refresh_log() - except MesaCIException: - return False - return True - - def lava_state(self) -> str: - job_state: dict[str, str] = call_proxy( - self.proxy.scheduler.job_state, self._job_id - ) - return job_state["job_state"] - - def cancel(self): - if self._job_id: - self.proxy.scheduler.jobs.cancel(self._job_id) - # If we don't have yet set another job's status, let's update it - # with canceled one - if self.status == "running": - self.status = "canceled" - - def is_started(self) -> bool: - waiting_states = ("Submitted", "Scheduling", "Scheduled") - return self.lava_state() not in waiting_states - - def is_post_processed(self) -> bool: - return self.lava_state() != "Running" - - def _load_log_from_data(self, data) -> list[str]: - lines = [] - if isinstance(data, xmlrpc.client.Binary): - # We are dealing with xmlrpc.client.Binary - # Let's extract the data - data = data.data - # When there is no new log data, the YAML is empty - if loaded_lines := lava_yaml.load(data): - lines: list[str] = loaded_lines - self.last_log_line += len(lines) - return lines - - def get_logs(self) -> list[str]: - try: - (finished, data) = call_proxy( - self.proxy.scheduler.jobs.logs, self._job_id, self.last_log_line - ) - self._is_finished = finished - return self._load_log_from_data(data) - - except Exception as mesa_ci_err: - raise MesaCIParseException( - f"Could not get LAVA job logs. Reason: {mesa_ci_err}" - ) from mesa_ci_err - - def parse_job_result_from_log( - self, lava_lines: list[dict[str, str]] - ) -> list[dict[str, str]]: - """Use the console log to catch if the job has completed successfully or - not. Returns the list of log lines until the result line.""" - - last_line = None # Print all lines. lines[:None] == lines[:] - - for idx, line in enumerate(lava_lines): - if result := re.search(r"hwci: mesa: exit_code: (\d+)", line): - self._is_finished = True - self.exit_code = int(result.group(1)) - self.status = "pass" if self.exit_code == 0 else "fail" - - last_line = idx - # We reached the log end here. hwci script has finished. - break - return lava_lines[:last_line] - - def handle_exception(self, exception: Exception): - # Print the exception type and message - print_log(f"{type(exception).__name__}: {str(exception)}") - self.cancel() - self.exception = exception - - # Set the exit code to nonzero value - self.exit_code = 1 - - # Give more accurate status depending on exception - if isinstance(exception, MesaCIKnownIssueException): - self.status = "canceled" - elif isinstance(exception, MesaCITimeoutError): - self.status = "hung" - elif isinstance(exception, MesaCIRetriableException): - self.status = "failed" - elif isinstance(exception, KeyboardInterrupt): - self.status = "interrupted" - print_log("LAVA job submitter was interrupted. Cancelling the job.") - raise - elif isinstance(exception, MesaCIException): - self.status = "interrupted" - print_log("LAVA job submitter was interrupted. Cancelling the job.") - raise - else: - self.status = "job_submitter_error" diff --git a/.gitlab-ci/lava/utils/lava_job_definition.py b/.gitlab-ci/lava/utils/lava_job_definition.py deleted file mode 100644 index 711d030eac2..00000000000 --- a/.gitlab-ci/lava/utils/lava_job_definition.py +++ /dev/null @@ -1,284 +0,0 @@ -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG - -from io import StringIO -from typing import TYPE_CHECKING, Any -import base64 -import shlex - -from ruamel.yaml import YAML - -from os import getenv - -from lava.utils.lava_farm import get_lava_farm, get_lava_boot_method -from lava.utils.log_section import LAVA_DEPLOY_TIMEOUT -from lava.utils.ssh_job_definition import ( - generate_docker_test, - generate_dut_test, - wrap_boot_action, - wrap_final_deploy_action, -) -from lava.utils.uart_job_definition import ( - fastboot_boot_action, - fastboot_deploy_actions, - tftp_boot_action, - tftp_deploy_actions, - qemu_boot_action, - qemu_deploy_actions, - uart_test_actions, -) - -if TYPE_CHECKING: - from lava.lava_job_submitter import LAVAJobSubmitter - -from .constants import ( - FORCE_UART, - JOB_PRIORITY, - NUMBER_OF_ATTEMPTS_LAVA_BOOT, - NUMBER_OF_ATTEMPTS_LAVA_DEPLOY, -) - - -class LAVAJobDefinition: - """ - This class is responsible for generating the YAML payload to submit a LAVA - job. - """ - - def __init__(self, job_submitter: "LAVAJobSubmitter") -> None: - self.job_submitter: "LAVAJobSubmitter" = job_submitter - # NFS args provided by LAVA - self.lava_nfs_args: str = "root=/dev/nfs rw nfsroot=$NFS_SERVER_IP:$NFS_ROOTFS,tcp,hard,v3 ip=dhcp" - # extra_nfsroot_args appends to cmdline - self.extra_nfsroot_args: str = " init=/init rootwait usbcore.quirks=0bda:8153:k" - # Append LAVA_CMDLINE to extra_nfsroot_args - if lava_cmdline := getenv('LAVA_CMDLINE'): - self.extra_nfsroot_args += f" {lava_cmdline}" - - def has_ssh_support(self) -> bool: - if FORCE_UART: - return False - - current_farm = get_lava_farm() - boot_method = get_lava_boot_method() - - # Some Chromebooks have unreliable serial connections, so SSH is preferred. - # Only Collabora's farm supports running docker container as a LAVA actions, - # which is required to follow the job in an SSH section - # Chromebooks use the "depthcharge" boot method, so use SSH in that case, - # and UART for everything else. - return current_farm == "collabora" and boot_method == "depthcharge" - - def generate_lava_yaml_payload(self) -> dict[str, Any]: - """ - Generates a YAML payload for submitting a LAVA job, based on the provided arguments. - - Args: - None - - Returns: - a dictionary containing the values generated by the `generate_metadata` function and the - actions for the LAVA job submission. - """ - args = self.job_submitter - nfsrootfs = { - "url": f"{args.rootfs_url}", - "compression": "zstd", - "format": "tar", - "overlays": args._overlays, - } - values = self.generate_metadata() - - init_stage1_steps = self.init_stage1_steps() - jwt_steps = self.jwt_steps() - - deploy_actions = [] - boot_action = [] - test_actions = uart_test_actions(args, init_stage1_steps, jwt_steps) - - if args.boot_method == "fastboot": - deploy_actions = fastboot_deploy_actions(self, nfsrootfs) - boot_action = fastboot_boot_action(args) - elif args.boot_method == "qemu-nfs": - deploy_actions = qemu_deploy_actions(self, nfsrootfs) - boot_action = qemu_boot_action(args) - else: # tftp - deploy_actions = tftp_deploy_actions(self, nfsrootfs) - boot_action = tftp_boot_action(args) - - if self.has_ssh_support(): - wrap_final_deploy_action(deploy_actions[-1]) - # SSH jobs use namespaces to differentiate between the DUT and the - # docker container. Every LAVA action needs an explicit namespace, when we are not using - # the default one. - for deploy_action in deploy_actions: - deploy_action["namespace"] = "dut" - wrap_boot_action(boot_action) - test_actions = ( - generate_dut_test(args, init_stage1_steps), - generate_docker_test(args, jwt_steps), - ) - - values["actions"] = [ - *[{"deploy": d} for d in deploy_actions], - {"boot": boot_action}, - *[{"test": t} for t in test_actions], - ] - - return values - - def generate_lava_job_definition(self) -> str: - """ - Generates a LAVA job definition in YAML format and returns it as a string. - - Returns: - a string representation of the job definition generated by analysing job submitter - arguments and environment variables - """ - job_stream = StringIO() - yaml = YAML() - yaml.width = 4096 - yaml.dump(self.generate_lava_yaml_payload(), job_stream) - return job_stream.getvalue() - - def consume_lava_tags_args(self, values: dict[str, Any]): - # python-fire parses --lava-tags without arguments as True - if isinstance(self.job_submitter.lava_tags, tuple): - values["tags"] = self.job_submitter.lava_tags - # python-fire parses "tag-1,tag2" as str and "tag1,tag2" as tuple - # even if the -- --separator is something other than '-' - elif isinstance(self.job_submitter.lava_tags, str): - # Split string tags by comma, removing any trailing commas - values["tags"] = self.job_submitter.lava_tags.rstrip(",").split(",") - # Ensure tags are always a list of non-empty strings - if "tags" in values: - values["tags"] = [tag for tag in values["tags"] if tag] - # Remove empty tags - if "tags" in values and not values["tags"]: - del values["tags"] - - def generate_metadata(self) -> dict[str, Any]: - # General metadata and permissions - values = { - "job_name": f"{self.job_submitter.project_name}: {self.job_submitter.pipeline_info}", - "device_type": self.job_submitter.device_type, - "visibility": {"group": [self.job_submitter.visibility_group]}, - "priority": JOB_PRIORITY, - "context": {"extra_nfsroot_args": self.extra_nfsroot_args}, - "timeouts": { - "job": {"minutes": self.job_submitter.job_timeout_min}, - "actions": { - "depthcharge-retry": { - # Setting higher values here, to affect the subactions, namely - # `bootloader-commands` and `login-action` - # So this value can be higher than `depthcharge-action` timeout. - "minutes": 3 * NUMBER_OF_ATTEMPTS_LAVA_DEPLOY - }, - "depthcharge-action": { - # This timeout englobes the entire depthcharge timing, - # including retries - "minutes": LAVA_DEPLOY_TIMEOUT - }, - "uboot-action": { - # For rockchip DUTs, U-Boot auto-login action downloads the kernel and - # setup early network. This takes 72 seconds on average. - # The LAVA action that wraps it is `uboot-commands`, but we can't set a - # timeout for it directly, it is overridden by one third of `uboot-action` - # timeout. - # So actually, this timeout is here to enforce that `uboot-action` - # timeout to be 100 seconds (uboot-action timeout / - # NUMBER_OF_ATTEMPTS_LAVA_BOOT), which is more than enough. - "seconds": 100 * NUMBER_OF_ATTEMPTS_LAVA_BOOT - }, - }, - }, - } - - self.consume_lava_tags_args(values) - - # QEMU lava jobs mandate proper arch value in the context - if self.job_submitter.boot_method == "qemu-nfs": - values["context"]["arch"] = self.job_submitter.mesa_job_name.split(":")[1] - - return values - - def attach_kernel_and_dtb(self, deploy_field): - if self.job_submitter.kernel_image_type: - deploy_field["kernel"]["type"] = self.job_submitter.kernel_image_type - if self.job_submitter.dtb_filename: - deploy_field["dtb"] = { - "url": f"{self.job_submitter.kernel_url_prefix}/" - f"{self.job_submitter.dtb_filename}.dtb" - } - - def jwt_steps(self): - """ - This function is responsible for setting up the SSH server in the DUT and to - export the first boot environment to a file. - """ - # Pre-process the JWT - jwt_steps = [ - "set -e", - ] - - # If the JWT file is provided, we will use it to authenticate with the cloud - # storage provider and will hide it from the job output in Gitlab. - if self.job_submitter.jwt_file: - with open(self.job_submitter.jwt_file) as jwt_file: - jwt_steps += [ - "set +x # HIDE_START", - f'echo -n "{jwt_file.read()}" > "{self.job_submitter.jwt_file}"', - "set -x # HIDE_END", - f'echo "export S3_JWT_FILE={self.job_submitter.jwt_file}" >> /set-job-env-vars.sh', - ] - else: - jwt_steps += [ - "echo Could not find jwt file, disabling S3 requests...", - "sed -i '/S3_RESULTS_UPLOAD/d' /set-job-env-vars.sh", - ] - - return jwt_steps - - def encode_job_env_vars(self) -> list[str]: - steps = [] - with open(self.job_submitter.env_file, "rb") as f: - encoded = base64.b64encode(f.read()).decode() - safe_encoded = shlex.quote(encoded) - - steps += [ - f'echo {safe_encoded} | base64 -d >> /set-job-env-vars.sh', - ] - - return steps - - def init_stage1_steps(self) -> list[str]: - # job execution script: - # - source .gitlab-ci/common/init-stage1.sh - # - fetch and unpack per-pipeline build artifacts from build job - # - inline and encode per-job env from lava-submit.sh - # - exec .gitlab-ci/common/init-stage2.sh - run_steps = [ - f"FARM={self.job_submitter.farm} " - # We need to source the init-stage1.sh script, so that the environment - # variables including PWD are set in the current shell. - f". {self.job_submitter.project_dir}/install/common/init-stage1.sh" - ] - - # We cannot distribute the Adreno 660 shader firmware inside rootfs, - # since the license isn't bundled inside the repository - if self.job_submitter.device_type == "sm8350-hdk": - run_steps.append( - "mkdir -p /lib/firmware/qcom/sm8350 && " - + "curl -L --retry 4 -f --retry-all-errors --retry-delay 60 " - + "https://github.com/allahjasif1990/hdk888-firmware/raw/main/a660_zap.mbn " - + '-o "/lib/firmware/qcom/sm8350/a660_zap.mbn"' - ) - - # Forward environmental variables to the DUT - # base64-encoded to avoid YAML quoting issues - run_steps += self.encode_job_env_vars() - - run_steps.append("export CURRENT_SECTION=dut_boot") - - return run_steps diff --git a/.gitlab-ci/lava/utils/lava_log_hints.py b/.gitlab-ci/lava/utils/lava_log_hints.py deleted file mode 100644 index be6ff315875..00000000000 --- a/.gitlab-ci/lava/utils/lava_log_hints.py +++ /dev/null @@ -1,110 +0,0 @@ -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG - -from __future__ import annotations - -import re -from datetime import datetime, timedelta -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Optional, Sequence - -if TYPE_CHECKING: - from lava.utils import LogFollower - -from lava.exceptions import MesaCIKnownIssueException -from lava.utils.console_format import CONSOLE_LOG -from lava.utils.constants import ( - KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER, - LOG_DEBUG_FEEDBACK_NOISE, - KNOWN_ISSUE_R8152_PATTERNS, - A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN, - A6XX_GPU_RECOVERY_FAILURE_MESSAGE, - A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT, -) -from lava.utils.log_section import LogSectionType - - -def search_known_issue_patterns(patterns: Sequence[str], line: str) -> str: - for pattern in patterns: - if re.search(pattern, line): - return pattern - return "" - - -@dataclass -class LAVALogHints: - log_follower: LogFollower - r8152_issue_consecutive_counter: int = field(default=0, init=False) - reboot_counter: int = field(default=0, init=False) - a6xx_gpu_recovery_fail_counter: int = field(default=0, init=False) - a6xx_gpu_first_fail_time: Optional[datetime] = field(default=None, init=False) - - def raise_known_issue(self, message) -> None: - raise MesaCIKnownIssueException( - "Found known issue: " - f"{CONSOLE_LOG['FG_MAGENTA']}" - f"{message}" - f"{CONSOLE_LOG['RESET']}" - ) - - def detect_failure(self, new_lines: list[dict[str, Any]]): - for line in new_lines: - if line["msg"] == LOG_DEBUG_FEEDBACK_NOISE: - continue - self.detect_r8152_issue(line) - self.detect_forced_reboot(line) - self.detect_a6xx_gpu_recovery_failure(line) - - def detect_r8152_issue(self, line): - if self.log_follower.phase in ( - LogSectionType.LAVA_BOOT, - LogSectionType.TEST_CASE, - ) and line["lvl"] in ("feedback", "target"): - if search_known_issue_patterns(KNOWN_ISSUE_R8152_PATTERNS, line["msg"]): - if ( - self.r8152_issue_consecutive_counter - < KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER - ): - self.r8152_issue_consecutive_counter += 1 - return - - self.raise_known_issue( - "Probable network issue failure encountered, retrying the job" - ) - - # Reset the status, as the `nfs... still trying` complaint was not detected - self.r8152_issue_consecutive_counter = 0 - - def detect_forced_reboot(self, line: dict[str, Any]) -> None: - if ( - self.log_follower.phase == LogSectionType.TEST_CASE - and line["lvl"] == "feedback" - ): - if re.search(r"^Reboot requested", line["msg"]): - self.reboot_counter += 1 - - if self.reboot_counter > 0: - self.raise_known_issue( - "Forced reboot detected during test phase, failing the job..." - ) - - # If the a6xx gpu repeatedly fails to recover over a short period of time, - # then successful recovery is unlikely so cancel the job preemptively. - def detect_a6xx_gpu_recovery_failure(self, line: dict[str, Any]) -> None: - if search_known_issue_patterns(A6XX_GPU_RECOVERY_FAILURE_MESSAGE, line["msg"]): - time_of_failure = datetime.fromisoformat(line["dt"]) - self.a6xx_gpu_recovery_fail_counter += 1 - - if self.a6xx_gpu_first_fail_time is None: - self.a6xx_gpu_first_fail_time = time_of_failure - - if self.a6xx_gpu_recovery_fail_counter == A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT: - time_since_first_fail = time_of_failure - self.a6xx_gpu_first_fail_time - if time_since_first_fail <= timedelta(minutes=A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN): - self.raise_known_issue( - "Repeated GPU recovery failure detected: cancelling the job" - ) - else: - self.a6xx_gpu_first_fail_time = None - self.a6xx_gpu_recovery_fail_counter = 0 diff --git a/.gitlab-ci/lava/utils/lava_proxy.py b/.gitlab-ci/lava/utils/lava_proxy.py deleted file mode 100644 index a91736428c7..00000000000 --- a/.gitlab-ci/lava/utils/lava_proxy.py +++ /dev/null @@ -1,46 +0,0 @@ -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG - -import time -import traceback -import urllib -import urllib.parse -import xmlrpc -import xmlrpc.client - -import lavacli - -from .log_follower import fatal_err, print_log - - -def setup_lava_proxy(): - config = lavacli.load_config("default") - uri, usr, tok = (config.get(key) for key in ("uri", "username", "token")) - uri_obj = urllib.parse.urlparse(uri) - uri_str = f"{uri_obj.scheme}://{usr}:{tok}@{uri_obj.netloc}{uri_obj.path}" - transport = lavacli.RequestsTransport( - uri_obj.scheme, - config.get("proxy"), - config.get("timeout", 120.0), - config.get("verify_ssl_cert", True), - ) - proxy = xmlrpc.client.ServerProxy(uri_str, allow_none=True, transport=transport) - - return proxy - - -def call_proxy(fn, *args): - retries = 60 - for n in range(1, retries + 1): - try: - return fn(*args) - except xmlrpc.client.ProtocolError as err: - if n == retries: - traceback.print_exc() - fatal_err(f"A protocol error occurred (Err {err.errcode} {err.errmsg})") - else: - time.sleep(15) - except xmlrpc.client.Fault as err: - traceback.print_exc() - fatal_err(f"FATAL: Fault: {err.faultString} (code: {err.faultCode})", err) diff --git a/.gitlab-ci/lava/utils/log_follower.py b/.gitlab-ci/lava/utils/log_follower.py deleted file mode 100644 index 553ecb6bd8d..00000000000 --- a/.gitlab-ci/lava/utils/log_follower.py +++ /dev/null @@ -1,373 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright (C) 2022 Collabora Limited -# Author: Guilherme Gallo -# -# SPDX-License-Identifier: MIT - -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG - -""" -Some utilities to analyse logs, create gitlab sections and other quality of life -improvements -""" - -import logging -import re -import sys -from dataclasses import dataclass, field -from datetime import datetime, timedelta, UTC -from typing import Optional, Union - -from lava.exceptions import MesaCITimeoutError -from lava.utils.console_format import CONSOLE_LOG -from lava.utils.gitlab_section import GitlabSection -from lava.utils.lava_farm import get_lava_farm -from lava.utils.lava_log_hints import LAVALogHints -from lava.utils.log_section import ( - DEFAULT_GITLAB_SECTION_TIMEOUTS, - FALLBACK_GITLAB_SECTION_TIMEOUT, - LOG_SECTIONS, - LogSectionType, -) - - -@dataclass -class LogFollower: - starting_section: Optional[GitlabSection] = None - main_test_case: Optional[str] = None - timestamp_relative_to: Optional[datetime] = None - _current_section: Optional[GitlabSection] = None - section_history: list[GitlabSection] = field(default_factory=list, init=False) - timeout_durations: dict[LogSectionType, timedelta] = field( - default_factory=lambda: DEFAULT_GITLAB_SECTION_TIMEOUTS, - ) - fallback_timeout: timedelta = FALLBACK_GITLAB_SECTION_TIMEOUT - _buffer: list[str] = field(default_factory=list, init=False) - log_hints: LAVALogHints = field(init=False) - lava_farm: str = field(init=False, default=get_lava_farm()) - _merge_next_line: str = field(default_factory=str, init=False) - - def __post_init__(self): - # Make it trigger current_section setter to populate section history - self.current_section = self.starting_section - section_is_created = bool(self._current_section) - section_has_started = bool( - self._current_section and self._current_section.has_started - ) - self.log_hints = LAVALogHints(self) - assert ( - section_is_created == section_has_started - ), "Can't follow logs beginning from uninitialized GitLab sections." - - # Initialize fix_lava_gitlab_section_log generator - self.gl_section_fix_gen = fix_lava_gitlab_section_log() - next(self.gl_section_fix_gen) - - @property - def current_section(self): - return self._current_section - - @current_section.setter - def current_section(self, new_section: GitlabSection) -> None: - if old_section := self._current_section: - self.section_history.append(old_section) - self._current_section = new_section - - @property - def phase(self) -> LogSectionType: - return ( - self._current_section.type - if self._current_section - else LogSectionType.UNKNOWN - ) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - """Cleanup existing buffer if this object gets out from the context""" - self.clear_current_section() - last_lines = self.flush() - for line in last_lines: - print(line) - - def watchdog(self): - if not self._current_section: - return - - timeout_duration = self.timeout_durations.get( - self._current_section.type, self.fallback_timeout - ) - - if self._current_section.delta_time() > timeout_duration: - raise MesaCITimeoutError( - f"Gitlab Section {self._current_section} has timed out", - timeout_duration=timeout_duration, - ) - - def clear_current_section(self): - if self._current_section and not self._current_section.has_finished: - self._buffer.append(self._current_section.end()) - self.current_section = None - - def update_section(self, new_section: GitlabSection): - # Sections can have redundant regex to find them to mitigate LAVA - # interleaving kmsg and stderr/stdout issue. - if self.current_section and self.current_section.id == new_section.id: - return - self.clear_current_section() - self.current_section = new_section - self._buffer.append(new_section.start()) - - def manage_gl_sections(self, line): - if isinstance(line["msg"], list): - logging.debug("Ignoring messages as list. Kernel dumps.") - return - - for log_section in LOG_SECTIONS: - if new_section := log_section.from_log_line_to_section( - line, self.main_test_case, self.timestamp_relative_to - ): - self.update_section(new_section) - break - - def detect_kernel_dump_line(self, line: dict[str, Union[str, list]]) -> bool: - # line["msg"] can be a list[str] when there is a kernel dump - if isinstance(line["msg"], list): - return line["lvl"] == "debug" - - # result level has dict line["msg"] - if not isinstance(line["msg"], str): - return False - - # we have a line, check if it is a kernel message - if re.search(r"\[[\d\s]{5}\.[\d\s]{6}\] +\S{2,}", line["msg"]): - print_log(f"{CONSOLE_LOG['BOLD']}{line['msg']}{CONSOLE_LOG['RESET']}") - return True - - return False - - def remove_trailing_whitespace(self, line: dict[str, str]) -> None: - """ - Removes trailing whitespace from the end of the `msg` value in the log line dictionary. - - Args: - line: A dictionary representing a single log line. - - Note: - LAVA treats carriage return characters as a line break, so each carriage return in an output console - is mapped to a console line in LAVA. This method removes trailing `\r\n` characters from log lines. - """ - msg: Optional[str] = line.get("msg") - if not msg: - return False - - messages = [msg] if isinstance(msg, str) else msg - - for message in messages: - # LAVA logs brings raw messages, which includes newlines characters as \r\n. - line["msg"]: str = re.sub(r"\r\n$", "", message) - - def merge_carriage_return_lines(self, line: dict[str, str]) -> bool: - """ - Merges lines that end with a carriage return character into a single line. - - Args: - line: A dictionary representing a single log line. - - Returns: - A boolean indicating whether the current line has been merged with the next line. - - Note: - LAVA treats carriage return characters as a line break, so each carriage return in an output console - is mapped to a console line in LAVA. - """ - if line["msg"].endswith("\r"): - self._merge_next_line += line["msg"] - return True - - if self._merge_next_line: - line["msg"] = self._merge_next_line + line["msg"] - self._merge_next_line = "" - - return False - - def ignore_dut_feedback_lines(self, line: dict[str, str]) -> bool: - """ - Ignores feedback lines from LAVA. - If we only receive this level of message for some time, it means that the job is - misbehaving. E.g Rebooting. - - Args: - line: A dictionary representing a single log line. - - Returns: - A boolean indicating whether the current line is a feedback line. - """ - if line["lvl"] == "feedback" and line["ns"] == "dut": - return True - if line["lvl"] == "debug": - # This message happens after LAVA end receiving the feedback from the DUT - if line["msg"] == "Listened to connection for namespace 'dut' done": - return True - return False - - def feed(self, new_lines: list[dict[str, str]]) -> bool: - """Input data to be processed by LogFollower instance - Returns true if the DUT (device under test) seems to be alive. - """ - - self.watchdog() - - # No signal of job health in the log - is_job_healthy = False - - for line in new_lines: - self.remove_trailing_whitespace(line) - - if self.detect_kernel_dump_line(line): - continue - - if self.merge_carriage_return_lines(line): - continue - - if self.ignore_dut_feedback_lines(line): - continue - - # At least we are fed with a non-kernel dump log, it seems that the - # job is progressing - is_job_healthy = True - self.manage_gl_sections(line) - if parsed_line := self.parse_lava_line(line): - self._buffer.append(parsed_line) - - self.log_hints.detect_failure(new_lines) - - return is_job_healthy - - def flush(self) -> list[str]: - buffer = self._buffer - self._buffer = [] - return buffer - - def parse_lava_line(self, line) -> Optional[str]: - prefix = "" - suffix = "" - - if line["lvl"] in ["results", "feedback", "debug"]: - return - elif line["lvl"] in ["warning", "error"]: - prefix = CONSOLE_LOG["FG_BOLD_RED"] - suffix = CONSOLE_LOG["RESET"] - elif line["lvl"] == "input": - prefix = "$ " - suffix = "" - elif line["lvl"] == "target" and self.lava_farm != "collabora": - if self.lava_farm == "lima": - fix_lava_color_log(line) - # gl_section_fix_gen will output the stored line if it can't find a - # match for the first split line - # So we can recover it and put it back to the buffer - if recovered_first_line := self.gl_section_fix_gen.send(line): - self._buffer.append(recovered_first_line) - - return f'{prefix}{line["msg"]}{suffix}' - - -def fix_lava_color_log(line): - """This function is a temporary solution for the color escape codes mangling problem. There is - some problem in message passing between the LAVA dispatcher and the device under test (DUT). - Here \x1b or \\e character is missing before `[:digit::digit:?m` ANSI TTY color codes. - When this problem is fixed on the LAVA side, one should remove this function. - - For example, instead of receiving "\x1b[31m" (red text), we receive "[31m". - - The function fixes three types of mangled ANSI sequences: - 1. Standard color codes like [31m → \x1b[31m - 2. Line erase codes [0K → \x1b[0K - 3. Specific color formatting codes with carriage return [0;3xm → \r\x1b[0;3xm - - Note: most LAVA farms don't have this problem, except for Lima, which uses - an older version of LAVA. - """ - # Fix standard ANSI color codes (e.g., [31m → \x1b[31m) - line["msg"] = re.sub(r"(\[\d{1,2}m)", "\x1b" + r"\1", line["msg"]) - # Fix ANSI line erase codes (e.g., [0K → \x1b[0K) - line["msg"] = re.sub(r"(\[0K)", "\x1b" + r"\1", line["msg"]) - # Fix ANSI color codes with formatting and carriage return (e.g., [0;31m → \r\x1b[0;31m) - line["msg"] = re.sub(r"(\[0;3\d{1,2}m)", "\r\x1b" + r"\1", line["msg"]) - - -def fix_lava_gitlab_section_log(): - """This function is a temporary solution for the Gitlab section markers - splitting problem. Gitlab parses the following lines to define a collapsible - gitlab section in their log: - - \x1b[0Ksection_start:timestamp:section_id[collapsible=true/false]\r\x1b[0Ksection_header - - \x1b[0Ksection_end:timestamp:section_id\r\x1b[0K - There is some problem in message passing between the LAVA dispatcher and the - device under test (DUT), that replaces \r control characters into \n. When - this problem is fixed on the LAVA side, one should remove this function. - """ - while True: - line = yield False - first_line = None - split_line_pattern = re.compile(r"\x1b\[0K(section_\w+):(\d+):([^\s\r]+)$") - second_line_pattern = re.compile(r"\x1b\[0K([\S ]+)?") - - if not re.search(split_line_pattern, line["msg"]): - continue - - first_line = line["msg"] - # Delete the current line and hold this log line stream to be able to - # possibly merge it with the next line. - line["msg"] = "" - line = yield False - - # This code reached when we detect a possible first split line - if re.search(second_line_pattern, line["msg"]): - assert first_line - line["msg"] = f"{first_line}\r{line['msg']}" - else: - # The current line doesn't match with the previous one, send back the - # latter to give the user the chance to recover it. - yield first_line - - -def print_log(msg: str, *args) -> None: - is_section_header = msg.startswith("\x1b[0Ksection_") - if is_section_header: - print(msg, *args) - return - - # Reset color from timestamp, since `msg` can tint the terminal color - ts = datetime.now(tz=UTC) - ts_str = f"{ts.hour:02}:{ts.minute:02}:{ts.second:02}.{int(ts.microsecond / 1000):03}" - print(f"{CONSOLE_LOG['RESET']}{ts_str}: {msg}", *args) - - -def fatal_err(msg, exception=None): - colored_msg = f"{CONSOLE_LOG['FG_BOLD_RED']}" - print_log(colored_msg, f"{msg}", f"{CONSOLE_LOG['RESET']}") - if exception: - raise exception - sys.exit(1) - - -def hide_sensitive_data(yaml_data: str, start_hide: str = "HIDE_START", end_hide: str = "HIDE_END") -> str: - skip_line = False - dump_data: list[str] = [] - for line in yaml_data.splitlines(True): - if start_hide in line: - skip_line = True - elif end_hide in line: - skip_line = False - - if skip_line: - continue - - dump_data.append(line) - - return "".join(dump_data) diff --git a/.gitlab-ci/lava/utils/log_section.py b/.gitlab-ci/lava/utils/log_section.py deleted file mode 100644 index e948b1ca04c..00000000000 --- a/.gitlab-ci/lava/utils/log_section.py +++ /dev/null @@ -1,150 +0,0 @@ -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG - -import re -from dataclasses import dataclass -from datetime import datetime, timedelta -from enum import Enum, auto -from os import getenv -from typing import Optional, Pattern, Union - -from lava.utils.gitlab_section import GitlabSection - - -class LogSectionType(Enum): - UNKNOWN = auto() - LAVA_SUBMIT = auto() - LAVA_QUEUE = auto() - LAVA_DEPLOY = auto() - LAVA_BOOT = auto() - TEST_SUITE = auto() - TEST_CASE = auto() - LAVA_POST_PROCESSING = auto() - - -# How long to wait whilst we try to submit a job; make it fairly short, -# since the job will be retried. -LAVA_SUBMIT_TIMEOUT = int(getenv("LAVA_SUBMIT_TIMEOUT", 5)) - -# How long should we wait for a device to become available? -# For post-merge jobs, this should be ~infinite, but we can fail more -# aggressively for pre-merge. -LAVA_QUEUE_TIMEOUT = int(getenv("LAVA_QUEUE_TIMEOUT", 60)) - -# How long should we wait for a device to be deployed? -# The deploy involves downloading and decompressing the kernel, modules, dtb and the overlays. -# We should retry, to overcome network issues. -LAVA_DEPLOY_TIMEOUT = int(getenv("LAVA_DEPLOY_TIMEOUT", 5)) - -# Empirically, successful device deploy+boot in LAVA time takes less than 3 minutes. -# LAVA itself is configured to attempt `failure_retry` times (NUMBER_OF_ATTEMPTS_LAVA_BOOT) to boot -# the device. -# It is better to retry the boot than cancel the job and re-submit to avoid -# the enqueue delay. -LAVA_BOOT_TIMEOUT = int(getenv("LAVA_BOOT_TIMEOUT", 5)) - -# Estimated overhead in minutes for a job from GitLab to reach the test phase, -# including LAVA scheduling and boot duration -LAVA_TEST_OVERHEAD_MIN = int(getenv("LAVA_TEST_OVERHEAD_MIN", 5)) - -# CI_JOB_TIMEOUT in full minutes, no reason to use seconds here -# Defaults to 60 minutes if not set -CI_JOB_TIMEOUT_MIN = int(getenv("CI_JOB_TIMEOUT", "3600")) // 60 -# Sanity check: we need more job time than the LAVA estimated overhead -assert CI_JOB_TIMEOUT_MIN > LAVA_TEST_OVERHEAD_MIN, ( - f"CI_JOB_TIMEOUT in full minutes ({CI_JOB_TIMEOUT_MIN}) must be greater than LAVA_TEST_OVERHEAD ({LAVA_TEST_OVERHEAD_MIN})" -) - -# Test suite phase is where initialization occurs on both the DUT and the Docker container. -# The device will be listening to the SSH session until the end of the job. -LAVA_TEST_SUITE_TIMEOUT = CI_JOB_TIMEOUT_MIN - LAVA_TEST_OVERHEAD_MIN - -# Test cases may take a long time, this script has no right to interrupt -# them. But if the test case takes almost 1h, it will never succeed due to -# Gitlab job timeout. -LAVA_TEST_CASE_TIMEOUT = CI_JOB_TIMEOUT_MIN - LAVA_TEST_OVERHEAD_MIN - -# LAVA post processing may refer to a test suite teardown, or the -# adjustments to start the next test_case -LAVA_POST_PROCESSING_TIMEOUT = int(getenv("LAVA_POST_PROCESSING_TIMEOUT", 5)) - -FALLBACK_GITLAB_SECTION_TIMEOUT = timedelta(minutes=10) -DEFAULT_GITLAB_SECTION_TIMEOUTS = { - LogSectionType.LAVA_SUBMIT: timedelta(minutes=LAVA_SUBMIT_TIMEOUT), - LogSectionType.LAVA_QUEUE: timedelta(minutes=LAVA_QUEUE_TIMEOUT), - LogSectionType.LAVA_DEPLOY: timedelta(minutes=LAVA_DEPLOY_TIMEOUT), - LogSectionType.LAVA_BOOT: timedelta(minutes=LAVA_BOOT_TIMEOUT), - LogSectionType.TEST_SUITE: timedelta(minutes=LAVA_TEST_SUITE_TIMEOUT), - LogSectionType.TEST_CASE: timedelta(minutes=LAVA_TEST_CASE_TIMEOUT), - LogSectionType.LAVA_POST_PROCESSING: timedelta( - minutes=LAVA_POST_PROCESSING_TIMEOUT - ), -} - - -@dataclass(frozen=True) -class LogSection: - regex: Union[Pattern, str] - levels: tuple[str] - section_id: str - section_header: str - section_type: LogSectionType - collapsed: bool = False - - def from_log_line_to_section( - self, lava_log_line: dict[str, str], main_test_case: Optional[str], - timestamp_relative_to: Optional[datetime] - ) -> Optional[GitlabSection]: - if lava_log_line["lvl"] not in self.levels: - return - - if match := re.search(self.regex, lava_log_line["msg"]): - section_id = self.section_id.format(*match.groups()) - section_header = self.section_header.format(*match.groups()) - is_main_test_case = section_id == main_test_case - return GitlabSection( - id=section_id, - header=section_header, - type=self.section_type, - start_collapsed=self.collapsed, - suppress_start=is_main_test_case, - suppress_end=is_main_test_case, - timestamp_relative_to=timestamp_relative_to, - ) - - -LOG_SECTIONS = ( - LogSection( - regex=re.compile(r"start: 2 (\S+) \(timeout ([^)]+)\).*"), - levels=("info"), - section_id="{}", - section_header="Booting via {}", - section_type=LogSectionType.LAVA_BOOT, - collapsed=True, - ), - LogSection( - regex=re.compile(r"? ([^>]*)"), - levels=("target", "debug"), - section_id="{}", - section_header="test_case {}", - section_type=LogSectionType.TEST_CASE, - collapsed=True, - ), - LogSection( - regex=re.compile(r"? ([^>]*ssh.*server.*)"), - levels=("debug"), - section_id="{}", - section_header="Setting up hardware device for remote control", - section_type=LogSectionType.TEST_SUITE, - collapsed=True, - ), - LogSection( - regex=re.compile(r"ENDTC>? ([^>]+)"), - levels=("target", "debug"), - section_id="post-{}", - section_header="Post test_case {}", - section_type=LogSectionType.LAVA_POST_PROCESSING, - collapsed=True, - ), -) diff --git a/.gitlab-ci/lava/utils/ssh_job_definition.py b/.gitlab-ci/lava/utils/ssh_job_definition.py deleted file mode 100644 index 99759465b9b..00000000000 --- a/.gitlab-ci/lava/utils/ssh_job_definition.py +++ /dev/null @@ -1,206 +0,0 @@ -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG - -""" -In a few words: some devices in Mesa CI has problematic serial connection, they -may hang (become silent) intermittently. Every time it hangs for minutes, the -job is retried, causing delays in the overall pipeline executing, ultimately -blocking legit MRs to merge. - -To reduce reliance on UART, we explored LAVA features, such as running docker -containers as a test alongside the DUT one, to be able to create an SSH server -in the DUT the earliest possible and an SSH client in a docker container, to -establish a SSH session between both, allowing the console output to be passed -via SSH pseudo terminal, instead of relying in the error-prone UART. - -In more detail, we aim to use "export -p" to share the initial boot environment -with SSH LAVA test-cases. -The "init-stage1.sh" script handles tasks such as system mounting and network -setup, which are necessary for allocating a pseudo-terminal under "/dev/pts". -Although these chores are not required for establishing an SSH session, they are -essential for proper functionality to the target script given by HWCI_SCRIPT -environment variable. - -Therefore, we have divided the job definition into four parts: - -1. [DUT] Logging in to DUT and run the SSH server with root access. -2. [DUT] Running the "init-stage1.sh" script for the first SSH test case. -3. [DUT] Export the first boot environment to `/dut-env-vars.sh` file. -4. [SSH] Enabling the pseudo-terminal for colors and running the "init-stage2.sh" -script after sourcing "dut-env-vars.sh" again for the second SSH test case. -""" - - -import re -from typing import TYPE_CHECKING, Any, Iterable - -from ruamel.yaml.scalarstring import LiteralScalarString - -from .constants import NUMBER_OF_ATTEMPTS_LAVA_BOOT - -if TYPE_CHECKING: - from ..lava_job_submitter import LAVAJobSubmitter - -# Very early SSH server setup. Uses /dut_ready file to flag it is done. -SSH_SERVER_COMMANDS = { - "auto_login": { - "login_commands": [ - "dropbear -R -B", - "touch /dut_ready", - ], - "login_prompt": "ogin:", - # To login as root, the username should be empty - "username": "", - } -} - -# TODO: Extract this inline script to a shell file, like we do with -# init-stage[12].sh -# The current way is difficult to maintain because one has to deal with escaping -# characters for both Python and the resulting job definition YAML. -# Plus, it always good to lint bash scripts with shellcheck. -DOCKER_COMMANDS = [ - """set -ex -timeout 1m bash << EOF -while [ -z "$(lava-target-ip)" ]; do - echo Waiting for DUT to join LAN; - sleep 1; -done -EOF - -ping -c 5 -w 60 $(lava-target-ip) - -lava_ssh_test_case() { - set -ex - local test_case="${1}" - shift - lava-test-case \"${test_case}\" --shell \\ - ssh ${SSH_PTY_ARGS:--T} \\ - -o StrictHostKeyChecking=no \\ - -o UserKnownHostsFile=/dev/null \\ - -o ConnectTimeout=60 \\ - root@$(lava-target-ip) \"${@}\" -}""", -] - - -def to_yaml_block(steps_array: Iterable[str], escape_vars=[]) -> LiteralScalarString: - def escape_envvar(match): - return "\\" + match.group(0) - - filtered_array = [s for s in steps_array if s.strip() and not s.startswith("#")] - final_str = "\n".join(filtered_array) - - for escape_var in escape_vars: - # Find env vars and add '\\' before them - final_str = re.sub(rf"\${escape_var}*", escape_envvar, final_str) - return LiteralScalarString(final_str) - - -def generate_dut_test(args: "LAVAJobSubmitter", first_stage_steps: list[str]) -> dict[str, Any]: - # Commands executed on DUT. - # Trying to execute the minimal number of commands, because the console data is - # retrieved via UART, which is hang-prone in some devices. - return { - "namespace": "dut", - "definitions": [ - { - "from": "inline", - "name": "setup-ssh-server", - "path": "inline-setup-ssh-server", - "repository": { - "metadata": { - "format": "Lava-Test Test Definition 1.0", - "name": "dut-env-export", - }, - "run": { - "steps": [ - to_yaml_block(first_stage_steps), - "export -p > /dut-env-vars.sh", # Exporting the first boot environment - ], - }, - }, - } - ], - } - - -def generate_docker_test( - args: "LAVAJobSubmitter", artifact_download_steps: list[str] -) -> dict[str, Any]: - # This is a growing list of commands that will be executed by the docker - # guest, which will be the SSH client. - docker_commands = [] - - # LAVA test wrapping Mesa CI job in a SSH session. - init_stages_test = { - "namespace": "container", - "timeout": {"minutes": args.job_timeout_min}, - "failure_retry": 3, - "definitions": [ - { - "name": "docker_ssh_client", - "from": "inline", - "path": "inline/docker_ssh_client.yaml", - "repository": { - "metadata": { - "name": "mesa", - "description": "Mesa test plan", - "format": "Lava-Test Test Definition 1.0", - }, - "run": {"steps": docker_commands}, - }, - } - ], - "docker": { - "image": args.ssh_client_image, - }, - } - - docker_commands += [ - to_yaml_block(DOCKER_COMMANDS, escape_vars=["LAVA_TARGET_IP"]), - to_yaml_block( - ( - "lava_ssh_test_case 'wait_for_dut_login' << EOF", - "while [ ! -e /dut_ready ]; do sleep 1; done;", - "EOF", - ) - ), - to_yaml_block( - ( - "lava_ssh_test_case 'artifact_download' 'bash --' << EOF", - "source /dut-env-vars.sh", - *artifact_download_steps, - "EOF", - ) - ), - "export SSH_PTY_ARGS=-tt", - # Putting CI_JOB name as the testcase name, it may help LAVA farm - # maintainers with monitoring - f"lava_ssh_test_case '{args.project_name}_{args.mesa_job_name}' " - # Changing directory to /, as the HWCI_SCRIPT expects that - f"'cd / && {args.project_dir}/install/common/init-stage2.sh'", - ] - - return init_stages_test - - -def wrap_final_deploy_action(final_deploy_action: dict): - wrap = { - "namespace": "dut", - "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT, - "timeout": {"minutes": 10}, - } - - final_deploy_action.update(wrap) - - -def wrap_boot_action(boot_action: dict): - wrap = { - "namespace": "dut", - "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT, - **SSH_SERVER_COMMANDS, - } - - boot_action.update(wrap) diff --git a/.gitlab-ci/lava/utils/uart_job_definition.py b/.gitlab-ci/lava/utils/uart_job_definition.py deleted file mode 100644 index 88421adb2a7..00000000000 --- a/.gitlab-ci/lava/utils/uart_job_definition.py +++ /dev/null @@ -1,181 +0,0 @@ -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG - -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - from ..lava_job_submitter import LAVAJobSubmitter - from .lava_job_definition import LAVAJobDefinition - -from .constants import NUMBER_OF_ATTEMPTS_LAVA_BOOT - -# Use the same image that is being used for the hardware enablement and health-checks. -# They are pretty small (<100MB) and have all the tools we need to run LAVA, so it is a safe choice. -# You can find the Dockerfile here: -# https://gitlab.collabora.com/lava/health-check-docker/-/blob/main/Dockerfile -# And the registry here: https://gitlab.collabora.com/lava/health-check-docker/container_registry/ -DOCKER_IMAGE = "registry.gitlab.collabora.com/lava/health-check-docker" - - -def fastboot_deploy_actions( - job_definition: "LAVAJobDefinition", nfsrootfs -) -> tuple[dict[str, Any], ...]: - args = job_definition.job_submitter - cmdline = f"{job_definition.lava_nfs_args}{job_definition.extra_nfsroot_args}" - fastboot_deploy_nfs = { - "timeout": {"minutes": 10}, - "to": "nfs", - "nfsrootfs": nfsrootfs, - } - - fastboot_deploy_prepare = { - "timeout": {"minutes": 5}, - "to": "downloads", - "os": "oe", - "images": { - "kernel": { - "url": f"{args.kernel_url_prefix}/{args.kernel_image_name}", - }, - }, - "postprocess": { - "docker": { - "image": DOCKER_IMAGE, - "steps": [ - f"cat Image.gz {args.dtb_filename}.dtb > Image.gz+dtb", - "mkbootimg --kernel Image.gz+dtb" - + f' --cmdline "{cmdline}"' - + " --pagesize 4096 --base 0x80000000 -o boot.img", - ], - } - }, - } - - fastboot_deploy = { - "timeout": {"minutes": 2}, - "to": "fastboot", - "docker": { - "image": DOCKER_IMAGE, - }, - "images": { - "boot": {"url": "downloads://boot.img"}, - }, - } - - # URLs to our kernel rootfs to boot from, both generated by the base - # container build - job_definition.attach_kernel_and_dtb(fastboot_deploy_prepare["images"]) - - return (fastboot_deploy_nfs, fastboot_deploy_prepare, fastboot_deploy) - - -def tftp_deploy_actions(job_definition: "LAVAJobDefinition", nfsrootfs) -> tuple[dict[str, Any]]: - args = job_definition.job_submitter - tftp_deploy = { - "timeout": {"minutes": 5}, - "to": "tftp", - "os": "oe", - "kernel": { - "url": f"{args.kernel_url_prefix}/{args.kernel_image_name}", - }, - "nfsrootfs": nfsrootfs, - } - job_definition.attach_kernel_and_dtb(tftp_deploy) - - return (tftp_deploy,) - - -def qemu_deploy_actions(job_definition: "LAVAJobDefinition", nfsrootfs) -> tuple[dict[str, Any]]: - args = job_definition.job_submitter - qemu_deploy = { - "timeout": {"minutes": 5}, - "to": "nfs", - "images": { - "kernel": { - "image_arg": "-kernel {kernel}", - "url": f"{args.kernel_url_prefix}/{args.kernel_image_name}", - }, - "nfsrootfs": nfsrootfs, - }, - } - - return (qemu_deploy,) - - -def uart_test_actions( - args: "LAVAJobSubmitter", init_stage1_steps: list[str], jwt_steps: list[str] -) -> tuple[dict[str, Any]]: - # skeleton test definition: only declaring each job as a single 'test' - # since LAVA's test parsing is not useful to us - run_steps = [] - test = { - "timeout": {"minutes": args.job_timeout_min}, - "failure_retry": 1, - "definitions": [ - { - "name": "mesa", - "from": "inline", - "lava-signal": "kmsg", - "path": "inline/mesa.yaml", - "repository": { - "metadata": { - "name": "mesa", - "description": "Mesa test plan", - "os": ["oe"], - "scope": ["functional"], - "format": "Lava-Test Test Definition 1.0", - }, - "run": {"steps": run_steps}, - }, - } - ], - } - - run_steps += init_stage1_steps - run_steps += jwt_steps - - run_steps += [ - # Sleep a bit to give time for bash to dump shell xtrace messages into - # console which may cause interleaving with LAVA_SIGNAL_STARTTC in some - # devices like a618. - "sleep 1", - # Putting CI_JOB name as the testcase name, it may help LAVA farm - # maintainers with monitoring - f"lava-test-case '{args.project_name}_{args.mesa_job_name}' --shell {args.project_dir}/install/common/init-stage2.sh", - ] - - return (test,) - - -def tftp_boot_action(args: "LAVAJobSubmitter") -> dict[str, Any]: - tftp_boot = { - "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT, - "method": args.boot_method, - "prompts": [args.shell_prompt], - "commands": "nfs", - } - - return tftp_boot - - -def qemu_boot_action(args: "LAVAJobSubmitter") -> dict[str, Any]: - qemu_boot = { - "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT, - "method": args.boot_method, - "prompts": [args.shell_prompt], - } - - return qemu_boot - - -def fastboot_boot_action(args: "LAVAJobSubmitter") -> dict[str, Any]: - fastboot_boot = { - "timeout": {"minutes": 2}, - "docker": {"image": DOCKER_IMAGE}, - "failure_retry": NUMBER_OF_ATTEMPTS_LAVA_BOOT, - "method": args.boot_method, - "prompts": [args.shell_prompt], - "commands": ["set_active a"], - } - - return fastboot_boot diff --git a/.gitlab-ci/run-pytest.sh b/.gitlab-ci/run-pytest.sh index 0600fd37ba9..100bbdba445 100755 --- a/.gitlab-ci/run-pytest.sh +++ b/.gitlab-ci/run-pytest.sh @@ -3,7 +3,7 @@ # © Collabora Limited # Author: Guilherme Gallo -# This script runs unit/integration tests related with LAVA CI tools +# This script runs unit/integration tests for CI tools # shellcheck disable=SC1091 # The relative paths in this file only become valid at runtime. # shellcheck disable=SC2086 # quoting PYTEST_VERBOSE makes us pass an empty path @@ -25,12 +25,7 @@ if [ -z "${CI_PROJECT_DIR:-}" ]; then CI_PROJECT_DIR="$(dirname "${0}")/../" fi -if [ -z "${CI_JOB_TIMEOUT:-}" ]; then - # Export this default value, 1 hour in seconds, to test the lava job submitter - export CI_JOB_TIMEOUT=3600 -fi - -# If running outside of the debian/x86_64_pyutils container, +# If running outside of the debian/arm64_build container, # run in a virtual environment for isolation # e.g. USE_VENV=true ./.gitlab-ci/run-pytest.sh if [ "${USE_VENV:-}" == true ]; then @@ -41,13 +36,12 @@ if [ "${USE_VENV:-}" == true ]; then ${PYTHON_BIN} -m pip install --break-system-packages -r "${CI_PROJECT_DIR}/bin/ci/test/requirements.txt" fi -LIB_TEST_DIR=${CI_PROJECT_DIR}/.gitlab-ci/lava/tests SCRIPT_TEST_DIR=${CI_PROJECT_DIR}/bin/ci uncollapsed_section_start pytest "Running pytest" -PYTHONPATH="${LIB_TEST_DIR}:${SCRIPT_TEST_DIR}:${PYTHONPATH:-}" ${PYTHON_BIN} -m \ - pytest "${LIB_TEST_DIR}" "${SCRIPT_TEST_DIR}" \ +PYTHONPATH="${SCRIPT_TEST_DIR}:${PYTHONPATH:-}" ${PYTHON_BIN} -m \ + pytest "${SCRIPT_TEST_DIR}" \ -W ignore::DeprecationWarning \ --junitxml=artifacts/ci_scripts_report.xml \ -m 'not slow' \ @@ -58,5 +52,5 @@ section_end pytest section_start flake8 "flake8" ${PYTHON_BIN} -m flake8 \ --config "${CI_PROJECT_DIR}/.gitlab-ci/.flake8" \ -"${LIB_TEST_DIR}" "${SCRIPT_TEST_DIR}" +"${SCRIPT_TEST_DIR}" section_end flake8 diff --git a/bin/ci/requirements-lava.txt b/bin/ci/requirements-lava.txt deleted file mode 100644 index 818b59f79c9..00000000000 --- a/bin/ci/requirements-lava.txt +++ /dev/null @@ -1,6 +0,0 @@ -# When changing this file, you need to bump the following -# .gitlab-ci/image-tags.yml tags: -# ALPINE_X86_64_LAVA_TRIGGER_TAG -filelock==3.* -fire==0.7.0 -lavacli==2.2.0