From eb917271916e3fa66ecb5e0d46a7f4f2eef74eac Mon Sep 17 00:00:00 2001 From: Alex Yuskauskas Date: Tue, 20 Jan 2026 16:06:15 -0800 Subject: [PATCH 1/7] feat(agent): Add env var SKYHOOK_AGENT_WRITE_LOGS to be able to toggle log files on disk fix(agent): wrong call to make the log_file_glob for cleanup. If it is set to true it will create the logs files. If it is set to false it will NOT create the log files but it will still write to the stdout/stderr --- .../src/skyhook_agent/controller.py | 69 ++++-- agent/skyhook-agent/tests/test_controller.py | 226 +++++++++++++++++- 2 files changed, 272 insertions(+), 23 deletions(-) diff --git a/agent/skyhook-agent/src/skyhook_agent/controller.py b/agent/skyhook-agent/src/skyhook_agent/controller.py index b87914c9..42c8da5b 100644 --- a/agent/skyhook-agent/src/skyhook_agent/controller.py +++ b/agent/skyhook-agent/src/skyhook_agent/controller.py @@ -17,6 +17,8 @@ # limitations under the License. +import contextlib +from multiprocessing import context import sys import os import shutil @@ -69,7 +71,9 @@ def _get_env_config() -> tuple[str]: SKYHOOK_LOG_DIR = os.getenv("SKYHOOK_LOG_DIR", "/var/log/skyhook") - return SKYHOOK_RESOURCE_ID, SKYHOOK_DATA_DIR, SKYHOOK_ROOT_DIR, SKYHOOK_LOG_DIR + SKYHOOK_AGENT_WRITE_LOGS = os.getenv("SKYHOOK_AGENT_WRITE_LOGS", "true").lower() == 'true' + + return SKYHOOK_RESOURCE_ID, SKYHOOK_DATA_DIR, SKYHOOK_ROOT_DIR, SKYHOOK_LOG_DIR, SKYHOOK_AGENT_WRITE_LOGS def _get_package_information(config_data: dict) -> tuple[str, str]: return config_data["package_name"], config_data["package_version"] @@ -129,15 +133,43 @@ async def _stream_process( sink.flush() break +# A file like context manager that black holes all writes to it. Does not need to implement read +class NullWriter: + """A file-like context manager that discards all writes.""" + + def write(self, *args, **kwargs): + # Swallow everything and return len to mimic file behaviour if needed + if args: + return len(args[0]) + return 0 + + def flush(self): + pass + + def close(self): + pass + + def __enter__(self): + return self -async def tee(chroot_dir: str, cmd: List[str], stdout_sink_path: str, stderr_sink_path: str, write_cmds=False, no_chmod=False, env: dict[str, str] = {}, **kwargs): + def __exit__(self, exc_type, exc_val, exc_tb): + # Nothing to cleanup, obviously + return False + + +async def tee(chroot_dir: str, cmd: List[str], stdout_sink_path: str, stderr_sink_path: str, write_cmds=False, no_chmod=False, env: dict[str, str] = {}, write_logs: bool=True, **kwargs): """ Run the cmd in a subprocess and keep the stream of stdout/stderr and merge both into the sink_path as a log. """ # get the directory of the script script_dir = os.path.dirname(os.path.abspath(__file__)) - with open(stdout_sink_path, "w") as stdout_sink_f, open(stderr_sink_path, "w") as stderr_sink_f: + # Switch out the opens with nulls in the event of not wanting to write files + if write_logs: + files = (lambda : open(stdout_sink_path, 'w'), lambda: open(stderr_sink_path, 'w')) + else: + files = (lambda: NullWriter(), lambda: NullWriter()) + with files[0]() as stdout_sink_f, files[1]() as stderr_sink_f: if write_cmds: sys.stdout.write(" ".join(cmd) + "\n") stdout_sink_f.write(" ".join(cmd) + "\n") @@ -172,7 +204,7 @@ def get_host_path_for_steps(copy_dir: str): return f"{copy_dir}/skyhook_dir" def get_skyhook_directory(root_mount: str) -> str: - _, _, SKYHOOK_ROOT_DIR, _ = _get_env_config() + _, _, SKYHOOK_ROOT_DIR, _, _ = _get_env_config() return f"{root_mount}{SKYHOOK_ROOT_DIR}" def get_flag_dir(root_mount: str) -> str: @@ -182,7 +214,7 @@ def get_history_dir(root_mount: str) -> str: return f"{get_skyhook_directory(root_mount)}/history" def get_log_dir(root_mount: str) -> str: - _, _, _, SKYHOOK_LOG_DIR = _get_env_config() + _, _, _, SKYHOOK_LOG_DIR, _ = _get_env_config() return f"{root_mount}{SKYHOOK_LOG_DIR}" def get_log_file(step_path: str, copy_dir: str, config_data: dict, root_mount: str, timestamp: str=None) -> str: @@ -220,21 +252,23 @@ def set_flag(flag_file: str, msg: str = "") -> None: f.write(msg) -def _run(chroot_dir: str, cmds: list[str], log_path: str, write_cmds=False, no_chmod=False, env: dict[str, str] = {}, **kwargs) -> int: +def _run(chroot_dir: str, cmds: list[str], log_path: str|None, write_cmds=False, no_chmod=False, env: dict[str, str] = {}, write_logs: bool=True, **kwargs) -> int: """ Synchronous wrapper around the tee command to have logs written to disk """ # "tee" the stdout and stderr to a file to log the step results + stderr_path = f"{log_path}.err" if log_path else None result = asyncio.run( tee( chroot_dir, cmds, log_path, - f"{log_path}.err", + stderr_path, write_cmds=write_cmds, no_chmod=no_chmod, env=env, + write_logs=write_logs, **kwargs ) ) @@ -283,7 +317,11 @@ def run_step( return True time.sleep(1) - log_file = get_log_file(step_path, copy_dir, config_data, chroot_dir) + _, _, _, _, SKYHOOK_AGENT_WRITE_LOGS = _get_env_config() + if SKYHOOK_AGENT_WRITE_LOGS: + log_file = get_log_file(step_path, copy_dir, config_data, chroot_dir) + else: + log_file = None # Compile additional environment variables env = {} @@ -294,9 +332,11 @@ def run_step( chroot_dir, [step_path, *step.arguments], log_file, - env=env) + env=env, + write_logs=SKYHOOK_AGENT_WRITE_LOGS) - cleanup_old_logs(get_log_file(step_path, copy_dir, config_data, "*")) + if SKYHOOK_AGENT_WRITE_LOGS: + cleanup_old_logs(get_log_file(step_path, copy_dir, config_data, chroot_dir, "*")) if return_code not in step.returncodes: print(f"FAILED: {step.path} {' '.join(step.arguments)} {return_code}") return True @@ -421,7 +461,7 @@ def summarize_check_results(results: list[bool], step_data: dict[Mode, list[Step return False def make_config_data_from_resource_id() -> dict: - SKYHOOK_RESOURCE_ID, _, _, _ = _get_env_config() + SKYHOOK_RESOURCE_ID, _, _, _, _ = _get_env_config() # Interrupts don't really have config data we can read from the Package as it is run standalone. # So read it off of SKYHOOK_RESOURCE_ID instead @@ -441,7 +481,7 @@ def do_interrupt(interrupt_data: str, root_mount: str, copy_dir: str) -> bool: def _make_interrupt_flag(interrupt_dir: str, interrupt_id: int) -> str: return f"{interrupt_dir}/{interrupt_id}.complete" - SKYHOOK_RESOURCE_ID, _, _, _ = _get_env_config() + SKYHOOK_RESOURCE_ID, _, _, _, _ = _get_env_config() config_data = make_config_data_from_resource_id() interrupt = interrupts.inflate(interrupt_data) @@ -509,7 +549,7 @@ def main(mode: Mode, root_mount: str, copy_dir: str, interrupt_data: None|str, a if mode == Mode.INTERRUPT: return do_interrupt(interrupt_data, root_mount, copy_dir) - _, SKYHOOK_DATA_DIR, _, _ = _get_env_config() + _, SKYHOOK_DATA_DIR, _, _, _ = _get_env_config() # Check to see if the directory has already been copied down. If it hasn't assume that we # are running in legacy mode and copy the directory down. @@ -651,12 +691,13 @@ def cli(sys_argv: list[str]=sys.argv): print(str.center("ENV CONFIGURATION", 20, "-")) print(f"COPY_RESOLV: {copy_resolv}") print(f"OVERLAY_ALWAYS_RUN_STEP: {always_run_step}") - SKYHOOK_RESOURCE_ID, SKYHOOK_DATA_DIR, SKYHOOK_ROOT_DIR, SKYHOOK_LOG_DIR = _get_env_config() + SKYHOOK_RESOURCE_ID, SKYHOOK_DATA_DIR, SKYHOOK_ROOT_DIR, SKYHOOK_LOG_DIR, SKYHOOK_AGENT_WRITE_LOGS = _get_env_config() print(f"SKYHOOK_RESOURCE_ID: {SKYHOOK_RESOURCE_ID}") print(f"SKYHOOK_DATA_DIR: {SKYHOOK_DATA_DIR}") print(f"SKYHOOK_ROOT_DIR: {SKYHOOK_ROOT_DIR}") print(f"SKYHOOK_LOG_DIR: {SKYHOOK_LOG_DIR}") print(f"SKYHOOK_AGENT_BUFFER_LIMIT: {buff_size}") + print(f"SKYHOOK_AGENT_WRITE_LOGS: {SKYHOOK_AGENT_WRITE_LOGS}") print(str.center("Directory CONFIGURATION", 20, "-")) # print flag dir and log dir config_data = make_config_data_from_resource_id() diff --git a/agent/skyhook-agent/tests/test_controller.py b/agent/skyhook-agent/tests/test_controller.py index c49995cb..de19a018 100644 --- a/agent/skyhook-agent/tests/test_controller.py +++ b/agent/skyhook-agent/tests/test_controller.py @@ -23,6 +23,8 @@ import asyncio import textwrap import shutil +import glob +import time from datetime import datetime, timezone @@ -92,6 +94,29 @@ class TestHelpers(unittest.TestCase): def setUp(self): self.config_data = {"package_name": "foo", "package_version": "1.0.0"} + def test_nullwriter_discards_writes(self): + """Test that NullWriter discards all writes and behaves like a file.""" + writer = controller.NullWriter() + + # Test write returns length + result = writer.write("test data") + self.assertEqual(result, 9) + + # Test write with empty string + result = writer.write("") + self.assertEqual(result, 0) + + # Test flush and close don't raise + writer.flush() + writer.close() + + def test_nullwriter_context_manager(self): + """Test that NullWriter works as a context manager.""" + with controller.NullWriter() as writer: + writer.write("test") + writer.flush() + # Should exit cleanly without errors + def test_make_flag_path_uses_args(self): path_a = controller.make_flag_path(Step("foo.sh", arguments=["1", "2"], returncodes=(0, 1, 2)), self.config_data, "root_mount") path_b = controller.make_flag_path(Step("foo.sh", arguments=["1"], returncodes=(0, 1, 2)), self.config_data, "root_mount") @@ -176,8 +201,7 @@ def test_make_flag_path_has_package_name(self): @mock.patch("skyhook_agent.controller.get_log_file") @mock.patch("skyhook_agent.controller.subprocess") @mock.patch("skyhook_agent.controller.tee") - @mock.patch("skyhook_agent.controller.os") - def test_run_step_is_successful(self, os_mock, tee_mock, subprocess_mock, log_mock, cleanup_mock): + def test_run_step_is_successful(self, tee_mock, subprocess_mock, log_mock, cleanup_mock): subprocess_mock.run.return_value = FakeSubprocessResult(0) tee_mock.return_value = FakeSubprocessResult(0) @@ -197,7 +221,8 @@ def test_run_step_is_successful(self, os_mock, tee_mock, subprocess_mock, log_mo f"{log_file}.err", env={"STEP_ROOT": "copy_dir/skyhook_dir", "SKYHOOK_DIR": "copy_dir"}, write_cmds=False, - no_chmod=False + no_chmod=False, + write_logs=True ) ] ) @@ -246,7 +271,8 @@ def test_run_step_replaces_environment_variables( f"{log_file}.err", write_cmds=False, no_chmod=False, - env={"STEP_ROOT": "copy_dir/skyhook_dir", "SKYHOOK_DIR": "copy_dir"} + env={"STEP_ROOT": "copy_dir/skyhook_dir", "SKYHOOK_DIR": "copy_dir"}, + write_logs=True ) ] ) @@ -692,7 +718,8 @@ def test_from_and_to_version_is_given_to_upgrade_step_as_env_var(self, run_mock, ), env=dict( **{"PREVIOUS_VERSION": "0.0.9", "CURRENT_VERSION": "1.0.0"}, - **{"STEP_ROOT": f"{root_dir}/{copy_dir}/skyhook_dir", "SKYHOOK_DIR": copy_dir}) + **{"STEP_ROOT": f"{root_dir}/{copy_dir}/skyhook_dir", "SKYHOOK_DIR": copy_dir}), + write_logs=True ) ]) @@ -731,7 +758,8 @@ def test_from_and_to_version_is_given_to_upgradestep_class_as_env_var_and_args(s ), env=dict( **{"PREVIOUS_VERSION": "2024.07.28", "CURRENT_VERSION": "1.0.0"}, - **{"STEP_ROOT": f"{root_dir}/{copy_dir}/skyhook_dir", "SKYHOOK_DIR": copy_dir}) + **{"STEP_ROOT": f"{root_dir}/{copy_dir}/skyhook_dir", "SKYHOOK_DIR": copy_dir}), + write_logs=True ) ]) @@ -1360,16 +1388,196 @@ def test_get_env_config(self): SKYHOOK_RESOURCE_ID="resource_id", SKYHOOK_DATA_DIR="data_dir", SKYHOOK_ROOT_DIR="skyhook_dir", - SKYHOOK_LOG_DIR="log_dir"): - SKYHOOK_RESOURCE_ID, SKYHOOK_DATA_DIR, SKYHOOK_ROOT_DIR, SKYHOOK_LOG_DIR = controller._get_env_config() + SKYHOOK_LOG_DIR="log_dir", + SKYHOOK_AGENT_WRITE_LOGS="false"): + SKYHOOK_RESOURCE_ID, SKYHOOK_DATA_DIR, SKYHOOK_ROOT_DIR, SKYHOOK_LOG_DIR, SKYHOOK_AGENT_WRITE_LOGS = controller._get_env_config() self.assertEqual(SKYHOOK_RESOURCE_ID, "resource_id") self.assertEqual(SKYHOOK_DATA_DIR, "data_dir") self.assertEqual(SKYHOOK_ROOT_DIR, "skyhook_dir") self.assertEqual(SKYHOOK_LOG_DIR, "log_dir") + self.assertFalse(SKYHOOK_AGENT_WRITE_LOGS) # Test the default values - SKYHOOK_RESOURCE_ID, SKYHOOK_DATA_DIR, SKYHOOK_ROOT_DIR, SKYHOOK_LOG_DIR = controller._get_env_config() + SKYHOOK_RESOURCE_ID, SKYHOOK_DATA_DIR, SKYHOOK_ROOT_DIR, SKYHOOK_LOG_DIR, SKYHOOK_AGENT_WRITE_LOGS = controller._get_env_config() self.assertEqual(SKYHOOK_RESOURCE_ID, "") self.assertEqual(SKYHOOK_DATA_DIR, "/skyhook-package") self.assertEqual(SKYHOOK_ROOT_DIR, "/etc/skyhook") self.assertEqual(SKYHOOK_LOG_DIR, "/var/log/skyhook") + self.assertTrue(SKYHOOK_AGENT_WRITE_LOGS) # Default should be True + + def test_get_env_config_write_logs_variations(self): + """Test SKYHOOK_AGENT_WRITE_LOGS with different values.""" + # Test "true" value + with set_env(SKYHOOK_AGENT_WRITE_LOGS="true"): + *_, SKYHOOK_AGENT_WRITE_LOGS = controller._get_env_config() + self.assertTrue(SKYHOOK_AGENT_WRITE_LOGS) + + # Test "True" value (case insensitive) + with set_env(SKYHOOK_AGENT_WRITE_LOGS="True"): + *_, SKYHOOK_AGENT_WRITE_LOGS = controller._get_env_config() + self.assertTrue(SKYHOOK_AGENT_WRITE_LOGS) + + # Test "false" value + with set_env(SKYHOOK_AGENT_WRITE_LOGS="false"): + *_, SKYHOOK_AGENT_WRITE_LOGS = controller._get_env_config() + self.assertFalse(SKYHOOK_AGENT_WRITE_LOGS) + + # Test "False" value (case insensitive) + with set_env(SKYHOOK_AGENT_WRITE_LOGS="False"): + *_, SKYHOOK_AGENT_WRITE_LOGS = controller._get_env_config() + self.assertFalse(SKYHOOK_AGENT_WRITE_LOGS) + + # Test other values default to false + with set_env(SKYHOOK_AGENT_WRITE_LOGS="anything"): + *_, SKYHOOK_AGENT_WRITE_LOGS = controller._get_env_config() + self.assertFalse(SKYHOOK_AGENT_WRITE_LOGS) + + @mock.patch("skyhook_agent.controller.cleanup_old_logs") + @mock.patch("skyhook_agent.controller.tee") + def test_run_step_with_write_logs_false(self, tee_mock, cleanup_mock): + """Test that run_step does not write log files when SKYHOOK_AGENT_WRITE_LOGS is false.""" + tee_mock.return_value = FakeSubprocessResult(0) + + with set_env(SKYHOOK_AGENT_WRITE_LOGS="false"): + run_step_result = controller.run_step( + Step("foo", arguments=["a", "b"], returncodes=[0]), "chroot_dir", "copy_dir", self.config_data + ) + + self.assertFalse(run_step_result) + + # Verify tee was called with write_logs=False and None log paths + tee_mock.assert_has_calls( + [ + mock.call( + "chroot_dir", + ["copy_dir/skyhook_dir/foo", "a", "b"], + None, + None, + env={"STEP_ROOT": "copy_dir/skyhook_dir", "SKYHOOK_DIR": "copy_dir"}, + write_cmds=False, + no_chmod=False, + write_logs=False + ) + ] + ) + # cleanup_old_logs should not be called when write_logs is False + cleanup_mock.assert_not_called() + + @mock.patch("skyhook_agent.controller.cleanup_old_logs") + @mock.patch("skyhook_agent.controller.get_log_file") + @mock.patch("skyhook_agent.controller.tee") + def test_run_step_with_write_logs_true(self, tee_mock, get_log_file_mock, cleanup_mock): + """Test that run_step writes log files when SKYHOOK_AGENT_WRITE_LOGS is true.""" + tee_mock.return_value = FakeSubprocessResult(0) + get_log_file_mock.return_value = "/log/file.log" + + with set_env(SKYHOOK_AGENT_WRITE_LOGS="true"): + run_step_result = controller.run_step( + Step("foo", arguments=["a", "b"], returncodes=[0]), "chroot_dir", "copy_dir", self.config_data + ) + + self.assertFalse(run_step_result) + + # Verify tee was called with the log file path and write_logs=True + tee_mock.assert_has_calls( + [ + mock.call( + "chroot_dir", + ["copy_dir/skyhook_dir/foo", "a", "b"], + "/log/file.log", + "/log/file.log.err", + env={"STEP_ROOT": "copy_dir/skyhook_dir", "SKYHOOK_DIR": "copy_dir"}, + write_cmds=False, + no_chmod=False, + write_logs=True + ) + ] + ) + # cleanup_old_logs should be called when write_logs is True + cleanup_mock.assert_called_once() + + @mock.patch("skyhook_agent.controller.sys") + def test_tee_with_nullwriter_when_write_logs_false(self, sys_mock): + """Test that tee uses NullWriter when write_logs is False.""" + sys_mock.stdout = FakeIO() + sys_mock.stderr = FakeIO() + sys_mock.executable = sys.executable + + with tempfile.TemporaryDirectory() as dir: + stdout_path = f"{dir}/stdout.log" + stderr_path = f"{dir}/stderr.log" + + # Run tee with write_logs=False + result = asyncio.run( + controller.tee("", ["echo", "test"], stdout_path, stderr_path, write_logs=False) + ) + + # Log files should not be created + self.assertFalse(os.path.exists(stdout_path)) + self.assertFalse(os.path.exists(stderr_path)) + + def test_cleanup_old_logs_keeps_only_5_files(self): + """Test that cleanup_old_logs removes all but the 5 most recent log files.""" + with tempfile.TemporaryDirectory() as temp_dir: + # Create directory structure for logs + log_dir = f"{temp_dir}/var/log/skyhook/foo/1.0.0" + os.makedirs(log_dir, exist_ok=True) + + # Create a simple step script that succeeds + step_dir = f"{temp_dir}/skyhook_dir" + os.makedirs(step_dir, exist_ok=True) + step_path = f"{step_dir}/test_step.sh" + with open(step_path, "w") as f: + f.write("#!/bin/sh\necho 'test output'\nexit 0\n") + os.chmod(step_path, os.stat(step_path).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) + + # Track log files created + log_files_created = [] + + # Mock get_log_file and get_host_path_for_steps to use our temp directories + def mock_get_log_file(step_path_arg, copy_dir, config_data, root_mount, timestamp=None): + if timestamp is None: + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H%M%S") + log_file = f"{log_dir}/test_step.sh-{timestamp}.log" + # Only track actual log files, not glob patterns + if timestamp != "*": + log_files_created.append(log_file) + return log_file + + # Run run_step 6 times with delays to ensure different timestamps + # Use chroot_dir="local" to avoid permission issues with chroot + with mock.patch("skyhook_agent.controller.get_log_file", side_effect=mock_get_log_file), \ + mock.patch("skyhook_agent.controller.get_host_path_for_steps", return_value=step_dir), \ + mock.patch("skyhook_agent.controller.get_log_dir", return_value=log_dir): + + for i in range(6): + # Small delay to ensure different timestamps and file modification times + time.sleep(0.05) + + result = controller.run_step( + Step("test_step.sh", arguments=[], returncodes=[0]), + "local", # chroot_dir - "local" skips actual chroot + temp_dir, # copy_dir + self.config_data + ) + self.assertFalse(result, f"Step {i+1} should have succeeded") + + # After 6 runs with cleanup, there should be exactly 5 log files + actual_log_files = sorted(glob.glob(f"{log_dir}/test_step.sh-*.log")) + self.assertEqual(len(actual_log_files), 5, + f"Expected 5 log files after 6 runs, but found {len(actual_log_files)}: {actual_log_files}") + + # Verify the oldest log file was removed + self.assertFalse(os.path.exists(log_files_created[0]), + f"The oldest log file {log_files_created[0]} should have been removed") + + # Verify the 5 most recent log files remain + for log_file in log_files_created[1:]: + self.assertTrue(os.path.exists(log_file), + f"Recent log file {log_file} should still exist") + + # Verify stderr files also exist for remaining logs + for log_file in actual_log_files: + stderr_file = f"{log_file}.err" + self.assertTrue(os.path.exists(stderr_file), + f"Stderr file {stderr_file} should exist") \ No newline at end of file From f9bad98fc4b2949bc0343cad915aedcda895a2ad Mon Sep 17 00:00:00 2001 From: Alex Yuskauskas Date: Wed, 21 Jan 2026 11:13:05 -0800 Subject: [PATCH 2/7] feat(agent): update base container to nvcr.io/nvidia/distroless/python:3.12-v3.5.2 --- containers/agent.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/containers/agent.Dockerfile b/containers/agent.Dockerfile index 419934c3..feda18f9 100644 --- a/containers/agent.Dockerfile +++ b/containers/agent.Dockerfile @@ -36,13 +36,13 @@ RUN make build build_version=${AGENT_VERSION} # Install the wheel in the builder stage RUN python3 -m venv venv && ./venv/bin/pip install /code/skyhook-agent/dist/skyhook_agent*.whl -FROM nvcr.io/nvidia/distroless/python:3.12-v3.4.15 +FROM nvcr.io/nvidia/distroless/python:3.12-v3.5.2 ARG AGENT_VERSION ARG GIT_SHA ## https://github.com/opencontainers/image-spec/blob/main/annotations.md -LABEL org.opencontainers.image.base.name="nvcr.io/nvidia/distroless/python:3.12-v3.4.15" \ +LABEL org.opencontainers.image.base.name="nvcr.io/nvidia/distroless/python:3.12-v3.5.2" \ org.opencontainers.image.licenses="Apache-2.0" \ org.opencontainers.image.title="skyhook-agent" \ org.opencontainers.image.version="${AGENT_VERSION}" \ From 9bcac02e9eb46a651786a95aa24c7f8c77ad3a84 Mon Sep 17 00:00:00 2001 From: Alex Yuskauskas Date: Wed, 21 Jan 2026 14:59:02 -0800 Subject: [PATCH 3/7] feat(agent/operator): add integration chainsaw tests for agent for reaping logs and not writing logs --- .github/workflows/agent-ci.yaml | 70 ++++++++++++++++++- .../dont_write_logs/assert.yaml | 43 ++++++++++++ .../dont_write_logs/chainsaw-test.yaml | 51 ++++++++++++++ .../dont_write_logs/skyhook.yaml | 44 ++++++++++++ .../interrupt/chainsaw-test.yaml | 5 +- .../operator-agent/reap_old_logs/assert.yaml | 43 ++++++++++++ .../reap_old_logs/chainsaw-test.yaml | 46 ++++++++++++ .../operator-agent/reap_old_logs/skyhook.yaml | 42 +++++++++++ operator/Makefile | 8 ++- 9 files changed, 347 insertions(+), 5 deletions(-) create mode 100644 k8s-tests/operator-agent/dont_write_logs/assert.yaml create mode 100644 k8s-tests/operator-agent/dont_write_logs/chainsaw-test.yaml create mode 100644 k8s-tests/operator-agent/dont_write_logs/skyhook.yaml create mode 100644 k8s-tests/operator-agent/reap_old_logs/assert.yaml create mode 100644 k8s-tests/operator-agent/reap_old_logs/chainsaw-test.yaml create mode 100644 k8s-tests/operator-agent/reap_old_logs/skyhook.yaml diff --git a/.github/workflows/agent-ci.yaml b/.github/workflows/agent-ci.yaml index 6261ed4e..043e0cc3 100644 --- a/.github/workflows/agent-ci.yaml +++ b/.github/workflows/agent-ci.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # @@ -132,3 +132,71 @@ jobs: subject-name: ${{ env.REGISTRY }}/${{env.IMAGE_NAME}}/agent subject-digest: ${{ steps.build.outputs.digest }} push-to-registry: true + + operator-agent-tests: + name: Operator Agent Integration Tests + runs-on: ubuntu-latest + needs: [build-and-push-agent] + permissions: + contents: read + packages: read + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-tags: true + fetch-depth: 0 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.25.5' + cache-dependency-path: operator/go.sum + + - name: Log in to the Container registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Create Kubernetes KinD Cluster + uses: helm/kind-action@v1 + with: + version: v0.31.0 + node_image: kindest/node:v1.35.0 + config: operator/config/local-dev/kind-config.yaml + cluster_name: kind + + - name: Restore cached Binaries + id: cached-binaries + uses: actions/cache/restore@v4 + with: + key: 1.25.5-${{ runner.os }}-${{ runner.arch }}-bin-${{ hashFiles('operator/deps.mk') }} + restore-keys: 1.25.5-${{ runner.os }}-${{ runner.arch }}-bin- + path: | + ${{ github.workspace }}/operator/bin + ~/.cache/go-build + + - name: Install dependencies + if: steps.cached-binaries.outputs.cache-hit != 'true' + run: | + cd operator + make install-deps + + - name: Save cached Binaries + if: steps.cached-binaries.outputs.cache-hit != 'true' + uses: actions/cache/save@v4 + with: + key: 1.25.5-${{ runner.os }}-${{ runner.arch }}-bin-${{ hashFiles('operator/deps.mk') }} + path: | + ${{ github.workspace }}/operator/bin + ~/.cache/go-build + + - name: Run operator-agent tests + run: | + cd operator + export GIT_SHA=$(git rev-parse --short ${{ github.sha }}) + export AGENT_IMAGE="${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/agent:${GIT_SHA}" + echo "Testing with agent image: ${AGENT_IMAGE}" + make operator-agent-tests diff --git a/k8s-tests/operator-agent/dont_write_logs/assert.yaml b/k8s-tests/operator-agent/dont_write_logs/assert.yaml new file mode 100644 index 00000000..9b0206a7 --- /dev/null +++ b/k8s-tests/operator-agent/dont_write_logs/assert.yaml @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +apiVersion: v1 +kind: Node +metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_dont-write-logs-agent-operator: complete + annotations: + skyhook.nvidia.com/status_dont-write-logs-agent-operator: complete +status: + (conditions[?type == 'skyhook.nvidia.com/dont-write-logs-agent-operator/NotReady']): + - reason: "Complete" + status: "False" + (conditions[?type == 'skyhook.nvidia.com/dont-write-logs-agent-operator/Erroring']): + - reason: "Not Erroring" + status: "False" +--- +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + name: dont-write-logs-agent-operator +status: + status: complete + nodeStatus: + # grab values should be one and is complete + (values(@)): + - complete diff --git a/k8s-tests/operator-agent/dont_write_logs/chainsaw-test.yaml b/k8s-tests/operator-agent/dont_write_logs/chainsaw-test.yaml new file mode 100644 index 00000000..86f90af0 --- /dev/null +++ b/k8s-tests/operator-agent/dont_write_logs/chainsaw-test.yaml @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: dont-write-logs-agent-operator +spec: + timeouts: + assert: 240s + exec: 90s + steps: + - try: + - script: + content: | + ## remove annotation from last run + ../../../../operator/bin/skyhook reset simple-agent-operator --confirm 2>/dev/null || true + - script: + content: | + ## reinstall the debug pod in case it was deleted + ../setup.sh kind-worker setup + - script: + content: | + ## clean up any logs from prior runs + ../check_node.sh kind-worker "rm -f /var/log/skyhook/dont-write-logs-agent-operator/shellscript/1.1.1/*.log" ".*" 2 + - apply: + file: skyhook.yaml + - assert: + file: assert.yaml + - script: + content: | + ../check_node.sh kind-worker "ls /var/lib/skyhook/dont-write-logs-agent-operator/flags/shellscript/1.1.1/" "shellscript_run.sh.*" + ../check_node.sh kind-worker "ls /var/log/skyhook/dont-write-logs-agent-operator/shellscript/1.1.1/*.log | wc -l" "0" 2 + - finally: + - delete: + file: skyhook.yaml diff --git a/k8s-tests/operator-agent/dont_write_logs/skyhook.yaml b/k8s-tests/operator-agent/dont_write_logs/skyhook.yaml new file mode 100644 index 00000000..4b1f3d58 --- /dev/null +++ b/k8s-tests/operator-agent/dont_write_logs/skyhook.yaml @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + labels: + app.kubernetes.io/part-of: skyhook-operator + app.kubernetes.io/created-by: skyhook-operator + name: dont-write-logs-agent-operator +spec: + nodeSelectors: + matchLabels: + skyhook.nvidia.com/test-node: skyhooke2e + packages: + shellscript: + version: "1.1.1" + image: ghcr.io/nvidia/skyhook-packages/shellscript + env: + - name: SKYHOOK_AGENT_WRITE_LOGS + value: "false" + configMap: + config.sh: | + #!/bin/bash + echo "Hello, world!" + + cat $SKYHOOK_DIR/configmaps/extra.txt + + extra.txt: | + This is a test diff --git a/k8s-tests/operator-agent/interrupt/chainsaw-test.yaml b/k8s-tests/operator-agent/interrupt/chainsaw-test.yaml index b8b86ce7..e9d3417e 100644 --- a/k8s-tests/operator-agent/interrupt/chainsaw-test.yaml +++ b/k8s-tests/operator-agent/interrupt/chainsaw-test.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - # yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test @@ -29,7 +28,7 @@ spec: - script: content: | ## remove annotation from last run - ../../../../operator/bin/skyhook reset interrupt-agent-operator --confirm 2>/dev/null || true + ../../../../operator/bin/skyhook reset simple-agent-operator --confirm 2>/dev/null || true - script: content: | ## reinstall the debug pod in case it was deleted diff --git a/k8s-tests/operator-agent/reap_old_logs/assert.yaml b/k8s-tests/operator-agent/reap_old_logs/assert.yaml new file mode 100644 index 00000000..a43c8ec7 --- /dev/null +++ b/k8s-tests/operator-agent/reap_old_logs/assert.yaml @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +apiVersion: v1 +kind: Node +metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_reap-old-logs-agent-operator: complete + annotations: + skyhook.nvidia.com/status_reap-old-logs-agent-operator: complete +status: + (conditions[?type == 'skyhook.nvidia.com/reap-old-logs-agent-operator/NotReady']): + - reason: "Complete" + status: "False" + (conditions[?type == 'skyhook.nvidia.com/reap-old-logs-agent-operator/Erroring']): + - reason: "Not Erroring" + status: "False" +--- +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + name: reap-old-logs-agent-operator +status: + status: complete + nodeStatus: + # grab values should be one and is complete + (values(@)): + - complete diff --git a/k8s-tests/operator-agent/reap_old_logs/chainsaw-test.yaml b/k8s-tests/operator-agent/reap_old_logs/chainsaw-test.yaml new file mode 100644 index 00000000..34f99b76 --- /dev/null +++ b/k8s-tests/operator-agent/reap_old_logs/chainsaw-test.yaml @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: reap-old-logs-agent-operator +spec: + timeouts: + assert: 240s + exec: 90s + steps: + - try: + - script: + content: | + ## remove annotation from last run + ../../../../operator/bin/skyhook reset reap-old-logs-agent-operator --confirm 2>/dev/null || true + - script: + content: | + ## reinstall the debug pod in case it was deleted + ../setup.sh kind-worker setup + - apply: + file: skyhook.yaml + - assert: + file: assert.yaml + - script: + content: | + ../check_node.sh kind-worker "ls /var/log/skyhook/reap-old-logs-agent-operator/shellscript/1.1.1/*.log | wc -l" "5" 2 + - finally: + - delete: + file: skyhook.yaml diff --git a/k8s-tests/operator-agent/reap_old_logs/skyhook.yaml b/k8s-tests/operator-agent/reap_old_logs/skyhook.yaml new file mode 100644 index 00000000..06a60278 --- /dev/null +++ b/k8s-tests/operator-agent/reap_old_logs/skyhook.yaml @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + labels: + app.kubernetes.io/part-of: skyhook-operator + app.kubernetes.io/created-by: skyhook-operator + name: reap-old-logs-agent-operator +spec: + nodeSelectors: + matchLabels: + skyhook.nvidia.com/test-node: skyhooke2e + packages: + shellscript: + version: "1.1.1" + image: ghcr.io/nvidia/skyhook-packages/shellscript + configMap: + config.sh: | + #!/bin/bash + if [ $(ls /var/lib/skyhook/reap-old-logs-agent-operator/flags/shellscript/1.1.1/*.log | wc -l) -eq 5 ]; then + echo "5 logs found. After this should still be 5." + exit 0 + else: + echo "Not enough logs yet. Erroring to produce more." + exit 1 + fi diff --git a/operator/Makefile b/operator/Makefile index 7b2e6b6c..59f45dd5 100644 --- a/operator/Makefile +++ b/operator/Makefile @@ -228,8 +228,14 @@ helm-tests: helm chainsaw ensure-test-symlinks $(CHAINSAW) test --test-dir ../k8s-tests/chainsaw/helm $(CHAINSAW_ARGS) operator-agent-tests: chainsaw install ## Run operator agent tests. + @if [ -z "$(AGENT_IMAGE)" ]; then \ + echo "Error: AGENT_IMAGE is not set. Please set it to the agent image to test against."; \ + echo "Example: AGENT_IMAGE=ghcr.io/nvidia/skyhook/agent:v6.3.1 make operator-agent-tests"; \ + exit 1; \ + fi + @echo "Running operator-agent tests with AGENT_IMAGE=$(AGENT_IMAGE)" ../k8s-tests/operator-agent/setup.sh kind-worker setup - AGENT_IMAGE=ghcr.io/nvidia/skyhook/agent:v6.2.0-30d8b7a $(MAKE) run + $(MAKE) run $(CHAINSAW) test --test-dir ../k8s-tests/operator-agent $(CHAINSAW_ARGS) $(MAKE) kill ## ../k8s-tests/operator-agent/setup.sh kind-worker teardown From ba35fdadf53baa2b47d3bf925c9424c3294c5cdc Mon Sep 17 00:00:00 2001 From: Alex Yuskauskas Date: Wed, 21 Jan 2026 15:52:20 -0800 Subject: [PATCH 4/7] fix(agent/ci): operator-agent now use same logic for making tags as building the image --- .github/workflows/agent-ci.yaml | 83 ++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 27 deletions(-) diff --git a/.github/workflows/agent-ci.yaml b/.github/workflows/agent-ci.yaml index 043e0cc3..0129d563 100644 --- a/.github/workflows/agent-ci.yaml +++ b/.github/workflows/agent-ci.yaml @@ -36,6 +36,52 @@ env: REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} jobs: + compute-metadata: + name: Compute Image Metadata + runs-on: ubuntu-latest + outputs: + git-sha: ${{ steps.meta.outputs.git-sha }} + agent-version: ${{ steps.meta.outputs.agent-version }} + agent-image-tag: ${{ steps.meta.outputs.agent-image-tag }} + tags: ${{ steps.meta.outputs.tags }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Fetch all tags + run: git fetch --tags --force + - name: Compute metadata + id: meta + run: | + export GIT_SHA=$(git rev-parse --short ${{ github.sha }}) + echo "git-sha=${GIT_SHA}" >> $GITHUB_OUTPUT + + case ${{ github.ref_type }} in + branch) + # The last tag + current git sha + export AGENT_VERSION=$(git tag --list 'agent*' --sort=-v:refname | head -n 1 | cut -d/ -f2)+${GIT_SHA} + # Convert + to - for docker tag compliance + export AGENT_IMAGE_TAG=$(echo "${AGENT_VERSION}" | tr + -) + TAGS="-t ${REGISTRY@L}/${{ github.repository }}/agent:${GIT_SHA} -t ${REGISTRY@L}/${{ github.repository }}/agent:${AGENT_IMAGE_TAG}" + ;; + tag) + # The version part of the tag + export AGENT_VERSION=$(echo "${{ github.ref_name }}" | cut -f 2 -d /) + export AGENT_IMAGE_TAG="${AGENT_VERSION}" + TAGS="-t ${REGISTRY@L}/${{ github.repository }}/agent:${GIT_SHA} -t ${REGISTRY@L}/${{ github.repository }}/agent:${AGENT_VERSION} -t ${REGISTRY@L}/${{ github.repository }}/agent:latest" + ;; + *) + echo "Unknown type ${{ github.ref_type }}" + exit 1 + ;; + esac + + echo "agent-version=${AGENT_VERSION}" >> $GITHUB_OUTPUT + echo "agent-image-tag=${AGENT_IMAGE_TAG}" >> $GITHUB_OUTPUT + echo "tags=${TAGS}" >> $GITHUB_OUTPUT + echo "📦 Agent Version: ${AGENT_VERSION}" + echo "🏷️ Image Tag: ${AGENT_IMAGE_TAG}" + echo "🏷️ All Tags: ${TAGS}" + test: name: Skyhook Agent Unit Tests runs-on: ubuntu-latest @@ -62,7 +108,7 @@ jobs: cat test-summary.md >> $GITHUB_STEP_SUMMARY build-and-push-agent: runs-on: ubuntu-latest - needs: [test] # Don't run the build and push if the unit tests fail + needs: [test, compute-metadata] # Don't run the build and push if the unit tests fail # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. permissions: contents: read @@ -92,38 +138,21 @@ jobs: - name: Build the agent container image id: build + env: + GIT_SHA: ${{ needs.compute-metadata.outputs.git-sha }} + AGENT_VERSION: ${{ needs.compute-metadata.outputs.agent-version }} + TAGS: ${{ needs.compute-metadata.outputs.tags }} run: | apt-get update && apt-get install -y make git jq cd agent - # if this is a tag build, use the tag as the version, otherwise use the sha - git fetch --all - export GIT_SHA=$(git rev-parse --short ${{ github.sha }}) - TAGS="-t ${REGISTRY@L}/${{env.IMAGE_NAME}}/agent:${GIT_SHA}" - case ${{ github.ref_type }} in - branch) - # The last tag + current git sha - export AGENT_VERSION=$(git tag --list 'agent*' --sort=-v:refname | head -n 1 | cut -d/ -f2)+${GIT_SHA} - TAGS="$TAGS -t ${REGISTRY@L}/${{env.IMAGE_NAME}}/agent:$(echo "${AGENT_VERSION}" | tr + -)" - ;; - tag) - # The version part of the tag - export AGENT_VERSION=$(echo "${{ github.ref_name }}" | cut -f 2 -d /) - TAGS="$TAGS -t ${REGISTRY@L}/${{env.IMAGE_NAME}}/agent:${AGENT_VERSION} -t ${REGISTRY@L}/${{env.IMAGE_NAME}}/agent:latest" - ;; - *) - echo "Unkown type ${{ github.ref_type }}" - exit 1 - ;; - esac - export TAGS=$TAGS + echo "📦 Building agent version: ${AGENT_VERSION}" + echo "🏷️ Tags: ${TAGS}" export REGISTRY=${REGISTRY@L} export BUILD_ARGS="--push" make docker-build-only agent_version=${AGENT_VERSION} cat metadata.json echo "digest=$(cat metadata.json | jq -r .\"containerimage.digest\")" >> $GITHUB_OUTPUT cat $GITHUB_OUTPUT - env: - AGENT_IMAGE: ${{env.IMAGE_NAME}}/agent # This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see [AUTOTITLE](/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds). - name: Generate artifact attestation @@ -136,7 +165,7 @@ jobs: operator-agent-tests: name: Operator Agent Integration Tests runs-on: ubuntu-latest - needs: [build-and-push-agent] + needs: [compute-metadata, build-and-push-agent] permissions: contents: read packages: read @@ -194,9 +223,9 @@ jobs: ~/.cache/go-build - name: Run operator-agent tests + env: + AGENT_IMAGE: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/agent:${{ needs.compute-metadata.outputs.agent-image-tag }} run: | cd operator - export GIT_SHA=$(git rev-parse --short ${{ github.sha }}) - export AGENT_IMAGE="${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/agent:${GIT_SHA}" echo "Testing with agent image: ${AGENT_IMAGE}" make operator-agent-tests From f8a4b6a9aaf779e1e0f5da8178f3bdf11d309fdf Mon Sep 17 00:00:00 2001 From: Alex Yuskauskas Date: Wed, 21 Jan 2026 16:19:13 -0800 Subject: [PATCH 5/7] fix(agent/ci): reduce timeouts on integration tests and actually setup the cluster --- .github/workflows/agent-ci.yaml | 2 +- k8s-tests/operator-agent/dont_write_logs/chainsaw-test.yaml | 4 ++-- k8s-tests/operator-agent/simple/chainsaw-test.yaml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/agent-ci.yaml b/.github/workflows/agent-ci.yaml index 0129d563..84d6463b 100644 --- a/.github/workflows/agent-ci.yaml +++ b/.github/workflows/agent-ci.yaml @@ -228,4 +228,4 @@ jobs: run: | cd operator echo "Testing with agent image: ${AGENT_IMAGE}" - make operator-agent-tests + make setup-kind-cluster operator-agent-tests diff --git a/k8s-tests/operator-agent/dont_write_logs/chainsaw-test.yaml b/k8s-tests/operator-agent/dont_write_logs/chainsaw-test.yaml index 86f90af0..cc85aea4 100644 --- a/k8s-tests/operator-agent/dont_write_logs/chainsaw-test.yaml +++ b/k8s-tests/operator-agent/dont_write_logs/chainsaw-test.yaml @@ -22,8 +22,8 @@ metadata: name: dont-write-logs-agent-operator spec: timeouts: - assert: 240s - exec: 90s + assert: 10s + exec: 10s steps: - try: - script: diff --git a/k8s-tests/operator-agent/simple/chainsaw-test.yaml b/k8s-tests/operator-agent/simple/chainsaw-test.yaml index 2f5ed23a..8c6ceca6 100644 --- a/k8s-tests/operator-agent/simple/chainsaw-test.yaml +++ b/k8s-tests/operator-agent/simple/chainsaw-test.yaml @@ -22,8 +22,8 @@ metadata: name: simple-agent-operator spec: timeouts: - assert: 240s - exec: 90s + assert: 10s + exec: 10s steps: - try: - script: From 83220721dbf943d9255c4ac83fee8adb960aa6ef Mon Sep 17 00:00:00 2001 From: Alex Yuskauskas Date: Thu, 22 Jan 2026 12:51:38 -0800 Subject: [PATCH 6/7] fix(agent/k8s_test): fix the operator-agent tests and add cli building to agent ci for tests --- .github/workflows/agent-ci.yaml | 3 ++- k8s-tests/operator-agent/check_node.sh | 7 ++++++- .../dont_write_logs/chainsaw-test.yaml | 17 +++++++++-------- .../operator-agent/dont_write_logs/skyhook.yaml | 5 ++--- .../operator-agent/interrupt/chainsaw-test.yaml | 8 +++++++- k8s-tests/operator-agent/interrupt/skyhook.yaml | 5 ++--- .../reap_old_logs/chainsaw-test.yaml | 10 +++++++--- .../operator-agent/reap_old_logs/skyhook.yaml | 10 +++++----- .../operator-agent/simple/chainsaw-test.yaml | 14 +++++++++----- k8s-tests/operator-agent/simple/skyhook.yaml | 5 ++--- 10 files changed, 51 insertions(+), 33 deletions(-) diff --git a/.github/workflows/agent-ci.yaml b/.github/workflows/agent-ci.yaml index 84d6463b..77f64fab 100644 --- a/.github/workflows/agent-ci.yaml +++ b/.github/workflows/agent-ci.yaml @@ -224,8 +224,9 @@ jobs: - name: Run operator-agent tests env: - AGENT_IMAGE: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/agent:${{ needs.compute-metadata.outputs.agent-image-tag }} + AGENT_IMAGE: ${{ format('{0}/{1}/agent:{2}', toLower(env.REGISTRY), toLower(env.IMAGE_NAME), needs.compute-metadata.outputs.agent-image-tag) }} run: | cd operator echo "Testing with agent image: ${AGENT_IMAGE}" + make build-cli make setup-kind-cluster operator-agent-tests diff --git a/k8s-tests/operator-agent/check_node.sh b/k8s-tests/operator-agent/check_node.sh index 3fb487e2..c1266689 100755 --- a/k8s-tests/operator-agent/check_node.sh +++ b/k8s-tests/operator-agent/check_node.sh @@ -23,12 +23,17 @@ node=$1 cmd=$2 check=$3 timeout=${4:-10} +invert=${5:-false} # loop until the command returns a non-zero exit code or the timeout is reached for i in $(seq 1 ${timeout}); do data=$(kubectl exec ${node}-debugger -- chroot /host bash -c "${cmd}") - if echo "${data}" | grep -q "${check}"; then + check_result=$(echo "${data}" | grep -c "${check}") + if [ "$invert" == "true" ]; then + check_result=$((! check_result)) + fi + if [ $check_result -gt 0 ]; then echo "Check passed" exit 0 else diff --git a/k8s-tests/operator-agent/dont_write_logs/chainsaw-test.yaml b/k8s-tests/operator-agent/dont_write_logs/chainsaw-test.yaml index cc85aea4..fcc40cd2 100644 --- a/k8s-tests/operator-agent/dont_write_logs/chainsaw-test.yaml +++ b/k8s-tests/operator-agent/dont_write_logs/chainsaw-test.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - # yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test @@ -22,14 +21,14 @@ metadata: name: dont-write-logs-agent-operator spec: timeouts: - assert: 10s - exec: 10s + assert: 240s + exec: 90s steps: - try: - script: content: | ## remove annotation from last run - ../../../../operator/bin/skyhook reset simple-agent-operator --confirm 2>/dev/null || true + ../../../operator/bin/skyhook reset dont-write-logs-agent-operator --confirm 2>/dev/null || true - script: content: | ## reinstall the debug pod in case it was deleted @@ -37,15 +36,17 @@ spec: - script: content: | ## clean up any logs from prior runs - ../check_node.sh kind-worker "rm -f /var/log/skyhook/dont-write-logs-agent-operator/shellscript/1.1.1/*.log" ".*" 2 + ../check_node.sh kind-worker "rm -rf /var/log/skyhook/dont-write-logs-agent-operator || true" ".*" 2 + ../check_node.sh kind-worker "rm -rf /var/lib/skyhook/dont-write-logs-agent-operator || true" ".*" 2 - apply: file: skyhook.yaml - assert: file: assert.yaml - script: content: | - ../check_node.sh kind-worker "ls /var/lib/skyhook/dont-write-logs-agent-operator/flags/shellscript/1.1.1/" "shellscript_run.sh.*" - ../check_node.sh kind-worker "ls /var/log/skyhook/dont-write-logs-agent-operator/shellscript/1.1.1/*.log | wc -l" "0" 2 + set -e + ## Wont even create the log directory + ../check_node.sh kind-worker "ls /var/log/skyhook/" "dont-write-logs-agent-operator" 2 true - finally: - delete: file: skyhook.yaml diff --git a/k8s-tests/operator-agent/dont_write_logs/skyhook.yaml b/k8s-tests/operator-agent/dont_write_logs/skyhook.yaml index 4b1f3d58..a3b89ce8 100644 --- a/k8s-tests/operator-agent/dont_write_logs/skyhook.yaml +++ b/k8s-tests/operator-agent/dont_write_logs/skyhook.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: @@ -34,7 +33,7 @@ spec: - name: SKYHOOK_AGENT_WRITE_LOGS value: "false" configMap: - config.sh: | + apply.sh: | #!/bin/bash echo "Hello, world!" diff --git a/k8s-tests/operator-agent/interrupt/chainsaw-test.yaml b/k8s-tests/operator-agent/interrupt/chainsaw-test.yaml index e9d3417e..609aecf2 100644 --- a/k8s-tests/operator-agent/interrupt/chainsaw-test.yaml +++ b/k8s-tests/operator-agent/interrupt/chainsaw-test.yaml @@ -28,15 +28,21 @@ spec: - script: content: | ## remove annotation from last run - ../../../../operator/bin/skyhook reset simple-agent-operator --confirm 2>/dev/null || true + ../../../operator/bin/skyhook reset interrupt-agent-operator --confirm 2>/dev/null || true - script: content: | ## reinstall the debug pod in case it was deleted ../setup.sh kind-worker setup + - script: + content: | + ## clean up from prior runs + ../check_node.sh kind-worker "rm -rf /var/log/skyhook/interrupt-agent-operator || true" ".*" 2 + ../check_node.sh kind-worker "rm -rf /var/lib/skyhook/interrupt-agent-operator || true" ".*" 2 - apply: file: skyhook.yaml - script: content: | + set -e ../check_node.sh kind-worker "ls /var/lib/skyhook/interrupt-agent-operator/interrupts/flags/interrupt-agent-operator*/" "no_op.complete" 60 - assert: file: assert.yaml diff --git a/k8s-tests/operator-agent/interrupt/skyhook.yaml b/k8s-tests/operator-agent/interrupt/skyhook.yaml index ca004bbd..13c3d3bd 100644 --- a/k8s-tests/operator-agent/interrupt/skyhook.yaml +++ b/k8s-tests/operator-agent/interrupt/skyhook.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: @@ -33,6 +32,6 @@ spec: interrupt: type: noop configMap: - config.sh: | + apply.sh: | #!/bin/bash echo "Hello, world!" diff --git a/k8s-tests/operator-agent/reap_old_logs/chainsaw-test.yaml b/k8s-tests/operator-agent/reap_old_logs/chainsaw-test.yaml index 34f99b76..948c8811 100644 --- a/k8s-tests/operator-agent/reap_old_logs/chainsaw-test.yaml +++ b/k8s-tests/operator-agent/reap_old_logs/chainsaw-test.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - # yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test @@ -29,11 +28,16 @@ spec: - script: content: | ## remove annotation from last run - ../../../../operator/bin/skyhook reset reap-old-logs-agent-operator --confirm 2>/dev/null || true + ../../../operator/bin/skyhook reset reap-old-logs-agent-operator --confirm - script: content: | ## reinstall the debug pod in case it was deleted ../setup.sh kind-worker setup + - script: + content: | + ## clean up from prior runs + ../check_node.sh kind-worker "rm -rf /var/log/skyhook/reap-old-logs-agent-operator || true" ".*" 2 + ../check_node.sh kind-worker "rm -rf /var/lib/skyhook/reap-old-logs-agent-operator || true" ".*" 2 - apply: file: skyhook.yaml - assert: diff --git a/k8s-tests/operator-agent/reap_old_logs/skyhook.yaml b/k8s-tests/operator-agent/reap_old_logs/skyhook.yaml index 06a60278..fb0eed0f 100644 --- a/k8s-tests/operator-agent/reap_old_logs/skyhook.yaml +++ b/k8s-tests/operator-agent/reap_old_logs/skyhook.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: @@ -31,12 +30,13 @@ spec: version: "1.1.1" image: ghcr.io/nvidia/skyhook-packages/shellscript configMap: - config.sh: | + apply.sh: |- #!/bin/bash - if [ $(ls /var/lib/skyhook/reap-old-logs-agent-operator/flags/shellscript/1.1.1/*.log | wc -l) -eq 5 ]; then + sleep 1 + if [ $(ls /var/log/skyhook/reap-old-logs-agent-operator/shellscript/1.1.1/*.log | wc -l) -eq 5 ]; then echo "5 logs found. After this should still be 5." exit 0 - else: + else echo "Not enough logs yet. Erroring to produce more." exit 1 fi diff --git a/k8s-tests/operator-agent/simple/chainsaw-test.yaml b/k8s-tests/operator-agent/simple/chainsaw-test.yaml index 8c6ceca6..69ecbefd 100644 --- a/k8s-tests/operator-agent/simple/chainsaw-test.yaml +++ b/k8s-tests/operator-agent/simple/chainsaw-test.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - # yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test @@ -22,18 +21,23 @@ metadata: name: simple-agent-operator spec: timeouts: - assert: 10s - exec: 10s + assert: 240s + exec: 90s steps: - try: - script: content: | ## remove annotation from last run - ../../../../operator/bin/skyhook reset simple-agent-operator --confirm 2>/dev/null || true + ../../../operator/bin/skyhook reset simple-agent-operator --confirm 2>/dev/null || true - script: content: | ## reinstall the debug pod in case it was deleted ../setup.sh kind-worker setup + - script: + content: | + ## clean up from prior runs + ../check_node.sh kind-worker "rm -rf /var/log/skyhook/simple-agent-operator || true" ".*" 2 + ../check_node.sh kind-worker "rm -rf /var/lib/skyhook/simple-agent-operator || true" ".*" 2 - apply: file: skyhook.yaml - assert: diff --git a/k8s-tests/operator-agent/simple/skyhook.yaml b/k8s-tests/operator-agent/simple/skyhook.yaml index 9cac96a1..6a359912 100644 --- a/k8s-tests/operator-agent/simple/skyhook.yaml +++ b/k8s-tests/operator-agent/simple/skyhook.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: @@ -31,7 +30,7 @@ spec: version: "1.1.1" image: ghcr.io/nvidia/skyhook-packages/shellscript configMap: - config.sh: | + apply.sh: | #!/bin/bash echo "Hello, world!" From dea954524b6f16e0fc9f38eb47ba70a7e08d35a8 Mon Sep 17 00:00:00 2001 From: Alex Yuskauskas Date: Thu, 22 Jan 2026 13:31:08 -0800 Subject: [PATCH 7/7] fix(agent/ci): no toLower in github --- .github/workflows/agent-ci.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/agent-ci.yaml b/.github/workflows/agent-ci.yaml index 77f64fab..cb3c18ee 100644 --- a/.github/workflows/agent-ci.yaml +++ b/.github/workflows/agent-ci.yaml @@ -224,9 +224,10 @@ jobs: - name: Run operator-agent tests env: - AGENT_IMAGE: ${{ format('{0}/{1}/agent:{2}', toLower(env.REGISTRY), toLower(env.IMAGE_NAME), needs.compute-metadata.outputs.agent-image-tag) }} + AGENT_IMAGE: ${{ format('{0}/{1}/agent:{2}', env.REGISTRY, github.repository, needs.compute-metadata.outputs.agent-image-tag) }} run: | cd operator + export AGENT_IMAGE="${AGENT_IMAGE,,}" echo "Testing with agent image: ${AGENT_IMAGE}" make build-cli make setup-kind-cluster operator-agent-tests