149 changes: 124 additions & 25 deletions .github/workflows/agent-ci.yaml
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
#
@@ -36,6 +36,52 @@ env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
jobs:
compute-metadata:
name: Compute Image Metadata
runs-on: ubuntu-latest
outputs:
git-sha: ${{ steps.meta.outputs.git-sha }}
agent-version: ${{ steps.meta.outputs.agent-version }}
agent-image-tag: ${{ steps.meta.outputs.agent-image-tag }}
tags: ${{ steps.meta.outputs.tags }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Fetch all tags
run: git fetch --tags --force
- name: Compute metadata
id: meta
run: |
export GIT_SHA=$(git rev-parse --short ${{ github.sha }})
echo "git-sha=${GIT_SHA}" >> $GITHUB_OUTPUT

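# ${REGISTRY@L} below lowercases the registry value (bash ${var@L} expansion); container image references must be all lowercase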
case ${{ github.ref_type }} in
branch)
# The last tag + current git sha
export AGENT_VERSION=$(git tag --list 'agent*' --sort=-v:refname | head -n 1 | cut -d/ -f2)+${GIT_SHA}
# Convert + to - for docker tag compliance
export AGENT_IMAGE_TAG=$(echo "${AGENT_VERSION}" | tr + -)
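# e.g. a last tag of agent/1.2.3 at short sha abc1234 yields version "1.2.3+abc1234" and image tag "1.2.3-abc1234"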
TAGS="-t ${REGISTRY@L}/${{ github.repository }}/agent:${GIT_SHA} -t ${REGISTRY@L}/${{ github.repository }}/agent:${AGENT_IMAGE_TAG}"
;;
tag)
# The version part of the tag
export AGENT_VERSION=$(echo "${{ github.ref_name }}" | cut -f 2 -d /)
export AGENT_IMAGE_TAG="${AGENT_VERSION}"
TAGS="-t ${REGISTRY@L}/${{ github.repository }}/agent:${GIT_SHA} -t ${REGISTRY@L}/${{ github.repository }}/agent:${AGENT_VERSION} -t ${REGISTRY@L}/${{ github.repository }}/agent:latest"
;;
*)
echo "Unknown type ${{ github.ref_type }}"
exit 1
;;
esac

echo "agent-version=${AGENT_VERSION}" >> $GITHUB_OUTPUT
echo "agent-image-tag=${AGENT_IMAGE_TAG}" >> $GITHUB_OUTPUT
echo "tags=${TAGS}" >> $GITHUB_OUTPUT
echo "📦 Agent Version: ${AGENT_VERSION}"
echo "🏷️ Image Tag: ${AGENT_IMAGE_TAG}"
echo "🏷️ All Tags: ${TAGS}"

test:
name: Skyhook Agent Unit Tests
runs-on: ubuntu-latest
@@ -62,7 +108,7 @@ jobs:
cat test-summary.md >> $GITHUB_STEP_SUMMARY
build-and-push-agent:
runs-on: ubuntu-latest
needs: [test] # Don't run the build and push if the unit tests fail
needs: [test, compute-metadata] # Don't run the build and push if the unit tests fail
# Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
permissions:
contents: read
@@ -92,38 +138,21 @@ jobs:

- name: Build the agent container image
id: build
env:
GIT_SHA: ${{ needs.compute-metadata.outputs.git-sha }}
AGENT_VERSION: ${{ needs.compute-metadata.outputs.agent-version }}
TAGS: ${{ needs.compute-metadata.outputs.tags }}
run: |
apt-get update && apt-get install -y make git jq
cd agent
# if this is a tag build, use the tag as the version, otherwise use the sha
git fetch --all
export GIT_SHA=$(git rev-parse --short ${{ github.sha }})
TAGS="-t ${REGISTRY@L}/${{env.IMAGE_NAME}}/agent:${GIT_SHA}"
case ${{ github.ref_type }} in
branch)
# The last tag + current git sha
export AGENT_VERSION=$(git tag --list 'agent*' --sort=-v:refname | head -n 1 | cut -d/ -f2)+${GIT_SHA}
TAGS="$TAGS -t ${REGISTRY@L}/${{env.IMAGE_NAME}}/agent:$(echo "${AGENT_VERSION}" | tr + -)"
;;
tag)
# The version part of the tag
export AGENT_VERSION=$(echo "${{ github.ref_name }}" | cut -f 2 -d /)
TAGS="$TAGS -t ${REGISTRY@L}/${{env.IMAGE_NAME}}/agent:${AGENT_VERSION} -t ${REGISTRY@L}/${{env.IMAGE_NAME}}/agent:latest"
;;
*)
echo "Unkown type ${{ github.ref_type }}"
exit 1
;;
esac
export TAGS=$TAGS
echo "📦 Building agent version: ${AGENT_VERSION}"
echo "🏷️ Tags: ${TAGS}"
export REGISTRY=${REGISTRY@L}
export BUILD_ARGS="--push"
make docker-build-only agent_version=${AGENT_VERSION}
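# metadata.json is assumed to be written by the buildx invocation inside `make docker-build-only`; its digest feeds the attestation step below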
cat metadata.json
echo "digest=$(cat metadata.json | jq -r .\"containerimage.digest\")" >> $GITHUB_OUTPUT
cat $GITHUB_OUTPUT
env:
AGENT_IMAGE: ${{env.IMAGE_NAME}}/agent

# This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see [Using artifact attestations to establish provenance for builds](https://docs.github.com/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds).
- name: Generate artifact attestation
@@ -132,3 +161,73 @@ jobs:
subject-name: ${{ env.REGISTRY }}/${{env.IMAGE_NAME}}/agent
subject-digest: ${{ steps.build.outputs.digest }}
push-to-registry: true

operator-agent-tests:
name: Operator Agent Integration Tests
runs-on: ubuntu-latest
needs: [compute-metadata, build-and-push-agent]
permissions:
contents: read
packages: read
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-tags: true
fetch-depth: 0

- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: '1.25.5'
cache-dependency-path: operator/go.sum

- name: Log in to the Container registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Create Kubernetes KinD Cluster
uses: helm/kind-action@v1
with:
version: v0.31.0
node_image: kindest/node:v1.35.0
config: operator/config/local-dev/kind-config.yaml
cluster_name: kind

- name: Restore cached Binaries
id: cached-binaries
uses: actions/cache/restore@v4
with:
key: 1.25.5-${{ runner.os }}-${{ runner.arch }}-bin-${{ hashFiles('operator/deps.mk') }}
restore-keys: 1.25.5-${{ runner.os }}-${{ runner.arch }}-bin-
path: |
${{ github.workspace }}/operator/bin
~/.cache/go-build

- name: Install dependencies
if: steps.cached-binaries.outputs.cache-hit != 'true'
run: |
cd operator
make install-deps

- name: Save cached Binaries
if: steps.cached-binaries.outputs.cache-hit != 'true'
uses: actions/cache/save@v4
with:
key: 1.25.5-${{ runner.os }}-${{ runner.arch }}-bin-${{ hashFiles('operator/deps.mk') }}
path: |
${{ github.workspace }}/operator/bin
~/.cache/go-build

- name: Run operator-agent tests
env:
AGENT_IMAGE: ${{ format('{0}/{1}/agent:{2}', env.REGISTRY, github.repository, needs.compute-metadata.outputs.agent-image-tag) }}
run: |
cd operator
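# ${AGENT_IMAGE,,} lowercases the image reference, since github.repository may contain uppercase characters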
export AGENT_IMAGE="${AGENT_IMAGE,,}"
echo "Testing with agent image: ${AGENT_IMAGE}"
make build-cli
make setup-kind-cluster operator-agent-tests
69 changes: 55 additions & 14 deletions agent/skyhook-agent/src/skyhook_agent/controller.py
@@ -17,6 +17,8 @@
# limitations under the License.


import contextlib
import sys
import os
import shutil
@@ -69,7 +71,9 @@ def _get_env_config() -> tuple[str]:

SKYHOOK_LOG_DIR = os.getenv("SKYHOOK_LOG_DIR", "/var/log/skyhook")

return SKYHOOK_RESOURCE_ID, SKYHOOK_DATA_DIR, SKYHOOK_ROOT_DIR, SKYHOOK_LOG_DIR
SKYHOOK_AGENT_WRITE_LOGS = os.getenv("SKYHOOK_AGENT_WRITE_LOGS", "true").lower() == 'true'

return SKYHOOK_RESOURCE_ID, SKYHOOK_DATA_DIR, SKYHOOK_ROOT_DIR, SKYHOOK_LOG_DIR, SKYHOOK_AGENT_WRITE_LOGS

def _get_package_information(config_data: dict) -> tuple[str, str]:
return config_data["package_name"], config_data["package_version"]
@@ -129,15 +133,43 @@ async def _stream_process(
sink.flush()
break

class NullWriter:
"""A file-like context manager that discards all writes; read support is not needed."""

def write(self, *args, **kwargs):
# Discard the data but return its length to mimic file.write() behaviour
if args:
return len(args[0])
return 0

def flush(self):
pass

def close(self):
pass

def __enter__(self):
return self

def __exit__(self, exc_type, exc_val, exc_tb):
# Nothing to clean up; exceptions are not suppressed
return False


async def tee(chroot_dir: str, cmd: List[str], stdout_sink_path: str, stderr_sink_path: str, write_cmds=False, no_chmod=False, env: dict[str, str] = {}, **kwargs):
async def tee(chroot_dir: str, cmd: List[str], stdout_sink_path: str, stderr_sink_path: str, write_cmds=False, no_chmod=False, env: dict[str, str] = {}, write_logs: bool=True, **kwargs):
"""
Run the cmd in a subprocess and keep the stream of stdout/stderr and merge both into
the sink_path as a log.
"""
# get the directory of the script
script_dir = os.path.dirname(os.path.abspath(__file__))
with open(stdout_sink_path, "w") as stdout_sink_f, open(stderr_sink_path, "w") as stderr_sink_f:
# Defer opening the sinks: real log files when write_logs is set, NullWriter sinks otherwise
if write_logs:
files = (lambda: open(stdout_sink_path, 'w'), lambda: open(stderr_sink_path, 'w'))
else:
files = (lambda: NullWriter(), lambda: NullWriter())
with files[0]() as stdout_sink_f, files[1]() as stderr_sink_f:
if write_cmds:
sys.stdout.write(" ".join(cmd) + "\n")
stdout_sink_f.write(" ".join(cmd) + "\n")
@@ -172,7 +204,7 @@ def get_host_path_for_steps(copy_dir: str):
return f"{copy_dir}/skyhook_dir"

def get_skyhook_directory(root_mount: str) -> str:
_, _, SKYHOOK_ROOT_DIR, _ = _get_env_config()
_, _, SKYHOOK_ROOT_DIR, _, _ = _get_env_config()
return f"{root_mount}{SKYHOOK_ROOT_DIR}"

def get_flag_dir(root_mount: str) -> str:
@@ -182,7 +214,7 @@ def get_history_dir(root_mount: str) -> str:
return f"{get_skyhook_directory(root_mount)}/history"

def get_log_dir(root_mount: str) -> str:
_, _, _, SKYHOOK_LOG_DIR = _get_env_config()
_, _, _, SKYHOOK_LOG_DIR, _ = _get_env_config()
return f"{root_mount}{SKYHOOK_LOG_DIR}"

def get_log_file(step_path: str, copy_dir: str, config_data: dict, root_mount: str, timestamp: str=None) -> str:
@@ -220,21 +252,23 @@ def set_flag(flag_file: str, msg: str = "") -> None:
f.write(msg)


def _run(chroot_dir: str, cmds: list[str], log_path: str, write_cmds=False, no_chmod=False, env: dict[str, str] = {}, **kwargs) -> int:
def _run(chroot_dir: str, cmds: list[str], log_path: str|None, write_cmds=False, no_chmod=False, env: dict[str, str] = {}, write_logs: bool=True, **kwargs) -> int:
"""
Synchronous wrapper around the tee command to have logs written to disk
"""
# "tee" the stdout and stderr to a file to log the step results
stderr_path = f"{log_path}.err" if log_path else None

result = asyncio.run(
tee(
chroot_dir,
cmds,
log_path,
f"{log_path}.err",
stderr_path,
write_cmds=write_cmds,
no_chmod=no_chmod,
env=env,
write_logs=write_logs,
**kwargs
)
)
@@ -283,7 +317,11 @@ def run_step(
return True

time.sleep(1)
log_file = get_log_file(step_path, copy_dir, config_data, chroot_dir)
_, _, _, _, SKYHOOK_AGENT_WRITE_LOGS = _get_env_config()
if SKYHOOK_AGENT_WRITE_LOGS:
log_file = get_log_file(step_path, copy_dir, config_data, chroot_dir)
else:
log_file = None
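# With log writing disabled, _run()/tee() fall back to NullWriter sinks instead of opening log files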

# Compile additional environment variables
env = {}
@@ -294,9 +332,11 @@
chroot_dir,
[step_path, *step.arguments],
log_file,
env=env)
env=env,
write_logs=SKYHOOK_AGENT_WRITE_LOGS)

cleanup_old_logs(get_log_file(step_path, copy_dir, config_data, "*"))
if SKYHOOK_AGENT_WRITE_LOGS:
cleanup_old_logs(get_log_file(step_path, copy_dir, config_data, chroot_dir, "*"))
if return_code not in step.returncodes:
print(f"FAILED: {step.path} {' '.join(step.arguments)} {return_code}")
return True
@@ -421,7 +461,7 @@ def summarize_check_results(results: list[bool], step_data: dict[Mode, list[Step
return False

def make_config_data_from_resource_id() -> dict:
SKYHOOK_RESOURCE_ID, _, _, _ = _get_env_config()
SKYHOOK_RESOURCE_ID, _, _, _, _ = _get_env_config()

# Interrupts don't really have config data we can read from the Package as it is run standalone.
# So read it off of SKYHOOK_RESOURCE_ID instead
Expand All @@ -441,7 +481,7 @@ def do_interrupt(interrupt_data: str, root_mount: str, copy_dir: str) -> bool:
def _make_interrupt_flag(interrupt_dir: str, interrupt_id: int) -> str:
return f"{interrupt_dir}/{interrupt_id}.complete"

SKYHOOK_RESOURCE_ID, _, _, _ = _get_env_config()
SKYHOOK_RESOURCE_ID, _, _, _, _ = _get_env_config()
config_data = make_config_data_from_resource_id()

interrupt = interrupts.inflate(interrupt_data)
@@ -509,7 +549,7 @@ def main(mode: Mode, root_mount: str, copy_dir: str, interrupt_data: None|str, a
if mode == Mode.INTERRUPT:
return do_interrupt(interrupt_data, root_mount, copy_dir)

_, SKYHOOK_DATA_DIR, _, _ = _get_env_config()
_, SKYHOOK_DATA_DIR, _, _, _ = _get_env_config()

# Check to see if the directory has already been copied down. If it hasn't assume that we
# are running in legacy mode and copy the directory down.
@@ -651,12 +691,13 @@ def cli(sys_argv: list[str]=sys.argv):
print(str.center("ENV CONFIGURATION", 20, "-"))
print(f"COPY_RESOLV: {copy_resolv}")
print(f"OVERLAY_ALWAYS_RUN_STEP: {always_run_step}")
SKYHOOK_RESOURCE_ID, SKYHOOK_DATA_DIR, SKYHOOK_ROOT_DIR, SKYHOOK_LOG_DIR = _get_env_config()
SKYHOOK_RESOURCE_ID, SKYHOOK_DATA_DIR, SKYHOOK_ROOT_DIR, SKYHOOK_LOG_DIR, SKYHOOK_AGENT_WRITE_LOGS = _get_env_config()
print(f"SKYHOOK_RESOURCE_ID: {SKYHOOK_RESOURCE_ID}")
print(f"SKYHOOK_DATA_DIR: {SKYHOOK_DATA_DIR}")
print(f"SKYHOOK_ROOT_DIR: {SKYHOOK_ROOT_DIR}")
print(f"SKYHOOK_LOG_DIR: {SKYHOOK_LOG_DIR}")
print(f"SKYHOOK_AGENT_BUFFER_LIMIT: {buff_size}")
print(f"SKYHOOK_AGENT_WRITE_LOGS: {SKYHOOK_AGENT_WRITE_LOGS}")
print(str.center("Directory CONFIGURATION", 20, "-"))
# print flag dir and log dir
config_data = make_config_data_from_resource_id()