From 6ffd98ca5f1973a2d45b7906de2dda13295fefe5 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 5 Dec 2025 06:49:55 +0000 Subject: [PATCH 01/29] Add SkyPilot integration for job launching --- python/monarch/_src/job/skypilot.py | 343 +++++++++++++++ python/monarch/job/__init__.py | 16 +- python/tests/test_skypilot_integration.py | 213 ++++++++++ python/tests/test_skypilot_job.py | 493 ++++++++++++++++++++++ 4 files changed, 1064 insertions(+), 1 deletion(-) create mode 100644 python/monarch/_src/job/skypilot.py create mode 100644 python/tests/test_skypilot_integration.py create mode 100644 python/tests/test_skypilot_job.py diff --git a/python/monarch/_src/job/skypilot.py b/python/monarch/_src/job/skypilot.py new file mode 100644 index 000000000..39795d239 --- /dev/null +++ b/python/monarch/_src/job/skypilot.py @@ -0,0 +1,343 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe + +import logging +import os +import sys +import time +from typing import Dict, List, Optional, Tuple, TYPE_CHECKING + +from monarch._src.job.job import JobState, JobTrait + +# Defer imports that may not be available in all environments +if TYPE_CHECKING: + import sky + from sky.backends.cloud_vm_ray_backend import CloudVmRayResourceHandle + +try: + import sky + from sky.backends.cloud_vm_ray_backend import CloudVmRayResourceHandle + + HAS_SKYPILOT = True +except ImportError: + HAS_SKYPILOT = False + sky = None # type: ignore[assignment] + CloudVmRayResourceHandle = None # type: ignore[assignment, misc] + + +logger: logging.Logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +logger.addHandler(logging.StreamHandler(sys.stderr)) +logger.propagate = False + +# Default port for Monarch TCP communication +DEFAULT_MONARCH_PORT = 22222 + + +def _configure_transport() -> None: + """Configure the Monarch transport. Deferred import to avoid import errors.""" + from monarch._rust_bindings.monarch_hyperactor.channel import ChannelTransport + from monarch._rust_bindings.monarch_hyperactor.config import configure + + configure(default_transport=ChannelTransport.TcpWithHostname) + + +def _attach_to_workers_wrapper(name: str, ca: str, workers: List[str]): + """Wrapper around attach_to_workers with deferred import.""" + from monarch._src.actor.bootstrap import attach_to_workers + + return attach_to_workers(name=name, ca=ca, workers=workers) + + +class SkyPilotJob(JobTrait): + """ + A job scheduler that uses SkyPilot to provision cloud instances. + + SkyPilot supports multiple cloud providers (AWS, GCP, Azure, Lambda, etc.) + and can automatically select the cheapest available option. + + This implementation: + 1. Uses sky.launch() to provision cloud instances with specified resources + 2. Runs Monarch workers on each node via a startup script + 3. 
Connects to workers using their IP addresses from the cluster handle + + Example: + >>> import sky + >>> from monarch.job import SkyPilotJob + >>> + >>> job = SkyPilotJob( + ... meshes={"trainers": 2}, + ... resources=sky.Resources(accelerators="A100:1"), + ... cluster_name="my-monarch-cluster", + ... ) + >>> state = job.state() + >>> trainers = state.trainers # HostMesh with 2 nodes + """ + + def __init__( + self, + meshes: Dict[str, int], + resources: Optional["sky.Resources"] = None, + cluster_name: Optional[str] = None, + monarch_port: int = DEFAULT_MONARCH_PORT, + idle_minutes_to_autostop: Optional[int] = None, + down_on_autostop: bool = False, + python_exe: str = "python", + setup_commands: Optional[str] = None, + ) -> None: + """ + Args: + meshes: Dictionary mapping mesh names to number of nodes. + e.g., {"trainers": 4, "dataloaders": 2} + resources: SkyPilot Resources specification for the instances. + If None, uses SkyPilot defaults. + cluster_name: Name for the SkyPilot cluster. If None, auto-generated. + monarch_port: Port for TCP communication between Monarch workers. + idle_minutes_to_autostop: If set, cluster will autostop after this + many minutes of idleness. + down_on_autostop: If True, tear down cluster on autostop instead of + just stopping it. + python_exe: Python executable to use for worker processes. + setup_commands: Optional setup commands to run before starting workers. + Use this to install dependencies. + """ + if not HAS_SKYPILOT: + raise ImportError( + "SkyPilot is not installed. 
Install it with: pip install skypilot" + ) + + # Configure transport at runtime when Monarch is available + try: + _configure_transport() + except ImportError: + # Monarch bindings not available, will fail later when needed + pass + + super().__init__() + + self._meshes = meshes + self._resources = resources + self._cluster_name = cluster_name + self._port = monarch_port + self._idle_minutes_to_autostop = idle_minutes_to_autostop + self._down_on_autostop = down_on_autostop + self._python_exe = python_exe + self._setup_commands = setup_commands + + # Runtime state + self._launched_cluster_name: Optional[str] = None + self._node_ips: List[str] = [] + + def _create(self, client_script: Optional[str]) -> None: + """Launch a SkyPilot cluster and start Monarch workers.""" + if client_script is not None: + raise RuntimeError("SkyPilotJob cannot run batch-mode scripts yet") + + total_nodes = sum(self._meshes.values()) + + # Build the worker startup command + worker_command = self._build_worker_command() + + # Create setup commands + setup = self._setup_commands or "" + if setup and not setup.endswith("\n"): + setup += "\n" + + # Create the SkyPilot task + task = sky.Task( + name="monarch-workers", + setup=setup if setup else None, + run=worker_command, + num_nodes=total_nodes, + ) + + if self._resources is not None: + task.set_resources(self._resources) + + # Generate cluster name if not provided + cluster_name = self._cluster_name or f"monarch-{os.getpid()}" + + logger.info(f"Launching SkyPilot cluster '{cluster_name}' with {total_nodes} nodes") + + # Launch the cluster + # Note: sky.launch returns a request ID in the SDK, we need to get the result + try: + request_id = sky.launch( + task, + cluster_name=cluster_name, + idle_minutes_to_autostop=self._idle_minutes_to_autostop, + down=self._down_on_autostop, + ) + # Get the result from the request + job_id, handle = sky.get(request_id) + except Exception as e: + logger.error(f"Failed to launch SkyPilot cluster: {e}") + 
raise RuntimeError(f"Failed to launch SkyPilot cluster: {e}") from e + + self._launched_cluster_name = cluster_name + logger.info(f"SkyPilot cluster '{cluster_name}' launched successfully") + + def _build_worker_command(self) -> str: + """Build the command to start Monarch workers on each node.""" + # This command will be run on each node + # We use the node's IP to create a unique address for each worker + return f""" +import socket +hostname = socket.gethostname() +# Get the IP address of this node +ip_addr = socket.gethostbyname(hostname) +address = f"tcp://{{ip_addr}}:{self._port}" +print(f"Starting Monarch worker at {{address}}") + +from monarch.actor import run_worker_loop_forever +run_worker_loop_forever(address=address, ca="trust_all_connections") +""" + + def _get_node_ips(self) -> List[str]: + """Get the IP addresses of all nodes in the cluster.""" + if not self._launched_cluster_name: + raise RuntimeError("Cluster has not been launched yet") + + # Query cluster status to get handle with node IPs + try: + request_id = sky.status(cluster_names=[self._launched_cluster_name]) + statuses = sky.get(request_id) + except Exception as e: + raise RuntimeError(f"Failed to get cluster status: {e}") from e + + if not statuses: + raise RuntimeError( + f"Cluster '{self._launched_cluster_name}' not found" + ) + + status = statuses[0] + handle = status.handle + + if handle is None: + raise RuntimeError( + f"Cluster '{self._launched_cluster_name}' has no handle" + ) + + if not isinstance(handle, CloudVmRayResourceHandle): + raise RuntimeError( + f"Unexpected handle type: {type(handle)}" + ) + + # Get the external IPs from the handle + if handle.stable_internal_external_ips is None: + raise RuntimeError("Cluster has no IP information") + + # stable_internal_external_ips is List[Tuple[internal_ip, external_ip]] + # We use external IPs to connect + ips = [] + for internal_ip, external_ip in handle.stable_internal_external_ips: + # Prefer external IP, fall back to internal + 
ip = external_ip if external_ip else internal_ip + if ip: + ips.append(ip) + + if not ips: + raise RuntimeError("No IP addresses found for cluster nodes") + + return ips + + def _wait_for_workers_ready( + self, expected_nodes: int, timeout: int = 300, poll_interval: int = 5 + ) -> List[str]: + """Wait for workers to be ready and return their addresses.""" + start_time = time.time() + + while time.time() - start_time < timeout: + try: + ips = self._get_node_ips() + if len(ips) >= expected_nodes: + logger.info(f"Found {len(ips)} nodes ready") + return ips + except Exception as e: + logger.debug(f"Waiting for workers: {e}") + + time.sleep(poll_interval) + + raise RuntimeError( + f"Timeout waiting for {expected_nodes} workers after {timeout}s" + ) + + def _state(self) -> JobState: + """Get the current state with HostMesh objects for each mesh.""" + if not self._jobs_active(): + raise RuntimeError("SkyPilot cluster is not active") + + # Get node IPs if not cached + if not self._node_ips: + total_nodes = sum(self._meshes.values()) + self._node_ips = self._wait_for_workers_ready(total_nodes) + + # Distribute IPs among meshes + host_meshes = {} + ip_idx = 0 + + for mesh_name, num_nodes in self._meshes.items(): + mesh_ips = self._node_ips[ip_idx : ip_idx + num_nodes] + ip_idx += num_nodes + + workers = [f"tcp://{ip}:{self._port}" for ip in mesh_ips] + + host_mesh = _attach_to_workers_wrapper( + name=mesh_name, + ca="trust_all_connections", + workers=workers, + ) + host_meshes[mesh_name] = host_mesh + + return JobState(host_meshes) + + def can_run(self, spec: "JobTrait") -> bool: + """Check if this job can run the given spec.""" + if not isinstance(spec, SkyPilotJob): + return False + + return ( + spec._meshes == self._meshes + and spec._resources == self._resources + and spec._port == self._port + and self._jobs_active() + ) + + def _jobs_active(self) -> bool: + """Check if the SkyPilot cluster is still active.""" + if not self.active or not self._launched_cluster_name: + 
return False + + try: + request_id = sky.status(cluster_names=[self._launched_cluster_name]) + statuses = sky.get(request_id) + + if not statuses: + return False + + status = statuses[0] + # Check if cluster is UP + return status.status == sky.ClusterStatus.UP + except Exception as e: + logger.warning(f"Error checking cluster status: {e}") + return False + + def _kill(self) -> None: + """Tear down the SkyPilot cluster.""" + if self._launched_cluster_name is not None: + try: + logger.info(f"Tearing down SkyPilot cluster '{self._launched_cluster_name}'") + request_id = sky.down(self._launched_cluster_name) + sky.get(request_id) + logger.info(f"Cluster '{self._launched_cluster_name}' terminated") + except Exception as e: + logger.warning(f"Failed to tear down cluster: {e}") + + self._launched_cluster_name = None + self._node_ips.clear() + diff --git a/python/monarch/job/__init__.py b/python/monarch/job/__init__.py index b6852a0a1..674007d53 100644 --- a/python/monarch/job/__init__.py +++ b/python/monarch/job/__init__.py @@ -8,5 +8,19 @@ from monarch._src.job.job import job_load, job_loads, JobState, JobTrait, LocalJob from monarch._src.job.slurm import SlurmJob +# SkyPilot is an optional dependency +try: + from monarch._src.job.skypilot import SkyPilotJob +except ImportError: + SkyPilotJob = None # type: ignore[misc,assignment] + # Define exports -__all__ = ["JobTrait", "job_load", "job_loads", "JobState", "LocalJob", "SlurmJob"] +__all__ = [ + "JobTrait", + "job_load", + "job_loads", + "JobState", + "LocalJob", + "SlurmJob", + "SkyPilotJob", +] diff --git a/python/tests/test_skypilot_integration.py b/python/tests/test_skypilot_integration.py new file mode 100644 index 000000000..5469f4717 --- /dev/null +++ b/python/tests/test_skypilot_integration.py @@ -0,0 +1,213 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Integration test script for SkyPilot job. + +This script tests the basic SkyPilot integration without requiring Monarch +runtime. It validates that SkyPilot cluster launching and node IP retrieval works. + +Run this script with: + python tests/test_skypilot_integration.py + +Prerequisites: +- SkyPilot installed and configured with cloud credentials +- Run `sky check` to verify cloud access +""" + +import argparse +import sys +import time + +try: + import sky + from sky.backends.cloud_vm_ray_backend import CloudVmRayResourceHandle +except ImportError: + print("Error: SkyPilot is not installed. Install with: pip install skypilot") + sys.exit(1) + + +def test_skypilot_cluster_launch( + cluster_name: str = "monarch-integration-test", + cloud: str = "aws", + cpus: str = "2+", + timeout_minutes: int = 10, +) -> bool: + """ + Test launching a SkyPilot cluster and retrieving node IPs. 
+ + Args: + cluster_name: Name for the test cluster + cloud: Cloud provider to use + cpus: CPU specification + timeout_minutes: Timeout for cluster launch + + Returns: + True if test passed, False otherwise + """ + print(f"\n{'='*60}") + print("SkyPilot Integration Test") + print(f"{'='*60}\n") + + # Create a simple task + task = sky.Task( + name="monarch-test-task", + run="echo 'SkyPilot test successful' && hostname && sleep 30", + ) + + # Set resources based on cloud + cloud_obj = None + if cloud.lower() == "aws": + cloud_obj = sky.AWS() + elif cloud.lower() == "gcp": + cloud_obj = sky.GCP() + elif cloud.lower() == "azure": + cloud_obj = sky.Azure() + elif cloud.lower() == "kubernetes": + cloud_obj = sky.Kubernetes() + + resources = sky.Resources( + cloud=cloud_obj, + cpus=cpus, + ) + task.set_resources(resources) + + print(f"Test configuration:") + print(f" Cluster name: {cluster_name}") + print(f" Cloud: {cloud}") + print(f" CPUs: {cpus}") + print() + + try: + # Launch the cluster + print("Step 1: Launching cluster...") + request_id = sky.launch( + task, + cluster_name=cluster_name, + idle_minutes_to_autostop=5, + down=True, # Auto-teardown after idle + ) + + print(f" Request ID: {request_id}") + job_id, handle = sky.get(request_id) + print(f" Job ID: {job_id}") + + if handle is None: + print(" ERROR: No handle returned from launch") + return False + + print(" Cluster launched successfully!") + + # Get cluster status and node IPs + print("\nStep 2: Getting cluster status and node IPs...") + request_id = sky.status(cluster_names=[cluster_name]) + statuses = sky.get(request_id) + + if not statuses: + print(" ERROR: No status returned") + return False + + status = statuses[0] + print(f" Cluster status: {status.status}") + print(f" Cluster name: {status.name}") + + handle = status.handle + if handle is None: + print(" ERROR: Status has no handle") + return False + + if not isinstance(handle, CloudVmRayResourceHandle): + print(f" ERROR: Unexpected handle type: 
{type(handle)}") + return False + + # Get IPs + if handle.stable_internal_external_ips: + print(f"\n Node IPs ({len(handle.stable_internal_external_ips)} nodes):") + for i, (internal_ip, external_ip) in enumerate( + handle.stable_internal_external_ips + ): + print(f" Node {i}: internal={internal_ip}, external={external_ip}") + else: + print(" WARNING: No IP information available yet") + + # Test passed! + print("\n" + "=" * 60) + print("TEST PASSED!") + print("=" * 60) + print( + "\nThe SkyPilot integration is working correctly." + "\nMonarch workers can be launched on these nodes." + ) + return True + + except Exception as e: + print(f"\nERROR: {e}") + import traceback + + traceback.print_exc() + return False + + finally: + # Cleanup + print("\nStep 3: Cleaning up cluster...") + try: + request_id = sky.down(cluster_name) + sky.get(request_id) + print(" Cluster terminated successfully") + except Exception as e: + print(f" Warning: Failed to cleanup cluster: {e}") + print(f" You may need to manually run: sky down {cluster_name}") + + +def main(): + parser = argparse.ArgumentParser( + description="Integration test for SkyPilot-Monarch integration" + ) + parser.add_argument( + "--cluster-name", + default="monarch-integration-test", + help="Name for the test cluster", + ) + parser.add_argument( + "--cloud", + default="aws", + choices=["aws", "gcp", "azure", "kubernetes"], + help="Cloud provider to use", + ) + parser.add_argument( + "--cpus", + default="2+", + help="CPU specification", + ) + parser.add_argument( + "--timeout", + type=int, + default=10, + help="Timeout in minutes for cluster launch", + ) + + args = parser.parse_args() + + # Check SkyPilot is configured + print("Checking SkyPilot configuration...") + print(f" Using cloud: {args.cloud}") + print(" (Run 'sky check' to verify cloud credentials)") + + # Run the test + success = test_skypilot_cluster_launch( + cluster_name=args.cluster_name, + cloud=args.cloud, + cpus=args.cpus, + timeout_minutes=args.timeout, 
+ ) + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() + diff --git a/python/tests/test_skypilot_job.py b/python/tests/test_skypilot_job.py new file mode 100644 index 000000000..295c7a74d --- /dev/null +++ b/python/tests/test_skypilot_job.py @@ -0,0 +1,493 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +"""Tests for the SkyPilot job integration.""" + +import os +import sys +import tempfile +from typing import Any, Dict, List, Optional, Tuple +from unittest import mock + +import pytest + + +# Check if SkyPilot is available +try: + import sky + + HAS_SKYPILOT = True +except ImportError: + HAS_SKYPILOT = False + +# Check if Monarch bindings are available +try: + from monarch._rust_bindings.monarch_hyperactor.config import configure + + HAS_MONARCH_BINDINGS = True +except ImportError: + HAS_MONARCH_BINDINGS = False + +# Skip all tests in this module if SkyPilot or Monarch bindings are not installed +pytestmark = [ + pytest.mark.skipif(not HAS_SKYPILOT, reason="SkyPilot not installed"), + pytest.mark.skipif(not HAS_MONARCH_BINDINGS, reason="Monarch bindings not available"), +] + + +class MockClusterHandle: + """Mock CloudVmRayResourceHandle for testing.""" + + def __init__( + self, + cluster_name: str, + node_ips: List[Tuple[str, str]], + ): + self.cluster_name = cluster_name + self.cluster_name_on_cloud = cluster_name + self.stable_internal_external_ips = node_ips + self.launched_nodes = len(node_ips) + + +class MockStatusResponse: + """Mock status response from sky.status().""" + + def __init__( + self, + name: str, + status: "sky.ClusterStatus", + handle: Optional[MockClusterHandle] = None, + ): + self.name = name + self.status = status + self.handle = handle + + +@pytest.fixture +def mock_sky(): + """Fixture to mock SkyPilot SDK functions.""" + with 
mock.patch("monarch._src.job.skypilot.sky") as mock_sky_module: + # Mock ClusterStatus enum + mock_sky_module.ClusterStatus = sky.ClusterStatus + + # Mock sky.launch to return a mock request_id + mock_sky_module.launch.return_value = "mock-request-id" + + # Mock sky.get to return appropriate results + def mock_get(request_id): + if request_id == "mock-request-id": + # Return (job_id, handle) for launch + return ( + 1, + MockClusterHandle( + "test-cluster", + [("10.0.0.1", "1.2.3.4"), ("10.0.0.2", "1.2.3.5")], + ), + ) + elif request_id == "mock-status-request-id": + # Return list of status responses + return [ + MockStatusResponse( + "test-cluster", + sky.ClusterStatus.UP, + MockClusterHandle( + "test-cluster", + [("10.0.0.1", "1.2.3.4"), ("10.0.0.2", "1.2.3.5")], + ), + ) + ] + elif request_id == "mock-down-request-id": + return None + return None + + mock_sky_module.get.side_effect = mock_get + + # Mock sky.status + mock_sky_module.status.return_value = "mock-status-request-id" + + # Mock sky.down + mock_sky_module.down.return_value = "mock-down-request-id" + + # Mock sky.Task + mock_sky_module.Task = mock.MagicMock() + + # Mock sky.Resources + mock_sky_module.Resources = sky.Resources + + yield mock_sky_module + + +@pytest.fixture +def mock_attach_to_workers(): + """Fixture to mock attach_to_workers wrapper.""" + with mock.patch( + "monarch._src.job.skypilot._attach_to_workers_wrapper" + ) as mock_attach: + # Create a simple mock HostMesh + class MockHostMesh: + def __init__(self, name): + self.name = name + + def create_mock_host_mesh(name, ca, workers): + return MockHostMesh(name) + + mock_attach.side_effect = create_mock_host_mesh + yield mock_attach + + +@pytest.fixture +def mock_configure_transport(): + """Fixture to mock _configure_transport.""" + with mock.patch( + "monarch._src.job.skypilot._configure_transport" + ) as mock_config: + yield mock_config + + +@pytest.mark.skipif(not HAS_SKYPILOT, reason="SkyPilot not installed") +def 
test_skypilot_job_import(): + """Test that SkyPilotJob can be imported from monarch.job.""" + from monarch.job import SkyPilotJob + + # SkyPilotJob should be available (or None if import failed) + # This test verifies the export is working + if HAS_MONARCH_BINDINGS: + assert SkyPilotJob is not None + # If bindings are not available, SkyPilotJob will be None (graceful degradation) + + +def test_skypilot_job_init(mock_configure_transport): + """Test SkyPilotJob initialization.""" + from monarch._src.job.skypilot import SkyPilotJob + + job = SkyPilotJob( + meshes={"trainers": 2, "workers": 1}, + cluster_name="test-cluster", + monarch_port=12345, + ) + + assert job._meshes == {"trainers": 2, "workers": 1} + assert job._cluster_name == "test-cluster" + assert job._port == 12345 + assert job._launched_cluster_name is None + assert job._node_ips == [] + + +def test_skypilot_job_init_with_resources(mock_configure_transport): + """Test SkyPilotJob initialization with SkyPilot resources.""" + from monarch._src.job.skypilot import SkyPilotJob + + resources = sky.Resources(accelerators="A100:1") + + job = SkyPilotJob( + meshes={"trainers": 4}, + resources=resources, + cluster_name="gpu-cluster", + ) + + assert job._resources == resources + assert job._meshes == {"trainers": 4} + + +def test_skypilot_job_build_worker_command(mock_configure_transport): + """Test the worker command generation.""" + from monarch._src.job.skypilot import SkyPilotJob + + job = SkyPilotJob( + meshes={"trainers": 1}, + monarch_port=22222, + ) + + command = job._build_worker_command() + + # Check that the command contains expected elements + assert "socket.gethostname()" in command + assert "tcp://" in command + assert "22222" in command + assert "run_worker_loop_forever" in command + assert 'ca="trust_all_connections"' in command + + +def test_skypilot_job_create(mock_sky, mock_attach_to_workers, mock_configure_transport): + """Test the _create method.""" + from monarch._src.job.skypilot import 
SkyPilotJob + + job = SkyPilotJob( + meshes={"trainers": 2}, + cluster_name="test-cluster", + ) + + # Call _create + job._create(None) + + # Verify sky.launch was called + mock_sky.launch.assert_called_once() + + # Check that cluster name was stored + assert job._launched_cluster_name == "test-cluster" + + +def test_skypilot_job_create_batch_mode_raises(mock_sky, mock_configure_transport): + """Test that _create raises an error for batch mode.""" + from monarch._src.job.skypilot import SkyPilotJob + + job = SkyPilotJob(meshes={"trainers": 1}) + + with pytest.raises(RuntimeError, match="batch-mode scripts"): + job._create("some_script.py") + + +def test_skypilot_job_state(mock_sky, mock_attach_to_workers, mock_configure_transport): + """Test the _state method.""" + from monarch._src.job.skypilot import SkyPilotJob + + job = SkyPilotJob( + meshes={"trainers": 2}, + cluster_name="test-cluster", + ) + + # Apply the job first + job.apply() + + # Now get state + state = job._state() + + # Verify attach_to_workers was called with correct addresses + mock_attach_to_workers.assert_called() + call_args = mock_attach_to_workers.call_args + + # Check the call arguments + assert call_args.kwargs["name"] == "trainers" + assert call_args.kwargs["ca"] == "trust_all_connections" + # Workers should use external IPs + workers = call_args.kwargs["workers"] + assert len(workers) == 2 + assert all("tcp://" in w for w in workers) + + # Check that state has the trainers mesh + assert hasattr(state, "trainers") + + +def test_skypilot_job_state_multiple_meshes(mock_sky, mock_attach_to_workers, mock_configure_transport): + """Test _state with multiple meshes.""" + from monarch._src.job.skypilot import SkyPilotJob + + # Create mock status with 3 nodes + def mock_get_multi(request_id): + if request_id == "mock-request-id": + return ( + 1, + MockClusterHandle( + "test-cluster", + [ + ("10.0.0.1", "1.2.3.4"), + ("10.0.0.2", "1.2.3.5"), + ("10.0.0.3", "1.2.3.6"), + ], + ), + ) + elif request_id 
== "mock-status-request-id": + return [ + MockStatusResponse( + "test-cluster", + sky.ClusterStatus.UP, + MockClusterHandle( + "test-cluster", + [ + ("10.0.0.1", "1.2.3.4"), + ("10.0.0.2", "1.2.3.5"), + ("10.0.0.3", "1.2.3.6"), + ], + ), + ) + ] + return None + + mock_sky.get.side_effect = mock_get_multi + + job = SkyPilotJob( + meshes={"trainers": 2, "evaluator": 1}, + cluster_name="test-cluster", + ) + + job.apply() + state = job._state() + + # Verify attach_to_workers was called twice (once for each mesh) + assert mock_attach_to_workers.call_count == 2 + + # Check that state has both meshes + assert hasattr(state, "trainers") + assert hasattr(state, "evaluator") + + +def test_skypilot_job_kill(mock_sky, mock_attach_to_workers, mock_configure_transport): + """Test the _kill method.""" + from monarch._src.job.skypilot import SkyPilotJob + + job = SkyPilotJob( + meshes={"trainers": 1}, + cluster_name="test-cluster", + ) + + # Apply the job first + job.apply() + assert job._launched_cluster_name == "test-cluster" + + # Kill the job + job._kill() + + # Verify sky.down was called + mock_sky.down.assert_called_once_with("test-cluster") + + # Check that state was cleared + assert job._launched_cluster_name is None + assert job._node_ips == [] + + +def test_skypilot_job_can_run(mock_sky, mock_attach_to_workers, mock_configure_transport): + """Test the can_run method.""" + from monarch._src.job.skypilot import SkyPilotJob + + job1 = SkyPilotJob( + meshes={"trainers": 2}, + cluster_name="test-cluster", + monarch_port=22222, + ) + + job2 = SkyPilotJob( + meshes={"trainers": 2}, + cluster_name="test-cluster", + monarch_port=22222, + ) + + job3 = SkyPilotJob( + meshes={"trainers": 4}, # Different mesh config + cluster_name="test-cluster", + monarch_port=22222, + ) + + # Apply job1 + job1.apply() + + # job1 should be able to run job2 (same config) + assert job1.can_run(job2) is True + + # job1 should NOT be able to run job3 (different mesh config) + assert job1.can_run(job3) 
is False + + +def test_skypilot_job_jobs_active(mock_sky, mock_attach_to_workers, mock_configure_transport): + """Test the _jobs_active method.""" + from monarch._src.job.skypilot import SkyPilotJob + + job = SkyPilotJob( + meshes={"trainers": 1}, + cluster_name="test-cluster", + ) + + # Before apply, should not be active + assert job._jobs_active() is False + + # Apply the job + job.apply() + + # After apply, should be active (mocked status returns UP) + assert job._jobs_active() is True + + +def test_skypilot_job_serialization(mock_sky, mock_attach_to_workers, mock_configure_transport): + """Test that SkyPilotJob can be serialized and deserialized.""" + from monarch._src.job.skypilot import SkyPilotJob + from monarch._src.job.job import job_loads + + job = SkyPilotJob( + meshes={"trainers": 2, "workers": 1}, + cluster_name="test-cluster", + monarch_port=33333, + ) + + # Serialize + serialized = job.dumps() + + # Deserialize + loaded_job = job_loads(serialized) + + # Check attributes + assert isinstance(loaded_job, SkyPilotJob) + assert loaded_job._meshes == {"trainers": 2, "workers": 1} + assert loaded_job._cluster_name == "test-cluster" + assert loaded_job._port == 33333 + + +def test_skypilot_job_with_setup_commands(mock_configure_transport): + """Test SkyPilotJob with custom setup commands.""" + from monarch._src.job.skypilot import SkyPilotJob + + setup = "pip install torch\npip install monarch" + + job = SkyPilotJob( + meshes={"trainers": 1}, + setup_commands=setup, + ) + + assert job._setup_commands == setup + + +def test_skypilot_job_with_autostop(mock_configure_transport): + """Test SkyPilotJob with autostop configuration.""" + from monarch._src.job.skypilot import SkyPilotJob + + job = SkyPilotJob( + meshes={"trainers": 1}, + idle_minutes_to_autostop=30, + down_on_autostop=True, + ) + + assert job._idle_minutes_to_autostop == 30 + assert job._down_on_autostop is True + + +# Integration test - only run if explicitly requested 
+@pytest.mark.skip(reason="Integration test - run manually with --run-integration") +def test_skypilot_job_integration(): + """ + Integration test that actually launches a SkyPilot cluster. + + To run this test: + pytest tests/test_skypilot_job.py::test_skypilot_job_integration --run-integration + + Make sure you have SkyPilot credentials configured. + """ + from monarch._src.job.skypilot import SkyPilotJob + + # Create a minimal job - just 1 node with cheap resources + job = SkyPilotJob( + meshes={"workers": 1}, + resources=sky.Resources( + cloud=sky.AWS(), # Change to your preferred cloud + cpus="2+", + ), + cluster_name="monarch-test-integration", + idle_minutes_to_autostop=5, + down_on_autostop=True, + ) + + try: + # Apply the job + job.apply() + + # Check that we can get state + state = job.state() + assert hasattr(state, "workers") + + print("Integration test passed!") + finally: + # Always clean up + job.kill() + From 17536f3168a1129861c02cb3426385f65a0ccc73 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 5 Dec 2025 07:22:25 +0000 Subject: [PATCH 02/29] Add workdir and file_mounts parameters to SkyPilotJob Summary: This update introduces two new optional parameters, `workdir` and `file_mounts`, to the `SkyPilotJob` class. The `workdir` parameter allows users to specify a local directory to sync with the cluster, while `file_mounts` enables additional file mounts by mapping remote paths to local paths. These enhancements improve the flexibility and usability of job configurations in SkyPilot. 
--- python/monarch/_src/job/skypilot.py | 30 +++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/python/monarch/_src/job/skypilot.py b/python/monarch/_src/job/skypilot.py index 39795d239..1a49a04f9 100644 --- a/python/monarch/_src/job/skypilot.py +++ b/python/monarch/_src/job/skypilot.py @@ -89,6 +89,8 @@ def __init__( down_on_autostop: bool = False, python_exe: str = "python", setup_commands: Optional[str] = None, + workdir: Optional[str] = None, + file_mounts: Optional[Dict[str, str]] = None, ) -> None: """ Args: @@ -104,7 +106,11 @@ def __init__( just stopping it. python_exe: Python executable to use for worker processes. setup_commands: Optional setup commands to run before starting workers. - Use this to install dependencies. + Use this to install dependencies including Monarch. + workdir: Local directory to sync to the cluster. If provided, this + directory will be uploaded to ~/sky_workdir on each node. + file_mounts: Dictionary mapping remote paths to local paths for + additional file mounts. 
""" if not HAS_SKYPILOT: raise ImportError( @@ -128,6 +134,8 @@ def __init__( self._down_on_autostop = down_on_autostop self._python_exe = python_exe self._setup_commands = setup_commands + self._workdir = workdir + self._file_mounts = file_mounts # Runtime state self._launched_cluster_name: Optional[str] = None @@ -154,8 +162,13 @@ def _create(self, client_script: Optional[str]) -> None: setup=setup if setup else None, run=worker_command, num_nodes=total_nodes, + workdir=self._workdir, ) + # Add file mounts if provided + if self._file_mounts: + task.set_file_mounts(self._file_mounts) + if self._resources is not None: task.set_resources(self._resources) @@ -183,20 +196,21 @@ def _create(self, client_script: Optional[str]) -> None: logger.info(f"SkyPilot cluster '{cluster_name}' launched successfully") def _build_worker_command(self) -> str: - """Build the command to start Monarch workers on each node.""" - # This command will be run on each node - # We use the node's IP to create a unique address for each worker - return f""" + """Build the bash command to start Monarch workers on each node.""" + # This command will be run on each node via SkyPilot + # SkyPilot expects a bash script, so we wrap Python code in python -c + python_code = f''' import socket hostname = socket.gethostname() -# Get the IP address of this node ip_addr = socket.gethostbyname(hostname) address = f"tcp://{{ip_addr}}:{self._port}" print(f"Starting Monarch worker at {{address}}") - from monarch.actor import run_worker_loop_forever run_worker_loop_forever(address=address, ca="trust_all_connections") -""" +''' + # Escape single quotes in the Python code for bash + escaped_code = python_code.replace("'", "'\"'\"'") + return f"python -c '{escaped_code}'" def _get_node_ips(self) -> List[str]: """Get the IP addresses of all nodes in the cluster.""" From b2344bf547548db49ea7e72df5f55db1b2e13920 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 5 Dec 2025 18:05:55 +0000 Subject: [PATCH 03/29] 
fixes --- python/monarch/_src/job/skypilot.py | 10 ++++++++-- python/tests/test_skypilot_job.py | 8 ++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/python/monarch/_src/job/skypilot.py b/python/monarch/_src/job/skypilot.py index 1a49a04f9..529997fe7 100644 --- a/python/monarch/_src/job/skypilot.py +++ b/python/monarch/_src/job/skypilot.py @@ -170,7 +170,12 @@ def _create(self, client_script: Optional[str]) -> None: task.set_file_mounts(self._file_mounts) if self._resources is not None: - task.set_resources(self._resources) + # Copy resources and override image_id to use PyTorch image with CUDA + # This ensures torchmonarch has access to CUDA libraries + resources = self._resources.copy( + image_id="docker:pytorch/pytorch:2.9.1-cuda12.6-cudnn9-devel" + ) + task.set_resources(resources) # Generate cluster name if not provided cluster_name = self._cluster_name or f"monarch-{os.getpid()}" @@ -210,7 +215,8 @@ def _build_worker_command(self) -> str: ''' # Escape single quotes in the Python code for bash escaped_code = python_code.replace("'", "'\"'\"'") - return f"python -c '{escaped_code}'" + # Set timeout env var - setup takes time so we need longer than default 30s + return f"export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=5m && python -c '{escaped_code}'" def _get_node_ips(self) -> List[str]: """Get the IP addresses of all nodes in the cluster.""" diff --git a/python/tests/test_skypilot_job.py b/python/tests/test_skypilot_job.py index 295c7a74d..b6af37a22 100644 --- a/python/tests/test_skypilot_job.py +++ b/python/tests/test_skypilot_job.py @@ -1,12 +1,12 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. +# All rights retuprved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
# pyre-unsafe -"""Tests for the SkyPilot job integration.""" +"""Tests for the SubernyPilot job integration.""" import os import sys @@ -425,8 +425,8 @@ def test_skypilot_job_serialization(mock_sky, mock_attach_to_workers, mock_confi assert loaded_job._port == 33333 -def test_skypilot_job_with_setup_commands(mock_configure_transport): - """Test SkyPilotJob with custom setup commands.""" +def test_skypilot_job_with_upup_commands(mock_configure_transport): + """Test SkyPilotJob with custom ppppppppppp commands.""" from monarch._src.job.skypilot import SkyPilotJob setup = "pip install torch\npip install monarch" From 6803cbf12505e9023fecc988a9223ddbc202259b Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 5 Dec 2025 19:35:57 +0000 Subject: [PATCH 04/29] Add SkyPilot integration example and documentation - Add SKY_README.md with comprehensive documentation: - Architecture overview - Implementation details - Usage examples - Troubleshooting guide - Networking considerations for Kubernetes - Add python/examples/skypilot_getting_started.py: - Example script demonstrating Monarch actors on SkyPilot - Supports multiple clouds (Kubernetes, AWS, GCP, Azure) - Configurable via command-line arguments - Update skypilot.py: - Add host mesh initialization wait - Improve logging for debugging - Fix worker command environment variable setup --- SKY_README.md | 293 ++++++++++++++++++++ python/examples/skypilot_getting_started.py | 266 ++++++++++++++++++ python/monarch/_src/job/skypilot.py | 44 ++- 3 files changed, 598 insertions(+), 5 deletions(-) create mode 100644 SKY_README.md create mode 100644 python/examples/skypilot_getting_started.py diff --git a/SKY_README.md b/SKY_README.md new file mode 100644 index 000000000..1558c62da --- /dev/null +++ b/SKY_README.md @@ -0,0 +1,293 @@ +# Monarch + SkyPilot Integration + +This document describes the SkyPilot integration for Monarch, which enables running Monarch actors on cloud infrastructure provisioned by SkyPilot. 
+ +## Overview + +SkyPilot is a framework for running ML workloads on any cloud (AWS, GCP, Azure, Lambda, Kubernetes, etc.). The `SkyPilotJob` class in Monarch provides a seamless integration that: + +1. **Provisions cloud instances** using SkyPilot's unified API +2. **Installs Monarch** (`torchmonarch` from PyPI) on remote nodes +3. **Starts Monarch workers** on each node listening for connections +4. **Connects clients** to workers using TCP for distributed actor communication + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Client Machine │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ SkyPilotJob │ │ +│ │ - Calls sky.launch() to provision cloud instances │ │ +│ │ - Configures setup commands to install torchmonarch │ │ +│ │ - Builds worker command with run_worker_loop_forever() │ │ +│ │ - Calls attach_to_workers() to create HostMesh │ │ +│ └─────────────────────────────────────────────────────────┘ │ +└───────────────────────────────┬─────────────────────────────────┘ + │ TCP connections (port 22222) + ┌───────────────────────┼───────────────────────┐ + │ │ │ + ▼ ▼ ▼ +┌───────────────┐ ┌───────────────┐ ┌───────────────┐ +│ Worker 1 │ │ Worker 2 │ │ Worker N │ +│ (Cloud Node) │ │ (Cloud Node) │ │ (Cloud Node) │ +│ │ │ │ │ │ +│ run_worker_ │ │ run_worker_ │ │ run_worker_ │ +│ loop_forever()│ │ loop_forever()│ │ loop_forever()│ +│ │ │ │ │ │ +│ tcp://<ip>: │ │ tcp://<ip>: │ │ tcp://<ip>: │ +│ 22222 │ │ 22222 │ │ 22222 │ +└───────────────┘ └───────────────┘ └───────────────┘ +``` + +## Implementation Details + +### Files + +- **`python/monarch/_src/job/skypilot.py`**: Core `SkyPilotJob` implementation +- **`python/monarch/job/__init__.py`**: Exports `SkyPilotJob` (with graceful ImportError handling) +- **`python/tests/test_skypilot_job.py`**: Unit tests with mocked SkyPilot +- **`python/tests/test_skypilot_integration.py`**: Integration test scaffolding +- **`python/examples/skypilot_getting_started.py`**: 
Example demonstrating usage + +### Key Classes and Functions + +#### `SkyPilotJob(JobTrait)` + +Main job class that implements the Monarch `JobTrait` interface. + +```python +from monarch.job import SkyPilotJob +import sky + +job = SkyPilotJob( + meshes={"trainers": 2}, # 2 nodes for "trainers" mesh + resources=sky.Resources( + cloud=sky.Kubernetes(), + accelerators="H100:1", + ), + cluster_name="my-cluster", + idle_minutes_to_autostop=10, + down_on_autostop=True, + setup_commands="pip install torchmonarch", +) + +state = job.state() # Launches cluster and returns JobState +hosts = state.trainers # HostMesh with 2 nodes +``` + +#### Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `meshes` | `Dict[str, int]` | Mesh names to node counts | +| `resources` | `sky.Resources` | SkyPilot resource specification | +| `cluster_name` | `str` | Name for the cluster | +| `monarch_port` | `int` | TCP port for workers (default: 22222) | +| `idle_minutes_to_autostop` | `int` | Auto-stop after idle minutes | +| `down_on_autostop` | `bool` | Terminate (not just stop) on autostop | +| `setup_commands` | `str` | Shell commands to run before workers start | +| `workdir` | `str` | Local directory to sync to cluster | +| `file_mounts` | `Dict[str, str]` | Additional file mounts | + +### Worker Lifecycle + +1. **Launch**: `sky.launch()` creates the cluster with specified resources +2. **Setup**: `setup_commands` run to install `torchmonarch` +3. **Run**: Worker command executes `run_worker_loop_forever(address, ca)` +4. **Connect**: Client calls `attach_to_workers()` to create `HostMesh` +5. 
**Teardown**: `sky.down()` terminates the cluster + +### Environment Variables + +The following environment variables control timeouts: + +```python +os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s" # Worker spawn timeout +os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s" # Message delivery timeout +os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s" # Proc mesh spawn timeout +``` + +## Requirements + +### Client Side +- Monarch with Rust bindings (`pip install -e .` from source) +- SkyPilot (`pip install skypilot`) +- Configured cloud credentials (`sky check`) + +### Worker Side (installed via setup_commands) +- `torchmonarch` from PyPI +- **CUDA libraries** - torchmonarch requires `libcuda.so.1` +- This means workers **must run on GPU nodes** + +## Usage + +### Basic Example + +```python +import sky +from monarch.job import SkyPilotJob +from monarch.actor import Actor, endpoint + +class MyActor(Actor): + @endpoint + def hello(self) -> str: + return "Hello from cloud!" + +# Create job +job = SkyPilotJob( + meshes={"workers": 2}, + resources=sky.Resources( + cloud=sky.AWS(), + accelerators="A100:1", + ), + setup_commands="pip install torchmonarch", +) + +# Launch and get state +state = job.state() +hosts = state.workers + +# Spawn processes and actors +procs = hosts.spawn_procs(per_host={"gpus": 1}) +actors = procs.spawn("my_actors", MyActor) + +# Interact with actors +results = actors.hello.call().get() +print(results) # ["Hello from cloud!", "Hello from cloud!"] + +# Cleanup +job.kill() +``` + +### Running the Example + +```bash +# Install dependencies +pip install skypilot +pip install -e . 
# Build Monarch from source + +# Configure cloud credentials +sky check + +# Run example +cd python/examples +python skypilot_getting_started.py \ + --cloud kubernetes \ + --num-hosts 2 \ + --accelerator "H100:1" \ + --cluster-name my-monarch-cluster +``` + +### Supported Clouds + +- **Kubernetes**: Use `sky.Kubernetes()` with `--region` for context +- **AWS**: Use `sky.AWS()` +- **GCP**: Use `sky.GCP()` +- **Azure**: Use `sky.Azure()` +- **Lambda Labs**: Use `sky.Lambda()` +- And others supported by SkyPilot + +## Networking Considerations + +### Kubernetes + +When using Kubernetes, the client and workers must be in the **same Kubernetes cluster** for pod-to-pod communication. Use the `region` parameter to specify the Kubernetes context: + +```python +resources=sky.Resources( + cloud=sky.Kubernetes(), + region="my-k8s-context", # Must match client's cluster +) +``` + +### Public Clouds (AWS, GCP, Azure) + +SkyPilot handles networking automatically. Workers get public IPs that clients can connect to. + +### Firewall + +Ensure port 22222 (or your custom `monarch_port`) is accessible: +- Kubernetes: Pod networking should handle this +- AWS: Security groups +- GCP: Firewall rules +- Azure: Network security groups + +## Troubleshooting + +### "libcuda.so.1: cannot open shared object file" + +**Cause**: Workers are running on CPU-only nodes, but `torchmonarch` requires CUDA. + +**Solution**: Request GPU nodes: +```python +resources=sky.Resources(accelerators="H100:1") +``` + +### "No route to host" or connection timeouts + +**Cause**: Client and workers are in different networks (e.g., different Kubernetes clusters). + +**Solution**: Ensure client and workers are in the same network: +- For Kubernetes: Use `region` parameter to specify the correct context +- For public clouds: Check security group / firewall rules + +### "error spawning proc mesh: statuses: Timeout" + +**Causes**: +1. Workers aren't listening on the expected port +2. Network connectivity issues +3. 
Workers crashed during startup + +**Debug steps**: +1. Check SkyPilot logs: `sky logs <cluster-name>` +2. SSH into cluster: `sky ssh <cluster-name>` +3. Check if port is listening: `ss -tlnp | grep 22222` +4. Check Monarch logs: `/tmp/sky/monarch_log.log` + +### Workers crash immediately + +Check SkyPilot logs for the error: +```bash +sky logs <cluster-name> +``` + +Common issues: +- Missing CUDA libraries → use GPU nodes +- torchmonarch installation failed → check setup_commands +- Python version mismatch → ensure compatible Python version + +## Testing + +### Unit Tests (with mocked SkyPilot) + +```bash +cd python +pytest tests/test_skypilot_job.py -v +``` + +### Integration Tests (requires real cloud) + +```bash +cd python +pytest tests/test_skypilot_integration.py -v --cloud kubernetes +``` + +## Comparison with SlurmJob + +| Feature | SkyPilotJob | SlurmJob | +|---------|-------------|----------| +| Cloud Support | Multi-cloud (AWS, GCP, Azure, K8s, etc.) | HPC clusters only | +| Setup | Automatic via SkyPilot | Requires Slurm installation | +| Autoscaling | Supported | Depends on cluster | +| Cost Optimization | Automatic (cheapest region) | N/A | +| Worker Discovery | Via cluster handle IPs | Via squeue hostnames | + +## Future Work + +- [ ] Support for spot/preemptible instances +- [ ] Multi-region deployments +- [ ] Automatic failover on spot termination +- [ ] Integration with SkyPilot managed jobs +- [ ] Support for batch mode (client script on cluster) + diff --git a/python/examples/skypilot_getting_started.py b/python/examples/skypilot_getting_started.py new file mode 100644 index 000000000..64fbf9658 --- /dev/null +++ b/python/examples/skypilot_getting_started.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +""" +Monarch Getting Started with SkyPilot +===================================== + +This script demonstrates running Monarch actors on cloud infrastructure +provisioned by SkyPilot. It follows the Monarch getting started guide +but uses SkyPilot to launch the worker nodes. + +Prerequisites: +- Monarch installed with its Rust bindings (build with `pip install -e .` in monarch/) +- SkyPilot installed and configured (run `sky check`) +- torchmonarch available on PyPI (requires CUDA on remote nodes) + +Usage: + python skypilot_getting_started.py + + # With explicit options: + python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 + +See SKY_README.md for full documentation. +""" + +import argparse +import os +import sys + +# Set timeouts before importing monarch - worker setup takes time +os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s" +os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s" +os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s" + +# Check dependencies before importing +try: + import sky +except ImportError: + print("ERROR: SkyPilot is not installed. 
Run: pip install skypilot") + sys.exit(1) + +try: + from monarch.job import SkyPilotJob + from monarch.actor import Actor, endpoint, ProcMesh, context +except ImportError as e: + print(f"ERROR: Monarch is not properly installed: {e}") + print("\nTo install Monarch, you need to build it from source:") + print(" cd monarch/") + print(" pip install -e .") + print("\nThis requires the Rust toolchain and other dependencies.") + print("See monarch/README.md for full installation instructions.") + sys.exit(1) + +# ============================================================================ +# Step 1: Define our Actors (same as getting started guide) +# ============================================================================ + + +class Counter(Actor): + """A simple counter actor that demonstrates basic messaging.""" + + def __init__(self, initial_value: int = 0): + self.value = initial_value + + @endpoint + def increment(self) -> None: + self.value += 1 + + @endpoint + def get_value(self) -> int: + return self.value + + +class Trainer(Actor): + """A trainer actor that demonstrates distributed training patterns.""" + + @endpoint + def step(self) -> str: + my_point = context().message_rank + return f"Trainer {my_point} taking a step." + + @endpoint + def get_info(self) -> str: + rank = context().actor_instance.rank + return f"Trainer at rank {rank}" + + +# ============================================================================ +# Step 2: Create a SkyPilot Job to provision cloud infrastructure +# ============================================================================ + + +def get_cloud(cloud_name: str): + """Get SkyPilot cloud object from name.""" + clouds = { + "kubernetes": sky.Kubernetes, + "aws": sky.AWS, + "gcp": sky.GCP, + "azure": sky.Azure, + } + if cloud_name.lower() not in clouds: + raise ValueError(f"Unknown cloud: {cloud_name}. 
Available: {list(clouds.keys())}") + return clouds[cloud_name.lower()]() + + +def main(): + parser = argparse.ArgumentParser(description="Monarch Getting Started with SkyPilot") + parser.add_argument( + "--cloud", + default="kubernetes", + help="Cloud provider to use (kubernetes, aws, gcp, azure)", + ) + parser.add_argument( + "--num-hosts", + type=int, + default=2, + help="Number of host nodes to provision", + ) + parser.add_argument( + "--gpus-per-host", + type=int, + default=2, + help="Number of GPU processes per host", + ) + parser.add_argument( + "--cluster-name", + default="monarch-getting-started", + help="Name for the SkyPilot cluster", + ) + parser.add_argument( + "--accelerator", + default="H100:1", + help="GPU accelerator to request (e.g., H100:1, A100:1, V100:1)", + ) + parser.add_argument( + "--region", + default=None, + help="Cloud region/Kubernetes context to use", + ) + args = parser.parse_args() + + print("=" * 60) + print("Monarch Getting Started with SkyPilot") + print("=" * 60) + print(f"\nConfiguration:") + print(f" Cloud: {args.cloud}") + print(f" Hosts: {args.num_hosts}") + print(f" GPUs per host: {args.gpus_per_host}") + print(f" Accelerator: {args.accelerator}") + print(f" Cluster name: {args.cluster_name}") + if args.region: + print(f" Region: {args.region}") + + # Create a SkyPilotJob to provision nodes + # This will launch cloud instances and start Monarch workers on them + print("\n[1] Creating SkyPilot job...") + + # Setup commands to install Monarch on the remote nodes + # torchmonarch is the PyPI package name for Monarch + setup_commands = """ +sudo apt-get update && sudo apt-get install -y rdma-core libibverbs1 libmlx5-1 libibverbs-dev || true +pip install torchmonarch +echo "DONE INSTALLING TORCHMONARCH" +""" + + # Build resources specification + resources_kwargs = { + "cloud": get_cloud(args.cloud), + "cpus": "2+", + "accelerators": args.accelerator, # GPU required - torchmonarch needs CUDA + } + if args.region: + 
resources_kwargs["region"] = args.region + + job = SkyPilotJob( + # Define the mesh of hosts we need + meshes={"trainers": args.num_hosts}, + # Specify cloud resources - GPU required for torchmonarch (needs CUDA) + resources=sky.Resources(**resources_kwargs), + cluster_name=args.cluster_name, + # Auto-cleanup after 10 minutes of idle time + idle_minutes_to_autostop=10, + down_on_autostop=True, + # Setup commands to install dependencies + setup_commands=setup_commands, + ) + + try: + # Get the job state - this launches the cluster and returns HostMeshes + print("\n[2] Launching cluster and starting Monarch workers...") + state = job.state() + + # Get our host mesh + hosts = state.trainers + print(f" Got host mesh with extent: {hosts.extent}") + + # ==================================================================== + # Step 3: Spawn processes and actors on the cloud hosts + # ==================================================================== + + print("\n[3] Spawning processes on cloud hosts...") + # Create a process mesh - GPU processes per host + procs: ProcMesh = hosts.spawn_procs(per_host={"gpus": args.gpus_per_host}) + print(f" Process mesh extent: {procs.extent}") + + # Spawn counter actors + print("\n[4] Spawning Counter actors...") + counters: Counter = procs.spawn("counters", Counter, initial_value=0) + + # ==================================================================== + # Step 4: Interact with the actors + # ==================================================================== + + # Broadcast increment to all counters + print("\n[5] Broadcasting increment to all counters...") + counters.increment.broadcast() + counters.increment.broadcast() + counters.increment.broadcast() + + # Get all counter values + print("\n[6] Getting counter values...") + values = counters.get_value.call().get() + print(f" Counter values: {values}") + + # Spawn trainer actors + print("\n[7] Spawning Trainer actors...") + trainers: Trainer = procs.spawn("trainers", Trainer) + 
+ # Do a training step + print("\n[8] Performing distributed training step...") + results = trainers.step.call().get() + for r in results: + print(f" {r}") + + # Get trainer info + print("\n[9] Getting trainer info...") + info = trainers.get_info.call().get() + for i in info: + print(f" {i}") + + print("\n" + "=" * 60) + print("SUCCESS! Monarch actors ran on SkyPilot cluster!") + print("=" * 60) + + except Exception as e: + print(f"\nERROR: {e}") + import traceback + traceback.print_exc() + print(f"\n[10] ERROR - not cleaning up cluster for debugging...") + print(f" You can debug with: sky ssh {args.cluster_name}") + print(f" To clean up later: sky down {args.cluster_name}") + raise + else: + # Clean up - tear down the SkyPilot cluster + print("\n[10] Cleaning up SkyPilot cluster...") + job.kill() + print(" Cluster terminated.") + + +if __name__ == "__main__": + main() + diff --git a/python/monarch/_src/job/skypilot.py b/python/monarch/_src/job/skypilot.py index 529997fe7..21454de45 100644 --- a/python/monarch/_src/job/skypilot.py +++ b/python/monarch/_src/job/skypilot.py @@ -204,19 +204,42 @@ def _build_worker_command(self) -> str: """Build the bash command to start Monarch workers on each node.""" # This command will be run on each node via SkyPilot # SkyPilot expects a bash script, so we wrap Python code in python -c + # Note: Use IP address (not hostname) for the worker address since + # Kubernetes hostnames may not resolve across pods python_code = f''' import socket +import logging +import sys + +# Enable verbose logging +logging.basicConfig(level=logging.DEBUG, stream=sys.stdout, format="%(asctime)s %(levelname)s %(name)s: %(message)s") + hostname = socket.gethostname() ip_addr = socket.gethostbyname(hostname) address = f"tcp://{{ip_addr}}:{self._port}" -print(f"Starting Monarch worker at {{address}}") -from monarch.actor import run_worker_loop_forever -run_worker_loop_forever(address=address, ca="trust_all_connections") +print(f"Starting Monarch worker at 
{{address}} (hostname={{hostname}})", flush=True) +sys.stdout.flush() + +try: + from monarch.actor import run_worker_loop_forever + print(f"Imported run_worker_loop_forever successfully", flush=True) + print(f"Worker ready and listening...", flush=True) + run_worker_loop_forever(address=address, ca="trust_all_connections") +except Exception as e: + print(f"ERROR in worker: {{e}}", flush=True) + import traceback + traceback.print_exc() + raise ''' # Escape single quotes in the Python code for bash escaped_code = python_code.replace("'", "'\"'\"'") - # Set timeout env var - setup takes time so we need longer than default 30s - return f"export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=5m && python -c '{escaped_code}'" + # Set timeout env vars - setup takes time so we need longer than default 30s + env_vars = " ".join([ + "export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=5m", + "export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT=5m", + "export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE=5m", + ]) + return f"{env_vars} && python -c '{escaped_code}'" def _get_node_ips(self) -> List[str]: """Get the IP addresses of all nodes in the cluster.""" @@ -306,12 +329,23 @@ def _state(self) -> JobState: ip_idx += num_nodes workers = [f"tcp://{ip}:{self._port}" for ip in mesh_ips] + logger.info(f"Connecting to workers for mesh '{mesh_name}': {workers}") host_mesh = _attach_to_workers_wrapper( name=mesh_name, ca="trust_all_connections", workers=workers, ) + + # Wait for the host mesh to be initialized (connections established) + logger.info(f"Waiting for host mesh '{mesh_name}' to initialize...") + host_mesh.initialized.get() + logger.info(f"Host mesh '{mesh_name}' initialized successfully") + + # Give connections a moment to fully stabilize + time.sleep(5) + logger.info(f"Host mesh '{mesh_name}' ready") + host_meshes[mesh_name] = host_mesh return JobState(host_meshes) From a740ae1d317ee8e31ef465c2fccd6bc9e9fed279 Mon Sep 17 00:00:00 2001 From: Romil Date: Fri, 5 Dec 2025 23:51:38 +0000 Subject: [PATCH 05/29] 
Working example --- python/examples/skypilot_getting_started.py | 54 ++++++++++++-- python/monarch/_src/job/skypilot.py | 56 ++++++++++++--- test_worker_setup.yaml | 78 +++++++++++++++++++++ 3 files changed, 172 insertions(+), 16 deletions(-) create mode 100644 test_worker_setup.yaml diff --git a/python/examples/skypilot_getting_started.py b/python/examples/skypilot_getting_started.py index 64fbf9658..b1841dcde 100644 --- a/python/examples/skypilot_getting_started.py +++ b/python/examples/skypilot_getting_started.py @@ -133,7 +133,7 @@ def main(): ) parser.add_argument( "--accelerator", - default="H100:1", + default="H200:1", help="GPU accelerator to request (e.g., H100:1, A100:1, V100:1)", ) parser.add_argument( @@ -160,11 +160,51 @@ def main(): print("\n[1] Creating SkyPilot job...") # Setup commands to install Monarch on the remote nodes - # torchmonarch is the PyPI package name for Monarch + # Build from source to ensure client/worker version compatibility + # NOTE: Currently builds WITHOUT tensor engine due to old rdma-core on Ubuntu 20.04 setup_commands = """ -sudo apt-get update && sudo apt-get install -y rdma-core libibverbs1 libmlx5-1 libibverbs-dev || true -pip install torchmonarch -echo "DONE INSTALLING TORCHMONARCH" +set -ex + +# Add PPA for newer toolchains +sudo apt-get update +sudo apt-get install -y software-properties-common +sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test +sudo apt-get update + +# Install system dependencies +sudo apt-get install -y \ + build-essential \ + ninja-build \ + g++-11 \ + rdma-core \ + libibverbs1 \ + libmlx5-1 \ + libibverbs-dev \ + curl \ + pkg-config \ + libssl-dev + +# Install CUDA toolkit and NCCL +wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt-get update +sudo apt-get install -y cuda-toolkit-12-1 +sudo apt-get install -y --allow-change-held-packages libnccl2=2.28.9-1+cuda12.9 
libnccl-dev=2.28.9-1+cuda12.9 + +# Install Rust +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +source $HOME/.cargo/env +rustup default nightly + +# Install Python dependencies +cd ~/sky_workdir +pip install setuptools-rust maturin +pip install -r torch-requirements.txt -r build-requirements.txt + +# Build Monarch (without tensor engine due to old rdma-core) +CC=gcc-11 CXX=g++-11 USE_TENSOR_ENGINE=0 pip install --no-build-isolation . + +echo "DONE INSTALLING MONARCH" """ # Build resources specification @@ -187,6 +227,10 @@ def main(): down_on_autostop=True, # Setup commands to install dependencies setup_commands=setup_commands, + # Sync Monarch source to workers for building + workdir="/home/sky/dev/monarch", + # Use default python (same as used by pip in setup) + python_exe="python", ) try: diff --git a/python/monarch/_src/job/skypilot.py b/python/monarch/_src/job/skypilot.py index 21454de45..1d8c40427 100644 --- a/python/monarch/_src/job/skypilot.py +++ b/python/monarch/_src/job/skypilot.py @@ -170,12 +170,7 @@ def _create(self, client_script: Optional[str]) -> None: task.set_file_mounts(self._file_mounts) if self._resources is not None: - # Copy resources and override image_id to use PyTorch image with CUDA - # This ensures torchmonarch has access to CUDA libraries - resources = self._resources.copy( - image_id="docker:pytorch/pytorch:2.9.1-cuda12.6-cudnn9-devel" - ) - task.set_resources(resources) + task.set_resources(self._resources) # Generate cluster name if not provided cluster_name = self._cluster_name or f"monarch-{os.getpid()}" @@ -199,6 +194,45 @@ def _create(self, client_script: Optional[str]) -> None: self._launched_cluster_name = cluster_name logger.info(f"SkyPilot cluster '{cluster_name}' launched successfully") + + # Wait for the job to be RUNNING (setup complete, run started) + self._wait_for_job_running(cluster_name, job_id, timeout=900) + + def _wait_for_job_running(self, cluster_name: str, job_id: int, timeout: int = 
900) -> None: + """Wait for the SkyPilot job to reach RUNNING status (setup complete).""" + import time + start_time = time.time() + poll_interval = 10 # seconds + + logger.info(f"Waiting for job {job_id} setup to complete (timeout={timeout}s)...") + + while time.time() - start_time < timeout: + try: + # Get job queue for the cluster + request_id = sky.queue(cluster_name) + jobs = sky.get(request_id) + + # Find our job + for job in jobs: + if job.get('id') == job_id or job.get('job_id') == job_id: + status = job.get('status', '') + status_str = str(status) + if 'RUNNING' in status_str: + logger.info(f"Job {job_id} is now RUNNING (setup complete)") + return + elif 'FAILED' in status_str or 'CANCELLED' in status_str: + raise RuntimeError(f"Job {job_id} failed with status: {status}. Check logs with: sky logs {cluster_name}") + else: + elapsed = int(time.time() - start_time) + logger.info(f"Job {job_id} status: {status} (waited {elapsed}s)") + break + + except Exception as e: + logger.warning(f"Error checking job status: {e}") + + time.sleep(poll_interval) + + raise RuntimeError(f"Timeout waiting for job {job_id} to reach RUNNING status") def _build_worker_command(self) -> str: """Build the bash command to start Monarch workers on each node.""" @@ -233,13 +267,13 @@ def _build_worker_command(self) -> str: ''' # Escape single quotes in the Python code for bash escaped_code = python_code.replace("'", "'\"'\"'") - # Set timeout env vars - setup takes time so we need longer than default 30s + # Set timeout env vars - setup takes time (building from source) so we need longer timeouts env_vars = " ".join([ - "export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=5m", - "export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT=5m", - "export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE=5m", + "export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=15m", + "export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT=15m", + "export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE=15m", ]) - return f"{env_vars} && python -c '{escaped_code}'" + return 
f"{env_vars} && {self._python_exe} -c '{escaped_code}'" def _get_node_ips(self) -> List[str]: """Get the IP addresses of all nodes in the cluster.""" diff --git a/test_worker_setup.yaml b/test_worker_setup.yaml new file mode 100644 index 000000000..8649bbc9c --- /dev/null +++ b/test_worker_setup.yaml @@ -0,0 +1,78 @@ +# Minimal SkyPilot YAML to test Monarch build on remote workers +# Usage: sky launch test_worker_setup.yaml -c monarch-test +# sky down monarch-test +# +# NOTE: Currently builds WITHOUT tensor engine due to old rdma-core on Ubuntu 20.04. +# For tensor engine support, need a newer base image with rdma-core >= 32. + +name: monarch-worker-test + +resources: + cloud: kubernetes + accelerators: H200:1 + cpus: 4+ + memory: 16+ + +num_nodes: 1 + +# Sync the local monarch repo to the worker +workdir: /home/sky/dev/monarch + +setup: | + set -ex + + echo "=== System info ===" + uname -a + cat /etc/os-release | head -3 + + echo "=== Adding PPA for newer toolchains ===" + sudo apt-get update + sudo apt-get install -y software-properties-common + sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test + sudo apt-get update + + echo "=== Installing system dependencies ===" + sudo apt-get install -y \ + build-essential \ + ninja-build \ + g++-11 \ + rdma-core \ + libibverbs1 \ + libmlx5-1 \ + libibverbs-dev \ + curl \ + pkg-config \ + libssl-dev + + echo "=== Installing CUDA toolkit ===" + wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt-get update + sudo apt-get install -y cuda-toolkit-12-1 + sudo apt-get install -y --allow-change-held-packages libnccl2=2.28.9-1+cuda12.9 libnccl-dev=2.28.9-1+cuda12.9 + + echo "=== Installing Rust ===" + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + source $HOME/.cargo/env + rustup default nightly + + echo "=== Installing Python dependencies ===" + cd ~/sky_workdir + pip install setuptools-rust 
maturin + pip install -r torch-requirements.txt -r build-requirements.txt + + echo "=== Building Monarch (without tensor engine due to old rdma-core) ===" + cd ~/sky_workdir + CC=gcc-11 CXX=g++-11 USE_TENSOR_ENGINE=0 pip install --no-build-isolation . + + echo "=== Verifying installation ===" + pip list | grep monarch + python -c "import monarch; print('Monarch imported successfully')" + python -c "import monarch._rust_bindings; print('Rust bindings loaded successfully')" + + echo "=== SETUP COMPLETE ===" + +run: | + echo "Worker setup test completed successfully!" + python -c "import monarch; print('Monarch ready')" + echo "Ready for Monarch worker operations" From efa313ff490a0d301e82f5c1793d7849fc55f943 Mon Sep 17 00:00:00 2001 From: Romil Date: Mon, 8 Dec 2025 03:48:53 +0000 Subject: [PATCH 06/29] fix --- python/examples/skypilot_getting_started.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/examples/skypilot_getting_started.py b/python/examples/skypilot_getting_started.py index b1841dcde..f75ae0822 100644 --- a/python/examples/skypilot_getting_started.py +++ b/python/examples/skypilot_getting_started.py @@ -40,7 +40,7 @@ try: import sky except ImportError: - print("ERROR: SkyPilot is not installed. Run: pip install skypilot") + print("ERROR: SkyPilot is not installed. 
Run: pip install skypilot[kubernetes]") sys.exit(1) try: From 35b6e0eec913097a1aab1bf0ccd881287ab5f484 Mon Sep 17 00:00:00 2001 From: Romil Date: Mon, 8 Dec 2025 04:47:06 +0000 Subject: [PATCH 07/29] updates --- .../skypilot_getting_started.py | 70 +++----------- examples/skypilot_run_example.yaml | 96 +++++++++++++++++++ python/monarch/_src/job/skypilot.py | 65 ++++++++++++- test_worker_setup.yaml | 78 --------------- 4 files changed, 171 insertions(+), 138 deletions(-) rename {python/examples => examples}/skypilot_getting_started.py (80%) create mode 100644 examples/skypilot_run_example.yaml delete mode 100644 test_worker_setup.yaml diff --git a/python/examples/skypilot_getting_started.py b/examples/skypilot_getting_started.py similarity index 80% rename from python/examples/skypilot_getting_started.py rename to examples/skypilot_getting_started.py index f75ae0822..3ccc1d10a 100644 --- a/python/examples/skypilot_getting_started.py +++ b/examples/skypilot_getting_started.py @@ -16,13 +16,13 @@ Prerequisites: - Monarch installed with its Rust bindings (build with `pip install -e .` in monarch/) - SkyPilot installed and configured (run `sky check`) -- torchmonarch available on PyPI (requires CUDA on remote nodes) Usage: - python skypilot_getting_started.py + # Run from inside a Kubernetes pod (client runs locally): + python examples/skypilot_getting_started.py --cloud kubernetes --num-hosts 2 - # With explicit options: - python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 + # Run from outside the cluster using the SkyPilot YAML: + sky launch examples/skypilot_run_example.yaml See SKY_README.md for full documentation. 
""" @@ -159,63 +159,18 @@ def main(): # This will launch cloud instances and start Monarch workers on them print("\n[1] Creating SkyPilot job...") - # Setup commands to install Monarch on the remote nodes - # Build from source to ensure client/worker version compatibility - # NOTE: Currently builds WITHOUT tensor engine due to old rdma-core on Ubuntu 20.04 - setup_commands = """ -set -ex - -# Add PPA for newer toolchains -sudo apt-get update -sudo apt-get install -y software-properties-common -sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test -sudo apt-get update - -# Install system dependencies -sudo apt-get install -y \ - build-essential \ - ninja-build \ - g++-11 \ - rdma-core \ - libibverbs1 \ - libmlx5-1 \ - libibverbs-dev \ - curl \ - pkg-config \ - libssl-dev - -# Install CUDA toolkit and NCCL -wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb -sudo dpkg -i cuda-keyring_1.1-1_all.deb -sudo apt-get update -sudo apt-get install -y cuda-toolkit-12-1 -sudo apt-get install -y --allow-change-held-packages libnccl2=2.28.9-1+cuda12.9 libnccl-dev=2.28.9-1+cuda12.9 - -# Install Rust -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y -source $HOME/.cargo/env -rustup default nightly - -# Install Python dependencies -cd ~/sky_workdir -pip install setuptools-rust maturin -pip install -r torch-requirements.txt -r build-requirements.txt - -# Build Monarch (without tensor engine due to old rdma-core) -CC=gcc-11 CXX=g++-11 USE_TENSOR_ENGINE=0 pip install --no-build-isolation . 
- -echo "DONE INSTALLING MONARCH" -""" - # Build resources specification resources_kwargs = { "cloud": get_cloud(args.cloud), - "cpus": "2+", "accelerators": args.accelerator, # GPU required - torchmonarch needs CUDA } if args.region: resources_kwargs["region"] = args.region + # Find Monarch repo root (this script is in examples/) + script_dir = os.path.dirname(os.path.abspath(__file__)) + monarch_root = os.path.dirname(script_dir) # Go up from examples/ + job = SkyPilotJob( # Define the mesh of hosts we need meshes={"trainers": args.num_hosts}, @@ -225,12 +180,9 @@ def main(): # Auto-cleanup after 10 minutes of idle time idle_minutes_to_autostop=10, down_on_autostop=True, - # Setup commands to install dependencies - setup_commands=setup_commands, - # Sync Monarch source to workers for building - workdir="/home/sky/dev/monarch", - # Use default python (same as used by pip in setup) - python_exe="python", + # Sync Monarch source to workers for building from source + # (SkyPilotJob uses default setup commands when workdir is provided) + workdir=monarch_root, ) try: diff --git a/examples/skypilot_run_example.yaml b/examples/skypilot_run_example.yaml new file mode 100644 index 000000000..a2638a5ec --- /dev/null +++ b/examples/skypilot_run_example.yaml @@ -0,0 +1,96 @@ +# SkyPilot YAML for running Monarch SkyPilot example from outside the cluster +# +# This YAML launches a "client" pod that runs the skypilot_getting_started.py +# script. The script then uses SkyPilotJob to launch additional "worker" pods. +# +# Usage: +# sky launch examples/skypilot_run_example.yaml +# +# Requirements: +# - SkyPilot configured with Kubernetes access (sky check) +# - Kubernetes cluster with GPU nodes available +# +# Note: Cold start is slow (~7-10 minutes) because both the client and workers +# need to build Monarch from source to ensure version compatibility. 
+ +name: monarch-skypilot-example + +resources: + cloud: kubernetes + # Client pod needs minimal resources - workers do the heavy lifting + cpus: 4+ + memory: 16+ + # Request a GPU for the client too (needed to build Monarch with CUDA support) + accelerators: H200:1 + +# Sync the Monarch repository to the client pod +workdir: . + +setup: | + set -ex + + echo "=== Setting up Monarch client pod ===" + + # Add PPA for newer toolchains + sudo apt-get update + sudo apt-get install -y software-properties-common + sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test + sudo apt-get update + + # Install system dependencies + sudo apt-get install -y \ + build-essential \ + ninja-build \ + g++-11 \ + rdma-core \ + libibverbs1 \ + libmlx5-1 \ + libibverbs-dev \ + curl \ + pkg-config \ + libssl-dev + + # Install CUDA toolkit and NCCL + wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt-get update + sudo apt-get install -y cuda-toolkit-12-1 + sudo apt-get install -y --allow-change-held-packages libnccl2=2.28.9-1+cuda12.9 libnccl-dev=2.28.9-1+cuda12.9 + + # Install Rust + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + source $HOME/.cargo/env + rustup default nightly + + # Install SkyPilot with Kubernetes support + pip install "skypilot[kubernetes]" + + # Install Python dependencies and build Monarch from source + cd ~/sky_workdir + pip install setuptools-rust maturin + pip install -r torch-requirements.txt -r build-requirements.txt + CC=gcc-11 CXX=g++-11 USE_TENSOR_ENGINE=0 pip install --no-build-isolation -e . 
+ + echo "=== Client setup complete ===" + +run: | + set -ex + source $HOME/.cargo/env + cd ~/sky_workdir + + # Set timeouts for worker communication + export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=15m + export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT=15m + export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE=15m + + # Run the example + # Using --num-hosts 1 and --gpus-per-host 1 for a minimal test + # Adjust these values based on available cluster resources + python examples/skypilot_getting_started.py \ + --cloud kubernetes \ + --num-hosts 1 \ + --gpus-per-host 1 \ + --accelerator "H200:1" \ + --cluster-name monarch-workers + + diff --git a/python/monarch/_src/job/skypilot.py b/python/monarch/_src/job/skypilot.py index 1d8c40427..6eade6eae 100644 --- a/python/monarch/_src/job/skypilot.py +++ b/python/monarch/_src/job/skypilot.py @@ -38,6 +38,58 @@ # Default port for Monarch TCP communication DEFAULT_MONARCH_PORT = 22222 +# Default setup commands to build Monarch from source on remote workers. +# NOTE: Cold start is slow (~7-10 minutes) because we need to compile Monarch +# on each worker. This is necessary to ensure client/worker version compatibility +# when using a development branch. For production use, consider +# using pre-built wheels from PyPI (pip install torchmonarch). +# +# For faster cold starts (<30s), use a custom Docker image with all dependencies +# pre-installed by setting image_id in sky.Resources: +# resources = sky.Resources(image_id="docker:your-registry/monarch-image:tag", ...) 
+DEFAULT_SETUP_COMMANDS = """ +set -ex + +# Add PPA for newer toolchains +sudo apt-get update +sudo apt-get install -y software-properties-common +sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test +sudo apt-get update + +# Install system dependencies +sudo apt-get install -y \ + build-essential \ + ninja-build \ + g++-11 \ + rdma-core \ + libibverbs1 \ + libmlx5-1 \ + libibverbs-dev \ + curl \ + pkg-config \ + libssl-dev + +# Install CUDA toolkit and NCCL +wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt-get update +sudo apt-get install -y cuda-toolkit-12-1 +sudo apt-get install -y --allow-change-held-packages libnccl2=2.28.9-1+cuda12.9 libnccl-dev=2.28.9-1+cuda12.9 + +# Install Rust +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +source $HOME/.cargo/env +rustup default nightly + +# Install Python dependencies and build Monarch from source +cd ~/sky_workdir +pip install setuptools-rust maturin +pip install -r torch-requirements.txt -r build-requirements.txt +CC=gcc-11 CXX=g++-11 USE_TENSOR_ENGINE=0 pip install --no-build-isolation . + +echo "Done installing Monarch" +""" + def _configure_transport() -> None: """Configure the Monarch transport. Deferred import to avoid import errors.""" @@ -55,6 +107,7 @@ def _attach_to_workers_wrapper(name: str, ca: str, workers: List[str]): class SkyPilotJob(JobTrait): + """ A job scheduler that uses SkyPilot to provision cloud instances. @@ -107,8 +160,12 @@ def __init__( python_exe: Python executable to use for worker processes. setup_commands: Optional setup commands to run before starting workers. Use this to install dependencies including Monarch. + If None and workdir is provided, uses DEFAULT_SETUP_COMMANDS + which builds Monarch from source. workdir: Local directory to sync to the cluster. If provided, this directory will be uploaded to ~/sky_workdir on each node. 
+ When using workdir with the Monarch repo, DEFAULT_SETUP_COMMANDS + will build Monarch from source on each worker. file_mounts: Dictionary mapping remote paths to local paths for additional file mounts. """ @@ -152,7 +209,13 @@ def _create(self, client_script: Optional[str]) -> None: worker_command = self._build_worker_command() # Create setup commands - setup = self._setup_commands or "" + # If workdir is provided but no setup_commands, use defaults to build Monarch + if self._setup_commands is not None: + setup = self._setup_commands + elif self._workdir is not None: + setup = DEFAULT_SETUP_COMMANDS + else: + setup = "" if setup and not setup.endswith("\n"): setup += "\n" diff --git a/test_worker_setup.yaml b/test_worker_setup.yaml deleted file mode 100644 index 8649bbc9c..000000000 --- a/test_worker_setup.yaml +++ /dev/null @@ -1,78 +0,0 @@ -# Minimal SkyPilot YAML to test Monarch build on remote workers -# Usage: sky launch test_worker_setup.yaml -c monarch-test -# sky down monarch-test -# -# NOTE: Currently builds WITHOUT tensor engine due to old rdma-core on Ubuntu 20.04. -# For tensor engine support, need a newer base image with rdma-core >= 32. 
- -name: monarch-worker-test - -resources: - cloud: kubernetes - accelerators: H200:1 - cpus: 4+ - memory: 16+ - -num_nodes: 1 - -# Sync the local monarch repo to the worker -workdir: /home/sky/dev/monarch - -setup: | - set -ex - - echo "=== System info ===" - uname -a - cat /etc/os-release | head -3 - - echo "=== Adding PPA for newer toolchains ===" - sudo apt-get update - sudo apt-get install -y software-properties-common - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test - sudo apt-get update - - echo "=== Installing system dependencies ===" - sudo apt-get install -y \ - build-essential \ - ninja-build \ - g++-11 \ - rdma-core \ - libibverbs1 \ - libmlx5-1 \ - libibverbs-dev \ - curl \ - pkg-config \ - libssl-dev - - echo "=== Installing CUDA toolkit ===" - wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb - sudo dpkg -i cuda-keyring_1.1-1_all.deb - sudo apt-get update - sudo apt-get install -y cuda-toolkit-12-1 - sudo apt-get install -y --allow-change-held-packages libnccl2=2.28.9-1+cuda12.9 libnccl-dev=2.28.9-1+cuda12.9 - - echo "=== Installing Rust ===" - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - source $HOME/.cargo/env - rustup default nightly - - echo "=== Installing Python dependencies ===" - cd ~/sky_workdir - pip install setuptools-rust maturin - pip install -r torch-requirements.txt -r build-requirements.txt - - echo "=== Building Monarch (without tensor engine due to old rdma-core) ===" - cd ~/sky_workdir - CC=gcc-11 CXX=g++-11 USE_TENSOR_ENGINE=0 pip install --no-build-isolation . - - echo "=== Verifying installation ===" - pip list | grep monarch - python -c "import monarch; print('Monarch imported successfully')" - python -c "import monarch._rust_bindings; print('Rust bindings loaded successfully')" - - echo "=== SETUP COMPLETE ===" - -run: | - echo "Worker setup test completed successfully!" 
- python -c "import monarch; print('Monarch ready')" - echo "Ready for Monarch worker operations" From 6a75678ae1f9046900c52f35589b957e1a8d5e48 Mon Sep 17 00:00:00 2001 From: Romil Date: Mon, 8 Dec 2025 05:24:05 +0000 Subject: [PATCH 08/29] cleanup --- SKY_README.md | 293 ------------- examples/skypilot_run_example.yaml | 96 ----- python/tests/test_skypilot_integration.py | 213 ---------- python/tests/test_skypilot_job.py | 493 ---------------------- 4 files changed, 1095 deletions(-) delete mode 100644 SKY_README.md delete mode 100644 examples/skypilot_run_example.yaml delete mode 100644 python/tests/test_skypilot_integration.py delete mode 100644 python/tests/test_skypilot_job.py diff --git a/SKY_README.md b/SKY_README.md deleted file mode 100644 index 1558c62da..000000000 --- a/SKY_README.md +++ /dev/null @@ -1,293 +0,0 @@ -# Monarch + SkyPilot Integration - -This document describes the SkyPilot integration for Monarch, which enables running Monarch actors on cloud infrastructure provisioned by SkyPilot. - -## Overview - -SkyPilot is a framework for running ML workloads on any cloud (AWS, GCP, Azure, Lambda, Kubernetes, etc.). The `SkyPilotJob` class in Monarch provides a seamless integration that: - -1. **Provisions cloud instances** using SkyPilot's unified API -2. **Installs Monarch** (`torchmonarch` from PyPI) on remote nodes -3. **Starts Monarch workers** on each node listening for connections -4. 
**Connects clients** to workers using TCP for distributed actor communication - -## Architecture - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ Client Machine │ -│ ┌─────────────────────────────────────────────────────────┐ │ -│ │ SkyPilotJob │ │ -│ │ - Calls sky.launch() to provision cloud instances │ │ -│ │ - Configures setup commands to install torchmonarch │ │ -│ │ - Builds worker command with run_worker_loop_forever() │ │ -│ │ - Calls attach_to_workers() to create HostMesh │ │ -│ └─────────────────────────────────────────────────────────┘ │ -└───────────────────────────────┬─────────────────────────────────┘ - │ TCP connections (port 22222) - ┌───────────────────────┼───────────────────────┐ - │ │ │ - ▼ ▼ ▼ -┌───────────────┐ ┌───────────────┐ ┌───────────────┐ -│ Worker 1 │ │ Worker 2 │ │ Worker N │ -│ (Cloud Node) │ │ (Cloud Node) │ │ (Cloud Node) │ -│ │ │ │ │ │ -│ run_worker_ │ │ run_worker_ │ │ run_worker_ │ -│ loop_forever()│ │ loop_forever()│ │ loop_forever()│ -│ │ │ │ │ │ -│ tcp://: │ │ tcp://: │ │ tcp://: │ -│ 22222 │ │ 22222 │ │ 22222 │ -└───────────────┘ └───────────────┘ └───────────────┘ -``` - -## Implementation Details - -### Files - -- **`python/monarch/_src/job/skypilot.py`**: Core `SkyPilotJob` implementation -- **`python/monarch/job/__init__.py`**: Exports `SkyPilotJob` (with graceful ImportError handling) -- **`python/tests/test_skypilot_job.py`**: Unit tests with mocked SkyPilot -- **`python/tests/test_skypilot_integration.py`**: Integration test scaffolding -- **`python/examples/skypilot_getting_started.py`**: Example demonstrating usage - -### Key Classes and Functions - -#### `SkyPilotJob(JobTrait)` - -Main job class that implements the Monarch `JobTrait` interface. 
- -```python -from monarch.job import SkyPilotJob -import sky - -job = SkyPilotJob( - meshes={"trainers": 2}, # 2 nodes for "trainers" mesh - resources=sky.Resources( - cloud=sky.Kubernetes(), - accelerators="H100:1", - ), - cluster_name="my-cluster", - idle_minutes_to_autostop=10, - down_on_autostop=True, - setup_commands="pip install torchmonarch", -) - -state = job.state() # Launches cluster and returns JobState -hosts = state.trainers # HostMesh with 2 nodes -``` - -#### Parameters - -| Parameter | Type | Description | -|-----------|------|-------------| -| `meshes` | `Dict[str, int]` | Mesh names to node counts | -| `resources` | `sky.Resources` | SkyPilot resource specification | -| `cluster_name` | `str` | Name for the cluster | -| `monarch_port` | `int` | TCP port for workers (default: 22222) | -| `idle_minutes_to_autostop` | `int` | Auto-stop after idle minutes | -| `down_on_autostop` | `bool` | Terminate (not just stop) on autostop | -| `setup_commands` | `str` | Shell commands to run before workers start | -| `workdir` | `str` | Local directory to sync to cluster | -| `file_mounts` | `Dict[str, str]` | Additional file mounts | - -### Worker Lifecycle - -1. **Launch**: `sky.launch()` creates the cluster with specified resources -2. **Setup**: `setup_commands` run to install `torchmonarch` -3. **Run**: Worker command executes `run_worker_loop_forever(address, ca)` -4. **Connect**: Client calls `attach_to_workers()` to create `HostMesh` -5. 
**Teardown**: `sky.down()` terminates the cluster - -### Environment Variables - -The following environment variables control timeouts: - -```python -os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s" # Worker spawn timeout -os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s" # Message delivery timeout -os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s" # Proc mesh spawn timeout -``` - -## Requirements - -### Client Side -- Monarch with Rust bindings (`pip install -e .` from source) -- SkyPilot (`pip install skypilot`) -- Configured cloud credentials (`sky check`) - -### Worker Side (installed via setup_commands) -- `torchmonarch` from PyPI -- **CUDA libraries** - torchmonarch requires `libcuda.so.1` -- This means workers **must run on GPU nodes** - -## Usage - -### Basic Example - -```python -import sky -from monarch.job import SkyPilotJob -from monarch.actor import Actor, endpoint - -class MyActor(Actor): - @endpoint - def hello(self) -> str: - return "Hello from cloud!" - -# Create job -job = SkyPilotJob( - meshes={"workers": 2}, - resources=sky.Resources( - cloud=sky.AWS(), - accelerators="A100:1", - ), - setup_commands="pip install torchmonarch", -) - -# Launch and get state -state = job.state() -hosts = state.workers - -# Spawn processes and actors -procs = hosts.spawn_procs(per_host={"gpus": 1}) -actors = procs.spawn("my_actors", MyActor) - -# Interact with actors -results = actors.hello.call().get() -print(results) # ["Hello from cloud!", "Hello from cloud!"] - -# Cleanup -job.kill() -``` - -### Running the Example - -```bash -# Install dependencies -pip install skypilot -pip install -e . 
# Build Monarch from source - -# Configure cloud credentials -sky check - -# Run example -cd python/examples -python skypilot_getting_started.py \ - --cloud kubernetes \ - --num-hosts 2 \ - --accelerator "H100:1" \ - --cluster-name my-monarch-cluster -``` - -### Supported Clouds - -- **Kubernetes**: Use `sky.Kubernetes()` with `--region` for context -- **AWS**: Use `sky.AWS()` -- **GCP**: Use `sky.GCP()` -- **Azure**: Use `sky.Azure()` -- **Lambda Labs**: Use `sky.Lambda()` -- And others supported by SkyPilot - -## Networking Considerations - -### Kubernetes - -When using Kubernetes, the client and workers must be in the **same Kubernetes cluster** for pod-to-pod communication. Use the `region` parameter to specify the Kubernetes context: - -```python -resources=sky.Resources( - cloud=sky.Kubernetes(), - region="my-k8s-context", # Must match client's cluster -) -``` - -### Public Clouds (AWS, GCP, Azure) - -SkyPilot handles networking automatically. Workers get public IPs that clients can connect to. - -### Firewall - -Ensure port 22222 (or your custom `monarch_port`) is accessible: -- Kubernetes: Pod networking should handle this -- AWS: Security groups -- GCP: Firewall rules -- Azure: Network security groups - -## Troubleshooting - -### "libcuda.so.1: cannot open shared object file" - -**Cause**: Workers are running on CPU-only nodes, but `torchmonarch` requires CUDA. - -**Solution**: Request GPU nodes: -```python -resources=sky.Resources(accelerators="H100:1") -``` - -### "No route to host" or connection timeouts - -**Cause**: Client and workers are in different networks (e.g., different Kubernetes clusters). - -**Solution**: Ensure client and workers are in the same network: -- For Kubernetes: Use `region` parameter to specify the correct context -- For public clouds: Check security group / firewall rules - -### "error spawning proc mesh: statuses: Timeout" - -**Causes**: -1. Workers aren't listening on the expected port -2. Network connectivity issues -3. 
Workers crashed during startup - -**Debug steps**: -1. Check SkyPilot logs: `sky logs ` -2. SSH into cluster: `sky ssh ` -3. Check if port is listening: `ss -tlnp | grep 22222` -4. Check Monarch logs: `/tmp/sky/monarch_log.log` - -### Workers crash immediately - -Check SkyPilot logs for the error: -```bash -sky logs -``` - -Common issues: -- Missing CUDA libraries → use GPU nodes -- torchmonarch installation failed → check setup_commands -- Python version mismatch → ensure compatible Python version - -## Testing - -### Unit Tests (with mocked SkyPilot) - -```bash -cd python -pytest tests/test_skypilot_job.py -v -``` - -### Integration Tests (requires real cloud) - -```bash -cd python -pytest tests/test_skypilot_integration.py -v --cloud kubernetes -``` - -## Comparison with SlurmJob - -| Feature | SkyPilotJob | SlurmJob | -|---------|-------------|----------| -| Cloud Support | Multi-cloud (AWS, GCP, Azure, K8s, etc.) | HPC clusters only | -| Setup | Automatic via SkyPilot | Requires Slurm installation | -| Autoscaling | Supported | Depends on cluster | -| Cost Optimization | Automatic (cheapest region) | N/A | -| Worker Discovery | Via cluster handle IPs | Via squeue hostnames | - -## Future Work - -- [ ] Support for spot/preemptible instances -- [ ] Multi-region deployments -- [ ] Automatic failover on spot termination -- [ ] Integration with SkyPilot managed jobs -- [ ] Support for batch mode (client script on cluster) - diff --git a/examples/skypilot_run_example.yaml b/examples/skypilot_run_example.yaml deleted file mode 100644 index a2638a5ec..000000000 --- a/examples/skypilot_run_example.yaml +++ /dev/null @@ -1,96 +0,0 @@ -# SkyPilot YAML for running Monarch SkyPilot example from outside the cluster -# -# This YAML launches a "client" pod that runs the skypilot_getting_started.py -# script. The script then uses SkyPilotJob to launch additional "worker" pods. 
-# -# Usage: -# sky launch examples/skypilot_run_example.yaml -# -# Requirements: -# - SkyPilot configured with Kubernetes access (sky check) -# - Kubernetes cluster with GPU nodes available -# -# Note: Cold start is slow (~7-10 minutes) because both the client and workers -# need to build Monarch from source to ensure version compatibility. - -name: monarch-skypilot-example - -resources: - cloud: kubernetes - # Client pod needs minimal resources - workers do the heavy lifting - cpus: 4+ - memory: 16+ - # Request a GPU for the client too (needed to build Monarch with CUDA support) - accelerators: H200:1 - -# Sync the Monarch repository to the client pod -workdir: . - -setup: | - set -ex - - echo "=== Setting up Monarch client pod ===" - - # Add PPA for newer toolchains - sudo apt-get update - sudo apt-get install -y software-properties-common - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test - sudo apt-get update - - # Install system dependencies - sudo apt-get install -y \ - build-essential \ - ninja-build \ - g++-11 \ - rdma-core \ - libibverbs1 \ - libmlx5-1 \ - libibverbs-dev \ - curl \ - pkg-config \ - libssl-dev - - # Install CUDA toolkit and NCCL - wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb - sudo dpkg -i cuda-keyring_1.1-1_all.deb - sudo apt-get update - sudo apt-get install -y cuda-toolkit-12-1 - sudo apt-get install -y --allow-change-held-packages libnccl2=2.28.9-1+cuda12.9 libnccl-dev=2.28.9-1+cuda12.9 - - # Install Rust - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - source $HOME/.cargo/env - rustup default nightly - - # Install SkyPilot with Kubernetes support - pip install "skypilot[kubernetes]" - - # Install Python dependencies and build Monarch from source - cd ~/sky_workdir - pip install setuptools-rust maturin - pip install -r torch-requirements.txt -r build-requirements.txt - CC=gcc-11 CXX=g++-11 USE_TENSOR_ENGINE=0 pip install 
--no-build-isolation -e . - - echo "=== Client setup complete ===" - -run: | - set -ex - source $HOME/.cargo/env - cd ~/sky_workdir - - # Set timeouts for worker communication - export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=15m - export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT=15m - export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE=15m - - # Run the example - # Using --num-hosts 1 and --gpus-per-host 1 for a minimal test - # Adjust these values based on available cluster resources - python examples/skypilot_getting_started.py \ - --cloud kubernetes \ - --num-hosts 1 \ - --gpus-per-host 1 \ - --accelerator "H200:1" \ - --cluster-name monarch-workers - - diff --git a/python/tests/test_skypilot_integration.py b/python/tests/test_skypilot_integration.py deleted file mode 100644 index 5469f4717..000000000 --- a/python/tests/test_skypilot_integration.py +++ /dev/null @@ -1,213 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -""" -Integration test script for SkyPilot job. - -This script tests the basic SkyPilot integration without requiring Monarch -runtime. It validates that SkyPilot cluster launching and node IP retrieval works. - -Run this script with: - python tests/test_skypilot_integration.py - -Prerequisites: -- SkyPilot installed and configured with cloud credentials -- Run `sky check` to verify cloud access -""" - -import argparse -import sys -import time - -try: - import sky - from sky.backends.cloud_vm_ray_backend import CloudVmRayResourceHandle -except ImportError: - print("Error: SkyPilot is not installed. 
Install with: pip install skypilot") - sys.exit(1) - - -def test_skypilot_cluster_launch( - cluster_name: str = "monarch-integration-test", - cloud: str = "aws", - cpus: str = "2+", - timeout_minutes: int = 10, -) -> bool: - """ - Test launching a SkyPilot cluster and retrieving node IPs. - - Args: - cluster_name: Name for the test cluster - cloud: Cloud provider to use - cpus: CPU specification - timeout_minutes: Timeout for cluster launch - - Returns: - True if test passed, False otherwise - """ - print(f"\n{'='*60}") - print("SkyPilot Integration Test") - print(f"{'='*60}\n") - - # Create a simple task - task = sky.Task( - name="monarch-test-task", - run="echo 'SkyPilot test successful' && hostname && sleep 30", - ) - - # Set resources based on cloud - cloud_obj = None - if cloud.lower() == "aws": - cloud_obj = sky.AWS() - elif cloud.lower() == "gcp": - cloud_obj = sky.GCP() - elif cloud.lower() == "azure": - cloud_obj = sky.Azure() - elif cloud.lower() == "kubernetes": - cloud_obj = sky.Kubernetes() - - resources = sky.Resources( - cloud=cloud_obj, - cpus=cpus, - ) - task.set_resources(resources) - - print(f"Test configuration:") - print(f" Cluster name: {cluster_name}") - print(f" Cloud: {cloud}") - print(f" CPUs: {cpus}") - print() - - try: - # Launch the cluster - print("Step 1: Launching cluster...") - request_id = sky.launch( - task, - cluster_name=cluster_name, - idle_minutes_to_autostop=5, - down=True, # Auto-teardown after idle - ) - - print(f" Request ID: {request_id}") - job_id, handle = sky.get(request_id) - print(f" Job ID: {job_id}") - - if handle is None: - print(" ERROR: No handle returned from launch") - return False - - print(" Cluster launched successfully!") - - # Get cluster status and node IPs - print("\nStep 2: Getting cluster status and node IPs...") - request_id = sky.status(cluster_names=[cluster_name]) - statuses = sky.get(request_id) - - if not statuses: - print(" ERROR: No status returned") - return False - - status = statuses[0] - 
print(f" Cluster status: {status.status}") - print(f" Cluster name: {status.name}") - - handle = status.handle - if handle is None: - print(" ERROR: Status has no handle") - return False - - if not isinstance(handle, CloudVmRayResourceHandle): - print(f" ERROR: Unexpected handle type: {type(handle)}") - return False - - # Get IPs - if handle.stable_internal_external_ips: - print(f"\n Node IPs ({len(handle.stable_internal_external_ips)} nodes):") - for i, (internal_ip, external_ip) in enumerate( - handle.stable_internal_external_ips - ): - print(f" Node {i}: internal={internal_ip}, external={external_ip}") - else: - print(" WARNING: No IP information available yet") - - # Test passed! - print("\n" + "=" * 60) - print("TEST PASSED!") - print("=" * 60) - print( - "\nThe SkyPilot integration is working correctly." - "\nMonarch workers can be launched on these nodes." - ) - return True - - except Exception as e: - print(f"\nERROR: {e}") - import traceback - - traceback.print_exc() - return False - - finally: - # Cleanup - print("\nStep 3: Cleaning up cluster...") - try: - request_id = sky.down(cluster_name) - sky.get(request_id) - print(" Cluster terminated successfully") - except Exception as e: - print(f" Warning: Failed to cleanup cluster: {e}") - print(f" You may need to manually run: sky down {cluster_name}") - - -def main(): - parser = argparse.ArgumentParser( - description="Integration test for SkyPilot-Monarch integration" - ) - parser.add_argument( - "--cluster-name", - default="monarch-integration-test", - help="Name for the test cluster", - ) - parser.add_argument( - "--cloud", - default="aws", - choices=["aws", "gcp", "azure", "kubernetes"], - help="Cloud provider to use", - ) - parser.add_argument( - "--cpus", - default="2+", - help="CPU specification", - ) - parser.add_argument( - "--timeout", - type=int, - default=10, - help="Timeout in minutes for cluster launch", - ) - - args = parser.parse_args() - - # Check SkyPilot is configured - print("Checking 
SkyPilot configuration...") - print(f" Using cloud: {args.cloud}") - print(" (Run 'sky check' to verify cloud credentials)") - - # Run the test - success = test_skypilot_cluster_launch( - cluster_name=args.cluster_name, - cloud=args.cloud, - cpus=args.cpus, - timeout_minutes=args.timeout, - ) - - sys.exit(0 if success else 1) - - -if __name__ == "__main__": - main() - diff --git a/python/tests/test_skypilot_job.py b/python/tests/test_skypilot_job.py deleted file mode 100644 index b6af37a22..000000000 --- a/python/tests/test_skypilot_job.py +++ /dev/null @@ -1,493 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe - -"""Tests for the SkyPilot job integration.""" - -import os -import sys -import tempfile -from typing import Any, Dict, List, Optional, Tuple -from unittest import mock - -import pytest - - -# Check if SkyPilot is available -try: - import sky - - HAS_SKYPILOT = True -except ImportError: - HAS_SKYPILOT = False - -# Check if Monarch bindings are available -try: - from monarch._rust_bindings.monarch_hyperactor.config import configure - - HAS_MONARCH_BINDINGS = True -except ImportError: - HAS_MONARCH_BINDINGS = False - -# Skip all tests in this module if SkyPilot or Monarch bindings are not installed -pytestmark = [ - pytest.mark.skipif(not HAS_SKYPILOT, reason="SkyPilot not installed"), - pytest.mark.skipif(not HAS_MONARCH_BINDINGS, reason="Monarch bindings not available"), -] - - -class MockClusterHandle: - """Mock CloudVmRayResourceHandle for testing.""" - - def __init__( - self, - cluster_name: str, - node_ips: List[Tuple[str, str]], - ): - self.cluster_name = cluster_name - self.cluster_name_on_cloud = cluster_name - self.stable_internal_external_ips = node_ips - self.launched_nodes = len(node_ips) - - -class MockStatusResponse: - """Mock status response from 
sky.status().""" - - def __init__( - self, - name: str, - status: "sky.ClusterStatus", - handle: Optional[MockClusterHandle] = None, - ): - self.name = name - self.status = status - self.handle = handle - - -@pytest.fixture -def mock_sky(): - """Fixture to mock SkyPilot SDK functions.""" - with mock.patch("monarch._src.job.skypilot.sky") as mock_sky_module: - # Mock ClusterStatus enum - mock_sky_module.ClusterStatus = sky.ClusterStatus - - # Mock sky.launch to return a mock request_id - mock_sky_module.launch.return_value = "mock-request-id" - - # Mock sky.get to return appropriate results - def mock_get(request_id): - if request_id == "mock-request-id": - # Return (job_id, handle) for launch - return ( - 1, - MockClusterHandle( - "test-cluster", - [("10.0.0.1", "1.2.3.4"), ("10.0.0.2", "1.2.3.5")], - ), - ) - elif request_id == "mock-status-request-id": - # Return list of status responses - return [ - MockStatusResponse( - "test-cluster", - sky.ClusterStatus.UP, - MockClusterHandle( - "test-cluster", - [("10.0.0.1", "1.2.3.4"), ("10.0.0.2", "1.2.3.5")], - ), - ) - ] - elif request_id == "mock-down-request-id": - return None - return None - - mock_sky_module.get.side_effect = mock_get - - # Mock sky.status - mock_sky_module.status.return_value = "mock-status-request-id" - - # Mock sky.down - mock_sky_module.down.return_value = "mock-down-request-id" - - # Mock sky.Task - mock_sky_module.Task = mock.MagicMock() - - # Mock sky.Resources - mock_sky_module.Resources = sky.Resources - - yield mock_sky_module - - -@pytest.fixture -def mock_attach_to_workers(): - """Fixture to mock attach_to_workers wrapper.""" - with mock.patch( - "monarch._src.job.skypilot._attach_to_workers_wrapper" - ) as mock_attach: - # Create a simple mock HostMesh - class MockHostMesh: - def __init__(self, name): - self.name = name - - def create_mock_host_mesh(name, ca, workers): - return MockHostMesh(name) - - mock_attach.side_effect = create_mock_host_mesh - yield mock_attach - - 
-@pytest.fixture -def mock_configure_transport(): - """Fixture to mock _configure_transport.""" - with mock.patch( - "monarch._src.job.skypilot._configure_transport" - ) as mock_config: - yield mock_config - - -@pytest.mark.skipif(not HAS_SKYPILOT, reason="SkyPilot not installed") -def test_skypilot_job_import(): - """Test that SkyPilotJob can be imported from monarch.job.""" - from monarch.job import SkyPilotJob - - # SkyPilotJob should be available (or None if import failed) - # This test verifies the export is working - if HAS_MONARCH_BINDINGS: - assert SkyPilotJob is not None - # If bindings are not available, SkyPilotJob will be None (graceful degradation) - - -def test_skypilot_job_init(mock_configure_transport): - """Test SkyPilotJob initialization.""" - from monarch._src.job.skypilot import SkyPilotJob - - job = SkyPilotJob( - meshes={"trainers": 2, "workers": 1}, - cluster_name="test-cluster", - monarch_port=12345, - ) - - assert job._meshes == {"trainers": 2, "workers": 1} - assert job._cluster_name == "test-cluster" - assert job._port == 12345 - assert job._launched_cluster_name is None - assert job._node_ips == [] - - -def test_skypilot_job_init_with_resources(mock_configure_transport): - """Test SkyPilotJob initialization with SkyPilot resources.""" - from monarch._src.job.skypilot import SkyPilotJob - - resources = sky.Resources(accelerators="A100:1") - - job = SkyPilotJob( - meshes={"trainers": 4}, - resources=resources, - cluster_name="gpu-cluster", - ) - - assert job._resources == resources - assert job._meshes == {"trainers": 4} - - -def test_skypilot_job_build_worker_command(mock_configure_transport): - """Test the worker command generation.""" - from monarch._src.job.skypilot import SkyPilotJob - - job = SkyPilotJob( - meshes={"trainers": 1}, - monarch_port=22222, - ) - - command = job._build_worker_command() - - # Check that the command contains expected elements - assert "socket.gethostname()" in command - assert "tcp://" in command - assert 
"22222" in command - assert "run_worker_loop_forever" in command - assert 'ca="trust_all_connections"' in command - - -def test_skypilot_job_create(mock_sky, mock_attach_to_workers, mock_configure_transport): - """Test the _create method.""" - from monarch._src.job.skypilot import SkyPilotJob - - job = SkyPilotJob( - meshes={"trainers": 2}, - cluster_name="test-cluster", - ) - - # Call _create - job._create(None) - - # Verify sky.launch was called - mock_sky.launch.assert_called_once() - - # Check that cluster name was stored - assert job._launched_cluster_name == "test-cluster" - - -def test_skypilot_job_create_batch_mode_raises(mock_sky, mock_configure_transport): - """Test that _create raises an error for batch mode.""" - from monarch._src.job.skypilot import SkyPilotJob - - job = SkyPilotJob(meshes={"trainers": 1}) - - with pytest.raises(RuntimeError, match="batch-mode scripts"): - job._create("some_script.py") - - -def test_skypilot_job_state(mock_sky, mock_attach_to_workers, mock_configure_transport): - """Test the _state method.""" - from monarch._src.job.skypilot import SkyPilotJob - - job = SkyPilotJob( - meshes={"trainers": 2}, - cluster_name="test-cluster", - ) - - # Apply the job first - job.apply() - - # Now get state - state = job._state() - - # Verify attach_to_workers was called with correct addresses - mock_attach_to_workers.assert_called() - call_args = mock_attach_to_workers.call_args - - # Check the call arguments - assert call_args.kwargs["name"] == "trainers" - assert call_args.kwargs["ca"] == "trust_all_connections" - # Workers should use external IPs - workers = call_args.kwargs["workers"] - assert len(workers) == 2 - assert all("tcp://" in w for w in workers) - - # Check that state has the trainers mesh - assert hasattr(state, "trainers") - - -def test_skypilot_job_state_multiple_meshes(mock_sky, mock_attach_to_workers, mock_configure_transport): - """Test _state with multiple meshes.""" - from monarch._src.job.skypilot import SkyPilotJob - 
- # Create mock status with 3 nodes - def mock_get_multi(request_id): - if request_id == "mock-request-id": - return ( - 1, - MockClusterHandle( - "test-cluster", - [ - ("10.0.0.1", "1.2.3.4"), - ("10.0.0.2", "1.2.3.5"), - ("10.0.0.3", "1.2.3.6"), - ], - ), - ) - elif request_id == "mock-status-request-id": - return [ - MockStatusResponse( - "test-cluster", - sky.ClusterStatus.UP, - MockClusterHandle( - "test-cluster", - [ - ("10.0.0.1", "1.2.3.4"), - ("10.0.0.2", "1.2.3.5"), - ("10.0.0.3", "1.2.3.6"), - ], - ), - ) - ] - return None - - mock_sky.get.side_effect = mock_get_multi - - job = SkyPilotJob( - meshes={"trainers": 2, "evaluator": 1}, - cluster_name="test-cluster", - ) - - job.apply() - state = job._state() - - # Verify attach_to_workers was called twice (once for each mesh) - assert mock_attach_to_workers.call_count == 2 - - # Check that state has both meshes - assert hasattr(state, "trainers") - assert hasattr(state, "evaluator") - - -def test_skypilot_job_kill(mock_sky, mock_attach_to_workers, mock_configure_transport): - """Test the _kill method.""" - from monarch._src.job.skypilot import SkyPilotJob - - job = SkyPilotJob( - meshes={"trainers": 1}, - cluster_name="test-cluster", - ) - - # Apply the job first - job.apply() - assert job._launched_cluster_name == "test-cluster" - - # Kill the job - job._kill() - - # Verify sky.down was called - mock_sky.down.assert_called_once_with("test-cluster") - - # Check that state was cleared - assert job._launched_cluster_name is None - assert job._node_ips == [] - - -def test_skypilot_job_can_run(mock_sky, mock_attach_to_workers, mock_configure_transport): - """Test the can_run method.""" - from monarch._src.job.skypilot import SkyPilotJob - - job1 = SkyPilotJob( - meshes={"trainers": 2}, - cluster_name="test-cluster", - monarch_port=22222, - ) - - job2 = SkyPilotJob( - meshes={"trainers": 2}, - cluster_name="test-cluster", - monarch_port=22222, - ) - - job3 = SkyPilotJob( - meshes={"trainers": 4}, # Different mesh 
config - cluster_name="test-cluster", - monarch_port=22222, - ) - - # Apply job1 - job1.apply() - - # job1 should be able to run job2 (same config) - assert job1.can_run(job2) is True - - # job1 should NOT be able to run job3 (different mesh config) - assert job1.can_run(job3) is False - - -def test_skypilot_job_jobs_active(mock_sky, mock_attach_to_workers, mock_configure_transport): - """Test the _jobs_active method.""" - from monarch._src.job.skypilot import SkyPilotJob - - job = SkyPilotJob( - meshes={"trainers": 1}, - cluster_name="test-cluster", - ) - - # Before apply, should not be active - assert job._jobs_active() is False - - # Apply the job - job.apply() - - # After apply, should be active (mocked status returns UP) - assert job._jobs_active() is True - - -def test_skypilot_job_serialization(mock_sky, mock_attach_to_workers, mock_configure_transport): - """Test that SkyPilotJob can be serialized and deserialized.""" - from monarch._src.job.skypilot import SkyPilotJob - from monarch._src.job.job import job_loads - - job = SkyPilotJob( - meshes={"trainers": 2, "workers": 1}, - cluster_name="test-cluster", - monarch_port=33333, - ) - - # Serialize - serialized = job.dumps() - - # Deserialize - loaded_job = job_loads(serialized) - - # Check attributes - assert isinstance(loaded_job, SkyPilotJob) - assert loaded_job._meshes == {"trainers": 2, "workers": 1} - assert loaded_job._cluster_name == "test-cluster" - assert loaded_job._port == 33333 - - -def test_skypilot_job_with_setup_commands(mock_configure_transport): - """Test SkyPilotJob with custom setup commands.""" - from monarch._src.job.skypilot import SkyPilotJob - - setup = "pip install torch\npip install monarch" - - job = SkyPilotJob( - meshes={"trainers": 1}, - setup_commands=setup, - ) - - assert job._setup_commands == setup - - -def test_skypilot_job_with_autostop(mock_configure_transport): - """Test SkyPilotJob with autostop configuration.""" - from monarch._src.job.skypilot import SkyPilotJob
- - job = SkyPilotJob( - meshes={"trainers": 1}, - idle_minutes_to_autostop=30, - down_on_autostop=True, - ) - - assert job._idle_minutes_to_autostop == 30 - assert job._down_on_autostop is True - - -# Integration test - only run if explicitly requested -@pytest.mark.skip(reason="Integration test - run manually with --run-integration") -def test_skypilot_job_integration(): - """ - Integration test that actually launches a SkyPilot cluster. - - To run this test: - pytest tests/test_skypilot_job.py::test_skypilot_job_integration --run-integration - - Make sure you have SkyPilot credentials configured. - """ - from monarch._src.job.skypilot import SkyPilotJob - - # Create a minimal job - just 1 node with cheap resources - job = SkyPilotJob( - meshes={"workers": 1}, - resources=sky.Resources( - cloud=sky.AWS(), # Change to your preferred cloud - cpus="2+", - ), - cluster_name="monarch-test-integration", - idle_minutes_to_autostop=5, - down_on_autostop=True, - ) - - try: - # Apply the job - job.apply() - - # Check that we can get state - state = job.state() - assert hasattr(state, "workers") - - print("Integration test passed!") - finally: - # Always clean up - job.kill() - From fd310d377b891dfc6f561e3938e7c9f53b826796 Mon Sep 17 00:00:00 2001 From: Romil Date: Mon, 8 Dec 2025 05:25:24 +0000 Subject: [PATCH 09/29] cleanup --- examples/skypilot_getting_started.py | 6 ------ python/monarch/_src/job/skypilot.py | 6 ------ 2 files changed, 12 deletions(-) diff --git a/examples/skypilot_getting_started.py b/examples/skypilot_getting_started.py index 3ccc1d10a..b4974e46c 100644 --- a/examples/skypilot_getting_started.py +++ b/examples/skypilot_getting_started.py @@ -1,10 +1,4 @@ #!/usr/bin/env python3 -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- """ Monarch Getting Started with SkyPilot ===================================== diff --git a/python/monarch/_src/job/skypilot.py b/python/monarch/_src/job/skypilot.py index 6eade6eae..a5684b148 100644 --- a/python/monarch/_src/job/skypilot.py +++ b/python/monarch/_src/job/skypilot.py @@ -1,9 +1,3 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - # pyre-unsafe import logging From 8511e5a4870ee80214f870aa2f5fa83eafab7efc Mon Sep 17 00:00:00 2001 From: Romil Date: Mon, 8 Dec 2025 06:06:31 +0000 Subject: [PATCH 10/29] updates --- examples/skypilot_getting_started.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/examples/skypilot_getting_started.py b/examples/skypilot_getting_started.py index b4974e46c..3a6813e5a 100644 --- a/examples/skypilot_getting_started.py +++ b/examples/skypilot_getting_started.py @@ -25,7 +25,7 @@ import os import sys -# Set timeouts before importing monarch - worker setup takes time +# Set timeouts before importing monarch - monarch build takes time os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s" os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s" os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s" @@ -42,15 +42,10 @@ from monarch.actor import Actor, endpoint, ProcMesh, context except ImportError as e: print(f"ERROR: Monarch is not properly installed: {e}") - print("\nTo install Monarch, you need to build it from source:") - print(" cd monarch/") - print(" pip install -e .") - print("\nThis requires the Rust toolchain and other dependencies.") - print("See monarch/README.md for full installation instructions.") sys.exit(1) # ============================================================================ -# Step 1: Define our Actors (same as getting started guide) +# Step 1: Define actors (same as getting started 
guide) # ============================================================================ @@ -94,7 +89,7 @@ def get_cloud(cloud_name: str): "kubernetes": sky.Kubernetes, "aws": sky.AWS, "gcp": sky.GCP, - "azure": sky.Azure, + "azure": sky.Azure, # TODO(romilb): Add more clouds here } if cloud_name.lower() not in clouds: raise ValueError(f"Unknown cloud: {cloud_name}. Available: {list(clouds.keys())}") @@ -114,6 +109,7 @@ def main(): default=2, help="Number of host nodes to provision", ) + # TODO(romilb): This should be parsed from the accelerator spec parser.add_argument( "--gpus-per-host", type=int, @@ -156,7 +152,7 @@ def main(): # Build resources specification resources_kwargs = { "cloud": get_cloud(args.cloud), - "accelerators": args.accelerator, # GPU required - torchmonarch needs CUDA + "accelerators": args.accelerator, } if args.region: resources_kwargs["region"] = args.region @@ -168,7 +164,6 @@ def main(): job = SkyPilotJob( # Define the mesh of hosts we need meshes={"trainers": args.num_hosts}, - # Specify cloud resources - GPU required for torchmonarch (needs CUDA) resources=sky.Resources(**resources_kwargs), cluster_name=args.cluster_name, # Auto-cleanup after 10 minutes of idle time @@ -233,7 +228,7 @@ def main(): print(f" {i}") print("\n" + "=" * 60) - print("SUCCESS! Monarch actors ran on SkyPilot cluster!") + print("Success! 
Monarch actors ran on SkyPilot cluster!") print("=" * 60) except Exception as e: From 20d36e8c08853d9b436582709f281a8c63934eec Mon Sep 17 00:00:00 2001 From: Romil Date: Thu, 11 Dec 2025 01:31:34 +0000 Subject: [PATCH 11/29] Extract SkyPilotJob from monarch src --- examples/skypilot/README.md | 153 ++++++ examples/skypilot/__init__.py | 23 + .../skypilot_getting_started.py | 51 +- examples/skypilot/skypilot_job.py | 458 ++++++++++++++++++ 4 files changed, 660 insertions(+), 25 deletions(-) create mode 100644 examples/skypilot/README.md create mode 100644 examples/skypilot/__init__.py rename examples/{ => skypilot}/skypilot_getting_started.py (84%) create mode 100644 examples/skypilot/skypilot_job.py diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md new file mode 100644 index 000000000..74f16b9e0 --- /dev/null +++ b/examples/skypilot/README.md @@ -0,0 +1,153 @@ +# Monarch SkyPilot Integration + +This directory contains a standalone integration for running Monarch workloads on **Kubernetes and cloud VMs** via [SkyPilot](https://github.com/skypilot-org/skypilot). + +## Overview + +`SkyPilotJob` provisions cloud instances (or K8s pods) and starts Monarch workers on them, allowing you to run distributed Monarch actors across multiple machines. 
+ +**Supported platforms:** +- Kubernetes (any cluster) +- AWS, GCP, Azure +- Lambda Labs, CoreWeave, RunPod, and [20+ other clouds](https://docs.skypilot.co/en/latest/getting-started/installation.html) + +## Installation + +```bash +# Install Monarch +pip install torchmonarch-nightly + +# Install SkyPilot with your preferred backend +pip install skypilot[kubernetes] # For Kubernetes +pip install skypilot[aws] # For AWS +pip install skypilot[gcp] # For GCP +pip install skypilot[all] # For all clouds + +# Verify SkyPilot setup +sky check +``` + +## Quick Start + +```python +import sky +from skypilot_job import SkyPilotJob +from monarch.actor import Actor, endpoint + +class MyActor(Actor): + @endpoint + def hello(self) -> str: + return "Hello from the cloud!" + +# Create a SkyPilot job with 2 nodes +job = SkyPilotJob( + meshes={"workers": 2}, + resources=sky.Resources( + cloud=sky.Kubernetes(), # or sky.AWS(), sky.GCP(), etc. + accelerators="H100:1", + ), + cluster_name="my-monarch-cluster", + idle_minutes_to_autostop=10, + down_on_autostop=True, +) + +# Launch and connect +state = job.state() +hosts = state.workers + +# Spawn processes and actors +procs = hosts.spawn_procs(per_host={"gpus": 1}) +actors = procs.spawn("my_actors", MyActor) + +# Use your actors +results = actors.hello.call().get() +print(results) # ["Hello from the cloud!", "Hello from the cloud!"] + +# Clean up +job.kill() +``` + +## Running the Example + +```bash +cd examples/skypilot + +# Run on Kubernetes +python getting_started.py --cloud kubernetes --num-hosts 2 + +# Run on AWS +python getting_started.py --cloud aws --num-hosts 2 --accelerator "A100:1" + +# Run on GCP +python getting_started.py --cloud gcp --num-hosts 2 --accelerator "A100:1" +``` + +## Configuration Options + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `meshes` | Dict mapping mesh names to node counts | Required | +| `resources` | SkyPilot Resources specification | None (SkyPilot defaults) | 
+| `cluster_name` | Name for the cluster | Auto-generated | +| `monarch_port` | Port for Monarch TCP communication | 22222 | +| `idle_minutes_to_autostop` | Auto-stop after idle time | None | +| `down_on_autostop` | Tear down on autostop vs just stop | False | +| `setup_commands` | Custom setup script | Installs torchmonarch-nightly | +| `workdir` | Local directory to sync to cluster | None | +| `file_mounts` | Additional files to mount | None | + +## Default Image + +By default, `SkyPilotJob` uses the `pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime` Docker image which has compatible system libraries for `torchmonarch-nightly`. Setup time is ~1-2 minutes (just pip install). + +## Faster Cold Starts + +For faster cold starts (<30s): + +**Option 1: Use a pre-built Docker image** +```python +resources = sky.Resources( + image_id="docker:your-registry/monarch-image:tag", + accelerators="H100:1", +) +``` + +**Option 2: Use SkyPilot's cluster reuse** +```python +job = SkyPilotJob( + ..., + idle_minutes_to_autostop=30, # Keep cluster alive + down_on_autostop=False, # Just stop, don't terminate +) +``` + +## Network Requirements + +The client must have direct network connectivity to the worker nodes: +- **Kubernetes**: Run the client inside the same cluster (e.g., in a pod) +- **Cloud VMs**: Ensure security groups allow inbound traffic on port 22222 + +## Troubleshooting + +**Check SkyPilot setup:** +```bash +sky check +sky show-gpus +``` + +**View cluster logs:** +```bash +sky logs +``` + +**SSH into a worker:** +```bash +sky ssh +``` + +**Clean up clusters:** +```bash +sky down +sky down --all # Remove all clusters +``` + diff --git a/examples/skypilot/__init__.py b/examples/skypilot/__init__.py new file mode 100644 index 000000000..8e7acc6da --- /dev/null +++ b/examples/skypilot/__init__.py @@ -0,0 +1,23 @@ +""" +SkyPilot integration for Monarch. 
+ +This is a standalone package that provides SkyPilotJob - a way to run Monarch +workloads on Kubernetes and cloud VMs via SkyPilot. + +This package is separate from the main Monarch codebase to allow independent +iteration and to avoid chicken-and-egg problems with releases. + +Usage: + from skypilot_job import SkyPilotJob + + job = SkyPilotJob( + meshes={"workers": 2}, + resources=sky.Resources(cloud=sky.Kubernetes(), accelerators="H100:1"), + ) + state = job.state() +""" + +from .skypilot_job import SkyPilotJob + +__all__ = ["SkyPilotJob"] + diff --git a/examples/skypilot_getting_started.py b/examples/skypilot/skypilot_getting_started.py similarity index 84% rename from examples/skypilot_getting_started.py rename to examples/skypilot/skypilot_getting_started.py index 3a6813e5a..b9f703bee 100644 --- a/examples/skypilot_getting_started.py +++ b/examples/skypilot/skypilot_getting_started.py @@ -1,35 +1,41 @@ #!/usr/bin/env python3 """ -Monarch Getting Started with SkyPilot -===================================== +Running Monarch on Kubernetes with SkyPilot +=========================================== This script demonstrates running Monarch actors on cloud infrastructure -provisioned by SkyPilot. It follows the Monarch getting started guide -but uses SkyPilot to launch the worker nodes. +provisioned by SkyPilot (Kubernetes or cloud VMs). Prerequisites: -- Monarch installed with its Rust bindings (build with `pip install -e .` in monarch/) -- SkyPilot installed and configured (run `sky check`) + pip install torchmonarch-nightly + pip install skypilot[kubernetes] # or skypilot[aws], skypilot[gcp], etc. 
+ sky check # Verify SkyPilot configuration Usage: - # Run from inside a Kubernetes pod (client runs locally): - python examples/skypilot_getting_started.py --cloud kubernetes --num-hosts 2 + # Run on Kubernetes: + python getting_started.py --cloud kubernetes --num-hosts 2 - # Run from outside the cluster using the SkyPilot YAML: - sky launch examples/skypilot_run_example.yaml + # Run on AWS: + python getting_started.py --cloud aws --num-hosts 2 -See SKY_README.md for full documentation. + # Run on GCP: + python getting_started.py --cloud gcp --num-hosts 2 """ import argparse import os import sys -# Set timeouts before importing monarch - monarch build takes time +# Set timeouts before importing monarch os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s" os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s" os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s" +# If running inside a SkyPilot cluster, unset the in-cluster context +# to allow launching new clusters on the same Kubernetes cluster +if "SKYPILOT_IN_CLUSTER_CONTEXT_NAME" in os.environ: + del os.environ["SKYPILOT_IN_CLUSTER_CONTEXT_NAME"] + # Check dependencies before importing try: import sky @@ -38,12 +44,15 @@ sys.exit(1) try: - from monarch.job import SkyPilotJob from monarch.actor import Actor, endpoint, ProcMesh, context except ImportError as e: print(f"ERROR: Monarch is not properly installed: {e}") + print("Run: pip install torchmonarch-nightly") sys.exit(1) +# Import SkyPilotJob from the local package +from skypilot_job import SkyPilotJob + # ============================================================================ # Step 1: Define actors (same as getting started guide) # ============================================================================ @@ -89,7 +98,8 @@ def get_cloud(cloud_name: str): "kubernetes": sky.Kubernetes, "aws": sky.AWS, "gcp": sky.GCP, - "azure": sky.Azure, # TODO(romilb): Add more clouds here + "azure": sky.Azure, + "lambda": sky.Lambda, } if 
cloud_name.lower() not in clouds: raise ValueError(f"Unknown cloud: {cloud_name}. Available: {list(clouds.keys())}") @@ -101,7 +111,7 @@ def main(): parser.add_argument( "--cloud", default="kubernetes", - help="Cloud provider to use (kubernetes, aws, gcp, azure)", + help="Cloud provider to use (kubernetes, aws, gcp, azure, lambda)", ) parser.add_argument( "--num-hosts", @@ -109,11 +119,10 @@ def main(): default=2, help="Number of host nodes to provision", ) - # TODO(romilb): This should be parsed from the accelerator spec parser.add_argument( "--gpus-per-host", type=int, - default=2, + default=1, help="Number of GPU processes per host", ) parser.add_argument( @@ -146,7 +155,6 @@ def main(): print(f" Region: {args.region}") # Create a SkyPilotJob to provision nodes - # This will launch cloud instances and start Monarch workers on them print("\n[1] Creating SkyPilot job...") # Build resources specification @@ -156,10 +164,6 @@ def main(): } if args.region: resources_kwargs["region"] = args.region - - # Find Monarch repo root (this script is in examples/) - script_dir = os.path.dirname(os.path.abspath(__file__)) - monarch_root = os.path.dirname(script_dir) # Go up from examples/ job = SkyPilotJob( # Define the mesh of hosts we need @@ -169,9 +173,6 @@ def main(): # Auto-cleanup after 10 minutes of idle time idle_minutes_to_autostop=10, down_on_autostop=True, - # Sync Monarch source to workers for building from source - # (SkyPilotJob uses default setup commands when workdir is provided) - workdir=monarch_root, ) try: diff --git a/examples/skypilot/skypilot_job.py b/examples/skypilot/skypilot_job.py new file mode 100644 index 000000000..7b5ea1178 --- /dev/null +++ b/examples/skypilot/skypilot_job.py @@ -0,0 +1,458 @@ +""" +SkyPilot integration for Monarch - standalone implementation. + +This module provides SkyPilotJob, which allows running Monarch workloads on +Kubernetes and cloud VMs via SkyPilot. 
It is designed to be used independently +of the main Monarch source tree. + +Requirements: + - pip install torchmonarch-nightly (or torchmonarch) + - pip install skypilot[kubernetes] (or other cloud backends) +""" + +import logging +import os +import sys +import time +from typing import Dict, List, Optional, TYPE_CHECKING + +# Import Monarch's job interface +from monarch._src.job.job import JobState, JobTrait + +# If running inside a SkyPilot cluster, unset the in-cluster context variable +# to allow launching new clusters on the same Kubernetes cluster. +# This must be done before importing sky to affect the API server. +if "SKYPILOT_IN_CLUSTER_CONTEXT_NAME" in os.environ: + del os.environ["SKYPILOT_IN_CLUSTER_CONTEXT_NAME"] + +# Defer imports that may not be available in all environments +if TYPE_CHECKING: + import sky + +try: + import sky + HAS_SKYPILOT = True +except ImportError: + HAS_SKYPILOT = False + sky = None # type: ignore[assignment] + + +logger: logging.Logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +if not logger.handlers: + logger.addHandler(logging.StreamHandler(sys.stderr)) +logger.propagate = False + +# Default port for Monarch TCP communication +DEFAULT_MONARCH_PORT = 22222 + +# Default setup commands to install Monarch from PyPI on remote workers. +# Requires a Docker image with Ubuntu 22.04+ for compatible libibverbs. +# +# Cold start time: ~1-2 minutes (pip install only). +# For faster cold starts (<30s), use a custom Docker image with Monarch pre-installed. +DEFAULT_SETUP_COMMANDS = """ +set -ex + +# Install torchmonarch from PyPI +pip install torchmonarch-nightly + +echo "Done installing Monarch" +""" + +# Default Docker image - PyTorch with CUDA on Ubuntu 22.04 (has compatible libibverbs) +DEFAULT_IMAGE_ID = "docker:pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime" + + +def _configure_transport() -> None: + """Configure the Monarch transport. 
Deferred import to avoid import errors.""" + from monarch._rust_bindings.monarch_hyperactor.channel import ChannelTransport + from monarch._rust_bindings.monarch_hyperactor.config import configure + + configure(default_transport=ChannelTransport.TcpWithHostname) + + +def _attach_to_workers_wrapper(name: str, ca: str, workers: List[str]): + """Wrapper around attach_to_workers with deferred import.""" + from monarch._src.actor.bootstrap import attach_to_workers + + return attach_to_workers(name=name, ca=ca, workers=workers) + + +class SkyPilotJob(JobTrait): + """ + A job scheduler that uses SkyPilot to provision cloud instances. + + SkyPilot supports multiple cloud providers (AWS, GCP, Azure, Lambda, etc.) + and Kubernetes, and can automatically select the cheapest available option. + + This implementation: + 1. Uses sky.launch() to provision cloud instances with specified resources + 2. Runs Monarch workers on each node via a startup script + 3. Connects to workers using their IP addresses from the cluster handle + + Example: + >>> import sky + >>> from skypilot_job import SkyPilotJob + >>> + >>> job = SkyPilotJob( + ... meshes={"trainers": 2}, + ... resources=sky.Resources(accelerators="A100:1"), + ... cluster_name="my-monarch-cluster", + ... ) + >>> state = job.state() + >>> trainers = state.trainers # HostMesh with 2 nodes + """ + + def __init__( + self, + meshes: Dict[str, int], + resources: Optional["sky.Resources"] = None, + cluster_name: Optional[str] = None, + monarch_port: int = DEFAULT_MONARCH_PORT, + idle_minutes_to_autostop: Optional[int] = None, + down_on_autostop: bool = False, + python_exe: str = "python", + setup_commands: Optional[str] = None, + workdir: Optional[str] = None, + file_mounts: Optional[Dict[str, str]] = None, + ) -> None: + """ + Args: + meshes: Dictionary mapping mesh names to number of nodes. + e.g., {"trainers": 4, "dataloaders": 2} + resources: SkyPilot Resources specification for the instances. + If None, uses SkyPilot defaults. 
+ cluster_name: Name for the SkyPilot cluster. If None, auto-generated. + monarch_port: Port for TCP communication between Monarch workers. + idle_minutes_to_autostop: If set, cluster will autostop after this + many minutes of idleness. + down_on_autostop: If True, tear down cluster on autostop instead of + just stopping it. + python_exe: Python executable to use for worker processes. + setup_commands: Optional setup commands to run before starting workers. + If None, uses DEFAULT_SETUP_COMMANDS which installs + torchmonarch-nightly from PyPI. + workdir: Local directory to sync to the cluster. If provided, this + directory will be uploaded to ~/sky_workdir on each node. + file_mounts: Dictionary mapping remote paths to local paths for + additional file mounts. + """ + if not HAS_SKYPILOT: + raise ImportError( + "SkyPilot is not installed. Install it with: pip install skypilot[kubernetes]" + ) + + # Configure transport at runtime when Monarch is available + try: + _configure_transport() + except ImportError: + # Monarch bindings not available, will fail later when needed + pass + + super().__init__() + + self._meshes = meshes + self._resources = resources + self._cluster_name = cluster_name + self._port = monarch_port + self._idle_minutes_to_autostop = idle_minutes_to_autostop + self._down_on_autostop = down_on_autostop + self._python_exe = python_exe + self._setup_commands = setup_commands + self._workdir = workdir + self._file_mounts = file_mounts + + # Runtime state + self._launched_cluster_name: Optional[str] = None + self._node_ips: List[str] = [] + + def _create(self, client_script: Optional[str]) -> None: + """Launch a SkyPilot cluster and start Monarch workers.""" + if client_script is not None: + raise RuntimeError("SkyPilotJob cannot run batch-mode scripts yet") + + total_nodes = sum(self._meshes.values()) + + # Build the worker startup command + worker_command = self._build_worker_command() + + # Use provided setup commands or default to PyPI install + 
setup = self._setup_commands if self._setup_commands is not None else DEFAULT_SETUP_COMMANDS + if setup and not setup.endswith("\n"): + setup += "\n" + + # Create the SkyPilot task + task = sky.Task( + name="monarch-workers", + setup=setup if setup else None, + run=worker_command, + num_nodes=total_nodes, + workdir=self._workdir, + ) + + # Add file mounts if provided + if self._file_mounts: + task.set_file_mounts(self._file_mounts) + + # Set resources, using default image_id if not specified + resources = self._resources + if resources is not None: + # If no image_id specified, use the default PyTorch image + if resources.image_id is None: + resources = resources.copy(image_id=DEFAULT_IMAGE_ID) + task.set_resources(resources) + else: + # No resources specified, create default with image_id + task.set_resources(sky.Resources(image_id=DEFAULT_IMAGE_ID)) + + # Generate cluster name if not provided + cluster_name = self._cluster_name or f"monarch-{os.getpid()}" + + logger.info(f"Launching SkyPilot cluster '{cluster_name}' with {total_nodes} nodes") + + # Launch the cluster + try: + request_id = sky.launch( + task, + cluster_name=cluster_name, + idle_minutes_to_autostop=self._idle_minutes_to_autostop, + down=self._down_on_autostop, + ) + # Get the result from the request + job_id, handle = sky.get(request_id) + except Exception as e: + logger.error(f"Failed to launch SkyPilot cluster: {e}") + raise RuntimeError(f"Failed to launch SkyPilot cluster: {e}") from e + + self._launched_cluster_name = cluster_name + logger.info(f"SkyPilot cluster '{cluster_name}' launched successfully") + + # Wait for the job to be RUNNING (setup complete, run started) + self._wait_for_job_running(cluster_name, job_id, timeout=300) + + def _wait_for_job_running(self, cluster_name: str, job_id: int, timeout: int = 300) -> None: + """Wait for the SkyPilot job to reach RUNNING status (setup complete).""" + start_time = time.time() + poll_interval = 10 # seconds + + logger.info(f"Waiting for job 
{job_id} setup to complete (timeout={timeout}s)...") + + while time.time() - start_time < timeout: + try: + # Get job queue for the cluster + request_id = sky.queue(cluster_name) + jobs = sky.get(request_id) + + # Find our job + for job in jobs: + if job.get('id') == job_id or job.get('job_id') == job_id: + status = job.get('status', '') + status_str = str(status) + if 'RUNNING' in status_str: + logger.info(f"Job {job_id} is now RUNNING (setup complete)") + return + elif 'FAILED' in status_str or 'CANCELLED' in status_str: + raise RuntimeError(f"Job {job_id} failed with status: {status}. Check logs with: sky logs {cluster_name}") + else: + elapsed = int(time.time() - start_time) + logger.info(f"Job {job_id} status: {status} (waited {elapsed}s)") + break + + except Exception as e: + logger.warning(f"Error checking job status: {e}") + + time.sleep(poll_interval) + + raise RuntimeError(f"Timeout waiting for job {job_id} to reach RUNNING status") + + def _build_worker_command(self) -> str: + """Build the bash command to start Monarch workers on each node.""" + # This command will be run on each node via SkyPilot + # SkyPilot expects a bash script, so we wrap Python code in python -c + # Note: Use IP address (not hostname) for the worker address since + # Kubernetes hostnames may not resolve across pods + python_code = f''' +import socket +import logging +import sys + +# Enable verbose logging +logging.basicConfig(level=logging.DEBUG, stream=sys.stdout, format="%(asctime)s %(levelname)s %(name)s: %(message)s") + +hostname = socket.gethostname() +ip_addr = socket.gethostbyname(hostname) +address = f"tcp://{{ip_addr}}:{self._port}" +print(f"Starting Monarch worker at {{address}} (hostname={{hostname}})", flush=True) +sys.stdout.flush() + +try: + from monarch.actor import run_worker_loop_forever + print(f"Imported run_worker_loop_forever successfully", flush=True) + print(f"Worker ready and listening...", flush=True) + run_worker_loop_forever(address=address, 
ca="trust_all_connections") +except Exception as e: + print(f"ERROR in worker: {{e}}", flush=True) + import traceback + traceback.print_exc() + raise +''' + # Escape single quotes in the Python code for bash + escaped_code = python_code.replace("'", "'\"'\"'") + # Set timeout env vars + env_vars = " ".join([ + "export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=5m", + "export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT=5m", + "export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE=5m", + ]) + return f"{env_vars} && {self._python_exe} -c '{escaped_code}'" + + def _get_node_ips(self) -> List[str]: + """Get the IP addresses of all nodes in the cluster.""" + if not self._launched_cluster_name: + raise RuntimeError("Cluster has not been launched yet") + + # Query cluster status to get handle with node IPs + try: + request_id = sky.status(cluster_names=[self._launched_cluster_name]) + statuses = sky.get(request_id) + except Exception as e: + raise RuntimeError(f"Failed to get cluster status: {e}") from e + + if not statuses: + raise RuntimeError( + f"Cluster '{self._launched_cluster_name}' not found" + ) + + status = statuses[0] + handle = status.handle + + if handle is None: + raise RuntimeError( + f"Cluster '{self._launched_cluster_name}' has no handle" + ) + + # Get the external IPs from the handle + if handle.stable_internal_external_ips is None: + raise RuntimeError("Cluster has no IP information") + + # stable_internal_external_ips is List[Tuple[internal_ip, external_ip]] + # We use external IPs to connect + ips = [] + for internal_ip, external_ip in handle.stable_internal_external_ips: + # Prefer external IP, fall back to internal + ip = external_ip if external_ip else internal_ip + if ip: + ips.append(ip) + + if not ips: + raise RuntimeError("No IP addresses found for cluster nodes") + + return ips + + def _wait_for_workers_ready( + self, expected_nodes: int, timeout: int = 300, poll_interval: int = 5 + ) -> List[str]: + """Wait for workers to be ready and return their addresses.""" + 
start_time = time.time() + + while time.time() - start_time < timeout: + try: + ips = self._get_node_ips() + if len(ips) >= expected_nodes: + logger.info(f"Found {len(ips)} nodes ready") + return ips + except Exception as e: + logger.debug(f"Waiting for workers: {e}") + + time.sleep(poll_interval) + + raise RuntimeError( + f"Timeout waiting for {expected_nodes} workers after {timeout}s" + ) + + def _state(self) -> JobState: + """Get the current state with HostMesh objects for each mesh.""" + if not self._jobs_active(): + raise RuntimeError("SkyPilot cluster is not active") + + # Get node IPs if not cached + if not self._node_ips: + total_nodes = sum(self._meshes.values()) + self._node_ips = self._wait_for_workers_ready(total_nodes) + + # Distribute IPs among meshes + host_meshes = {} + ip_idx = 0 + + for mesh_name, num_nodes in self._meshes.items(): + mesh_ips = self._node_ips[ip_idx : ip_idx + num_nodes] + ip_idx += num_nodes + + workers = [f"tcp://{ip}:{self._port}" for ip in mesh_ips] + logger.info(f"Connecting to workers for mesh '{mesh_name}': {workers}") + + host_mesh = _attach_to_workers_wrapper( + name=mesh_name, + ca="trust_all_connections", + workers=workers, + ) + + # Wait for the host mesh to be initialized (connections established) + logger.info(f"Waiting for host mesh '{mesh_name}' to initialize...") + host_mesh.initialized.get() + logger.info(f"Host mesh '{mesh_name}' initialized successfully") + + # Give connections a moment to fully stabilize + time.sleep(5) + logger.info(f"Host mesh '{mesh_name}' ready") + + host_meshes[mesh_name] = host_mesh + + return JobState(host_meshes) + + def can_run(self, spec: "JobTrait") -> bool: + """Check if this job can run the given spec.""" + if not isinstance(spec, SkyPilotJob): + return False + + return ( + spec._meshes == self._meshes + and spec._resources == self._resources + and spec._port == self._port + and self._jobs_active() + ) + + def _jobs_active(self) -> bool: + """Check if the SkyPilot cluster is still 
active.""" + if not self.active or not self._launched_cluster_name: + return False + + try: + request_id = sky.status(cluster_names=[self._launched_cluster_name]) + statuses = sky.get(request_id) + + if not statuses: + return False + + status = statuses[0] + # Check if cluster is UP + return status.status == sky.ClusterStatus.UP + except Exception as e: + logger.warning(f"Error checking cluster status: {e}") + return False + + def _kill(self) -> None: + """Tear down the SkyPilot cluster.""" + if self._launched_cluster_name is not None: + try: + logger.info(f"Tearing down SkyPilot cluster '{self._launched_cluster_name}'") + request_id = sky.down(self._launched_cluster_name) + sky.get(request_id) + logger.info(f"Cluster '{self._launched_cluster_name}' terminated") + except Exception as e: + logger.warning(f"Failed to tear down cluster: {e}") + + self._launched_cluster_name = None + self._node_ips.clear() + From e23bd3faa3b2e881bb44a062106b2d9bd9f28843 Mon Sep 17 00:00:00 2001 From: Romil Date: Thu, 11 Dec 2025 01:42:49 +0000 Subject: [PATCH 12/29] remove stale changes --- python/monarch/_src/job/skypilot.py | 488 ---------------------------- python/monarch/job/__init__.py | 7 - 2 files changed, 495 deletions(-) delete mode 100644 python/monarch/_src/job/skypilot.py diff --git a/python/monarch/_src/job/skypilot.py b/python/monarch/_src/job/skypilot.py deleted file mode 100644 index a5684b148..000000000 --- a/python/monarch/_src/job/skypilot.py +++ /dev/null @@ -1,488 +0,0 @@ -# pyre-unsafe - -import logging -import os -import sys -import time -from typing import Dict, List, Optional, Tuple, TYPE_CHECKING - -from monarch._src.job.job import JobState, JobTrait - -# Defer imports that may not be available in all environments -if TYPE_CHECKING: - import sky - from sky.backends.cloud_vm_ray_backend import CloudVmRayResourceHandle - -try: - import sky - from sky.backends.cloud_vm_ray_backend import CloudVmRayResourceHandle - - HAS_SKYPILOT = True -except ImportError: - 
HAS_SKYPILOT = False - sky = None # type: ignore[assignment] - CloudVmRayResourceHandle = None # type: ignore[assignment, misc] - - -logger: logging.Logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -logger.addHandler(logging.StreamHandler(sys.stderr)) -logger.propagate = False - -# Default port for Monarch TCP communication -DEFAULT_MONARCH_PORT = 22222 - -# Default setup commands to build Monarch from source on remote workers. -# NOTE: Cold start is slow (~7-10 minutes) because we need to compile Monarch -# each worker This is necessary to ensure client/worker version compatibility -# when using a development branch. For production use, consider -# using pre-built wheels from PyPI (pip install torchmonarch). -# -# For faster cold starts (<30s), use a custom Docker image with all dependencies -# pre-installed by setting image_id in sky.Resources: -# resources = sky.Resources(image_id="docker:your-registry/monarch-image:tag", ...) -DEFAULT_SETUP_COMMANDS = """ -set -ex - -# Add PPA for newer toolchains -sudo apt-get update -sudo apt-get install -y software-properties-common -sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test -sudo apt-get update - -# Install system dependencies -sudo apt-get install -y \ - build-essential \ - ninja-build \ - g++-11 \ - rdma-core \ - libibverbs1 \ - libmlx5-1 \ - libibverbs-dev \ - curl \ - pkg-config \ - libssl-dev - -# Install CUDA toolkit and NCCL -wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb -sudo dpkg -i cuda-keyring_1.1-1_all.deb -sudo apt-get update -sudo apt-get install -y cuda-toolkit-12-1 -sudo apt-get install -y --allow-change-held-packages libnccl2=2.28.9-1+cuda12.9 libnccl-dev=2.28.9-1+cuda12.9 - -# Install Rust -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y -source $HOME/.cargo/env -rustup default nightly - -# Install Python dependencies and build Monarch from source -cd ~/sky_workdir -pip install 
setuptools-rust maturin -pip install -r torch-requirements.txt -r build-requirements.txt -CC=gcc-11 CXX=g++-11 USE_TENSOR_ENGINE=0 pip install --no-build-isolation . - -echo "Done installing Monarch" -""" - - -def _configure_transport() -> None: - """Configure the Monarch transport. Deferred import to avoid import errors.""" - from monarch._rust_bindings.monarch_hyperactor.channel import ChannelTransport - from monarch._rust_bindings.monarch_hyperactor.config import configure - - configure(default_transport=ChannelTransport.TcpWithHostname) - - -def _attach_to_workers_wrapper(name: str, ca: str, workers: List[str]): - """Wrapper around attach_to_workers with deferred import.""" - from monarch._src.actor.bootstrap import attach_to_workers - - return attach_to_workers(name=name, ca=ca, workers=workers) - - -class SkyPilotJob(JobTrait): - - """ - A job scheduler that uses SkyPilot to provision cloud instances. - - SkyPilot supports multiple cloud providers (AWS, GCP, Azure, Lambda, etc.) - and can automatically select the cheapest available option. - - This implementation: - 1. Uses sky.launch() to provision cloud instances with specified resources - 2. Runs Monarch workers on each node via a startup script - 3. Connects to workers using their IP addresses from the cluster handle - - Example: - >>> import sky - >>> from monarch.job import SkyPilotJob - >>> - >>> job = SkyPilotJob( - ... meshes={"trainers": 2}, - ... resources=sky.Resources(accelerators="A100:1"), - ... cluster_name="my-monarch-cluster", - ... 
) - >>> state = job.state() - >>> trainers = state.trainers # HostMesh with 2 nodes - """ - - def __init__( - self, - meshes: Dict[str, int], - resources: Optional["sky.Resources"] = None, - cluster_name: Optional[str] = None, - monarch_port: int = DEFAULT_MONARCH_PORT, - idle_minutes_to_autostop: Optional[int] = None, - down_on_autostop: bool = False, - python_exe: str = "python", - setup_commands: Optional[str] = None, - workdir: Optional[str] = None, - file_mounts: Optional[Dict[str, str]] = None, - ) -> None: - """ - Args: - meshes: Dictionary mapping mesh names to number of nodes. - e.g., {"trainers": 4, "dataloaders": 2} - resources: SkyPilot Resources specification for the instances. - If None, uses SkyPilot defaults. - cluster_name: Name for the SkyPilot cluster. If None, auto-generated. - monarch_port: Port for TCP communication between Monarch workers. - idle_minutes_to_autostop: If set, cluster will autostop after this - many minutes of idleness. - down_on_autostop: If True, tear down cluster on autostop instead of - just stopping it. - python_exe: Python executable to use for worker processes. - setup_commands: Optional setup commands to run before starting workers. - Use this to install dependencies including Monarch. - If None and workdir is provided, uses DEFAULT_SETUP_COMMANDS - which builds Monarch from source. - workdir: Local directory to sync to the cluster. If provided, this - directory will be uploaded to ~/sky_workdir on each node. - When using workdir with the Monarch repo, DEFAULT_SETUP_COMMANDS - will build Monarch from source on each worker. - file_mounts: Dictionary mapping remote paths to local paths for - additional file mounts. - """ - if not HAS_SKYPILOT: - raise ImportError( - "SkyPilot is not installed. 
Install it with: pip install skypilot" - ) - - # Configure transport at runtime when Monarch is available - try: - _configure_transport() - except ImportError: - # Monarch bindings not available, will fail later when needed - pass - - super().__init__() - - self._meshes = meshes - self._resources = resources - self._cluster_name = cluster_name - self._port = monarch_port - self._idle_minutes_to_autostop = idle_minutes_to_autostop - self._down_on_autostop = down_on_autostop - self._python_exe = python_exe - self._setup_commands = setup_commands - self._workdir = workdir - self._file_mounts = file_mounts - - # Runtime state - self._launched_cluster_name: Optional[str] = None - self._node_ips: List[str] = [] - - def _create(self, client_script: Optional[str]) -> None: - """Launch a SkyPilot cluster and start Monarch workers.""" - if client_script is not None: - raise RuntimeError("SkyPilotJob cannot run batch-mode scripts yet") - - total_nodes = sum(self._meshes.values()) - - # Build the worker startup command - worker_command = self._build_worker_command() - - # Create setup commands - # If workdir is provided but no setup_commands, use defaults to build Monarch - if self._setup_commands is not None: - setup = self._setup_commands - elif self._workdir is not None: - setup = DEFAULT_SETUP_COMMANDS - else: - setup = "" - if setup and not setup.endswith("\n"): - setup += "\n" - - # Create the SkyPilot task - task = sky.Task( - name="monarch-workers", - setup=setup if setup else None, - run=worker_command, - num_nodes=total_nodes, - workdir=self._workdir, - ) - - # Add file mounts if provided - if self._file_mounts: - task.set_file_mounts(self._file_mounts) - - if self._resources is not None: - task.set_resources(self._resources) - - # Generate cluster name if not provided - cluster_name = self._cluster_name or f"monarch-{os.getpid()}" - - logger.info(f"Launching SkyPilot cluster '{cluster_name}' with {total_nodes} nodes") - - # Launch the cluster - # Note: sky.launch 
returns a request ID in the SDK, we need to get the result - try: - request_id = sky.launch( - task, - cluster_name=cluster_name, - idle_minutes_to_autostop=self._idle_minutes_to_autostop, - down=self._down_on_autostop, - ) - # Get the result from the request - job_id, handle = sky.get(request_id) - except Exception as e: - logger.error(f"Failed to launch SkyPilot cluster: {e}") - raise RuntimeError(f"Failed to launch SkyPilot cluster: {e}") from e - - self._launched_cluster_name = cluster_name - logger.info(f"SkyPilot cluster '{cluster_name}' launched successfully") - - # Wait for the job to be RUNNING (setup complete, run started) - self._wait_for_job_running(cluster_name, job_id, timeout=900) - - def _wait_for_job_running(self, cluster_name: str, job_id: int, timeout: int = 900) -> None: - """Wait for the SkyPilot job to reach RUNNING status (setup complete).""" - import time - start_time = time.time() - poll_interval = 10 # seconds - - logger.info(f"Waiting for job {job_id} setup to complete (timeout={timeout}s)...") - - while time.time() - start_time < timeout: - try: - # Get job queue for the cluster - request_id = sky.queue(cluster_name) - jobs = sky.get(request_id) - - # Find our job - for job in jobs: - if job.get('id') == job_id or job.get('job_id') == job_id: - status = job.get('status', '') - status_str = str(status) - if 'RUNNING' in status_str: - logger.info(f"Job {job_id} is now RUNNING (setup complete)") - return - elif 'FAILED' in status_str or 'CANCELLED' in status_str: - raise RuntimeError(f"Job {job_id} failed with status: {status}. 
Check logs with: sky logs {cluster_name}") - else: - elapsed = int(time.time() - start_time) - logger.info(f"Job {job_id} status: {status} (waited {elapsed}s)") - break - - except Exception as e: - logger.warning(f"Error checking job status: {e}") - - time.sleep(poll_interval) - - raise RuntimeError(f"Timeout waiting for job {job_id} to reach RUNNING status") - - def _build_worker_command(self) -> str: - """Build the bash command to start Monarch workers on each node.""" - # This command will be run on each node via SkyPilot - # SkyPilot expects a bash script, so we wrap Python code in python -c - # Note: Use IP address (not hostname) for the worker address since - # Kubernetes hostnames may not resolve across pods - python_code = f''' -import socket -import logging -import sys - -# Enable verbose logging -logging.basicConfig(level=logging.DEBUG, stream=sys.stdout, format="%(asctime)s %(levelname)s %(name)s: %(message)s") - -hostname = socket.gethostname() -ip_addr = socket.gethostbyname(hostname) -address = f"tcp://{{ip_addr}}:{self._port}" -print(f"Starting Monarch worker at {{address}} (hostname={{hostname}})", flush=True) -sys.stdout.flush() - -try: - from monarch.actor import run_worker_loop_forever - print(f"Imported run_worker_loop_forever successfully", flush=True) - print(f"Worker ready and listening...", flush=True) - run_worker_loop_forever(address=address, ca="trust_all_connections") -except Exception as e: - print(f"ERROR in worker: {{e}}", flush=True) - import traceback - traceback.print_exc() - raise -''' - # Escape single quotes in the Python code for bash - escaped_code = python_code.replace("'", "'\"'\"'") - # Set timeout env vars - setup takes time (building from source) so we need longer timeouts - env_vars = " ".join([ - "export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=15m", - "export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT=15m", - "export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE=15m", - ]) - return f"{env_vars} && {self._python_exe} -c '{escaped_code}'" - - 
def _get_node_ips(self) -> List[str]: - """Get the IP addresses of all nodes in the cluster.""" - if not self._launched_cluster_name: - raise RuntimeError("Cluster has not been launched yet") - - # Query cluster status to get handle with node IPs - try: - request_id = sky.status(cluster_names=[self._launched_cluster_name]) - statuses = sky.get(request_id) - except Exception as e: - raise RuntimeError(f"Failed to get cluster status: {e}") from e - - if not statuses: - raise RuntimeError( - f"Cluster '{self._launched_cluster_name}' not found" - ) - - status = statuses[0] - handle = status.handle - - if handle is None: - raise RuntimeError( - f"Cluster '{self._launched_cluster_name}' has no handle" - ) - - if not isinstance(handle, CloudVmRayResourceHandle): - raise RuntimeError( - f"Unexpected handle type: {type(handle)}" - ) - - # Get the external IPs from the handle - if handle.stable_internal_external_ips is None: - raise RuntimeError("Cluster has no IP information") - - # stable_internal_external_ips is List[Tuple[internal_ip, external_ip]] - # We use external IPs to connect - ips = [] - for internal_ip, external_ip in handle.stable_internal_external_ips: - # Prefer external IP, fall back to internal - ip = external_ip if external_ip else internal_ip - if ip: - ips.append(ip) - - if not ips: - raise RuntimeError("No IP addresses found for cluster nodes") - - return ips - - def _wait_for_workers_ready( - self, expected_nodes: int, timeout: int = 300, poll_interval: int = 5 - ) -> List[str]: - """Wait for workers to be ready and return their addresses.""" - start_time = time.time() - - while time.time() - start_time < timeout: - try: - ips = self._get_node_ips() - if len(ips) >= expected_nodes: - logger.info(f"Found {len(ips)} nodes ready") - return ips - except Exception as e: - logger.debug(f"Waiting for workers: {e}") - - time.sleep(poll_interval) - - raise RuntimeError( - f"Timeout waiting for {expected_nodes} workers after {timeout}s" - ) - - def _state(self) 
-> JobState: - """Get the current state with HostMesh objects for each mesh.""" - if not self._jobs_active(): - raise RuntimeError("SkyPilot cluster is not active") - - # Get node IPs if not cached - if not self._node_ips: - total_nodes = sum(self._meshes.values()) - self._node_ips = self._wait_for_workers_ready(total_nodes) - - # Distribute IPs among meshes - host_meshes = {} - ip_idx = 0 - - for mesh_name, num_nodes in self._meshes.items(): - mesh_ips = self._node_ips[ip_idx : ip_idx + num_nodes] - ip_idx += num_nodes - - workers = [f"tcp://{ip}:{self._port}" for ip in mesh_ips] - logger.info(f"Connecting to workers for mesh '{mesh_name}': {workers}") - - host_mesh = _attach_to_workers_wrapper( - name=mesh_name, - ca="trust_all_connections", - workers=workers, - ) - - # Wait for the host mesh to be initialized (connections established) - logger.info(f"Waiting for host mesh '{mesh_name}' to initialize...") - host_mesh.initialized.get() - logger.info(f"Host mesh '{mesh_name}' initialized successfully") - - # Give connections a moment to fully stabilize - time.sleep(5) - logger.info(f"Host mesh '{mesh_name}' ready") - - host_meshes[mesh_name] = host_mesh - - return JobState(host_meshes) - - def can_run(self, spec: "JobTrait") -> bool: - """Check if this job can run the given spec.""" - if not isinstance(spec, SkyPilotJob): - return False - - return ( - spec._meshes == self._meshes - and spec._resources == self._resources - and spec._port == self._port - and self._jobs_active() - ) - - def _jobs_active(self) -> bool: - """Check if the SkyPilot cluster is still active.""" - if not self.active or not self._launched_cluster_name: - return False - - try: - request_id = sky.status(cluster_names=[self._launched_cluster_name]) - statuses = sky.get(request_id) - - if not statuses: - return False - - status = statuses[0] - # Check if cluster is UP - return status.status == sky.ClusterStatus.UP - except Exception as e: - logger.warning(f"Error checking cluster status: {e}") - 
return False - - def _kill(self) -> None: - """Tear down the SkyPilot cluster.""" - if self._launched_cluster_name is not None: - try: - logger.info(f"Tearing down SkyPilot cluster '{self._launched_cluster_name}'") - request_id = sky.down(self._launched_cluster_name) - sky.get(request_id) - logger.info(f"Cluster '{self._launched_cluster_name}' terminated") - except Exception as e: - logger.warning(f"Failed to tear down cluster: {e}") - - self._launched_cluster_name = None - self._node_ips.clear() - diff --git a/python/monarch/job/__init__.py b/python/monarch/job/__init__.py index 674007d53..0f6ec1960 100644 --- a/python/monarch/job/__init__.py +++ b/python/monarch/job/__init__.py @@ -8,12 +8,6 @@ from monarch._src.job.job import job_load, job_loads, JobState, JobTrait, LocalJob from monarch._src.job.slurm import SlurmJob -# SkyPilot is an optional dependency -try: - from monarch._src.job.skypilot import SkyPilotJob -except ImportError: - SkyPilotJob = None # type: ignore[misc,assignment] - # Define exports __all__ = [ "JobTrait", @@ -22,5 +16,4 @@ "JobState", "LocalJob", "SlurmJob", - "SkyPilotJob", ] From 40f3a6a9c06944d79c9baee1f4a81cc4c3a89e4a Mon Sep 17 00:00:00 2001 From: Romil Date: Thu, 11 Dec 2025 02:40:18 +0000 Subject: [PATCH 13/29] Add DDP and titan examples --- examples/skypilot/README.md | 67 ++++++++ examples/skypilot/skypilot_ddp.py | 200 +++++++++++++++++++++++ examples/skypilot/skypilot_titan.py | 245 ++++++++++++++++++++++++++++ 3 files changed, 512 insertions(+) create mode 100644 examples/skypilot/skypilot_ddp.py create mode 100644 examples/skypilot/skypilot_titan.py diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md index 74f16b9e0..153089ad1 100644 --- a/examples/skypilot/README.md +++ b/examples/skypilot/README.md @@ -82,6 +82,73 @@ python getting_started.py --cloud aws --num-hosts 2 --accelerator "A100:1" python getting_started.py --cloud gcp --num-hosts 2 --accelerator "A100:1" ``` +Example output: +``` +$ python 
skypilot_getting_started.py --num-hosts 2 --gpus-per-host 1 --cluster-name monarch-skypilot-test + +============================================================ +Monarch Getting Started with SkyPilot +============================================================ + +Configuration: + Cloud: kubernetes + Hosts: 2 + GPUs per host: 1 + Accelerator: H200:1 + Cluster name: monarch-skypilot-test + +[1] Creating SkyPilot job... + +[2] Launching cluster and starting Monarch workers... +No cached job found at path: .monarch/job_state.pkl +Applying current job +Launching SkyPilot cluster 'monarch-skypilot-test' with 2 nodes +Running on cluster: monarch-skypilot-test +SkyPilot cluster 'monarch-skypilot-test' launched successfully +Waiting for job 1 setup to complete (timeout=300s)... +Job 1 status: JobStatus.SETTING_UP (waited 5s) +Job 1 is now RUNNING (setup complete) +Saving job to cache at .monarch/job_state.pkl +Job has started, connecting to current state +Found 2 nodes ready +Connecting to workers for mesh 'trainers': ['tcp://10.0.4.22:22222', 'tcp://10.0.4.112:22222'] +Monarch internal logs are being written to /tmp/sky/monarch_log.log; execution id sky_Dec-11_01:31_653 +Waiting for host mesh 'trainers' to initialize... +Host mesh 'trainers' initialized successfully +Host mesh 'trainers' ready + Got host mesh with extent: {hosts: 2} + +[3] Spawning processes on cloud hosts... + Process mesh extent: {hosts: 2, gpus: 1} + +[4] Spawning Counter actors... + +[5] Broadcasting increment to all counters... + +[6] Getting counter values... + Counter values: ValueMesh({hosts: 2, gpus: 1}): + (({'hosts': 0/2, 'gpus': 0/1}, 3), ({'hosts': 1/2, 'gpus': 0/1}, 3)) + +[7] Spawning Trainer actors... + +[8] Performing distributed training step... + ({'hosts': 0/2, 'gpus': 0/1}, "Trainer {'hosts': 0/2, 'gpus': 0/1} taking a step.") + ({'hosts': 1/2, 'gpus': 0/1}, "Trainer {'hosts': 1/2, 'gpus': 0/1} taking a step.") + +[9] Getting trainer info... 
+ ({'hosts': 0/2, 'gpus': 0/1}, "Trainer at rank {'hosts': 0/2, 'gpus': 0/1}") + ({'hosts': 1/2, 'gpus': 0/1}, "Trainer at rank {'hosts': 1/2, 'gpus': 0/1}") + +============================================================ +Success! Monarch actors ran on SkyPilot cluster! +============================================================ + +[10] Cleaning up SkyPilot cluster... +Tearing down SkyPilot cluster 'monarch-skypilot-test' +Cluster 'monarch-skypilot-test' terminated + Cluster terminated. +``` + ## Configuration Options | Parameter | Description | Default | diff --git a/examples/skypilot/skypilot_ddp.py b/examples/skypilot/skypilot_ddp.py new file mode 100644 index 000000000..9b9657428 --- /dev/null +++ b/examples/skypilot/skypilot_ddp.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 +""" +Monarch DDP Example with SkyPilot +================================= + +This script demonstrates running PyTorch DDP (DistributedDataParallel) training +on cloud infrastructure provisioned by SkyPilot. + +Adapted from the SLURM DDP example (slurm_ddp.ipynb). + +Usage: + python skypilot_ddp.py --num-hosts 2 --gpus-per-host 1 +""" + +import argparse +import asyncio +import logging +import os +import sys + +# Set timeouts before importing monarch +os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s" +os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s" +os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s" + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.optim as optim + +from monarch.actor import Actor, current_rank, endpoint +from monarch.utils import setup_env_for_distributed +from torch.nn.parallel import DistributedDataParallel as DDP + +# Import SkyPilotJob from local module +from skypilot_job import SkyPilotJob + +try: + import sky +except ImportError: + print("ERROR: SkyPilot is not installed. 
Run: pip install skypilot[kubernetes]") + sys.exit(1) + +logging.basicConfig( + level=logging.INFO, + format="%(name)s %(asctime)s %(levelname)s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + force=True, +) +logger = logging.getLogger(__name__) + + +class ToyModel(nn.Module): + """A simple toy model for demonstration purposes.""" + + def __init__(self): + super(ToyModel, self).__init__() + self.net1 = nn.Linear(10, 10) + self.relu = nn.ReLU() + self.net2 = nn.Linear(10, 5) + + def forward(self, x): + return self.net2(self.relu(self.net1(x))) + + +class DDPActor(Actor): + """This Actor wraps the basic functionality from Torch's DDP example. + + Adapted from: https://docs.pytorch.org/tutorials/intermediate/ddp_tutorial.html#basic-use-case + """ + + def __init__(self): + self.rank = current_rank().rank + + def _rprint(self, msg): + """Helper method to print with rank information.""" + print(f"{self.rank=} {msg}") + + @endpoint + async def setup(self): + """Initialize the PyTorch distributed process group.""" + self._rprint("Initializing torch distributed") + + WORLD_SIZE = int(os.environ["WORLD_SIZE"]) + # initialize the process group + dist.init_process_group("gloo", rank=self.rank, world_size=WORLD_SIZE) + self._rprint("Finished initializing torch distributed") + + @endpoint + async def cleanup(self): + """Clean up the PyTorch distributed process group.""" + self._rprint("Cleaning up torch distributed") + dist.destroy_process_group() + + @endpoint + async def demo_basic(self): + """Run a basic DDP training example.""" + self._rprint("Running basic DDP example") + + # create model and move it to GPU with id rank + local_rank = int(os.environ["LOCAL_RANK"]) + self._rprint(f"{local_rank=}") + model = ToyModel().to(local_rank) + ddp_model = DDP(model, device_ids=[local_rank]) + + loss_fn = nn.MSELoss() + optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) + + optimizer.zero_grad() + outputs = ddp_model(torch.randn(20, 10)) + labels = torch.randn(20, 
5).to(local_rank) + loss_fn(outputs, labels).backward() + optimizer.step() + + print(f"{self.rank=} Finished running basic DDP example") + + +def get_cloud(cloud_name: str): + """Get SkyPilot cloud object from name.""" + clouds = { + "kubernetes": sky.Kubernetes, + "aws": sky.AWS, + "gcp": sky.GCP, + "azure": sky.Azure, + } + if cloud_name.lower() not in clouds: + raise ValueError(f"Unknown cloud: {cloud_name}. Available: {list(clouds.keys())}") + return clouds[cloud_name.lower()]() + + +async def main(): + parser = argparse.ArgumentParser(description="Monarch DDP with SkyPilot") + parser.add_argument("--cloud", default="kubernetes", help="Cloud provider") + parser.add_argument("--num-hosts", type=int, default=2, help="Number of hosts") + parser.add_argument("--gpus-per-host", type=int, default=1, help="GPUs per host") + parser.add_argument("--cluster-name", default="monarch-ddp", help="Cluster name") + parser.add_argument("--accelerator", default="H200:1", help="GPU accelerator") + args = parser.parse_args() + + print("=" * 60) + print("Monarch DDP Example with SkyPilot") + print("=" * 60) + print(f"\nConfiguration:") + print(f" Cloud: {args.cloud}") + print(f" Hosts: {args.num_hosts}") + print(f" GPUs per host: {args.gpus_per_host}") + print(f" Accelerator: {args.accelerator}") + + # Create SkyPilot job + job = SkyPilotJob( + meshes={"mesh0": args.num_hosts}, + resources=sky.Resources( + cloud=get_cloud(args.cloud), + accelerators=args.accelerator, + ), + cluster_name=args.cluster_name, + idle_minutes_to_autostop=10, + down_on_autostop=True, + ) + + try: + print("\n[1] Launching SkyPilot cluster...") + job_state = job.state() + + print("\n[2] Creating process mesh...") + proc_mesh = job_state.mesh0.spawn_procs({"gpus": args.gpus_per_host}) + print(f" Process mesh extent: {proc_mesh.extent}") + + print("\n[3] Spawning DDP actors...") + ddp_actor = proc_mesh.spawn("ddp_actor", DDPActor) + + print("\n[4] Setting up distributed environment...") + await 
setup_env_for_distributed(proc_mesh) + + print("\n[5] Running DDP example...") + await ddp_actor.setup.call() + await ddp_actor.demo_basic.call() + await ddp_actor.cleanup.call() + + print("\n" + "=" * 60) + print("DDP example completed successfully!") + print("=" * 60) + + except Exception as e: + print(f"\nERROR: {e}") + import traceback + traceback.print_exc() + print(f"\nNot cleaning up cluster for debugging...") + print(f" Debug with: sky ssh {args.cluster_name}") + print(f" Clean up: sky down {args.cluster_name}") + raise + else: + print("\n[6] Cleaning up SkyPilot cluster...") + job.kill() + print(" Done!") + + +if __name__ == "__main__": + asyncio.run(main()) + diff --git a/examples/skypilot/skypilot_titan.py b/examples/skypilot/skypilot_titan.py new file mode 100644 index 000000000..1f4930ba6 --- /dev/null +++ b/examples/skypilot/skypilot_titan.py @@ -0,0 +1,245 @@ +#!/usr/bin/env python3 +""" +Monarch + TorchTitan Example with SkyPilot +========================================== + +This script demonstrates running TorchTitan distributed training on cloud +infrastructure provisioned by SkyPilot. + +Adapted from the SLURM TorchTitan example (slurm_titan.ipynb). 
+ +Prerequisites: + - TorchTitan installed: pip install torchtitan + - Model config file (e.g., debug_model.toml) + - Tokenizer files in ./tokenizer/ + +Usage: + python skypilot_titan.py --num-hosts 2 --gpus-per-host 1 --config debug_model.toml +""" + +import argparse +import asyncio +import logging +import os +import sys +from dataclasses import dataclass + +# Set timeouts before importing monarch +os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s" +os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s" +os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s" + +# Check for TorchTitan +try: + from torchtitan.train import Trainer + from torchtitan.config import ConfigManager, JobConfig + from torchtitan.tools.logging import init_logger, logger as titan_logger + HAS_TORCHTITAN = True +except ImportError: + HAS_TORCHTITAN = False + print("WARNING: TorchTitan is not installed. Install with: pip install torchtitan") + print("This example will show the structure but cannot run training.") + +import torch +from monarch.actor import Actor, current_rank, endpoint +from monarch.utils import setup_env_for_distributed + +# Import SkyPilotJob from local module +from skypilot_job import SkyPilotJob + +try: + import sky +except ImportError: + print("ERROR: SkyPilot is not installed. 
Run: pip install skypilot[kubernetes]") + sys.exit(1) + +logging.basicConfig( + level=logging.INFO, + format="%(name)s %(asctime)s %(levelname)s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + force=True, +) +logger = logging.getLogger(__name__) + + +@dataclass +class RunParams: + """Parameters for training job.""" + training_steps: int = 50 + model_config: str = "debug_model.toml" + dataset: str = "c4" + num_nodes: int = 2 + gpus_per_node: int = 1 + + +if HAS_TORCHTITAN: + class TrainerActor(Actor): + """A wrapper class that executes a TorchTitan trainer in a Monarch actor.""" + + def __init__(self, job_config: "JobConfig") -> None: + self.job_config = job_config + rank = current_rank().rank + self.uid = f"[trainer_{rank}]" + + @endpoint + async def start_training(self) -> None: + init_logger() + trainer = None + + try: + trainer = Trainer(self.job_config) + titan_logger.info(f"{self.uid} initialized successfully and starting training") + trainer.train() + except Exception: + if trainer: + trainer.close() + raise + else: + trainer.close() + finally: + torch.distributed.destroy_process_group() + titan_logger.info(f"{self.uid} trainer cleaned up") + + +def make_job_config(run_params: RunParams, script_dir: str) -> "JobConfig": + """Create a job config for TorchTitan.""" + if not HAS_TORCHTITAN: + raise RuntimeError("TorchTitan is not installed") + + data_parallel_shard_degree = run_params.num_nodes * run_params.gpus_per_node + output_path = "./outputs" + + default_args = [ + "--job.config_file", + os.path.join(script_dir, run_params.model_config), + "--model.tokenizer_path", + os.path.join(script_dir, "tokenizer"), + "--comm.trace_buf_size", + "0", + "--metrics.log_freq", + "1", + "--parallelism.data_parallel_shard_degree", + str(data_parallel_shard_degree), + "--activation_checkpoint.mode", + "full", + "--comm.train_timeout_seconds", + "60", + "--training.steps", + str(run_params.training_steps), + "--training.dataset", + run_params.dataset, + "--job.dump_folder", + 
output_path, + "--metrics.enable_tensorboard", + ] + + config_manager = ConfigManager() + job_config = config_manager.parse_args(default_args) + + return job_config + + +def get_cloud(cloud_name: str): + """Get SkyPilot cloud object from name.""" + clouds = { + "kubernetes": sky.Kubernetes, + "aws": sky.AWS, + "gcp": sky.GCP, + "azure": sky.Azure, + } + if cloud_name.lower() not in clouds: + raise ValueError(f"Unknown cloud: {cloud_name}. Available: {list(clouds.keys())}") + return clouds[cloud_name.lower()]() + + +async def main(): + parser = argparse.ArgumentParser(description="Monarch + TorchTitan with SkyPilot") + parser.add_argument("--cloud", default="kubernetes", help="Cloud provider") + parser.add_argument("--num-hosts", type=int, default=2, help="Number of hosts") + parser.add_argument("--gpus-per-host", type=int, default=1, help="GPUs per host") + parser.add_argument("--cluster-name", default="monarch-titan", help="Cluster name") + parser.add_argument("--accelerator", default="H200:1", help="GPU accelerator") + parser.add_argument("--config", default="debug_model.toml", help="TorchTitan config file") + parser.add_argument("--steps", type=int, default=50, help="Training steps") + args = parser.parse_args() + + if not HAS_TORCHTITAN: + print("ERROR: TorchTitan is required for this example.") + print("Install with: pip install torchtitan") + sys.exit(1) + + print("=" * 60) + print("Monarch + TorchTitan with SkyPilot") + print("=" * 60) + print(f"\nConfiguration:") + print(f" Cloud: {args.cloud}") + print(f" Hosts: {args.num_hosts}") + print(f" GPUs per host: {args.gpus_per_host}") + print(f" Accelerator: {args.accelerator}") + print(f" Config: {args.config}") + print(f" Steps: {args.steps}") + + # Setup run parameters + run_params = RunParams( + training_steps=args.steps, + model_config=args.config, + num_nodes=args.num_hosts, + gpus_per_node=args.gpus_per_host, + ) + + script_dir = os.path.dirname(os.path.abspath(__file__)) + job_config = 
make_job_config(run_params, script_dir) + + # Create SkyPilot job + job = SkyPilotJob( + meshes={"mesh0": args.num_hosts}, + resources=sky.Resources( + cloud=get_cloud(args.cloud), + accelerators=args.accelerator, + ), + cluster_name=args.cluster_name, + idle_minutes_to_autostop=10, + down_on_autostop=True, + ) + + try: + print("\n[1] Launching SkyPilot cluster...") + job_state = job.state() + + print("\n[2] Creating process mesh...") + proc_mesh = job_state.mesh0.spawn_procs({"gpus": args.gpus_per_host}) + print(f" Process mesh extent: {proc_mesh.extent}") + + print("\n[3] Configuring remote logging...") + await proc_mesh.logging_option(stream_to_client=True) + + print("\n[4] Setting up distributed environment...") + await setup_env_for_distributed(proc_mesh) + + print("\n[5] Spawning TrainerActor...") + trainer = proc_mesh.spawn("trainer_actor", TrainerActor, job_config) + + print("\n[6] Starting training...") + await trainer.start_training.call() + + print("\n" + "=" * 60) + print("Training completed successfully!") + print("=" * 60) + + except Exception as e: + print(f"\nERROR: {e}") + import traceback + traceback.print_exc() + print(f"\nNot cleaning up cluster for debugging...") + print(f" Debug with: sky ssh {args.cluster_name}") + print(f" Clean up: sky down {args.cluster_name}") + raise + else: + print("\n[7] Cleaning up SkyPilot cluster...") + job.kill() + print(" Done!") + + +if __name__ == "__main__": + asyncio.run(main()) + From 2132e3cbb190a19e438d8dd11334b80765bd3dd4 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 10 Dec 2025 23:53:28 -0800 Subject: [PATCH 14/29] Update README.md --- examples/skypilot/README.md | 39 +++++++++---------------------------- 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md index 153089ad1..e9c6a2ca9 100644 --- a/examples/skypilot/README.md +++ b/examples/skypilot/README.md @@ -8,8 +8,8 @@ This directory contains a standalone integration for 
running Monarch workloads o **Supported platforms:** - Kubernetes (any cluster) -- AWS, GCP, Azure -- Lambda Labs, CoreWeave, RunPod, and [20+ other clouds](https://docs.skypilot.co/en/latest/getting-started/installation.html) +- Hyperscalers: AWS, GCP, Azure +- Neoclouds: CoreWeave, Nebius, and [20+ other clouds](https://docs.skypilot.co/en/latest/getting-started/installation.html) ## Installation @@ -27,6 +27,8 @@ pip install skypilot[all] # For all clouds sky check ``` +TODO(romilb): Link to SkyPilot docs for k8s setup + ## Quick Start ```python @@ -149,45 +151,22 @@ Cluster 'monarch-skypilot-test' terminated Cluster terminated. ``` -## Configuration Options - -| Parameter | Description | Default | -|-----------|-------------|---------| -| `meshes` | Dict mapping mesh names to node counts | Required | -| `resources` | SkyPilot Resources specification | None (SkyPilot defaults) | -| `cluster_name` | Name for the cluster | Auto-generated | -| `monarch_port` | Port for Monarch TCP communication | 22222 | -| `idle_minutes_to_autostop` | Auto-stop after idle time | None | -| `down_on_autostop` | Tear down on autostop vs just stop | False | -| `setup_commands` | Custom setup script | Installs torchmonarch-nightly | -| `workdir` | Local directory to sync to cluster | None | -| `file_mounts` | Additional files to mount | None | - ## Default Image -By default, `SkyPilotJob` uses the `pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime` Docker image which has compatible system libraries for `torchmonarch-nightly`. Setup time is ~1-2 minutes (just pip install). +By default, `SkyPilotJob` uses the `pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime` Docker image which has compatible system libraries for `torchmonarch-nightly`. TODO(romilb): mention image requirements. 
-## Faster Cold Starts +## Faster Cold Starts with SkyPilot's cluster reuse -For faster cold starts (<30s): - -**Option 1: Use a pre-built Docker image** -```python -resources = sky.Resources( - image_id="docker:your-registry/monarch-image:tag", - accelerators="H100:1", -) -``` - -**Option 2: Use SkyPilot's cluster reuse** +TODO(romilb): Validate if this works: ```python job = SkyPilotJob( ..., idle_minutes_to_autostop=30, # Keep cluster alive - down_on_autostop=False, # Just stop, don't terminate ) ``` +TODO(romilb): Benchmark pre-baked container images + ## Network Requirements The client must have direct network connectivity to the worker nodes: From 0b1e5fd947987c0748e815327c77983d5c95c4dd Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 11 Dec 2025 16:19:42 -0800 Subject: [PATCH 15/29] Clean up, add run_getting_started --- examples/skypilot/run_getting_started.yaml | 80 ++++++ examples/skypilot/skypilot_getting_started.py | 26 +- examples/skypilot/skypilot_job.py | 41 ++- examples/skypilot/skypilot_titan.py | 245 ------------------ python/monarch/job/__init__.py | 9 +- 5 files changed, 115 insertions(+), 286 deletions(-) create mode 100644 examples/skypilot/run_getting_started.yaml delete mode 100644 examples/skypilot/skypilot_titan.py diff --git a/examples/skypilot/run_getting_started.yaml b/examples/skypilot/run_getting_started.yaml new file mode 100644 index 000000000..c42b6e7ca --- /dev/null +++ b/examples/skypilot/run_getting_started.yaml @@ -0,0 +1,80 @@ +# SkyPilot YAML for running the Monarch Getting Started example. +# +# This YAML file syncs the example directory, installs dependencies, +# and runs the getting started example. 
+# +# Usage: +# cd monarch/examples/skypilot +# sky launch run_getting_started.yaml -c monarch-demo +# +# To view logs: +# sky logs monarch-demo +# +# To SSH into the cluster: +# sky ssh monarch-demo +# +# To tear down: +# sky down monarch-demo + +name: monarch-getting-started + +resources: + cloud: kubernetes # Optional, remove or change to your preferred cloud provider + cpus: 2+ # No GPUs needed for the driver script + image_id: docker:pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime + +# Sync the current directory (examples/skypilot) to the cluster +workdir: . + +setup: | + set -ex + + echo "=== Installing system dependencies ===" + # Install socat (required for SkyPilot Kubernetes portforward networking) and curl + apt-get update && apt-get install -y socat curl + + # Install kubectl for Kubernetes cluster management + echo "=== Installing kubectl ===" + curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + chmod +x kubectl + mv kubectl /usr/local/bin/ + kubectl version --client || echo "kubectl installed" + + echo "=== Installing Python dependencies ===" + uv pip install --system torchmonarch-nightly + # Install SkyPilot with Kubernetes support for launching nested clusters + uv pip install --system "skypilot[kubernetes]" + + # Verify installations + python -c "import monarch; print(f'Monarch installed: {monarch}')" + python -c "import sky; print(f'SkyPilot installed: {sky}')" + + # Configure SkyPilot to use in-cluster Kubernetes context + # This allows the driver pod to launch nested SkyPilot clusters + unset SKYPILOT_IN_CLUSTER_CONTEXT_NAME + sky api start + + # Verify Kubernetes access + echo "=== Verifying Kubernetes access ===" + sky check kubernetes + + echo "=== GPUs available ===" + sky show-gpus --infra kubernetes + + echo "=== Setup complete ===" + +run: | + echo "=== Running Monarch Getting Started with SkyPilot ===" + + # Run the getting started example + # This will launch a SkyPilot 
cluster with Monarch workers. + # Change the arguments to your desired values. + python skypilot_getting_started.py \ + --cloud kubernetes \ + --num-hosts 2 \ + --gpus-per-host 1 \ + --cluster-name monarch-workers \ + --accelerator "H200:1" + + echo "=== Example ran successfully ===" + diff --git a/examples/skypilot/skypilot_getting_started.py b/examples/skypilot/skypilot_getting_started.py index b9f703bee..814f4e6d0 100644 --- a/examples/skypilot/skypilot_getting_started.py +++ b/examples/skypilot/skypilot_getting_started.py @@ -10,16 +10,14 @@ pip install torchmonarch-nightly pip install skypilot[kubernetes] # or skypilot[aws], skypilot[gcp], etc. sky check # Verify SkyPilot configuration + sky show-gpus --infra kubernetes # Verify GPUs available Usage: - # Run on Kubernetes: - python getting_started.py --cloud kubernetes --num-hosts 2 + # Run on Kubernetes with 2 nodes, 8 GPUs per node + python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-host 8 --gpus "H200:8" - # Run on AWS: - python getting_started.py --cloud aws --num-hosts 2 - - # Run on GCP: - python getting_started.py --cloud gcp --num-hosts 2 + # Run on cloud VMs + python skypilot_getting_started.py --cloud --num-hosts 2 --gpus-per-host 1 --gpus "H100:1" """ import argparse @@ -54,7 +52,7 @@ from skypilot_job import SkyPilotJob # ============================================================================ -# Step 1: Define actors (same as getting started guide) +# Step 1: Define actors # ============================================================================ @@ -88,7 +86,7 @@ def get_info(self) -> str: # ============================================================================ -# Step 2: Create a SkyPilot Job to provision cloud infrastructure +# Step 2: Create a SkyPilot Job to provision k8s pods/cloud VMs # ============================================================================ @@ -99,7 +97,10 @@ def get_cloud(cloud_name: str): "aws": sky.AWS, "gcp": sky.GCP, 
"azure": sky.Azure, - "lambda": sky.Lambda, + "nebius": sky.Nebius, + # "slurm": sky.Slurm, + # "ssh": sky.SSH, + # TODO(romilb): Add other clouds } if cloud_name.lower() not in clouds: raise ValueError(f"Unknown cloud: {cloud_name}. Available: {list(clouds.keys())}") @@ -111,7 +112,7 @@ def main(): parser.add_argument( "--cloud", default="kubernetes", - help="Cloud provider to use (kubernetes, aws, gcp, azure, lambda)", + help="Cloud provider to use (kubernetes, aws, gcp, azure, ssh)", ) parser.add_argument( "--num-hosts", @@ -165,12 +166,13 @@ def main(): if args.region: resources_kwargs["region"] = args.region + # Create a SkyPilotJob to provision nodes job = SkyPilotJob( # Define the mesh of hosts we need meshes={"trainers": args.num_hosts}, resources=sky.Resources(**resources_kwargs), cluster_name=args.cluster_name, - # Auto-cleanup after 10 minutes of idle time + # Auto-cleanup after 10 minutes of idle time (recommended for auto clean up if the job/controller fails) idle_minutes_to_autostop=10, down_on_autostop=True, ) diff --git a/examples/skypilot/skypilot_job.py b/examples/skypilot/skypilot_job.py index 7b5ea1178..edb418da8 100644 --- a/examples/skypilot/skypilot_job.py +++ b/examples/skypilot/skypilot_job.py @@ -1,9 +1,7 @@ """ -SkyPilot integration for Monarch - standalone implementation. +SkyPilotJob for Monarch. -This module provides SkyPilotJob, which allows running Monarch workloads on -Kubernetes and cloud VMs via SkyPilot. It is designed to be used independently -of the main Monarch source tree. +SkyPilotJob allows running Monarch on Kubernetes and cloud VMs via SkyPilot. 
Requirements: - pip install torchmonarch-nightly (or torchmonarch) @@ -16,7 +14,6 @@ import time from typing import Dict, List, Optional, TYPE_CHECKING -# Import Monarch's job interface from monarch._src.job.job import JobState, JobTrait # If running inside a SkyPilot cluster, unset the in-cluster context variable @@ -25,7 +22,6 @@ if "SKYPILOT_IN_CLUSTER_CONTEXT_NAME" in os.environ: del os.environ["SKYPILOT_IN_CLUSTER_CONTEXT_NAME"] -# Defer imports that may not be available in all environments if TYPE_CHECKING: import sky @@ -46,21 +42,22 @@ # Default port for Monarch TCP communication DEFAULT_MONARCH_PORT = 22222 +# Timeout for waiting for the job to reach RUNNING status. +JOB_TIMEOUT = 300 # seconds + # Default setup commands to install Monarch from PyPI on remote workers. -# Requires a Docker image with Ubuntu 22.04+ for compatible libibverbs. +# Requires a Docker image with Ubuntu 22.04+ with RDMA dependencies. +# In this implementation, we default to pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime image. # -# Cold start time: ~1-2 minutes (pip install only). # For faster cold starts (<30s), use a custom Docker image with Monarch pre-installed. DEFAULT_SETUP_COMMANDS = """ set -ex # Install torchmonarch from PyPI -pip install torchmonarch-nightly +uv pip install --system torchmonarch-nightly echo "Done installing Monarch" """ - -# Default Docker image - PyTorch with CUDA on Ubuntu 22.04 (has compatible libibverbs) DEFAULT_IMAGE_ID = "docker:pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime" @@ -81,16 +78,20 @@ def _attach_to_workers_wrapper(name: str, ca: str, workers: List[str]): class SkyPilotJob(JobTrait): """ - A job scheduler that uses SkyPilot to provision cloud instances. + SkyPilotJob to provision and manage Monarch workers on K8s and cloud VMs. - SkyPilot supports multiple cloud providers (AWS, GCP, Azure, Lambda, etc.) - and Kubernetes, and can automatically select the cheapest available option.
+ SkyPilot supports multiple backends - Kubernetes and VMs on AWS, GCP, Azure, + CoreWeave, Nebius, and 20+ other clouds. This implementation: 1. Uses sky.launch() to provision cloud instances with specified resources 2. Runs Monarch workers on each node via a startup script 3. Connects to workers using their IP addresses from the cluster handle + Caveats: + * For Kubernetes, the driver/client must be run inside the same cluster. + TODO(romilb): Explore if loadbalancer can be used to connect to workers. + Example: >>> import sky >>> from skypilot_job import SkyPilotJob @@ -198,12 +199,10 @@ def _create(self, client_script: Optional[str]) -> None: # Set resources, using default image_id if not specified resources = self._resources if resources is not None: - # If no image_id specified, use the default PyTorch image if resources.image_id is None: resources = resources.copy(image_id=DEFAULT_IMAGE_ID) task.set_resources(resources) else: - # No resources specified, create default with image_id task.set_resources(sky.Resources(image_id=DEFAULT_IMAGE_ID)) # Generate cluster name if not provided @@ -229,9 +228,9 @@ def _create(self, client_script: Optional[str]) -> None: logger.info(f"SkyPilot cluster '{cluster_name}' launched successfully") # Wait for the job to be RUNNING (setup complete, run started) - self._wait_for_job_running(cluster_name, job_id, timeout=300) + self._wait_for_job_running(cluster_name, job_id, timeout=JOB_TIMEOUT) - def _wait_for_job_running(self, cluster_name: str, job_id: int, timeout: int = 300) -> None: + def _wait_for_job_running(self, cluster_name: str, job_id: int, timeout: int = JOB_TIMEOUT) -> None: """Wait for the SkyPilot job to reach RUNNING status (setup complete).""" start_time = time.time() poll_interval = 10 # seconds @@ -301,9 +300,9 @@ def _build_worker_command(self) -> str: escaped_code = python_code.replace("'", "'\"'\"'") # Set timeout env vars env_vars = " ".join([ - "export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=5m", - "export
HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT=5m", - "export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE=5m", + f"export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT={JOB_TIMEOUT}s", + f"export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT={JOB_TIMEOUT}s", + f"export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE={JOB_TIMEOUT}s", ]) return f"{env_vars} && {self._python_exe} -c '{escaped_code}'" diff --git a/examples/skypilot/skypilot_titan.py b/examples/skypilot/skypilot_titan.py deleted file mode 100644 index 1f4930ba6..000000000 --- a/examples/skypilot/skypilot_titan.py +++ /dev/null @@ -1,245 +0,0 @@ -#!/usr/bin/env python3 -""" -Monarch + TorchTitan Example with SkyPilot -========================================== - -This script demonstrates running TorchTitan distributed training on cloud -infrastructure provisioned by SkyPilot. - -Adapted from the SLURM TorchTitan example (slurm_titan.ipynb). - -Prerequisites: - - TorchTitan installed: pip install torchtitan - - Model config file (e.g., debug_model.toml) - - Tokenizer files in ./tokenizer/ - -Usage: - python skypilot_titan.py --num-hosts 2 --gpus-per-host 1 --config debug_model.toml -""" - -import argparse -import asyncio -import logging -import os -import sys -from dataclasses import dataclass - -# Set timeouts before importing monarch -os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s" -os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s" -os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s" - -# Check for TorchTitan -try: - from torchtitan.train import Trainer - from torchtitan.config import ConfigManager, JobConfig - from torchtitan.tools.logging import init_logger, logger as titan_logger - HAS_TORCHTITAN = True -except ImportError: - HAS_TORCHTITAN = False - print("WARNING: TorchTitan is not installed. 
Install with: pip install torchtitan") - print("This example will show the structure but cannot run training.") - -import torch -from monarch.actor import Actor, current_rank, endpoint -from monarch.utils import setup_env_for_distributed - -# Import SkyPilotJob from local module -from skypilot_job import SkyPilotJob - -try: - import sky -except ImportError: - print("ERROR: SkyPilot is not installed. Run: pip install skypilot[kubernetes]") - sys.exit(1) - -logging.basicConfig( - level=logging.INFO, - format="%(name)s %(asctime)s %(levelname)s %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - force=True, -) -logger = logging.getLogger(__name__) - - -@dataclass -class RunParams: - """Parameters for training job.""" - training_steps: int = 50 - model_config: str = "debug_model.toml" - dataset: str = "c4" - num_nodes: int = 2 - gpus_per_node: int = 1 - - -if HAS_TORCHTITAN: - class TrainerActor(Actor): - """A wrapper class that executes a TorchTitan trainer in a Monarch actor.""" - - def __init__(self, job_config: "JobConfig") -> None: - self.job_config = job_config - rank = current_rank().rank - self.uid = f"[trainer_{rank}]" - - @endpoint - async def start_training(self) -> None: - init_logger() - trainer = None - - try: - trainer = Trainer(self.job_config) - titan_logger.info(f"{self.uid} initialized successfully and starting training") - trainer.train() - except Exception: - if trainer: - trainer.close() - raise - else: - trainer.close() - finally: - torch.distributed.destroy_process_group() - titan_logger.info(f"{self.uid} trainer cleaned up") - - -def make_job_config(run_params: RunParams, script_dir: str) -> "JobConfig": - """Create a job config for TorchTitan.""" - if not HAS_TORCHTITAN: - raise RuntimeError("TorchTitan is not installed") - - data_parallel_shard_degree = run_params.num_nodes * run_params.gpus_per_node - output_path = "./outputs" - - default_args = [ - "--job.config_file", - os.path.join(script_dir, run_params.model_config), - 
"--model.tokenizer_path", - os.path.join(script_dir, "tokenizer"), - "--comm.trace_buf_size", - "0", - "--metrics.log_freq", - "1", - "--parallelism.data_parallel_shard_degree", - str(data_parallel_shard_degree), - "--activation_checkpoint.mode", - "full", - "--comm.train_timeout_seconds", - "60", - "--training.steps", - str(run_params.training_steps), - "--training.dataset", - run_params.dataset, - "--job.dump_folder", - output_path, - "--metrics.enable_tensorboard", - ] - - config_manager = ConfigManager() - job_config = config_manager.parse_args(default_args) - - return job_config - - -def get_cloud(cloud_name: str): - """Get SkyPilot cloud object from name.""" - clouds = { - "kubernetes": sky.Kubernetes, - "aws": sky.AWS, - "gcp": sky.GCP, - "azure": sky.Azure, - } - if cloud_name.lower() not in clouds: - raise ValueError(f"Unknown cloud: {cloud_name}. Available: {list(clouds.keys())}") - return clouds[cloud_name.lower()]() - - -async def main(): - parser = argparse.ArgumentParser(description="Monarch + TorchTitan with SkyPilot") - parser.add_argument("--cloud", default="kubernetes", help="Cloud provider") - parser.add_argument("--num-hosts", type=int, default=2, help="Number of hosts") - parser.add_argument("--gpus-per-host", type=int, default=1, help="GPUs per host") - parser.add_argument("--cluster-name", default="monarch-titan", help="Cluster name") - parser.add_argument("--accelerator", default="H200:1", help="GPU accelerator") - parser.add_argument("--config", default="debug_model.toml", help="TorchTitan config file") - parser.add_argument("--steps", type=int, default=50, help="Training steps") - args = parser.parse_args() - - if not HAS_TORCHTITAN: - print("ERROR: TorchTitan is required for this example.") - print("Install with: pip install torchtitan") - sys.exit(1) - - print("=" * 60) - print("Monarch + TorchTitan with SkyPilot") - print("=" * 60) - print(f"\nConfiguration:") - print(f" Cloud: {args.cloud}") - print(f" Hosts: {args.num_hosts}") - 
print(f" GPUs per host: {args.gpus_per_host}") - print(f" Accelerator: {args.accelerator}") - print(f" Config: {args.config}") - print(f" Steps: {args.steps}") - - # Setup run parameters - run_params = RunParams( - training_steps=args.steps, - model_config=args.config, - num_nodes=args.num_hosts, - gpus_per_node=args.gpus_per_host, - ) - - script_dir = os.path.dirname(os.path.abspath(__file__)) - job_config = make_job_config(run_params, script_dir) - - # Create SkyPilot job - job = SkyPilotJob( - meshes={"mesh0": args.num_hosts}, - resources=sky.Resources( - cloud=get_cloud(args.cloud), - accelerators=args.accelerator, - ), - cluster_name=args.cluster_name, - idle_minutes_to_autostop=10, - down_on_autostop=True, - ) - - try: - print("\n[1] Launching SkyPilot cluster...") - job_state = job.state() - - print("\n[2] Creating process mesh...") - proc_mesh = job_state.mesh0.spawn_procs({"gpus": args.gpus_per_host}) - print(f" Process mesh extent: {proc_mesh.extent}") - - print("\n[3] Configuring remote logging...") - await proc_mesh.logging_option(stream_to_client=True) - - print("\n[4] Setting up distributed environment...") - await setup_env_for_distributed(proc_mesh) - - print("\n[5] Spawning TrainerActor...") - trainer = proc_mesh.spawn("trainer_actor", TrainerActor, job_config) - - print("\n[6] Starting training...") - await trainer.start_training.call() - - print("\n" + "=" * 60) - print("Training completed successfully!") - print("=" * 60) - - except Exception as e: - print(f"\nERROR: {e}") - import traceback - traceback.print_exc() - print(f"\nNot cleaning up cluster for debugging...") - print(f" Debug with: sky ssh {args.cluster_name}") - print(f" Clean up: sky down {args.cluster_name}") - raise - else: - print("\n[7] Cleaning up SkyPilot cluster...") - job.kill() - print(" Done!") - - -if __name__ == "__main__": - asyncio.run(main()) - diff --git a/python/monarch/job/__init__.py b/python/monarch/job/__init__.py index 0f6ec1960..b6852a0a1 100644 --- 
a/python/monarch/job/__init__.py +++ b/python/monarch/job/__init__.py @@ -9,11 +9,4 @@ from monarch._src.job.slurm import SlurmJob # Define exports -__all__ = [ - "JobTrait", - "job_load", - "job_loads", - "JobState", - "LocalJob", - "SlurmJob", -] +__all__ = ["JobTrait", "job_load", "job_loads", "JobState", "LocalJob", "SlurmJob"] From 3cda869a354bf42f11d8d51a13726a1d07d3741c Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 11 Dec 2025 16:22:40 -0800 Subject: [PATCH 16/29] renaming --- examples/skypilot/README.md | 2 +- .../{run_getting_started.yaml => getting_started.sky.yaml} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename examples/skypilot/{run_getting_started.yaml => getting_started.sky.yaml} (97%) diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md index e9c6a2ca9..cbda51b08 100644 --- a/examples/skypilot/README.md +++ b/examples/skypilot/README.md @@ -1,4 +1,4 @@ -# Monarch SkyPilot Integration +# Running Monarch on Kubernetes and cloud VMs via SkyPilot This directory contains a standalone integration for running Monarch workloads on **Kubernetes and cloud VMs** via [SkyPilot](https://github.com/skypilot-org/skypilot). 
diff --git a/examples/skypilot/run_getting_started.yaml b/examples/skypilot/getting_started.sky.yaml similarity index 97% rename from examples/skypilot/run_getting_started.yaml rename to examples/skypilot/getting_started.sky.yaml index c42b6e7ca..0398cc873 100644 --- a/examples/skypilot/run_getting_started.yaml +++ b/examples/skypilot/getting_started.sky.yaml @@ -5,7 +5,7 @@ # # Usage: # cd monarch/examples/skypilot -# sky launch run_getting_started.yaml -c monarch-demo +# sky launch getting_started.sky.yaml -c monarch-demo # # To view logs: # sky logs monarch-demo From 32ee2d3d028f1ba8bf0f9b3316c4b24dc24d21d6 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 11 Dec 2025 16:41:15 -0800 Subject: [PATCH 17/29] Add DDP notebook --- ....yaml => monarch_getting_started.sky.yaml} | 0 examples/skypilot/skypilot_ddp.ipynb | 306 ++++++++++++++++++ examples/skypilot/skypilot_ddp.py | 200 ------------ 3 files changed, 306 insertions(+), 200 deletions(-) rename examples/skypilot/{getting_started.sky.yaml => monarch_getting_started.sky.yaml} (100%) create mode 100644 examples/skypilot/skypilot_ddp.ipynb delete mode 100644 examples/skypilot/skypilot_ddp.py diff --git a/examples/skypilot/getting_started.sky.yaml b/examples/skypilot/monarch_getting_started.sky.yaml similarity index 100% rename from examples/skypilot/getting_started.sky.yaml rename to examples/skypilot/monarch_getting_started.sky.yaml diff --git a/examples/skypilot/skypilot_ddp.ipynb b/examples/skypilot/skypilot_ddp.ipynb new file mode 100644 index 000000000..b309de8d5 --- /dev/null +++ b/examples/skypilot/skypilot_ddp.ipynb @@ -0,0 +1,306 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Monarch DDP Example with SkyPilot\n", + "\n", + "This notebook demonstrates running PyTorch DDP (DistributedDataParallel) training on cloud infrastructure provisioned by SkyPilot.\n", + "\n", + "Adapted from the SLURM DDP example (`slurm_ddp.ipynb`).\n", + "\n", + "## Prerequisites\n", + 
"\n", + "```bash\n", + "pip install torchmonarch-nightly\n", + "pip install skypilot[kubernetes] # or skypilot[aws], skypilot[gcp], etc.\n", + "sky check # Verify SkyPilot configuration\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports and Setup\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Set timeouts before importing monarch\n", + "os.environ[\"HYPERACTOR_HOST_SPAWN_READY_TIMEOUT\"] = \"300s\"\n", + "os.environ[\"HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT\"] = \"300s\"\n", + "os.environ[\"HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE\"] = \"300s\"\n", + "\n", + "import torch\n", + "import torch.distributed as dist\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "\n", + "import sky\n", + "from monarch.actor import Actor, current_rank, endpoint\n", + "from monarch.utils import setup_env_for_distributed\n", + "from torch.nn.parallel import DistributedDataParallel as DDP\n", + "\n", + "# Import SkyPilotJob from local module\n", + "from skypilot_job import SkyPilotJob\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define the Model and DDP Actor\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class ToyModel(nn.Module):\n", + " \"\"\"A simple toy model for demonstration purposes.\"\"\"\n", + "\n", + " def __init__(self):\n", + " super(ToyModel, self).__init__()\n", + " self.net1 = nn.Linear(10, 10)\n", + " self.relu = nn.ReLU()\n", + " self.net2 = nn.Linear(10, 5)\n", + "\n", + " def forward(self, x):\n", + " return self.net2(self.relu(self.net1(x)))\n", + "\n", + "\n", + "class DDPActor(Actor):\n", + " \"\"\"This Actor wraps the basic functionality from Torch's DDP example.\n", + "\n", + " Adapted from: https://docs.pytorch.org/tutorials/intermediate/ddp_tutorial.html#basic-use-case\n", + " \"\"\"\n", + "\n", + " 
def __init__(self):\n", + " self.rank = current_rank().rank\n", + "\n", + " @endpoint\n", + " async def setup(self) -> str:\n", + " \"\"\"Initialize the PyTorch distributed process group.\"\"\"\n", + " WORLD_SIZE = int(os.environ[\"WORLD_SIZE\"])\n", + " dist.init_process_group(\"gloo\", rank=self.rank, world_size=WORLD_SIZE)\n", + " return f\"Rank {self.rank}: Initialized distributed (world_size={WORLD_SIZE})\"\n", + "\n", + " @endpoint\n", + " async def cleanup(self) -> str:\n", + " \"\"\"Clean up the PyTorch distributed process group.\"\"\"\n", + " dist.destroy_process_group()\n", + " return f\"Rank {self.rank}: Cleaned up distributed\"\n", + "\n", + " @endpoint\n", + " async def demo_basic(self) -> str:\n", + " \"\"\"Run a basic DDP training example.\"\"\"\n", + " local_rank = int(os.environ[\"LOCAL_RANK\"])\n", + " model = ToyModel().to(local_rank)\n", + " ddp_model = DDP(model, device_ids=[local_rank])\n", + "\n", + " loss_fn = nn.MSELoss()\n", + " optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)\n", + "\n", + " optimizer.zero_grad()\n", + " outputs = ddp_model(torch.randn(20, 10))\n", + " labels = torch.randn(20, 5).to(local_rank)\n", + " loss = loss_fn(outputs, labels)\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " return f\"Rank {self.rank}: Training step complete (loss={loss.item():.4f})\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "Configure your cloud provider, cluster size, and GPU type below:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration - modify these values as needed\n", + "CLOUD = \"kubernetes\" # Options: kubernetes, aws, gcp, azure\n", + "NUM_HOSTS = 2\n", + "GPUS_PER_HOST = 1\n", + "CLUSTER_NAME = \"monarch-ddp\"\n", + "ACCELERATOR = \"H200:1\" # e.g., H100:1, A100:1, V100:1\n", + "\n", + "def get_cloud(cloud_name: str):\n", + " \"\"\"Get SkyPilot cloud object from 
name.\"\"\"\n", + " clouds = {\n", + " \"kubernetes\": sky.Kubernetes,\n", + " \"aws\": sky.AWS,\n", + " \"gcp\": sky.GCP,\n", + " \"azure\": sky.Azure,\n", + " }\n", + " if cloud_name.lower() not in clouds:\n", + " raise ValueError(f\"Unknown cloud: {cloud_name}. Available: {list(clouds.keys())}\")\n", + " return clouds[cloud_name.lower()]()\n", + "\n", + "print(f\"Configuration:\")\n", + "print(f\" Cloud: {CLOUD}\")\n", + "print(f\" Hosts: {NUM_HOSTS}\")\n", + "print(f\" GPUs per host: {GPUS_PER_HOST}\")\n", + "print(f\" Accelerator: {ACCELERATOR}\")\n", + "print(f\" Cluster name: {CLUSTER_NAME}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create SkyPilot Job\n", + "\n", + "Create a SkyPilot job to provision cloud instances:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "job = SkyPilotJob(\n", + " meshes={\"mesh0\": NUM_HOSTS},\n", + " resources=sky.Resources(\n", + " cloud=get_cloud(CLOUD),\n", + " accelerators=ACCELERATOR,\n", + " ),\n", + " cluster_name=CLUSTER_NAME,\n", + " idle_minutes_to_autostop=10,\n", + " down_on_autostop=True,\n", + ")\n", + "\n", + "print(f\"SkyPilot job created for cluster '{CLUSTER_NAME}'\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Launch Cluster and Create Process Mesh\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Launch the cluster and get the job state\n", + "print(\"Launching SkyPilot cluster...\")\n", + "job_state = job.state()\n", + "\n", + "# Create process mesh with GPUs\n", + "print(\"Creating process mesh...\")\n", + "proc_mesh = job_state.mesh0.spawn_procs({\"gpus\": GPUS_PER_HOST})\n", + "print(f\"Process mesh extent: {proc_mesh.extent}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spawn DDP Actors and Run Training\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Spawn DDP actors on the process mesh\n", + "print(\"Spawning DDP actors...\")\n", + "ddp_actor = proc_mesh.spawn(\"ddp_actor\", DDPActor)\n", + "\n", + "# Set up the distributed environment\n", + "print(\"Setting up distributed environment...\")\n", + "await setup_env_for_distributed(proc_mesh)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run the DDP example\n", + "print(\"Running DDP training...\\n\")\n", + "\n", + "# Initialize distributed process group\n", + "print(\"[1] Initializing distributed process group...\")\n", + "results = await ddp_actor.setup.call()\n", + "for coord, msg in results:\n", + " print(f\" {msg}\")\n", + "\n", + "# Run the basic DDP training example\n", + "print(\"\\n[2] Running DDP training step...\")\n", + "results = await ddp_actor.demo_basic.call()\n", + "for coord, msg in results:\n", + " print(f\" {msg}\")\n", + "\n", + "# Clean up distributed process group\n", + "print(\"\\n[3] Cleaning up distributed process group...\")\n", + "results = await ddp_actor.cleanup.call()\n", + "for coord, msg in results:\n", + " print(f\" {msg}\")\n", + "\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"DDP example completed successfully!\")\n", + "print(\"=\" * 60)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleanup\n", + "\n", + "Tear down the SkyPilot cluster when done:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Tear down the SkyPilot cluster\n", + "print(\"Cleaning up SkyPilot cluster...\")\n", + "job.kill()\n", + "print(\"Done!\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/skypilot/skypilot_ddp.py 
b/examples/skypilot/skypilot_ddp.py deleted file mode 100644 index 9b9657428..000000000 --- a/examples/skypilot/skypilot_ddp.py +++ /dev/null @@ -1,200 +0,0 @@ -#!/usr/bin/env python3 -""" -Monarch DDP Example with SkyPilot -================================= - -This script demonstrates running PyTorch DDP (DistributedDataParallel) training -on cloud infrastructure provisioned by SkyPilot. - -Adapted from the SLURM DDP example (slurm_ddp.ipynb). - -Usage: - python skypilot_ddp.py --num-hosts 2 --gpus-per-host 1 -""" - -import argparse -import asyncio -import logging -import os -import sys - -# Set timeouts before importing monarch -os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s" -os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s" -os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s" - -import torch -import torch.distributed as dist -import torch.nn as nn -import torch.optim as optim - -from monarch.actor import Actor, current_rank, endpoint -from monarch.utils import setup_env_for_distributed -from torch.nn.parallel import DistributedDataParallel as DDP - -# Import SkyPilotJob from local module -from skypilot_job import SkyPilotJob - -try: - import sky -except ImportError: - print("ERROR: SkyPilot is not installed. Run: pip install skypilot[kubernetes]") - sys.exit(1) - -logging.basicConfig( - level=logging.INFO, - format="%(name)s %(asctime)s %(levelname)s %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - force=True, -) -logger = logging.getLogger(__name__) - - -class ToyModel(nn.Module): - """A simple toy model for demonstration purposes.""" - - def __init__(self): - super(ToyModel, self).__init__() - self.net1 = nn.Linear(10, 10) - self.relu = nn.ReLU() - self.net2 = nn.Linear(10, 5) - - def forward(self, x): - return self.net2(self.relu(self.net1(x))) - - -class DDPActor(Actor): - """This Actor wraps the basic functionality from Torch's DDP example. 
- - Adapted from: https://docs.pytorch.org/tutorials/intermediate/ddp_tutorial.html#basic-use-case - """ - - def __init__(self): - self.rank = current_rank().rank - - def _rprint(self, msg): - """Helper method to print with rank information.""" - print(f"{self.rank=} {msg}") - - @endpoint - async def setup(self): - """Initialize the PyTorch distributed process group.""" - self._rprint("Initializing torch distributed") - - WORLD_SIZE = int(os.environ["WORLD_SIZE"]) - # initialize the process group - dist.init_process_group("gloo", rank=self.rank, world_size=WORLD_SIZE) - self._rprint("Finished initializing torch distributed") - - @endpoint - async def cleanup(self): - """Clean up the PyTorch distributed process group.""" - self._rprint("Cleaning up torch distributed") - dist.destroy_process_group() - - @endpoint - async def demo_basic(self): - """Run a basic DDP training example.""" - self._rprint("Running basic DDP example") - - # create model and move it to GPU with id rank - local_rank = int(os.environ["LOCAL_RANK"]) - self._rprint(f"{local_rank=}") - model = ToyModel().to(local_rank) - ddp_model = DDP(model, device_ids=[local_rank]) - - loss_fn = nn.MSELoss() - optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) - - optimizer.zero_grad() - outputs = ddp_model(torch.randn(20, 10)) - labels = torch.randn(20, 5).to(local_rank) - loss_fn(outputs, labels).backward() - optimizer.step() - - print(f"{self.rank=} Finished running basic DDP example") - - -def get_cloud(cloud_name: str): - """Get SkyPilot cloud object from name.""" - clouds = { - "kubernetes": sky.Kubernetes, - "aws": sky.AWS, - "gcp": sky.GCP, - "azure": sky.Azure, - } - if cloud_name.lower() not in clouds: - raise ValueError(f"Unknown cloud: {cloud_name}. 
Available: {list(clouds.keys())}") - return clouds[cloud_name.lower()]() - - -async def main(): - parser = argparse.ArgumentParser(description="Monarch DDP with SkyPilot") - parser.add_argument("--cloud", default="kubernetes", help="Cloud provider") - parser.add_argument("--num-hosts", type=int, default=2, help="Number of hosts") - parser.add_argument("--gpus-per-host", type=int, default=1, help="GPUs per host") - parser.add_argument("--cluster-name", default="monarch-ddp", help="Cluster name") - parser.add_argument("--accelerator", default="H200:1", help="GPU accelerator") - args = parser.parse_args() - - print("=" * 60) - print("Monarch DDP Example with SkyPilot") - print("=" * 60) - print(f"\nConfiguration:") - print(f" Cloud: {args.cloud}") - print(f" Hosts: {args.num_hosts}") - print(f" GPUs per host: {args.gpus_per_host}") - print(f" Accelerator: {args.accelerator}") - - # Create SkyPilot job - job = SkyPilotJob( - meshes={"mesh0": args.num_hosts}, - resources=sky.Resources( - cloud=get_cloud(args.cloud), - accelerators=args.accelerator, - ), - cluster_name=args.cluster_name, - idle_minutes_to_autostop=10, - down_on_autostop=True, - ) - - try: - print("\n[1] Launching SkyPilot cluster...") - job_state = job.state() - - print("\n[2] Creating process mesh...") - proc_mesh = job_state.mesh0.spawn_procs({"gpus": args.gpus_per_host}) - print(f" Process mesh extent: {proc_mesh.extent}") - - print("\n[3] Spawning DDP actors...") - ddp_actor = proc_mesh.spawn("ddp_actor", DDPActor) - - print("\n[4] Setting up distributed environment...") - await setup_env_for_distributed(proc_mesh) - - print("\n[5] Running DDP example...") - await ddp_actor.setup.call() - await ddp_actor.demo_basic.call() - await ddp_actor.cleanup.call() - - print("\n" + "=" * 60) - print("DDP example completed successfully!") - print("=" * 60) - - except Exception as e: - print(f"\nERROR: {e}") - import traceback - traceback.print_exc() - print(f"\nNot cleaning up cluster for debugging...") - 
print(f" Debug with: sky ssh {args.cluster_name}") - print(f" Clean up: sky down {args.cluster_name}") - raise - else: - print("\n[6] Cleaning up SkyPilot cluster...") - job.kill() - print(" Done!") - - -if __name__ == "__main__": - asyncio.run(main()) - From ca7014ae4bf1a318cbbaf73939f4da8460c85983 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 11 Dec 2025 17:10:20 -0800 Subject: [PATCH 18/29] Readme updates --- examples/skypilot/README.md | 143 +++++++++++++++++++----------------- 1 file changed, 77 insertions(+), 66 deletions(-) diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md index cbda51b08..db2fa9a39 100644 --- a/examples/skypilot/README.md +++ b/examples/skypilot/README.md @@ -11,12 +11,13 @@ This directory contains a standalone integration for running Monarch workloads o - Hyperscalers: AWS, GCP, Azure - Neoclouds: CoreWeave, Nebius, and [20+ other clouds](https://docs.skypilot.co/en/latest/getting-started/installation.html) -## Installation +## Quickstart -```bash -# Install Monarch -pip install torchmonarch-nightly +Prerequisites: Install SkyPilot and verify GPUs are available. +
+SkyPilot Installation +```bash # Install SkyPilot with your preferred backend pip install skypilot[kubernetes] # For Kubernetes pip install skypilot[aws] # For AWS @@ -25,69 +26,32 @@ pip install skypilot[all] # For all clouds # Verify SkyPilot setup sky check -``` - -TODO(romilb): Link to SkyPilot docs for k8s setup - -## Quick Start - -```python -import sky -from skypilot_job import SkyPilotJob -from monarch.actor import Actor, endpoint -class MyActor(Actor): - @endpoint - def hello(self) -> str: - return "Hello from the cloud!" - -# Create a SkyPilot job with 2 nodes -job = SkyPilotJob( - meshes={"workers": 2}, - resources=sky.Resources( - cloud=sky.Kubernetes(), # or sky.AWS(), sky.GCP(), etc. - accelerators="H100:1", - ), - cluster_name="my-monarch-cluster", - idle_minutes_to_autostop=10, - down_on_autostop=True, -) - -# Launch and connect -state = job.state() -hosts = state.workers +# Verify GPUs available +sky show-gpus --infra kubernetes +``` -# Spawn processes and actors -procs = hosts.spawn_procs(per_host={"gpus": 1}) -actors = procs.spawn("my_actors", MyActor) +For more details, see the [SkyPilot documentation](https://docs.skypilot.co/en/latest/getting-started/installation.html). -# Use your actors -results = actors.hello.call().get() -print(results) # ["Hello from the cloud!", "Hello from the cloud!"] +
-# Clean up -job.kill() -``` -## Running the Example +Run this command from your local machine to run the getting started example: ```bash -cd examples/skypilot - -# Run on Kubernetes -python getting_started.py --cloud kubernetes --num-hosts 2 +sky launch monarch_getting_started.sky.yaml -c monarch-demo +``` -# Run on AWS -python getting_started.py --cloud aws --num-hosts 2 --accelerator "A100:1" +SkyPilot will: +1. Launch a Kubernetes pod +2. Install dependencies +3. Sync the example directory with the pod +4. Run `skypilot_getting_started.py` in the pod and stream the logs -# Run on GCP -python getting_started.py --cloud gcp --num-hosts 2 --accelerator "A100:1" -``` +
+Example Output -Example output: ``` -$ python skypilot_getting_started.py --num-hosts 2 --gpus-per-host 1 --cluster-name monarch-skypilot-test - ============================================================ Monarch Getting Started with SkyPilot ============================================================ @@ -151,29 +115,76 @@ Cluster 'monarch-skypilot-test' terminated Cluster terminated. ``` -## Default Image +
+ -By default, `SkyPilotJob` uses the `pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime` Docker image which has compatible system libraries for `torchmonarch-nightly`. TODO(romilb): mention image requirements. +
+Running from within the Kubernetes cluster -## Faster Cold Starts with SkyPilot's cluster reuse +If you are already in the Kubernetes cluster you'd like to run workers on, you can directly run `skypilot_getting_started.py`. + +```bash +python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-host 8 --gpus "H200:8" +``` + +
+ +## SkyPilotJob Class + +SkyPilotJob allows you to run Monarch on Kubernetes and cloud VMs via SkyPilot. + +Example usage: -TODO(romilb): Validate if this works: ```python +import sky +from skypilot_job import SkyPilotJob +from monarch.actor import Actor, endpoint + +class MyActor(Actor): + @endpoint + def hello(self) -> str: + return "Hello from the cloud!" + +# Create a SkyPilot job with 2 nodes job = SkyPilotJob( - ..., - idle_minutes_to_autostop=30, # Keep cluster alive + meshes={"workers": 2}, + resources=sky.Resources( + cloud=sky.Kubernetes(), # or sky.AWS(), sky.GCP(), etc. + accelerators="H100:1", + ), + cluster_name="my-monarch-cluster", + idle_minutes_to_autostop=10, + down_on_autostop=True, ) -``` -TODO(romilb): Benchmark pre-baked container images +# Launch and connect +state = job.state() +hosts = state.workers + +# Spawn processes and actors +procs = hosts.spawn_procs(per_host={"gpus": 1}) +actors = procs.spawn("my_actors", MyActor) + +# Use your actors +results = actors.hello.call().get() +print(results) # ["Hello from the cloud!", "Hello from the cloud!"] -## Network Requirements +# Clean up +job.kill() +``` + +### Network Requirements The client must have direct network connectivity to the worker nodes: - **Kubernetes**: Run the client inside the same cluster (e.g., in a pod) - **Cloud VMs**: Ensure security groups allow inbound traffic on port 22222 -## Troubleshooting + +### Default Image + +By default, `SkyPilotJob` uses the `pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime` Docker image which has compatible system libraries for `torchmonarch-nightly`. 
+ +## Troubleshooting tips **Check SkyPilot setup:** ```bash From ffe74f55651c2e777839e885aa32f6e75417bbf6 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 11 Dec 2025 19:49:12 -0800 Subject: [PATCH 19/29] Updates --- examples/skypilot/README.md | 74 ++++++++++++++++++- examples/skypilot/skypilot_getting_started.py | 4 +- examples/skypilot/skypilot_job.py | 2 +- 3 files changed, 76 insertions(+), 4 deletions(-) diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md index db2fa9a39..1f3bc59c5 100644 --- a/examples/skypilot/README.md +++ b/examples/skypilot/README.md @@ -6,7 +6,47 @@ This directory contains a standalone integration for running Monarch workloads o `SkyPilotJob` provisions cloud instances (or K8s pods) and starts Monarch workers on them, allowing you to run distributed Monarch actors across multiple machines. -**Supported platforms:** +### Architecture + +```mermaid +flowchart TB + subgraph laptop["💻 Your Laptop"] + user["$ sky launch monarch_getting_started.sky.yaml"] + end + + subgraph k8s["☸️ Kubernetes Cluster"] + subgraph driver["Driver Pod"] + script["skypilot_getting_started.py"] + skyjob["SkyPilotJob"] + end + + subgraph workers["Worker Pods (provisioned by SkyPilot)"] + subgraph w1["Worker Pod 0"] + mw1["Monarch Worker"] + end + subgraph w2["Worker Pod 1"] + mw2["Monarch Worker"] + end + end + end + + user -->|"SkyPilot launches"| driver + script --> skyjob + skyjob -->|"provisions via SkyPilot"| workers + skyjob <-->|"TCP :22222"| mw1 + skyjob <-->|"TCP :22222"| mw2 + mw1 + mw2 +``` + +**How it works:** +1. You run `sky launch` from your laptop to start the driver pod +2. The driver runs `skypilot_getting_started.py` which creates a `SkyPilotJob` +3. `SkyPilotJob` provisions GPU worker pods via SkyPilot +4. The driver connects to Monarch workers over TCP (port 22222) +5. 
Actors are spawned on each GPU and execute your distributed code + +**Supported infra:** - Kubernetes (any cluster) - Hyperscalers: AWS, GCP, Azure - Neoclouds: CoreWeave, Nebius, and [20+ other clouds](https://docs.skypilot.co/en/latest/getting-started/installation.html) @@ -117,6 +157,11 @@ Cluster 'monarch-skypilot-test' terminated +When done, clean up with: +```bash +sky down monarch-demo +``` +
Running from within the Kubernetes cluster @@ -129,6 +174,33 @@ python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-h
+
+ +### Running the DDP Jupyter Notebook + +To run the `skypilot_ddp.ipynb` notebook interactively, first launch a driver pod and then connect via SSH port forwarding: + +```bash +# 1. Launch a driver pod (without running a script) +sky launch monarch_getting_started.sky.yaml -c monarch-demo + +# 2. SSH into the pod with port forwarding for Jupyter +sky ssh monarch-demo -L 8888:localhost:8888 + +# 3. Inside the pod, start Jupyter Notebook (no token required) +cd ~/sky_workdir +jupyter notebook --no-browser --port=8888 --ip=0.0.0.0 --NotebookApp.token='' --NotebookApp.password='' +``` + +Then open http://localhost:8888 in your browser and run `skypilot_ddp.ipynb`. + +When done, clean up with: +```bash +sky down monarch-demo +``` + +
+ ## SkyPilotJob Class SkyPilotJob allows you to run Monarch on Kubernetes and cloud VMs via SkyPilot. diff --git a/examples/skypilot/skypilot_getting_started.py b/examples/skypilot/skypilot_getting_started.py index 814f4e6d0..aa70d8d88 100644 --- a/examples/skypilot/skypilot_getting_started.py +++ b/examples/skypilot/skypilot_getting_started.py @@ -14,10 +14,10 @@ Usage: # Run on Kubernetes with 2 nodes, 8 GPUs per node - python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-host 8 --gpus "H200:8" + python examples/skypilot/skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-host 8 --gpus "H200:8" # Run on cloud VMs - python skypilot_getting_started.py --cloud --num-hosts 2 --gpus-per-host 1 --gpus "H100:1" + python examples/skypilot/skypilot_getting_started.py --cloud --num-hosts 2 --gpus-per-host 1 --gpus "H100:1" """ import argparse diff --git a/examples/skypilot/skypilot_job.py b/examples/skypilot/skypilot_job.py index edb418da8..e9c36df36 100644 --- a/examples/skypilot/skypilot_job.py +++ b/examples/skypilot/skypilot_job.py @@ -125,7 +125,7 @@ def __init__( resources: SkyPilot Resources specification for the instances. If None, uses SkyPilot defaults. cluster_name: Name for the SkyPilot cluster. If None, auto-generated. - monarch_port: Port for TCP communication between Monarch workers. + monarch_port: Port for bootstrapping communication between Monarch workers. idle_minutes_to_autostop: If set, cluster will autostop after this many minutes of idleness. 
down_on_autostop: If True, tear down cluster on autostop instead of From 85145041e9c0d9fb4dea1fdf1aa43a1d0791a23e Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 12 Dec 2025 23:58:43 +0000 Subject: [PATCH 20/29] fix mermaid doc --- examples/skypilot/README.md | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md index 1f3bc59c5..b65200a03 100644 --- a/examples/skypilot/README.md +++ b/examples/skypilot/README.md @@ -20,7 +20,7 @@ flowchart TB skyjob["SkyPilotJob"] end - subgraph workers["Worker Pods (provisioned by SkyPilot)"] + subgraph workers["Worker Pods (SkyPilot clusters)"] subgraph w1["Worker Pod 0"] mw1["Monarch Worker"] end @@ -32,7 +32,7 @@ flowchart TB user -->|"SkyPilot launches"| driver script --> skyjob - skyjob -->|"provisions via SkyPilot"| workers + skyjob -->|"provisioned via SkyPilot"| workers skyjob <-->|"TCP :22222"| mw1 skyjob <-->|"TCP :22222"| mw2 mw1 @@ -174,8 +174,6 @@ python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-h -
- ### Running the DDP Jupyter Notebook To run the `skypilot_ddp.ipynb` notebook interactively, first launch a driver pod and then connect via SSH port forwarding: @@ -187,20 +185,18 @@ sky launch monarch_getting_started.sky.yaml -c monarch-demo # 2. SSH into the pod with port forwarding for Jupyter sky ssh monarch-demo -L 8888:localhost:8888 -# 3. Inside the pod, start Jupyter Notebook (no token required) +# 3. Inside the pod, start Jupyter Notebook cd ~/sky_workdir jupyter notebook --no-browser --port=8888 --ip=0.0.0.0 --NotebookApp.token='' --NotebookApp.password='' ``` -Then open http://localhost:8888 in your browser and run `skypilot_ddp.ipynb`. +Then open http://localhost:8888 in your browser and open `skypilot_ddp.ipynb`. When done, clean up with: ```bash sky down monarch-demo ``` -
- ## SkyPilotJob Class SkyPilotJob allows you to run Monarch on Kubernetes and cloud VMs via SkyPilot. From 2d83527b1fc82fce16a3a1e37098b5da0b53b24b Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sat, 13 Dec 2025 00:38:05 +0000 Subject: [PATCH 21/29] Docs updates --- docs/source/examples/README.rst | 1 + docs/source/examples/getting_started.py | 4 ++-- docs/source/index.md | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/examples/README.rst b/docs/source/examples/README.rst index 3d5089de2..e4b943c2d 100644 --- a/docs/source/examples/README.rst +++ b/docs/source/examples/README.rst @@ -8,6 +8,7 @@ Examples - :doc:`distributed_tensors.py `: Shows how to dispatch tensors and tensor level operations to a distributed mesh of workers and GPUs - :doc:`debugging.py `: Shows how to use the Monarch debugger to debug a distributed program - `Multinode Slurm Tutorial `_: Multinode distributed training tutorial using Monarch and Slurm to run an SPMD training job. +- `SkyPilot Integration `_: Run Monarch on Kubernetes and cloud VMs via SkyPilot. .. toctree:: :hidden: diff --git a/docs/source/examples/getting_started.py b/docs/source/examples/getting_started.py index 6c7359f95..476a550bb 100644 --- a/docs/source/examples/getting_started.py +++ b/docs/source/examples/getting_started.py @@ -145,8 +145,8 @@ def get_value(self) -> int: # ============== # When we created our processes before, we spawned them on `this_host()` -- the machine # running the top-level script. For larger jobs, monarch controls many machines. How these -# machines are obtained depends on the scheduling system (slurm, kubernetes, etc), but these -# schedulers are typically encapsulated in a config file. +# machines are obtained depends on the scheduling system (Slurm, Kubernetes, SkyPilot, etc.), +# but these schedulers are typically encapsulated in a config file. 
from monarch.actor import context, HostMesh, hosts_from_config diff --git a/docs/source/index.md b/docs/source/index.md index 88072f60c..3321e3abc 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -82,3 +82,4 @@ We welcome contributions from the community! If you're interested in contributin - [Demo notebook](https://github.com/meta-pytorch/monarch/blob/main/examples/presentation/presentation.ipynb) - [DevX Pytorch tutorial](https://docs.pytorch.org/tutorials/intermediate/monarch_distributed_tutorial.html) - [Lightning Monarch blog](https://lightning.ai/meta-ai/environments/large-scale-interactive-training-with-monarch) +- [Running on Kubernetes via SkyPilot](https://github.com/meta-pytorch/monarch/tree/main/examples/skypilot) From 4d6ed271ef4c058ccfdb8c95a173194937fa83ed Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sat, 13 Dec 2025 00:44:13 +0000 Subject: [PATCH 22/29] updates --- docs/source/examples/README.rst | 2 +- docs/source/index.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/examples/README.rst b/docs/source/examples/README.rst index e4b943c2d..3b27c3f97 100644 --- a/docs/source/examples/README.rst +++ b/docs/source/examples/README.rst @@ -8,7 +8,7 @@ Examples - :doc:`distributed_tensors.py `: Shows how to dispatch tensors and tensor level operations to a distributed mesh of workers and GPUs - :doc:`debugging.py `: Shows how to use the Monarch debugger to debug a distributed program - `Multinode Slurm Tutorial `_: Multinode distributed training tutorial using Monarch and Slurm to run an SPMD training job. -- `SkyPilot Integration `_: Run Monarch on Kubernetes and cloud VMs via SkyPilot. +- `Multinode Kubernetes examples `_: Run Monarch on Kubernetes and cloud VMs via SkyPilot. .. 
toctree:: :hidden: diff --git a/docs/source/index.md b/docs/source/index.md index 3321e3abc..4a182f88a 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -82,4 +82,4 @@ We welcome contributions from the community! If you're interested in contributin - [Demo notebook](https://github.com/meta-pytorch/monarch/blob/main/examples/presentation/presentation.ipynb) - [DevX Pytorch tutorial](https://docs.pytorch.org/tutorials/intermediate/monarch_distributed_tutorial.html) - [Lightning Monarch blog](https://lightning.ai/meta-ai/environments/large-scale-interactive-training-with-monarch) -- [Running on Kubernetes via SkyPilot](https://github.com/meta-pytorch/monarch/tree/main/examples/skypilot) +- [Monarch on Kubernetes](https://github.com/meta-pytorch/monarch/tree/main/examples/skypilot) From 3f3e890105eb77e2214eed182d2ec606871490f5 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sat, 13 Dec 2025 00:45:28 +0000 Subject: [PATCH 23/29] updates --- examples/skypilot/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md index b65200a03..91c6a375d 100644 --- a/examples/skypilot/README.md +++ b/examples/skypilot/README.md @@ -1,10 +1,10 @@ # Running Monarch on Kubernetes and cloud VMs via SkyPilot -This directory contains a standalone integration for running Monarch workloads on **Kubernetes and cloud VMs** via [SkyPilot](https://github.com/skypilot-org/skypilot). +This directory contains examples for running Monarch workloads on **Kubernetes and cloud VMs** via [SkyPilot](https://github.com/skypilot-org/skypilot). ## Overview -`SkyPilotJob` provisions cloud instances (or K8s pods) and starts Monarch workers on them, allowing you to run distributed Monarch actors across multiple machines. +`SkyPilotJob` provisions cloud instances (or K8s pods) and starts Monarch workers on them, allowing you to run distributed Monarch actors across multiple Kubernetes pods. 
### Architecture From be7818e3d34bba6a3804815bac73caa47c20f59f Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sat, 13 Dec 2025 00:50:42 +0000 Subject: [PATCH 24/29] Add notes on how to set resources and num nodes --- examples/skypilot/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md index 91c6a375d..d662701a2 100644 --- a/examples/skypilot/README.md +++ b/examples/skypilot/README.md @@ -81,6 +81,7 @@ Run this command from your local machine to run the getting started example: ```bash sky launch monarch_getting_started.sky.yaml -c monarch-demo ``` +**💡 Tip:** Run `sky show-gpus --infra kubernetes` to see available GPUs in your cluster, then edit `--accelerator` and `--num-hosts` in the `run` section of the YAML to match your resources. SkyPilot will: 1. Launch a Kubernetes pod From 0a443b3e3fca2fab32641f923ee5ea3809ca747a Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sat, 13 Dec 2025 00:51:19 +0000 Subject: [PATCH 25/29] fix ssh command --- examples/skypilot/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md index d662701a2..fa117ec03 100644 --- a/examples/skypilot/README.md +++ b/examples/skypilot/README.md @@ -184,7 +184,7 @@ To run the `skypilot_ddp.ipynb` notebook interactively, first launch a driver po sky launch monarch_getting_started.sky.yaml -c monarch-demo # 2. SSH into the pod with port forwarding for Jupyter -sky ssh monarch-demo -L 8888:localhost:8888 +ssh monarch-demo -L 8888:localhost:8888 # 3. 
Inside the pod, start Jupyter Notebook cd ~/sky_workdir From 5fcf775f8ef40ab2370097890270afee7553c907 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sat, 13 Dec 2025 00:52:53 +0000 Subject: [PATCH 26/29] Update jupyter commands --- examples/skypilot/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md index fa117ec03..090a9fc5c 100644 --- a/examples/skypilot/README.md +++ b/examples/skypilot/README.md @@ -188,7 +188,8 @@ ssh monarch-demo -L 8888:localhost:8888 # 3. Inside the pod, start Jupyter Notebook cd ~/sky_workdir -jupyter notebook --no-browser --port=8888 --ip=0.0.0.0 --NotebookApp.token='' --NotebookApp.password='' +uv pip install --system jupyter +jupyter notebook --no-browser --port=8888 --ip=0.0.0.0 --NotebookApp.token='' --NotebookApp.password='' --allow-root ``` Then open http://localhost:8888 in your browser and open `skypilot_ddp.ipynb`. From 56e7c8c7fb2016c0eb49038ceeedce41d52fafbe Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sat, 13 Dec 2025 02:25:35 +0000 Subject: [PATCH 27/29] Add CPU-only support --- examples/skypilot/README.md | 30 +++++++++++++++++-- .../skypilot/monarch_getting_started.sky.yaml | 24 ++++++++++----- examples/skypilot/skypilot_getting_started.py | 25 ++++++++++++---- 3 files changed, 64 insertions(+), 15 deletions(-) diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md index 090a9fc5c..b88f0068e 100644 --- a/examples/skypilot/README.md +++ b/examples/skypilot/README.md @@ -81,9 +81,29 @@ Run this command from your local machine to run the getting started example: ```bash sky launch monarch_getting_started.sky.yaml -c monarch-demo ``` -**💡 Tip:** Run `sky show-gpus --infra kubernetes` to see available GPUs in your cluster, then edit `--accelerator` and `--num-hosts` in the `run` section of the YAML to match your resources. -SkyPilot will: +
+💡 Customizing the run (GPU count, CPU-only mode, etc.) + +Run `sky show-gpus --infra kubernetes` to see available GPUs in your cluster, then customize with environment variables: + +```bash +# Custom GPU configuration +sky launch monarch_getting_started.sky.yaml -c monarch-demo \ + --env NUM_HOSTS=4 \ + --env GPUS_PER_HOST=8 \ + --env ACCELERATOR="H100:8" + +# CPU-only mode (no GPUs required) +sky launch monarch_getting_started.sky.yaml -c monarch-demo \ + --env GPUS_PER_HOST=0 \ + --env ACCELERATOR=none +``` + +
+ + +On running `sky launch`, SkyPilot will: 1. Launch a Kubernetes pod 2. Install dependencies 3. Sync the example directory with the pod @@ -170,7 +190,11 @@ sky down monarch-demo If you are already in the Kubernetes cluster you'd like to run workers on, you can directly run `skypilot_getting_started.py`. ```bash -python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-host 8 --gpus "H200:8" +# With GPUs +python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-host 8 --accelerator "H200:8" + +# CPU-only (no GPUs) +python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-host 0 --accelerator none ``` diff --git a/examples/skypilot/monarch_getting_started.sky.yaml b/examples/skypilot/monarch_getting_started.sky.yaml index 0398cc873..61aa75cee 100644 --- a/examples/skypilot/monarch_getting_started.sky.yaml +++ b/examples/skypilot/monarch_getting_started.sky.yaml @@ -5,7 +5,10 @@ # # Usage: # cd monarch/examples/skypilot -# sky launch getting_started.sky.yaml -c monarch-demo +# sky launch monarch_getting_started.sky.yaml -c monarch-demo +# +# # For CPU-only clusters (no GPUs): +# sky launch monarch_getting_started.sky.yaml -c monarch-demo --env GPUS_PER_HOST=0 --env ACCELERATOR=none # # To view logs: # sky logs monarch-demo @@ -23,6 +26,13 @@ resources: cpus: 2+ # No GPUs needed for the driver script image_id: docker:pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime +# Environment variables for configuring the example +# Override with: sky launch ... --env NUM_HOSTS=4 --env GPUS_PER_HOST=8 +envs: + NUM_HOSTS: 2 # Number of worker nodes to provision + GPUS_PER_HOST: 1 # GPUs per worker (set to 0 for CPU-only) + ACCELERATOR: "H200:1" # SkyPilot GPU spec (set to "none" for CPU-only). Keep quantity aligned with GPUS_PER_HOST. + # Sync the current directory (examples/skypilot) to the cluster workdir: . 
@@ -64,17 +74,17 @@ setup: | echo "=== Setup complete ===" run: | - echo "=== Running Monarch Getting Started with SkyPilot ===" + echo "=== Running Monarch Getting Started with SkyPilot ===" + echo "Configuration: NUM_HOSTS=$NUM_HOSTS, GPUS_PER_HOST=$GPUS_PER_HOST, ACCELERATOR=$ACCELERATOR" # Run the getting started example - # This will launch a SkyPilot cluster with Monarch workers. - # Change the arguments to your desired values. + # Uses environment variables set above (can be overridden with --env) python skypilot_getting_started.py \ --cloud kubernetes \ - --num-hosts 2 \ - --gpus-per-host 1 \ + --num-hosts $NUM_HOSTS \ + --gpus-per-host $GPUS_PER_HOST \ --cluster-name monarch-workers \ - --accelerator "H200:1" + --accelerator "$ACCELERATOR" echo "=== Example ran successfully ===" diff --git a/examples/skypilot/skypilot_getting_started.py b/examples/skypilot/skypilot_getting_started.py index aa70d8d88..04148c9be 100644 --- a/examples/skypilot/skypilot_getting_started.py +++ b/examples/skypilot/skypilot_getting_started.py @@ -18,6 +18,9 @@ # Run on cloud VMs python examples/skypilot/skypilot_getting_started.py --cloud --num-hosts 2 --gpus-per-host 1 --gpus "H100:1" + + # Run on CPU-only cluster (no GPUs) + python examples/skypilot/skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-host 0 --accelerator none """ import argparse @@ -143,14 +146,20 @@ def main(): ) args = parser.parse_args() + # Determine if running in CPU-only mode + cpu_only = args.gpus_per_host == 0 or args.accelerator.lower() == "none" + print("=" * 60) print("Monarch Getting Started with SkyPilot") print("=" * 60) print(f"\nConfiguration:") print(f" Cloud: {args.cloud}") print(f" Hosts: {args.num_hosts}") - print(f" GPUs per host: {args.gpus_per_host}") - print(f" Accelerator: {args.accelerator}") + if cpu_only: + print(f" Mode: CPU-only (no GPUs)") + else: + print(f" GPUs per host: {args.gpus_per_host}") + print(f" Accelerator: {args.accelerator}") print(f" Cluster 
name: {args.cluster_name}") if args.region: print(f" Region: {args.region}") @@ -161,8 +170,10 @@ def main(): # Build resources specification resources_kwargs = { "cloud": get_cloud(args.cloud), - "accelerators": args.accelerator, } + # Only request GPUs if not in CPU-only mode + if not cpu_only: + resources_kwargs["accelerators"] = args.accelerator if args.region: resources_kwargs["region"] = args.region @@ -191,8 +202,12 @@ def main(): # ==================================================================== print("\n[3] Spawning processes on cloud hosts...") - # Create a process mesh - GPU processes per host - procs: ProcMesh = hosts.spawn_procs(per_host={"gpus": args.gpus_per_host}) + # Create a process mesh + if cpu_only: + # CPU-only mode: spawn 1 CPU process per host + procs: ProcMesh = hosts.spawn_procs(per_host={"procs": 1}) + else: + procs: ProcMesh = hosts.spawn_procs(per_host={"gpus": args.gpus_per_host}) print(f" Process mesh extent: {procs.extent}") # Spawn counter actors From 236a01a998493129ef039d13549c7e40eade13f1 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sat, 20 Dec 2025 01:08:39 +0530 Subject: [PATCH 28/29] update docs --- docs/source/examples/README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/examples/README.rst b/docs/source/examples/README.rst index 3b27c3f97..37b9cfe3c 100644 --- a/docs/source/examples/README.rst +++ b/docs/source/examples/README.rst @@ -8,7 +8,7 @@ Examples - :doc:`distributed_tensors.py `: Shows how to dispatch tensors and tensor level operations to a distributed mesh of workers and GPUs - :doc:`debugging.py `: Shows how to use the Monarch debugger to debug a distributed program - `Multinode Slurm Tutorial `_: Multinode distributed training tutorial using Monarch and Slurm to run an SPMD training job. -- `Multinode Kubernetes examples `_: Run Monarch on Kubernetes and cloud VMs via SkyPilot. 
+- `Running on Kubernetes using SkyPilot `_: Run Monarch on Kubernetes and cloud VMs via SkyPilot. .. toctree:: :hidden: From 8207b92987b228972453cf700aa883638e80793e Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Sat, 20 Dec 2025 01:38:56 +0530 Subject: [PATCH 29/29] review comments --- docs/source/index.md | 2 +- examples/skypilot/skypilot_job.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/source/index.md b/docs/source/index.md index 4a182f88a..32928e328 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -82,4 +82,4 @@ We welcome contributions from the community! If you're interested in contributin - [Demo notebook](https://github.com/meta-pytorch/monarch/blob/main/examples/presentation/presentation.ipynb) - [DevX Pytorch tutorial](https://docs.pytorch.org/tutorials/intermediate/monarch_distributed_tutorial.html) - [Lightning Monarch blog](https://lightning.ai/meta-ai/environments/large-scale-interactive-training-with-monarch) -- [Monarch on Kubernetes](https://github.com/meta-pytorch/monarch/tree/main/examples/skypilot) +- [Monarch on Kubernetes using SkyPilot](https://github.com/meta-pytorch/monarch/tree/main/examples/skypilot) diff --git a/examples/skypilot/skypilot_job.py b/examples/skypilot/skypilot_job.py index e9c36df36..809b97e5e 100644 --- a/examples/skypilot/skypilot_job.py +++ b/examples/skypilot/skypilot_job.py @@ -1,5 +1,5 @@ """ -SkyPilotJob for Monarch. +Monarch JobTrait implementation for SkyPilot. SkyPilotJob allows running Monarch on Kubernetes and cloud VMs via SkyPilot. @@ -40,7 +40,7 @@ logger.propagate = False # Default port for Monarch TCP communication -DEFAULT_MONARCH_PORT = 22222 +MONARCH_WORKER_PORT = 22222 # Timeout for waiting for the job to reach RUNNING status.
JOB_TIMEOUT = 300 # seconds @@ -110,9 +110,9 @@ def __init__( meshes: Dict[str, int], resources: Optional["sky.Resources"] = None, cluster_name: Optional[str] = None, - monarch_port: int = DEFAULT_MONARCH_PORT, + monarch_port: int = MONARCH_WORKER_PORT, idle_minutes_to_autostop: Optional[int] = None, - down_on_autostop: bool = False, + down_on_autostop: bool = True, python_exe: str = "python", setup_commands: Optional[str] = None, workdir: Optional[str] = None, @@ -129,7 +129,9 @@ def __init__( idle_minutes_to_autostop: If set, cluster will autostop after this many minutes of idleness. down_on_autostop: If True, tear down cluster on autostop instead of - just stopping it. + just stopping it. On Kubernetes, autostop is not + supported and this must be set to True. Pods will + be deleted when the SkyPilot cluster is downed. python_exe: Python executable to use for worker processes. setup_commands: Optional setup commands to run before starting workers. If None, uses DEFAULT_SETUP_COMMANDS which installs