From 6ffd98ca5f1973a2d45b7906de2dda13295fefe5 Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil@berkeley.edu>
Date: Fri, 5 Dec 2025 06:49:55 +0000
Subject: [PATCH 01/29] Add SkyPilot integration for job launching

---
 python/monarch/_src/job/skypilot.py       | 343 +++++++++++++++
 python/monarch/job/__init__.py            |  16 +-
 python/tests/test_skypilot_integration.py | 213 ++++++++++
 python/tests/test_skypilot_job.py         | 493 ++++++++++++++++++++++
 4 files changed, 1064 insertions(+), 1 deletion(-)
 create mode 100644 python/monarch/_src/job/skypilot.py
 create mode 100644 python/tests/test_skypilot_integration.py
 create mode 100644 python/tests/test_skypilot_job.py

diff --git a/python/monarch/_src/job/skypilot.py b/python/monarch/_src/job/skypilot.py
new file mode 100644
index 000000000..39795d239
--- /dev/null
+++ b/python/monarch/_src/job/skypilot.py
@@ -0,0 +1,343 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+import logging
+import os
+import sys
+import time
+from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
+
+from monarch._src.job.job import JobState, JobTrait
+
+# Defer imports that may not be available in all environments
+if TYPE_CHECKING:
+    import sky
+    from sky.backends.cloud_vm_ray_backend import CloudVmRayResourceHandle
+
+try:
+    import sky
+    from sky.backends.cloud_vm_ray_backend import CloudVmRayResourceHandle
+
+    HAS_SKYPILOT = True
+except ImportError:
+    HAS_SKYPILOT = False
+    sky = None  # type: ignore[assignment]
+    CloudVmRayResourceHandle = None  # type: ignore[assignment, misc]
+
+
+logger: logging.Logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+logger.addHandler(logging.StreamHandler(sys.stderr))
+logger.propagate = False
+
+# Default port for Monarch TCP communication
+DEFAULT_MONARCH_PORT = 22222
+
+
+def _configure_transport() -> None:
+    """Set TcpWithHostname as Monarch's default transport; bindings load lazily."""
+    from monarch._rust_bindings.monarch_hyperactor.channel import ChannelTransport
+    from monarch._rust_bindings.monarch_hyperactor.config import configure
+    tcp_with_hostname = ChannelTransport.TcpWithHostname
+    configure(default_transport=tcp_with_hostname)
+
+
+def _attach_to_workers_wrapper(name: str, ca: str, workers: List[str]):
+    """Call ``attach_to_workers`` with the given arguments, importing it lazily."""
+    from monarch._src.actor import bootstrap
+
+    return bootstrap.attach_to_workers(name=name, ca=ca, workers=workers)
+
+
+class SkyPilotJob(JobTrait):
+    """
+    A job scheduler that uses SkyPilot to provision cloud instances.
+
+    SkyPilot supports multiple cloud providers (AWS, GCP, Azure, Lambda, etc.)
+    and can automatically select the cheapest available option.
+
+    This implementation:
+    1. Uses sky.launch() to provision cloud instances with specified resources
+    2. Runs Monarch workers on each node via a startup script
+    3. Connects to workers using their IP addresses from the cluster handle
+
+    Example:
+        >>> import sky
+        >>> from monarch.job import SkyPilotJob
+        >>>
+        >>> job = SkyPilotJob(
+        ...     meshes={"trainers": 2},
+        ...     resources=sky.Resources(accelerators="A100:1"),
+        ...     cluster_name="my-monarch-cluster",
+        ... )
+        >>> state = job.state()
+        >>> trainers = state.trainers  # HostMesh with 2 nodes
+    """
+
+    def __init__(
+        self,
+        meshes: Dict[str, int],
+        resources: Optional["sky.Resources"] = None,
+        cluster_name: Optional[str] = None,
+        monarch_port: int = DEFAULT_MONARCH_PORT,
+        idle_minutes_to_autostop: Optional[int] = None,
+        down_on_autostop: bool = False,
+        python_exe: str = "python",
+        setup_commands: Optional[str] = None,
+    ) -> None:
+        """Store the job configuration; no cluster is launched by the constructor.
+
+        Args:
+            meshes: Dictionary mapping mesh names to number of nodes.
+                    e.g., {"trainers": 4, "dataloaders": 2}
+            resources: SkyPilot Resources specification for the instances.
+                       If None, uses SkyPilot defaults.
+            cluster_name: Name for the SkyPilot cluster. If None, auto-generated.
+            monarch_port: Port for TCP communication between Monarch workers.
+            idle_minutes_to_autostop: If set, cluster will autostop after this
+                                      many minutes of idleness.
+            down_on_autostop: If True, tear down cluster on autostop instead of
+                              just stopping it.
+            python_exe: Python executable to use for worker processes.
+            setup_commands: Optional setup commands to run before starting workers.
+                           Use this to install dependencies.
+        """
+        if not HAS_SKYPILOT:
+            raise ImportError(
+                "SkyPilot is not installed. Install it with: pip install skypilot"
+            )
+
+        # Best effort: without the bindings construction still succeeds
+        try:
+            _configure_transport()
+        except ImportError as e:
+            logger.warning(f"Monarch transport not configured: {e}")
+
+        super().__init__()
+
+        self._meshes = meshes
+        self._resources = resources
+        self._cluster_name = cluster_name
+        self._port = monarch_port
+        self._idle_minutes_to_autostop = idle_minutes_to_autostop
+        self._down_on_autostop = down_on_autostop
+        self._python_exe = python_exe
+        self._setup_commands = setup_commands
+
+        # Runtime state, filled in once a cluster has been launched
+        self._launched_cluster_name: Optional[str] = None
+        self._node_ips: List[str] = []
+
+    def _create(self, client_script: Optional[str]) -> None:
+        """Provision the cluster through SkyPilot and run a worker on every node."""
+        if client_script is not None:
+            raise RuntimeError("SkyPilotJob cannot run batch-mode scripts yet")
+
+        node_count = sum(self._meshes.values())
+
+        # Make sure a non-empty setup script always ends with a newline
+        setup_script = self._setup_commands or ""
+        if setup_script and not setup_script.endswith("\n"):
+            setup_script += "\n"
+
+        # Command handed to SkyPilot as the task's run step
+        run_command = self._build_worker_command()
+
+        # Describe the work as a SkyPilot task
+        task = sky.Task(
+            name="monarch-workers",
+            setup=setup_script or None,
+            run=run_command,
+            num_nodes=node_count,
+        )
+
+        if self._resources is not None:
+            task.set_resources(self._resources)
+
+        # Fall back to a PID-derived name when the caller did not choose one
+        chosen_name = self._cluster_name or f"monarch-{os.getpid()}"
+
+        logger.info(f"Launching SkyPilot cluster '{chosen_name}' with {node_count} nodes")
+
+        # sky.launch returns a request ID in the SDK; sky.get turns that ID
+        # into the (job_id, handle) result of the launch
+        try:
+            launch_request = sky.launch(
+                task,
+                cluster_name=chosen_name,
+                idle_minutes_to_autostop=self._idle_minutes_to_autostop,
+                down=self._down_on_autostop,
+            )
+            # Unpacked to match the result shape; neither value is used afterwards
+            _job_id, _handle = sky.get(launch_request)
+        except Exception as e:
+            logger.error(f"Failed to launch SkyPilot cluster: {e}")
+            raise RuntimeError(f"Failed to launch SkyPilot cluster: {e}") from e
+
+        self._launched_cluster_name = chosen_name
+        logger.info(f"SkyPilot cluster '{chosen_name}' launched successfully")
+
+    def _build_worker_command(self) -> str:
+        """Build the shell command that starts a Monarch worker on each node."""
+        # SkyPilot executes `run` as a shell script, so the Python program is fed
+        # to the configured interpreter through a quoted heredoc (no expansion).
+        return f"""{self._python_exe} - <<'MONARCH_WORKER_EOF'
+import socket
+hostname = socket.gethostname()
+ip_addr = socket.gethostbyname(hostname)
+address = f"tcp://{{ip_addr}}:{self._port}"
+print(f"Starting Monarch worker at {{address}}")
+
+from monarch.actor import run_worker_loop_forever
+run_worker_loop_forever(address=address, ca="trust_all_connections")
+MONARCH_WORKER_EOF
+"""
+
+    def _get_node_ips(self) -> List[str]:
+        """Look up the address of every node in the launched cluster.
+
+        Returns:
+            One IP per node that has an address: external if set, else internal.
+
+        Raises:
+            RuntimeError: If no cluster was launched, the status query fails, or
+                the status record lacks a usable handle or any IP address.
+        """
+        cluster = self._launched_cluster_name
+        if not cluster:
+            raise RuntimeError("Cluster has not been launched yet")
+
+        # The status record for the cluster carries the handle with the node IPs
+        try:
+            records = sky.get(sky.status(cluster_names=[cluster]))
+        except Exception as e:
+            raise RuntimeError(f"Failed to get cluster status: {e}") from e
+
+        # An empty result means the status query returned no record for the name
+        if not records:
+            raise RuntimeError(f"Cluster '{cluster}' not found")
+
+        handle = records[0].handle
+        if handle is None:
+            raise RuntimeError(f"Cluster '{cluster}' has no handle")
+
+        if not isinstance(handle, CloudVmRayResourceHandle):
+            raise RuntimeError(f"Unexpected handle type: {type(handle)}")
+
+        # Pairs of (internal_ip, external_ip), one pair per node
+        ip_pairs = handle.stable_internal_external_ips
+        if ip_pairs is None:
+            raise RuntimeError("Cluster has no IP information")
+
+        # Prefer the external address; nodes with neither address are skipped
+        ips = [
+            external_ip or internal_ip
+            for internal_ip, external_ip in ip_pairs
+            if external_ip or internal_ip
+        ]
+
+        if not ips:
+            raise RuntimeError("No IP addresses found for cluster nodes")
+
+        return ips
+
+    def _wait_for_workers_ready(
+        self, expected_nodes: int, timeout: int = 300, poll_interval: int = 5
+    ) -> List[str]:
+        """Poll until expected_nodes node IPs are known; RuntimeError on timeout."""
+        deadline = time.monotonic() + timeout  # monotonic: immune to clock changes
+        last_error: Optional[Exception] = None
+        while time.monotonic() < deadline:
+            try:
+                ips = self._get_node_ips()
+                if len(ips) >= expected_nodes:
+                    logger.info(f"Found {len(ips)} nodes ready")
+                    return ips
+            except Exception as e:
+                last_error = e  # kept so the timeout error can report the cause
+                logger.debug(f"Waiting for workers: {e}")
+            time.sleep(poll_interval)
+
+        raise RuntimeError(
+            f"Timeout waiting for {expected_nodes} workers after {timeout}s"
+        ) from last_error
+
+    def _state(self) -> JobState:
+        """Attach to the cluster's workers and return a JobState of the meshes."""
+        if not self._jobs_active():
+            raise RuntimeError("SkyPilot cluster is not active")
+
+        # Resolve node addresses once; later calls reuse the cached list
+        if not self._node_ips:
+            self._node_ips = self._wait_for_workers_ready(
+                sum(self._meshes.values())
+            )
+
+        # Hand out consecutive slices of the IP list, in mesh iteration order
+        meshes_by_name = {}
+        offset = 0
+        for mesh_name, size in self._meshes.items():
+            end = offset + size
+            addresses = [
+                f"tcp://{node_ip}:{self._port}"
+                for node_ip in self._node_ips[offset:end]
+            ]
+            offset = end
+            meshes_by_name[mesh_name] = _attach_to_workers_wrapper(
+                name=mesh_name,
+                ca="trust_all_connections",
+                workers=addresses,
+            )
+
+        return JobState(meshes_by_name)
+
+    def can_run(self, spec: "JobTrait") -> bool:
+        """Report whether this job's active cluster matches ``spec``."""
+        if not isinstance(spec, SkyPilotJob):
+            return False
+        # Attribute comparisons come first; cluster status is queried only on a match
+        same_config = (
+            spec._meshes == self._meshes
+            and spec._resources == self._resources
+            and spec._port == self._port
+        )
+        return same_config and self._jobs_active()
+
+    def _jobs_active(self) -> bool:
+        """Return True only when SkyPilot reports the launched cluster as UP."""
+        if not (self.active and self._launched_cluster_name):
+            return False
+
+        try:
+            records = sky.get(
+                sky.status(cluster_names=[self._launched_cluster_name])
+            )
+            # No record at all: the status query did not find the cluster
+            if not records:
+                return False
+            # Only the UP state counts as active
+            cluster_state = records[0].status
+            return cluster_state == sky.ClusterStatus.UP
+        except Exception as e:
+            logger.warning(f"Error checking cluster status: {e}")
+            return False
+
+    def _kill(self) -> None:
+        """Ask SkyPilot to take the cluster down, then clear all cluster state."""
+        cluster = self._launched_cluster_name
+        if cluster is not None:
+            try:
+                logger.info(f"Tearing down SkyPilot cluster '{cluster}'")
+                sky.get(sky.down(cluster))
+                logger.info(f"Cluster '{cluster}' terminated")
+            except Exception as e:
+                # Best effort: a failed teardown is reported, not raised
+                logger.warning(f"Failed to tear down cluster: {e}")
+        self._launched_cluster_name = None
+        self._node_ips.clear()
+
diff --git a/python/monarch/job/__init__.py b/python/monarch/job/__init__.py
index b6852a0a1..674007d53 100644
--- a/python/monarch/job/__init__.py
+++ b/python/monarch/job/__init__.py
@@ -8,5 +8,19 @@
 from monarch._src.job.job import job_load, job_loads, JobState, JobTrait, LocalJob
 from monarch._src.job.slurm import SlurmJob
 
+# SkyPilot is an optional dependency
+try:
+    from monarch._src.job.skypilot import SkyPilotJob
+except ImportError:
+    SkyPilotJob = None  # type: ignore[misc,assignment]
+
 # Define exports
-__all__ = ["JobTrait", "job_load", "job_loads", "JobState", "LocalJob", "SlurmJob"]
+__all__ = [
+    "JobTrait",
+    "job_load",
+    "job_loads",
+    "JobState",
+    "LocalJob",
+    "SlurmJob",
+    "SkyPilotJob",
+]
diff --git a/python/tests/test_skypilot_integration.py b/python/tests/test_skypilot_integration.py
new file mode 100644
index 000000000..5469f4717
--- /dev/null
+++ b/python/tests/test_skypilot_integration.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Integration test script for SkyPilot job.
+
+This script tests the basic SkyPilot integration without requiring Monarch
+runtime. It validates that SkyPilot cluster launching and node IP retrieval works.
+
+Run this script with:
+    python tests/test_skypilot_integration.py
+
+Prerequisites:
+- SkyPilot installed and configured with cloud credentials
+- Run `sky check` to verify cloud access
+"""
+
+import argparse
+import sys
+import time
+
+try:
+    import sky
+    from sky.backends.cloud_vm_ray_backend import CloudVmRayResourceHandle
+except ImportError:
+    print("Error: SkyPilot is not installed. Install with: pip install skypilot")
+    sys.exit(1)
+
+
+def test_skypilot_cluster_launch(
+    cluster_name: str = "monarch-integration-test",
+    cloud: str = "aws",
+    cpus: str = "2+",
+    timeout_minutes: int = 10,
+) -> bool:
+    """
+    Test launching a SkyPilot cluster and retrieving node IPs.
+
+    Args:
+        cluster_name: Name for the test cluster
+        cloud: Cloud provider to use
+        cpus: CPU specification
+        timeout_minutes: Currently unused; no launch timeout is enforced
+
+    Returns:
+        True if test passed, False otherwise
+    """
+    print(f"\n{'='*60}")
+    print("SkyPilot Integration Test")
+    print(f"{'='*60}\n")
+
+    # Create a simple task
+    task = sky.Task(
+        name="monarch-test-task",
+        run="echo 'SkyPilot test successful' && hostname && sleep 30",
+    )
+
+    # Set resources based on cloud (unrecognised names leave the cloud unset)
+    cloud_obj = None
+    if cloud.lower() == "aws":
+        cloud_obj = sky.AWS()
+    elif cloud.lower() == "gcp":
+        cloud_obj = sky.GCP()
+    elif cloud.lower() == "azure":
+        cloud_obj = sky.Azure()
+    elif cloud.lower() == "kubernetes":
+        cloud_obj = sky.Kubernetes()
+
+    resources = sky.Resources(
+        cloud=cloud_obj,
+        cpus=cpus,
+    )
+    task.set_resources(resources)
+
+    print("Test configuration:")
+    print(f"  Cluster name: {cluster_name}")
+    print(f"  Cloud: {cloud}")
+    print(f"  CPUs: {cpus}")
+    print()
+
+    try:
+        # Launch the cluster
+        print("Step 1: Launching cluster...")
+        request_id = sky.launch(
+            task,
+            cluster_name=cluster_name,
+            idle_minutes_to_autostop=5,
+            down=True,  # Auto-teardown after idle
+        )
+
+        print(f"  Request ID: {request_id}")
+        job_id, handle = sky.get(request_id)
+        print(f"  Job ID: {job_id}")
+
+        if handle is None:
+            print("  ERROR: No handle returned from launch")
+            return False
+
+        print("  Cluster launched successfully!")
+
+        # Get cluster status and node IPs
+        print("\nStep 2: Getting cluster status and node IPs...")
+        request_id = sky.status(cluster_names=[cluster_name])
+        statuses = sky.get(request_id)
+
+        if not statuses:
+            print("  ERROR: No status returned")
+            return False
+
+        status = statuses[0]
+        print(f"  Cluster status: {status.status}")
+        print(f"  Cluster name: {status.name}")
+
+        handle = status.handle
+        if handle is None:
+            print("  ERROR: Status has no handle")
+            return False
+
+        if not isinstance(handle, CloudVmRayResourceHandle):
+            print(f"  ERROR: Unexpected handle type: {type(handle)}")
+            return False
+
+        # Get IPs (missing IP information is only a warning, not a failure)
+        if handle.stable_internal_external_ips:
+            print(f"\n  Node IPs ({len(handle.stable_internal_external_ips)} nodes):")
+            for i, (internal_ip, external_ip) in enumerate(
+                handle.stable_internal_external_ips
+            ):
+                print(f"    Node {i}: internal={internal_ip}, external={external_ip}")
+        else:
+            print("  WARNING: No IP information available yet")
+
+        # Test passed!
+        print("\n" + "=" * 60)
+        print("TEST PASSED!")
+        print("=" * 60)
+        print(
+            "\nThe SkyPilot integration is working correctly."
+            "\nMonarch workers can be launched on these nodes."
+        )
+        return True
+
+    except Exception as e:
+        print(f"\nERROR: {e}")
+        import traceback
+
+        traceback.print_exc()
+        return False
+
+    finally:
+        # Cleanup runs on every exit path, including the early returns above
+        print("\nStep 3: Cleaning up cluster...")
+        try:
+            request_id = sky.down(cluster_name)
+            sky.get(request_id)
+            print("  Cluster terminated successfully")
+        except Exception as e:
+            print(f"  Warning: Failed to cleanup cluster: {e}")
+            print(f"  You may need to manually run: sky down {cluster_name}")
+
+
+def main():
+    """Parse the command-line options, run the test, and exit 0 on success."""
+    cli = argparse.ArgumentParser(
+        description="Integration test for SkyPilot-Monarch integration"
+    )
+    cli.add_argument(
+        "--cluster-name",
+        default="monarch-integration-test",
+        help="Name for the test cluster",
+    )
+    cli.add_argument(
+        "--cloud",
+        default="aws",
+        choices=["aws", "gcp", "azure", "kubernetes"],
+        help="Cloud provider to use",
+    )
+    cli.add_argument(
+        "--cpus",
+        default="2+",
+        help="CPU specification",
+    )
+    cli.add_argument(
+        "--timeout",
+        type=int,
+        default=10,
+        help="Timeout in minutes for cluster launch",
+    )
+    options = cli.parse_args()
+
+    # Informational banner only; credentials are not verified here
+    print("Checking SkyPilot configuration...")
+    print(f"  Using cloud: {options.cloud}")
+    print("  (Run 'sky check' to verify cloud credentials)")
+
+    # Run the test with the parsed options
+    passed = test_skypilot_cluster_launch(
+        cluster_name=options.cluster_name,
+        cloud=options.cloud,
+        cpus=options.cpus,
+        timeout_minutes=options.timeout,
+    )
+
+    sys.exit(int(not passed))
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/python/tests/test_skypilot_job.py b/python/tests/test_skypilot_job.py
new file mode 100644
index 000000000..295c7a74d
--- /dev/null
+++ b/python/tests/test_skypilot_job.py
@@ -0,0 +1,493 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+"""Tests for the SkyPilot job integration."""
+
+import os
+import sys
+import tempfile
+from typing import Any, Dict, List, Optional, Tuple
+from unittest import mock
+
+import pytest
+
+
+# Check if SkyPilot is available
+try:
+    import sky
+
+    HAS_SKYPILOT = True
+except ImportError:
+    HAS_SKYPILOT = False
+
+# Check if Monarch bindings are available
+try:
+    from monarch._rust_bindings.monarch_hyperactor.config import configure
+
+    HAS_MONARCH_BINDINGS = True
+except ImportError:
+    HAS_MONARCH_BINDINGS = False
+
+# Skip all tests in this module if SkyPilot or Monarch bindings are not installed
+pytestmark = [
+    pytest.mark.skipif(not HAS_SKYPILOT, reason="SkyPilot not installed"),
+    pytest.mark.skipif(not HAS_MONARCH_BINDINGS, reason="Monarch bindings not available"),
+]
+
+
+class MockClusterHandle:
+    """Test double standing in for CloudVmRayResourceHandle."""
+
+    def __init__(self, cluster_name: str, node_ips: List[Tuple[str, str]]):
+        # The mock uses one name for both the local and the on-cloud cluster
+        self.cluster_name = cluster_name
+        self.cluster_name_on_cloud = cluster_name
+
+        # node_ips holds (internal_ip, external_ip) pairs, one per node, and
+        # the node count is derived from its length
+        self.stable_internal_external_ips = node_ips
+        self.launched_nodes = len(node_ips)
+
+
+class MockStatusResponse:
+    """Test double for one cluster record obtained through sky.status()."""
+
+    def __init__(
+        self,
+        name: str,
+        status: "sky.ClusterStatus",
+        handle: Optional[MockClusterHandle] = None,
+    ):
+        # Plain attribute bag: the three fields are stored unchanged
+        self.name, self.status = name, status
+        self.handle = handle
+
+
+@pytest.fixture
+def mock_sky():
+    """Fixture to mock SkyPilot SDK functions."""
+    # Swap in MockClusterHandle so _get_node_ips' isinstance check accepts the mocks
+    with mock.patch("monarch._src.job.skypilot.sky") as mock_sky_module, mock.patch(
+        "monarch._src.job.skypilot.CloudVmRayResourceHandle", MockClusterHandle
+    ):
+        # Mock ClusterStatus enum
+        mock_sky_module.ClusterStatus = sky.ClusterStatus
+
+        # Mock sky.launch to return a mock request_id
+        mock_sky_module.launch.return_value = "mock-request-id"
+
+        # Mock sky.get to return appropriate results
+        def mock_get(request_id):
+            if request_id == "mock-request-id":
+                # Return (job_id, handle) for launch
+                return (
+                    1,
+                    MockClusterHandle(
+                        "test-cluster",
+                        [("10.0.0.1", "1.2.3.4"), ("10.0.0.2", "1.2.3.5")],
+                    ),
+                )
+            elif request_id == "mock-status-request-id":
+                # Return list of status responses
+                return [
+                    MockStatusResponse(
+                        "test-cluster",
+                        sky.ClusterStatus.UP,
+                        MockClusterHandle(
+                            "test-cluster",
+                            [("10.0.0.1", "1.2.3.4"), ("10.0.0.2", "1.2.3.5")],
+                        ),
+                    )
+                ]
+            # "mock-down-request-id" and unknown IDs resolve to None
+            return None
+
+        mock_sky_module.get.side_effect = mock_get
+
+        # Mock sky.status
+        mock_sky_module.status.return_value = "mock-status-request-id"
+
+        # Mock sky.down
+        mock_sky_module.down.return_value = "mock-down-request-id"
+
+        # sky.Task becomes a MagicMock; sky.Resources stays the real class
+        mock_sky_module.Task = mock.MagicMock()
+        mock_sky_module.Resources = sky.Resources
+
+        yield mock_sky_module
+
+
+@pytest.fixture
+def mock_attach_to_workers():
+    """Patch _attach_to_workers_wrapper so it returns lightweight fake meshes."""
+
+    # Simplest possible HostMesh substitute: it only remembers its name
+    class MockHostMesh:
+        def __init__(self, name):
+            self.name = name
+
+    def fake_attach(name, ca, workers):
+        return MockHostMesh(name)
+
+    target = "monarch._src.job.skypilot._attach_to_workers_wrapper"
+    with mock.patch(target) as patched:
+        patched.side_effect = fake_attach
+        yield patched
+
+
+@pytest.fixture
+def mock_configure_transport():
+    """Replace _configure_transport with a mock while a test runs."""
+    # Patched where SkyPilotJob looks it up: the skypilot module namespace
+    target = "monarch._src.job.skypilot._configure_transport"
+    with mock.patch(target) as patched:
+        yield patched
+
+
+@pytest.mark.skipif(not HAS_SKYPILOT, reason="SkyPilot not installed")
+def test_skypilot_job_import():
+    """monarch.job must export the SkyPilotJob name."""
+    from monarch.job import SkyPilotJob
+
+    # The import above already proves the name is exported. monarch.job falls
+    # back to None when importing the implementation fails, so the class is
+    # only required to be real when the Monarch bindings are present.
+    bindings_missing = not HAS_MONARCH_BINDINGS
+    assert bindings_missing or SkyPilotJob is not None
+
+
+def test_skypilot_job_init(mock_configure_transport):
+    """Constructor arguments are stored and runtime state starts out empty."""
+    from monarch._src.job.skypilot import SkyPilotJob
+
+    sky_job = SkyPilotJob(
+        meshes={"trainers": 2, "workers": 1},
+        cluster_name="test-cluster",
+        monarch_port=12345,
+    )
+    # Nothing has been launched yet, then the stored configuration is checked
+    assert sky_job._launched_cluster_name is None
+    assert sky_job._node_ips == []
+    assert sky_job._port == 12345
+    assert sky_job._cluster_name == "test-cluster"
+    assert sky_job._meshes == {"trainers": 2, "workers": 1}
+
+
+def test_skypilot_job_init_with_resources(mock_configure_transport):
+    """A sky.Resources object passed to the constructor is kept on the job."""
+    from monarch._src.job.skypilot import SkyPilotJob
+
+    gpu_spec = sky.Resources(accelerators="A100:1")
+
+    sky_job = SkyPilotJob(
+        meshes={"trainers": 4},
+        resources=gpu_spec,
+        cluster_name="gpu-cluster",
+    )
+
+    assert sky_job._meshes == {"trainers": 4}
+    assert sky_job._resources == gpu_spec
+
+
+def test_skypilot_job_build_worker_command(mock_configure_transport):
+    """The generated worker command mentions the port, address and entry point."""
+    from monarch._src.job.skypilot import SkyPilotJob
+
+    sky_job = SkyPilotJob(meshes={"trainers": 1}, monarch_port=22222)
+    command = sky_job._build_worker_command()
+
+    # Every fragment below must appear somewhere in the generated command
+    expected_fragments = (
+        "socket.gethostname()",
+        "tcp://",
+        "22222",
+        "run_worker_loop_forever",
+        'ca="trust_all_connections"',
+    )
+    for fragment in expected_fragments:
+        assert fragment in command
+
+
+def test_skypilot_job_create(mock_sky, mock_attach_to_workers, mock_configure_transport):
+    """_create launches through sky.launch and records the cluster name."""
+    from monarch._src.job.skypilot import SkyPilotJob
+
+    sky_job = SkyPilotJob(
+        meshes={"trainers": 2},
+        cluster_name="test-cluster",
+    )
+
+    # None means interactive mode (no client script)
+    sky_job._create(None)
+
+    # The requested name is remembered as the launched cluster
+    assert sky_job._launched_cluster_name == "test-cluster"
+
+    # Exactly one launch request was issued
+    mock_sky.launch.assert_called_once()
+
+
+def test_skypilot_job_create_batch_mode_raises(mock_sky, mock_configure_transport):
+    """_create rejects a client script because batch mode is unsupported."""
+    from monarch._src.job.skypilot import SkyPilotJob
+
+    batch_job = SkyPilotJob(meshes={"trainers": 1})
+    # Any non-None client script must be refused
+    with pytest.raises(RuntimeError, match="batch-mode scripts"):
+        batch_job._create("some_script.py")
+
+
+def test_skypilot_job_state(mock_sky, mock_attach_to_workers, mock_configure_transport):
+    """_state attaches to the workers and exposes the mesh on the result."""
+    from monarch._src.job.skypilot import SkyPilotJob
+
+    sky_job = SkyPilotJob(
+        meshes={"trainers": 2},
+        cluster_name="test-cluster",
+    )
+
+    # The job has to be applied before its state can be queried
+    sky_job.apply()
+
+    # Build the state from the mocked cluster
+    state = sky_job._state()
+
+    # Inspect the keyword arguments of the most recent attach call
+    mock_attach_to_workers.assert_called()
+    attach_kwargs = mock_attach_to_workers.call_args.kwargs
+
+    # Mesh name and CA setting are forwarded unchanged
+    assert attach_kwargs["name"] == "trainers"
+    assert attach_kwargs["ca"] == "trust_all_connections"
+    # Two worker addresses, each containing the tcp:// scheme
+    addresses = attach_kwargs["workers"]
+    assert len(addresses) == 2
+    assert all("tcp://" in address for address in addresses)
+
+    # The mesh is reachable as an attribute of the state
+    assert hasattr(state, "trainers")
+
+
+def test_skypilot_job_state_multiple_meshes(mock_sky, mock_attach_to_workers, mock_configure_transport):
+    """Test _state with multiple meshes."""
+    from monarch._src.job.skypilot import SkyPilotJob
+
+    # (internal, external) address pairs for a three-node cluster, shared by
+    # the launch result and the status result below
+    three_nodes = [
+        ("10.0.0.1", "1.2.3.4"),
+        ("10.0.0.2", "1.2.3.5"),
+        ("10.0.0.3", "1.2.3.6"),
+    ]
+
+    # Replacement for sky.get that reports three nodes
+    def mock_get_multi(request_id):
+        if request_id == "mock-request-id":
+            # Launch result: (job_id, handle)
+            return (
+                1,
+                MockClusterHandle("test-cluster", three_nodes),
+            )
+        elif request_id == "mock-status-request-id":
+            # Status result: a list holding one cluster record
+            return [
+                MockStatusResponse(
+                    "test-cluster",
+                    sky.ClusterStatus.UP,
+                    MockClusterHandle("test-cluster", three_nodes),
+                )
+            ]
+        return None
+
+    mock_sky.get.side_effect = mock_get_multi
+
+    # Two meshes that together need all three nodes
+    split_job = SkyPilotJob(
+        meshes={"trainers": 2, "evaluator": 1},
+        cluster_name="test-cluster",
+    )
+
+    # The job has to be applied before its state can be queried
+    split_job.apply()
+
+    # Build the state from the mocked three-node cluster
+    state = split_job._state()
+
+    # One attach call per mesh
+    assert mock_attach_to_workers.call_count == 2
+
+    # Both meshes are reachable as attributes of the state
+    for mesh_name in ("trainers", "evaluator"):
+        assert hasattr(state, mesh_name)
+
+
+def test_skypilot_job_kill(mock_sky, mock_attach_to_workers, mock_configure_transport):
+    """_kill tears the cluster down via sky.down and resets runtime state."""
+    from monarch._src.job.skypilot import SkyPilotJob
+
+    sky_job = SkyPilotJob(
+        meshes={"trainers": 1},
+        cluster_name="test-cluster",
+    )
+
+    # Launch first so there is a cluster to tear down
+    sky_job.apply()
+    assert sky_job._launched_cluster_name == "test-cluster"
+
+    # Tear the cluster down
+    sky_job._kill()
+
+    # sky.down received the launched cluster's name, exactly once
+    mock_sky.down.assert_called_once_with("test-cluster")
+
+    # Cached IPs and the cluster name are both gone
+    assert sky_job._node_ips == []
+    assert sky_job._launched_cluster_name is None
+
+
+def test_skypilot_job_can_run(mock_sky, mock_attach_to_workers, mock_configure_transport):
+    """can_run accepts an identical spec and rejects a different mesh layout."""
+    from monarch._src.job.skypilot import SkyPilotJob
+
+    running_job = SkyPilotJob(
+        meshes={"trainers": 2},
+        cluster_name="test-cluster",
+        monarch_port=22222,
+    )
+
+    same_spec = SkyPilotJob(
+        meshes={"trainers": 2},
+        cluster_name="test-cluster",
+        monarch_port=22222,
+    )
+
+    bigger_spec = SkyPilotJob(
+        meshes={"trainers": 4},  # Only the mesh size differs
+        cluster_name="test-cluster",
+        monarch_port=22222,
+    )
+
+    # Only the first job is launched; the other two act purely as specs
+    running_job.apply()
+
+    # Identical configuration: the running job can serve it
+    assert running_job.can_run(same_spec) is True
+
+    # Different mesh configuration: the running job cannot serve it
+    assert running_job.can_run(bigger_spec) is False
+
+
+def test_skypilot_job_jobs_active(mock_sky, mock_attach_to_workers, mock_configure_transport):
+    """Test the _jobs_active method."""
+    from monarch._src.job.skypilot import SkyPilotJob
+
+    job = SkyPilotJob(
+        meshes={"trainers": 1},
+        cluster_name="test-cluster",
+    )
+
+    # Before apply, should not be active
+    assert job._jobs_active() is False
+
+    # Apply the job
+    job.apply()
+
+    # After apply, should be active (mocked status returns UP)
+    assert job._jobs_active() is True
+
+
+def test_skypilot_job_serialization(mock_sky, mock_attach_to_workers, mock_configure_transport):
+    """Test that SkyPilotJob can be serialized and deserialized."""
+    from monarch._src.job.skypilot import SkyPilotJob
+    from monarch._src.job.job import job_loads
+
+    job = SkyPilotJob(
+        meshes={"trainers": 2, "workers": 1},
+        cluster_name="test-cluster",
+        monarch_port=33333,
+    )
+
+    # Serialize
+    serialized = job.dumps()
+
+    # Deserialize
+    loaded_job = job_loads(serialized)
+
+    # Check attributes
+    assert isinstance(loaded_job, SkyPilotJob)
+    assert loaded_job._meshes == {"trainers": 2, "workers": 1}
+    assert loaded_job._cluster_name == "test-cluster"
+    assert loaded_job._port == 33333
+
+
+def test_skypilot_job_with_setup_commands(mock_configure_transport):
+    """Test SkyPilotJob with custom setup commands."""
+    from monarch._src.job.skypilot import SkyPilotJob
+
+    setup = "pip install torch\npip install monarch"
+
+    job = SkyPilotJob(
+        meshes={"trainers": 1},
+        setup_commands=setup,
+    )
+
+    assert job._setup_commands == setup
+
+
+def test_skypilot_job_with_autostop(mock_configure_transport):
+    """Test SkyPilotJob with autostop configuration."""
+    from monarch._src.job.skypilot import SkyPilotJob
+
+    job = SkyPilotJob(
+        meshes={"trainers": 1},
+        idle_minutes_to_autostop=30,
+        down_on_autostop=True,
+    )
+
+    assert job._idle_minutes_to_autostop == 30
+    assert job._down_on_autostop is True
+
+
+# Integration test - only run if explicitly requested
+@pytest.mark.skip(reason="Integration test - run manually with --run-integration")
+def test_skypilot_job_integration():
+    """
+    Integration test that actually launches a SkyPilot cluster.
+
+    To run this test:
+        pytest tests/test_skypilot_job.py::test_skypilot_job_integration --run-integration
+
+    Make sure you have SkyPilot credentials configured.
+    """
+    from monarch._src.job.skypilot import SkyPilotJob
+
+    # Create a minimal job - just 1 node with cheap resources
+    job = SkyPilotJob(
+        meshes={"workers": 1},
+        resources=sky.Resources(
+            cloud=sky.AWS(),  # Change to your preferred cloud
+            cpus="2+",
+        ),
+        cluster_name="monarch-test-integration",
+        idle_minutes_to_autostop=5,
+        down_on_autostop=True,
+    )
+
+    try:
+        # Apply the job
+        job.apply()
+
+        # Check that we can get state
+        state = job.state()
+        assert hasattr(state, "workers")
+
+        print("Integration test passed!")
+    finally:
+        # Always clean up
+        job.kill()
+

From 17536f3168a1129861c02cb3426385f65a0ccc73 Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil@berkeley.edu>
Date: Fri, 5 Dec 2025 07:22:25 +0000
Subject: [PATCH 02/29] Add workdir and file_mounts parameters to SkyPilotJob

Summary:
This update introduces two new optional parameters, `workdir` and `file_mounts`, to the `SkyPilotJob` class. The `workdir` parameter allows users to specify a local directory to sync with the cluster, while `file_mounts` enables additional file mounts by mapping remote paths to local paths. These enhancements improve the flexibility and usability of job configurations in SkyPilot.
---
 python/monarch/_src/job/skypilot.py | 30 +++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/python/monarch/_src/job/skypilot.py b/python/monarch/_src/job/skypilot.py
index 39795d239..1a49a04f9 100644
--- a/python/monarch/_src/job/skypilot.py
+++ b/python/monarch/_src/job/skypilot.py
@@ -89,6 +89,8 @@ def __init__(
         down_on_autostop: bool = False,
         python_exe: str = "python",
         setup_commands: Optional[str] = None,
+        workdir: Optional[str] = None,
+        file_mounts: Optional[Dict[str, str]] = None,
     ) -> None:
         """
         Args:
@@ -104,7 +106,11 @@ def __init__(
                               just stopping it.
             python_exe: Python executable to use for worker processes.
             setup_commands: Optional setup commands to run before starting workers.
-                           Use this to install dependencies.
+                           Use this to install dependencies including Monarch.
+            workdir: Local directory to sync to the cluster. If provided, this
+                    directory will be uploaded to ~/sky_workdir on each node.
+            file_mounts: Dictionary mapping remote paths to local paths for
+                        additional file mounts.
         """
         if not HAS_SKYPILOT:
             raise ImportError(
@@ -128,6 +134,8 @@ def __init__(
         self._down_on_autostop = down_on_autostop
         self._python_exe = python_exe
         self._setup_commands = setup_commands
+        self._workdir = workdir
+        self._file_mounts = file_mounts
 
         # Runtime state
         self._launched_cluster_name: Optional[str] = None
@@ -154,8 +162,13 @@ def _create(self, client_script: Optional[str]) -> None:
             setup=setup if setup else None,
             run=worker_command,
             num_nodes=total_nodes,
+            workdir=self._workdir,
         )
 
+        # Add file mounts if provided
+        if self._file_mounts:
+            task.set_file_mounts(self._file_mounts)
+
         if self._resources is not None:
             task.set_resources(self._resources)
 
@@ -183,20 +196,21 @@ def _create(self, client_script: Optional[str]) -> None:
         logger.info(f"SkyPilot cluster '{cluster_name}' launched successfully")
 
     def _build_worker_command(self) -> str:
-        """Build the command to start Monarch workers on each node."""
-        # This command will be run on each node
-        # We use the node's IP to create a unique address for each worker
-        return f"""
+        """Build the bash command to start Monarch workers on each node."""
+        # This command will be run on each node via SkyPilot
+        # SkyPilot expects a bash script, so we wrap Python code in python -c
+        python_code = f'''
 import socket
 hostname = socket.gethostname()
-# Get the IP address of this node
 ip_addr = socket.gethostbyname(hostname)
 address = f"tcp://{{ip_addr}}:{self._port}"
 print(f"Starting Monarch worker at {{address}}")
-
 from monarch.actor import run_worker_loop_forever
 run_worker_loop_forever(address=address, ca="trust_all_connections")
-"""
+'''
+        # Escape single quotes in the Python code for bash
+        escaped_code = python_code.replace("'", "'\"'\"'")
+        return f"python -c '{escaped_code}'"
 
     def _get_node_ips(self) -> List[str]:
         """Get the IP addresses of all nodes in the cluster."""

From b2344bf547548db49ea7e72df5f55db1b2e13920 Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil@berkeley.edu>
Date: Fri, 5 Dec 2025 18:05:55 +0000
Subject: [PATCH 03/29] fixes

---
 python/monarch/_src/job/skypilot.py | 10 ++++++++--
 python/tests/test_skypilot_job.py   |  8 ++++----
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/python/monarch/_src/job/skypilot.py b/python/monarch/_src/job/skypilot.py
index 1a49a04f9..529997fe7 100644
--- a/python/monarch/_src/job/skypilot.py
+++ b/python/monarch/_src/job/skypilot.py
@@ -170,7 +170,12 @@ def _create(self, client_script: Optional[str]) -> None:
             task.set_file_mounts(self._file_mounts)
 
         if self._resources is not None:
-            task.set_resources(self._resources)
+            # Copy resources and override image_id to use PyTorch image with CUDA
+            # This ensures torchmonarch has access to CUDA libraries
+            resources = self._resources.copy(
+                image_id="docker:pytorch/pytorch:2.9.1-cuda12.6-cudnn9-devel"
+            )
+            task.set_resources(resources)
 
         # Generate cluster name if not provided
         cluster_name = self._cluster_name or f"monarch-{os.getpid()}"
@@ -210,7 +215,8 @@ def _build_worker_command(self) -> str:
 '''
         # Escape single quotes in the Python code for bash
         escaped_code = python_code.replace("'", "'\"'\"'")
-        return f"python -c '{escaped_code}'"
+        # Set timeout env var - setup takes time so we need longer than default 30s
+        return f"export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=5m && python -c '{escaped_code}'"
 
     def _get_node_ips(self) -> List[str]:
         """Get the IP addresses of all nodes in the cluster."""
diff --git a/python/tests/test_skypilot_job.py b/python/tests/test_skypilot_job.py
index 295c7a74d..b6af37a22 100644
--- a/python/tests/test_skypilot_job.py
+++ b/python/tests/test_skypilot_job.py
@@ -1,12 +1,12 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
+# All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
 # pyre-unsafe
 
-"""Tests for the SkyPilot job integration."""
+"""Tests for the SkyPilot job integration."""
 
 import os
 import sys
@@ -425,8 +425,8 @@ def test_skypilot_job_serialization(mock_sky, mock_attach_to_workers, mock_confi
     assert loaded_job._port == 33333
 
 
-def test_skypilot_job_with_setup_commands(mock_configure_transport):
-    """Test SkyPilotJob with custom setup commands."""
+def test_skypilot_job_with_upup_commands(mock_configure_transport):
+    """Test SkyPilotJob with custom setup commands."""
     from monarch._src.job.skypilot import SkyPilotJob
 
     setup = "pip install torch\npip install monarch"

From 6803cbf12505e9023fecc988a9223ddbc202259b Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil@berkeley.edu>
Date: Fri, 5 Dec 2025 19:35:57 +0000
Subject: [PATCH 04/29] Add SkyPilot integration example and documentation

- Add SKY_README.md with comprehensive documentation:
  - Architecture overview
  - Implementation details
  - Usage examples
  - Troubleshooting guide
  - Networking considerations for Kubernetes

- Add python/examples/skypilot_getting_started.py:
  - Example script demonstrating Monarch actors on SkyPilot
  - Supports multiple clouds (Kubernetes, AWS, GCP, Azure)
  - Configurable via command-line arguments

- Update skypilot.py:
  - Add host mesh initialization wait
  - Improve logging for debugging
  - Fix worker command environment variable setup
---
 SKY_README.md                               | 293 ++++++++++++++++++++
 python/examples/skypilot_getting_started.py | 266 ++++++++++++++++++
 python/monarch/_src/job/skypilot.py         |  44 ++-
 3 files changed, 598 insertions(+), 5 deletions(-)
 create mode 100644 SKY_README.md
 create mode 100644 python/examples/skypilot_getting_started.py

diff --git a/SKY_README.md b/SKY_README.md
new file mode 100644
index 000000000..1558c62da
--- /dev/null
+++ b/SKY_README.md
@@ -0,0 +1,293 @@
+# Monarch + SkyPilot Integration
+
+This document describes the SkyPilot integration for Monarch, which enables running Monarch actors on cloud infrastructure provisioned by SkyPilot.
+
+## Overview
+
+SkyPilot is a framework for running ML workloads on any cloud (AWS, GCP, Azure, Lambda, Kubernetes, etc.). The `SkyPilotJob` class in Monarch provides a seamless integration that:
+
+1. **Provisions cloud instances** using SkyPilot's unified API
+2. **Installs Monarch** (`torchmonarch` from PyPI) on remote nodes
+3. **Starts Monarch workers** on each node listening for connections
+4. **Connects clients** to workers using TCP for distributed actor communication
+
+## Architecture
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                         Client Machine                          │
+│  ┌─────────────────────────────────────────────────────────┐   │
+│  │                    SkyPilotJob                           │   │
+│  │  - Calls sky.launch() to provision cloud instances       │   │
+│  │  - Configures setup commands to install torchmonarch     │   │
+│  │  - Builds worker command with run_worker_loop_forever()  │   │
+│  │  - Calls attach_to_workers() to create HostMesh          │   │
+│  └─────────────────────────────────────────────────────────┘   │
+└───────────────────────────────┬─────────────────────────────────┘
+                                │ TCP connections (port 22222)
+        ┌───────────────────────┼───────────────────────┐
+        │                       │                       │
+        ▼                       ▼                       ▼
+┌───────────────┐       ┌───────────────┐       ┌───────────────┐
+│   Worker 1    │       │   Worker 2    │       │   Worker N    │
+│ (Cloud Node)  │       │ (Cloud Node)  │       │ (Cloud Node)  │
+│               │       │               │       │               │
+│ run_worker_   │       │ run_worker_   │       │ run_worker_   │
+│ loop_forever()│       │ loop_forever()│       │ loop_forever()│
+│               │       │               │       │               │
+│ tcp://<ip>:   │       │ tcp://<ip>:   │       │ tcp://<ip>:   │
+│   22222       │       │   22222       │       │   22222       │
+└───────────────┘       └───────────────┘       └───────────────┘
+```
+
+## Implementation Details
+
+### Files
+
+- **`python/monarch/_src/job/skypilot.py`**: Core `SkyPilotJob` implementation
+- **`python/monarch/job/__init__.py`**: Exports `SkyPilotJob` (with graceful ImportError handling)
+- **`python/tests/test_skypilot_job.py`**: Unit tests with mocked SkyPilot
+- **`python/tests/test_skypilot_integration.py`**: Integration test scaffolding
+- **`python/examples/skypilot_getting_started.py`**: Example demonstrating usage
+
+### Key Classes and Functions
+
+#### `SkyPilotJob(JobTrait)`
+
+Main job class that implements the Monarch `JobTrait` interface.
+
+```python
+from monarch.job import SkyPilotJob
+import sky
+
+job = SkyPilotJob(
+    meshes={"trainers": 2},           # 2 nodes for "trainers" mesh
+    resources=sky.Resources(
+        cloud=sky.Kubernetes(),
+        accelerators="H100:1",
+    ),
+    cluster_name="my-cluster",
+    idle_minutes_to_autostop=10,
+    down_on_autostop=True,
+    setup_commands="pip install torchmonarch",
+)
+
+state = job.state()  # Launches cluster and returns JobState
+hosts = state.trainers  # HostMesh with 2 nodes
+```
+
+#### Parameters
+
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `meshes` | `Dict[str, int]` | Mesh names to node counts |
+| `resources` | `sky.Resources` | SkyPilot resource specification |
+| `cluster_name` | `str` | Name for the cluster |
+| `monarch_port` | `int` | TCP port for workers (default: 22222) |
+| `idle_minutes_to_autostop` | `int` | Auto-stop after idle minutes |
+| `down_on_autostop` | `bool` | Terminate (not just stop) on autostop |
+| `setup_commands` | `str` | Shell commands to run before workers start |
+| `workdir` | `str` | Local directory to sync to cluster |
+| `file_mounts` | `Dict[str, str]` | Additional file mounts |
+
+### Worker Lifecycle
+
+1. **Launch**: `sky.launch()` creates the cluster with specified resources
+2. **Setup**: `setup_commands` run to install `torchmonarch`
+3. **Run**: Worker command executes `run_worker_loop_forever(address, ca)`
+4. **Connect**: Client calls `attach_to_workers()` to create `HostMesh`
+5. **Teardown**: `sky.down()` terminates the cluster
+
+### Environment Variables
+
+The following environment variables control timeouts. Set them on the client before importing `monarch`:
+
+```python
+os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s"  # Worker spawn timeout
+os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s"  # Message delivery timeout
+os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s"  # Proc mesh spawn timeout
+```
+
+## Requirements
+
+### Client Side
+- Monarch with Rust bindings (`pip install -e .` from source)
+- SkyPilot (`pip install skypilot`)
+- Configured cloud credentials (`sky check`)
+
+### Worker Side (installed via setup_commands)
+- `torchmonarch` from PyPI
+- **CUDA libraries** - torchmonarch requires `libcuda.so.1`
+- This means workers **must run on GPU nodes**
+
+## Usage
+
+### Basic Example
+
+```python
+import sky
+from monarch.job import SkyPilotJob
+from monarch.actor import Actor, endpoint
+
+class MyActor(Actor):
+    @endpoint
+    def hello(self) -> str:
+        return "Hello from cloud!"
+
+# Create job
+job = SkyPilotJob(
+    meshes={"workers": 2},
+    resources=sky.Resources(
+        cloud=sky.AWS(),
+        accelerators="A100:1",
+    ),
+    setup_commands="pip install torchmonarch",
+)
+
+# Launch and get state
+state = job.state()
+hosts = state.workers
+
+# Spawn processes and actors
+procs = hosts.spawn_procs(per_host={"gpus": 1})
+actors = procs.spawn("my_actors", MyActor)
+
+# Interact with actors
+results = actors.hello.call().get()
+print(results)  # ["Hello from cloud!", "Hello from cloud!"]
+
+# Cleanup
+job.kill()
+```
+
+### Running the Example
+
+```bash
+# Install dependencies
+pip install skypilot
+pip install -e .  # Build Monarch from source
+
+# Configure cloud credentials
+sky check
+
+# Run example
+cd python/examples
+python skypilot_getting_started.py \
+    --cloud kubernetes \
+    --num-hosts 2 \
+    --accelerator "H100:1" \
+    --cluster-name my-monarch-cluster
+```
+
+### Supported Clouds
+
+- **Kubernetes**: Use `sky.Kubernetes()`; select the Kubernetes context with the `region` parameter (`--region` in the example script)
+- **AWS**: Use `sky.AWS()`
+- **GCP**: Use `sky.GCP()`
+- **Azure**: Use `sky.Azure()`
+- **Lambda Labs**: Use `sky.Lambda()`
+- And others supported by SkyPilot
+
+## Networking Considerations
+
+### Kubernetes
+
+When using Kubernetes, the client and workers must be in the **same Kubernetes cluster** for pod-to-pod communication. Use the `region` parameter to specify the Kubernetes context:
+
+```python
+resources=sky.Resources(
+    cloud=sky.Kubernetes(),
+    region="my-k8s-context",  # Must match client's cluster
+)
+```
+
+### Public Clouds (AWS, GCP, Azure)
+
+SkyPilot sets up instance networking automatically. Workers get public IPs that clients can connect to, as long as the Monarch port is reachable (see Firewall below).
+
+### Firewall
+
+Ensure port 22222 (or your custom `monarch_port`) is accessible:
+- Kubernetes: Pod networking should handle this
+- AWS: Security groups
+- GCP: Firewall rules
+- Azure: Network security groups
+
+## Troubleshooting
+
+### "libcuda.so.1: cannot open shared object file"
+
+**Cause**: Workers are running on CPU-only nodes, but `torchmonarch` requires CUDA.
+
+**Solution**: Request GPU nodes:
+```python
+resources=sky.Resources(accelerators="H100:1")
+```
+
+### "No route to host" or connection timeouts
+
+**Cause**: Client and workers are in different networks (e.g., different Kubernetes clusters).
+
+**Solution**: Ensure client and workers are in the same network:
+- For Kubernetes: Use `region` parameter to specify the correct context
+- For public clouds: Check security group / firewall rules
+
+### "error spawning proc mesh: statuses: Timeout"
+
+**Causes**:
+1. Workers aren't listening on the expected port
+2. Network connectivity issues
+3. Workers crashed during startup
+
+**Debug steps**:
+1. Check SkyPilot logs: `sky logs <cluster-name>`
+2. SSH into the cluster: `ssh <cluster-name>` (SkyPilot adds the cluster name to your SSH config)
+3. Check if port is listening: `ss -tlnp | grep 22222`
+4. Check Monarch logs: `/tmp/sky/monarch_log.log`
+
+### Workers crash immediately
+
+Check SkyPilot logs for the error:
+```bash
+sky logs <cluster-name>
+```
+
+Common issues:
+- Missing CUDA libraries → use GPU nodes
+- torchmonarch installation failed → check setup_commands
+- Python version mismatch → ensure compatible Python version
+
+## Testing
+
+### Unit Tests (with mocked SkyPilot)
+
+```bash
+cd python
+pytest tests/test_skypilot_job.py -v
+```
+
+### Integration Tests (requires real cloud)
+
+```bash
+cd python
+pytest tests/test_skypilot_integration.py -v --cloud kubernetes
+```
+
+## Comparison with SlurmJob
+
+| Feature | SkyPilotJob | SlurmJob |
+|---------|-------------|----------|
+| Cloud Support | Multi-cloud (AWS, GCP, Azure, K8s, etc.) | HPC clusters only |
+| Setup | Automatic via SkyPilot | Requires Slurm installation |
+| Autoscaling | Supported | Depends on cluster |
+| Cost Optimization | Automatic (cheapest region) | N/A |
+| Worker Discovery | Via cluster handle IPs | Via squeue hostnames |
+
+## Future Work
+
+- [ ] Support for spot/preemptible instances
+- [ ] Multi-region deployments
+- [ ] Automatic failover on spot termination
+- [ ] Integration with SkyPilot managed jobs
+- [ ] Support for batch mode (client script on cluster)
+
diff --git a/python/examples/skypilot_getting_started.py b/python/examples/skypilot_getting_started.py
new file mode 100644
index 000000000..64fbf9658
--- /dev/null
+++ b/python/examples/skypilot_getting_started.py
@@ -0,0 +1,266 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Monarch Getting Started with SkyPilot
+=====================================
+
+This script demonstrates running Monarch actors on cloud infrastructure
+provisioned by SkyPilot. It follows the Monarch getting started guide
+but uses SkyPilot to launch the worker nodes.
+
+Prerequisites:
+- Monarch installed with its Rust bindings (build with `pip install -e .` in monarch/)
+- SkyPilot installed and configured (run `sky check`)
+- torchmonarch available on PyPI (requires CUDA on remote nodes)
+
+Usage:
+    python skypilot_getting_started.py
+
+    # With explicit options:
+    python skypilot_getting_started.py --cloud kubernetes --num-hosts 2
+
+See SKY_README.md for full documentation.
+"""
+
+import argparse
+import os
+import sys
+
+# Set timeouts before importing monarch - worker setup takes time
+os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s"
+os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s"
+os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s"
+
+# Check dependencies before importing
+try:
+    import sky
+except ImportError:
+    print("ERROR: SkyPilot is not installed. Run: pip install skypilot")
+    sys.exit(1)
+
+try:
+    from monarch.job import SkyPilotJob
+    from monarch.actor import Actor, endpoint, ProcMesh, context
+except ImportError as e:
+    print(f"ERROR: Monarch is not properly installed: {e}")
+    print("\nTo install Monarch, you need to build it from source:")
+    print("  cd monarch/")
+    print("  pip install -e .")
+    print("\nThis requires the Rust toolchain and other dependencies.")
+    print("See monarch/README.md for full installation instructions.")
+    sys.exit(1)
+
+# ============================================================================
+# Step 1: Define our Actors (same as getting started guide)
+# ============================================================================
+
+
+class Counter(Actor):
+    """A simple counter actor that demonstrates basic messaging."""
+
+    def __init__(self, initial_value: int = 0):
+        self.value = initial_value
+
+    @endpoint
+    def increment(self) -> None:
+        self.value += 1
+
+    @endpoint
+    def get_value(self) -> int:
+        return self.value
+
+
+class Trainer(Actor):
+    """A trainer actor that demonstrates distributed training patterns."""
+
+    @endpoint
+    def step(self) -> str:
+        my_point = context().message_rank
+        return f"Trainer {my_point} taking a step."
+
+    @endpoint
+    def get_info(self) -> str:
+        rank = context().actor_instance.rank
+        return f"Trainer at rank {rank}"
+
+
+# ============================================================================
+# Step 2: Create a SkyPilot Job to provision cloud infrastructure
+# ============================================================================
+
+
+def get_cloud(cloud_name: str):
+    """Get SkyPilot cloud object from name."""
+    clouds = {
+        "kubernetes": sky.Kubernetes,
+        "aws": sky.AWS,
+        "gcp": sky.GCP,
+        "azure": sky.Azure,
+    }
+    if cloud_name.lower() not in clouds:
+        raise ValueError(f"Unknown cloud: {cloud_name}. Available: {list(clouds.keys())}")
+    return clouds[cloud_name.lower()]()
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Monarch Getting Started with SkyPilot")
+    parser.add_argument(
+        "--cloud",
+        default="kubernetes",
+        help="Cloud provider to use (kubernetes, aws, gcp, azure)",
+    )
+    parser.add_argument(
+        "--num-hosts",
+        type=int,
+        default=2,
+        help="Number of host nodes to provision",
+    )
+    parser.add_argument(
+        "--gpus-per-host",
+        type=int,
+        default=2,
+        help="Number of GPU processes per host",
+    )
+    parser.add_argument(
+        "--cluster-name",
+        default="monarch-getting-started",
+        help="Name for the SkyPilot cluster",
+    )
+    parser.add_argument(
+        "--accelerator",
+        default="H100:1",
+        help="GPU accelerator to request (e.g., H100:1, A100:1, V100:1)",
+    )
+    parser.add_argument(
+        "--region",
+        default=None,
+        help="Cloud region/Kubernetes context to use",
+    )
+    args = parser.parse_args()
+
+    print("=" * 60)
+    print("Monarch Getting Started with SkyPilot")
+    print("=" * 60)
+    print(f"\nConfiguration:")
+    print(f"  Cloud: {args.cloud}")
+    print(f"  Hosts: {args.num_hosts}")
+    print(f"  GPUs per host: {args.gpus_per_host}")
+    print(f"  Accelerator: {args.accelerator}")
+    print(f"  Cluster name: {args.cluster_name}")
+    if args.region:
+        print(f"  Region: {args.region}")
+
+    # Create a SkyPilotJob to provision nodes
+    # This will launch cloud instances and start Monarch workers on them
+    print("\n[1] Creating SkyPilot job...")
+
+    # Setup commands to install Monarch on the remote nodes
+    # torchmonarch is the PyPI package name for Monarch
+    setup_commands = """
+sudo apt-get update && sudo apt-get install -y rdma-core libibverbs1 libmlx5-1 libibverbs-dev || true
+pip install torchmonarch
+echo "DONE INSTALLING TORCHMONARCH"
+"""
+
+    # Build resources specification
+    resources_kwargs = {
+        "cloud": get_cloud(args.cloud),
+        "cpus": "2+",
+        "accelerators": args.accelerator,  # GPU required - torchmonarch needs CUDA
+    }
+    if args.region:
+        resources_kwargs["region"] = args.region
+
+    job = SkyPilotJob(
+        # Define the mesh of hosts we need
+        meshes={"trainers": args.num_hosts},
+        # Specify cloud resources - GPU required for torchmonarch (needs CUDA)
+        resources=sky.Resources(**resources_kwargs),
+        cluster_name=args.cluster_name,
+        # Auto-cleanup after 10 minutes of idle time
+        idle_minutes_to_autostop=10,
+        down_on_autostop=True,
+        # Setup commands to install dependencies
+        setup_commands=setup_commands,
+    )
+
+    try:
+        # Get the job state - this launches the cluster and returns HostMeshes
+        print("\n[2] Launching cluster and starting Monarch workers...")
+        state = job.state()
+
+        # Get our host mesh
+        hosts = state.trainers
+        print(f"    Got host mesh with extent: {hosts.extent}")
+
+        # ====================================================================
+        # Step 3: Spawn processes and actors on the cloud hosts
+        # ====================================================================
+
+        print("\n[3] Spawning processes on cloud hosts...")
+        # Create a process mesh - GPU processes per host
+        procs: ProcMesh = hosts.spawn_procs(per_host={"gpus": args.gpus_per_host})
+        print(f"    Process mesh extent: {procs.extent}")
+
+        # Spawn counter actors
+        print("\n[4] Spawning Counter actors...")
+        counters: Counter = procs.spawn("counters", Counter, initial_value=0)
+
+        # ====================================================================
+        # Step 4: Interact with the actors
+        # ====================================================================
+
+        # Broadcast increment to all counters
+        print("\n[5] Broadcasting increment to all counters...")
+        counters.increment.broadcast()
+        counters.increment.broadcast()
+        counters.increment.broadcast()
+
+        # Get all counter values
+        print("\n[6] Getting counter values...")
+        values = counters.get_value.call().get()
+        print(f"    Counter values: {values}")
+
+        # Spawn trainer actors
+        print("\n[7] Spawning Trainer actors...")
+        trainers: Trainer = procs.spawn("trainers", Trainer)
+
+        # Do a training step
+        print("\n[8] Performing distributed training step...")
+        results = trainers.step.call().get()
+        for r in results:
+            print(f"    {r}")
+
+        # Get trainer info
+        print("\n[9] Getting trainer info...")
+        info = trainers.get_info.call().get()
+        for i in info:
+            print(f"    {i}")
+
+        print("\n" + "=" * 60)
+        print("SUCCESS! Monarch actors ran on SkyPilot cluster!")
+        print("=" * 60)
+
+    except Exception as e:
+        print(f"\nERROR: {e}")
+        import traceback
+        traceback.print_exc()
+        print(f"\n[10] ERROR - not cleaning up cluster for debugging...")
+        print(f"    You can debug with: sky ssh {args.cluster_name}")
+        print(f"    To clean up later: sky down {args.cluster_name}")
+        raise
+    else:
+        # Clean up - tear down the SkyPilot cluster
+        print("\n[10] Cleaning up SkyPilot cluster...")
+        job.kill()
+        print("    Cluster terminated.")
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/python/monarch/_src/job/skypilot.py b/python/monarch/_src/job/skypilot.py
index 529997fe7..21454de45 100644
--- a/python/monarch/_src/job/skypilot.py
+++ b/python/monarch/_src/job/skypilot.py
@@ -204,19 +204,42 @@ def _build_worker_command(self) -> str:
         """Build the bash command to start Monarch workers on each node."""
         # This command will be run on each node via SkyPilot
         # SkyPilot expects a bash script, so we wrap Python code in python -c
+        # Note: Use IP address (not hostname) for the worker address since
+        # Kubernetes hostnames may not resolve across pods
         python_code = f'''
 import socket
+import logging
+import sys
+
+# Enable verbose logging
+logging.basicConfig(level=logging.DEBUG, stream=sys.stdout, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
+
 hostname = socket.gethostname()
 ip_addr = socket.gethostbyname(hostname)
 address = f"tcp://{{ip_addr}}:{self._port}"
-print(f"Starting Monarch worker at {{address}}")
-from monarch.actor import run_worker_loop_forever
-run_worker_loop_forever(address=address, ca="trust_all_connections")
+print(f"Starting Monarch worker at {{address}} (hostname={{hostname}})", flush=True)
+sys.stdout.flush()
+
+try:
+    from monarch.actor import run_worker_loop_forever
+    print(f"Imported run_worker_loop_forever successfully", flush=True)
+    print(f"Worker ready and listening...", flush=True)
+    run_worker_loop_forever(address=address, ca="trust_all_connections")
+except Exception as e:
+    print(f"ERROR in worker: {{e}}", flush=True)
+    import traceback
+    traceback.print_exc()
+    raise
 '''
         # Escape single quotes in the Python code for bash
         escaped_code = python_code.replace("'", "'\"'\"'")
-        # Set timeout env var - setup takes time so we need longer than default 30s
-        return f"export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=5m && python -c '{escaped_code}'"
+        # Set timeout env vars - setup takes time so we need longer than default 30s
+        env_vars = " ".join([
+            "export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=5m",
+            "export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT=5m",
+            "export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE=5m",
+        ])
+        return f"{env_vars} && python -c '{escaped_code}'"
 
     def _get_node_ips(self) -> List[str]:
         """Get the IP addresses of all nodes in the cluster."""
@@ -306,12 +329,23 @@ def _state(self) -> JobState:
             ip_idx += num_nodes
 
             workers = [f"tcp://{ip}:{self._port}" for ip in mesh_ips]
+            logger.info(f"Connecting to workers for mesh '{mesh_name}': {workers}")
 
             host_mesh = _attach_to_workers_wrapper(
                 name=mesh_name,
                 ca="trust_all_connections",
                 workers=workers,
             )
+            
+            # Wait for the host mesh to be initialized (connections established)
+            logger.info(f"Waiting for host mesh '{mesh_name}' to initialize...")
+            host_mesh.initialized.get()
+            logger.info(f"Host mesh '{mesh_name}' initialized successfully")
+            
+            # Give connections a moment to fully stabilize
+            time.sleep(5)
+            logger.info(f"Host mesh '{mesh_name}' ready")
+            
             host_meshes[mesh_name] = host_mesh
 
         return JobState(host_meshes)

From a740ae1d317ee8e31ef465c2fccd6bc9e9fed279 Mon Sep 17 00:00:00 2001
From: Romil <romil.bhardwaj@gmail.com>
Date: Fri, 5 Dec 2025 23:51:38 +0000
Subject: [PATCH 05/29] Working example

---
 python/examples/skypilot_getting_started.py | 54 ++++++++++++--
 python/monarch/_src/job/skypilot.py         | 56 ++++++++++++---
 test_worker_setup.yaml                      | 78 +++++++++++++++++++++
 3 files changed, 172 insertions(+), 16 deletions(-)
 create mode 100644 test_worker_setup.yaml

diff --git a/python/examples/skypilot_getting_started.py b/python/examples/skypilot_getting_started.py
index 64fbf9658..b1841dcde 100644
--- a/python/examples/skypilot_getting_started.py
+++ b/python/examples/skypilot_getting_started.py
@@ -133,7 +133,7 @@ def main():
     )
     parser.add_argument(
         "--accelerator",
-        default="H100:1",
+        default="H200:1",
         help="GPU accelerator to request (e.g., H100:1, A100:1, V100:1)",
     )
     parser.add_argument(
@@ -160,11 +160,51 @@ def main():
     print("\n[1] Creating SkyPilot job...")
 
     # Setup commands to install Monarch on the remote nodes
-    # torchmonarch is the PyPI package name for Monarch
+    # Build from source to ensure client/worker version compatibility
+    # NOTE: Currently builds WITHOUT tensor engine due to old rdma-core on Ubuntu 20.04
     setup_commands = """
-sudo apt-get update && sudo apt-get install -y rdma-core libibverbs1 libmlx5-1 libibverbs-dev || true
-pip install torchmonarch
-echo "DONE INSTALLING TORCHMONARCH"
+set -ex
+
+# Add PPA for newer toolchains
+sudo apt-get update
+sudo apt-get install -y software-properties-common
+sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+sudo apt-get update
+
+# Install system dependencies
+sudo apt-get install -y \
+  build-essential \
+  ninja-build \
+  g++-11 \
+  rdma-core \
+  libibverbs1 \
+  libmlx5-1 \
+  libibverbs-dev \
+  curl \
+  pkg-config \
+  libssl-dev
+
+# Install CUDA toolkit and NCCL
+wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt-get update
+sudo apt-get install -y cuda-toolkit-12-1
+sudo apt-get install -y --allow-change-held-packages libnccl2=2.28.9-1+cuda12.9 libnccl-dev=2.28.9-1+cuda12.9
+
+# Install Rust
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+source $HOME/.cargo/env
+rustup default nightly
+
+# Install Python dependencies
+cd ~/sky_workdir
+pip install setuptools-rust maturin
+pip install -r torch-requirements.txt -r build-requirements.txt
+
+# Build Monarch (without tensor engine due to old rdma-core)
+CC=gcc-11 CXX=g++-11 USE_TENSOR_ENGINE=0 pip install --no-build-isolation .
+
+echo "DONE INSTALLING MONARCH"
 """
 
     # Build resources specification
@@ -187,6 +227,10 @@ def main():
         down_on_autostop=True,
         # Setup commands to install dependencies
         setup_commands=setup_commands,
+        # Sync Monarch source to workers for building
+        workdir="/home/sky/dev/monarch",
+        # Use default python (same as used by pip in setup)
+        python_exe="python",
     )
 
     try:
diff --git a/python/monarch/_src/job/skypilot.py b/python/monarch/_src/job/skypilot.py
index 21454de45..1d8c40427 100644
--- a/python/monarch/_src/job/skypilot.py
+++ b/python/monarch/_src/job/skypilot.py
@@ -170,12 +170,7 @@ def _create(self, client_script: Optional[str]) -> None:
             task.set_file_mounts(self._file_mounts)
 
         if self._resources is not None:
-            # Copy resources and override image_id to use PyTorch image with CUDA
-            # This ensures torchmonarch has access to CUDA libraries
-            resources = self._resources.copy(
-                image_id="docker:pytorch/pytorch:2.9.1-cuda12.6-cudnn9-devel"
-            )
-            task.set_resources(resources)
+            task.set_resources(self._resources)
 
         # Generate cluster name if not provided
         cluster_name = self._cluster_name or f"monarch-{os.getpid()}"
@@ -199,6 +194,45 @@ def _create(self, client_script: Optional[str]) -> None:
 
         self._launched_cluster_name = cluster_name
         logger.info(f"SkyPilot cluster '{cluster_name}' launched successfully")
+        
+        # Wait for the job to be RUNNING (setup complete, run started)
+        self._wait_for_job_running(cluster_name, job_id, timeout=900)
+    
+    def _wait_for_job_running(self, cluster_name: str, job_id: int, timeout: int = 900) -> None:
+        """Wait for the SkyPilot job to reach RUNNING status (setup complete).
+
+        Raises RuntimeError if the job is FAILED/CANCELLED or ``timeout`` seconds elapse.
+        """
+        import time
+        start_time = time.time()
+        poll_interval = 10  # seconds
+
+        logger.info(f"Waiting for job {job_id} setup to complete (timeout={timeout}s)...")
+
+        while time.time() - start_time < timeout:
+            status = None
+            try:
+                # Only the queue lookup is best-effort: API errors are logged and retried.
+                jobs = sky.get(sky.queue(cluster_name))
+                for job in jobs:
+                    if job.get('id') == job_id or job.get('job_id') == job_id:
+                        status = job.get('status', '')
+                        break
+            except Exception as e:
+                logger.warning(f"Error checking job status: {e}")
+
+            # Evaluate the status outside the try so a job failure is not swallowed.
+            if status is not None:
+                status_str = str(status)
+                if 'RUNNING' in status_str:
+                    logger.info(f"Job {job_id} is now RUNNING (setup complete)")
+                    return
+                if 'FAILED' in status_str or 'CANCELLED' in status_str:
+                    raise RuntimeError(f"Job {job_id} failed with status: {status}. Check logs with: sky logs {cluster_name}")
+                logger.info(f"Job {job_id} status: {status} (waited {int(time.time() - start_time)}s)")
+            time.sleep(poll_interval)
+
+        raise RuntimeError(f"Timeout waiting for job {job_id} to reach RUNNING status")
 
     def _build_worker_command(self) -> str:
         """Build the bash command to start Monarch workers on each node."""
@@ -233,13 +267,13 @@ def _build_worker_command(self) -> str:
 '''
         # Escape single quotes in the Python code for bash
         escaped_code = python_code.replace("'", "'\"'\"'")
-        # Set timeout env vars - setup takes time so we need longer than default 30s
+        # Set timeout env vars - setup takes time (building from source) so we need longer timeouts
         env_vars = " ".join([
-            "export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=5m",
-            "export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT=5m",
-            "export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE=5m",
+            "export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=15m",
+            "export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT=15m",
+            "export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE=15m",
         ])
-        return f"{env_vars} && python -c '{escaped_code}'"
+        return f"{env_vars} && {self._python_exe} -c '{escaped_code}'"
 
     def _get_node_ips(self) -> List[str]:
         """Get the IP addresses of all nodes in the cluster."""
diff --git a/test_worker_setup.yaml b/test_worker_setup.yaml
new file mode 100644
index 000000000..8649bbc9c
--- /dev/null
+++ b/test_worker_setup.yaml
@@ -0,0 +1,78 @@
+# Minimal SkyPilot YAML to test Monarch build on remote workers
+# Usage: sky launch test_worker_setup.yaml -c monarch-test
+#        sky down monarch-test
+#
+# NOTE: Currently builds WITHOUT tensor engine due to old rdma-core on Ubuntu 20.04.
+# For tensor engine support, need a newer base image with rdma-core >= 32.
+
+name: monarch-worker-test
+
+resources:
+  cloud: kubernetes
+  accelerators: H200:1
+  cpus: 4+
+  memory: 16+
+
+num_nodes: 1
+
+# Sync the local monarch repo to the worker
+workdir: /home/sky/dev/monarch
+
+setup: |
+  set -ex
+  
+  echo "=== System info ==="
+  uname -a
+  cat /etc/os-release | head -3
+  
+  echo "=== Adding PPA for newer toolchains ==="
+  sudo apt-get update
+  sudo apt-get install -y software-properties-common
+  sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+  sudo apt-get update
+  
+  echo "=== Installing system dependencies ==="
+  sudo apt-get install -y \
+    build-essential \
+    ninja-build \
+    g++-11 \
+    rdma-core \
+    libibverbs1 \
+    libmlx5-1 \
+    libibverbs-dev \
+    curl \
+    pkg-config \
+    libssl-dev
+  
+  echo "=== Installing CUDA toolkit ==="
+  wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
+  sudo dpkg -i cuda-keyring_1.1-1_all.deb
+  sudo apt-get update
+  sudo apt-get install -y cuda-toolkit-12-1
+  sudo apt-get install -y --allow-change-held-packages libnccl2=2.28.9-1+cuda12.9 libnccl-dev=2.28.9-1+cuda12.9
+  
+  echo "=== Installing Rust ==="
+  curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+  source $HOME/.cargo/env
+  rustup default nightly
+  
+  echo "=== Installing Python dependencies ==="
+  cd ~/sky_workdir
+  pip install setuptools-rust maturin
+  pip install -r torch-requirements.txt -r build-requirements.txt
+  
+  echo "=== Building Monarch (without tensor engine due to old rdma-core) ==="
+  cd ~/sky_workdir
+  CC=gcc-11 CXX=g++-11 USE_TENSOR_ENGINE=0 pip install --no-build-isolation .
+  
+  echo "=== Verifying installation ==="
+  pip list | grep monarch
+  python -c "import monarch; print('Monarch imported successfully')"
+  python -c "import monarch._rust_bindings; print('Rust bindings loaded successfully')"
+  
+  echo "=== SETUP COMPLETE ==="
+
+run: |
+  echo "Worker setup test completed successfully!"
+  python -c "import monarch; print('Monarch ready')"
+  echo "Ready for Monarch worker operations"

From efa313ff490a0d301e82f5c1793d7849fc55f943 Mon Sep 17 00:00:00 2001
From: Romil <romil.bhardwaj@gmail.com>
Date: Mon, 8 Dec 2025 03:48:53 +0000
Subject: [PATCH 06/29] fix

---
 python/examples/skypilot_getting_started.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/examples/skypilot_getting_started.py b/python/examples/skypilot_getting_started.py
index b1841dcde..f75ae0822 100644
--- a/python/examples/skypilot_getting_started.py
+++ b/python/examples/skypilot_getting_started.py
@@ -40,7 +40,7 @@
 try:
     import sky
 except ImportError:
-    print("ERROR: SkyPilot is not installed. Run: pip install skypilot")
+    print("ERROR: SkyPilot is not installed. Run: pip install skypilot[kubernetes]")
     sys.exit(1)
 
 try:

From 35b6e0eec913097a1aab1bf0ccd881287ab5f484 Mon Sep 17 00:00:00 2001
From: Romil <romil.bhardwaj@gmail.com>
Date: Mon, 8 Dec 2025 04:47:06 +0000
Subject: [PATCH 07/29] updates

---
 .../skypilot_getting_started.py               | 70 +++-----------
 examples/skypilot_run_example.yaml            | 96 +++++++++++++++++++
 python/monarch/_src/job/skypilot.py           | 65 ++++++++++++-
 test_worker_setup.yaml                        | 78 ---------------
 4 files changed, 171 insertions(+), 138 deletions(-)
 rename {python/examples => examples}/skypilot_getting_started.py (80%)
 create mode 100644 examples/skypilot_run_example.yaml
 delete mode 100644 test_worker_setup.yaml

diff --git a/python/examples/skypilot_getting_started.py b/examples/skypilot_getting_started.py
similarity index 80%
rename from python/examples/skypilot_getting_started.py
rename to examples/skypilot_getting_started.py
index f75ae0822..3ccc1d10a 100644
--- a/python/examples/skypilot_getting_started.py
+++ b/examples/skypilot_getting_started.py
@@ -16,13 +16,13 @@
 Prerequisites:
 - Monarch installed with its Rust bindings (build with `pip install -e .` in monarch/)
 - SkyPilot installed and configured (run `sky check`)
-- torchmonarch available on PyPI (requires CUDA on remote nodes)
 
 Usage:
-    python skypilot_getting_started.py
+    # Run from inside a Kubernetes pod (client runs locally):
+    python examples/skypilot_getting_started.py --cloud kubernetes --num-hosts 2
 
-    # With explicit options:
-    python skypilot_getting_started.py --cloud kubernetes --num-hosts 2
+    # Run from outside the cluster using the SkyPilot YAML:
+    sky launch examples/skypilot_run_example.yaml
 
 See SKY_README.md for full documentation.
 """
@@ -159,63 +159,18 @@ def main():
     # This will launch cloud instances and start Monarch workers on them
     print("\n[1] Creating SkyPilot job...")
 
-    # Setup commands to install Monarch on the remote nodes
-    # Build from source to ensure client/worker version compatibility
-    # NOTE: Currently builds WITHOUT tensor engine due to old rdma-core on Ubuntu 20.04
-    setup_commands = """
-set -ex
-
-# Add PPA for newer toolchains
-sudo apt-get update
-sudo apt-get install -y software-properties-common
-sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
-sudo apt-get update
-
-# Install system dependencies
-sudo apt-get install -y \
-  build-essential \
-  ninja-build \
-  g++-11 \
-  rdma-core \
-  libibverbs1 \
-  libmlx5-1 \
-  libibverbs-dev \
-  curl \
-  pkg-config \
-  libssl-dev
-
-# Install CUDA toolkit and NCCL
-wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
-sudo dpkg -i cuda-keyring_1.1-1_all.deb
-sudo apt-get update
-sudo apt-get install -y cuda-toolkit-12-1
-sudo apt-get install -y --allow-change-held-packages libnccl2=2.28.9-1+cuda12.9 libnccl-dev=2.28.9-1+cuda12.9
-
-# Install Rust
-curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-source $HOME/.cargo/env
-rustup default nightly
-
-# Install Python dependencies
-cd ~/sky_workdir
-pip install setuptools-rust maturin
-pip install -r torch-requirements.txt -r build-requirements.txt
-
-# Build Monarch (without tensor engine due to old rdma-core)
-CC=gcc-11 CXX=g++-11 USE_TENSOR_ENGINE=0 pip install --no-build-isolation .
-
-echo "DONE INSTALLING MONARCH"
-"""
-
     # Build resources specification
     resources_kwargs = {
         "cloud": get_cloud(args.cloud),
-        "cpus": "2+",
         "accelerators": args.accelerator,  # GPU required - torchmonarch needs CUDA
     }
     if args.region:
         resources_kwargs["region"] = args.region
 
+    # Find Monarch repo root (this script is in examples/)
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    monarch_root = os.path.dirname(script_dir)  # Go up from examples/
+    
     job = SkyPilotJob(
         # Define the mesh of hosts we need
         meshes={"trainers": args.num_hosts},
@@ -225,12 +180,9 @@ def main():
         # Auto-cleanup after 10 minutes of idle time
         idle_minutes_to_autostop=10,
         down_on_autostop=True,
-        # Setup commands to install dependencies
-        setup_commands=setup_commands,
-        # Sync Monarch source to workers for building
-        workdir="/home/sky/dev/monarch",
-        # Use default python (same as used by pip in setup)
-        python_exe="python",
+        # Sync Monarch source to workers for building from source
+        # (SkyPilotJob uses default setup commands when workdir is provided)
+        workdir=monarch_root,
     )
 
     try:
diff --git a/examples/skypilot_run_example.yaml b/examples/skypilot_run_example.yaml
new file mode 100644
index 000000000..a2638a5ec
--- /dev/null
+++ b/examples/skypilot_run_example.yaml
@@ -0,0 +1,96 @@
+# SkyPilot YAML for running Monarch SkyPilot example from outside the cluster
+#
+# This YAML launches a "client" pod that runs the skypilot_getting_started.py
+# script. The script then uses SkyPilotJob to launch additional "worker" pods.
+#
+# Usage:
+#   sky launch examples/skypilot_run_example.yaml
+#
+# Requirements:
+#   - SkyPilot configured with Kubernetes access (sky check)
+#   - Kubernetes cluster with GPU nodes available
+#
+# Note: Cold start is slow (~7-10 minutes) because both the client and workers
+# need to build Monarch from source to ensure version compatibility.
+
+name: monarch-skypilot-example
+
+resources:
+  cloud: kubernetes
+  # Client pod needs minimal resources - workers do the heavy lifting
+  cpus: 4+
+  memory: 16+
+  # Request a GPU for the client too (needed to build Monarch with CUDA support)
+  accelerators: H200:1
+
+# Sync the Monarch repository to the client pod
+workdir: .
+
+setup: |
+  set -ex
+  
+  echo "=== Setting up Monarch client pod ==="
+  
+  # Add PPA for newer toolchains
+  sudo apt-get update
+  sudo apt-get install -y software-properties-common
+  sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+  sudo apt-get update
+  
+  # Install system dependencies
+  sudo apt-get install -y \
+    build-essential \
+    ninja-build \
+    g++-11 \
+    rdma-core \
+    libibverbs1 \
+    libmlx5-1 \
+    libibverbs-dev \
+    curl \
+    pkg-config \
+    libssl-dev
+  
+  # Install CUDA toolkit and NCCL
+  wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
+  sudo dpkg -i cuda-keyring_1.1-1_all.deb
+  sudo apt-get update
+  sudo apt-get install -y cuda-toolkit-12-1
+  sudo apt-get install -y --allow-change-held-packages libnccl2=2.28.9-1+cuda12.9 libnccl-dev=2.28.9-1+cuda12.9
+  
+  # Install Rust
+  curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+  source $HOME/.cargo/env
+  rustup default nightly
+  
+  # Install SkyPilot with Kubernetes support
+  pip install "skypilot[kubernetes]"
+  
+  # Install Python dependencies and build Monarch from source
+  cd ~/sky_workdir
+  pip install setuptools-rust maturin
+  pip install -r torch-requirements.txt -r build-requirements.txt
+  CC=gcc-11 CXX=g++-11 USE_TENSOR_ENGINE=0 pip install --no-build-isolation -e .
+  
+  echo "=== Client setup complete ==="
+
+run: |
+  set -ex
+  source $HOME/.cargo/env
+  cd ~/sky_workdir
+  
+  # Set timeouts for worker communication
+  export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=15m
+  export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT=15m
+  export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE=15m
+  
+  # Run the example
+  # Using --num-hosts 1 and --gpus-per-host 1 for a minimal test
+  # Adjust these values based on available cluster resources
+  python examples/skypilot_getting_started.py \
+    --cloud kubernetes \
+    --num-hosts 1 \
+    --gpus-per-host 1 \
+    --accelerator "H200:1" \
+    --cluster-name monarch-workers
+
+
diff --git a/python/monarch/_src/job/skypilot.py b/python/monarch/_src/job/skypilot.py
index 1d8c40427..6eade6eae 100644
--- a/python/monarch/_src/job/skypilot.py
+++ b/python/monarch/_src/job/skypilot.py
@@ -38,6 +38,58 @@
 # Default port for Monarch TCP communication
 DEFAULT_MONARCH_PORT = 22222
 
+# Default setup commands to build Monarch from source on remote workers.
+# NOTE: Cold start is slow (~7-10 minutes) because we need to compile Monarch
+# on each worker. This is necessary to ensure client/worker version compatibility
+# when using a development branch. For production use, consider
+# using pre-built wheels from PyPI (pip install torchmonarch).
+#
+# For faster cold starts (<30s), use a custom Docker image with all dependencies
+# pre-installed by setting image_id in sky.Resources:
+#   resources = sky.Resources(image_id="docker:your-registry/monarch-image:tag", ...)
+DEFAULT_SETUP_COMMANDS = """
+set -ex
+
+# Add PPA for newer toolchains
+sudo apt-get update
+sudo apt-get install -y software-properties-common
+sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+sudo apt-get update
+
+# Install system dependencies
+sudo apt-get install -y \
+  build-essential \
+  ninja-build \
+  g++-11 \
+  rdma-core \
+  libibverbs1 \
+  libmlx5-1 \
+  libibverbs-dev \
+  curl \
+  pkg-config \
+  libssl-dev
+
+# Install CUDA toolkit and NCCL
+wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt-get update
+sudo apt-get install -y cuda-toolkit-12-1
+sudo apt-get install -y --allow-change-held-packages libnccl2=2.28.9-1+cuda12.9 libnccl-dev=2.28.9-1+cuda12.9
+
+# Install Rust
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+source $HOME/.cargo/env
+rustup default nightly
+
+# Install Python dependencies and build Monarch from source
+cd ~/sky_workdir
+pip install setuptools-rust maturin
+pip install -r torch-requirements.txt -r build-requirements.txt
+CC=gcc-11 CXX=g++-11 USE_TENSOR_ENGINE=0 pip install --no-build-isolation .
+
+echo "Done installing Monarch"
+"""
+
 
 def _configure_transport() -> None:
     """Configure the Monarch transport. Deferred import to avoid import errors."""
@@ -55,6 +107,7 @@ def _attach_to_workers_wrapper(name: str, ca: str, workers: List[str]):
 
 
 class SkyPilotJob(JobTrait):
+    
     """
     A job scheduler that uses SkyPilot to provision cloud instances.
 
@@ -107,8 +160,12 @@ def __init__(
             python_exe: Python executable to use for worker processes.
             setup_commands: Optional setup commands to run before starting workers.
                            Use this to install dependencies including Monarch.
+                           If None and workdir is provided, uses DEFAULT_SETUP_COMMANDS
+                           which builds Monarch from source.
             workdir: Local directory to sync to the cluster. If provided, this
                     directory will be uploaded to ~/sky_workdir on each node.
+                    When using workdir with the Monarch repo, DEFAULT_SETUP_COMMANDS
+                    will build Monarch from source on each worker.
             file_mounts: Dictionary mapping remote paths to local paths for
                         additional file mounts.
         """
@@ -152,7 +209,13 @@ def _create(self, client_script: Optional[str]) -> None:
         worker_command = self._build_worker_command()
 
         # Create setup commands
-        setup = self._setup_commands or ""
+        # If workdir is provided but no setup_commands, use defaults to build Monarch
+        if self._setup_commands is not None:
+            setup = self._setup_commands
+        elif self._workdir is not None:
+            setup = DEFAULT_SETUP_COMMANDS
+        else:
+            setup = ""
         if setup and not setup.endswith("\n"):
             setup += "\n"
 
diff --git a/test_worker_setup.yaml b/test_worker_setup.yaml
deleted file mode 100644
index 8649bbc9c..000000000
--- a/test_worker_setup.yaml
+++ /dev/null
@@ -1,78 +0,0 @@
-# Minimal SkyPilot YAML to test Monarch build on remote workers
-# Usage: sky launch test_worker_setup.yaml -c monarch-test
-#        sky down monarch-test
-#
-# NOTE: Currently builds WITHOUT tensor engine due to old rdma-core on Ubuntu 20.04.
-# For tensor engine support, need a newer base image with rdma-core >= 32.
-
-name: monarch-worker-test
-
-resources:
-  cloud: kubernetes
-  accelerators: H200:1
-  cpus: 4+
-  memory: 16+
-
-num_nodes: 1
-
-# Sync the local monarch repo to the worker
-workdir: /home/sky/dev/monarch
-
-setup: |
-  set -ex
-  
-  echo "=== System info ==="
-  uname -a
-  cat /etc/os-release | head -3
-  
-  echo "=== Adding PPA for newer toolchains ==="
-  sudo apt-get update
-  sudo apt-get install -y software-properties-common
-  sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
-  sudo apt-get update
-  
-  echo "=== Installing system dependencies ==="
-  sudo apt-get install -y \
-    build-essential \
-    ninja-build \
-    g++-11 \
-    rdma-core \
-    libibverbs1 \
-    libmlx5-1 \
-    libibverbs-dev \
-    curl \
-    pkg-config \
-    libssl-dev
-  
-  echo "=== Installing CUDA toolkit ==="
-  wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
-  sudo dpkg -i cuda-keyring_1.1-1_all.deb
-  sudo apt-get update
-  sudo apt-get install -y cuda-toolkit-12-1
-  sudo apt-get install -y --allow-change-held-packages libnccl2=2.28.9-1+cuda12.9 libnccl-dev=2.28.9-1+cuda12.9
-  
-  echo "=== Installing Rust ==="
-  curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-  source $HOME/.cargo/env
-  rustup default nightly
-  
-  echo "=== Installing Python dependencies ==="
-  cd ~/sky_workdir
-  pip install setuptools-rust maturin
-  pip install -r torch-requirements.txt -r build-requirements.txt
-  
-  echo "=== Building Monarch (without tensor engine due to old rdma-core) ==="
-  cd ~/sky_workdir
-  CC=gcc-11 CXX=g++-11 USE_TENSOR_ENGINE=0 pip install --no-build-isolation .
-  
-  echo "=== Verifying installation ==="
-  pip list | grep monarch
-  python -c "import monarch; print('Monarch imported successfully')"
-  python -c "import monarch._rust_bindings; print('Rust bindings loaded successfully')"
-  
-  echo "=== SETUP COMPLETE ==="
-
-run: |
-  echo "Worker setup test completed successfully!"
-  python -c "import monarch; print('Monarch ready')"
-  echo "Ready for Monarch worker operations"

From 6a75678ae1f9046900c52f35589b957e1a8d5e48 Mon Sep 17 00:00:00 2001
From: Romil <romil.bhardwaj@gmail.com>
Date: Mon, 8 Dec 2025 05:24:05 +0000
Subject: [PATCH 08/29] cleanup

---
 SKY_README.md                             | 293 -------------
 examples/skypilot_run_example.yaml        |  96 -----
 python/tests/test_skypilot_integration.py | 213 ----------
 python/tests/test_skypilot_job.py         | 493 ----------------------
 4 files changed, 1095 deletions(-)
 delete mode 100644 SKY_README.md
 delete mode 100644 examples/skypilot_run_example.yaml
 delete mode 100644 python/tests/test_skypilot_integration.py
 delete mode 100644 python/tests/test_skypilot_job.py

diff --git a/SKY_README.md b/SKY_README.md
deleted file mode 100644
index 1558c62da..000000000
--- a/SKY_README.md
+++ /dev/null
@@ -1,293 +0,0 @@
-# Monarch + SkyPilot Integration
-
-This document describes the SkyPilot integration for Monarch, which enables running Monarch actors on cloud infrastructure provisioned by SkyPilot.
-
-## Overview
-
-SkyPilot is a framework for running ML workloads on any cloud (AWS, GCP, Azure, Lambda, Kubernetes, etc.). The `SkyPilotJob` class in Monarch provides a seamless integration that:
-
-1. **Provisions cloud instances** using SkyPilot's unified API
-2. **Installs Monarch** (`torchmonarch` from PyPI) on remote nodes
-3. **Starts Monarch workers** on each node listening for connections
-4. **Connects clients** to workers using TCP for distributed actor communication
-
-## Architecture
-
-```
-┌─────────────────────────────────────────────────────────────────┐
-│                         Client Machine                          │
-│  ┌─────────────────────────────────────────────────────────┐   │
-│  │                    SkyPilotJob                           │   │
-│  │  - Calls sky.launch() to provision cloud instances       │   │
-│  │  - Configures setup commands to install torchmonarch     │   │
-│  │  - Builds worker command with run_worker_loop_forever()  │   │
-│  │  - Calls attach_to_workers() to create HostMesh          │   │
-│  └─────────────────────────────────────────────────────────┘   │
-└───────────────────────────────┬─────────────────────────────────┘
-                                │ TCP connections (port 22222)
-        ┌───────────────────────┼───────────────────────┐
-        │                       │                       │
-        ▼                       ▼                       ▼
-┌───────────────┐       ┌───────────────┐       ┌───────────────┐
-│   Worker 1    │       │   Worker 2    │       │   Worker N    │
-│ (Cloud Node)  │       │ (Cloud Node)  │       │ (Cloud Node)  │
-│               │       │               │       │               │
-│ run_worker_   │       │ run_worker_   │       │ run_worker_   │
-│ loop_forever()│       │ loop_forever()│       │ loop_forever()│
-│               │       │               │       │               │
-│ tcp://<ip>:   │       │ tcp://<ip>:   │       │ tcp://<ip>:   │
-│   22222       │       │   22222       │       │   22222       │
-└───────────────┘       └───────────────┘       └───────────────┘
-```
-
-## Implementation Details
-
-### Files
-
-- **`python/monarch/_src/job/skypilot.py`**: Core `SkyPilotJob` implementation
-- **`python/monarch/job/__init__.py`**: Exports `SkyPilotJob` (with graceful ImportError handling)
-- **`python/tests/test_skypilot_job.py`**: Unit tests with mocked SkyPilot
-- **`python/tests/test_skypilot_integration.py`**: Integration test scaffolding
-- **`python/examples/skypilot_getting_started.py`**: Example demonstrating usage
-
-### Key Classes and Functions
-
-#### `SkyPilotJob(JobTrait)`
-
-Main job class that implements the Monarch `JobTrait` interface.
-
-```python
-from monarch.job import SkyPilotJob
-import sky
-
-job = SkyPilotJob(
-    meshes={"trainers": 2},           # 2 nodes for "trainers" mesh
-    resources=sky.Resources(
-        cloud=sky.Kubernetes(),
-        accelerators="H100:1",
-    ),
-    cluster_name="my-cluster",
-    idle_minutes_to_autostop=10,
-    down_on_autostop=True,
-    setup_commands="pip install torchmonarch",
-)
-
-state = job.state()  # Launches cluster and returns JobState
-hosts = state.trainers  # HostMesh with 2 nodes
-```
-
-#### Parameters
-
-| Parameter | Type | Description |
-|-----------|------|-------------|
-| `meshes` | `Dict[str, int]` | Mesh names to node counts |
-| `resources` | `sky.Resources` | SkyPilot resource specification |
-| `cluster_name` | `str` | Name for the cluster |
-| `monarch_port` | `int` | TCP port for workers (default: 22222) |
-| `idle_minutes_to_autostop` | `int` | Auto-stop after idle minutes |
-| `down_on_autostop` | `bool` | Terminate (not just stop) on autostop |
-| `setup_commands` | `str` | Shell commands to run before workers start |
-| `workdir` | `str` | Local directory to sync to cluster |
-| `file_mounts` | `Dict[str, str]` | Additional file mounts |
-
-### Worker Lifecycle
-
-1. **Launch**: `sky.launch()` creates the cluster with specified resources
-2. **Setup**: `setup_commands` run to install `torchmonarch`
-3. **Run**: Worker command executes `run_worker_loop_forever(address, ca)`
-4. **Connect**: Client calls `attach_to_workers()` to create `HostMesh`
-5. **Teardown**: `sky.down()` terminates the cluster
-
-### Environment Variables
-
-The following environment variables control timeouts:
-
-```python
-os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s"  # Worker spawn timeout
-os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s"  # Message delivery timeout
-os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s"  # Proc mesh spawn timeout
-```
-
-## Requirements
-
-### Client Side
-- Monarch with Rust bindings (`pip install -e .` from source)
-- SkyPilot (`pip install skypilot`)
-- Configured cloud credentials (`sky check`)
-
-### Worker Side (installed via setup_commands)
-- `torchmonarch` from PyPI
-- **CUDA libraries** - torchmonarch requires `libcuda.so.1`
-- This means workers **must run on GPU nodes**
-
-## Usage
-
-### Basic Example
-
-```python
-import sky
-from monarch.job import SkyPilotJob
-from monarch.actor import Actor, endpoint
-
-class MyActor(Actor):
-    @endpoint
-    def hello(self) -> str:
-        return "Hello from cloud!"
-
-# Create job
-job = SkyPilotJob(
-    meshes={"workers": 2},
-    resources=sky.Resources(
-        cloud=sky.AWS(),
-        accelerators="A100:1",
-    ),
-    setup_commands="pip install torchmonarch",
-)
-
-# Launch and get state
-state = job.state()
-hosts = state.workers
-
-# Spawn processes and actors
-procs = hosts.spawn_procs(per_host={"gpus": 1})
-actors = procs.spawn("my_actors", MyActor)
-
-# Interact with actors
-results = actors.hello.call().get()
-print(results)  # ["Hello from cloud!", "Hello from cloud!"]
-
-# Cleanup
-job.kill()
-```
-
-### Running the Example
-
-```bash
-# Install dependencies
-pip install skypilot
-pip install -e .  # Build Monarch from source
-
-# Configure cloud credentials
-sky check
-
-# Run example
-cd python/examples
-python skypilot_getting_started.py \
-    --cloud kubernetes \
-    --num-hosts 2 \
-    --accelerator "H100:1" \
-    --cluster-name my-monarch-cluster
-```
-
-### Supported Clouds
-
-- **Kubernetes**: Use `sky.Kubernetes()` with `--region` for context
-- **AWS**: Use `sky.AWS()` 
-- **GCP**: Use `sky.GCP()`
-- **Azure**: Use `sky.Azure()`
-- **Lambda Labs**: Use `sky.Lambda()`
-- And others supported by SkyPilot
-
-## Networking Considerations
-
-### Kubernetes
-
-When using Kubernetes, the client and workers must be in the **same Kubernetes cluster** for pod-to-pod communication. Use the `region` parameter to specify the Kubernetes context:
-
-```python
-resources=sky.Resources(
-    cloud=sky.Kubernetes(),
-    region="my-k8s-context",  # Must match client's cluster
-)
-```
-
-### Public Clouds (AWS, GCP, Azure)
-
-SkyPilot handles networking automatically. Workers get public IPs that clients can connect to.
-
-### Firewall
-
-Ensure port 22222 (or your custom `monarch_port`) is accessible:
-- Kubernetes: Pod networking should handle this
-- AWS: Security groups
-- GCP: Firewall rules
-- Azure: Network security groups
-
-## Troubleshooting
-
-### "libcuda.so.1: cannot open shared object file"
-
-**Cause**: Workers are running on CPU-only nodes, but `torchmonarch` requires CUDA.
-
-**Solution**: Request GPU nodes:
-```python
-resources=sky.Resources(accelerators="H100:1")
-```
-
-### "No route to host" or connection timeouts
-
-**Cause**: Client and workers are in different networks (e.g., different Kubernetes clusters).
-
-**Solution**: Ensure client and workers are in the same network:
-- For Kubernetes: Use `region` parameter to specify the correct context
-- For public clouds: Check security group / firewall rules
-
-### "error spawning proc mesh: statuses: Timeout"
-
-**Causes**:
-1. Workers aren't listening on the expected port
-2. Network connectivity issues
-3. Workers crashed during startup
-
-**Debug steps**:
-1. Check SkyPilot logs: `sky logs <cluster-name>`
-2. SSH into cluster: `sky ssh <cluster-name>`
-3. Check if port is listening: `ss -tlnp | grep 22222`
-4. Check Monarch logs: `/tmp/sky/monarch_log.log`
-
-### Workers crash immediately
-
-Check SkyPilot logs for the error:
-```bash
-sky logs <cluster-name>
-```
-
-Common issues:
-- Missing CUDA libraries → use GPU nodes
-- torchmonarch installation failed → check setup_commands
-- Python version mismatch → ensure compatible Python version
-
-## Testing
-
-### Unit Tests (with mocked SkyPilot)
-
-```bash
-cd python
-pytest tests/test_skypilot_job.py -v
-```
-
-### Integration Tests (requires real cloud)
-
-```bash
-cd python
-pytest tests/test_skypilot_integration.py -v --cloud kubernetes
-```
-
-## Comparison with SlurmJob
-
-| Feature | SkyPilotJob | SlurmJob |
-|---------|-------------|----------|
-| Cloud Support | Multi-cloud (AWS, GCP, Azure, K8s, etc.) | HPC clusters only |
-| Setup | Automatic via SkyPilot | Requires Slurm installation |
-| Autoscaling | Supported | Depends on cluster |
-| Cost Optimization | Automatic (cheapest region) | N/A |
-| Worker Discovery | Via cluster handle IPs | Via squeue hostnames |
-
-## Future Work
-
-- [ ] Support for spot/preemptible instances
-- [ ] Multi-region deployments  
-- [ ] Automatic failover on spot termination
-- [ ] Integration with SkyPilot managed jobs
-- [ ] Support for batch mode (client script on cluster)
-
diff --git a/examples/skypilot_run_example.yaml b/examples/skypilot_run_example.yaml
deleted file mode 100644
index a2638a5ec..000000000
--- a/examples/skypilot_run_example.yaml
+++ /dev/null
@@ -1,96 +0,0 @@
-# SkyPilot YAML for running Monarch SkyPilot example from outside the cluster
-#
-# This YAML launches a "client" pod that runs the skypilot_getting_started.py
-# script. The script then uses SkyPilotJob to launch additional "worker" pods.
-#
-# Usage:
-#   sky launch examples/skypilot_run_example.yaml
-#
-# Requirements:
-#   - SkyPilot configured with Kubernetes access (sky check)
-#   - Kubernetes cluster with GPU nodes available
-#
-# Note: Cold start is slow (~7-10 minutes) because both the client and workers
-# need to build Monarch from source to ensure version compatibility.
-
-name: monarch-skypilot-example
-
-resources:
-  cloud: kubernetes
-  # Client pod needs minimal resources - workers do the heavy lifting
-  cpus: 4+
-  memory: 16+
-  # Request a GPU for the client too (needed to build Monarch with CUDA support)
-  accelerators: H200:1
-
-# Sync the Monarch repository to the client pod
-workdir: .
-
-setup: |
-  set -ex
-  
-  echo "=== Setting up Monarch client pod ==="
-  
-  # Add PPA for newer toolchains
-  sudo apt-get update
-  sudo apt-get install -y software-properties-common
-  sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
-  sudo apt-get update
-  
-  # Install system dependencies
-  sudo apt-get install -y \
-    build-essential \
-    ninja-build \
-    g++-11 \
-    rdma-core \
-    libibverbs1 \
-    libmlx5-1 \
-    libibverbs-dev \
-    curl \
-    pkg-config \
-    libssl-dev
-  
-  # Install CUDA toolkit and NCCL
-  wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
-  sudo dpkg -i cuda-keyring_1.1-1_all.deb
-  sudo apt-get update
-  sudo apt-get install -y cuda-toolkit-12-1
-  sudo apt-get install -y --allow-change-held-packages libnccl2=2.28.9-1+cuda12.9 libnccl-dev=2.28.9-1+cuda12.9
-  
-  # Install Rust
-  curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-  source $HOME/.cargo/env
-  rustup default nightly
-  
-  # Install SkyPilot with Kubernetes support
-  pip install "skypilot[kubernetes]"
-  
-  # Install Python dependencies and build Monarch from source
-  cd ~/sky_workdir
-  pip install setuptools-rust maturin
-  pip install -r torch-requirements.txt -r build-requirements.txt
-  CC=gcc-11 CXX=g++-11 USE_TENSOR_ENGINE=0 pip install --no-build-isolation -e .
-  
-  echo "=== Client setup complete ==="
-
-run: |
-  set -ex
-  source $HOME/.cargo/env
-  cd ~/sky_workdir
-  
-  # Set timeouts for worker communication
-  export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=15m
-  export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT=15m
-  export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE=15m
-  
-  # Run the example
-  # Using --num-hosts 1 and --gpus-per-host 1 for a minimal test
-  # Adjust these values based on available cluster resources
-  python examples/skypilot_getting_started.py \
-    --cloud kubernetes \
-    --num-hosts 1 \
-    --gpus-per-host 1 \
-    --accelerator "H200:1" \
-    --cluster-name monarch-workers
-
-
diff --git a/python/tests/test_skypilot_integration.py b/python/tests/test_skypilot_integration.py
deleted file mode 100644
index 5469f4717..000000000
--- a/python/tests/test_skypilot_integration.py
+++ /dev/null
@@ -1,213 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-Integration test script for SkyPilot job.
-
-This script tests the basic SkyPilot integration without requiring Monarch
-runtime. It validates that SkyPilot cluster launching and node IP retrieval works.
-
-Run this script with:
-    python tests/test_skypilot_integration.py
-
-Prerequisites:
-- SkyPilot installed and configured with cloud credentials
-- Run `sky check` to verify cloud access
-"""
-
-import argparse
-import sys
-import time
-
-try:
-    import sky
-    from sky.backends.cloud_vm_ray_backend import CloudVmRayResourceHandle
-except ImportError:
-    print("Error: SkyPilot is not installed. Install with: pip install skypilot")
-    sys.exit(1)
-
-
-def test_skypilot_cluster_launch(
-    cluster_name: str = "monarch-integration-test",
-    cloud: str = "aws",
-    cpus: str = "2+",
-    timeout_minutes: int = 10,
-) -> bool:
-    """
-    Test launching a SkyPilot cluster and retrieving node IPs.
-
-    Args:
-        cluster_name: Name for the test cluster
-        cloud: Cloud provider to use
-        cpus: CPU specification
-        timeout_minutes: Timeout for cluster launch
-
-    Returns:
-        True if test passed, False otherwise
-    """
-    print(f"\n{'='*60}")
-    print("SkyPilot Integration Test")
-    print(f"{'='*60}\n")
-
-    # Create a simple task
-    task = sky.Task(
-        name="monarch-test-task",
-        run="echo 'SkyPilot test successful' && hostname && sleep 30",
-    )
-
-    # Set resources based on cloud
-    cloud_obj = None
-    if cloud.lower() == "aws":
-        cloud_obj = sky.AWS()
-    elif cloud.lower() == "gcp":
-        cloud_obj = sky.GCP()
-    elif cloud.lower() == "azure":
-        cloud_obj = sky.Azure()
-    elif cloud.lower() == "kubernetes":
-        cloud_obj = sky.Kubernetes()
-
-    resources = sky.Resources(
-        cloud=cloud_obj,
-        cpus=cpus,
-    )
-    task.set_resources(resources)
-
-    print(f"Test configuration:")
-    print(f"  Cluster name: {cluster_name}")
-    print(f"  Cloud: {cloud}")
-    print(f"  CPUs: {cpus}")
-    print()
-
-    try:
-        # Launch the cluster
-        print("Step 1: Launching cluster...")
-        request_id = sky.launch(
-            task,
-            cluster_name=cluster_name,
-            idle_minutes_to_autostop=5,
-            down=True,  # Auto-teardown after idle
-        )
-
-        print(f"  Request ID: {request_id}")
-        job_id, handle = sky.get(request_id)
-        print(f"  Job ID: {job_id}")
-
-        if handle is None:
-            print("  ERROR: No handle returned from launch")
-            return False
-
-        print("  Cluster launched successfully!")
-
-        # Get cluster status and node IPs
-        print("\nStep 2: Getting cluster status and node IPs...")
-        request_id = sky.status(cluster_names=[cluster_name])
-        statuses = sky.get(request_id)
-
-        if not statuses:
-            print("  ERROR: No status returned")
-            return False
-
-        status = statuses[0]
-        print(f"  Cluster status: {status.status}")
-        print(f"  Cluster name: {status.name}")
-
-        handle = status.handle
-        if handle is None:
-            print("  ERROR: Status has no handle")
-            return False
-
-        if not isinstance(handle, CloudVmRayResourceHandle):
-            print(f"  ERROR: Unexpected handle type: {type(handle)}")
-            return False
-
-        # Get IPs
-        if handle.stable_internal_external_ips:
-            print(f"\n  Node IPs ({len(handle.stable_internal_external_ips)} nodes):")
-            for i, (internal_ip, external_ip) in enumerate(
-                handle.stable_internal_external_ips
-            ):
-                print(f"    Node {i}: internal={internal_ip}, external={external_ip}")
-        else:
-            print("  WARNING: No IP information available yet")
-
-        # Test passed!
-        print("\n" + "=" * 60)
-        print("TEST PASSED!")
-        print("=" * 60)
-        print(
-            "\nThe SkyPilot integration is working correctly."
-            "\nMonarch workers can be launched on these nodes."
-        )
-        return True
-
-    except Exception as e:
-        print(f"\nERROR: {e}")
-        import traceback
-
-        traceback.print_exc()
-        return False
-
-    finally:
-        # Cleanup
-        print("\nStep 3: Cleaning up cluster...")
-        try:
-            request_id = sky.down(cluster_name)
-            sky.get(request_id)
-            print("  Cluster terminated successfully")
-        except Exception as e:
-            print(f"  Warning: Failed to cleanup cluster: {e}")
-            print(f"  You may need to manually run: sky down {cluster_name}")
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Integration test for SkyPilot-Monarch integration"
-    )
-    parser.add_argument(
-        "--cluster-name",
-        default="monarch-integration-test",
-        help="Name for the test cluster",
-    )
-    parser.add_argument(
-        "--cloud",
-        default="aws",
-        choices=["aws", "gcp", "azure", "kubernetes"],
-        help="Cloud provider to use",
-    )
-    parser.add_argument(
-        "--cpus",
-        default="2+",
-        help="CPU specification",
-    )
-    parser.add_argument(
-        "--timeout",
-        type=int,
-        default=10,
-        help="Timeout in minutes for cluster launch",
-    )
-
-    args = parser.parse_args()
-
-    # Check SkyPilot is configured
-    print("Checking SkyPilot configuration...")
-    print(f"  Using cloud: {args.cloud}")
-    print("  (Run 'sky check' to verify cloud credentials)")
-
-    # Run the test
-    success = test_skypilot_cluster_launch(
-        cluster_name=args.cluster_name,
-        cloud=args.cloud,
-        cpus=args.cpus,
-        timeout_minutes=args.timeout,
-    )
-
-    sys.exit(0 if success else 1)
-
-
-if __name__ == "__main__":
-    main()
-
diff --git a/python/tests/test_skypilot_job.py b/python/tests/test_skypilot_job.py
deleted file mode 100644
index b6af37a22..000000000
--- a/python/tests/test_skypilot_job.py
+++ /dev/null
@@ -1,493 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# pyre-unsafe
-
-"""Tests for the SkyPilot job integration."""
-
-import os
-import sys
-import tempfile
-from typing import Any, Dict, List, Optional, Tuple
-from unittest import mock
-
-import pytest
-
-
-# Check if SkyPilot is available
-try:
-    import sky
-
-    HAS_SKYPILOT = True
-except ImportError:
-    HAS_SKYPILOT = False
-
-# Check if Monarch bindings are available
-try:
-    from monarch._rust_bindings.monarch_hyperactor.config import configure
-
-    HAS_MONARCH_BINDINGS = True
-except ImportError:
-    HAS_MONARCH_BINDINGS = False
-
-# Skip all tests in this module if SkyPilot or Monarch bindings are not installed
-pytestmark = [
-    pytest.mark.skipif(not HAS_SKYPILOT, reason="SkyPilot not installed"),
-    pytest.mark.skipif(not HAS_MONARCH_BINDINGS, reason="Monarch bindings not available"),
-]
-
-
-class MockClusterHandle:
-    """Mock CloudVmRayResourceHandle for testing."""
-
-    def __init__(
-        self,
-        cluster_name: str,
-        node_ips: List[Tuple[str, str]],
-    ):
-        self.cluster_name = cluster_name
-        self.cluster_name_on_cloud = cluster_name
-        self.stable_internal_external_ips = node_ips
-        self.launched_nodes = len(node_ips)
-
-
-class MockStatusResponse:
-    """Mock status response from sky.status()."""
-
-    def __init__(
-        self,
-        name: str,
-        status: "sky.ClusterStatus",
-        handle: Optional[MockClusterHandle] = None,
-    ):
-        self.name = name
-        self.status = status
-        self.handle = handle
-
-
-@pytest.fixture
-def mock_sky():
-    """Fixture to mock SkyPilot SDK functions."""
-    with mock.patch("monarch._src.job.skypilot.sky") as mock_sky_module:
-        # Mock ClusterStatus enum
-        mock_sky_module.ClusterStatus = sky.ClusterStatus
-
-        # Mock sky.launch to return a mock request_id
-        mock_sky_module.launch.return_value = "mock-request-id"
-
-        # Mock sky.get to return appropriate results
-        def mock_get(request_id):
-            if request_id == "mock-request-id":
-                # Return (job_id, handle) for launch
-                return (
-                    1,
-                    MockClusterHandle(
-                        "test-cluster",
-                        [("10.0.0.1", "1.2.3.4"), ("10.0.0.2", "1.2.3.5")],
-                    ),
-                )
-            elif request_id == "mock-status-request-id":
-                # Return list of status responses
-                return [
-                    MockStatusResponse(
-                        "test-cluster",
-                        sky.ClusterStatus.UP,
-                        MockClusterHandle(
-                            "test-cluster",
-                            [("10.0.0.1", "1.2.3.4"), ("10.0.0.2", "1.2.3.5")],
-                        ),
-                    )
-                ]
-            elif request_id == "mock-down-request-id":
-                return None
-            return None
-
-        mock_sky_module.get.side_effect = mock_get
-
-        # Mock sky.status
-        mock_sky_module.status.return_value = "mock-status-request-id"
-
-        # Mock sky.down
-        mock_sky_module.down.return_value = "mock-down-request-id"
-
-        # Mock sky.Task
-        mock_sky_module.Task = mock.MagicMock()
-
-        # Mock sky.Resources
-        mock_sky_module.Resources = sky.Resources
-
-        yield mock_sky_module
-
-
-@pytest.fixture
-def mock_attach_to_workers():
-    """Fixture to mock attach_to_workers wrapper."""
-    with mock.patch(
-        "monarch._src.job.skypilot._attach_to_workers_wrapper"
-    ) as mock_attach:
-        # Create a simple mock HostMesh
-        class MockHostMesh:
-            def __init__(self, name):
-                self.name = name
-
-        def create_mock_host_mesh(name, ca, workers):
-            return MockHostMesh(name)
-
-        mock_attach.side_effect = create_mock_host_mesh
-        yield mock_attach
-
-
-@pytest.fixture
-def mock_configure_transport():
-    """Fixture to mock _configure_transport."""
-    with mock.patch(
-        "monarch._src.job.skypilot._configure_transport"
-    ) as mock_config:
-        yield mock_config
-
-
-@pytest.mark.skipif(not HAS_SKYPILOT, reason="SkyPilot not installed")
-def test_skypilot_job_import():
-    """Test that SkyPilotJob can be imported from monarch.job."""
-    from monarch.job import SkyPilotJob
-
-    # SkyPilotJob should be available (or None if import failed)
-    # This test verifies the export is working
-    if HAS_MONARCH_BINDINGS:
-        assert SkyPilotJob is not None
-    # If bindings are not available, SkyPilotJob will be None (graceful degradation)
-
-
-def test_skypilot_job_init(mock_configure_transport):
-    """Test SkyPilotJob initialization."""
-    from monarch._src.job.skypilot import SkyPilotJob
-
-    job = SkyPilotJob(
-        meshes={"trainers": 2, "workers": 1},
-        cluster_name="test-cluster",
-        monarch_port=12345,
-    )
-
-    assert job._meshes == {"trainers": 2, "workers": 1}
-    assert job._cluster_name == "test-cluster"
-    assert job._port == 12345
-    assert job._launched_cluster_name is None
-    assert job._node_ips == []
-
-
-def test_skypilot_job_init_with_resources(mock_configure_transport):
-    """Test SkyPilotJob initialization with SkyPilot resources."""
-    from monarch._src.job.skypilot import SkyPilotJob
-
-    resources = sky.Resources(accelerators="A100:1")
-
-    job = SkyPilotJob(
-        meshes={"trainers": 4},
-        resources=resources,
-        cluster_name="gpu-cluster",
-    )
-
-    assert job._resources == resources
-    assert job._meshes == {"trainers": 4}
-
-
-def test_skypilot_job_build_worker_command(mock_configure_transport):
-    """Test the worker command generation."""
-    from monarch._src.job.skypilot import SkyPilotJob
-
-    job = SkyPilotJob(
-        meshes={"trainers": 1},
-        monarch_port=22222,
-    )
-
-    command = job._build_worker_command()
-
-    # Check that the command contains expected elements
-    assert "socket.gethostname()" in command
-    assert "tcp://" in command
-    assert "22222" in command
-    assert "run_worker_loop_forever" in command
-    assert 'ca="trust_all_connections"' in command
-
-
-def test_skypilot_job_create(mock_sky, mock_attach_to_workers, mock_configure_transport):
-    """Test the _create method."""
-    from monarch._src.job.skypilot import SkyPilotJob
-
-    job = SkyPilotJob(
-        meshes={"trainers": 2},
-        cluster_name="test-cluster",
-    )
-
-    # Call _create
-    job._create(None)
-
-    # Verify sky.launch was called
-    mock_sky.launch.assert_called_once()
-
-    # Check that cluster name was stored
-    assert job._launched_cluster_name == "test-cluster"
-
-
-def test_skypilot_job_create_batch_mode_raises(mock_sky, mock_configure_transport):
-    """Test that _create raises an error for batch mode."""
-    from monarch._src.job.skypilot import SkyPilotJob
-
-    job = SkyPilotJob(meshes={"trainers": 1})
-
-    with pytest.raises(RuntimeError, match="batch-mode scripts"):
-        job._create("some_script.py")
-
-
-def test_skypilot_job_state(mock_sky, mock_attach_to_workers, mock_configure_transport):
-    """Test the _state method."""
-    from monarch._src.job.skypilot import SkyPilotJob
-
-    job = SkyPilotJob(
-        meshes={"trainers": 2},
-        cluster_name="test-cluster",
-    )
-
-    # Apply the job first
-    job.apply()
-
-    # Now get state
-    state = job._state()
-
-    # Verify attach_to_workers was called with correct addresses
-    mock_attach_to_workers.assert_called()
-    call_args = mock_attach_to_workers.call_args
-
-    # Check the call arguments
-    assert call_args.kwargs["name"] == "trainers"
-    assert call_args.kwargs["ca"] == "trust_all_connections"
-    # Workers should use external IPs
-    workers = call_args.kwargs["workers"]
-    assert len(workers) == 2
-    assert all("tcp://" in w for w in workers)
-
-    # Check that state has the trainers mesh
-    assert hasattr(state, "trainers")
-
-
-def test_skypilot_job_state_multiple_meshes(mock_sky, mock_attach_to_workers, mock_configure_transport):
-    """Test _state with multiple meshes."""
-    from monarch._src.job.skypilot import SkyPilotJob
-
-    # Create mock status with 3 nodes
-    def mock_get_multi(request_id):
-        if request_id == "mock-request-id":
-            return (
-                1,
-                MockClusterHandle(
-                    "test-cluster",
-                    [
-                        ("10.0.0.1", "1.2.3.4"),
-                        ("10.0.0.2", "1.2.3.5"),
-                        ("10.0.0.3", "1.2.3.6"),
-                    ],
-                ),
-            )
-        elif request_id == "mock-status-request-id":
-            return [
-                MockStatusResponse(
-                    "test-cluster",
-                    sky.ClusterStatus.UP,
-                    MockClusterHandle(
-                        "test-cluster",
-                        [
-                            ("10.0.0.1", "1.2.3.4"),
-                            ("10.0.0.2", "1.2.3.5"),
-                            ("10.0.0.3", "1.2.3.6"),
-                        ],
-                    ),
-                )
-            ]
-        return None
-
-    mock_sky.get.side_effect = mock_get_multi
-
-    job = SkyPilotJob(
-        meshes={"trainers": 2, "evaluator": 1},
-        cluster_name="test-cluster",
-    )
-
-    job.apply()
-    state = job._state()
-
-    # Verify attach_to_workers was called twice (once for each mesh)
-    assert mock_attach_to_workers.call_count == 2
-
-    # Check that state has both meshes
-    assert hasattr(state, "trainers")
-    assert hasattr(state, "evaluator")
-
-
-def test_skypilot_job_kill(mock_sky, mock_attach_to_workers, mock_configure_transport):
-    """Test the _kill method."""
-    from monarch._src.job.skypilot import SkyPilotJob
-
-    job = SkyPilotJob(
-        meshes={"trainers": 1},
-        cluster_name="test-cluster",
-    )
-
-    # Apply the job first
-    job.apply()
-    assert job._launched_cluster_name == "test-cluster"
-
-    # Kill the job
-    job._kill()
-
-    # Verify sky.down was called
-    mock_sky.down.assert_called_once_with("test-cluster")
-
-    # Check that state was cleared
-    assert job._launched_cluster_name is None
-    assert job._node_ips == []
-
-
-def test_skypilot_job_can_run(mock_sky, mock_attach_to_workers, mock_configure_transport):
-    """Test the can_run method."""
-    from monarch._src.job.skypilot import SkyPilotJob
-
-    job1 = SkyPilotJob(
-        meshes={"trainers": 2},
-        cluster_name="test-cluster",
-        monarch_port=22222,
-    )
-
-    job2 = SkyPilotJob(
-        meshes={"trainers": 2},
-        cluster_name="test-cluster",
-        monarch_port=22222,
-    )
-
-    job3 = SkyPilotJob(
-        meshes={"trainers": 4},  # Different mesh config
-        cluster_name="test-cluster",
-        monarch_port=22222,
-    )
-
-    # Apply job1
-    job1.apply()
-
-    # job1 should be able to run job2 (same config)
-    assert job1.can_run(job2) is True
-
-    # job1 should NOT be able to run job3 (different mesh config)
-    assert job1.can_run(job3) is False
-
-
-def test_skypilot_job_jobs_active(mock_sky, mock_attach_to_workers, mock_configure_transport):
-    """Test the _jobs_active method."""
-    from monarch._src.job.skypilot import SkyPilotJob
-
-    job = SkyPilotJob(
-        meshes={"trainers": 1},
-        cluster_name="test-cluster",
-    )
-
-    # Before apply, should not be active
-    assert job._jobs_active() is False
-
-    # Apply the job
-    job.apply()
-
-    # After apply, should be active (mocked status returns UP)
-    assert job._jobs_active() is True
-
-
-def test_skypilot_job_serialization(mock_sky, mock_attach_to_workers, mock_configure_transport):
-    """Test that SkyPilotJob can be serialized and deserialized."""
-    from monarch._src.job.skypilot import SkyPilotJob
-    from monarch._src.job.job import job_loads
-
-    job = SkyPilotJob(
-        meshes={"trainers": 2, "workers": 1},
-        cluster_name="test-cluster",
-        monarch_port=33333,
-    )
-
-    # Serialize
-    serialized = job.dumps()
-
-    # Deserialize
-    loaded_job = job_loads(serialized)
-
-    # Check attributes
-    assert isinstance(loaded_job, SkyPilotJob)
-    assert loaded_job._meshes == {"trainers": 2, "workers": 1}
-    assert loaded_job._cluster_name == "test-cluster"
-    assert loaded_job._port == 33333
-
-
-def test_skypilot_job_with_setup_commands(mock_configure_transport):
-    """Test SkyPilotJob with custom setup commands."""
-    from monarch._src.job.skypilot import SkyPilotJob
-
-    setup = "pip install torch\npip install monarch"
-
-    job = SkyPilotJob(
-        meshes={"trainers": 1},
-        setup_commands=setup,
-    )
-
-    assert job._setup_commands == setup
-
-
-def test_skypilot_job_with_autostop(mock_configure_transport):
-    """Test SkyPilotJob with autostop configuration."""
-    from monarch._src.job.skypilot import SkyPilotJob
-
-    job = SkyPilotJob(
-        meshes={"trainers": 1},
-        idle_minutes_to_autostop=30,
-        down_on_autostop=True,
-    )
-
-    assert job._idle_minutes_to_autostop == 30
-    assert job._down_on_autostop is True
-
-
-# Integration test - only run if explicitly requested
-@pytest.mark.skip(reason="Integration test - run manually with --run-integration")
-def test_skypilot_job_integration():
-    """
-    Integration test that actually launches a SkyPilot cluster.
-
-    To run this test:
-        pytest tests/test_skypilot_job.py::test_skypilot_job_integration --run-integration
-
-    Make sure you have SkyPilot credentials configured.
-    """
-    from monarch._src.job.skypilot import SkyPilotJob
-
-    # Create a minimal job - just 1 node with cheap resources
-    job = SkyPilotJob(
-        meshes={"workers": 1},
-        resources=sky.Resources(
-            cloud=sky.AWS(),  # Change to your preferred cloud
-            cpus="2+",
-        ),
-        cluster_name="monarch-test-integration",
-        idle_minutes_to_autostop=5,
-        down_on_autostop=True,
-    )
-
-    try:
-        # Apply the job
-        job.apply()
-
-        # Check that we can get state
-        state = job.state()
-        assert hasattr(state, "workers")
-
-        print("Integration test passed!")
-    finally:
-        # Always clean up
-        job.kill()
-

From fd310d377b891dfc6f561e3938e7c9f53b826796 Mon Sep 17 00:00:00 2001
From: Romil <romil.bhardwaj@gmail.com>
Date: Mon, 8 Dec 2025 05:25:24 +0000
Subject: [PATCH 09/29] cleanup

---
 examples/skypilot_getting_started.py | 6 ------
 python/monarch/_src/job/skypilot.py  | 6 ------
 2 files changed, 12 deletions(-)

diff --git a/examples/skypilot_getting_started.py b/examples/skypilot_getting_started.py
index 3ccc1d10a..b4974e46c 100644
--- a/examples/skypilot_getting_started.py
+++ b/examples/skypilot_getting_started.py
@@ -1,10 +1,4 @@
 #!/usr/bin/env python3
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
 """
 Monarch Getting Started with SkyPilot
 =====================================
diff --git a/python/monarch/_src/job/skypilot.py b/python/monarch/_src/job/skypilot.py
index 6eade6eae..a5684b148 100644
--- a/python/monarch/_src/job/skypilot.py
+++ b/python/monarch/_src/job/skypilot.py
@@ -1,9 +1,3 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
 # pyre-unsafe
 
 import logging

From 8511e5a4870ee80214f870aa2f5fa83eafab7efc Mon Sep 17 00:00:00 2001
From: Romil <romil.bhardwaj@gmail.com>
Date: Mon, 8 Dec 2025 06:06:31 +0000
Subject: [PATCH 10/29] updates

---
 examples/skypilot_getting_started.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/examples/skypilot_getting_started.py b/examples/skypilot_getting_started.py
index b4974e46c..3a6813e5a 100644
--- a/examples/skypilot_getting_started.py
+++ b/examples/skypilot_getting_started.py
@@ -25,7 +25,7 @@
 import os
 import sys
 
-# Set timeouts before importing monarch - worker setup takes time
+# Set timeouts before importing monarch - monarch build takes time
 os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s"
 os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s"
 os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s"
@@ -42,15 +42,10 @@
     from monarch.actor import Actor, endpoint, ProcMesh, context
 except ImportError as e:
     print(f"ERROR: Monarch is not properly installed: {e}")
-    print("\nTo install Monarch, you need to build it from source:")
-    print("  cd monarch/")
-    print("  pip install -e .")
-    print("\nThis requires the Rust toolchain and other dependencies.")
-    print("See monarch/README.md for full installation instructions.")
     sys.exit(1)
 
 # ============================================================================
-# Step 1: Define our Actors (same as getting started guide)
+# Step 1: Define actors (same as getting started guide)
 # ============================================================================
 
 
@@ -94,7 +89,7 @@ def get_cloud(cloud_name: str):
         "kubernetes": sky.Kubernetes,
         "aws": sky.AWS,
         "gcp": sky.GCP,
-        "azure": sky.Azure,
+        "azure": sky.Azure, # TODO(romilb): Add more clouds here
     }
     if cloud_name.lower() not in clouds:
         raise ValueError(f"Unknown cloud: {cloud_name}. Available: {list(clouds.keys())}")
@@ -114,6 +109,7 @@ def main():
         default=2,
         help="Number of host nodes to provision",
     )
+    # TODO(romilb): This should be parsed from the accelerator spec
     parser.add_argument(
         "--gpus-per-host",
         type=int,
@@ -156,7 +152,7 @@ def main():
     # Build resources specification
     resources_kwargs = {
         "cloud": get_cloud(args.cloud),
-        "accelerators": args.accelerator,  # GPU required - torchmonarch needs CUDA
+        "accelerators": args.accelerator,
     }
     if args.region:
         resources_kwargs["region"] = args.region
@@ -168,7 +164,6 @@ def main():
     job = SkyPilotJob(
         # Define the mesh of hosts we need
         meshes={"trainers": args.num_hosts},
-        # Specify cloud resources - GPU required for torchmonarch (needs CUDA)
         resources=sky.Resources(**resources_kwargs),
         cluster_name=args.cluster_name,
         # Auto-cleanup after 10 minutes of idle time
@@ -233,7 +228,7 @@ def main():
             print(f"    {i}")
 
         print("\n" + "=" * 60)
-        print("SUCCESS! Monarch actors ran on SkyPilot cluster!")
+        print("Success! Monarch actors ran on SkyPilot cluster!")
         print("=" * 60)
 
     except Exception as e:

From 20d36e8c08853d9b436582709f281a8c63934eec Mon Sep 17 00:00:00 2001
From: Romil <romil.bhardwaj@gmail.com>
Date: Thu, 11 Dec 2025 01:31:34 +0000
Subject: [PATCH 11/29] Extract SkyPilotJob from monarch src

---
 examples/skypilot/README.md                   | 153 ++++++
 examples/skypilot/__init__.py                 |  23 +
 .../skypilot_getting_started.py               |  51 +-
 examples/skypilot/skypilot_job.py             | 458 ++++++++++++++++++
 4 files changed, 660 insertions(+), 25 deletions(-)
 create mode 100644 examples/skypilot/README.md
 create mode 100644 examples/skypilot/__init__.py
 rename examples/{ => skypilot}/skypilot_getting_started.py (84%)
 create mode 100644 examples/skypilot/skypilot_job.py

diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md
new file mode 100644
index 000000000..74f16b9e0
--- /dev/null
+++ b/examples/skypilot/README.md
@@ -0,0 +1,153 @@
+# Monarch SkyPilot Integration
+
+This directory contains a standalone integration for running Monarch workloads on **Kubernetes and cloud VMs** via [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+## Overview
+
+`SkyPilotJob` provisions cloud instances (or K8s pods) and starts Monarch workers on them, allowing you to run distributed Monarch actors across multiple machines.
+
+**Supported platforms:**
+- Kubernetes (any cluster)
+- AWS, GCP, Azure
+- Lambda Labs, CoreWeave, RunPod, and [20+ other clouds](https://docs.skypilot.co/en/latest/getting-started/installation.html)
+
+## Installation
+
+```bash
+# Install Monarch
+pip install torchmonarch-nightly
+
+# Install SkyPilot with your preferred backend
+pip install skypilot[kubernetes]  # For Kubernetes
+pip install skypilot[aws]         # For AWS
+pip install skypilot[gcp]         # For GCP
+pip install skypilot[all]         # For all clouds
+
+# Verify SkyPilot setup
+sky check
+```
+
+## Quick Start
+
+```python
+import sky
+from skypilot_job import SkyPilotJob
+from monarch.actor import Actor, endpoint
+
+class MyActor(Actor):
+    @endpoint
+    def hello(self) -> str:
+        return "Hello from the cloud!"
+
+# Create a SkyPilot job with 2 nodes
+job = SkyPilotJob(
+    meshes={"workers": 2},
+    resources=sky.Resources(
+        cloud=sky.Kubernetes(),  # or sky.AWS(), sky.GCP(), etc.
+        accelerators="H100:1",
+    ),
+    cluster_name="my-monarch-cluster",
+    idle_minutes_to_autostop=10,
+    down_on_autostop=True,
+)
+
+# Launch and connect
+state = job.state()
+hosts = state.workers
+
+# Spawn processes and actors
+procs = hosts.spawn_procs(per_host={"gpus": 1})
+actors = procs.spawn("my_actors", MyActor)
+
+# Use your actors
+results = actors.hello.call().get()
+print(results)  # ["Hello from the cloud!", "Hello from the cloud!"]
+
+# Clean up
+job.kill()
+```
+
+## Running the Example
+
+```bash
+cd examples/skypilot
+
+# Run on Kubernetes
+python skypilot_getting_started.py --cloud kubernetes --num-hosts 2
+
+# Run on AWS
+python skypilot_getting_started.py --cloud aws --num-hosts 2 --accelerator "A100:1"
+
+# Run on GCP
+python skypilot_getting_started.py --cloud gcp --num-hosts 2 --accelerator "A100:1"
+```
+
+## Configuration Options
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `meshes` | Dict mapping mesh names to node counts | Required |
+| `resources` | SkyPilot Resources specification | None (SkyPilot defaults) |
+| `cluster_name` | Name for the cluster | Auto-generated |
+| `monarch_port` | Port for Monarch TCP communication | 22222 |
+| `idle_minutes_to_autostop` | Auto-stop after idle time | None |
+| `down_on_autostop` | Tear down on autostop vs just stop | False |
+| `setup_commands` | Custom setup script | Installs torchmonarch-nightly |
+| `workdir` | Local directory to sync to cluster | None |
+| `file_mounts` | Additional files to mount | None |
+
+## Default Image
+
+By default, `SkyPilotJob` uses the `pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime` Docker image which has compatible system libraries for `torchmonarch-nightly`. Setup time is ~1-2 minutes (just pip install).
+
+## Faster Cold Starts
+
+For faster cold starts (<30s):
+
+**Option 1: Use a pre-built Docker image**
+```python
+resources = sky.Resources(
+    image_id="docker:your-registry/monarch-image:tag",
+    accelerators="H100:1",
+)
+```
+
+**Option 2: Use SkyPilot's cluster reuse**
+```python
+job = SkyPilotJob(
+    ...,
+    idle_minutes_to_autostop=30,  # Keep cluster alive
+    down_on_autostop=False,       # Just stop, don't terminate
+)
+```
+
+## Network Requirements
+
+The client must have direct network connectivity to the worker nodes:
+- **Kubernetes**: Run the client inside the same cluster (e.g., in a pod)
+- **Cloud VMs**: Ensure security groups allow inbound traffic on port 22222
+
+## Troubleshooting
+
+**Check SkyPilot setup:**
+```bash
+sky check
+sky show-gpus
+```
+
+**View cluster logs:**
+```bash
+sky logs <cluster-name>
+```
+
+**SSH into a worker:**
+```bash
+sky ssh <cluster-name>
+```
+
+**Clean up clusters:**
+```bash
+sky down <cluster-name>
+sky down --all  # Remove all clusters
+```
+
diff --git a/examples/skypilot/__init__.py b/examples/skypilot/__init__.py
new file mode 100644
index 000000000..8e7acc6da
--- /dev/null
+++ b/examples/skypilot/__init__.py
@@ -0,0 +1,23 @@
+"""
+SkyPilot integration for Monarch.
+
+This is a standalone package that provides SkyPilotJob - a way to run Monarch
+workloads on Kubernetes and cloud VMs via SkyPilot.
+
+This package is separate from the main Monarch codebase to allow independent
+iteration and to avoid chicken-and-egg problems with releases.
+
+Usage:
+    from skypilot_job import SkyPilotJob
+    
+    job = SkyPilotJob(
+        meshes={"workers": 2},
+        resources=sky.Resources(cloud=sky.Kubernetes(), accelerators="H100:1"),
+    )
+    state = job.state()
+"""
+
+from .skypilot_job import SkyPilotJob
+
+__all__ = ["SkyPilotJob"]
+
diff --git a/examples/skypilot_getting_started.py b/examples/skypilot/skypilot_getting_started.py
similarity index 84%
rename from examples/skypilot_getting_started.py
rename to examples/skypilot/skypilot_getting_started.py
index 3a6813e5a..b9f703bee 100644
--- a/examples/skypilot_getting_started.py
+++ b/examples/skypilot/skypilot_getting_started.py
@@ -1,35 +1,41 @@
 #!/usr/bin/env python3
 """
-Monarch Getting Started with SkyPilot
-=====================================
+Running Monarch on Kubernetes with SkyPilot
+===========================================
 
 This script demonstrates running Monarch actors on cloud infrastructure
-provisioned by SkyPilot. It follows the Monarch getting started guide
-but uses SkyPilot to launch the worker nodes.
+provisioned by SkyPilot (Kubernetes or cloud VMs).
 
 Prerequisites:
-- Monarch installed with its Rust bindings (build with `pip install -e .` in monarch/)
-- SkyPilot installed and configured (run `sky check`)
+    pip install torchmonarch-nightly
+    pip install skypilot[kubernetes]  # or skypilot[aws], skypilot[gcp], etc.
+    sky check  # Verify SkyPilot configuration
 
 Usage:
-    # Run from inside a Kubernetes pod (client runs locally):
-    python examples/skypilot_getting_started.py --cloud kubernetes --num-hosts 2
+    # Run on Kubernetes:
+    python skypilot_getting_started.py --cloud kubernetes --num-hosts 2
 
-    # Run from outside the cluster using the SkyPilot YAML:
-    sky launch examples/skypilot_run_example.yaml
+    # Run on AWS:
+    python skypilot_getting_started.py --cloud aws --num-hosts 2
 
-See SKY_README.md for full documentation.
+    # Run on GCP:
+    python skypilot_getting_started.py --cloud gcp --num-hosts 2
 """
 
 import argparse
 import os
 import sys
 
-# Set timeouts before importing monarch - monarch build takes time
+# Set timeouts before importing monarch
 os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s"
 os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s"
 os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s"
 
+# If running inside a SkyPilot cluster, unset the in-cluster context
+# to allow launching new clusters on the same Kubernetes cluster
+if "SKYPILOT_IN_CLUSTER_CONTEXT_NAME" in os.environ:
+    del os.environ["SKYPILOT_IN_CLUSTER_CONTEXT_NAME"]
+
 # Check dependencies before importing
 try:
     import sky
@@ -38,12 +44,15 @@
     sys.exit(1)
 
 try:
-    from monarch.job import SkyPilotJob
     from monarch.actor import Actor, endpoint, ProcMesh, context
 except ImportError as e:
     print(f"ERROR: Monarch is not properly installed: {e}")
+    print("Run: pip install torchmonarch-nightly")
     sys.exit(1)
 
+# Import SkyPilotJob from the local package
+from skypilot_job import SkyPilotJob
+
 # ============================================================================
 # Step 1: Define actors (same as getting started guide)
 # ============================================================================
@@ -89,7 +98,8 @@ def get_cloud(cloud_name: str):
         "kubernetes": sky.Kubernetes,
         "aws": sky.AWS,
         "gcp": sky.GCP,
-        "azure": sky.Azure, # TODO(romilb): Add more clouds here
+        "azure": sky.Azure,
+        "lambda": sky.Lambda,
     }
     if cloud_name.lower() not in clouds:
         raise ValueError(f"Unknown cloud: {cloud_name}. Available: {list(clouds.keys())}")
@@ -101,7 +111,7 @@ def main():
     parser.add_argument(
         "--cloud",
         default="kubernetes",
-        help="Cloud provider to use (kubernetes, aws, gcp, azure)",
+        help="Cloud provider to use (kubernetes, aws, gcp, azure, lambda)",
     )
     parser.add_argument(
         "--num-hosts",
@@ -109,11 +119,10 @@ def main():
         default=2,
         help="Number of host nodes to provision",
     )
-    # TODO(romilb): This should be parsed from the accelerator spec
     parser.add_argument(
         "--gpus-per-host",
         type=int,
-        default=2,
+        default=1,
         help="Number of GPU processes per host",
     )
     parser.add_argument(
@@ -146,7 +155,6 @@ def main():
         print(f"  Region: {args.region}")
 
     # Create a SkyPilotJob to provision nodes
-    # This will launch cloud instances and start Monarch workers on them
     print("\n[1] Creating SkyPilot job...")
 
     # Build resources specification
@@ -156,10 +164,6 @@ def main():
     }
     if args.region:
         resources_kwargs["region"] = args.region
-
-    # Find Monarch repo root (this script is in examples/)
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    monarch_root = os.path.dirname(script_dir)  # Go up from examples/
     
     job = SkyPilotJob(
         # Define the mesh of hosts we need
@@ -169,9 +173,6 @@ def main():
         # Auto-cleanup after 10 minutes of idle time
         idle_minutes_to_autostop=10,
         down_on_autostop=True,
-        # Sync Monarch source to workers for building from source
-        # (SkyPilotJob uses default setup commands when workdir is provided)
-        workdir=monarch_root,
     )
 
     try:
diff --git a/examples/skypilot/skypilot_job.py b/examples/skypilot/skypilot_job.py
new file mode 100644
index 000000000..7b5ea1178
--- /dev/null
+++ b/examples/skypilot/skypilot_job.py
@@ -0,0 +1,458 @@
+"""
+SkyPilot integration for Monarch - standalone implementation.
+
+This module provides SkyPilotJob, which allows running Monarch workloads on
+Kubernetes and cloud VMs via SkyPilot. It is designed to be used independently
+of the main Monarch source tree.
+
+Requirements:
+    - pip install torchmonarch-nightly (or torchmonarch)
+    - pip install skypilot[kubernetes] (or other cloud backends)
+"""
+
+import logging
+import os
+import sys
+import time
+from typing import Dict, List, Optional, TYPE_CHECKING
+
+# Import Monarch's job interface
+from monarch._src.job.job import JobState, JobTrait
+
+# If running inside a SkyPilot cluster, unset the in-cluster context variable
+# to allow launching new clusters on the same Kubernetes cluster.
+# This must be done before importing sky to affect the API server.
+if "SKYPILOT_IN_CLUSTER_CONTEXT_NAME" in os.environ:
+    del os.environ["SKYPILOT_IN_CLUSTER_CONTEXT_NAME"]
+
+# Defer imports that may not be available in all environments
+if TYPE_CHECKING:
+    import sky
+
+try:
+    import sky
+    HAS_SKYPILOT = True
+except ImportError:
+    HAS_SKYPILOT = False
+    sky = None  # type: ignore[assignment]
+
+
+logger: logging.Logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+if not logger.handlers:
+    logger.addHandler(logging.StreamHandler(sys.stderr))
+logger.propagate = False
+
+# Default port for Monarch TCP communication
+DEFAULT_MONARCH_PORT = 22222
+
+# Default setup commands to install Monarch from PyPI on remote workers.
+# Requires a Docker image with Ubuntu 22.04+ for compatible libibverbs.
+#
+# Cold start time: ~1-2 minutes (pip install only).
+# For faster cold starts (<30s), use a custom Docker image with Monarch pre-installed.
+DEFAULT_SETUP_COMMANDS = """
+set -ex
+
+# Install torchmonarch from PyPI
+pip install torchmonarch-nightly
+
+echo "Done installing Monarch"
+"""
+
+# Default Docker image - PyTorch with CUDA on Ubuntu 22.04 (has compatible libibverbs)
+DEFAULT_IMAGE_ID = "docker:pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime"
+
+
+def _configure_transport() -> None:
+    """Configure the Monarch transport. Deferred import to avoid import errors."""
+    from monarch._rust_bindings.monarch_hyperactor.channel import ChannelTransport
+    from monarch._rust_bindings.monarch_hyperactor.config import configure
+
+    configure(default_transport=ChannelTransport.TcpWithHostname)
+
+
+def _attach_to_workers_wrapper(name: str, ca: str, workers: List[str]):
+    """Wrapper around attach_to_workers with deferred import."""
+    from monarch._src.actor.bootstrap import attach_to_workers
+
+    return attach_to_workers(name=name, ca=ca, workers=workers)
+
+
+class SkyPilotJob(JobTrait):
+    """
+    A job scheduler that uses SkyPilot to provision cloud instances.
+
+    SkyPilot supports multiple cloud providers (AWS, GCP, Azure, Lambda, etc.)
+    and Kubernetes, and can automatically select the cheapest available option.
+
+    This implementation:
+    1. Uses sky.launch() to provision cloud instances with specified resources
+    2. Runs Monarch workers on each node via a startup script
+    3. Connects to workers using their IP addresses from the cluster handle
+
+    Example:
+        >>> import sky
+        >>> from skypilot_job import SkyPilotJob
+        >>>
+        >>> job = SkyPilotJob(
+        ...     meshes={"trainers": 2},
+        ...     resources=sky.Resources(accelerators="A100:1"),
+        ...     cluster_name="my-monarch-cluster",
+        ... )
+        >>> state = job.state()
+        >>> trainers = state.trainers  # HostMesh with 2 nodes
+    """
+
+    def __init__(
+        self,
+        meshes: Dict[str, int],
+        resources: Optional["sky.Resources"] = None,
+        cluster_name: Optional[str] = None,
+        monarch_port: int = DEFAULT_MONARCH_PORT,
+        idle_minutes_to_autostop: Optional[int] = None,
+        down_on_autostop: bool = False,
+        python_exe: str = "python",
+        setup_commands: Optional[str] = None,
+        workdir: Optional[str] = None,
+        file_mounts: Optional[Dict[str, str]] = None,
+    ) -> None:
+        """
+        Args:
+            meshes: Dictionary mapping mesh names to number of nodes.
+                    e.g., {"trainers": 4, "dataloaders": 2}
+            resources: SkyPilot Resources specification for the instances.
+                       If None, uses SkyPilot defaults.
+            cluster_name: Name for the SkyPilot cluster. If None, auto-generated.
+            monarch_port: Port for TCP communication between Monarch workers.
+            idle_minutes_to_autostop: If set, cluster will autostop after this
+                                      many minutes of idleness.
+            down_on_autostop: If True, tear down cluster on autostop instead of
+                              just stopping it.
+            python_exe: Python executable to use for worker processes.
+            setup_commands: Optional setup commands to run before starting workers.
+                           If None, uses DEFAULT_SETUP_COMMANDS which installs
+                           torchmonarch-nightly from PyPI.
+            workdir: Local directory to sync to the cluster. If provided, this
+                    directory will be uploaded to ~/sky_workdir on each node.
+            file_mounts: Dictionary mapping remote paths to local paths for
+                        additional file mounts.
+        """
+        if not HAS_SKYPILOT:
+            raise ImportError(
+                "SkyPilot is not installed. Install it with: pip install skypilot[kubernetes]"
+            )
+
+        # Configure transport at runtime when Monarch is available
+        try:
+            _configure_transport()
+        except ImportError:
+            # Monarch bindings not available, will fail later when needed
+            pass
+
+        super().__init__()
+
+        self._meshes = meshes
+        self._resources = resources
+        self._cluster_name = cluster_name
+        self._port = monarch_port
+        self._idle_minutes_to_autostop = idle_minutes_to_autostop
+        self._down_on_autostop = down_on_autostop
+        self._python_exe = python_exe
+        self._setup_commands = setup_commands
+        self._workdir = workdir
+        self._file_mounts = file_mounts
+
+        # Runtime state
+        self._launched_cluster_name: Optional[str] = None
+        self._node_ips: List[str] = []
+
+    def _create(self, client_script: Optional[str]) -> None:
+        """Launch a SkyPilot cluster and start Monarch workers."""
+        if client_script is not None:
+            raise RuntimeError("SkyPilotJob cannot run batch-mode scripts yet")
+
+        total_nodes = sum(self._meshes.values())
+
+        # Build the worker startup command
+        worker_command = self._build_worker_command()
+
+        # Use provided setup commands or default to PyPI install
+        setup = self._setup_commands if self._setup_commands is not None else DEFAULT_SETUP_COMMANDS
+        if setup and not setup.endswith("\n"):
+            setup += "\n"
+
+        # Create the SkyPilot task
+        task = sky.Task(
+            name="monarch-workers",
+            setup=setup if setup else None,
+            run=worker_command,
+            num_nodes=total_nodes,
+            workdir=self._workdir,
+        )
+
+        # Add file mounts if provided
+        if self._file_mounts:
+            task.set_file_mounts(self._file_mounts)
+
+        # Set resources, using default image_id if not specified
+        resources = self._resources
+        if resources is not None:
+            # If no image_id specified, use the default PyTorch image
+            if resources.image_id is None:
+                resources = resources.copy(image_id=DEFAULT_IMAGE_ID)
+            task.set_resources(resources)
+        else:
+            # No resources specified, create default with image_id
+            task.set_resources(sky.Resources(image_id=DEFAULT_IMAGE_ID))
+
+        # Generate cluster name if not provided
+        cluster_name = self._cluster_name or f"monarch-{os.getpid()}"
+
+        logger.info(f"Launching SkyPilot cluster '{cluster_name}' with {total_nodes} nodes")
+
+        # Launch the cluster
+        try:
+            request_id = sky.launch(
+                task,
+                cluster_name=cluster_name,
+                idle_minutes_to_autostop=self._idle_minutes_to_autostop,
+                down=self._down_on_autostop,
+            )
+            # Get the result from the request
+            job_id, handle = sky.get(request_id)
+        except Exception as e:
+            logger.error(f"Failed to launch SkyPilot cluster: {e}")
+            raise RuntimeError(f"Failed to launch SkyPilot cluster: {e}") from e
+
+        self._launched_cluster_name = cluster_name
+        logger.info(f"SkyPilot cluster '{cluster_name}' launched successfully")
+        
+        # Wait for the job to be RUNNING (setup complete, run started)
+        self._wait_for_job_running(cluster_name, job_id, timeout=300)
+    
+    def _wait_for_job_running(self, cluster_name: str, job_id: int, timeout: int = 300) -> None:
+        """Wait for the SkyPilot job to reach RUNNING status (setup complete)."""
+        start_time = time.time()
+        poll_interval = 10  # seconds
+        
+        logger.info(f"Waiting for job {job_id} setup to complete (timeout={timeout}s)...")
+        
+        while time.time() - start_time < timeout:
+            try:
+                # Get job queue for the cluster
+                request_id = sky.queue(cluster_name)
+                jobs = sky.get(request_id)
+                
+                # Find our job
+                for job in jobs:
+                    if job.get('id') == job_id or job.get('job_id') == job_id:
+                        status = job.get('status', '')
+                        status_str = str(status)
+                        if 'RUNNING' in status_str:
+                            logger.info(f"Job {job_id} is now RUNNING (setup complete)")
+                            return
+                        elif 'FAILED' in status_str or 'CANCELLED' in status_str:
+                            raise RuntimeError(f"Job {job_id} failed with status: {status}. Check logs with: sky logs {cluster_name}")
+                        else:
+                            elapsed = int(time.time() - start_time)
+                            logger.info(f"Job {job_id} status: {status} (waited {elapsed}s)")
+                        break
+                
+            except Exception as e:
+                logger.warning(f"Error checking job status: {e}")
+            
+            time.sleep(poll_interval)
+        
+        raise RuntimeError(f"Timeout waiting for job {job_id} to reach RUNNING status")
+
+    def _build_worker_command(self) -> str:
+        """Build the bash command to start Monarch workers on each node."""
+        # This command will be run on each node via SkyPilot
+        # SkyPilot expects a bash script, so we wrap Python code in python -c
+        # Note: Use IP address (not hostname) for the worker address since
+        # Kubernetes hostnames may not resolve across pods
+        python_code = f'''
+import socket
+import logging
+import sys
+
+# Enable verbose logging
+logging.basicConfig(level=logging.DEBUG, stream=sys.stdout, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
+
+hostname = socket.gethostname()
+ip_addr = socket.gethostbyname(hostname)
+address = f"tcp://{{ip_addr}}:{self._port}"
+print(f"Starting Monarch worker at {{address}} (hostname={{hostname}})", flush=True)
+sys.stdout.flush()
+
+try:
+    from monarch.actor import run_worker_loop_forever
+    print(f"Imported run_worker_loop_forever successfully", flush=True)
+    print(f"Worker ready and listening...", flush=True)
+    run_worker_loop_forever(address=address, ca="trust_all_connections")
+except Exception as e:
+    print(f"ERROR in worker: {{e}}", flush=True)
+    import traceback
+    traceback.print_exc()
+    raise
+'''
+        # Escape single quotes in the Python code for bash
+        escaped_code = python_code.replace("'", "'\"'\"'")
+        # Set timeout env vars
+        env_vars = " ".join([
+            "export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=5m",
+            "export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT=5m",
+            "export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE=5m",
+        ])
+        return f"{env_vars} && {self._python_exe} -c '{escaped_code}'"
+
+    def _get_node_ips(self) -> List[str]:
+        """Get the IP addresses of all nodes in the cluster."""
+        if not self._launched_cluster_name:
+            raise RuntimeError("Cluster has not been launched yet")
+
+        # Query cluster status to get handle with node IPs
+        try:
+            request_id = sky.status(cluster_names=[self._launched_cluster_name])
+            statuses = sky.get(request_id)
+        except Exception as e:
+            raise RuntimeError(f"Failed to get cluster status: {e}") from e
+
+        if not statuses:
+            raise RuntimeError(
+                f"Cluster '{self._launched_cluster_name}' not found"
+            )
+
+        status = statuses[0]
+        handle = status.handle
+
+        if handle is None:
+            raise RuntimeError(
+                f"Cluster '{self._launched_cluster_name}' has no handle"
+            )
+        
+        # Get the external IPs from the handle
+        if handle.stable_internal_external_ips is None:
+            raise RuntimeError("Cluster has no IP information")
+
+        # stable_internal_external_ips is List[Tuple[internal_ip, external_ip]]
+        # We use external IPs to connect
+        ips = []
+        for internal_ip, external_ip in handle.stable_internal_external_ips:
+            # Prefer external IP, fall back to internal
+            ip = external_ip if external_ip else internal_ip
+            if ip:
+                ips.append(ip)
+
+        if not ips:
+            raise RuntimeError("No IP addresses found for cluster nodes")
+
+        return ips
+
+    def _wait_for_workers_ready(
+        self, expected_nodes: int, timeout: int = 300, poll_interval: int = 5
+    ) -> List[str]:
+        """Wait for workers to be ready and return their addresses."""
+        start_time = time.time()
+
+        while time.time() - start_time < timeout:
+            try:
+                ips = self._get_node_ips()
+                if len(ips) >= expected_nodes:
+                    logger.info(f"Found {len(ips)} nodes ready")
+                    return ips
+            except Exception as e:
+                logger.debug(f"Waiting for workers: {e}")
+
+            time.sleep(poll_interval)
+
+        raise RuntimeError(
+            f"Timeout waiting for {expected_nodes} workers after {timeout}s"
+        )
+
+    def _state(self) -> JobState:
+        """Get the current state with HostMesh objects for each mesh."""
+        if not self._jobs_active():
+            raise RuntimeError("SkyPilot cluster is not active")
+
+        # Get node IPs if not cached
+        if not self._node_ips:
+            total_nodes = sum(self._meshes.values())
+            self._node_ips = self._wait_for_workers_ready(total_nodes)
+
+        # Distribute IPs among meshes
+        host_meshes = {}
+        ip_idx = 0
+
+        for mesh_name, num_nodes in self._meshes.items():
+            mesh_ips = self._node_ips[ip_idx : ip_idx + num_nodes]
+            ip_idx += num_nodes
+
+            workers = [f"tcp://{ip}:{self._port}" for ip in mesh_ips]
+            logger.info(f"Connecting to workers for mesh '{mesh_name}': {workers}")
+
+            host_mesh = _attach_to_workers_wrapper(
+                name=mesh_name,
+                ca="trust_all_connections",
+                workers=workers,
+            )
+            
+            # Wait for the host mesh to be initialized (connections established)
+            logger.info(f"Waiting for host mesh '{mesh_name}' to initialize...")
+            host_mesh.initialized.get()
+            logger.info(f"Host mesh '{mesh_name}' initialized successfully")
+            
+            # Give connections a moment to fully stabilize
+            time.sleep(5)
+            logger.info(f"Host mesh '{mesh_name}' ready")
+            
+            host_meshes[mesh_name] = host_mesh
+
+        return JobState(host_meshes)
+
+    def can_run(self, spec: "JobTrait") -> bool:
+        """Check if this job can run the given spec."""
+        if not isinstance(spec, SkyPilotJob):
+            return False
+
+        return (
+            spec._meshes == self._meshes
+            and spec._resources == self._resources
+            and spec._port == self._port
+            and self._jobs_active()
+        )
+
+    def _jobs_active(self) -> bool:
+        """Check if the SkyPilot cluster is still active."""
+        if not self.active or not self._launched_cluster_name:
+            return False
+
+        try:
+            request_id = sky.status(cluster_names=[self._launched_cluster_name])
+            statuses = sky.get(request_id)
+
+            if not statuses:
+                return False
+
+            status = statuses[0]
+            # Check if cluster is UP
+            return status.status == sky.ClusterStatus.UP
+        except Exception as e:
+            logger.warning(f"Error checking cluster status: {e}")
+            return False
+
+    def _kill(self) -> None:
+        """Tear down the SkyPilot cluster."""
+        if self._launched_cluster_name is not None:
+            try:
+                logger.info(f"Tearing down SkyPilot cluster '{self._launched_cluster_name}'")
+                request_id = sky.down(self._launched_cluster_name)
+                sky.get(request_id)
+                logger.info(f"Cluster '{self._launched_cluster_name}' terminated")
+            except Exception as e:
+                logger.warning(f"Failed to tear down cluster: {e}")
+
+        self._launched_cluster_name = None
+        self._node_ips.clear()
+

From e23bd3faa3b2e881bb44a062106b2d9bd9f28843 Mon Sep 17 00:00:00 2001
From: Romil <romil.bhardwaj@gmail.com>
Date: Thu, 11 Dec 2025 01:42:49 +0000
Subject: [PATCH 12/29] remove stale changes

---
 python/monarch/_src/job/skypilot.py | 488 ----------------------------
 python/monarch/job/__init__.py      |   7 -
 2 files changed, 495 deletions(-)
 delete mode 100644 python/monarch/_src/job/skypilot.py

diff --git a/python/monarch/_src/job/skypilot.py b/python/monarch/_src/job/skypilot.py
deleted file mode 100644
index a5684b148..000000000
--- a/python/monarch/_src/job/skypilot.py
+++ /dev/null
@@ -1,488 +0,0 @@
-# pyre-unsafe
-
-import logging
-import os
-import sys
-import time
-from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
-
-from monarch._src.job.job import JobState, JobTrait
-
-# Defer imports that may not be available in all environments
-if TYPE_CHECKING:
-    import sky
-    from sky.backends.cloud_vm_ray_backend import CloudVmRayResourceHandle
-
-try:
-    import sky
-    from sky.backends.cloud_vm_ray_backend import CloudVmRayResourceHandle
-
-    HAS_SKYPILOT = True
-except ImportError:
-    HAS_SKYPILOT = False
-    sky = None  # type: ignore[assignment]
-    CloudVmRayResourceHandle = None  # type: ignore[assignment, misc]
-
-
-logger: logging.Logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-logger.addHandler(logging.StreamHandler(sys.stderr))
-logger.propagate = False
-
-# Default port for Monarch TCP communication
-DEFAULT_MONARCH_PORT = 22222
-
-# Default setup commands to build Monarch from source on remote workers.
-# NOTE: Cold start is slow (~7-10 minutes) because we need to compile Monarch
-# each worker This is necessary to ensure client/worker version compatibility
-# when using a development branch. For production use, consider
-# using pre-built wheels from PyPI (pip install torchmonarch).
-#
-# For faster cold starts (<30s), use a custom Docker image with all dependencies
-# pre-installed by setting image_id in sky.Resources:
-#   resources = sky.Resources(image_id="docker:your-registry/monarch-image:tag", ...)
-DEFAULT_SETUP_COMMANDS = """
-set -ex
-
-# Add PPA for newer toolchains
-sudo apt-get update
-sudo apt-get install -y software-properties-common
-sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
-sudo apt-get update
-
-# Install system dependencies
-sudo apt-get install -y \
-  build-essential \
-  ninja-build \
-  g++-11 \
-  rdma-core \
-  libibverbs1 \
-  libmlx5-1 \
-  libibverbs-dev \
-  curl \
-  pkg-config \
-  libssl-dev
-
-# Install CUDA toolkit and NCCL
-wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
-sudo dpkg -i cuda-keyring_1.1-1_all.deb
-sudo apt-get update
-sudo apt-get install -y cuda-toolkit-12-1
-sudo apt-get install -y --allow-change-held-packages libnccl2=2.28.9-1+cuda12.9 libnccl-dev=2.28.9-1+cuda12.9
-
-# Install Rust
-curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-source $HOME/.cargo/env
-rustup default nightly
-
-# Install Python dependencies and build Monarch from source
-cd ~/sky_workdir
-pip install setuptools-rust maturin
-pip install -r torch-requirements.txt -r build-requirements.txt
-CC=gcc-11 CXX=g++-11 USE_TENSOR_ENGINE=0 pip install --no-build-isolation .
-
-echo "Done installing Monarch"
-"""
-
-
-def _configure_transport() -> None:
-    """Configure the Monarch transport. Deferred import to avoid import errors."""
-    from monarch._rust_bindings.monarch_hyperactor.channel import ChannelTransport
-    from monarch._rust_bindings.monarch_hyperactor.config import configure
-
-    configure(default_transport=ChannelTransport.TcpWithHostname)
-
-
-def _attach_to_workers_wrapper(name: str, ca: str, workers: List[str]):
-    """Wrapper around attach_to_workers with deferred import."""
-    from monarch._src.actor.bootstrap import attach_to_workers
-
-    return attach_to_workers(name=name, ca=ca, workers=workers)
-
-
-class SkyPilotJob(JobTrait):
-    
-    """
-    A job scheduler that uses SkyPilot to provision cloud instances.
-
-    SkyPilot supports multiple cloud providers (AWS, GCP, Azure, Lambda, etc.)
-    and can automatically select the cheapest available option.
-
-    This implementation:
-    1. Uses sky.launch() to provision cloud instances with specified resources
-    2. Runs Monarch workers on each node via a startup script
-    3. Connects to workers using their IP addresses from the cluster handle
-
-    Example:
-        >>> import sky
-        >>> from monarch.job import SkyPilotJob
-        >>>
-        >>> job = SkyPilotJob(
-        ...     meshes={"trainers": 2},
-        ...     resources=sky.Resources(accelerators="A100:1"),
-        ...     cluster_name="my-monarch-cluster",
-        ... )
-        >>> state = job.state()
-        >>> trainers = state.trainers  # HostMesh with 2 nodes
-    """
-
-    def __init__(
-        self,
-        meshes: Dict[str, int],
-        resources: Optional["sky.Resources"] = None,
-        cluster_name: Optional[str] = None,
-        monarch_port: int = DEFAULT_MONARCH_PORT,
-        idle_minutes_to_autostop: Optional[int] = None,
-        down_on_autostop: bool = False,
-        python_exe: str = "python",
-        setup_commands: Optional[str] = None,
-        workdir: Optional[str] = None,
-        file_mounts: Optional[Dict[str, str]] = None,
-    ) -> None:
-        """
-        Args:
-            meshes: Dictionary mapping mesh names to number of nodes.
-                    e.g., {"trainers": 4, "dataloaders": 2}
-            resources: SkyPilot Resources specification for the instances.
-                       If None, uses SkyPilot defaults.
-            cluster_name: Name for the SkyPilot cluster. If None, auto-generated.
-            monarch_port: Port for TCP communication between Monarch workers.
-            idle_minutes_to_autostop: If set, cluster will autostop after this
-                                      many minutes of idleness.
-            down_on_autostop: If True, tear down cluster on autostop instead of
-                              just stopping it.
-            python_exe: Python executable to use for worker processes.
-            setup_commands: Optional setup commands to run before starting workers.
-                           Use this to install dependencies including Monarch.
-                           If None and workdir is provided, uses DEFAULT_SETUP_COMMANDS
-                           which builds Monarch from source.
-            workdir: Local directory to sync to the cluster. If provided, this
-                    directory will be uploaded to ~/sky_workdir on each node.
-                    When using workdir with the Monarch repo, DEFAULT_SETUP_COMMANDS
-                    will build Monarch from source on each worker.
-            file_mounts: Dictionary mapping remote paths to local paths for
-                        additional file mounts.
-        """
-        if not HAS_SKYPILOT:
-            raise ImportError(
-                "SkyPilot is not installed. Install it with: pip install skypilot"
-            )
-
-        # Configure transport at runtime when Monarch is available
-        try:
-            _configure_transport()
-        except ImportError:
-            # Monarch bindings not available, will fail later when needed
-            pass
-
-        super().__init__()
-
-        self._meshes = meshes
-        self._resources = resources
-        self._cluster_name = cluster_name
-        self._port = monarch_port
-        self._idle_minutes_to_autostop = idle_minutes_to_autostop
-        self._down_on_autostop = down_on_autostop
-        self._python_exe = python_exe
-        self._setup_commands = setup_commands
-        self._workdir = workdir
-        self._file_mounts = file_mounts
-
-        # Runtime state
-        self._launched_cluster_name: Optional[str] = None
-        self._node_ips: List[str] = []
-
-    def _create(self, client_script: Optional[str]) -> None:
-        """Launch a SkyPilot cluster and start Monarch workers."""
-        if client_script is not None:
-            raise RuntimeError("SkyPilotJob cannot run batch-mode scripts yet")
-
-        total_nodes = sum(self._meshes.values())
-
-        # Build the worker startup command
-        worker_command = self._build_worker_command()
-
-        # Create setup commands
-        # If workdir is provided but no setup_commands, use defaults to build Monarch
-        if self._setup_commands is not None:
-            setup = self._setup_commands
-        elif self._workdir is not None:
-            setup = DEFAULT_SETUP_COMMANDS
-        else:
-            setup = ""
-        if setup and not setup.endswith("\n"):
-            setup += "\n"
-
-        # Create the SkyPilot task
-        task = sky.Task(
-            name="monarch-workers",
-            setup=setup if setup else None,
-            run=worker_command,
-            num_nodes=total_nodes,
-            workdir=self._workdir,
-        )
-
-        # Add file mounts if provided
-        if self._file_mounts:
-            task.set_file_mounts(self._file_mounts)
-
-        if self._resources is not None:
-            task.set_resources(self._resources)
-
-        # Generate cluster name if not provided
-        cluster_name = self._cluster_name or f"monarch-{os.getpid()}"
-
-        logger.info(f"Launching SkyPilot cluster '{cluster_name}' with {total_nodes} nodes")
-
-        # Launch the cluster
-        # Note: sky.launch returns a request ID in the SDK, we need to get the result
-        try:
-            request_id = sky.launch(
-                task,
-                cluster_name=cluster_name,
-                idle_minutes_to_autostop=self._idle_minutes_to_autostop,
-                down=self._down_on_autostop,
-            )
-            # Get the result from the request
-            job_id, handle = sky.get(request_id)
-        except Exception as e:
-            logger.error(f"Failed to launch SkyPilot cluster: {e}")
-            raise RuntimeError(f"Failed to launch SkyPilot cluster: {e}") from e
-
-        self._launched_cluster_name = cluster_name
-        logger.info(f"SkyPilot cluster '{cluster_name}' launched successfully")
-        
-        # Wait for the job to be RUNNING (setup complete, run started)
-        self._wait_for_job_running(cluster_name, job_id, timeout=900)
-    
-    def _wait_for_job_running(self, cluster_name: str, job_id: int, timeout: int = 900) -> None:
-        """Wait for the SkyPilot job to reach RUNNING status (setup complete)."""
-        import time
-        start_time = time.time()
-        poll_interval = 10  # seconds
-        
-        logger.info(f"Waiting for job {job_id} setup to complete (timeout={timeout}s)...")
-        
-        while time.time() - start_time < timeout:
-            try:
-                # Get job queue for the cluster
-                request_id = sky.queue(cluster_name)
-                jobs = sky.get(request_id)
-                
-                # Find our job
-                for job in jobs:
-                    if job.get('id') == job_id or job.get('job_id') == job_id:
-                        status = job.get('status', '')
-                        status_str = str(status)
-                        if 'RUNNING' in status_str:
-                            logger.info(f"Job {job_id} is now RUNNING (setup complete)")
-                            return
-                        elif 'FAILED' in status_str or 'CANCELLED' in status_str:
-                            raise RuntimeError(f"Job {job_id} failed with status: {status}. Check logs with: sky logs {cluster_name}")
-                        else:
-                            elapsed = int(time.time() - start_time)
-                            logger.info(f"Job {job_id} status: {status} (waited {elapsed}s)")
-                        break
-                
-            except Exception as e:
-                logger.warning(f"Error checking job status: {e}")
-            
-            time.sleep(poll_interval)
-        
-        raise RuntimeError(f"Timeout waiting for job {job_id} to reach RUNNING status")
-
-    def _build_worker_command(self) -> str:
-        """Build the bash command to start Monarch workers on each node."""
-        # This command will be run on each node via SkyPilot
-        # SkyPilot expects a bash script, so we wrap Python code in python -c
-        # Note: Use IP address (not hostname) for the worker address since
-        # Kubernetes hostnames may not resolve across pods
-        python_code = f'''
-import socket
-import logging
-import sys
-
-# Enable verbose logging
-logging.basicConfig(level=logging.DEBUG, stream=sys.stdout, format="%(asctime)s %(levelname)s %(name)s: %(message)s")
-
-hostname = socket.gethostname()
-ip_addr = socket.gethostbyname(hostname)
-address = f"tcp://{{ip_addr}}:{self._port}"
-print(f"Starting Monarch worker at {{address}} (hostname={{hostname}})", flush=True)
-sys.stdout.flush()
-
-try:
-    from monarch.actor import run_worker_loop_forever
-    print(f"Imported run_worker_loop_forever successfully", flush=True)
-    print(f"Worker ready and listening...", flush=True)
-    run_worker_loop_forever(address=address, ca="trust_all_connections")
-except Exception as e:
-    print(f"ERROR in worker: {{e}}", flush=True)
-    import traceback
-    traceback.print_exc()
-    raise
-'''
-        # Escape single quotes in the Python code for bash
-        escaped_code = python_code.replace("'", "'\"'\"'")
-        # Set timeout env vars - setup takes time (building from source) so we need longer timeouts
-        env_vars = " ".join([
-            "export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=15m",
-            "export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT=15m",
-            "export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE=15m",
-        ])
-        return f"{env_vars} && {self._python_exe} -c '{escaped_code}'"
-
-    def _get_node_ips(self) -> List[str]:
-        """Get the IP addresses of all nodes in the cluster."""
-        if not self._launched_cluster_name:
-            raise RuntimeError("Cluster has not been launched yet")
-
-        # Query cluster status to get handle with node IPs
-        try:
-            request_id = sky.status(cluster_names=[self._launched_cluster_name])
-            statuses = sky.get(request_id)
-        except Exception as e:
-            raise RuntimeError(f"Failed to get cluster status: {e}") from e
-
-        if not statuses:
-            raise RuntimeError(
-                f"Cluster '{self._launched_cluster_name}' not found"
-            )
-
-        status = statuses[0]
-        handle = status.handle
-
-        if handle is None:
-            raise RuntimeError(
-                f"Cluster '{self._launched_cluster_name}' has no handle"
-            )
-
-        if not isinstance(handle, CloudVmRayResourceHandle):
-            raise RuntimeError(
-                f"Unexpected handle type: {type(handle)}"
-            )
-
-        # Get the external IPs from the handle
-        if handle.stable_internal_external_ips is None:
-            raise RuntimeError("Cluster has no IP information")
-
-        # stable_internal_external_ips is List[Tuple[internal_ip, external_ip]]
-        # We use external IPs to connect
-        ips = []
-        for internal_ip, external_ip in handle.stable_internal_external_ips:
-            # Prefer external IP, fall back to internal
-            ip = external_ip if external_ip else internal_ip
-            if ip:
-                ips.append(ip)
-
-        if not ips:
-            raise RuntimeError("No IP addresses found for cluster nodes")
-
-        return ips
-
-    def _wait_for_workers_ready(
-        self, expected_nodes: int, timeout: int = 300, poll_interval: int = 5
-    ) -> List[str]:
-        """Wait for workers to be ready and return their addresses."""
-        start_time = time.time()
-
-        while time.time() - start_time < timeout:
-            try:
-                ips = self._get_node_ips()
-                if len(ips) >= expected_nodes:
-                    logger.info(f"Found {len(ips)} nodes ready")
-                    return ips
-            except Exception as e:
-                logger.debug(f"Waiting for workers: {e}")
-
-            time.sleep(poll_interval)
-
-        raise RuntimeError(
-            f"Timeout waiting for {expected_nodes} workers after {timeout}s"
-        )
-
-    def _state(self) -> JobState:
-        """Get the current state with HostMesh objects for each mesh."""
-        if not self._jobs_active():
-            raise RuntimeError("SkyPilot cluster is not active")
-
-        # Get node IPs if not cached
-        if not self._node_ips:
-            total_nodes = sum(self._meshes.values())
-            self._node_ips = self._wait_for_workers_ready(total_nodes)
-
-        # Distribute IPs among meshes
-        host_meshes = {}
-        ip_idx = 0
-
-        for mesh_name, num_nodes in self._meshes.items():
-            mesh_ips = self._node_ips[ip_idx : ip_idx + num_nodes]
-            ip_idx += num_nodes
-
-            workers = [f"tcp://{ip}:{self._port}" for ip in mesh_ips]
-            logger.info(f"Connecting to workers for mesh '{mesh_name}': {workers}")
-
-            host_mesh = _attach_to_workers_wrapper(
-                name=mesh_name,
-                ca="trust_all_connections",
-                workers=workers,
-            )
-            
-            # Wait for the host mesh to be initialized (connections established)
-            logger.info(f"Waiting for host mesh '{mesh_name}' to initialize...")
-            host_mesh.initialized.get()
-            logger.info(f"Host mesh '{mesh_name}' initialized successfully")
-            
-            # Give connections a moment to fully stabilize
-            time.sleep(5)
-            logger.info(f"Host mesh '{mesh_name}' ready")
-            
-            host_meshes[mesh_name] = host_mesh
-
-        return JobState(host_meshes)
-
-    def can_run(self, spec: "JobTrait") -> bool:
-        """Check if this job can run the given spec."""
-        if not isinstance(spec, SkyPilotJob):
-            return False
-
-        return (
-            spec._meshes == self._meshes
-            and spec._resources == self._resources
-            and spec._port == self._port
-            and self._jobs_active()
-        )
-
-    def _jobs_active(self) -> bool:
-        """Check if the SkyPilot cluster is still active."""
-        if not self.active or not self._launched_cluster_name:
-            return False
-
-        try:
-            request_id = sky.status(cluster_names=[self._launched_cluster_name])
-            statuses = sky.get(request_id)
-
-            if not statuses:
-                return False
-
-            status = statuses[0]
-            # Check if cluster is UP
-            return status.status == sky.ClusterStatus.UP
-        except Exception as e:
-            logger.warning(f"Error checking cluster status: {e}")
-            return False
-
-    def _kill(self) -> None:
-        """Tear down the SkyPilot cluster."""
-        if self._launched_cluster_name is not None:
-            try:
-                logger.info(f"Tearing down SkyPilot cluster '{self._launched_cluster_name}'")
-                request_id = sky.down(self._launched_cluster_name)
-                sky.get(request_id)
-                logger.info(f"Cluster '{self._launched_cluster_name}' terminated")
-            except Exception as e:
-                logger.warning(f"Failed to tear down cluster: {e}")
-
-        self._launched_cluster_name = None
-        self._node_ips.clear()
-
diff --git a/python/monarch/job/__init__.py b/python/monarch/job/__init__.py
index 674007d53..0f6ec1960 100644
--- a/python/monarch/job/__init__.py
+++ b/python/monarch/job/__init__.py
@@ -8,12 +8,6 @@
 from monarch._src.job.job import job_load, job_loads, JobState, JobTrait, LocalJob
 from monarch._src.job.slurm import SlurmJob
 
-# SkyPilot is an optional dependency
-try:
-    from monarch._src.job.skypilot import SkyPilotJob
-except ImportError:
-    SkyPilotJob = None  # type: ignore[misc,assignment]
-
 # Define exports
 __all__ = [
     "JobTrait",
@@ -22,5 +16,4 @@
     "JobState",
     "LocalJob",
     "SlurmJob",
-    "SkyPilotJob",
 ]

From 40f3a6a9c06944d79c9baee1f4a81cc4c3a89e4a Mon Sep 17 00:00:00 2001
From: Romil <romil.bhardwaj@gmail.com>
Date: Thu, 11 Dec 2025 02:40:18 +0000
Subject: [PATCH 13/29] Add DDP and titan examples

---
 examples/skypilot/README.md         |  67 ++++++++
 examples/skypilot/skypilot_ddp.py   | 200 +++++++++++++++++++++++
 examples/skypilot/skypilot_titan.py | 245 ++++++++++++++++++++++++++++
 3 files changed, 512 insertions(+)
 create mode 100644 examples/skypilot/skypilot_ddp.py
 create mode 100644 examples/skypilot/skypilot_titan.py

diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md
index 74f16b9e0..153089ad1 100644
--- a/examples/skypilot/README.md
+++ b/examples/skypilot/README.md
@@ -82,6 +82,73 @@ python getting_started.py --cloud aws --num-hosts 2 --accelerator "A100:1"
 python getting_started.py --cloud gcp --num-hosts 2 --accelerator "A100:1"
 ```
 
+Example output:
+```
+$ python skypilot_getting_started.py --num-hosts 2 --gpus-per-host 1 --cluster-name monarch-skypilot-test
+
+============================================================
+Monarch Getting Started with SkyPilot
+============================================================
+
+Configuration:
+  Cloud: kubernetes
+  Hosts: 2
+  GPUs per host: 1
+  Accelerator: H200:1
+  Cluster name: monarch-skypilot-test
+
+[1] Creating SkyPilot job...
+
+[2] Launching cluster and starting Monarch workers...
+No cached job found at path: .monarch/job_state.pkl
+Applying current job
+Launching SkyPilot cluster 'monarch-skypilot-test' with 2 nodes
+Running on cluster: monarch-skypilot-test
+SkyPilot cluster 'monarch-skypilot-test' launched successfully
+Waiting for job 1 setup to complete (timeout=300s)...
+Job 1 status: JobStatus.SETTING_UP (waited 5s)
+Job 1 is now RUNNING (setup complete)
+Saving job to cache at .monarch/job_state.pkl
+Job has started, connecting to current state
+Found 2 nodes ready
+Connecting to workers for mesh 'trainers': ['tcp://10.0.4.22:22222', 'tcp://10.0.4.112:22222']
+Monarch internal logs are being written to /tmp/sky/monarch_log.log; execution id sky_Dec-11_01:31_653
+Waiting for host mesh 'trainers' to initialize...
+Host mesh 'trainers' initialized successfully
+Host mesh 'trainers' ready
+    Got host mesh with extent: {hosts: 2}
+
+[3] Spawning processes on cloud hosts...
+    Process mesh extent: {hosts: 2, gpus: 1}
+
+[4] Spawning Counter actors...
+
+[5] Broadcasting increment to all counters...
+
+[6] Getting counter values...
+    Counter values: ValueMesh({hosts: 2, gpus: 1}):
+  (({'hosts': 0/2, 'gpus': 0/1}, 3), ({'hosts': 1/2, 'gpus': 0/1}, 3))
+
+[7] Spawning Trainer actors...
+
+[8] Performing distributed training step...
+    ({'hosts': 0/2, 'gpus': 0/1}, "Trainer {'hosts': 0/2, 'gpus': 0/1} taking a step.")
+    ({'hosts': 1/2, 'gpus': 0/1}, "Trainer {'hosts': 1/2, 'gpus': 0/1} taking a step.")
+
+[9] Getting trainer info...
+    ({'hosts': 0/2, 'gpus': 0/1}, "Trainer at rank {'hosts': 0/2, 'gpus': 0/1}")
+    ({'hosts': 1/2, 'gpus': 0/1}, "Trainer at rank {'hosts': 1/2, 'gpus': 0/1}")
+
+============================================================
+Success! Monarch actors ran on SkyPilot cluster!
+============================================================
+
+[10] Cleaning up SkyPilot cluster...
+Tearing down SkyPilot cluster 'monarch-skypilot-test'
+Cluster 'monarch-skypilot-test' terminated
+    Cluster terminated.
+```
+
 ## Configuration Options
 
 | Parameter | Description | Default |
diff --git a/examples/skypilot/skypilot_ddp.py b/examples/skypilot/skypilot_ddp.py
new file mode 100644
index 000000000..9b9657428
--- /dev/null
+++ b/examples/skypilot/skypilot_ddp.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+"""
+Monarch DDP Example with SkyPilot
+=================================
+
+This script demonstrates running PyTorch DDP (DistributedDataParallel) training
+on cloud infrastructure provisioned by SkyPilot.
+
+Adapted from the SLURM DDP example (slurm_ddp.ipynb).
+
+Usage:
+    python skypilot_ddp.py --num-hosts 2 --gpus-per-host 1
+"""
+
+import argparse
+import asyncio
+import logging
+import os
+import sys
+
+# Set timeouts before importing monarch
+os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s"
+os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s"
+os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s"
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.optim as optim
+
+from monarch.actor import Actor, current_rank, endpoint
+from monarch.utils import setup_env_for_distributed
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+# Import SkyPilotJob from local module
+from skypilot_job import SkyPilotJob
+
+try:
+    import sky
+except ImportError:
+    print("ERROR: SkyPilot is not installed. Run: pip install skypilot[kubernetes]")
+    sys.exit(1)
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(name)s %(asctime)s %(levelname)s %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    force=True,
+)
+logger = logging.getLogger(__name__)
+
+
+class ToyModel(nn.Module):
+    """Toy demo network: Linear(10, 10) -> ReLU -> Linear(10, 5)."""
+
+    def __init__(self) -> None:
+        super(ToyModel, self).__init__()
+        self.net1 = nn.Linear(10, 10)
+        self.relu = nn.ReLU()
+        self.net2 = nn.Linear(10, 5)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.net2(self.relu(self.net1(x)))
+
+
+class DDPActor(Actor):
+    """Actor that runs the basic-use-case flow of PyTorch's DDP tutorial for one rank.
+
+    Adapted from: https://docs.pytorch.org/tutorials/intermediate/ddp_tutorial.html#basic-use-case
+    """
+
+    def __init__(self) -> None:
+        self.rank = current_rank().rank  # reused by _rprint() and as the process-group rank in setup()
+
+    def _rprint(self, msg: str) -> None:
+        """Print ``msg`` prefixed with ``self.rank=<rank>``."""
+        print(f"{self.rank=} {msg}")
+
+    @endpoint
+    async def setup(self):
+        """Initialize the PyTorch distributed process group using the "gloo" backend."""
+        self._rprint("Initializing torch distributed")
+        # WORLD_SIZE must already be in os.environ (KeyError otherwise); presumably set by setup_env_for_distributed() - confirm
+        WORLD_SIZE = int(os.environ["WORLD_SIZE"])
+        # initialize the process group
+        dist.init_process_group("gloo", rank=self.rank, world_size=WORLD_SIZE)
+        self._rprint("Finished initializing torch distributed")
+
+    @endpoint
+    async def cleanup(self):
+        """Destroy the process group created by setup()."""
+        self._rprint("Cleaning up torch distributed")
+        dist.destroy_process_group()
+
+    @endpoint
+    async def demo_basic(self):
+        """Run a single forward/backward/optimizer step of ToyModel wrapped in DDP."""
+        self._rprint("Running basic DDP example")
+
+        # create the model and move it to the device indexed by LOCAL_RANK (read from the environment)
+        local_rank = int(os.environ["LOCAL_RANK"])
+        self._rprint(f"{local_rank=}")
+        model = ToyModel().to(local_rank)
+        ddp_model = DDP(model, device_ids=[local_rank])
+
+        loss_fn = nn.MSELoss()
+        optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
+        # NOTE(review): the input batch below is created on CPU while labels are moved to local_rank - presumably DDP moves inputs to device_ids[0]; confirm
+        optimizer.zero_grad()
+        outputs = ddp_model(torch.randn(20, 10))
+        labels = torch.randn(20, 5).to(local_rank)
+        loss_fn(outputs, labels).backward()
+        optimizer.step()
+
+        print(f"{self.rank=} Finished running basic DDP example")
+
+
+def get_cloud(cloud_name: str):
+    """Return a new instance of the sky cloud class named by cloud_name (case-insensitive); raise ValueError if unknown."""
+    clouds = {
+        "kubernetes": sky.Kubernetes,
+        "aws": sky.AWS,
+        "gcp": sky.GCP,
+        "azure": sky.Azure,
+    }
+    if cloud_name.lower() not in clouds:
+        raise ValueError(f"Unknown cloud: {cloud_name}. Available: {list(clouds.keys())}")
+    return clouds[cloud_name.lower()]()
+
+
+async def main() -> None:  # parse CLI args, launch the cluster, run the DDP demo, tear down on success
+    parser = argparse.ArgumentParser(description="Monarch DDP with SkyPilot")
+    parser.add_argument("--cloud", default="kubernetes", help="Cloud provider")
+    parser.add_argument("--num-hosts", type=int, default=2, help="Number of hosts")
+    parser.add_argument("--gpus-per-host", type=int, default=1, help="GPUs per host")
+    parser.add_argument("--cluster-name", default="monarch-ddp", help="Cluster name")
+    parser.add_argument("--accelerator", default="H200:1", help="GPU accelerator")
+    args = parser.parse_args()
+    # Banner plus an echo of the parsed configuration.
+    print("=" * 60)
+    print("Monarch DDP Example with SkyPilot")
+    print("=" * 60)
+    print(f"\nConfiguration:")
+    print(f"  Cloud: {args.cloud}")
+    print(f"  Hosts: {args.num_hosts}")
+    print(f"  GPUs per host: {args.gpus_per_host}")
+    print(f"  Accelerator: {args.accelerator}")
+
+    # Create SkyPilot job: a single mesh named "mesh0" with num_hosts nodes
+    job = SkyPilotJob(
+        meshes={"mesh0": args.num_hosts},
+        resources=sky.Resources(
+            cloud=get_cloud(args.cloud),
+            accelerators=args.accelerator,
+        ),
+        cluster_name=args.cluster_name,
+        idle_minutes_to_autostop=10,
+        down_on_autostop=True,
+    )
+    # Everything that touches the cluster runs inside the try so a failure can leave it up for debugging.
+    try:
+        print("\n[1] Launching SkyPilot cluster...")
+        job_state = job.state()
+        # "mesh0" is the mesh name declared in `meshes` above; gpus_per_host sizes the "gpus" dimension
+        print("\n[2] Creating process mesh...")
+        proc_mesh = job_state.mesh0.spawn_procs({"gpus": args.gpus_per_host})
+        print(f"    Process mesh extent: {proc_mesh.extent}")
+
+        print("\n[3] Spawning DDP actors...")
+        ddp_actor = proc_mesh.spawn("ddp_actor", DDPActor)
+
+        print("\n[4] Setting up distributed environment...")
+        await setup_env_for_distributed(proc_mesh)
+        # Order matters: create the process group, run the step, then destroy the group.
+        print("\n[5] Running DDP example...")
+        await ddp_actor.setup.call()
+        await ddp_actor.demo_basic.call()
+        await ddp_actor.cleanup.call()
+
+        print("\n" + "=" * 60)
+        print("DDP example completed successfully!")
+        print("=" * 60)
+
+    except Exception as e:
+        print(f"\nERROR: {e}")
+        import traceback
+        traceback.print_exc()
+        print(f"\nNot cleaning up cluster for debugging...")
+        print(f"    Debug with: sky ssh {args.cluster_name}")
+        print(f"    Clean up: sky down {args.cluster_name}")
+        raise
+    else:  # success path only; on failure job.kill() is deliberately skipped
+        print("\n[6] Cleaning up SkyPilot cluster...")
+        job.kill()
+        print("    Done!")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+
diff --git a/examples/skypilot/skypilot_titan.py b/examples/skypilot/skypilot_titan.py
new file mode 100644
index 000000000..1f4930ba6
--- /dev/null
+++ b/examples/skypilot/skypilot_titan.py
@@ -0,0 +1,245 @@
+#!/usr/bin/env python3
+"""
+Monarch + TorchTitan Example with SkyPilot
+==========================================
+
+This script demonstrates running TorchTitan distributed training on cloud
+infrastructure provisioned by SkyPilot.
+
+Adapted from the SLURM TorchTitan example (slurm_titan.ipynb).
+
+Prerequisites:
+    - TorchTitan installed: pip install torchtitan
+    - Model config file (e.g., debug_model.toml)
+    - Tokenizer files in ./tokenizer/
+
+Usage:
+    python skypilot_titan.py --num-hosts 2 --gpus-per-host 1 --config debug_model.toml
+"""
+
+import argparse
+import asyncio
+import logging
+import os
+import sys
+from dataclasses import dataclass
+
+# Set timeouts before importing monarch
+os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s"
+os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s"
+os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s"
+
+# Check for TorchTitan
+try:
+    from torchtitan.train import Trainer
+    from torchtitan.config import ConfigManager, JobConfig
+    from torchtitan.tools.logging import init_logger, logger as titan_logger
+    HAS_TORCHTITAN = True
+except ImportError:
+    HAS_TORCHTITAN = False
+    print("WARNING: TorchTitan is not installed. Install with: pip install torchtitan")
+    print("This example will show the structure but cannot run training.")
+
+import torch
+from monarch.actor import Actor, current_rank, endpoint
+from monarch.utils import setup_env_for_distributed
+
+# Import SkyPilotJob from local module
+from skypilot_job import SkyPilotJob
+
+try:
+    import sky
+except ImportError:
+    print("ERROR: SkyPilot is not installed. Run: pip install skypilot[kubernetes]")
+    sys.exit(1)
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(name)s %(asctime)s %(levelname)s %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    force=True,
+)
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class RunParams:
+    """Parameters for training job."""
+    training_steps: int = 50
+    model_config: str = "debug_model.toml"
+    dataset: str = "c4"
+    num_nodes: int = 2
+    gpus_per_node: int = 1
+
+
+if HAS_TORCHTITAN:
+    class TrainerActor(Actor):
+        """A wrapper class that executes a TorchTitan trainer in a Monarch actor."""
+
+        def __init__(self, job_config: "JobConfig") -> None:
+            self.job_config = job_config
+            rank = current_rank().rank
+            self.uid = f"[trainer_{rank}]"
+
+        @endpoint
+        async def start_training(self) -> None:
+            init_logger()
+            trainer = None
+
+            try:
+                trainer = Trainer(self.job_config)
+                titan_logger.info(f"{self.uid} initialized successfully and starting training")
+                trainer.train()
+            except Exception:
+                if trainer:
+                    trainer.close()
+                raise
+            else:
+                trainer.close()
+            finally:
+                torch.distributed.destroy_process_group()
+                titan_logger.info(f"{self.uid} trainer cleaned up")
+
+
+def make_job_config(run_params: RunParams, script_dir: str) -> "JobConfig":
+    """Create a job config for TorchTitan."""
+    if not HAS_TORCHTITAN:
+        raise RuntimeError("TorchTitan is not installed")
+    
+    data_parallel_shard_degree = run_params.num_nodes * run_params.gpus_per_node
+    output_path = "./outputs"
+
+    default_args = [
+        "--job.config_file",
+        os.path.join(script_dir, run_params.model_config),
+        "--model.tokenizer_path",
+        os.path.join(script_dir, "tokenizer"),
+        "--comm.trace_buf_size",
+        "0",
+        "--metrics.log_freq",
+        "1",
+        "--parallelism.data_parallel_shard_degree",
+        str(data_parallel_shard_degree),
+        "--activation_checkpoint.mode",
+        "full",
+        "--comm.train_timeout_seconds",
+        "60",
+        "--training.steps",
+        str(run_params.training_steps),
+        "--training.dataset",
+        run_params.dataset,
+        "--job.dump_folder",
+        output_path,
+        "--metrics.enable_tensorboard",
+    ]
+
+    config_manager = ConfigManager()
+    job_config = config_manager.parse_args(default_args)
+
+    return job_config
+
+
+def get_cloud(cloud_name: str):
+    """Get SkyPilot cloud object from name."""
+    clouds = {
+        "kubernetes": sky.Kubernetes,
+        "aws": sky.AWS,
+        "gcp": sky.GCP,
+        "azure": sky.Azure,
+    }
+    if cloud_name.lower() not in clouds:
+        raise ValueError(f"Unknown cloud: {cloud_name}. Available: {list(clouds.keys())}")
+    return clouds[cloud_name.lower()]()
+
+
+async def main():
+    parser = argparse.ArgumentParser(description="Monarch + TorchTitan with SkyPilot")
+    parser.add_argument("--cloud", default="kubernetes", help="Cloud provider")
+    parser.add_argument("--num-hosts", type=int, default=2, help="Number of hosts")
+    parser.add_argument("--gpus-per-host", type=int, default=1, help="GPUs per host")
+    parser.add_argument("--cluster-name", default="monarch-titan", help="Cluster name")
+    parser.add_argument("--accelerator", default="H200:1", help="GPU accelerator")
+    parser.add_argument("--config", default="debug_model.toml", help="TorchTitan config file")
+    parser.add_argument("--steps", type=int, default=50, help="Training steps")
+    args = parser.parse_args()
+
+    if not HAS_TORCHTITAN:
+        print("ERROR: TorchTitan is required for this example.")
+        print("Install with: pip install torchtitan")
+        sys.exit(1)
+
+    print("=" * 60)
+    print("Monarch + TorchTitan with SkyPilot")
+    print("=" * 60)
+    print(f"\nConfiguration:")
+    print(f"  Cloud: {args.cloud}")
+    print(f"  Hosts: {args.num_hosts}")
+    print(f"  GPUs per host: {args.gpus_per_host}")
+    print(f"  Accelerator: {args.accelerator}")
+    print(f"  Config: {args.config}")
+    print(f"  Steps: {args.steps}")
+
+    # Setup run parameters
+    run_params = RunParams(
+        training_steps=args.steps,
+        model_config=args.config,
+        num_nodes=args.num_hosts,
+        gpus_per_node=args.gpus_per_host,
+    )
+
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    job_config = make_job_config(run_params, script_dir)
+
+    # Create SkyPilot job
+    job = SkyPilotJob(
+        meshes={"mesh0": args.num_hosts},
+        resources=sky.Resources(
+            cloud=get_cloud(args.cloud),
+            accelerators=args.accelerator,
+        ),
+        cluster_name=args.cluster_name,
+        idle_minutes_to_autostop=10,
+        down_on_autostop=True,
+    )
+
+    try:
+        print("\n[1] Launching SkyPilot cluster...")
+        job_state = job.state()
+        
+        print("\n[2] Creating process mesh...")
+        proc_mesh = job_state.mesh0.spawn_procs({"gpus": args.gpus_per_host})
+        print(f"    Process mesh extent: {proc_mesh.extent}")
+
+        print("\n[3] Configuring remote logging...")
+        await proc_mesh.logging_option(stream_to_client=True)
+
+        print("\n[4] Setting up distributed environment...")
+        await setup_env_for_distributed(proc_mesh)
+
+        print("\n[5] Spawning TrainerActor...")
+        trainer = proc_mesh.spawn("trainer_actor", TrainerActor, job_config)
+
+        print("\n[6] Starting training...")
+        await trainer.start_training.call()
+
+        print("\n" + "=" * 60)
+        print("Training completed successfully!")
+        print("=" * 60)
+
+    except Exception as e:
+        print(f"\nERROR: {e}")
+        import traceback
+        traceback.print_exc()
+        print(f"\nNot cleaning up cluster for debugging...")
+        print(f"    Debug with: sky ssh {args.cluster_name}")
+        print(f"    Clean up: sky down {args.cluster_name}")
+        raise
+    else:
+        print("\n[7] Cleaning up SkyPilot cluster...")
+        job.kill()
+        print("    Done!")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+

From 2132e3cbb190a19e438d8dd11334b80765bd3dd4 Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Wed, 10 Dec 2025 23:53:28 -0800
Subject: [PATCH 14/29] Update README.md

---
 examples/skypilot/README.md | 39 +++++++++----------------------------
 1 file changed, 9 insertions(+), 30 deletions(-)

diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md
index 153089ad1..e9c6a2ca9 100644
--- a/examples/skypilot/README.md
+++ b/examples/skypilot/README.md
@@ -8,8 +8,8 @@ This directory contains a standalone integration for running Monarch workloads o
 
 **Supported platforms:**
 - Kubernetes (any cluster)
-- AWS, GCP, Azure
-- Lambda Labs, CoreWeave, RunPod, and [20+ other clouds](https://docs.skypilot.co/en/latest/getting-started/installation.html)
+- Hyperscalers: AWS, GCP, Azure
+- Neoclouds: CoreWeave, Nebius, and [20+ other clouds](https://docs.skypilot.co/en/latest/getting-started/installation.html)
 
 ## Installation
 
@@ -27,6 +27,8 @@ pip install skypilot[all]         # For all clouds
 sky check
 ```
 
+TODO(romilb): Link to SkyPilot docs for k8s setup
+
 ## Quick Start
 
 ```python
@@ -149,45 +151,22 @@ Cluster 'monarch-skypilot-test' terminated
     Cluster terminated.
 ```
 
-## Configuration Options
-
-| Parameter | Description | Default |
-|-----------|-------------|---------|
-| `meshes` | Dict mapping mesh names to node counts | Required |
-| `resources` | SkyPilot Resources specification | None (SkyPilot defaults) |
-| `cluster_name` | Name for the cluster | Auto-generated |
-| `monarch_port` | Port for Monarch TCP communication | 22222 |
-| `idle_minutes_to_autostop` | Auto-stop after idle time | None |
-| `down_on_autostop` | Tear down on autostop vs just stop | False |
-| `setup_commands` | Custom setup script | Installs torchmonarch-nightly |
-| `workdir` | Local directory to sync to cluster | None |
-| `file_mounts` | Additional files to mount | None |
-
 ## Default Image
 
-By default, `SkyPilotJob` uses the `pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime` Docker image which has compatible system libraries for `torchmonarch-nightly`. Setup time is ~1-2 minutes (just pip install).
+By default, `SkyPilotJob` uses the `pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime` Docker image which has compatible system libraries for `torchmonarch-nightly`. TODO(romilb): mention image requirements.
 
-## Faster Cold Starts
+## Faster Cold Starts with SkyPilot's cluster reuse
 
-For faster cold starts (<30s):
-
-**Option 1: Use a pre-built Docker image**
-```python
-resources = sky.Resources(
-    image_id="docker:your-registry/monarch-image:tag",
-    accelerators="H100:1",
-)
-```
-
-**Option 2: Use SkyPilot's cluster reuse**
+TODO(romilb): Validate if this works:
 ```python
 job = SkyPilotJob(
     ...,
     idle_minutes_to_autostop=30,  # Keep cluster alive
-    down_on_autostop=False,       # Just stop, don't terminate
 )
 ```
 
+TODO(romilb): Benchmark pre-baked container images
+
 ## Network Requirements
 
 The client must have direct network connectivity to the worker nodes:

From 0b1e5fd947987c0748e815327c77983d5c95c4dd Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Thu, 11 Dec 2025 16:19:42 -0800
Subject: [PATCH 15/29] Clean up, add run_getting_started

---
 examples/skypilot/run_getting_started.yaml    |  80 ++++++
 examples/skypilot/skypilot_getting_started.py |  26 +-
 examples/skypilot/skypilot_job.py             |  41 ++-
 examples/skypilot/skypilot_titan.py           | 245 ------------------
 python/monarch/job/__init__.py                |   9 +-
 5 files changed, 115 insertions(+), 286 deletions(-)
 create mode 100644 examples/skypilot/run_getting_started.yaml
 delete mode 100644 examples/skypilot/skypilot_titan.py

diff --git a/examples/skypilot/run_getting_started.yaml b/examples/skypilot/run_getting_started.yaml
new file mode 100644
index 000000000..c42b6e7ca
--- /dev/null
+++ b/examples/skypilot/run_getting_started.yaml
@@ -0,0 +1,80 @@
+# SkyPilot YAML for running the Monarch Getting Started example.
+#
+# This YAML file syncs the example directory, installs dependencies,
+# and runs the getting started example.
+#
+# Usage:
+#   cd monarch/examples/skypilot
+#   sky launch run_getting_started.yaml -c monarch-demo
+#
+# To view logs:
+#   sky logs monarch-demo
+#
+# To SSH into the cluster:
+#   sky ssh monarch-demo
+#
+# To tear down:
+#   sky down monarch-demo
+
+name: monarch-getting-started
+
+resources:
+  cloud: kubernetes # Optional, remove or change to your preferred cloud provider
+  cpus: 2+ # No GPUs needed for the driver script
+  image_id: docker:pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime
+
+# Sync the current directory (examples/skypilot) to the cluster
+workdir: .
+
+setup: |
+  set -ex
+  
+  echo "=== Installing system dependencies ==="
+  # Install socat (required for SkyPilot Kubernetes portforward networking) and curl
+  apt-get update && apt-get install -y socat curl
+  
+  # Install kubectl for Kubernetes cluster management
+  echo "=== Installing kubectl ==="
+  curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
+  chmod +x kubectl
+  mv kubectl /usr/local/bin/
+  kubectl version --client || echo "kubectl installed"
+  
+  echo "=== Installing Python dependencies ==="  
+  uv pip install --system torchmonarch-nightly
+  # Install SkyPilot with Kubernetes support for launching nested clusters
+  uv pip install --system "skypilot[kubernetes]"
+  
+  # Verify installations
+  python -c "import monarch; print(f'Monarch installed: {monarch}')"
+  python -c "import sky; print(f'SkyPilot installed: {sky}')"
+  
+  # Configure SkyPilot to use in-cluster Kubernetes context
+  # This allows the driver pod to launch nested SkyPilot clusters
+  unset SKYPILOT_IN_CLUSTER_CONTEXT_NAME
+  sky api start
+  
+  # Verify Kubernetes access
+  echo "=== Verifying Kubernetes access ==="
+  sky check kubernetes
+
+  echo "=== GPUs available ==="
+  sky show-gpus --infra kubernetes
+  
+  echo "=== Setup complete ==="
+
+run: |  
+  echo "=== Running Monarch Getting Started with SkyPilot ==="  
+
+  # Run the getting started example
+  # This will launch a SkyPilot cluster with Monarch workers.
+  # Change the arguments to your desired values.
+  python skypilot_getting_started.py \
+    --cloud kubernetes \
+    --num-hosts 2 \
+    --gpus-per-host 1 \
+    --cluster-name monarch-workers \
+    --accelerator "H200:1"
+  
+  echo "=== Example ran successfully ==="
+
diff --git a/examples/skypilot/skypilot_getting_started.py b/examples/skypilot/skypilot_getting_started.py
index b9f703bee..814f4e6d0 100644
--- a/examples/skypilot/skypilot_getting_started.py
+++ b/examples/skypilot/skypilot_getting_started.py
@@ -10,16 +10,14 @@
     pip install torchmonarch-nightly
     pip install skypilot[kubernetes]  # or skypilot[aws], skypilot[gcp], etc.
     sky check  # Verify SkyPilot configuration
+    sky show-gpus --infra kubernetes  # Verify GPUs available
 
 Usage:
-    # Run on Kubernetes:
-    python getting_started.py --cloud kubernetes --num-hosts 2
+    # Run on Kubernetes with 2 nodes, 8 GPUs per node
+    python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-host 8 --accelerator "H200:8"
 
-    # Run on AWS:
-    python getting_started.py --cloud aws --num-hosts 2
-
-    # Run on GCP:
-    python getting_started.py --cloud gcp --num-hosts 2
+    # Run on cloud VMs
+    python skypilot_getting_started.py --cloud <aws/gcp/azure/...> --num-hosts 2 --gpus-per-host 1 --accelerator "H100:1"
 """
 
 import argparse
@@ -54,7 +52,7 @@
 from skypilot_job import SkyPilotJob
 
 # ============================================================================
-# Step 1: Define actors (same as getting started guide)
+# Step 1: Define actors
 # ============================================================================
 
 
@@ -88,7 +86,7 @@ def get_info(self) -> str:
 
 
 # ============================================================================
-# Step 2: Create a SkyPilot Job to provision cloud infrastructure
+# Step 2: Create a SkyPilot Job to provision k8s pods/cloud VMs
 # ============================================================================
 
 
@@ -99,7 +97,10 @@ def get_cloud(cloud_name: str):
         "aws": sky.AWS,
         "gcp": sky.GCP,
         "azure": sky.Azure,
-        "lambda": sky.Lambda,
+        "nebius": sky.Nebius,
+        # "slurm": sky.Slurm,
+        # "ssh": sky.SSH, 
+        # TODO(romilb): Add other clouds
     }
     if cloud_name.lower() not in clouds:
         raise ValueError(f"Unknown cloud: {cloud_name}. Available: {list(clouds.keys())}")
@@ -111,7 +112,7 @@ def main():
     parser.add_argument(
         "--cloud",
         default="kubernetes",
-        help="Cloud provider to use (kubernetes, aws, gcp, azure, lambda)",
+        help="Cloud provider to use (kubernetes, aws, gcp, azure, ssh)",
     )
     parser.add_argument(
         "--num-hosts",
@@ -165,12 +166,13 @@ def main():
     if args.region:
         resources_kwargs["region"] = args.region
     
+    # Create a SkyPilotJob to provision nodes
     job = SkyPilotJob(
         # Define the mesh of hosts we need
         meshes={"trainers": args.num_hosts},
         resources=sky.Resources(**resources_kwargs),
         cluster_name=args.cluster_name,
-        # Auto-cleanup after 10 minutes of idle time
+        # Auto-cleanup after 10 minutes of idle time (recommended for auto clean up if the job/controller fails)
         idle_minutes_to_autostop=10,
         down_on_autostop=True,
     )
diff --git a/examples/skypilot/skypilot_job.py b/examples/skypilot/skypilot_job.py
index 7b5ea1178..edb418da8 100644
--- a/examples/skypilot/skypilot_job.py
+++ b/examples/skypilot/skypilot_job.py
@@ -1,9 +1,7 @@
 """
-SkyPilot integration for Monarch - standalone implementation.
+SkyPilotJob for Monarch.
 
-This module provides SkyPilotJob, which allows running Monarch workloads on
-Kubernetes and cloud VMs via SkyPilot. It is designed to be used independently
-of the main Monarch source tree.
+SkyPilotJob allows running Monarch on Kubernetes and cloud VMs via SkyPilot. 
 
 Requirements:
     - pip install torchmonarch-nightly (or torchmonarch)
@@ -16,7 +14,6 @@
 import time
 from typing import Dict, List, Optional, TYPE_CHECKING
 
-# Import Monarch's job interface
 from monarch._src.job.job import JobState, JobTrait
 
 # If running inside a SkyPilot cluster, unset the in-cluster context variable
@@ -25,7 +22,6 @@
 if "SKYPILOT_IN_CLUSTER_CONTEXT_NAME" in os.environ:
     del os.environ["SKYPILOT_IN_CLUSTER_CONTEXT_NAME"]
 
-# Defer imports that may not be available in all environments
 if TYPE_CHECKING:
     import sky
 
@@ -46,21 +42,22 @@
 # Default port for Monarch TCP communication
 DEFAULT_MONARCH_PORT = 22222
 
+# Timeout for waiting for the job to reach RUNNING status.
+JOB_TIMEOUT = 300 # seconds
+
 # Default setup commands to install Monarch from PyPI on remote workers.
-# Requires a Docker image with Ubuntu 22.04+ for compatible libibverbs.
+# Requires a Docker image with Ubuntu 22.04+ with RDMA dependencies.
+# In this implementation, we default to the pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime image.
 #
-# Cold start time: ~1-2 minutes (pip install only).
 # For faster cold starts (<30s), use a custom Docker image with Monarch pre-installed.
 DEFAULT_SETUP_COMMANDS = """
 set -ex
 
 # Install torchmonarch from PyPI
-pip install torchmonarch-nightly
+uv pip install --system torchmonarch-nightly
 
 echo "Done installing Monarch"
 """
-
-# Default Docker image - PyTorch with CUDA on Ubuntu 22.04 (has compatible libibverbs)
 DEFAULT_IMAGE_ID = "docker:pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime"
 
 
@@ -81,16 +78,20 @@ def _attach_to_workers_wrapper(name: str, ca: str, workers: List[str]):
 
 class SkyPilotJob(JobTrait):
     """
-    A job scheduler that uses SkyPilot to provision cloud instances.
+    SkyPilotJob to provision and manage Monarch workers on K8s and cloud VMs.
 
-    SkyPilot supports multiple cloud providers (AWS, GCP, Azure, Lambda, etc.)
-    and Kubernetes, and can automatically select the cheapest available option.
+    SkyPilot supports multiple backends - Kubernetes and VMs on AWS, GCP, Azure,
+    CoreWeave, Nebius, and 20+ other clouds.
 
     This implementation:
     1. Uses sky.launch() to provision cloud instances with specified resources
     2. Runs Monarch workers on each node via a startup script
     3. Connects to workers using their IP addresses from the cluster handle
 
+    Caveats:
+      * For Kubernetes, the driver/client must be run inside the same cluster.
+        TODO(romilb): Explore if loadbalancer can be used to connect to workers.
+
     Example:
         >>> import sky
         >>> from skypilot_job import SkyPilotJob
@@ -198,12 +199,10 @@ def _create(self, client_script: Optional[str]) -> None:
         # Set resources, using default image_id if not specified
         resources = self._resources
         if resources is not None:
-            # If no image_id specified, use the default PyTorch image
             if resources.image_id is None:
                 resources = resources.copy(image_id=DEFAULT_IMAGE_ID)
             task.set_resources(resources)
         else:
-            # No resources specified, create default with image_id
             task.set_resources(sky.Resources(image_id=DEFAULT_IMAGE_ID))
 
         # Generate cluster name if not provided
@@ -229,9 +228,9 @@ def _create(self, client_script: Optional[str]) -> None:
         logger.info(f"SkyPilot cluster '{cluster_name}' launched successfully")
         
         # Wait for the job to be RUNNING (setup complete, run started)
-        self._wait_for_job_running(cluster_name, job_id, timeout=300)
+        self._wait_for_job_running(cluster_name, job_id, timeout=JOB_TIMEOUT)
     
-    def _wait_for_job_running(self, cluster_name: str, job_id: int, timeout: int = 300) -> None:
+    def _wait_for_job_running(self, cluster_name: str, job_id: int, timeout: int = JOB_TIMEOUT) -> None:
         """Wait for the SkyPilot job to reach RUNNING status (setup complete)."""
         start_time = time.time()
         poll_interval = 10  # seconds
@@ -301,9 +300,9 @@ def _build_worker_command(self) -> str:
         escaped_code = python_code.replace("'", "'\"'\"'")
         # Set timeout env vars
         env_vars = " ".join([
-            "export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT=5m",
-            "export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT=5m",
-            "export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE=5m",
+            f"export HYPERACTOR_HOST_SPAWN_READY_TIMEOUT={JOB_TIMEOUT}s",
+            f"export HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT={JOB_TIMEOUT}s",
+            f"export HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE={JOB_TIMEOUT}s",
         ])
         return f"{env_vars} && {self._python_exe} -c '{escaped_code}'"
 
diff --git a/examples/skypilot/skypilot_titan.py b/examples/skypilot/skypilot_titan.py
deleted file mode 100644
index 1f4930ba6..000000000
--- a/examples/skypilot/skypilot_titan.py
+++ /dev/null
@@ -1,245 +0,0 @@
-#!/usr/bin/env python3
-"""
-Monarch + TorchTitan Example with SkyPilot
-==========================================
-
-This script demonstrates running TorchTitan distributed training on cloud
-infrastructure provisioned by SkyPilot.
-
-Adapted from the SLURM TorchTitan example (slurm_titan.ipynb).
-
-Prerequisites:
-    - TorchTitan installed: pip install torchtitan
-    - Model config file (e.g., debug_model.toml)
-    - Tokenizer files in ./tokenizer/
-
-Usage:
-    python skypilot_titan.py --num-hosts 2 --gpus-per-host 1 --config debug_model.toml
-"""
-
-import argparse
-import asyncio
-import logging
-import os
-import sys
-from dataclasses import dataclass
-
-# Set timeouts before importing monarch
-os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s"
-os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s"
-os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s"
-
-# Check for TorchTitan
-try:
-    from torchtitan.train import Trainer
-    from torchtitan.config import ConfigManager, JobConfig
-    from torchtitan.tools.logging import init_logger, logger as titan_logger
-    HAS_TORCHTITAN = True
-except ImportError:
-    HAS_TORCHTITAN = False
-    print("WARNING: TorchTitan is not installed. Install with: pip install torchtitan")
-    print("This example will show the structure but cannot run training.")
-
-import torch
-from monarch.actor import Actor, current_rank, endpoint
-from monarch.utils import setup_env_for_distributed
-
-# Import SkyPilotJob from local module
-from skypilot_job import SkyPilotJob
-
-try:
-    import sky
-except ImportError:
-    print("ERROR: SkyPilot is not installed. Run: pip install skypilot[kubernetes]")
-    sys.exit(1)
-
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(name)s %(asctime)s %(levelname)s %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S",
-    force=True,
-)
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class RunParams:
-    """Parameters for training job."""
-    training_steps: int = 50
-    model_config: str = "debug_model.toml"
-    dataset: str = "c4"
-    num_nodes: int = 2
-    gpus_per_node: int = 1
-
-
-if HAS_TORCHTITAN:
-    class TrainerActor(Actor):
-        """A wrapper class that executes a TorchTitan trainer in a Monarch actor."""
-
-        def __init__(self, job_config: "JobConfig") -> None:
-            self.job_config = job_config
-            rank = current_rank().rank
-            self.uid = f"[trainer_{rank}]"
-
-        @endpoint
-        async def start_training(self) -> None:
-            init_logger()
-            trainer = None
-
-            try:
-                trainer = Trainer(self.job_config)
-                titan_logger.info(f"{self.uid} initialized successfully and starting training")
-                trainer.train()
-            except Exception:
-                if trainer:
-                    trainer.close()
-                raise
-            else:
-                trainer.close()
-            finally:
-                torch.distributed.destroy_process_group()
-                titan_logger.info(f"{self.uid} trainer cleaned up")
-
-
-def make_job_config(run_params: RunParams, script_dir: str) -> "JobConfig":
-    """Create a job config for TorchTitan."""
-    if not HAS_TORCHTITAN:
-        raise RuntimeError("TorchTitan is not installed")
-    
-    data_parallel_shard_degree = run_params.num_nodes * run_params.gpus_per_node
-    output_path = "./outputs"
-
-    default_args = [
-        "--job.config_file",
-        os.path.join(script_dir, run_params.model_config),
-        "--model.tokenizer_path",
-        os.path.join(script_dir, "tokenizer"),
-        "--comm.trace_buf_size",
-        "0",
-        "--metrics.log_freq",
-        "1",
-        "--parallelism.data_parallel_shard_degree",
-        str(data_parallel_shard_degree),
-        "--activation_checkpoint.mode",
-        "full",
-        "--comm.train_timeout_seconds",
-        "60",
-        "--training.steps",
-        str(run_params.training_steps),
-        "--training.dataset",
-        run_params.dataset,
-        "--job.dump_folder",
-        output_path,
-        "--metrics.enable_tensorboard",
-    ]
-
-    config_manager = ConfigManager()
-    job_config = config_manager.parse_args(default_args)
-
-    return job_config
-
-
-def get_cloud(cloud_name: str):
-    """Get SkyPilot cloud object from name."""
-    clouds = {
-        "kubernetes": sky.Kubernetes,
-        "aws": sky.AWS,
-        "gcp": sky.GCP,
-        "azure": sky.Azure,
-    }
-    if cloud_name.lower() not in clouds:
-        raise ValueError(f"Unknown cloud: {cloud_name}. Available: {list(clouds.keys())}")
-    return clouds[cloud_name.lower()]()
-
-
-async def main():
-    parser = argparse.ArgumentParser(description="Monarch + TorchTitan with SkyPilot")
-    parser.add_argument("--cloud", default="kubernetes", help="Cloud provider")
-    parser.add_argument("--num-hosts", type=int, default=2, help="Number of hosts")
-    parser.add_argument("--gpus-per-host", type=int, default=1, help="GPUs per host")
-    parser.add_argument("--cluster-name", default="monarch-titan", help="Cluster name")
-    parser.add_argument("--accelerator", default="H200:1", help="GPU accelerator")
-    parser.add_argument("--config", default="debug_model.toml", help="TorchTitan config file")
-    parser.add_argument("--steps", type=int, default=50, help="Training steps")
-    args = parser.parse_args()
-
-    if not HAS_TORCHTITAN:
-        print("ERROR: TorchTitan is required for this example.")
-        print("Install with: pip install torchtitan")
-        sys.exit(1)
-
-    print("=" * 60)
-    print("Monarch + TorchTitan with SkyPilot")
-    print("=" * 60)
-    print(f"\nConfiguration:")
-    print(f"  Cloud: {args.cloud}")
-    print(f"  Hosts: {args.num_hosts}")
-    print(f"  GPUs per host: {args.gpus_per_host}")
-    print(f"  Accelerator: {args.accelerator}")
-    print(f"  Config: {args.config}")
-    print(f"  Steps: {args.steps}")
-
-    # Setup run parameters
-    run_params = RunParams(
-        training_steps=args.steps,
-        model_config=args.config,
-        num_nodes=args.num_hosts,
-        gpus_per_node=args.gpus_per_host,
-    )
-
-    script_dir = os.path.dirname(os.path.abspath(__file__))
-    job_config = make_job_config(run_params, script_dir)
-
-    # Create SkyPilot job
-    job = SkyPilotJob(
-        meshes={"mesh0": args.num_hosts},
-        resources=sky.Resources(
-            cloud=get_cloud(args.cloud),
-            accelerators=args.accelerator,
-        ),
-        cluster_name=args.cluster_name,
-        idle_minutes_to_autostop=10,
-        down_on_autostop=True,
-    )
-
-    try:
-        print("\n[1] Launching SkyPilot cluster...")
-        job_state = job.state()
-        
-        print("\n[2] Creating process mesh...")
-        proc_mesh = job_state.mesh0.spawn_procs({"gpus": args.gpus_per_host})
-        print(f"    Process mesh extent: {proc_mesh.extent}")
-
-        print("\n[3] Configuring remote logging...")
-        await proc_mesh.logging_option(stream_to_client=True)
-
-        print("\n[4] Setting up distributed environment...")
-        await setup_env_for_distributed(proc_mesh)
-
-        print("\n[5] Spawning TrainerActor...")
-        trainer = proc_mesh.spawn("trainer_actor", TrainerActor, job_config)
-
-        print("\n[6] Starting training...")
-        await trainer.start_training.call()
-
-        print("\n" + "=" * 60)
-        print("Training completed successfully!")
-        print("=" * 60)
-
-    except Exception as e:
-        print(f"\nERROR: {e}")
-        import traceback
-        traceback.print_exc()
-        print(f"\nNot cleaning up cluster for debugging...")
-        print(f"    Debug with: sky ssh {args.cluster_name}")
-        print(f"    Clean up: sky down {args.cluster_name}")
-        raise
-    else:
-        print("\n[7] Cleaning up SkyPilot cluster...")
-        job.kill()
-        print("    Done!")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
-
diff --git a/python/monarch/job/__init__.py b/python/monarch/job/__init__.py
index 0f6ec1960..b6852a0a1 100644
--- a/python/monarch/job/__init__.py
+++ b/python/monarch/job/__init__.py
@@ -9,11 +9,4 @@
 from monarch._src.job.slurm import SlurmJob
 
 # Define exports
-__all__ = [
-    "JobTrait",
-    "job_load",
-    "job_loads",
-    "JobState",
-    "LocalJob",
-    "SlurmJob",
-]
+__all__ = ["JobTrait", "job_load", "job_loads", "JobState", "LocalJob", "SlurmJob"]

From 3cda869a354bf42f11d8d51a13726a1d07d3741c Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Thu, 11 Dec 2025 16:22:40 -0800
Subject: [PATCH 16/29] renaming

---
 examples/skypilot/README.md                                     | 2 +-
 .../{run_getting_started.yaml => getting_started.sky.yaml}      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename examples/skypilot/{run_getting_started.yaml => getting_started.sky.yaml} (97%)

diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md
index e9c6a2ca9..cbda51b08 100644
--- a/examples/skypilot/README.md
+++ b/examples/skypilot/README.md
@@ -1,4 +1,4 @@
-# Monarch SkyPilot Integration
+# Running Monarch on Kubernetes and cloud VMs via SkyPilot
 
 This directory contains a standalone integration for running Monarch workloads on **Kubernetes and cloud VMs** via [SkyPilot](https://github.com/skypilot-org/skypilot).
 
diff --git a/examples/skypilot/run_getting_started.yaml b/examples/skypilot/getting_started.sky.yaml
similarity index 97%
rename from examples/skypilot/run_getting_started.yaml
rename to examples/skypilot/getting_started.sky.yaml
index c42b6e7ca..0398cc873 100644
--- a/examples/skypilot/run_getting_started.yaml
+++ b/examples/skypilot/getting_started.sky.yaml
@@ -5,7 +5,7 @@
 #
 # Usage:
 #   cd monarch/examples/skypilot
-#   sky launch run_getting_started.yaml -c monarch-demo
+#   sky launch getting_started.sky.yaml -c monarch-demo
 #
 # To view logs:
 #   sky logs monarch-demo

From 32ee2d3d028f1ba8bf0f9b3316c4b24dc24d21d6 Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Thu, 11 Dec 2025 16:41:15 -0800
Subject: [PATCH 17/29] Add DDP notebook

---
 ....yaml => monarch_getting_started.sky.yaml} |   0
 examples/skypilot/skypilot_ddp.ipynb          | 306 ++++++++++++++++++
 examples/skypilot/skypilot_ddp.py             | 200 ------------
 3 files changed, 306 insertions(+), 200 deletions(-)
 rename examples/skypilot/{getting_started.sky.yaml => monarch_getting_started.sky.yaml} (100%)
 create mode 100644 examples/skypilot/skypilot_ddp.ipynb
 delete mode 100644 examples/skypilot/skypilot_ddp.py

diff --git a/examples/skypilot/getting_started.sky.yaml b/examples/skypilot/monarch_getting_started.sky.yaml
similarity index 100%
rename from examples/skypilot/getting_started.sky.yaml
rename to examples/skypilot/monarch_getting_started.sky.yaml
diff --git a/examples/skypilot/skypilot_ddp.ipynb b/examples/skypilot/skypilot_ddp.ipynb
new file mode 100644
index 000000000..b309de8d5
--- /dev/null
+++ b/examples/skypilot/skypilot_ddp.ipynb
@@ -0,0 +1,306 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Monarch DDP Example with SkyPilot\n",
+        "\n",
+        "This notebook demonstrates running PyTorch DDP (DistributedDataParallel) training on cloud infrastructure provisioned by SkyPilot.\n",
+        "\n",
+        "Adapted from the SLURM DDP example (`slurm_ddp.ipynb`).\n",
+        "\n",
+        "## Prerequisites\n",
+        "\n",
+        "```bash\n",
+        "pip install torchmonarch-nightly\n",
+        "pip install skypilot[kubernetes]  # or skypilot[aws], skypilot[gcp], etc.\n",
+        "sky check  # Verify SkyPilot configuration\n",
+        "```\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Imports and Setup\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "\n",
+        "# Set timeouts before importing monarch\n",
+        "os.environ[\"HYPERACTOR_HOST_SPAWN_READY_TIMEOUT\"] = \"300s\"\n",
+        "os.environ[\"HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT\"] = \"300s\"\n",
+        "os.environ[\"HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE\"] = \"300s\"\n",
+        "\n",
+        "import torch\n",
+        "import torch.distributed as dist\n",
+        "import torch.nn as nn\n",
+        "import torch.optim as optim\n",
+        "\n",
+        "import sky\n",
+        "from monarch.actor import Actor, current_rank, endpoint\n",
+        "from monarch.utils import setup_env_for_distributed\n",
+        "from torch.nn.parallel import DistributedDataParallel as DDP\n",
+        "\n",
+        "# Import SkyPilotJob from local module\n",
+        "from skypilot_job import SkyPilotJob\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Define the Model and DDP Actor\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "class ToyModel(nn.Module):\n",
+        "    \"\"\"A simple toy model for demonstration purposes.\"\"\"\n",
+        "\n",
+        "    def __init__(self):\n",
+        "        super(ToyModel, self).__init__()\n",
+        "        self.net1 = nn.Linear(10, 10)\n",
+        "        self.relu = nn.ReLU()\n",
+        "        self.net2 = nn.Linear(10, 5)\n",
+        "\n",
+        "    def forward(self, x):\n",
+        "        return self.net2(self.relu(self.net1(x)))\n",
+        "\n",
+        "\n",
+        "class DDPActor(Actor):\n",
+        "    \"\"\"This Actor wraps the basic functionality from Torch's DDP example.\n",
+        "\n",
+        "    Adapted from: https://docs.pytorch.org/tutorials/intermediate/ddp_tutorial.html#basic-use-case\n",
+        "    \"\"\"\n",
+        "\n",
+        "    def __init__(self):\n",
+        "        self.rank = current_rank().rank\n",
+        "\n",
+        "    @endpoint\n",
+        "    async def setup(self) -> str:\n",
+        "        \"\"\"Initialize the PyTorch distributed process group.\"\"\"\n",
+        "        WORLD_SIZE = int(os.environ[\"WORLD_SIZE\"])\n",
+        "        dist.init_process_group(\"gloo\", rank=self.rank, world_size=WORLD_SIZE)\n",
+        "        return f\"Rank {self.rank}: Initialized distributed (world_size={WORLD_SIZE})\"\n",
+        "\n",
+        "    @endpoint\n",
+        "    async def cleanup(self) -> str:\n",
+        "        \"\"\"Clean up the PyTorch distributed process group.\"\"\"\n",
+        "        dist.destroy_process_group()\n",
+        "        return f\"Rank {self.rank}: Cleaned up distributed\"\n",
+        "\n",
+        "    @endpoint\n",
+        "    async def demo_basic(self) -> str:\n",
+        "        \"\"\"Run a basic DDP training example.\"\"\"\n",
+        "        local_rank = int(os.environ[\"LOCAL_RANK\"])\n",
+        "        model = ToyModel().to(local_rank)\n",
+        "        ddp_model = DDP(model, device_ids=[local_rank])\n",
+        "\n",
+        "        loss_fn = nn.MSELoss()\n",
+        "        optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)\n",
+        "\n",
+        "        optimizer.zero_grad()\n",
+        "        outputs = ddp_model(torch.randn(20, 10))\n",
+        "        labels = torch.randn(20, 5).to(local_rank)\n",
+        "        loss = loss_fn(outputs, labels)\n",
+        "        loss.backward()\n",
+        "        optimizer.step()\n",
+        "\n",
+        "        return f\"Rank {self.rank}: Training step complete (loss={loss.item():.4f})\"\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Configuration\n",
+        "\n",
+        "Configure your cloud provider, cluster size, and GPU type below:\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Configuration - modify these values as needed\n",
+        "CLOUD = \"kubernetes\"  # Options: kubernetes, aws, gcp, azure\n",
+        "NUM_HOSTS = 2\n",
+        "GPUS_PER_HOST = 1\n",
+        "CLUSTER_NAME = \"monarch-ddp\"\n",
+        "ACCELERATOR = \"H200:1\"  # e.g., H100:1, A100:1, V100:1\n",
+        "\n",
+        "def get_cloud(cloud_name: str):\n",
+        "    \"\"\"Get SkyPilot cloud object from name.\"\"\"\n",
+        "    clouds = {\n",
+        "        \"kubernetes\": sky.Kubernetes,\n",
+        "        \"aws\": sky.AWS,\n",
+        "        \"gcp\": sky.GCP,\n",
+        "        \"azure\": sky.Azure,\n",
+        "    }\n",
+        "    if cloud_name.lower() not in clouds:\n",
+        "        raise ValueError(f\"Unknown cloud: {cloud_name}. Available: {list(clouds.keys())}\")\n",
+        "    return clouds[cloud_name.lower()]()\n",
+        "\n",
+        "print(f\"Configuration:\")\n",
+        "print(f\"  Cloud: {CLOUD}\")\n",
+        "print(f\"  Hosts: {NUM_HOSTS}\")\n",
+        "print(f\"  GPUs per host: {GPUS_PER_HOST}\")\n",
+        "print(f\"  Accelerator: {ACCELERATOR}\")\n",
+        "print(f\"  Cluster name: {CLUSTER_NAME}\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Create SkyPilot Job\n",
+        "\n",
+        "Create a SkyPilot job to provision cloud instances:\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "job = SkyPilotJob(\n",
+        "    meshes={\"mesh0\": NUM_HOSTS},\n",
+        "    resources=sky.Resources(\n",
+        "        cloud=get_cloud(CLOUD),\n",
+        "        accelerators=ACCELERATOR,\n",
+        "    ),\n",
+        "    cluster_name=CLUSTER_NAME,\n",
+        "    idle_minutes_to_autostop=10,\n",
+        "    down_on_autostop=True,\n",
+        ")\n",
+        "\n",
+        "print(f\"SkyPilot job created for cluster '{CLUSTER_NAME}'\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Launch Cluster and Create Process Mesh\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Launch the cluster and get the job state\n",
+        "print(\"Launching SkyPilot cluster...\")\n",
+        "job_state = job.state()\n",
+        "\n",
+        "# Create process mesh with GPUs\n",
+        "print(\"Creating process mesh...\")\n",
+        "proc_mesh = job_state.mesh0.spawn_procs({\"gpus\": GPUS_PER_HOST})\n",
+        "print(f\"Process mesh extent: {proc_mesh.extent}\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Spawn DDP Actors and Run Training\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Spawn DDP actors on the process mesh\n",
+        "print(\"Spawning DDP actors...\")\n",
+        "ddp_actor = proc_mesh.spawn(\"ddp_actor\", DDPActor)\n",
+        "\n",
+        "# Set up the distributed environment\n",
+        "print(\"Setting up distributed environment...\")\n",
+        "await setup_env_for_distributed(proc_mesh)\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Run the DDP example\n",
+        "print(\"Running DDP training...\\n\")\n",
+        "\n",
+        "# Initialize distributed process group\n",
+        "print(\"[1] Initializing distributed process group...\")\n",
+        "results = await ddp_actor.setup.call()\n",
+        "for coord, msg in results:\n",
+        "    print(f\"    {msg}\")\n",
+        "\n",
+        "# Run the basic DDP training example\n",
+        "print(\"\\n[2] Running DDP training step...\")\n",
+        "results = await ddp_actor.demo_basic.call()\n",
+        "for coord, msg in results:\n",
+        "    print(f\"    {msg}\")\n",
+        "\n",
+        "# Clean up distributed process group\n",
+        "print(\"\\n[3] Cleaning up distributed process group...\")\n",
+        "results = await ddp_actor.cleanup.call()\n",
+        "for coord, msg in results:\n",
+        "    print(f\"    {msg}\")\n",
+        "\n",
+        "print(\"\\n\" + \"=\" * 60)\n",
+        "print(\"DDP example completed successfully!\")\n",
+        "print(\"=\" * 60)\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Cleanup\n",
+        "\n",
+        "Tear down the SkyPilot cluster when done:\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Tear down the SkyPilot cluster\n",
+        "print(\"Cleaning up SkyPilot cluster...\")\n",
+        "job.kill()\n",
+        "print(\"Done!\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": []
+    }
+  ],
+  "metadata": {
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+}
diff --git a/examples/skypilot/skypilot_ddp.py b/examples/skypilot/skypilot_ddp.py
deleted file mode 100644
index 9b9657428..000000000
--- a/examples/skypilot/skypilot_ddp.py
+++ /dev/null
@@ -1,200 +0,0 @@
-#!/usr/bin/env python3
-"""
-Monarch DDP Example with SkyPilot
-=================================
-
-This script demonstrates running PyTorch DDP (DistributedDataParallel) training
-on cloud infrastructure provisioned by SkyPilot.
-
-Adapted from the SLURM DDP example (slurm_ddp.ipynb).
-
-Usage:
-    python skypilot_ddp.py --num-hosts 2 --gpus-per-host 1
-"""
-
-import argparse
-import asyncio
-import logging
-import os
-import sys
-
-# Set timeouts before importing monarch
-os.environ["HYPERACTOR_HOST_SPAWN_READY_TIMEOUT"] = "300s"
-os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT"] = "300s"
-os.environ["HYPERACTOR_MESH_PROC_SPAWN_MAX_IDLE"] = "300s"
-
-import torch
-import torch.distributed as dist
-import torch.nn as nn
-import torch.optim as optim
-
-from monarch.actor import Actor, current_rank, endpoint
-from monarch.utils import setup_env_for_distributed
-from torch.nn.parallel import DistributedDataParallel as DDP
-
-# Import SkyPilotJob from local module
-from skypilot_job import SkyPilotJob
-
-try:
-    import sky
-except ImportError:
-    print("ERROR: SkyPilot is not installed. Run: pip install skypilot[kubernetes]")
-    sys.exit(1)
-
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(name)s %(asctime)s %(levelname)s %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S",
-    force=True,
-)
-logger = logging.getLogger(__name__)
-
-
-class ToyModel(nn.Module):
-    """A simple toy model for demonstration purposes."""
-
-    def __init__(self):
-        super(ToyModel, self).__init__()
-        self.net1 = nn.Linear(10, 10)
-        self.relu = nn.ReLU()
-        self.net2 = nn.Linear(10, 5)
-
-    def forward(self, x):
-        return self.net2(self.relu(self.net1(x)))
-
-
-class DDPActor(Actor):
-    """This Actor wraps the basic functionality from Torch's DDP example.
-
-    Adapted from: https://docs.pytorch.org/tutorials/intermediate/ddp_tutorial.html#basic-use-case
-    """
-
-    def __init__(self):
-        self.rank = current_rank().rank
-
-    def _rprint(self, msg):
-        """Helper method to print with rank information."""
-        print(f"{self.rank=} {msg}")
-
-    @endpoint
-    async def setup(self):
-        """Initialize the PyTorch distributed process group."""
-        self._rprint("Initializing torch distributed")
-
-        WORLD_SIZE = int(os.environ["WORLD_SIZE"])
-        # initialize the process group
-        dist.init_process_group("gloo", rank=self.rank, world_size=WORLD_SIZE)
-        self._rprint("Finished initializing torch distributed")
-
-    @endpoint
-    async def cleanup(self):
-        """Clean up the PyTorch distributed process group."""
-        self._rprint("Cleaning up torch distributed")
-        dist.destroy_process_group()
-
-    @endpoint
-    async def demo_basic(self):
-        """Run a basic DDP training example."""
-        self._rprint("Running basic DDP example")
-
-        # create model and move it to GPU with id rank
-        local_rank = int(os.environ["LOCAL_RANK"])
-        self._rprint(f"{local_rank=}")
-        model = ToyModel().to(local_rank)
-        ddp_model = DDP(model, device_ids=[local_rank])
-
-        loss_fn = nn.MSELoss()
-        optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
-
-        optimizer.zero_grad()
-        outputs = ddp_model(torch.randn(20, 10))
-        labels = torch.randn(20, 5).to(local_rank)
-        loss_fn(outputs, labels).backward()
-        optimizer.step()
-
-        print(f"{self.rank=} Finished running basic DDP example")
-
-
-def get_cloud(cloud_name: str):
-    """Get SkyPilot cloud object from name."""
-    clouds = {
-        "kubernetes": sky.Kubernetes,
-        "aws": sky.AWS,
-        "gcp": sky.GCP,
-        "azure": sky.Azure,
-    }
-    if cloud_name.lower() not in clouds:
-        raise ValueError(f"Unknown cloud: {cloud_name}. Available: {list(clouds.keys())}")
-    return clouds[cloud_name.lower()]()
-
-
-async def main():
-    parser = argparse.ArgumentParser(description="Monarch DDP with SkyPilot")
-    parser.add_argument("--cloud", default="kubernetes", help="Cloud provider")
-    parser.add_argument("--num-hosts", type=int, default=2, help="Number of hosts")
-    parser.add_argument("--gpus-per-host", type=int, default=1, help="GPUs per host")
-    parser.add_argument("--cluster-name", default="monarch-ddp", help="Cluster name")
-    parser.add_argument("--accelerator", default="H200:1", help="GPU accelerator")
-    args = parser.parse_args()
-
-    print("=" * 60)
-    print("Monarch DDP Example with SkyPilot")
-    print("=" * 60)
-    print(f"\nConfiguration:")
-    print(f"  Cloud: {args.cloud}")
-    print(f"  Hosts: {args.num_hosts}")
-    print(f"  GPUs per host: {args.gpus_per_host}")
-    print(f"  Accelerator: {args.accelerator}")
-
-    # Create SkyPilot job
-    job = SkyPilotJob(
-        meshes={"mesh0": args.num_hosts},
-        resources=sky.Resources(
-            cloud=get_cloud(args.cloud),
-            accelerators=args.accelerator,
-        ),
-        cluster_name=args.cluster_name,
-        idle_minutes_to_autostop=10,
-        down_on_autostop=True,
-    )
-
-    try:
-        print("\n[1] Launching SkyPilot cluster...")
-        job_state = job.state()
-        
-        print("\n[2] Creating process mesh...")
-        proc_mesh = job_state.mesh0.spawn_procs({"gpus": args.gpus_per_host})
-        print(f"    Process mesh extent: {proc_mesh.extent}")
-
-        print("\n[3] Spawning DDP actors...")
-        ddp_actor = proc_mesh.spawn("ddp_actor", DDPActor)
-
-        print("\n[4] Setting up distributed environment...")
-        await setup_env_for_distributed(proc_mesh)
-
-        print("\n[5] Running DDP example...")
-        await ddp_actor.setup.call()
-        await ddp_actor.demo_basic.call()
-        await ddp_actor.cleanup.call()
-
-        print("\n" + "=" * 60)
-        print("DDP example completed successfully!")
-        print("=" * 60)
-
-    except Exception as e:
-        print(f"\nERROR: {e}")
-        import traceback
-        traceback.print_exc()
-        print(f"\nNot cleaning up cluster for debugging...")
-        print(f"    Debug with: sky ssh {args.cluster_name}")
-        print(f"    Clean up: sky down {args.cluster_name}")
-        raise
-    else:
-        print("\n[6] Cleaning up SkyPilot cluster...")
-        job.kill()
-        print("    Done!")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
-

From ca7014ae4bf1a318cbbaf73939f4da8460c85983 Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Thu, 11 Dec 2025 17:10:20 -0800
Subject: [PATCH 18/29] Readme updates

---
 examples/skypilot/README.md | 143 +++++++++++++++++++-----------------
 1 file changed, 77 insertions(+), 66 deletions(-)

diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md
index cbda51b08..db2fa9a39 100644
--- a/examples/skypilot/README.md
+++ b/examples/skypilot/README.md
@@ -11,12 +11,13 @@ This directory contains a standalone integration for running Monarch workloads o
 - Hyperscalers: AWS, GCP, Azure
 - Neoclouds: CoreWeave, Nebius, and [20+ other clouds](https://docs.skypilot.co/en/latest/getting-started/installation.html)
 
-## Installation
+## Quickstart
 
-```bash
-# Install Monarch
-pip install torchmonarch-nightly
+Prerequisites: Install SkyPilot and verify GPUs are available.
+<details>
+<summary><strong>SkyPilot Installation</strong></summary>
 
+```bash
 # Install SkyPilot with your preferred backend
 pip install skypilot[kubernetes]  # For Kubernetes
 pip install skypilot[aws]         # For AWS
@@ -25,69 +26,32 @@ pip install skypilot[all]         # For all clouds
 
 # Verify SkyPilot setup
 sky check
-```
-
-TODO(romilb): Link to SkyPilot docs for k8s setup
-
-## Quick Start
-
-```python
-import sky
-from skypilot_job import SkyPilotJob
-from monarch.actor import Actor, endpoint
 
-class MyActor(Actor):
-    @endpoint
-    def hello(self) -> str:
-        return "Hello from the cloud!"
-
-# Create a SkyPilot job with 2 nodes
-job = SkyPilotJob(
-    meshes={"workers": 2},
-    resources=sky.Resources(
-        cloud=sky.Kubernetes(),  # or sky.AWS(), sky.GCP(), etc.
-        accelerators="H100:1",
-    ),
-    cluster_name="my-monarch-cluster",
-    idle_minutes_to_autostop=10,
-    down_on_autostop=True,
-)
-
-# Launch and connect
-state = job.state()
-hosts = state.workers
+# Verify GPUs available
+sky show-gpus --infra kubernetes
+```
 
-# Spawn processes and actors
-procs = hosts.spawn_procs(per_host={"gpus": 1})
-actors = procs.spawn("my_actors", MyActor)
+For more details, see the [SkyPilot documentation](https://docs.skypilot.co/en/latest/getting-started/installation.html).
 
-# Use your actors
-results = actors.hello.call().get()
-print(results)  # ["Hello from the cloud!", "Hello from the cloud!"]
+</details>
 
-# Clean up
-job.kill()
-```
 
-## Running the Example
+Run this command from your local machine to run the getting started example:
 
 ```bash
-cd examples/skypilot
-
-# Run on Kubernetes
-python getting_started.py --cloud kubernetes --num-hosts 2
+sky launch monarch_getting_started.sky.yaml -c monarch-demo
+```
 
-# Run on AWS
-python getting_started.py --cloud aws --num-hosts 2 --accelerator "A100:1"
+SkyPilot will:
+1. Launch a Kubernetes pod
+2. Install dependencies
+3. Sync the example directory with the pod
+4. Run `skypilot_getting_started.py` in the pod and stream the logs
 
-# Run on GCP
-python getting_started.py --cloud gcp --num-hosts 2 --accelerator "A100:1"
-```
+<details>
+<summary><strong>Example Output</strong></summary>
 
-Example output:
 ```
-$ python skypilot_getting_started.py --num-hosts 2 --gpus-per-host 1 --cluster-name monarch-skypilot-test
-
 ============================================================
 Monarch Getting Started with SkyPilot
 ============================================================
@@ -151,29 +115,76 @@ Cluster 'monarch-skypilot-test' terminated
     Cluster terminated.
 ```
 
-## Default Image
+</details>
+
 
-By default, `SkyPilotJob` uses the `pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime` Docker image which has compatible system libraries for `torchmonarch-nightly`. TODO(romilb): mention image requirements.
+<details>
+<summary><strong>Running from within the Kubernetes cluster</strong></summary>
 
-## Faster Cold Starts with SkyPilot's cluster reuse
+If you are already in the Kubernetes cluster you'd like to run workers on, you can directly run `skypilot_getting_started.py`.
+
+```bash
+python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-host 8 --gpus "H200:8"
+```
+
+</details>
+
+## SkyPilotJob Class
+
+SkyPilotJob allows you to run Monarch on Kubernetes and cloud VMs via SkyPilot.
+
+Example usage:
 
-TODO(romilb): Validate if this works:
 ```python
+import sky
+from skypilot_job import SkyPilotJob
+from monarch.actor import Actor, endpoint
+
+class MyActor(Actor):
+    @endpoint
+    def hello(self) -> str:
+        return "Hello from the cloud!"
+
+# Create a SkyPilot job with 2 nodes
 job = SkyPilotJob(
-    ...,
-    idle_minutes_to_autostop=30,  # Keep cluster alive
+    meshes={"workers": 2},
+    resources=sky.Resources(
+        cloud=sky.Kubernetes(),  # or sky.AWS(), sky.GCP(), etc.
+        accelerators="H100:1",
+    ),
+    cluster_name="my-monarch-cluster",
+    idle_minutes_to_autostop=10,
+    down_on_autostop=True,
 )
-```
 
-TODO(romilb): Benchmark pre-baked container images
+# Launch and connect
+state = job.state()
+hosts = state.workers
+
+# Spawn processes and actors
+procs = hosts.spawn_procs(per_host={"gpus": 1})
+actors = procs.spawn("my_actors", MyActor)
+
+# Use your actors
+results = actors.hello.call().get()
+print(results)  # ["Hello from the cloud!", "Hello from the cloud!"]
 
-## Network Requirements
+# Clean up
+job.kill()
+```
+
+### Network Requirements
 
 The client must have direct network connectivity to the worker nodes:
 - **Kubernetes**: Run the client inside the same cluster (e.g., in a pod)
 - **Cloud VMs**: Ensure security groups allow inbound traffic on port 22222
 
-## Troubleshooting
+
+### Default Image
+
+By default, `SkyPilotJob` uses the `pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime` Docker image which has compatible system libraries for `torchmonarch-nightly`.
+
+## Troubleshooting tips
 
 **Check SkyPilot setup:**
 ```bash

From ffe74f55651c2e777839e885aa32f6e75417bbf6 Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Thu, 11 Dec 2025 19:49:12 -0800
Subject: [PATCH 19/29] Updates

---
 examples/skypilot/README.md                   | 74 ++++++++++++++++++-
 examples/skypilot/skypilot_getting_started.py |  4 +-
 examples/skypilot/skypilot_job.py             |  2 +-
 3 files changed, 76 insertions(+), 4 deletions(-)

diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md
index db2fa9a39..1f3bc59c5 100644
--- a/examples/skypilot/README.md
+++ b/examples/skypilot/README.md
@@ -6,7 +6,47 @@ This directory contains a standalone integration for running Monarch workloads o
 
 `SkyPilotJob` provisions cloud instances (or K8s pods) and starts Monarch workers on them, allowing you to run distributed Monarch actors across multiple machines.
 
-**Supported platforms:**
+### Architecture
+
+```mermaid
+flowchart TB
+    subgraph laptop["💻 Your Laptop"]
+        user["$ sky launch monarch_getting_started.sky.yaml"]
+    end
+
+    subgraph k8s["☸️ Kubernetes Cluster"]
+        subgraph driver["Driver Pod"]
+            script["skypilot_getting_started.py"]
+            skyjob["SkyPilotJob"]
+        end
+        
+        subgraph workers["Worker Pods (provisioned by SkyPilot)"]
+            subgraph w1["Worker Pod 0"]
+                mw1["Monarch Worker"]
+            end
+            subgraph w2["Worker Pod 1"]
+                mw2["Monarch Worker"]
+            end
+        end
+    end
+
+    user -->|"SkyPilot launches"| driver
+    script --> skyjob
+    skyjob -->|"provisions via SkyPilot"| workers
+    skyjob <-->|"TCP :22222"| mw1
+    skyjob <-->|"TCP :22222"| mw2
+    mw1
+    mw2
+```
+
+**How it works:**
+1. You run `sky launch` from your laptop to start the driver pod
+2. The driver runs `skypilot_getting_started.py` which creates a `SkyPilotJob`
+3. `SkyPilotJob` provisions GPU worker pods via SkyPilot
+4. The driver connects to Monarch workers over TCP (port 22222)
+5. Actors are spawned on each GPU and execute your distributed code
+
+**Supported infra:**
 - Kubernetes (any cluster)
 - Hyperscalers: AWS, GCP, Azure
 - Neoclouds: CoreWeave, Nebius, and [20+ other clouds](https://docs.skypilot.co/en/latest/getting-started/installation.html)
@@ -117,6 +157,11 @@ Cluster 'monarch-skypilot-test' terminated
 
 </details>
 
+When done, clean up with:
+```bash
+sky down monarch-demo
+```
+
 
 <details>
 <summary><strong>Running from within the Kubernetes cluster</strong></summary>
@@ -129,6 +174,33 @@ python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-h
 
 </details>
 
+<details>
+
+### Running the DDP Jupyter Notebook
+
+To run the `skypilot_ddp.ipynb` notebook interactively, first launch a driver pod and then connect via SSH port forwarding:
+
+```bash
+# 1. Launch a driver pod (note: this YAML also runs the getting-started example)
+sky launch monarch_getting_started.sky.yaml -c monarch-demo
+
+# 2. SSH into the pod with port forwarding for Jupyter
+sky ssh monarch-demo -L 8888:localhost:8888
+
+# 3. Inside the pod, start Jupyter Notebook (no token required)
+cd ~/sky_workdir
+jupyter notebook --no-browser --port=8888 --ip=0.0.0.0 --NotebookApp.token='' --NotebookApp.password=''
+```
+
+Then open http://localhost:8888 in your browser and run `skypilot_ddp.ipynb`.
+
+When done, clean up with:
+```bash
+sky down monarch-demo
+```
+
+</details>
+
 ## SkyPilotJob Class
 
 SkyPilotJob allows you to run Monarch on Kubernetes and cloud VMs via SkyPilot.
diff --git a/examples/skypilot/skypilot_getting_started.py b/examples/skypilot/skypilot_getting_started.py
index 814f4e6d0..aa70d8d88 100644
--- a/examples/skypilot/skypilot_getting_started.py
+++ b/examples/skypilot/skypilot_getting_started.py
@@ -14,10 +14,10 @@
 
 Usage:
     # Run on Kubernetes with 2 nodes, 8 GPUs per node
-    python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-host 8 --gpus "H200:8"
+    python examples/skypilot/skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-host 8 --gpus "H200:8"
 
     # Run on cloud VMs
-    python skypilot_getting_started.py --cloud <aws/gcp/azure/...> --num-hosts 2 --gpus-per-host 1 --gpus "H100:1"
+    python examples/skypilot/skypilot_getting_started.py --cloud <aws/gcp/azure/...> --num-hosts 2 --gpus-per-host 1 --gpus "H100:1"
 """
 
 import argparse
diff --git a/examples/skypilot/skypilot_job.py b/examples/skypilot/skypilot_job.py
index edb418da8..e9c36df36 100644
--- a/examples/skypilot/skypilot_job.py
+++ b/examples/skypilot/skypilot_job.py
@@ -125,7 +125,7 @@ def __init__(
             resources: SkyPilot Resources specification for the instances.
                        If None, uses SkyPilot defaults.
             cluster_name: Name for the SkyPilot cluster. If None, auto-generated.
-            monarch_port: Port for TCP communication between Monarch workers.
+            monarch_port: Port used for bootstrapping communication between Monarch workers.
             idle_minutes_to_autostop: If set, cluster will autostop after this
                                       many minutes of idleness.
             down_on_autostop: If True, tear down cluster on autostop instead of

From 85145041e9c0d9fb4dea1fdf1aa43a1d0791a23e Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Fri, 12 Dec 2025 23:58:43 +0000
Subject: [PATCH 20/29] fix mermaid doc

---
 examples/skypilot/README.md | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md
index 1f3bc59c5..b65200a03 100644
--- a/examples/skypilot/README.md
+++ b/examples/skypilot/README.md
@@ -20,7 +20,7 @@ flowchart TB
             skyjob["SkyPilotJob"]
         end
         
-        subgraph workers["Worker Pods (provisioned by SkyPilot)"]
+        subgraph workers["Worker&nbsp;Pods&nbsp;(SkyPilot&nbsp;clusters)"]
             subgraph w1["Worker Pod 0"]
                 mw1["Monarch Worker"]
             end
@@ -32,7 +32,7 @@ flowchart TB
 
     user -->|"SkyPilot launches"| driver
     script --> skyjob
-    skyjob -->|"provisions via SkyPilot"| workers
+    skyjob -->|"provisioned via SkyPilot"| workers
     skyjob <-->|"TCP :22222"| mw1
     skyjob <-->|"TCP :22222"| mw2
     mw1
@@ -174,8 +174,6 @@ python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-h
 
 </details>
 
-<details>
-
 ### Running the DDP Jupyter Notebook
 
 To run the `skypilot_ddp.ipynb` notebook interactively, first launch a driver pod and then connect via SSH port forwarding:
@@ -187,20 +185,18 @@ sky launch monarch_getting_started.sky.yaml -c monarch-demo
 # 2. SSH into the pod with port forwarding for Jupyter
 sky ssh monarch-demo -L 8888:localhost:8888
 
-# 3. Inside the pod, start Jupyter Notebook (no token required)
+# 3. Inside the pod, start Jupyter Notebook
 cd ~/sky_workdir
 jupyter notebook --no-browser --port=8888 --ip=0.0.0.0 --NotebookApp.token='' --NotebookApp.password=''
 ```
 
-Then open http://localhost:8888 in your browser and run `skypilot_ddp.ipynb`.
+Then open http://localhost:8888 in your browser and open `skypilot_ddp.ipynb`.
 
 When done, clean up with:
 ```bash
 sky down monarch-demo
 ```
 
-</details>
-
 ## SkyPilotJob Class
 
 SkyPilotJob allows you to run Monarch on Kubernetes and cloud VMs via SkyPilot.

From 2d83527b1fc82fce16a3a1e37098b5da0b53b24b Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Sat, 13 Dec 2025 00:38:05 +0000
Subject: [PATCH 21/29] Docs updates

---
 docs/source/examples/README.rst         | 1 +
 docs/source/examples/getting_started.py | 4 ++--
 docs/source/index.md                    | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/source/examples/README.rst b/docs/source/examples/README.rst
index 3d5089de2..e4b943c2d 100644
--- a/docs/source/examples/README.rst
+++ b/docs/source/examples/README.rst
@@ -8,6 +8,7 @@ Examples
 - :doc:`distributed_tensors.py <distributed_tensors>`: Shows how to dispatch tensors and tensor level operations to a distributed mesh of workers and GPUs
 - :doc:`debugging.py <debugging>`: Shows how to use the Monarch debugger to debug a distributed program
 - `Multinode Slurm Tutorial <https://docs.pytorch.org/tutorials/intermediate/monarch_distributed_tutorial.html>`_: Multinode distributed training tutorial using Monarch and Slurm to run an SPMD training job.
+- `SkyPilot Integration <https://github.com/pytorch-labs/monarch/tree/main/examples/skypilot>`_: Run Monarch on Kubernetes and cloud VMs via SkyPilot.
 
 .. toctree::
    :hidden:
diff --git a/docs/source/examples/getting_started.py b/docs/source/examples/getting_started.py
index 6c7359f95..476a550bb 100644
--- a/docs/source/examples/getting_started.py
+++ b/docs/source/examples/getting_started.py
@@ -145,8 +145,8 @@ def get_value(self) -> int:
 # ==============
 # When we created our processes before, we spawned them on `this_host()` -- the machine
 # running the top-level script. For larger jobs, monarch controls many machines. How these
-# machines are obtained depends on the scheduling system (slurm, kubernetes, etc), but these
-# schedulers are typically encapsulated in a config file.
+# machines are obtained depends on the scheduling system (Slurm, Kubernetes, SkyPilot, etc.),
+# but these schedulers are typically encapsulated in a config file.
 
 from monarch.actor import context, HostMesh, hosts_from_config
 
diff --git a/docs/source/index.md b/docs/source/index.md
index 88072f60c..3321e3abc 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -82,3 +82,4 @@ We welcome contributions from the community! If you're interested in contributin
 - [Demo notebook](https://github.com/meta-pytorch/monarch/blob/main/examples/presentation/presentation.ipynb)
 - [DevX Pytorch tutorial](https://docs.pytorch.org/tutorials/intermediate/monarch_distributed_tutorial.html)
 - [Lightning Monarch blog](https://lightning.ai/meta-ai/environments/large-scale-interactive-training-with-monarch)
+- [Running on Kubernetes via SkyPilot](https://github.com/meta-pytorch/monarch/tree/main/examples/skypilot)

From 4d6ed271ef4c058ccfdb8c95a173194937fa83ed Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Sat, 13 Dec 2025 00:44:13 +0000
Subject: [PATCH 22/29] updates

---
 docs/source/examples/README.rst | 2 +-
 docs/source/index.md            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/examples/README.rst b/docs/source/examples/README.rst
index e4b943c2d..3b27c3f97 100644
--- a/docs/source/examples/README.rst
+++ b/docs/source/examples/README.rst
@@ -8,7 +8,7 @@ Examples
 - :doc:`distributed_tensors.py <distributed_tensors>`: Shows how to dispatch tensors and tensor level operations to a distributed mesh of workers and GPUs
 - :doc:`debugging.py <debugging>`: Shows how to use the Monarch debugger to debug a distributed program
 - `Multinode Slurm Tutorial <https://docs.pytorch.org/tutorials/intermediate/monarch_distributed_tutorial.html>`_: Multinode distributed training tutorial using Monarch and Slurm to run an SPMD training job.
-- `SkyPilot Integration <https://github.com/pytorch-labs/monarch/tree/main/examples/skypilot>`_: Run Monarch on Kubernetes and cloud VMs via SkyPilot.
+- `Multinode Kubernetes examples <https://github.com/pytorch-labs/monarch/tree/main/examples/skypilot>`_: Run Monarch on Kubernetes and cloud VMs via SkyPilot.
 
 .. toctree::
    :hidden:
diff --git a/docs/source/index.md b/docs/source/index.md
index 3321e3abc..4a182f88a 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -82,4 +82,4 @@ We welcome contributions from the community! If you're interested in contributin
 - [Demo notebook](https://github.com/meta-pytorch/monarch/blob/main/examples/presentation/presentation.ipynb)
 - [DevX Pytorch tutorial](https://docs.pytorch.org/tutorials/intermediate/monarch_distributed_tutorial.html)
 - [Lightning Monarch blog](https://lightning.ai/meta-ai/environments/large-scale-interactive-training-with-monarch)
-- [Running on Kubernetes via SkyPilot](https://github.com/meta-pytorch/monarch/tree/main/examples/skypilot)
+- [Monarch on Kubernetes](https://github.com/meta-pytorch/monarch/tree/main/examples/skypilot)

From 3f3e890105eb77e2214eed182d2ec606871490f5 Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Sat, 13 Dec 2025 00:45:28 +0000
Subject: [PATCH 23/29] updates

---
 examples/skypilot/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md
index b65200a03..91c6a375d 100644
--- a/examples/skypilot/README.md
+++ b/examples/skypilot/README.md
@@ -1,10 +1,10 @@
 # Running Monarch on Kubernetes and cloud VMs via SkyPilot
 
-This directory contains a standalone integration for running Monarch workloads on **Kubernetes and cloud VMs** via [SkyPilot](https://github.com/skypilot-org/skypilot).
+This directory contains examples for running Monarch workloads on **Kubernetes and cloud VMs** via [SkyPilot](https://github.com/skypilot-org/skypilot).
 
 ## Overview
 
-`SkyPilotJob` provisions cloud instances (or K8s pods) and starts Monarch workers on them, allowing you to run distributed Monarch actors across multiple machines.
+`SkyPilotJob` provisions cloud instances (or K8s pods) and starts Monarch workers on them, allowing you to run distributed Monarch actors across multiple Kubernetes pods or cloud VMs.
 
 ### Architecture
 

From be7818e3d34bba6a3804815bac73caa47c20f59f Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Sat, 13 Dec 2025 00:50:42 +0000
Subject: [PATCH 24/29] Add notes on how to set resources and num nodes

---
 examples/skypilot/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md
index 91c6a375d..d662701a2 100644
--- a/examples/skypilot/README.md
+++ b/examples/skypilot/README.md
@@ -81,6 +81,7 @@ Run this command from your local machine to run the getting started example:
 ```bash
 sky launch monarch_getting_started.sky.yaml -c monarch-demo
 ```
+**💡 Tip:** Run `sky show-gpus --infra kubernetes` to see available GPUs in your cluster, then edit `--accelerator` and `--num-hosts` in the `run` section of the YAML to match your resources.
 
 SkyPilot will:
 1. Launch a Kubernetes pod

From 0a443b3e3fca2fab32641f923ee5ea3809ca747a Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Sat, 13 Dec 2025 00:51:19 +0000
Subject: [PATCH 25/29] fix ssh command

---
 examples/skypilot/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md
index d662701a2..fa117ec03 100644
--- a/examples/skypilot/README.md
+++ b/examples/skypilot/README.md
@@ -184,7 +184,7 @@ To run the `skypilot_ddp.ipynb` notebook interactively, first launch a driver po
 sky launch monarch_getting_started.sky.yaml -c monarch-demo
 
 # 2. SSH into the pod with port forwarding for Jupyter
-sky ssh monarch-demo -L 8888:localhost:8888
+ssh monarch-demo -L 8888:localhost:8888
 
 # 3. Inside the pod, start Jupyter Notebook
 cd ~/sky_workdir

From 5fcf775f8ef40ab2370097890270afee7553c907 Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Sat, 13 Dec 2025 00:52:53 +0000
Subject: [PATCH 26/29] Update jupyter commands

---
 examples/skypilot/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md
index fa117ec03..090a9fc5c 100644
--- a/examples/skypilot/README.md
+++ b/examples/skypilot/README.md
@@ -188,7 +188,8 @@ ssh monarch-demo -L 8888:localhost:8888
 
 # 3. Inside the pod, start Jupyter Notebook
 cd ~/sky_workdir
-jupyter notebook --no-browser --port=8888 --ip=0.0.0.0 --NotebookApp.token='' --NotebookApp.password=''
+uv pip install --system jupyter
+jupyter notebook --no-browser --port=8888 --ip=0.0.0.0 --NotebookApp.token='' --NotebookApp.password='' --allow-root
 ```
 
 Then open http://localhost:8888 in your browser and open `skypilot_ddp.ipynb`.

From 56e7c8c7fb2016c0eb49038ceeedce41d52fafbe Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Sat, 13 Dec 2025 02:25:35 +0000
Subject: [PATCH 27/29] Add CPU-only support

---
 examples/skypilot/README.md                   | 30 +++++++++++++++++--
 .../skypilot/monarch_getting_started.sky.yaml | 24 ++++++++++-----
 examples/skypilot/skypilot_getting_started.py | 25 ++++++++++++----
 3 files changed, 64 insertions(+), 15 deletions(-)

diff --git a/examples/skypilot/README.md b/examples/skypilot/README.md
index 090a9fc5c..b88f0068e 100644
--- a/examples/skypilot/README.md
+++ b/examples/skypilot/README.md
@@ -81,9 +81,29 @@ Run this command from your local machine to run the getting started example:
 ```bash
 sky launch monarch_getting_started.sky.yaml -c monarch-demo
 ```
-**💡 Tip:** Run `sky show-gpus --infra kubernetes` to see available GPUs in your cluster, then edit `--accelerator` and `--num-hosts` in the `run` section of the YAML to match your resources.
 
-SkyPilot will:
+<details>
+<summary><strong>💡 Customizing the run (GPU count, CPU-only mode, etc.)</strong></summary>
+
+Run `sky show-gpus --infra kubernetes` to see available GPUs in your cluster, then customize with environment variables:
+
+```bash
+# Custom GPU configuration
+sky launch monarch_getting_started.sky.yaml -c monarch-demo \
+  --env NUM_HOSTS=4 \
+  --env GPUS_PER_HOST=8 \
+  --env ACCELERATOR="H100:8"
+
+# CPU-only mode (no GPUs required)
+sky launch monarch_getting_started.sky.yaml -c monarch-demo \
+  --env GPUS_PER_HOST=0 \
+  --env ACCELERATOR=none
+```
+
+</details>
+
+
+On running `sky launch`, SkyPilot will:
 1. Launch a Kubernetes pod
 2. Install dependencies
 3. Sync the example directory with the pod
@@ -170,7 +190,11 @@ sky down monarch-demo
 If you are already in the Kubernetes cluster you'd like to run workers on, you can directly run `skypilot_getting_started.py`.
 
 ```bash
-python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-host 8 --gpus "H200:8"
+# With GPUs
+python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-host 8 --accelerator "H200:8"
+
+# CPU-only (no GPUs)
+python skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-host 0 --accelerator none
 ```
 
 </details>
diff --git a/examples/skypilot/monarch_getting_started.sky.yaml b/examples/skypilot/monarch_getting_started.sky.yaml
index 0398cc873..61aa75cee 100644
--- a/examples/skypilot/monarch_getting_started.sky.yaml
+++ b/examples/skypilot/monarch_getting_started.sky.yaml
@@ -5,7 +5,10 @@
 #
 # Usage:
 #   cd monarch/examples/skypilot
-#   sky launch getting_started.sky.yaml -c monarch-demo
+#   sky launch monarch_getting_started.sky.yaml -c monarch-demo
+#
+#   # For CPU-only clusters (no GPUs):
+#   sky launch monarch_getting_started.sky.yaml -c monarch-demo --env GPUS_PER_HOST=0 --env ACCELERATOR=none
 #
 # To view logs:
 #   sky logs monarch-demo
@@ -23,6 +26,13 @@ resources:
   cpus: 2+ # No GPUs needed for the driver script
   image_id: docker:pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime
 
+# Environment variables for configuring the example
+# Override with: sky launch ... --env NUM_HOSTS=4 --env GPUS_PER_HOST=8
+envs:
+  NUM_HOSTS: 2           # Number of worker nodes to provision
+  GPUS_PER_HOST: 1       # GPUs per worker (set to 0 for CPU-only)
+  ACCELERATOR: "H200:1"  # SkyPilot GPU spec (set to "none" for CPU-only). Keep quantity aligned with GPUS_PER_HOST.
+
 # Sync the current directory (examples/skypilot) to the cluster
 workdir: .
 
@@ -64,17 +74,17 @@ setup: |
   echo "=== Setup complete ==="
 
 run: |  
-  echo "=== Running Monarch Getting Started with SkyPilot ==="  
+  echo "=== Running Monarch Getting Started with SkyPilot ==="
+  echo "Configuration: NUM_HOSTS=$NUM_HOSTS, GPUS_PER_HOST=$GPUS_PER_HOST, ACCELERATOR=$ACCELERATOR"
 
   # Run the getting started example
-  # This will launch a SkyPilot cluster with Monarch workers.
-  # Change the arguments to your desired values.
+  # Uses environment variables set above (can be overridden with --env)
   python skypilot_getting_started.py \
     --cloud kubernetes \
-    --num-hosts 2 \
-    --gpus-per-host 1 \
+    --num-hosts $NUM_HOSTS \
+    --gpus-per-host $GPUS_PER_HOST \
     --cluster-name monarch-workers \
-    --accelerator "H200:1"
+    --accelerator "$ACCELERATOR"
   
   echo "=== Example ran successfully ==="
 
diff --git a/examples/skypilot/skypilot_getting_started.py b/examples/skypilot/skypilot_getting_started.py
index aa70d8d88..04148c9be 100644
--- a/examples/skypilot/skypilot_getting_started.py
+++ b/examples/skypilot/skypilot_getting_started.py
@@ -18,6 +18,9 @@
 
     # Run on cloud VMs
     python examples/skypilot/skypilot_getting_started.py --cloud <aws/gcp/azure/...> --num-hosts 2 --gpus-per-host 1 --gpus "H100:1"
+
+    # Run on CPU-only cluster (no GPUs)
+    python examples/skypilot/skypilot_getting_started.py --cloud kubernetes --num-hosts 2 --gpus-per-host 0 --accelerator none
 """
 
 import argparse
@@ -143,14 +146,20 @@ def main():
     )
     args = parser.parse_args()
 
+    # Determine if running in CPU-only mode
+    cpu_only = args.gpus_per_host == 0 or args.accelerator.lower() == "none"
+
     print("=" * 60)
     print("Monarch Getting Started with SkyPilot")
     print("=" * 60)
     print(f"\nConfiguration:")
     print(f"  Cloud: {args.cloud}")
     print(f"  Hosts: {args.num_hosts}")
-    print(f"  GPUs per host: {args.gpus_per_host}")
-    print(f"  Accelerator: {args.accelerator}")
+    if cpu_only:
+        print(f"  Mode: CPU-only (no GPUs)")
+    else:
+        print(f"  GPUs per host: {args.gpus_per_host}")
+        print(f"  Accelerator: {args.accelerator}")
     print(f"  Cluster name: {args.cluster_name}")
     if args.region:
         print(f"  Region: {args.region}")
@@ -161,8 +170,10 @@ def main():
     # Build resources specification
     resources_kwargs = {
         "cloud": get_cloud(args.cloud),
-        "accelerators": args.accelerator,
     }
+    # Only request GPUs if not in CPU-only mode
+    if not cpu_only:
+        resources_kwargs["accelerators"] = args.accelerator
     if args.region:
         resources_kwargs["region"] = args.region
     
@@ -191,8 +202,12 @@ def main():
         # ====================================================================
 
         print("\n[3] Spawning processes on cloud hosts...")
-        # Create a process mesh - GPU processes per host
-        procs: ProcMesh = hosts.spawn_procs(per_host={"gpus": args.gpus_per_host})
+        # Create a process mesh
+        if cpu_only:
+            # CPU-only mode: spawn 1 CPU process per host
+            procs: ProcMesh = hosts.spawn_procs(per_host={"procs": 1})
+        else:
+            procs: ProcMesh = hosts.spawn_procs(per_host={"gpus": args.gpus_per_host})
         print(f"    Process mesh extent: {procs.extent}")
 
         # Spawn counter actors

From 236a01a998493129ef039d13549c7e40eade13f1 Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Sat, 20 Dec 2025 01:08:39 +0530
Subject: [PATCH 28/29] update docs

---
 docs/source/examples/README.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/examples/README.rst b/docs/source/examples/README.rst
index 3b27c3f97..37b9cfe3c 100644
--- a/docs/source/examples/README.rst
+++ b/docs/source/examples/README.rst
@@ -8,7 +8,7 @@ Examples
 - :doc:`distributed_tensors.py <distributed_tensors>`: Shows how to dispatch tensors and tensor level operations to a distributed mesh of workers and GPUs
 - :doc:`debugging.py <debugging>`: Shows how to use the Monarch debugger to debug a distributed program
 - `Multinode Slurm Tutorial <https://docs.pytorch.org/tutorials/intermediate/monarch_distributed_tutorial.html>`_: Multinode distributed training tutorial using Monarch and Slurm to run an SPMD training job.
-- `Multinode Kubernetes examples <https://github.com/pytorch-labs/monarch/tree/main/examples/skypilot>`_: Run Monarch on Kubernetes and cloud VMs via SkyPilot.
+- `Running on Kubernetes using SkyPilot <https://github.com/meta-pytorch/monarch/tree/main/examples/skypilot>`_: Run Monarch on Kubernetes and cloud VMs via SkyPilot.
 
 .. toctree::
    :hidden:

From 8207b92987b228972453cf700aa883638e80793e Mon Sep 17 00:00:00 2001
From: Romil Bhardwaj <romil.bhardwaj@gmail.com>
Date: Sat, 20 Dec 2025 01:38:56 +0530
Subject: [PATCH 29/29] review comments

---
 docs/source/index.md              |  2 +-
 examples/skypilot/skypilot_job.py | 12 +++++++-----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/docs/source/index.md b/docs/source/index.md
index 4a182f88a..32928e328 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -82,4 +82,4 @@ We welcome contributions from the community! If you're interested in contributin
 - [Demo notebook](https://github.com/meta-pytorch/monarch/blob/main/examples/presentation/presentation.ipynb)
 - [DevX Pytorch tutorial](https://docs.pytorch.org/tutorials/intermediate/monarch_distributed_tutorial.html)
 - [Lightning Monarch blog](https://lightning.ai/meta-ai/environments/large-scale-interactive-training-with-monarch)
-- [Monarch on Kubernetes](https://github.com/meta-pytorch/monarch/tree/main/examples/skypilot)
+- [Monarch on Kubernetes using SkyPilot](https://github.com/meta-pytorch/monarch/tree/main/examples/skypilot)
diff --git a/examples/skypilot/skypilot_job.py b/examples/skypilot/skypilot_job.py
index e9c36df36..809b97e5e 100644
--- a/examples/skypilot/skypilot_job.py
+++ b/examples/skypilot/skypilot_job.py
@@ -1,5 +1,5 @@
 """
-SkyPilotJob for Monarch.
+Monarch JobTrait implementation for SkyPilot.
 
 SkyPilotJob allows running Monarch on Kubernetes and cloud VMs via SkyPilot. 
 
@@ -40,7 +40,7 @@
 logger.propagate = False
 
 # Default port for Monarch TCP communication
-DEFAULT_MONARCH_PORT = 22222
+MONARCH_WORKER_PORT = 22222
 
 # Timeout for waiting for the job to reach RUNNING status.
 JOB_TIMEOUT = 300 # seconds
@@ -110,9 +110,9 @@ def __init__(
         meshes: Dict[str, int],
         resources: Optional["sky.Resources"] = None,
         cluster_name: Optional[str] = None,
-        monarch_port: int = DEFAULT_MONARCH_PORT,
+        monarch_port: int = MONARCH_WORKER_PORT,
         idle_minutes_to_autostop: Optional[int] = None,
-        down_on_autostop: bool = False,
+        down_on_autostop: bool = True,
         python_exe: str = "python",
         setup_commands: Optional[str] = None,
         workdir: Optional[str] = None,
@@ -129,7 +129,9 @@ def __init__(
             idle_minutes_to_autostop: If set, cluster will autostop after this
                                       many minutes of idleness.
             down_on_autostop: If True, tear down cluster on autostop instead of
-                              just stopping it.
+                              just stopping it. On Kubernetes, autostop is not
+                              supported and this must be set to True. Pods will
+                              be deleted when the SkyPilot cluster is downed.
             python_exe: Python executable to use for worker processes.
             setup_commands: Optional setup commands to run before starting workers.
                            If None, uses DEFAULT_SETUP_COMMANDS which installs