Skip to content

Commit acc7606

Browse files
colin2328facebook-github-bot
authored and committed
add monarch serve torchx command to launch the (MAST) job and cache the command inside the jobs .pkl
Summary: Introduce monarch serve torchx ... as proposed here https://docs.google.com/document/d/1F3m3oDBX3sipHCxsp_2ghSvgOBKgmFRWql3A5StMh1I/edit?tab=t.0#heading=h.zb5e0il0bn6a Here, we create the job and create the .pkl file of the command. We then add run_spmd so we can run python -c "from monarch.job import job_load; job = job_load(); job.run_spmd()" Differential Revision: D88515552
1 parent c493edd commit acc7606

File tree

4 files changed

+293
-0
lines changed

4 files changed

+293
-0
lines changed

python/monarch/_src/job/spmd.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
from typing import Any, Dict, List, Optional
8+
9+
from monarch._rust_bindings.monarch_hyperactor.channel import ChannelTransport
10+
from monarch._rust_bindings.monarch_hyperactor.config import configure
11+
from monarch._src.job.job import JobState, JobTrait
12+
from monarch._src.spmd import SPMDActor
13+
14+
15+
def create_job_for_scheduler(
    scheduler: str,
    scheduler_cfg: Dict[str, Any],
    num_hosts: int,
    host_type: str,
    workspace: Optional[str] = None,
) -> JobTrait:
    """
    Build the JobTrait implementation that matches the requested scheduler.

    Args:
        scheduler: Scheduler name (e.g., "mast", "mast_conda", "slurm")
        scheduler_cfg: Scheduler configuration dict with keys like hpcIdentity, etc.
        num_hosts: Number of hosts to allocate
        host_type: Host type (e.g., "gtt_any")
        workspace: Optional local workspace directory to pack

    Returns:
        JobTrait instance configured for the scheduler

    Raises:
        NotImplementedError: If scheduler is not yet supported
        ValueError: If scheduler is unsupported
    """
    if scheduler == "mast_conda":
        # Imported lazily so non-MAST schedulers don't pull in Meta-internal deps.
        from monarch._src.job.meta import MASTJob

        mast_job = MASTJob(
            hpcIdentity=scheduler_cfg["hpcIdentity"],
            hpcJobOncall=scheduler_cfg["hpcJobOncall"],
            rmAttribution=scheduler_cfg["rmAttribution"],
            hpcClusterUuid=scheduler_cfg.get("hpcClusterUuid", "MastProdCluster"),
        )
        mast_job.add_mesh("workers", num_hosts, host_type)

        # Pack the workspace (if any) at the root of WORKSPACE_DIR.
        if workspace:
            mast_job.add_directory(workspace, "")

        return mast_job

    if scheduler == "slurm":
        raise NotImplementedError(f"Scheduler {scheduler} not yet supported")

    raise ValueError(f"Unsupported scheduler: {scheduler}")
62+
63+
64+
class SPMDJob(JobTrait):
    """
    SPMD (Single Program Multiple Data) job wrapping an underlying JobTrait.

    Instances are produced by the `monarch serve torchx ...` CLI; each one
    carries both the scheduler-specific job (e.g., MASTJob) and the torchx
    command metadata needed to replay the SPMD run later.
    """

    def __init__(
        self,
        job: JobTrait,
        scheduler: str,
        nnodes: int,
        nproc_per_node: int,
        component: str,
        component_args: List[str],
        script_args: List[str],
        workspace: Optional[str] = None,
        scheduler_args: Optional[Dict[str, Any]] = None,
    ):
        super().__init__()
        # Underlying scheduler job plus the cached torchx command metadata.
        self._job = job
        self._scheduler = scheduler
        self._nnodes = nnodes
        self._nproc_per_node = nproc_per_node
        self._component = component
        self._component_args = component_args
        self._script_args = script_args
        self._workspace = workspace
        self._scheduler_args = scheduler_args or {}

    def _create(self, client_script: Optional[str] = None):
        # Delegate creation to the wrapped job.
        self._job._create(client_script)

    def can_run(self, spec: "JobTrait") -> bool:
        """Return True if *spec* describes the same SPMD job this job serves."""
        if not isinstance(spec, SPMDJob):
            return False
        own_key = (
            self._scheduler,
            self._nnodes,
            self._nproc_per_node,
            self._component,
            self._component_args,
            self._script_args,
            self._workspace,
            self._scheduler_args,
        )
        spec_key = (
            spec._scheduler,
            spec._nnodes,
            spec._nproc_per_node,
            spec._component,
            spec._component_args,
            spec._script_args,
            spec._workspace,
            spec._scheduler_args,
        )
        return own_key == spec_key and self._job.can_run(spec._job)

    def _state(self) -> JobState:
        # State comes straight from the wrapped job.
        return self._job._state()

    def _kill(self):
        self._job._kill()

    def run_spmd(self):
        """Spawn one SPMDActor per GPU slot and run the cached SPMD command."""
        configure(default_transport=ChannelTransport.MetaTlsWithHostname)
        workers = self._state().workers
        procs = workers.spawn_procs(per_host={"gpus": self._nproc_per_node})
        actors = procs.spawn("_SPMDActor", SPMDActor)

        # Index 0 along every mesh label — presumably the rank-0 actor — is
        # asked for the rendezvous host/port used by all ranks.
        rank_zero = {label: 0 for label in procs._labels}
        master_addr, master_port = (
            actors.slice(**rank_zero).get_host_port.call_one(None).get()
        )

        print("Calling SPMDActor.main with:")
        print(f" master_addr: {master_addr}")
        print(f" master_port: {master_port}")
        print(f" script_args: {self._script_args}")

        # Broadcast main() to every actor and block until completion.
        actors.main.call(master_addr, master_port, self._script_args).get()

python/monarch/_src/spmd/actor.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,11 @@ def main(self, master_addr: str, master_port: int, script_args: list[str]) -> bo
112112
"""
113113
self._setup_env(master_addr, master_port)
114114

115+
# Change to workspace directory if available
116+
workspace_dir = os.environ.get("WORKSPACE_DIR")
117+
if workspace_dir and os.path.exists(workspace_dir):
118+
os.chdir(workspace_dir)
119+
115120
if script_args and script_args[0] == "-m":
116121
module_name = script_args[1]
117122
sys.argv = [module_name] + list(script_args[2:])

python/monarch/job/spmd.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
from monarch._src.job.spmd import create_job_for_scheduler, SPMDJob
8+
9+
__all__ = ["SPMDJob", "create_job_for_scheduler"]

python/monarch/tools/cli.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
import json
1010
import sys
1111

12+
from monarch.job.spmd import ( # @manual=//monarch/python/monarch/job:job
13+
create_job_for_scheduler,
14+
SPMDJob,
15+
)
1216
from monarch.tools.commands import (
1317
bounce,
1418
component_args_from_cli,
@@ -25,6 +29,9 @@
2529
)
2630

2731
from monarch.tools.debug_env import _get_debug_server_host, _get_debug_server_port
32+
from torchx.cli.cmd_run import _parse_component_name_and_args, CmdRun
33+
from torchx.components.fb import parse_j
34+
from torchx.runner import get_runner
2835
from torchx.specs.finder import get_component
2936

3037

@@ -163,6 +170,141 @@ def run(self, args: argparse.Namespace) -> None:
163170
debug(args.host, args.port)
164171

165172

173+
class ServeCmd:
    """
    Parse and cache a torchx command for monarch execution.

    Example:
        monarch serve torchx run -s conda_mast -j1x8 train.py -- --lr 0.001
    """

    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
        """Register the raw torchx command remainder on *subparser*."""
        subparser.add_argument(
            "torchx_args",
            nargs=argparse.REMAINDER,
            help="torchx command arguments (e.g., 'run -s mast_conda -j1x8 train.py -- --lr 0.001')",
        )

    def run(self, args: argparse.Namespace) -> None:
        """Parse the torchx command, launch the job, and cache it to .pkl.

        Exits the process with status 1 on any validation or parse failure.
        """
        # Validate input: the remainder must start with "torchx run".
        if (
            not args.torchx_args
            or len(args.torchx_args) < 2
            or args.torchx_args[0] != "torchx"
            or args.torchx_args[1] != "run"
        ):
            print("Error: Expected 'torchx run ...' command", file=sys.stderr)
            print(
                "Usage: monarch serve torchx run --scheduler SCHEDULER [--scheduler_args ARGS] COMPONENT [COMPONENT_ARGS] [-- SCRIPT_ARGS]",
                file=sys.stderr,
            )
            sys.exit(1)

        # Create torchx CmdRun to reuse its parser
        cmd_run = CmdRun()
        parser = argparse.ArgumentParser()
        cmd_run.add_arguments(parser)

        # Remove 'torchx run' from beginning
        torchx_args = args.torchx_args[2:]

        # Parse using torchx's parser (argparse exits on error; convert that
        # into our own error message + exit).
        try:
            parsed = parser.parse_args(torchx_args)
        except SystemExit:
            print("Error: Failed to parse torchx arguments", file=sys.stderr)
            sys.exit(1)

        # Get runner to parse scheduler_args
        runner = get_runner()
        scheduler_opts = runner.scheduler_run_opts(parsed.scheduler)
        scheduler_cfg = scheduler_opts.cfg_from_str(parsed.scheduler_args or "")

        # Parse component name and args using torchx helper
        component_name, component_args = _parse_component_name_and_args(
            parsed.component_name_and_args, parser
        )

        # Split component options from script args at the "--" delimiter.
        # BUGFIX: only the options *before* "--" may be scanned for -j/-h;
        # previously the scan ran over the script args too, so a script
        # argument like "-j" or "-h" after "--" was silently consumed (or
        # triggered a bogus "requires a value" error).
        try:
            delimiter_idx = component_args.index("--")
        except ValueError:
            option_args = component_args
            script_args = []
        else:
            option_args = component_args[:delimiter_idx]
            script_args = component_args[delimiter_idx + 1 :]

        # Extract -j and -h from the component options.
        job_spec = None
        host_type = "gtt_any"
        i = 0
        while i < len(option_args):
            if option_args[i] in ["-j", "--job_spec"]:
                if i + 1 < len(option_args):
                    job_spec = option_args[i + 1]
                    i += 2
                else:
                    print("Error: -j requires a value", file=sys.stderr)
                    sys.exit(1)
            elif option_args[i] in ["-h", "--host_type"]:
                if i + 1 < len(option_args):
                    host_type = option_args[i + 1]
                    i += 2
                else:
                    print("Error: -h requires a value", file=sys.stderr)
                    sys.exit(1)
            else:
                i += 1

        if not job_spec:
            print(
                "Error: -j/--job_spec required in component arguments", file=sys.stderr
            )
            sys.exit(1)

        # Parse job_spec using torchx's parse_j
        try:
            nnodes, nproc_per_node = parse_j(job_spec)
        except Exception as e:
            print(f"Error: Failed to parse job spec '{job_spec}': {e}", file=sys.stderr)
            sys.exit(1)

        print(f"Scheduler: {parsed.scheduler}")
        print(f"Component: {component_name}")
        print(
            f"Job spec: {job_spec} ({nnodes} node(s) x {nproc_per_node} proc(s) per node)"
        )
        print(f"Host type: {host_type}")
        if parsed.workspace:
            print(f"Workspace: {parsed.workspace}")

        # Create underlying job based on scheduler type
        underlying_job = create_job_for_scheduler(
            scheduler=parsed.scheduler,
            scheduler_cfg=scheduler_cfg,
            num_hosts=nnodes,
            host_type=host_type,
            workspace=parsed.workspace,
        )

        # Wrap in SPMDJob; store the full component_args (options + script
        # args) so the cached command can be reconstructed verbatim.
        spmd_job = SPMDJob(
            job=underlying_job,
            scheduler=parsed.scheduler,
            nnodes=nnodes,
            nproc_per_node=nproc_per_node,
            component=component_name,
            component_args=component_args,
            script_args=script_args,
            workspace=parsed.workspace,
            scheduler_args=scheduler_cfg,
        )

        # Launch job (calls apply + caches)
        print(f"\nLaunching {parsed.scheduler} job...")
        spmd_job.state()
        print("✓ Job launched successfully and cached to .monarch/job_state.pkl")
306+
307+
166308
def get_parser() -> argparse.ArgumentParser:
167309
parser = argparse.ArgumentParser(description="Monarch CLI")
168310
subparser = parser.add_subparsers(title="COMMANDS")
@@ -172,6 +314,7 @@ def get_parser() -> argparse.ArgumentParser:
172314
"info": InfoCmd(),
173315
"kill": KillCmd(),
174316
"debug": DebugCmd(),
317+
"serve": ServeCmd(),
175318
# --- placeholder subcommands (not yet implemented) ---
176319
"bounce": BounceCmd(),
177320
"stop": StopCmd(),

0 commit comments

Comments
 (0)