From 00459a5ea4b5a71b943a9684686356d907fc4f53 Mon Sep 17 00:00:00 2001 From: Shayne Fletcher Date: Thu, 18 Dec 2025 14:10:02 -0800 Subject: [PATCH 1/3] : python: actor: skip this_host() shutdown (#2171) Summary: D89052078 changed the implementation of `shutdown_context()` but unfortunately as written it can't work and generates a runtime error: "cannot shut down `HostMesh` that is a reference instead of owned". skip the explicit shutdown while we work out a better fix. Reviewed By: colin2328 Differential Revision: D89478006 --- python/monarch/_src/actor/actor_mesh.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/monarch/_src/actor/actor_mesh.py b/python/monarch/_src/actor/actor_mesh.py index f18de2f0c..2aa5f696f 100644 --- a/python/monarch/_src/actor/actor_mesh.py +++ b/python/monarch/_src/actor/actor_mesh.py @@ -381,10 +381,14 @@ def shutdown_context() -> "Future[None]": """ from monarch._src.actor.future import Future - client_host_ctx = _client_context.try_get() - if client_host_ctx is not None: - host_mesh = client_host_ctx.actor_instance.proc_mesh.host_mesh - return host_mesh.shutdown() + # TODO(shayne,2025-12-18): Since D89089836 we can't call shutdown + # like this and doing so is causing runtime errors. This avoids + # the error while I work out a better fix. + + # client_host_ctx = _client_context.try_get() + # if client_host_ctx is not None: + # host_mesh = client_host_ctx.actor_instance.proc_mesh.host_mesh + # return host_mesh.shutdown() # Nothing to shutdown - return a completed future async def noop() -> None: From 997a64ec7a7c2d2ab47b44b9ced49ee68885fb0a Mon Sep 17 00:00:00 2001 From: Shayne Fletcher Date: Thu, 18 Dec 2025 14:10:02 -0800 Subject: [PATCH 2/3] : python: actor: reinstate this_host() shutdown Summary: reinstate `this_host` host-mesh shutdown (see D89478006 for why we turned it off) Differential Revision: D89495607 --- .../src/v1/host_mesh/mesh_agent.rs | 3 +- monarch_hyperactor/src/v1/host_mesh.rs | 43 +++++++++++++++++++ .../monarch_hyperactor/v1/host_mesh.pyi | 13 ++++++ python/monarch/_src/actor/actor_mesh.py | 17 +++++--- 4 files changed, 68 insertions(+), 8 deletions(-) diff --git a/hyperactor_mesh/src/v1/host_mesh/mesh_agent.rs b/hyperactor_mesh/src/v1/host_mesh/mesh_agent.rs index 50ccfd144..3545973f4 100644 --- a/hyperactor_mesh/src/v1/host_mesh/mesh_agent.rs +++ b/hyperactor_mesh/src/v1/host_mesh/mesh_agent.rs @@ -20,6 +20,7 @@ use hyperactor::ActorHandle; use hyperactor::ActorId; use hyperactor::ActorRef; use hyperactor::Context; +use hyperactor::HandleClient; use hyperactor::Handler; use hyperactor::Instance; use hyperactor::Named; @@ -328,7 +329,7 @@ impl Handler for HostMeshAgent { } } -#[derive(Serialize, Deserialize, Debug, Named, Handler, RefClient)] +#[derive(Serialize, Deserialize, Debug, Named, Handler, RefClient, HandleClient)] pub struct ShutdownHost { /// Grace window: send SIGTERM and wait this long before /// escalating. diff --git a/monarch_hyperactor/src/v1/host_mesh.rs b/monarch_hyperactor/src/v1/host_mesh.rs index 55c55ab32..fe772ca9d 100644 --- a/monarch_hyperactor/src/v1/host_mesh.rs +++ b/monarch_hyperactor/src/v1/host_mesh.rs @@ -10,7 +10,9 @@ use std::collections::HashMap; use std::ops::Deref; use std::path::PathBuf; use std::sync::OnceLock; +use std::time::Duration; +use hyperactor::ActorHandle; use hyperactor::Instance; use hyperactor::Proc; use hyperactor_mesh::bootstrap::BootstrapCommand; @@ -22,6 +24,8 @@ use hyperactor_mesh::v1::ProcMeshRef; use hyperactor_mesh::v1::host_mesh::HostMesh; use hyperactor_mesh::v1::host_mesh::HostMeshRef; use hyperactor_mesh::v1::host_mesh::mesh_agent::GetLocalProcClient; +use hyperactor_mesh::v1::host_mesh::mesh_agent::HostMeshAgent; +use hyperactor_mesh::v1::host_mesh::mesh_agent::ShutdownHostClient; use hyperactor_mesh::v1::proc_mesh::ProcRef; use ndslice::View; use ndslice::view::RankedSliceable; @@ -253,6 +257,9 @@ impl PyHostMeshRefImpl { /// Static storage for the root client instance when using host-based bootstrap. static ROOT_CLIENT_INSTANCE_FOR_HOST: OnceLock> = OnceLock::new(); +/// Static storage for the host mesh agent created by bootstrap_host(). +static HOST_MESH_AGENT_FOR_HOST: OnceLock> = OnceLock::new(); + /// Bootstrap the client host and root client actor. /// /// This creates a proper Host with BootstrapProcManager, spawns the root client @@ -282,6 +289,9 @@ fn bootstrap_host(bootstrap_cmd: Option) -> PyResult) -> PyResult { r.map(PyHostMesh::new_ref) } +#[pyfunction] +fn shutdown_local_host_mesh() -> PyResult { + let agent = HOST_MESH_AGENT_FOR_HOST + .get() + .ok_or_else(|| PyException::new_err("No local host mesh to shutdown"))? + .clone(); + + PyPythonTask::new(async move { + // Create a temporary instance to send the shutdown message + let temp_proc = hyperactor::Proc::local(); + let (instance, _) = temp_proc + .instance("shutdown_requester") + .map_err(|e| PyException::new_err(e.to_string()))?; + + // Use same defaults as HostMesh::shutdown(): + // - MESH_TERMINATE_TIMEOUT = 10 seconds + // - MESH_TERMINATE_CONCURRENCY = 16 + agent + .shutdown_host(&instance, Duration::from_secs(10), 16) + .await + .map_err(|e| PyException::new_err(e.to_string()))?; + + Ok(()) + }) +} + pub fn register_python_bindings(hyperactor_mod: &Bound<'_, PyModule>) -> PyResult<()> { let f = wrap_pyfunction!(py_host_mesh_from_bytes, hyperactor_mod)?; f.setattr( @@ -348,6 +384,13 @@ pub fn register_python_bindings(hyperactor_mod: &Bound<'_, PyModule>) -> PyResul )?; hyperactor_mod.add_function(f2)?; + let f3 = wrap_pyfunction!(shutdown_local_host_mesh, hyperactor_mod)?; + f3.setattr( + "__module__", + "monarch._rust_bindings.monarch_hyperactor.v1.host_mesh", + )?; + hyperactor_mod.add_function(f3)?; + hyperactor_mod.add_class::()?; hyperactor_mod.add_class::()?; Ok(()) diff --git a/python/monarch/_rust_bindings/monarch_hyperactor/v1/host_mesh.pyi b/python/monarch/_rust_bindings/monarch_hyperactor/v1/host_mesh.pyi index e0a9227f4..cef221754 100644 --- a/python/monarch/_rust_bindings/monarch_hyperactor/v1/host_mesh.pyi +++ b/python/monarch/_rust_bindings/monarch_hyperactor/v1/host_mesh.pyi @@ -113,3 +113,16 @@ def bootstrap_host( - `bootstrap_cmd`: The bootstrap command to use to bootstrap the host. """ ... + +def shutdown_local_host_mesh() -> PythonTask[None]: + """ + Shutdown the local host mesh created by bootstrap_host(). + + Sends ShutdownHost message to the local host mesh agent with: + - timeout: 10 seconds grace period before SIGTERM escalation + - max_in_flight: 16 concurrent child terminations + + Raises: + RuntimeError: If no local host mesh exists (bootstrap_host not called) + """ + ... diff --git a/python/monarch/_src/actor/actor_mesh.py b/python/monarch/_src/actor/actor_mesh.py index 2aa5f696f..479b49dea 100644 --- a/python/monarch/_src/actor/actor_mesh.py +++ b/python/monarch/_src/actor/actor_mesh.py @@ -380,15 +380,18 @@ def shutdown_context() -> "Future[None]": completion. """ from monarch._src.actor.future import Future + from monarch._src.actor.v1 import enabled as v1_enabled - # TODO(shayne,2025-12-18): Since D89089836 we can't call shutdown - # like this and doing so is causing runtime errors. This avoids - # the error while I work out a better fix. + if v1_enabled: + try: + from monarch._rust_bindings.monarch_hyperactor.v1.host_mesh import ( + shutdown_local_host_mesh, + ) - # client_host_ctx = _client_context.try_get() - # if client_host_ctx is not None: - # host_mesh = client_host_ctx.actor_instance.proc_mesh.host_mesh - # return host_mesh.shutdown() + return Future(coro=shutdown_local_host_mesh()) + except RuntimeError: + # No local host mesh to shutdown + pass # Nothing to shutdown - return a completed future async def noop() -> None: From db3aff447ebf2ca932526c15e7122b3ae5ff2167 Mon Sep 17 00:00:00 2001 From: Shayne Fletcher Date: Thu, 18 Dec 2025 14:10:02 -0800 Subject: [PATCH 3/3] : python: actor: shutdown_context() terminates this_host() (#2178) Summary: now `shutdown_context()` terminates the OS process running `this_host()`. Differential Revision: D89498869 --- hyperactor/src/init.rs | 2 +- hyperactor/src/lib.rs | 2 ++ monarch_hyperactor/src/v1/host_mesh.rs | 12 ++++++++---- .../monarch_hyperactor/v1/host_mesh.pyi | 8 ++++++-- python/monarch/_src/actor/actor_mesh.py | 4 +++- 5 files changed, 20 insertions(+), 8 deletions(-) diff --git a/hyperactor/src/init.rs b/hyperactor/src/init.rs index 24e471ede..b9ae6e3af 100644 --- a/hyperactor/src/init.rs +++ b/hyperactor/src/init.rs @@ -19,7 +19,7 @@ static RUNTIME: OnceLock = OnceLock::new(); /// /// Panics if the runtime has not been initialized *and* the caller is not in an /// async context. -pub(crate) fn get_runtime() -> tokio::runtime::Handle { +pub fn get_runtime() -> tokio::runtime::Handle { match RUNTIME.get() { Some(handle) => handle.clone(), None => tokio::runtime::Handle::current(), diff --git a/hyperactor/src/lib.rs b/hyperactor/src/lib.rs index b1b4b61e5..943ad0bff 100644 --- a/hyperactor/src/lib.rs +++ b/hyperactor/src/lib.rs @@ -142,6 +142,8 @@ pub use hyperactor_telemetry::declare_static_timer; pub use hyperactor_telemetry::key_value; pub use hyperactor_telemetry::kv_pairs; #[doc(inline)] +pub use init::get_runtime; +#[doc(inline)] pub use init::initialize; #[doc(inline)] pub use init::initialize_with_current_runtime; diff --git a/monarch_hyperactor/src/v1/host_mesh.rs b/monarch_hyperactor/src/v1/host_mesh.rs index fe772ca9d..225a19500 100644 --- a/monarch_hyperactor/src/v1/host_mesh.rs +++ b/monarch_hyperactor/src/v1/host_mesh.rs @@ -344,13 +344,14 @@ fn py_host_mesh_from_bytes(bytes: &Bound<'_, PyBytes>) -> PyResult { } #[pyfunction] -fn shutdown_local_host_mesh() -> PyResult { +fn shutdown_local_host_mesh() -> PyResult<()> { let agent = HOST_MESH_AGENT_FOR_HOST .get() .ok_or_else(|| PyException::new_err("No local host mesh to shutdown"))? .clone(); - PyPythonTask::new(async move { + // Block on the async shutdown operation + hyperactor::get_runtime().block_on(async move { // Create a temporary instance to send the shutdown message let temp_proc = hyperactor::Proc::local(); let (instance, _) = temp_proc @@ -365,8 +366,11 @@ fn shutdown_local_host_mesh() -> PyResult { .await .map_err(|e| PyException::new_err(e.to_string()))?; - Ok(()) - }) + Ok::<(), pyo3::PyErr>(()) + })?; + + // Exit the process + std::process::exit(0) } pub fn register_python_bindings(hyperactor_mod: &Bound<'_, PyModule>) -> PyResult<()> { diff --git a/python/monarch/_rust_bindings/monarch_hyperactor/v1/host_mesh.pyi b/python/monarch/_rust_bindings/monarch_hyperactor/v1/host_mesh.pyi index cef221754..ee57bd426 100644 --- a/python/monarch/_rust_bindings/monarch_hyperactor/v1/host_mesh.pyi +++ b/python/monarch/_rust_bindings/monarch_hyperactor/v1/host_mesh.pyi @@ -114,9 +114,12 @@ def bootstrap_host( """ ... -def shutdown_local_host_mesh() -> PythonTask[None]: +def shutdown_local_host_mesh() -> None: """ - Shutdown the local host mesh created by bootstrap_host(). + Shutdown the local host mesh created by bootstrap_host() and exit the process. + + This function blocks until shutdown completes. On successful shutdown, it + calls exit(0) and never returns. On failure, it raises an exception. Sends ShutdownHost message to the local host mesh agent with: - timeout: 10 seconds grace period before SIGTERM escalation @@ -124,5 +127,6 @@ def shutdown_local_host_mesh() -> PythonTask[None]: Raises: RuntimeError: If no local host mesh exists (bootstrap_host not called) + or if shutdown fails """ ... diff --git a/python/monarch/_src/actor/actor_mesh.py b/python/monarch/_src/actor/actor_mesh.py index 479b49dea..d8504153e 100644 --- a/python/monarch/_src/actor/actor_mesh.py +++ b/python/monarch/_src/actor/actor_mesh.py @@ -388,7 +388,9 @@ def shutdown_context() -> "Future[None]": shutdown_local_host_mesh, ) - return Future(coro=shutdown_local_host_mesh()) + # This function blocks and then exits the process + shutdown_local_host_mesh() + # Never reached except RuntimeError: # No local host mesh to shutdown pass