Skip to content

Commit dc09a8f

Browse files
shayne-fletcher authored and meta-codesync[bot] committed
python: actor: reinstate this_host() shutdown
Summary: Reinstate `this_host` host-mesh shutdown (see D89478006 for why we turned it off).
Reviewed By: mariusae
Differential Revision: D89495607
fbshipit-source-id: 9573c51d37d4e3db12ccb1ecabcf550eb41b1cc6
1 parent 1baa637 commit dc09a8f

File tree

4 files changed

+68
-8
lines changed

4 files changed

+68
-8
lines changed

hyperactor_mesh/src/v1/host_mesh/mesh_agent.rs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@ use hyperactor::ActorHandle;
2020
use hyperactor::ActorId;
2121
use hyperactor::ActorRef;
2222
use hyperactor::Context;
23+
use hyperactor::HandleClient;
2324
use hyperactor::Handler;
2425
use hyperactor::Instance;
2526
use hyperactor::Named;
@@ -328,7 +329,7 @@ impl Handler<resource::GetRankStatus> for HostMeshAgent {
328329
}
329330
}
330331

331-
#[derive(Serialize, Deserialize, Debug, Named, Handler, RefClient)]
332+
#[derive(Serialize, Deserialize, Debug, Named, Handler, RefClient, HandleClient)]
332333
pub struct ShutdownHost {
333334
/// Grace window: send SIGTERM and wait this long before
334335
/// escalating.

monarch_hyperactor/src/v1/host_mesh.rs

Lines changed: 43 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,9 @@ use std::collections::HashMap;
1010
use std::ops::Deref;
1111
use std::path::PathBuf;
1212
use std::sync::OnceLock;
13+
use std::time::Duration;
1314

15+
use hyperactor::ActorHandle;
1416
use hyperactor::Instance;
1517
use hyperactor::Proc;
1618
use hyperactor_mesh::bootstrap::BootstrapCommand;
@@ -22,6 +24,8 @@ use hyperactor_mesh::v1::ProcMeshRef;
2224
use hyperactor_mesh::v1::host_mesh::HostMesh;
2325
use hyperactor_mesh::v1::host_mesh::HostMeshRef;
2426
use hyperactor_mesh::v1::host_mesh::mesh_agent::GetLocalProcClient;
27+
use hyperactor_mesh::v1::host_mesh::mesh_agent::HostMeshAgent;
28+
use hyperactor_mesh::v1::host_mesh::mesh_agent::ShutdownHostClient;
2529
use hyperactor_mesh::v1::proc_mesh::ProcRef;
2630
use ndslice::View;
2731
use ndslice::view::RankedSliceable;
@@ -253,6 +257,9 @@ impl PyHostMeshRefImpl {
253257
/// Static storage for the root client instance when using host-based bootstrap.
254258
static ROOT_CLIENT_INSTANCE_FOR_HOST: OnceLock<Instance<PythonActor>> = OnceLock::new();
255259

260+
/// Static storage for the host mesh agent created by bootstrap_host().
261+
static HOST_MESH_AGENT_FOR_HOST: OnceLock<ActorHandle<HostMeshAgent>> = OnceLock::new();
262+
256263
/// Bootstrap the client host and root client actor.
257264
///
258265
/// This creates a proper Host with BootstrapProcManager, spawns the root client
@@ -282,6 +289,9 @@ fn bootstrap_host(bootstrap_cmd: Option<PyBootstrapCommand>) -> PyResult<PyPytho
282289
.await
283290
.map_err(|e| PyException::new_err(e.to_string()))?;
284291

292+
// Store the agent for later shutdown
293+
HOST_MESH_AGENT_FOR_HOST.set(host_mesh_agent.clone()).ok(); // Ignore error if already set
294+
285295
let host_mesh_name = hyperactor_mesh::v1::Name::new_reserved("local").unwrap();
286296
let host_mesh = HostMeshRef::from_host_agent(host_mesh_name, host_mesh_agent.bind())
287297
.map_err(|e| PyException::new_err(e.to_string()))?;
@@ -333,6 +343,32 @@ fn py_host_mesh_from_bytes(bytes: &Bound<'_, PyBytes>) -> PyResult<PyHostMesh> {
333343
r.map(PyHostMesh::new_ref)
334344
}
335345

346+
#[pyfunction]
347+
fn shutdown_local_host_mesh() -> PyResult<PyPythonTask> {
348+
let agent = HOST_MESH_AGENT_FOR_HOST
349+
.get()
350+
.ok_or_else(|| PyException::new_err("No local host mesh to shutdown"))?
351+
.clone();
352+
353+
PyPythonTask::new(async move {
354+
// Create a temporary instance to send the shutdown message
355+
let temp_proc = hyperactor::Proc::local();
356+
let (instance, _) = temp_proc
357+
.instance("shutdown_requester")
358+
.map_err(|e| PyException::new_err(e.to_string()))?;
359+
360+
// Use same defaults as HostMesh::shutdown():
361+
// - MESH_TERMINATE_TIMEOUT = 10 seconds
362+
// - MESH_TERMINATE_CONCURRENCY = 16
363+
agent
364+
.shutdown_host(&instance, Duration::from_secs(10), 16)
365+
.await
366+
.map_err(|e| PyException::new_err(e.to_string()))?;
367+
368+
Ok(())
369+
})
370+
}
371+
336372
pub fn register_python_bindings(hyperactor_mod: &Bound<'_, PyModule>) -> PyResult<()> {
337373
let f = wrap_pyfunction!(py_host_mesh_from_bytes, hyperactor_mod)?;
338374
f.setattr(
@@ -348,6 +384,13 @@ pub fn register_python_bindings(hyperactor_mod: &Bound<'_, PyModule>) -> PyResul
348384
)?;
349385
hyperactor_mod.add_function(f2)?;
350386

387+
let f3 = wrap_pyfunction!(shutdown_local_host_mesh, hyperactor_mod)?;
388+
f3.setattr(
389+
"__module__",
390+
"monarch._rust_bindings.monarch_hyperactor.v1.host_mesh",
391+
)?;
392+
hyperactor_mod.add_function(f3)?;
393+
351394
hyperactor_mod.add_class::<PyHostMesh>()?;
352395
hyperactor_mod.add_class::<PyBootstrapCommand>()?;
353396
Ok(())

python/monarch/_rust_bindings/monarch_hyperactor/v1/host_mesh.pyi

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -113,3 +113,16 @@ def bootstrap_host(
113113
- `bootstrap_cmd`: The bootstrap command to use to bootstrap the host.
114114
"""
115115
...
116+
117+
def shutdown_local_host_mesh() -> PythonTask[None]:
118+
"""
119+
Shut down the local host mesh created by bootstrap_host().
120+
121+
Sends ShutdownHost message to the local host mesh agent with:
122+
- timeout: 10-second grace window after SIGTERM before escalating
123+
- max_in_flight: 16 concurrent child terminations
124+
125+
Raises:
126+
Exception: If no local host mesh exists (bootstrap_host not called)
127+
"""
128+
...

python/monarch/_src/actor/actor_mesh.py

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -380,15 +380,18 @@ def shutdown_context() -> "Future[None]":
380380
completion.
381381
"""
382382
from monarch._src.actor.future import Future
383+
from monarch._src.actor.v1 import enabled as v1_enabled
383384

384-
# TODO(shayne,2025-12-18): Since D89089836 we can't call shutdown
385-
# like this and doing so is causing runtime errors. This avoids
386-
# the error while I work out a better fix.
385+
if v1_enabled:
386+
try:
387+
from monarch._rust_bindings.monarch_hyperactor.v1.host_mesh import (
388+
shutdown_local_host_mesh,
389+
)
387390

388-
# client_host_ctx = _client_context.try_get()
389-
# if client_host_ctx is not None:
390-
# host_mesh = client_host_ctx.actor_instance.proc_mesh.host_mesh
391-
# return host_mesh.shutdown()
391+
return Future(coro=shutdown_local_host_mesh())
392+
except RuntimeError:
393+
# No local host mesh to shutdown
394+
pass
392395

393396
# Nothing to shutdown - return a completed future
394397
async def noop() -> None:

0 commit comments

Comments
 (0)