From b86fcc9ae4eba3d27ef384da1b3f776078c5a4f6 Mon Sep 17 00:00:00 2001 From: Dave Kwon Date: Fri, 19 Dec 2025 12:57:43 -0800 Subject: [PATCH] Fix pdeathsig logic for bootstrap Summary: On Kubernetes, the entry point command runs as PID 1, so a parent PID of 1 no longer implies that the parent has died and we have been reparented. Instead, record the parent PID before calling prctl() and exit if it has changed afterwards. Differential Revision: D89570365 --- hyperactor_mesh/src/bootstrap.rs | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/hyperactor_mesh/src/bootstrap.rs b/hyperactor_mesh/src/bootstrap.rs index 182159c4b..6873ae747 100644 --- a/hyperactor_mesh/src/bootstrap.rs +++ b/hyperactor_mesh/src/bootstrap.rs @@ -552,21 +552,30 @@ impl Bootstrap { pub fn install_pdeathsig_kill() -> io::Result<()> { #[cfg(target_os = "linux")] { + // SAFETY: `getppid()` is a simple libc syscall returning the + // parent PID; it has no side effects and does not touch memory. + let ppid_before = unsafe { libc::getppid() }; + // SAFETY: Calling into libc; does not dereference memory, just // asks the kernel to deliver SIGKILL on parent death. let rc = unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL as libc::c_int) }; if rc != 0 { return Err(io::Error::last_os_error()); } - } - // Race-close: if the parent died between our exec and prctl(), - // we won't get a signal, so detect that and exit now. - // - // If getppid() == 1, we've already been reparented (parent gone). - // SAFETY: `getppid()` is a simple libc syscall returning the - // parent PID; it has no side effects and does not touch memory. - if unsafe { libc::getppid() } == 1 { - std::process::exit(0); + + // Race-close: if the parent died between our exec and prctl(), + // we won't get a signal, so detect that and exit now. + // + // If the parent PID changed, the parent has died and we've been + // reparented. 
Note: We cannot assume ppid == 1 means the parent + // died, as in container environments (e.g., Kubernetes) the parent + // may legitimately run as PID 1. + // SAFETY: `getppid()` is a simple libc syscall returning the + // parent PID; it has no side effects and does not touch memory. + let ppid_after = unsafe { libc::getppid() }; + if ppid_before != ppid_after { + std::process::exit(0); + } } Ok(()) }