From 48b19604fda95ba1e2ddb164134f689154f2a713 Mon Sep 17 00:00:00 2001 From: "Dustin C. Hatch" Date: Wed, 8 Oct 2025 20:19:48 -0500 Subject: [PATCH] Do not replace current process with reboot command Instead of replacing the current process with the reboot command directly via `exec`, we need to run it in a child process and keep the current process running. The former method has the interesting side-effect of getting the machine into a state where it can never reboot: 1. When the reboot sentinel file appears, the coordinator acquires the lock and drains the node, then `exec`s the reboot command. 2. The DaemonSet pod goes into _Completed_ state once the reboot command finishes. If the reboot command starts the reboot process immediately, there is no issue, but if it starts a delayed reboot, trouble ensues. 3. After a timeout, Kubernetes restarts the DaemonSet pod, starting the coordinator process again. 4. The coordinator notices that the reboot sentinel already exists and immediately `exec`s the reboot command again. 5. The reboot command restarts the delayed reboot process, pushing the actual reboot time further into the future. 6. Return to step 2. To break this loop, someone needs to either remove the reboot sentinel file, letting the coordinator start up and run without doing anything, or forcibly reboot the node. We can avoid this loop by never exiting from the process managed by the pod. The reboot command runs and exits, but the parent process continues until it's signalled to stop. 
--- src/main.rs | 20 +++++++------------- tests/krc-it-test/main.rs | 2 +- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/src/main.rs b/src/main.rs index 3c2edfd..540267d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,8 +7,6 @@ mod lock; mod test; use std::ffi::OsString; -use std::io::ErrorKind; -use std::os::unix::process::CommandExt; use std::path::Path; use std::process::Command; use std::time::Duration; @@ -64,8 +62,10 @@ async fn inner_main( acquire_lock(client.clone(), &lock, &ctx).await; info!("Initiating node reboot"); - exec_cmd(ctx.reboot_cmd()); - unreachable!(); + if let Err(e) = run_cmd(ctx.reboot_cmd()) { + error!("Failed to run reboot command: {e}"); + } + Ok(()) } async fn acquire_lock(client: kube::Client, lock: &lock::Lock, ctx: &Context) { @@ -85,17 +85,11 @@ async fn acquire_lock(client: kube::Client, lock: &lock::Lock, ctx: &Context) { } } -fn exec_cmd(cmd: &[OsString]) { +fn run_cmd(cmd: &[OsString]) -> std::io::Result<()> { let program = &cmd[0]; let args = &cmd[1..]; - let error = Command::new(program).args(args).exec(); - let rc = match error.kind() { - ErrorKind::NotFound => 127, - ErrorKind::PermissionDenied => 126, - _ => 1, - }; - eprintln!("{}: {error}", program.to_string_lossy()); - std::process::exit(rc); + Command::new(program).args(args).spawn()?.wait()?; + Ok(()) } async fn release_lock(client: kube::Client, lock: &lock::Lock) { diff --git a/tests/krc-it-test/main.rs b/tests/krc-it-test/main.rs index 6fdcbc6..9d1b5a0 100644 --- a/tests/krc-it-test/main.rs +++ b/tests/krc-it-test/main.rs @@ -108,7 +108,7 @@ fn run_it( env.insert("REBOOT_LOCK_GROUP".into(), g.into()); } Command::new(EXE) - .args(["echo", "test success"]) + .args(["sh", "-c", "echo 'test success' & kill $PPID"]) .env_clear() .envs(&env) .stdin(Stdio::null())