diff --git a/Cargo.lock b/Cargo.lock index 081ffe6be..a5ae4b97f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -427,7 +427,7 @@ version = "0.72.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" dependencies = [ - "bitflags", + "bitflags 2.11.1", "cexpr", "clang-sys", "itertools 0.13.0", @@ -439,6 +439,12 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.11.1" @@ -575,6 +581,17 @@ dependencies = [ "libbz2-rs-sys", ] +[[package]] +name = "capctl" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a6e71767585f51c2a33fed6d67147ec0343725fc3c03bf4b89fe67fede56aa5" +dependencies = [ + "bitflags 1.3.2", + "cfg-if", + "libc", +] + [[package]] name = "cassowary" version = "0.3.0" @@ -955,7 +972,7 @@ version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df" dependencies = [ - "bitflags", + "bitflags 2.11.1", "crossterm_winapi", "libc", "mio 0.8.11", @@ -971,7 +988,7 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" dependencies = [ - "bitflags", + "bitflags 2.11.1", "crossterm_winapi", "mio 1.2.0", "parking_lot", @@ -2302,7 +2319,7 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "533e68a5842e734946fe159fb03fc9bbbb254f590dd0d8ad321ae5ff7beca2c1" dependencies = [ - "bitflags", + "bitflags 2.11.1", "inotify-sys", "libc", ] @@ -2601,7 +2618,7 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"07293a4e297ac234359b510362495713f75ea345d5307140414f20c69ffeb087" dependencies = [ - "bitflags", + "bitflags 2.11.1", "libc", ] @@ -2837,7 +2854,7 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c" dependencies = [ - "bitflags", + "bitflags 2.11.1", "libc", "plain", "redox_syscall 0.7.4", @@ -3101,7 +3118,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46" dependencies = [ - "bitflags", + "bitflags 2.11.1", "cfg-if", "cfg_aliases", "libc", @@ -3123,7 +3140,7 @@ version = "8.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d3d07927151ff8575b7087f245456e549fea62edf0ec4e565a5ee50c8402bc3" dependencies = [ - "bitflags", + "bitflags 2.11.1", "fsevent-sys", "inotify", "kqueue", @@ -3141,7 +3158,7 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42b8cfee0e339a0337359f3c88165702ac6e600dc01c0cc9579a92d62b08477a" dependencies = [ - "bitflags", + "bitflags 2.11.1", ] [[package]] @@ -3793,6 +3810,7 @@ version = "0.0.0" dependencies = [ "anyhow", "base64 0.22.1", + "capctl", "hex", "landlock", "libc", @@ -4390,7 +4408,7 @@ version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad" dependencies = [ - "bitflags", + "bitflags 2.11.1", "memchr", "unicase", ] @@ -4576,7 +4594,7 @@ version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f44c9e68fd46eda15c646fbb85e1040b657a58cdc8c98db1d97a55930d991eef" dependencies = [ - "bitflags", + "bitflags 2.11.1", "cassowary", "compact_str", "crossterm 0.27.0", @@ -4596,7 +4614,7 @@ version = "11.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" dependencies = [ - "bitflags", + "bitflags 2.11.1", ] [[package]] @@ -4618,7 +4636,7 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags", + "bitflags 2.11.1", ] [[package]] @@ -4627,7 +4645,7 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f450ad9c3b1da563fb6948a8e0fb0fb9269711c9c73d9ea1de5058c79c8d643a" dependencies = [ - "bitflags", + "bitflags 2.11.1", ] [[package]] @@ -4843,7 +4861,7 @@ checksum = "afe62631a04a1f4d71a14b99505483b95ff97c503b67d876c042fce659186956" dependencies = [ "aes", "aws-lc-rs", - "bitflags", + "bitflags 2.11.1", "block-padding", "byteorder", "bytes", @@ -4964,7 +4982,7 @@ version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags", + "bitflags 2.11.1", "errno", "libc", "linux-raw-sys 0.4.15", @@ -4977,7 +4995,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags", + "bitflags 2.11.1", "errno", "libc", "linux-raw-sys 0.12.1", @@ -5189,7 +5207,7 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ - "bitflags", + "bitflags 2.11.1", "core-foundation", "core-foundation-sys", "libc", @@ -5681,7 +5699,7 @@ checksum = "aa003f0038df784eb8fecbbac13affe3da23b45194bd57dba231c8f48199c526" dependencies = [ "atoi", "base64 0.22.1", - "bitflags", + "bitflags 2.11.1", "byteorder", "bytes", "crc", @@ -5723,7 +5741,7 @@ checksum = "db58fcd5a53cf07c184b154801ff91347e4c30d17a3562a635ff028ad5deda46" 
dependencies = [ "atoi", "base64 0.22.1", - "bitflags", + "bitflags 2.11.1", "byteorder", "crc", "dotenvy", @@ -6416,7 +6434,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" dependencies = [ "base64 0.21.7", - "bitflags", + "bitflags 2.11.1", "bytes", "http", "http-body", @@ -6434,7 +6452,7 @@ version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" dependencies = [ - "bitflags", + "bitflags 2.11.1", "bytes", "futures-util", "http", @@ -6914,7 +6932,7 @@ version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ - "bitflags", + "bitflags 2.11.1", "hashbrown 0.15.5", "indexmap", "semver", @@ -7511,7 +7529,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", - "bitflags", + "bitflags 2.11.1", "indexmap", "log", "serde", diff --git a/architecture/sandbox.md b/architecture/sandbox.md index 2552304e1..293d21fc6 100644 --- a/architecture/sandbox.md +++ b/architecture/sandbox.md @@ -14,7 +14,12 @@ Each sandbox workload has two trust levels: | Agent child | Runs as an unprivileged user with filesystem, process, and network restrictions applied. | The supervisor keeps enough privilege to manage the sandbox, but the agent child -loses that privilege before user code runs. +loses that privilege before user code runs. On Linux, child setup clears the +capability bounding set during privilege drop so later execs cannot regain +container-granted capabilities. This is fail-closed: the supervisor retains +`CAP_SETPCAP` solely to perform the clear, and spawning the workload or SSH shell +aborts unless the bounding set ends up empty. 
An `EPERM` from the clear (`CAP_SETPCAP` unavailable) is tolerated +only when the set is already empty; any other outcome fails the spawn. ## Startup Flow diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index 16814784a..66d0d9d90 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -877,8 +877,6 @@ pub fn build_container_spec_with_token_and_gpu_devices( "NET_RAW".into(), // Not needed: the supervisor does not manipulate file capabilities. "SETFCAP".into(), - // Not needed: the supervisor does not manage its own capability bounding set. - "SETPCAP".into(), // Not needed: the supervisor does not call chroot(). "SYS_CHROOT".into(), ], @@ -899,13 +897,18 @@ pub fn build_container_spec_with_token_and_gpu_devices( // Without it the proxy cannot determine which binary made each outbound // connection and all traffic is denied. "DAC_READ_SEARCH".into(), + // Child setup clears the capability bounding set before exec, which + // requires CAP_SETPCAP in the supervisor until drop_privileges(). + "SETPCAP".into(), ], - // SETUID, SETGID, CHOWN, and FOWNER are intentionally kept from Podman's - // default set and not dropped: + // SETUID, SETGID, SETPCAP, CHOWN, and FOWNER are intentionally kept from + // Podman's default set and not dropped: // SETUID/SETGID – drop_privileges(): setuid()/setgid()/initgroups() to the // sandbox user. In rootless Podman cap_drop:ALL removes them // from the bounding set even though uid=0 owns the user // namespace — so we keep them by not dropping them explicitly. + // SETPCAP – drop_privileges(): clears the child capability + // bounding set before the sandbox user execs. // CHOWN – prepare_filesystem(): chown(path, uid, gid) on newly // created read_write directories so the sandbox user can // write to them. 
@@ -1451,12 +1454,14 @@ mod tests { added.contains(&"DAC_READ_SEARCH"), "missing DAC_READ_SEARCH" ); + assert!(added.contains(&"SETPCAP"), "missing SETPCAP"); // SETUID and SETGID are NOT in cap_add — they remain available from the // default bounding set because we no longer use cap_drop:ALL. Verify they - // are also not explicitly dropped. Similarly CHOWN and FOWNER must not be - // dropped because prepare_filesystem() calls chown() on newly created - // read_write directories before the supervisor drops privileges. + // are also not explicitly dropped. Similarly SETPCAP, CHOWN and FOWNER + // must not be dropped because child setup clears the bounding set and + // prepare_filesystem() calls chown() on newly created read_write + // directories before the supervisor drops privileges. let dropped: Vec<&str> = spec["cap_drop"] .as_array() .expect("cap_drop should be an array") @@ -1473,6 +1478,10 @@ mod tests { !dropped.contains(&"FOWNER"), "FOWNER must not be dropped (needed for chown on non-owned files)" ); + assert!( + !dropped.contains(&"SETPCAP"), + "SETPCAP must not be dropped (needed for child bounding-set clear)" + ); assert!( !dropped.contains(&"ALL"), "must not use cap_drop:ALL in rootless Podman" diff --git a/crates/openshell-supervisor-process/Cargo.toml b/crates/openshell-supervisor-process/Cargo.toml index b2dad859e..1163cc954 100644 --- a/crates/openshell-supervisor-process/Cargo.toml +++ b/crates/openshell-supervisor-process/Cargo.toml @@ -35,6 +35,7 @@ libc = "0.2" rustix = { workspace = true } [target.'cfg(target_os = "linux")'.dependencies] +capctl = "0.2.4" landlock = "0.4" seccompiler = "0.5" tempfile = "3" diff --git a/crates/openshell-supervisor-process/src/process.rs b/crates/openshell-supervisor-process/src/process.rs index 9f9fe1822..c1b6b4532 100644 --- a/crates/openshell-supervisor-process/src/process.rs +++ b/crates/openshell-supervisor-process/src/process.rs @@ -155,6 +155,46 @@ fn parse_pids_max(contents: &str) -> 
RuntimePidLimitStatus { } } +#[cfg(target_os = "linux")] +fn drop_capability_bounding_set() -> Result<()> { + let clear_result = capctl::caps::bounding::clear(); + let remaining = capctl::caps::bounding::probe(); + + validate_capability_bounding_set_clear( + clear_result, + remaining, + capctl::caps::bounding::clear_unknown, + ) +} + +#[cfg(target_os = "linux")] +fn validate_capability_bounding_set_clear( + clear_result: capctl::Result<()>, + remaining: capctl::caps::CapSet, + clear_unknown: impl FnOnce() -> capctl::Result<()>, +) -> Result<()> { + match clear_result { + Ok(()) if remaining.is_empty() => Ok(()), + Ok(()) => Err(miette::miette!( + "Failed to clear child capability bounding set: capabilities remain raised: {remaining:?}" + )), + Err(err) if err.code() == libc::EPERM && remaining.is_empty() => match clear_unknown() { + Ok(()) => { + debug!( + "CAP_SETPCAP is unavailable, but the child capability bounding set is already empty" + ); + Ok(()) + } + Err(unknown_err) => Err(miette::miette!( + "Failed to clear unknown child capability bounding set entries: {unknown_err}" + )), + }, + Err(err) => Err(miette::miette!( + "Failed to clear child capability bounding set: {err}" + )), + } +} + // Pins the pre-seccomp child mount namespace where supervisor identity sockets // are shadowed. Children enter it with setns before dropping privileges. 
#[cfg(target_os = "linux")] @@ -969,6 +1009,9 @@ pub fn drop_privileges(policy: &SandboxPolicy) -> Result<()> { )); } + #[cfg(target_os = "linux")] + drop_capability_bounding_set()?; + if user_name.is_some() { nix::unistd::setuid(user.uid).into_diagnostic()?; @@ -1083,6 +1126,101 @@ mod tests { ); } + #[cfg(target_os = "linux")] + fn capability_bounding_set_clear_available() -> bool { + capctl::caps::CapState::get_current() + .is_ok_and(|state| state.effective.has(capctl::caps::Cap::SETPCAP)) + || capctl::caps::bounding::probe().is_empty() + } + + #[test] + #[cfg(target_os = "linux")] + fn capability_bounding_set_clear_accepts_empty_eperm() { + let remaining = capctl::caps::CapSet::empty(); + + assert!( + validate_capability_bounding_set_clear( + Err(capctl::Error::from_code(libc::EPERM)), + remaining, + || Ok(()), + ) + .is_ok() + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn capability_bounding_set_clear_rejects_nonempty_eperm() { + let mut remaining = capctl::caps::CapSet::empty(); + remaining.add(capctl::caps::Cap::CHOWN); + + let result = validate_capability_bounding_set_clear( + Err(capctl::Error::from_code(libc::EPERM)), + remaining, + || panic!("unknown capabilities should not be checked when known caps remain"), + ); + + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Failed to clear child capability bounding set") + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn capability_bounding_set_clear_rejects_nonempty_success() { + let mut remaining = capctl::caps::CapSet::empty(); + remaining.add(capctl::caps::Cap::CHOWN); + + let result = validate_capability_bounding_set_clear(Ok(()), remaining, || { + panic!("unknown capabilities should not be checked when known caps remain") + }); + + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("capabilities remain raised") + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn 
capability_bounding_set_clear_rejects_unknown_eperm() { + let remaining = capctl::caps::CapSet::empty(); + + let result = validate_capability_bounding_set_clear( + Err(capctl::Error::from_code(libc::EPERM)), + remaining, + || Err(capctl::Error::from_code(libc::EPERM)), + ); + + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Failed to clear unknown child capability bounding set entries") + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn capability_probe_child() { + if std::env::var_os("OPENSHELL_TEST_PROBE_CHILD_CAPS").is_none() { + return; + } + + assert!( + capctl::caps::bounding::probe().is_empty(), + "child CapBnd should be empty after exec" + ); + } + #[test] fn drop_privileges_noop_when_no_user_or_group() { let policy = policy_with_process(ProcessPolicy { @@ -1130,7 +1268,67 @@ mod tests { run_as_group: Some(current_group.name), }); - assert!(drop_privileges(&policy).is_ok()); + let result = drop_privileges(&policy); + + #[cfg(target_os = "linux")] + { + if capability_bounding_set_clear_available() { + assert!(result.is_ok(), "drop_privileges failed: {result:?}"); + } else { + let msg = format!("{}", result.unwrap_err()); + assert!( + msg.contains("Failed to clear child capability bounding set"), + "unexpected failure: {msg}" + ); + } + } + + #[cfg(not(target_os = "linux"))] + assert!(result.is_ok()); + } + + #[test] + #[cfg(target_os = "linux")] + #[allow(unsafe_code)] + fn drop_privileges_clears_bounding_set_for_spawned_child_when_permitted() { + use std::os::unix::process::CommandExt; + + if !capability_bounding_set_clear_available() { + eprintln!( + "skipping: CAP_SETPCAP is not effective and the capability bounding set is nonempty" + ); + return; + } + + let current_group = Group::from_gid(nix::unistd::getegid()) + .expect("getgrgid") + .expect("current group entry"); + + let policy = policy_with_process(ProcessPolicy { + run_as_user: None, + run_as_group: Some(current_group.name), + }); + + let mut cmd = 
std::process::Command::new(std::env::current_exe().expect("current exe")); + cmd.arg("capability_probe_child") + .arg("--nocapture") + .env("OPENSHELL_TEST_PROBE_CHILD_CAPS", "1") + .stdin(StdStdio::null()) + .stdout(StdStdio::piped()) + .stderr(StdStdio::piped()); + + unsafe { + cmd.pre_exec(move || { + drop_privileges(&policy).map_err(|err| std::io::Error::other(err.to_string())) + }); + } + + let output = cmd.output().expect("spawn child status probe"); + assert!( + output.status.success(), + "status probe failed: {}", + String::from_utf8_lossy(&output.stderr) + ); } #[test]