From 1cdfef5bf1a78a4ab533377daea209b8ea7fa939 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 5 Dec 2025 11:06:10 -0800 Subject: [PATCH 1/7] rename DumpSetup to DebugCollector --- .../src/{dump_setup.rs => debug_collector.rs} | 66 +++++++++---------- ..._setup_task.rs => debug_collector_task.rs} | 23 +++---- sled-agent/config-reconciler/src/handle.rs | 6 +- sled-agent/config-reconciler/src/lib.rs | 4 +- .../config-reconciler/src/reconciler_task.rs | 2 +- .../src/reconciler_task/external_disks.rs | 8 +-- .../src/reconciler_task/zones.rs | 2 +- sled-agent/src/zone_bundle.rs | 9 +-- sled-agent/types/src/support_bundle.rs | 6 +- 9 files changed, 64 insertions(+), 62 deletions(-) rename sled-agent/config-reconciler/src/{dump_setup.rs => debug_collector.rs} (97%) rename sled-agent/config-reconciler/src/{dump_setup_task.rs => debug_collector_task.rs} (92%) diff --git a/sled-agent/config-reconciler/src/dump_setup.rs b/sled-agent/config-reconciler/src/debug_collector.rs similarity index 97% rename from sled-agent/config-reconciler/src/dump_setup.rs rename to sled-agent/config-reconciler/src/debug_collector.rs index 91726253cca..8d41ff5de3d 100644 --- a/sled-agent/config-reconciler/src/dump_setup.rs +++ b/sled-agent/config-reconciler/src/debug_collector.rs @@ -186,7 +186,7 @@ trait GetMountpoint: AsRef { } #[derive(Debug)] -enum DumpSetupCmd { +enum DebugCollectorCmd { ArchiveFormerZoneRoot { zone_root: Utf8PathBuf, zone_name: String, @@ -200,7 +200,7 @@ enum DumpSetupCmd { }, } -struct DumpSetupWorker { +struct DebugCollectorWorker { core_dataset_names: Vec, debug_dataset_names: Vec, @@ -215,32 +215,32 @@ struct DumpSetupWorker { savecored_slices: HashSet, log: Logger, - rx: Receiver, + rx: Receiver, coredumpadm_invoker: Box, zfs_invoker: Box, zone_invoker: Box, } -pub struct DumpSetup { - tx: tokio::sync::mpsc::Sender, +pub struct DebugCollector { + tx: tokio::sync::mpsc::Sender, mount_config: Arc, _poller: tokio::task::JoinHandle<()>, log: Logger, } -impl DumpSetup { +impl DebugCollector { pub fn new(log: &Logger, mount_config: Arc) -> Self { let (tx, rx) = tokio::sync::mpsc::channel(16); - let worker = DumpSetupWorker::new( + let worker = DebugCollectorWorker::new( Box::new(RealCoreDumpAdm {}), Box::new(RealZfs {}), Box::new(RealZone {}), - log.new(o!("component" => "DumpSetup-worker")), + log.new(o!("component" => "DebugCollector-worker")), rx, ); let _poller = tokio::spawn(async move { worker.poll_file_archival().await }); - let log = log.new(o!("component" => "DumpSetup")); + let log = log.new(o!("component" => "DebugCollector")); Self { tx, mount_config, _poller, log } } @@ -249,7 +249,7 @@ impl DumpSetup { /// /// This function returns only once this request has been handled, which /// can be used as a signal by callers that any "old disks" are no longer - /// being used by [DumpSetup]. + /// being used by [DebugCollector]. 
+    /// being used by [DebugCollector].
pub async fn update_dumpdev_setup( &self, disks: impl Iterator, @@ -322,7 +322,7 @@ impl DumpSetup { let (tx, rx) = oneshot::channel(); if let Err(err) = self .tx - .send(DumpSetupCmd::UpdateDumpdevSetup { + .send(DebugCollectorCmd::UpdateDumpdevSetup { dump_slices: m2_dump_slices, debug_datasets: u2_debug_datasets, core_datasets: m2_core_datasets, @@ -330,11 +330,11 @@ impl DumpSetup { }) .await { - error!(log, "DumpSetup channel closed: {:?}", err.0); + error!(log, "DebugCollector channel closed: {:?}", err.0); }; if let Err(err) = rx.await { - error!(log, "DumpSetup failed to await update"; "err" => ?err); + error!(log, "DebugCollector failed to await update"; "err" => ?err); } } @@ -383,7 +383,7 @@ impl DumpSetup { info!(log, "requesting archive of former zone root"); let zone_root = zone_root.to_owned(); let zone_name = file_name.to_string(); - let cmd = DumpSetupCmd::ArchiveFormerZoneRoot { + let cmd = DebugCollectorCmd::ArchiveFormerZoneRoot { zone_root, zone_name, completion_tx, @@ -392,7 +392,7 @@ impl DumpSetup { error!( log, "failed to request archive of former zone root"; - "error" => "DumpSetup channel closed" + "error" => "DebugCollector channel closed" ); } } @@ -603,13 +603,13 @@ fn safe_to_delete(path: &Utf8Path, meta: &std::fs::Metadata) -> bool { return true; } -impl DumpSetupWorker { +impl DebugCollectorWorker { fn new( coredumpadm_invoker: Box, zfs_invoker: Box, zone_invoker: Box, log: Logger, - rx: Receiver, + rx: Receiver, ) -> Self { Self { core_dataset_names: vec![], @@ -630,7 +630,7 @@ impl DumpSetupWorker { } async fn poll_file_archival(mut self) { - info!(self.log, "DumpSetup poll loop started."); + info!(self.log, "DebugCollector poll loop started."); // A oneshot which helps callers track when updates have propagated. // @@ -642,7 +642,7 @@ impl DumpSetupWorker { loop { match tokio::time::timeout(ARCHIVAL_INTERVAL, self.rx.recv()).await { - Ok(Some(DumpSetupCmd::UpdateDumpdevSetup { + Ok(Some(DebugCollectorCmd::UpdateDumpdevSetup { dump_slices, debug_datasets, core_datasets, @@ -656,7 +656,7 @@ impl DumpSetupWorker { core_datasets, ); } - Ok(Some(DumpSetupCmd::ArchiveFormerZoneRoot { + Ok(Some(DebugCollectorCmd::ArchiveFormerZoneRoot { zone_root, zone_name, completion_tx, @@ -1586,7 +1586,7 @@ mod tests { ); const NOT_MOUNTED_INTERNAL: &str = "oxi_acab2069-6e63-6c75-de73-20c06c756db0"; - let mut worker = DumpSetupWorker::new( + let mut worker = DebugCollectorWorker::new( Box::::default(), Box::new(FakeZfs { zpool_props: [( @@ -1641,7 +1641,7 @@ mod tests { name: ZpoolName::from_str(ERROR_INTERNAL).unwrap(), }; const ZPOOL_MNT: &str = "/path/to/internal/zpool"; - let mut worker = DumpSetupWorker::new( + let mut worker = DebugCollectorWorker::new( Box::::default(), Box::new(FakeZfs { zpool_props: [ @@ -1732,7 +1732,7 @@ mod tests { let logctx = omicron_test_utils::dev::test_setup_log( "test_savecore_and_dumpadm_not_called_when_occupied_and_no_dir", ); - let mut worker = DumpSetupWorker::new( + let mut worker = DebugCollectorWorker::new( Box::::default(), Box::::default(), Box::::default(), @@ -1760,7 +1760,7 @@ mod tests { let logctx = omicron_test_utils::dev::test_setup_log( "test_dumpadm_called_when_vacant_slice_but_no_dir", ); - let mut worker = DumpSetupWorker::new( + let mut worker = DebugCollectorWorker::new( Box::::default(), Box::::default(), Box::::default(), @@ -1791,7 +1791,7 @@ mod tests { const MOUNTED_EXTERNAL: &str = "oxp_446f6e74-4469-6557-6f6e-646572696e67"; const ZPOOL_MNT: &str = "/path/to/external/zpool"; - let mut worker = 
DumpSetupWorker::new( + let mut worker = DebugCollectorWorker::new( Box::::default(), Box::new(FakeZfs { zpool_props: [( @@ -1853,7 +1853,7 @@ mod tests { "oxi_474e554e-6174-616c-6965-4e677579656e"; const MOUNTED_EXTERNAL: &str = "oxp_446f6e74-4469-6557-6f6e-646572696e67"; - let mut worker = DumpSetupWorker::new( + let mut worker = DebugCollectorWorker::new( Box::::default(), Box::new(FakeZfs { zpool_props: [ @@ -1981,15 +1981,15 @@ mod tests { } } - async fn new_dump_setup_worker( + async fn new_debug_collector_worker( &self, used: u64, available: u64, - ) -> DumpSetupWorker { + ) -> DebugCollectorWorker { let tempdir_path = self.tempdir.path().to_string(); const MOUNTED_EXTERNAL: &str = "oxp_446f6e74-4469-6557-6f6e-646572696e67"; - let mut worker = DumpSetupWorker::new( + let mut worker = DebugCollectorWorker::new( Box::::default(), Box::new(FakeZfs { zpool_props: [ @@ -2190,7 +2190,7 @@ mod tests { USED, CAPACITY, ); - let worker = files.new_dump_setup_worker(USED, AVAILABLE).await; + let worker = files.new_debug_collector_worker(USED, AVAILABLE).await; // Before we cleanup: All files in "debug" exist files.check_all_files_exist(); @@ -2232,7 +2232,7 @@ mod tests { USED, CAPACITY, ); - let worker = files.new_dump_setup_worker(USED, AVAILABLE).await; + let worker = files.new_debug_collector_worker(USED, AVAILABLE).await; // Before we cleanup: All files in "debug" exist files.check_all_files_exist(); @@ -2290,7 +2290,7 @@ mod tests { USED, CAPACITY, ); - let worker = files.new_dump_setup_worker(USED, AVAILABLE).await; + let worker = files.new_debug_collector_worker(USED, AVAILABLE).await; // Before we cleanup: All files in "debug" exist files.check_all_files_exist(); @@ -2351,7 +2351,7 @@ mod tests { USED, CAPACITY, ); - let worker = files.new_dump_setup_worker(USED, AVAILABLE).await; + let worker = files.new_debug_collector_worker(USED, AVAILABLE).await; // Before we cleanup: All files in "debug" exist files.check_all_files_exist(); diff --git a/sled-agent/config-reconciler/src/dump_setup_task.rs b/sled-agent/config-reconciler/src/debug_collector_task.rs similarity index 92% rename from sled-agent/config-reconciler/src/dump_setup_task.rs rename to sled-agent/config-reconciler/src/debug_collector_task.rs index 5b03194b565..a82fb28c5e4 100644 --- a/sled-agent/config-reconciler/src/dump_setup_task.rs +++ b/sled-agent/config-reconciler/src/debug_collector_task.rs @@ -6,7 +6,7 @@ //! response to changes in available disks. use crate::InternalDisksReceiver; -use crate::dump_setup::DumpSetup; +use crate::debug_collector::DebugCollector; use camino::Utf8PathBuf; use debug_ignore::DebugIgnore; use sled_storage::config::MountConfig; @@ -33,16 +33,16 @@ pub(crate) fn spawn( // to enqueue the request or for the request to complete. let (archive_tx, archive_rx) = mpsc::channel(1); - let dump_setup_task = DumpSetupTask { + let debug_collector_task = DebugCollectorTask { internal_disks_rx, external_disks_rx, archive_rx, - dump_setup: DumpSetup::new(base_log, mount_config), + debug_collector: DebugCollector::new(base_log, mount_config), last_disks_used: HashSet::new(), - log: base_log.new(slog::o!("component" => "DumpSetupTask")), + log: base_log.new(slog::o!("component" => "DebugCollectorTask")), }; - tokio::spawn(dump_setup_task.run()); + tokio::spawn(debug_collector_task.run()); FormerZoneRootArchiver { log: DebugIgnore( @@ -52,7 +52,7 @@ pub(crate) fn spawn( } } -struct DumpSetupTask { +struct DebugCollectorTask { // Input channels on which we receive updates about disk changes. 
internal_disks_rx: InternalDisksReceiver, external_disks_rx: watch::Receiver>, @@ -60,15 +60,16 @@ struct DumpSetupTask { archive_rx: mpsc::Receiver, // Invokes dumpadm(8) and savecore(8) when new disks are encountered - dump_setup: DumpSetup, + debug_collector: DebugCollector, - // Set of internal + external disks we most recently passed to `dump_setup`. + // Set of internal + external disks we most recently passed to the + // Debug Collector last_disks_used: HashSet, log: Logger, } -impl DumpSetupTask { +impl DebugCollectorTask { async fn run(mut self) { self.update_setup_if_needed().await; @@ -119,7 +120,7 @@ impl DumpSetupTask { completion_tx } = request; self - .dump_setup + .debug_collector .archive_former_zone_root(&path, completion_tx) .await; } @@ -138,7 +139,7 @@ impl DumpSetupTask { .collect::>(); if disks_avail != self.last_disks_used { - self.dump_setup.update_dumpdev_setup(disks_avail.iter()).await; + self.debug_collector.update_dumpdev_setup(disks_avail.iter()).await; self.last_disks_used = disks_avail; } } diff --git a/sled-agent/config-reconciler/src/handle.rs b/sled-agent/config-reconciler/src/handle.rs index 4851c06c1f8..428abb5d5a0 100644 --- a/sled-agent/config-reconciler/src/handle.rs +++ b/sled-agent/config-reconciler/src/handle.rs @@ -50,8 +50,8 @@ use crate::SledAgentFacilities; use crate::TimeSyncStatus; use crate::dataset_serialization_task::DatasetTaskHandle; use crate::dataset_serialization_task::NestedDatasetMountError; -use crate::dump_setup_task; -use crate::dump_setup_task::FormerZoneRootArchiver; +use crate::debug_collector_task; +use crate::debug_collector_task::FormerZoneRootArchiver; use crate::internal_disks::InternalDisksReceiver; use crate::ledger::CurrentSledConfig; use crate::ledger::LedgerTaskHandle; @@ -136,7 +136,7 @@ impl ConfigReconcilerHandle { // Spawn the task that manages dump devices. let (external_disks_tx, external_disks_rx) = watch::channel(HashSet::new()); - let former_zone_root_archiver = dump_setup_task::spawn( + let former_zone_root_archiver = debug_collector_task::spawn( internal_disks_rx.clone(), external_disks_rx, Arc::clone(&mount_config), diff --git a/sled-agent/config-reconciler/src/lib.rs b/sled-agent/config-reconciler/src/lib.rs index 22e0d4dc82f..a56d36dcda3 100644 --- a/sled-agent/config-reconciler/src/lib.rs +++ b/sled-agent/config-reconciler/src/lib.rs @@ -46,9 +46,9 @@ //! [`ConfigReconcilerHandle::inventory()`]. 
mod dataset_serialization_task; +mod debug_collector; +mod debug_collector_task; mod disks_common; -mod dump_setup; -mod dump_setup_task; mod handle; mod host_phase_2; mod internal_disks; diff --git a/sled-agent/config-reconciler/src/reconciler_task.rs b/sled-agent/config-reconciler/src/reconciler_task.rs index 1277bd73f66..f63a09a4885 100644 --- a/sled-agent/config-reconciler/src/reconciler_task.rs +++ b/sled-agent/config-reconciler/src/reconciler_task.rs @@ -42,7 +42,7 @@ use crate::InternalDisksReceiver; use crate::SledAgentArtifactStore; use crate::TimeSyncConfig; use crate::dataset_serialization_task::DatasetTaskHandle; -use crate::dump_setup_task::FormerZoneRootArchiver; +use crate::debug_collector_task::FormerZoneRootArchiver; use crate::host_phase_2::BootPartitionReconciler; use crate::ledger::CurrentSledConfig; use crate::raw_disks::RawDisksReceiver; diff --git a/sled-agent/config-reconciler/src/reconciler_task/external_disks.rs b/sled-agent/config-reconciler/src/reconciler_task/external_disks.rs index c794c1dc388..5574af8ee7e 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/external_disks.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/external_disks.rs @@ -45,9 +45,9 @@ use std::sync::Arc; use std::sync::OnceLock; use tokio::sync::watch; +use crate::debug_collector_task::FormerZoneRootArchiver; use crate::disks_common::MaybeUpdatedDisk; use crate::disks_common::update_properties_from_raw_disk; -use crate::dump_setup_task::FormerZoneRootArchiver; use camino::Utf8PathBuf; use illumos_utils::zfs::Mountpoint; @@ -238,7 +238,7 @@ pub(super) struct ExternalDisks { currently_managed_zpools_tx: watch::Sender>, // Output channel for the raw disks we're managing. This is only consumed - // within this crate by `DumpSetupTask` (for managing dump devices). + // within this crate by `DebugCollectorTask` (for managing dump devices). external_disks_tx: watch::Sender>, // For requesting archival of former zone root directories. @@ -501,8 +501,8 @@ impl ExternalDisks { // Update the output channels now. This is important to do before // cleaning up former zone root datasets because that step will require - // that the archival task (DumpSetup) has seen the new disks and added - // any debug datasets found on them. + // that the archival task (DebugCollector) has seen the new disks and + // added any debug datasets found on them. self.update_output_watch_channels(); // For any newly-adopted disks, clean up any former zone root datasets diff --git a/sled-agent/config-reconciler/src/reconciler_task/zones.rs b/sled-agent/config-reconciler/src/reconciler_task/zones.rs index 958c507286f..ebd18f84345 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/zones.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/zones.rs @@ -11,7 +11,7 @@ use crate::InternalDisks; use crate::ResolverStatusExt; use crate::SledAgentFacilities; use crate::TimeSyncConfig; -use crate::dump_setup_task::FormerZoneRootArchiver; +use crate::debug_collector_task::FormerZoneRootArchiver; use camino::Utf8PathBuf; use futures::FutureExt as _; use futures::future; diff --git a/sled-agent/src/zone_bundle.rs b/sled-agent/src/zone_bundle.rs index 86c854d6445..29c095cab24 100644 --- a/sled-agent/src/zone_bundle.rs +++ b/sled-agent/src/zone_bundle.rs @@ -670,7 +670,8 @@ async fn create_snapshot( // A key feature of the zone-bundle process is that we pull all the log files // for a zone. This is tricky. 
The logs are both being written to by the // programs we're interested in, and also potentially being rotated by `logadm`, -// and / or archived out to the U.2s through the code in `crate::dump_setup`. +// and / or archived out to the U.2s through the code in +// `crate::debug_collector`. // // We need to capture all these logs, while avoiding inconsistent state (e.g., a // missing log message that existed when the bundle was created) and also @@ -1033,8 +1034,8 @@ async fn create( // // Both of these are dynamic. The current log file is likely being written // by the service itself, and `logadm` may also be rotating files. At the - // same time, the log-archival process in `dump_setup.rs` may be copying - // these out to the U.2s, after which it deletes those on the zone + // same time, the log-archival process in `debug_collector.rs` may be + // copying these out to the U.2s, after which it deletes those on the zone // filesystem itself. // // To avoid various kinds of corruption, such as a bad tarball or missing @@ -1261,7 +1262,7 @@ async fn find_archived_log_files<'a, T: Iterator>( } } } else { - // The logic in `dump_setup` picks some U.2 in which to start + // The logic in `debug_collector` picks some U.2 in which to start // archiving logs, and thereafter tries to keep placing new ones // there, subject to space constraints. It's not really an error for // there to be no entries for the named zone in any particular U.2 diff --git a/sled-agent/types/src/support_bundle.rs b/sled-agent/types/src/support_bundle.rs index ab31feb735d..42fff3374bc 100644 --- a/sled-agent/types/src/support_bundle.rs +++ b/sled-agent/types/src/support_bundle.rs @@ -13,9 +13,9 @@ // | | This is a per-bundle nested dataset // | This is a Debug dataset // -// NOTE: The "DumpSetupWorker" has been explicitly configured to ignore these files, so they are -// not removed. If the files used here change in the future, DumpSetupWorker should also be -// updated. +// NOTE: The DebugCollector has been explicitly configured to ignore these +// files, so they are not removed. If the files used here change in the future, +// DebugCollector should also be updated. pub const BUNDLE_FILE_NAME: &str = "bundle.zip"; pub const BUNDLE_TMP_FILE_NAME: &str = "bundle.zip.tmp"; From 5019f2a98fe933d8cf7b3d4052753807536faea4 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 5 Dec 2025 15:32:24 -0800 Subject: [PATCH 2/7] better document DebugCollector internals --- .../config-reconciler/src/debug_collector.rs | 105 +++++++++++++++++- 1 file changed, 103 insertions(+), 2 deletions(-) diff --git a/sled-agent/config-reconciler/src/debug_collector.rs b/sled-agent/config-reconciler/src/debug_collector.rs index 8d41ff5de3d..f108bcb0a98 100644 --- a/sled-agent/config-reconciler/src/debug_collector.rs +++ b/sled-agent/config-reconciler/src/debug_collector.rs @@ -118,28 +118,75 @@ use tokio::sync::mpsc::Receiver; use tokio::sync::oneshot; use zone::{Zone, ZoneError}; +// Names of ZFS dataset properties. These are stable and documented in zfs(1M). + const ZFS_PROP_USED: &str = "used"; const ZFS_PROP_AVAILABLE: &str = "available"; +// Parameters related to management of storage on debug datasets + +/// Threshold of percent utilization for debug datasets above which we prefer to +/// avoid using this dataset if others are available +/// +/// This is also the target utilization when cleaning up old data. We'll delete +/// old data until a dataset reaches this level. 
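+///
+/// For example, on a debug dataset with 1000 GiB of capacity, we would prefer
+/// a different dataset (if one is available) once more than 700 GiB is in
+/// use, and a cleanup pass would delete the oldest archived files until usage
+/// falls back below 700 GiB.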
const DATASET_USAGE_PERCENT_CHOICE: u64 = 70;
+
+/// Threshold of percent utilization for debug datasets above which we will try
+/// to switch to a different debug dataset or else trigger cleanup of old data
 const DATASET_USAGE_PERCENT_CLEANUP: u64 = 80;
 
+/// How frequently we move debug data (cores, log files, etc.) from their
+/// origins into debug datasets for long-term storage
 const ARCHIVAL_INTERVAL: Duration = Duration::from_secs(300);
 
-// we sure are passing a lot of Utf8PathBufs around, let's be careful about it
+// Newtypes for distinguishing different types of filesystem paths
+
+/// Filesystem path to a block device that can be used as a dump device
+///
+/// This is generally a slice on an internal (M.2) device.
 #[derive(AsRef, Clone, Debug, Eq, From, Hash, Ord, PartialEq, PartialOrd)]
 struct DumpSlicePath(Utf8PathBuf);
+
+/// Filesystem path to the mountpoint of a ZFS dataset that's intended for
+/// storing debug data for the long term
+///
+/// This is generally on a removable (U.2) device.
 #[derive(AsRef, Clone, Debug, Eq, From, Hash, Ord, PartialEq, PartialOrd)]
 struct DebugDataset(Utf8PathBuf);
+
+/// Filesystem path to the mountpoint of a ZFS dataset that's available as the
+/// first place that user process core dumps get written
+///
+/// This is generally on an internal (M.2) device.
 #[derive(AsRef, Clone, Debug, Eq, From, Hash, Ord, PartialEq, PartialOrd)]
 struct CoreDataset(Utf8PathBuf);
 
+// Types that describe ZFS datasets that are used to store different kinds of
+// debug data
+
+/// Identifies a ZFS dataset to which user process core dumps may be written
+///
+/// Equivalently, identifies a pool on which there exists such a dataset. These
+/// are equivalent because there is exactly one such dataset on any pool and it
+/// has a well-known name within the pool. See the impl of `GetMountpoint` for
+/// what it is.
+///
+/// This pool is generally on an M.2 device.
 #[derive(AsRef, Clone, Debug, From)]
 pub(super) struct CoreZpool {
     mount_config: Arc,
     name: ZpoolName,
 }
 
+/// Identifies a ZFS dataset that's intended for long-term storage of debug data
+///
+/// Equivalently, identifies a pool on which there exists such a dataset. These
+/// are equivalent because there is exactly one such dataset on any pool and it
+/// has a well-known name within the pool. See the impl of `GetMountpoint` for
+/// what it is.
+///
+/// This pool is generally on a U.2 device.
 #[derive(AsRef, Clone, Debug, From)]
 pub(super) struct DebugZpool {
     mount_config: Arc,
@@ -161,7 +208,8 @@ impl GetMountpoint for CoreZpool {
     }
 }
 
-// only want to access these directories after they're mounted!
+/// GetMountpoint provides a common interface for getting the mountpoint of a
+/// `DebugZpool` or `CoreZpool` dataset based on just the pool that it's on.
 trait GetMountpoint: AsRef {
     type NewType: From;
     const MOUNTPOINT: &'static str;
@@ -172,6 +220,8 @@ trait GetMountpoint: AsRef {
         &self,
         invoker: &dyn ZfsInvoker,
     ) -> Result, ZfsGetError> {
+        // It's important that we only access this directory if it's mounted.
+        // If not, return `None`.
         if invoker.zfs_get_prop(&self.as_ref().to_string(), "mounted")?
             == "yes"
         {
             Ok(Some(Self::NewType::from(invoker.mountpoint(
@@ -185,13 +235,25 @@ trait GetMountpoint: AsRef {
     }
 }
 
+/// Actor-style commands sent to the DebugCollectorWorker
+///
+/// These are sent from different places.
 #[derive(Debug)]
 enum DebugCollectorCmd {
+    /// Archive logs and other debug data from directory `zone_root`, which
+    /// corresponds to the root of a previously-running zone called `zone_name`.
+    /// Reply on `completion_tx` when finished.
+    ///
+    /// This message is sent from the DebugCollectorTask (outside this module).
     ArchiveFormerZoneRoot {
         zone_root: Utf8PathBuf,
         zone_name: String,
         completion_tx: oneshot::Sender<()>,
     },
+
+    /// Update the DebugCollector with details about the currently available
+    /// dump slices, debug datasets, and core datasets. (See the corresponding
+    /// types above.) Reply on `update_complete_tx` when finished.
     UpdateDumpdevSetup {
         dump_slices: Vec,
         debug_datasets: Vec,
         core_datasets: Vec,
         update_complete_tx: oneshot::Sender<()>,
     },
 }
 
+/// Operates the debug collector:
+///
+/// - receives and processes `DebugCollectorCmd` commands (see above)
+/// - configures dumpadm(8), savecore(8), and coreadm(8)
+/// - periodically archives log files and other debug data from running zones
+///
+/// This runs in its own tokio task. More precisely, poll_file_archival() runs
+/// in its own tokio task and does all these things.
 struct DebugCollectorWorker {
+    /// list of ZFS datasets that can be used for saving process core dumps
     core_dataset_names: Vec,
+    /// list of ZFS datasets that can be used for long-term storage of
+    /// debug data
     debug_dataset_names: Vec,
 
+    /// currently-chosen dump device
     chosen_dump_slice: Option,
+    /// currently-chosen directory for long-term storage of new debug data
     chosen_debug_dir: Option,
+    /// currently-chosen directory for process core dumps
+    /// (before being moved to longer-term storage)
     chosen_core_dir: Option,
 
+    /// list of dump devices available
     known_dump_slices: Vec,
+    /// list of directories available for long-term storage of debug data
     known_debug_dirs: Vec,
+    /// list of directories available for process core dumps
     known_core_dirs: Vec,
 
+    /// list of dump devices that once contained a crash dump that has since
+    /// been saved to a debug directory
     savecored_slices: HashSet,
 
     log: Logger,
+
+    /// channel for receiving commands to update configuration, etc.
     rx: Receiver,
+
+    // helpers for invoking system functionality
+    // (abstracted behind traits for testing)
     coredumpadm_invoker: Box,
     zfs_invoker: Box,
     zone_invoker: Box,
 }
 
+/// "External" handle to the debug collector
+///
+/// The DebugCollectorTask (a tiny task that passes information from the rest of
+/// sled agent to this subsystem) has this handle and uses it to send commands to
+/// the DebugCollectorWorker.
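+///
+/// Each command carries a oneshot sender that the worker signals when it has
+/// finished, so a round-trip through this handle looks roughly like the
+/// following sketch (illustrative, not a doctest):
+///
+/// ```text
+/// let (tx, rx) = oneshot::channel();
+/// handle.tx.send(DebugCollectorCmd::UpdateDumpdevSetup {
+///     dump_slices,
+///     debug_datasets,
+///     core_datasets,
+///     update_complete_tx: tx,
+/// }).await;
+/// rx.await; // returns once the worker has finished applying the update
+/// ```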
pub struct DebugCollector { tx: tokio::sync::mpsc::Sender, mount_config: Arc, @@ -411,6 +503,7 @@ enum ZfsGetError { Parse(#[from] std::num::ParseIntError), } +/// Helper for invoking coreadm(8) and dumpadm(8), abstracted out for tests #[async_trait] trait CoreDumpAdmInvoker { fn coreadm(&self, core_dir: &Utf8PathBuf) -> Result<(), ExecutionError>; @@ -421,6 +514,7 @@ trait CoreDumpAdmInvoker { ) -> Result, ExecutionError>; } +/// Helper for interacting with ZFS filesystems, abstracted out for tests trait ZfsInvoker { fn zfs_get_prop( &self, @@ -459,6 +553,7 @@ trait ZfsInvoker { ) -> Utf8PathBuf; } +/// Helper for listing currently-running zones on the system #[async_trait] trait ZoneInvoker { async fn get_zones(&self) -> Result, ArchiveLogsError>; @@ -584,6 +679,8 @@ impl ZoneInvoker for RealZone { } } +/// Returns whether a given file on a debug dataset can safely be deleted as +/// part of cleaning up that dataset fn safe_to_delete(path: &Utf8Path, meta: &std::fs::Metadata) -> bool { if !meta.is_file() { // Ignore non-files (e.g. directories, symlinks) @@ -629,6 +726,7 @@ impl DebugCollectorWorker { } } + /// Runs the body of the DebugCollector async fn poll_file_archival(mut self) { info!(self.log, "DebugCollector poll loop started."); @@ -640,6 +738,9 @@ impl DebugCollectorWorker { let mut evaluation_and_archiving_complete_tx = None; loop { + // Wait for either: + // - an external command to arrive on `self.rx` + // - a timer to fire for periodic archival of logs, etc. match tokio::time::timeout(ARCHIVAL_INTERVAL, self.rx.recv()).await { Ok(Some(DebugCollectorCmd::UpdateDumpdevSetup { From 46db18c361d66a71685c3a16a11ad2457d49e687 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 5 Dec 2025 16:54:36 -0800 Subject: [PATCH 3/7] update block comment --- .../config-reconciler/src/debug_collector.rs | 129 +++++++++++++++--- 1 file changed, 113 insertions(+), 16 deletions(-) diff --git a/sled-agent/config-reconciler/src/debug_collector.rs b/sled-agent/config-reconciler/src/debug_collector.rs index f108bcb0a98..b3d88522293 100644 --- a/sled-agent/config-reconciler/src/debug_collector.rs +++ b/sled-agent/config-reconciler/src/debug_collector.rs @@ -2,15 +2,86 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! This module is responsible for moving debug info (kernel crash dumps, -//! userspace process core dumps, and rotated logs) onto external drives for -//! perusal/archival, and to prevent internal drives from filling up. -//! (For background on the paths and datasets being used, see RFD 118) +//! The `DebugCollector` is responsible for collecting, archiving, and managing +//! long-term storage of various debug data on the system: //! -//! The behaviors documented below describe current behavior, but are not -//! necessarily a long-term guarantee, and details may be subject to change. +//! - kernel crash dumps +//! - process core dumps +//! - log files from zones +//! +//! Each of these types of debug data goes through a different flow, but +//! ultimately all of it winds up on **debug datasets** (ZFS datasets on the +//! system's external (U.2) disks). +//! +//! Support bundles also contain debug data and are stored in debug datasets. +//! However, they're almost entirely layered _atop_ this subsystem. The debug +//! collector doesn't know anything about support bundles aside from the fact +//! that it explicitly avoids deleting them when cleaning up debug datasets. +//! +//! ## Overview +//! 
+//! While the debug collector directly copies log files from zones into the
+//! debug dataset, it's only indirectly responsible for where kernel crash
+//! dumps and process core dumps go. In summary, the DebugCollector:
+//!
+//! - nominates one of the currently available debug datasets as the
+//!   destination for debug data.
+//! - uses dumpadm(8) to specify a **dump device**, which is where the kernel
+//!   will save a crash dump if it panics. Oxide sleds use slices on an
+//!   internal (M.2) disk as dump devices.
+//! - uses savecore(8) to copy any crash dumps found on eligible dump devices
+//!   to a debug dataset.
+//! - uses coreadm(8) to configure the system to save user process core dumps
+//!   to a specific **cores dataset**.
+//! - periodically archives to a debug dataset:
+//!   - core files found on any "cores" datasets, and
+//!   - log files from inside zones' filesystems.
+//!   When core files and log files are archived, they're copied to a debug
+//!   dataset and the original file is deleted.
+//! - when needed, deletes the oldest files in debug datasets to preserve space
+//!   for new debugging data
+//!
+//! That's the big picture, but there are many operational edge cases.
+//!
+//! Here's a summary of the data flow:
+//!
+//! +------------------------+    +--------------+      +---------------+
+//! | dump device containing |    | user process |      | log files     |
+//! | kernel crash dump      |    +--------------+      | inside zones  |
+//! +------------------------+           |              +---------------+
+//!             |                        |                      |
+//!             |                        | process crash:       |
+//!             |                        | system writes        |
+//!             | DebugCollector         | core dump to         |
+//!             | invokes                | configured           |
+//!             | savecore(8)            | directory            |
+//!             |                        |                      |
+//!             |                        v                      |
+//!             |   +--------------------------------------+    |
+//!             |   | chosen "core" dataset                |    |
+//!             |   | (ZFS dataset on internal (M.2) disk) |    |
+//!             |   +--------------------------------------+    |
+//!             |                        |                      |
+//!             |                        |                      |
+//!             |                        |                      |
+//!             |                        | DebugCollector       |
+//!             |                        | periodically archives|
+//!             |                        | the core dumps and   |
+//!             |                        | log files (copies to |
+//!             |                        | debug dataset, then  |
+//!             |                        | deletes the original)|
+//!             |                        |                      |
+//!             v                        v                      v
+//! +----------------------------------------------------------+
+//! | debug datasets (ZFS datasets on external (U.2) disks)    |-+
+//! +----------------------------------------------------------+ |-+
+//!   +----------------------------------------------------------+ |
+//!     +----------------------------------------------------------+
+//!
+//! For background on the paths and datasets being used, see RFD 118.
 //!
 //! ## Choice of destination external drive for archived logs and dumps
+//!
 //! As zpools on external (U.2) drives come online, their proportion of space
 //! used is checked, any that are over 70% are skipped, and of the remaining
 //! candidates the one with the *most* content is designated as the target onto
@@ -23,7 +94,19 @@
 //! If the chosen drive eventually exceeds 80% of its capacity used, then a
 //! different drive is chosen by the same algorithm.
 //!
+//! ## Debug dataset directory structure (not a committed interface!)
+//!
+//! Debug datasets are mounted at `/pool/ext/$pool_uuid/crypt/debug`. In that
+//! directory we find:
+//!
+//! * `core.[zone-name].[exe-filename].[pid].[time]`: process core dumps
+//! * `unix.[0-9]+`, `bounds`: files associated with kernel crash dumps
+//! * `$UUID`: support bundles (wholly unrelated to the DebugCollector)
+//! * `oxz_[zone-type]_[zone-uuid]`: directory containing all the log files
+//!   for the corresponding zone.
+//!
 //! ## Kernel crash dumps
+//!
 //! 
As internal (M.2) drives are discovered, their designated dump slices are //! checked for the presence of a previous kernel crash dump that hasn't been //! archived. If a dump is present that has not yet been archived, and an @@ -40,14 +123,15 @@ //! do not configure a dump slice, preferring to preserve evidence of the //! original root cause of an issue rather than overwriting it with confounding //! variables (in the event adjacent systems begin behaving erratically due to -//! the initial failure). -//! In this event, as soon as an external drive becomes available to archive -//! one or all of the occupied dump slices' contents, the golden-path procedure -//! detailed above occurs and a dump slice is configured. +//! the initial failure). In this event, as soon as an external drive becomes +//! available to archive one or all of the occupied dump slices' contents, the +//! golden-path procedure detailed above occurs and a dump slice is configured. //! //! ## Process core dumps +//! //! As zpools on internal (M.2) drives come online, the first one seen by the //! poll loop is chosen to be the destination of process cores in all zones: +//! //! ```text //! /pool/int/*/crash/core.[zone-name].[exe-filename].[pid].[time] //! ``` @@ -61,17 +145,21 @@ //! -G default+debug //! ``` //! -//! Every 5 minutes, all core files found on internal drives are moved to the -//! DUMP_DATASET of the (similarly chosen) removable U.2 drive, like so: +//! Every 5 minutes, all core files found on internal drives are moved to a +//! debug dataset, like so: +//! //! ```text //! /pool/int/*/crash/core.global.sled-agent.101.34784217 //! -> /pool/ext/*/crypt/debug/core.global.sled-agent.101.34784217 //! ``` //! -//! ## Log rotation and archival -//! Every 5 minutes, each log that logadm(8) has rotated (in every zone) gets -//! archived into the DUMP_DATASET of the chosen U.2, with the suffixed -//! number replaced by the modified timestamp, like so: +//! ## Periodic log rotation and archival +//! +//! logadm(8) is confiured in all zones to rotate log files. Every 5 minutes, +//! each log that logadm(8) has rotated in every zone gets archived into the +//! current debug dataset, with the suffixed number replaced by the modified +//! timestamp, like so: +//! //! ```text //! /var/svc/log/foo.log.0 //! -> /pool/ext/*/crypt/debug/global/foo.log.34784217 @@ -85,6 +173,15 @@ //! In the event of filename collisions (i.e. several instances of a service's //! rotated log files having the same modified time to the second), the //! number is incremented by 1 until no conflict remains. +//! +//! ## On-demand log archival +//! +//! Logs are also archived upon request. This happens when zones are being +//! shut down (to catch any logs rotated since the last periodic archival). It +//! also happens when the system starts up in order to archive logs from zones +//! that were running when the system last shut down. In both of these cases, +//! the _live_ log files are also archived, since they will not have a chance +//! to get rotated and so would otherwise be lost. 
use async_trait::async_trait; use camino::Utf8Path; From 6c1bc9880bc82f31073f193bbf9405929c11bea8 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Sat, 6 Dec 2025 11:32:24 -0800 Subject: [PATCH 4/7] fix doctest --- sled-agent/config-reconciler/src/debug_collector.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sled-agent/config-reconciler/src/debug_collector.rs b/sled-agent/config-reconciler/src/debug_collector.rs index b3d88522293..489244441af 100644 --- a/sled-agent/config-reconciler/src/debug_collector.rs +++ b/sled-agent/config-reconciler/src/debug_collector.rs @@ -45,6 +45,7 @@ //! //! Here's a summary of the data flow: //! +//! ```text //! +------------------------+ +--------------+ +---------------+ //! | dump device containing | | user process | | log files | //! | kernel crash dump | +--------------+ | inside zones | @@ -77,6 +78,7 @@ //! +----------------------------------------------------------+ |-+ //! +----------------------------------------------------------+ | //! +----------------------------------------------------------+ +//! ``` //! //! For background on the paths and datasets being used, see RFD 118. //! From 402cdb630b39110985639808506892567b6c5189 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 8 Dec 2025 15:06:07 -0800 Subject: [PATCH 5/7] move debug_collector into subdirectory; conslidate task and the rest of it --- sled-agent/config-reconciler/src/debug_collector/mod.rs | 9 +++++++++ .../{debug_collector_task.rs => debug_collector/task.rs} | 2 +- .../{debug_collector.rs => debug_collector/worker.rs} | 0 sled-agent/config-reconciler/src/handle.rs | 6 +++--- sled-agent/config-reconciler/src/lib.rs | 1 - sled-agent/config-reconciler/src/reconciler_task.rs | 2 +- .../src/reconciler_task/external_disks.rs | 2 +- .../config-reconciler/src/reconciler_task/zones.rs | 2 +- 8 files changed, 16 insertions(+), 8 deletions(-) create mode 100644 sled-agent/config-reconciler/src/debug_collector/mod.rs rename sled-agent/config-reconciler/src/{debug_collector_task.rs => debug_collector/task.rs} (99%) rename sled-agent/config-reconciler/src/{debug_collector.rs => debug_collector/worker.rs} (100%) diff --git a/sled-agent/config-reconciler/src/debug_collector/mod.rs b/sled-agent/config-reconciler/src/debug_collector/mod.rs new file mode 100644 index 00000000000..4d7ad3d9ef4 --- /dev/null +++ b/sled-agent/config-reconciler/src/debug_collector/mod.rs @@ -0,0 +1,9 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +mod task; +mod worker; + +pub use task::FormerZoneRootArchiver; +pub (crate) use task::spawn; diff --git a/sled-agent/config-reconciler/src/debug_collector_task.rs b/sled-agent/config-reconciler/src/debug_collector/task.rs similarity index 99% rename from sled-agent/config-reconciler/src/debug_collector_task.rs rename to sled-agent/config-reconciler/src/debug_collector/task.rs index a82fb28c5e4..e8d6d69af21 100644 --- a/sled-agent/config-reconciler/src/debug_collector_task.rs +++ b/sled-agent/config-reconciler/src/debug_collector/task.rs @@ -5,8 +5,8 @@ //! Long-running tokio task responsible for updating the dump device setup in //! response to changes in available disks. 
+use super::worker::DebugCollector; use crate::InternalDisksReceiver; -use crate::debug_collector::DebugCollector; use camino::Utf8PathBuf; use debug_ignore::DebugIgnore; use sled_storage::config::MountConfig; diff --git a/sled-agent/config-reconciler/src/debug_collector.rs b/sled-agent/config-reconciler/src/debug_collector/worker.rs similarity index 100% rename from sled-agent/config-reconciler/src/debug_collector.rs rename to sled-agent/config-reconciler/src/debug_collector/worker.rs diff --git a/sled-agent/config-reconciler/src/handle.rs b/sled-agent/config-reconciler/src/handle.rs index 428abb5d5a0..d921aea7845 100644 --- a/sled-agent/config-reconciler/src/handle.rs +++ b/sled-agent/config-reconciler/src/handle.rs @@ -50,8 +50,8 @@ use crate::SledAgentFacilities; use crate::TimeSyncStatus; use crate::dataset_serialization_task::DatasetTaskHandle; use crate::dataset_serialization_task::NestedDatasetMountError; -use crate::debug_collector_task; -use crate::debug_collector_task::FormerZoneRootArchiver; +use crate::debug_collector; +use crate::debug_collector::FormerZoneRootArchiver; use crate::internal_disks::InternalDisksReceiver; use crate::ledger::CurrentSledConfig; use crate::ledger::LedgerTaskHandle; @@ -136,7 +136,7 @@ impl ConfigReconcilerHandle { // Spawn the task that manages dump devices. let (external_disks_tx, external_disks_rx) = watch::channel(HashSet::new()); - let former_zone_root_archiver = debug_collector_task::spawn( + let former_zone_root_archiver = debug_collector::spawn( internal_disks_rx.clone(), external_disks_rx, Arc::clone(&mount_config), diff --git a/sled-agent/config-reconciler/src/lib.rs b/sled-agent/config-reconciler/src/lib.rs index a56d36dcda3..b63028f6d5e 100644 --- a/sled-agent/config-reconciler/src/lib.rs +++ b/sled-agent/config-reconciler/src/lib.rs @@ -47,7 +47,6 @@ mod dataset_serialization_task; mod debug_collector; -mod debug_collector_task; mod disks_common; mod handle; mod host_phase_2; diff --git a/sled-agent/config-reconciler/src/reconciler_task.rs b/sled-agent/config-reconciler/src/reconciler_task.rs index f63a09a4885..41db4bc7c29 100644 --- a/sled-agent/config-reconciler/src/reconciler_task.rs +++ b/sled-agent/config-reconciler/src/reconciler_task.rs @@ -42,7 +42,7 @@ use crate::InternalDisksReceiver; use crate::SledAgentArtifactStore; use crate::TimeSyncConfig; use crate::dataset_serialization_task::DatasetTaskHandle; -use crate::debug_collector_task::FormerZoneRootArchiver; +use crate::debug_collector::FormerZoneRootArchiver; use crate::host_phase_2::BootPartitionReconciler; use crate::ledger::CurrentSledConfig; use crate::raw_disks::RawDisksReceiver; diff --git a/sled-agent/config-reconciler/src/reconciler_task/external_disks.rs b/sled-agent/config-reconciler/src/reconciler_task/external_disks.rs index 5574af8ee7e..cb3b91ecb3b 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/external_disks.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/external_disks.rs @@ -45,7 +45,7 @@ use std::sync::Arc; use std::sync::OnceLock; use tokio::sync::watch; -use crate::debug_collector_task::FormerZoneRootArchiver; +use crate::debug_collector::FormerZoneRootArchiver; use crate::disks_common::MaybeUpdatedDisk; use crate::disks_common::update_properties_from_raw_disk; use camino::Utf8PathBuf; diff --git a/sled-agent/config-reconciler/src/reconciler_task/zones.rs b/sled-agent/config-reconciler/src/reconciler_task/zones.rs index ebd18f84345..88163399d74 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/zones.rs +++ 
b/sled-agent/config-reconciler/src/reconciler_task/zones.rs
@@ -11,7 +11,7 @@ use crate::InternalDisks;
 use crate::ResolverStatusExt;
 use crate::SledAgentFacilities;
 use crate::TimeSyncConfig;
-use crate::debug_collector_task::FormerZoneRootArchiver;
+use crate::debug_collector::FormerZoneRootArchiver;
 use camino::Utf8PathBuf;
 use futures::FutureExt as _;
 use futures::future;

From 5832c87e8b1e4bc39b98a1a17b724a346e5d74c3 Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Mon, 8 Dec 2025 16:21:52 -0800
Subject: [PATCH 6/7] split up debug_collector into modules

---
 .../src/debug_collector/handle.rs             | 206 +++++++++
 .../src/debug_collector/helpers.rs            | 214 +++++++++
 .../src/debug_collector/mod.rs                |  86 +++-
 .../src/debug_collector/task.rs               |  17 +-
 .../src/debug_collector/worker.rs             | 419 +-----------------
 5 files changed, 540 insertions(+), 402 deletions(-)
 create mode 100644 sled-agent/config-reconciler/src/debug_collector/handle.rs
 create mode 100644 sled-agent/config-reconciler/src/debug_collector/helpers.rs

diff --git a/sled-agent/config-reconciler/src/debug_collector/handle.rs b/sled-agent/config-reconciler/src/debug_collector/handle.rs
new file mode 100644
index 00000000000..41f4442720a
--- /dev/null
+++ b/sled-agent/config-reconciler/src/debug_collector/handle.rs
@@ -0,0 +1,206 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+use super::helpers::RealCoreDumpAdm;
+use super::helpers::RealZfs;
+use super::helpers::RealZone;
+use super::worker::CoreZpool;
+use super::worker::DebugCollectorCmd;
+use super::worker::DebugCollectorWorker;
+use super::worker::DebugZpool;
+use super::worker::DumpSlicePath;
+use camino::Utf8Path;
+use illumos_utils::zpool::ZpoolHealth;
+use omicron_common::disk::DiskVariant;
+use sled_storage::config::MountConfig;
+use sled_storage::disk::Disk;
+use slog::Logger;
+use slog::error;
+use slog::info;
+use slog::o;
+use slog::warn;
+use std::sync::Arc;
+use tokio::sync::oneshot;
+
+/// Handle to the DebugCollectorWorker, used by the DebugCollectorTask
+///
+/// The DebugCollectorTask (a tiny task that passes information from the rest of
+/// sled agent to this subsystem) has this handle and uses it to send commands to
+/// the DebugCollectorWorker.
+pub struct DebugCollector {
+    tx: tokio::sync::mpsc::Sender,
+    mount_config: Arc,
+    _poller: tokio::task::JoinHandle<()>,
+    log: Logger,
+}
+
+impl DebugCollector {
+    pub fn new(log: &Logger, mount_config: Arc) -> Self {
+        let (tx, rx) = tokio::sync::mpsc::channel(16);
+        let worker = DebugCollectorWorker::new(
+            Box::new(RealCoreDumpAdm {}),
+            Box::new(RealZfs {}),
+            Box::new(RealZone {}),
+            log.new(o!("component" => "DebugCollector-worker")),
+            rx,
+        );
+        let _poller =
+            tokio::spawn(async move { worker.poll_file_archival().await });
+        let log = log.new(o!("component" => "DebugCollector"));
+        Self { tx, mount_config, _poller, log }
+    }
+
+    /// Given the set of all managed disks, updates the dump device location
+    /// for logs and dumps.
+    ///
+    /// This function returns only once this request has been handled, which
+    /// can be used as a signal by callers that any "old disks" are no longer
+    /// being used by [DebugCollector].
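+    ///
+    /// For example, a caller that is retiring a disk can use this call as a
+    /// barrier (illustrative sketch; `remaining_disks` is a hypothetical
+    /// collection that no longer includes the retired disk):
+    ///
+    /// ```text
+    /// collector.update_dumpdev_setup(remaining_disks.iter()).await;
+    /// // At this point the collector holds no references to the retired disk.
+    /// ```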
+ pub async fn update_dumpdev_setup( + &self, + disks: impl Iterator, + ) { + let log = &self.log; + let mut m2_dump_slices = Vec::new(); + let mut u2_debug_datasets = Vec::new(); + let mut m2_core_datasets = Vec::new(); + let mount_config = self.mount_config.clone(); + for disk in disks { + match disk.variant() { + DiskVariant::M2 => { + // We only setup dump devices on real disks + if !disk.is_synthetic() { + match disk.dump_device_devfs_path(false) { + Ok(path) => { + m2_dump_slices.push(DumpSlicePath::from(path)) + } + Err(err) => { + warn!( + log, + "Error getting dump device devfs path: \ + {err:?}" + ); + } + } + } + let name = disk.zpool_name(); + if let Ok(info) = + illumos_utils::zpool::Zpool::get_info(&name.to_string()) + .await + { + if info.health() == ZpoolHealth::Online { + m2_core_datasets.push(CoreZpool { + mount_config: mount_config.clone(), + name: *name, + }); + } else { + warn!( + log, + "Zpool {name:?} not online, won't attempt to \ + save process core dumps there" + ); + } + } + } + DiskVariant::U2 => { + let name = disk.zpool_name(); + if let Ok(info) = + illumos_utils::zpool::Zpool::get_info(&name.to_string()) + .await + { + if info.health() == ZpoolHealth::Online { + u2_debug_datasets.push(DebugZpool { + mount_config: mount_config.clone(), + name: *name, + }); + } else { + warn!( + log, + "Zpool {name:?} not online, won't attempt to \ + save kernel core dumps there" + ); + } + } + } + } + } + + let (tx, rx) = oneshot::channel(); + if let Err(err) = self + .tx + .send(DebugCollectorCmd::UpdateDumpdevSetup { + dump_slices: m2_dump_slices, + debug_datasets: u2_debug_datasets, + core_datasets: m2_core_datasets, + update_complete_tx: tx, + }) + .await + { + error!(log, "DebugCollector channel closed: {:?}", err.0); + }; + + if let Err(err) = rx.await { + error!(log, "DebugCollector failed to await update"; "err" => ?err); + } + } + + /// Request archive of logs from the specified directory, which is assumed + /// to correspond to the root filesystem of a zone that is no longer + /// running. + /// + /// Unlike typical log file archival, this includes non-rotated log files. + /// + /// This makes a best-effort and logs failures rather than reporting them to + /// the caller. + /// + /// When this future completes, the request has only been enqueued. To know + /// when archival has completed, you must wait on the receive side of + /// `completion_tx`. + pub async fn archive_former_zone_root( + &self, + zone_root: &Utf8Path, + completion_tx: oneshot::Sender<()>, + ) { + let log = self.log.new(o!("zone_root" => zone_root.to_string())); + + // Validate the path that we were given. We're only ever given zone + // root filesystems, whose basename is always a zonename, and we always + // prefix our zone names with `oxz_`. If that's not what we find here, + // log an error and bail out. These error cases should be impossible to + // hit in practice. 
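+        //
+        // (As an illustration: zone roots normally live at paths like
+        // "/pool/ext/$POOL_UUID/crypt/zone/oxz_[zone-type]_[zone-uuid]",
+        // whose basename is the zone's name.)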
+ let Some(file_name) = zone_root.file_name() else { + error!( + log, + "cannot archive former zone root"; + "error" => "path has no filename part", + ); + return; + }; + + if !file_name.starts_with("oxz_") { + error!( + log, + "cannot archive former zone root"; + "error" => "filename does not start with \"oxz_\"", + ); + return; + } + + info!(log, "requesting archive of former zone root"); + let zone_root = zone_root.to_owned(); + let zone_name = file_name.to_string(); + let cmd = DebugCollectorCmd::ArchiveFormerZoneRoot { + zone_root, + zone_name, + completion_tx, + }; + if let Err(_) = self.tx.send(cmd).await { + error!( + log, + "failed to request archive of former zone root"; + "error" => "DebugCollector channel closed" + ); + } + } +} diff --git a/sled-agent/config-reconciler/src/debug_collector/helpers.rs b/sled-agent/config-reconciler/src/debug_collector/helpers.rs new file mode 100644 index 00000000000..5624cc61d77 --- /dev/null +++ b/sled-agent/config-reconciler/src/debug_collector/helpers.rs @@ -0,0 +1,214 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Helpers that extract system facilities for operating on zones, ZFS, etc. so +//! that these implementations can be swapped out in tests + +use async_trait::async_trait; +use camino::Utf8PathBuf; +use illumos_utils::ExecutionError; +use illumos_utils::coreadm::{CoreAdm, CoreFileOption}; +use illumos_utils::dumpadm::{DumpAdm, DumpContentType}; +use illumos_utils::zone::ZONE_PREFIX; +use illumos_utils::zpool::ZpoolName; +use sled_storage::config::MountConfig; +use slog::error; +use std::ffi::OsString; +use zone::{Zone, ZoneError}; + +// Names of ZFS dataset properties. These are stable and documented in zfs(1M). + +pub(super) const ZFS_PROP_USED: &str = "used"; +pub(super) const ZFS_PROP_AVAILABLE: &str = "available"; + +/// Helper for invoking coreadm(8) and dumpadm(8), abstracted out for tests +#[async_trait] +pub(super) trait CoreDumpAdmInvoker { + fn coreadm(&self, core_dir: &Utf8PathBuf) -> Result<(), ExecutionError>; + async fn dumpadm( + &self, + dump_slice: &Utf8PathBuf, + savecore_dir: Option<&Utf8PathBuf>, + ) -> Result, ExecutionError>; +} + +/// Helper for interacting with ZFS filesystems, abstracted out for tests +pub(super) trait ZfsInvoker { + fn zfs_get_prop( + &self, + mountpoint_or_name: &str, + property: &str, + ) -> Result; + + fn zfs_get_integer( + &self, + mountpoint_or_name: &str, + property: &str, + ) -> Result { + self.zfs_get_prop(mountpoint_or_name, property)? 
+ .parse() + .map_err(Into::into) + } + + fn below_thresh( + &self, + mountpoint: &Utf8PathBuf, + percent: u64, + ) -> Result<(bool, u64), ZfsGetError> { + let used = self.zfs_get_integer(mountpoint.as_str(), ZFS_PROP_USED)?; + let available = + self.zfs_get_integer(mountpoint.as_str(), ZFS_PROP_AVAILABLE)?; + let capacity = used + available; + let below = (used * 100) / capacity < percent; + Ok((below, used)) + } + + fn mountpoint( + &self, + mount_config: &MountConfig, + zpool: &ZpoolName, + mountpoint: &'static str, + ) -> Utf8PathBuf; +} + +#[derive(Debug, thiserror::Error)] +pub(super) enum ZfsGetError { + #[error("Error executing 'zfs get' command: {0}")] + IoError(#[from] std::io::Error), + #[error( + "Output of 'zfs get' was not only not an integer string, it wasn't \ + even UTF-8: {0}" + )] + Utf8(#[from] std::string::FromUtf8Error), + #[error("Error parsing output of 'zfs get' command as integer: {0}")] + Parse(#[from] std::num::ParseIntError), +} + +/// Helper for listing currently-running zones on the system +#[async_trait] +pub(super) trait ZoneInvoker { + async fn get_zones(&self) -> Result, ZoneError>; +} + +// Concrete implementations backed by the real system + +pub(super) struct RealCoreDumpAdm {} +pub(super) struct RealZfs {} +pub(super) struct RealZone {} + +#[async_trait] +impl CoreDumpAdmInvoker for RealCoreDumpAdm { + fn coreadm(&self, core_dir: &Utf8PathBuf) -> Result<(), ExecutionError> { + let mut cmd = CoreAdm::new(); + + // disable per-process core patterns + cmd.disable(CoreFileOption::Process); + cmd.disable(CoreFileOption::ProcSetid); + + // use the global core pattern + cmd.enable(CoreFileOption::Global); + cmd.enable(CoreFileOption::GlobalSetid); + + // set the global pattern to place all cores into core_dir, + // with filenames of "core.[zone-name].[exe-filename].[pid].[time]" + cmd.global_pattern(core_dir.join("core.%z.%f.%p.%t")); + + // also collect DWARF data from the exe and its library deps + cmd.global_contents("default+debug"); + + cmd.execute() + } + + // Invokes `dumpadm(8)` to configure the kernel to dump core into the given + // `dump_slice` block device in the event of a panic. If a core is already + // present in that block device, and a `savecore_dir` is provided, this + // function also invokes `savecore(8)` to save it into that directory. + // On success, returns Ok(Some(stdout)) if `savecore(8)` was invoked, or + // Ok(None) if it wasn't. + async fn dumpadm( + &self, + dump_slice: &Utf8PathBuf, + savecore_dir: Option<&Utf8PathBuf>, + ) -> Result, ExecutionError> { + let savecore_dir_cloned = if let Some(dir) = savecore_dir.cloned() { + dir + } else { + // if we don't have a savecore destination yet, still create and use + // a tmpfs path (rather than the default location under /var/crash, + // which is in the ramdisk pool), because dumpadm refuses to do what + // we ask otherwise. 
+ let tmp_crash = "/tmp/crash"; + tokio::fs::create_dir_all(tmp_crash).await.map_err(|err| { + ExecutionError::ExecutionStart { + command: format!("mkdir {tmp_crash:?}"), + err, + } + })?; + Utf8PathBuf::from(tmp_crash) + }; + + // Use the given block device path for dump storage: + let mut cmd = DumpAdm::new(dump_slice.to_owned(), savecore_dir_cloned); + + // Include memory from the current process if there is one for the panic + // context, in addition to kernel memory: + cmd.content_type(DumpContentType::CurProc); + + // Compress crash dumps: + cmd.compress(true); + + // Do not run savecore(8) automatically on boot (irrelevant anyhow, as + // the config file being mutated by dumpadm won't survive reboots on + // gimlets). The sled-agent will invoke it manually instead. + cmd.no_boot_time_savecore(); + + cmd.execute()?; + + // do we have a destination for the saved dump + if savecore_dir.is_some() { + // and does the dump slice have one to save off + if let Ok(true) = + illumos_utils::dumpadm::dump_flag_is_valid(dump_slice).await + { + return illumos_utils::dumpadm::SaveCore.execute(); + } + } + Ok(None) + } +} + +impl ZfsInvoker for RealZfs { + fn zfs_get_prop( + &self, + mountpoint_or_name: &str, + property: &str, + ) -> Result { + let mut cmd = std::process::Command::new(illumos_utils::zfs::ZFS); + cmd.arg("get").arg("-Hpo").arg("value"); + cmd.arg(property); + cmd.arg(mountpoint_or_name); + let output = cmd.output()?; + Ok(String::from_utf8(output.stdout)?.trim().to_string()) + } + + fn mountpoint( + &self, + mount_config: &MountConfig, + zpool: &ZpoolName, + mountpoint: &'static str, + ) -> Utf8PathBuf { + zpool.dataset_mountpoint(&mount_config.root, mountpoint) + } +} + +#[async_trait] +impl ZoneInvoker for RealZone { + async fn get_zones(&self) -> Result, ZoneError> { + Ok(zone::Adm::list() + .await? + .into_iter() + .filter(|z| z.global() || z.name().starts_with(ZONE_PREFIX)) + .collect::>()) + } +} diff --git a/sled-agent/config-reconciler/src/debug_collector/mod.rs b/sled-agent/config-reconciler/src/debug_collector/mod.rs index 4d7ad3d9ef4..367b75029ed 100644 --- a/sled-agent/config-reconciler/src/debug_collector/mod.rs +++ b/sled-agent/config-reconciler/src/debug_collector/mod.rs @@ -2,8 +2,92 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +//! The Debug Collector is responsible for collecting, archiving, and managing +//! long-term storage of various debug data on the system. For details on how +//! this works, see the [`worker`] module docs. The docs here describe the +//! components involved in this subsystem. +//! +//! The consumer (sled agent) interacts with this subsystem in basically two +//! ways: +//! +//! - As the set of internal and external disks change, the consumer updates a +//! pair of watch channels so that this subsystem can respond appropriately. +//! +//! - When needed, the consumer requests immediate archival of debug data on +//! a former zone root filesystem and waits for this to finish. +//! +//! For historical reasons, the flow is a bit more complicated than you'd think. +//! It's easiest to describe it from the bottom up. +//! +//! 1. The `DebugCollectorWorker` (in worker.rs) is an actor-style struct +//! running in its own tokio task that does virtually all of the work of this +//! subsystem: deciding which datasets to use for what purposes, configuring +//! dumpadm and coreadm, running savecore, archiving log files, cleaning up +//! debug datasets, etc. +//! +//! 
2. The `DebugCollector` is a client-side handle to the +//! `DebugCollectorWorker`. It provides a function-oriented interface to the +//! worker that just sends messages over an `mpsc` channel to the worker and +//! then waits for responses. These commands do the two things mentioned +//! above (updating the worker with new disk information and requesting +//! archival of debug data from former zone roots). +//! +//! The consumer sets up the debug collector by invoking [`spawn()`] with the +//! watch channels that store the information about internal and external disks. +//! `spawn()` returns a [`FormerZoneRootArchiver`] that the consumer uses when +//! it wants to archive former zone root filesystems. Internally, `spawn()` +//! creates: +//! +//! - a `DebugCollectorTask`. This runs in its own tokio task. Its sole job +//! is to notice when the watch channels containing the set of current +//! internal and external disks change and propagate that information to the +//! `DebugCollectorWorker` (via the `DebugCollector`). (In the future, +//! these watch channels could probably be plumbed directly to the worker, +//! eliminating the `DebugCollectorTask` and its tokio task altogether.) +//! +//! - the `DebugCollector` (see above), which itself creates the +//! `DebugCollectorWorker` (see above). +//! +//! So the flow ends up being: +//! +//! ```text +//! +---------------------+ +//! | sled agent at-large | +//! +---------------------+ +//! | +//! | either: +//! | +//! | 1. change to the set of internal/external disks available +//! | (stored in watch channels) +//! | +//! | 2. request to archive debug data from former zone root filesystem +//! | +//! v +//! +---------------------+ +//! | DebugCollectorTask | (running in its own tokio task) +//! | (see task.rs) | +//! +---------------------+ +//! | +//! | calls into +//! | +//! v +//! +---------------------+ +//! | DebugCollector | +//! | (see handle.rs) | +//! +---------------------+ +//! | +//! | sends message on `mpsc` channel +//! v +//! +----------------------+ +//! | DebugCollectorWorker | (running in its own tokio task) +//! | (see worker.rs) | +//! +----------------------+ +//! ``` + +mod handle; +mod helpers; mod task; mod worker; pub use task::FormerZoneRootArchiver; -pub (crate) use task::spawn; +pub(crate) use task::spawn; diff --git a/sled-agent/config-reconciler/src/debug_collector/task.rs b/sled-agent/config-reconciler/src/debug_collector/task.rs index e8d6d69af21..71fb1665018 100644 --- a/sled-agent/config-reconciler/src/debug_collector/task.rs +++ b/sled-agent/config-reconciler/src/debug_collector/task.rs @@ -2,10 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Long-running tokio task responsible for updating the dump device setup in -//! response to changes in available disks. - -use super::worker::DebugCollector; +use super::handle::DebugCollector; use crate::InternalDisksReceiver; use camino::Utf8PathBuf; use debug_ignore::DebugIgnore; @@ -21,6 +18,9 @@ use tokio::sync::mpsc; use tokio::sync::oneshot; use tokio::sync::watch; +/// Set up the debug collector subsystem +/// +/// See the comment in debug_collector/mod.rs for details. 
pub(crate) fn spawn( internal_disks_rx: InternalDisksReceiver, external_disks_rx: watch::Receiver>, @@ -52,6 +52,15 @@ pub(crate) fn spawn( } } +/// The `DebugCollectorTask` does two things: +/// +/// - watches for changes to the disk-related `watch` channels and propagates +/// them to the `DebugCollectorWorker` +/// +/// - accepts requests to archive debug data from former zone root filesystems +/// and propagates them to the `DebugCollectorWorker` +/// +/// See the comment in debug_collector/mod.rs for details. struct DebugCollectorTask { // Input channels on which we receive updates about disk changes. internal_disks_rx: InternalDisksReceiver, diff --git a/sled-agent/config-reconciler/src/debug_collector/worker.rs b/sled-agent/config-reconciler/src/debug_collector/worker.rs index eeff4a03fa3..ee4999fcaf9 100644 --- a/sled-agent/config-reconciler/src/debug_collector/worker.rs +++ b/sled-agent/config-reconciler/src/debug_collector/worker.rs @@ -185,26 +185,25 @@ //! the _live_ log files are also archived, since they will not have a chance //! to get rotated and so would otherwise be lost. -use async_trait::async_trait; +use super::helpers::CoreDumpAdmInvoker; +use super::helpers::ZFS_PROP_AVAILABLE; +use super::helpers::ZFS_PROP_USED; +use super::helpers::ZfsGetError; +use super::helpers::ZfsInvoker; +use super::helpers::ZoneInvoker; use camino::Utf8Path; use camino::Utf8PathBuf; use derive_more::{AsRef, From}; use illumos_utils::ExecutionError; -use illumos_utils::coreadm::{CoreAdm, CoreFileOption}; -use illumos_utils::dumpadm::{DumpAdm, DumpContentType}; -use illumos_utils::zone::ZONE_PREFIX; -use illumos_utils::zpool::{ZpoolHealth, ZpoolName}; -use omicron_common::disk::DiskVariant; +use illumos_utils::zpool::ZpoolName; use sled_agent_types::support_bundle::BUNDLE_FILE_NAME; use sled_agent_types::support_bundle::BUNDLE_TMP_FILE_NAME; use sled_storage::config::MountConfig; use sled_storage::dataset::{CRASH_DATASET, DUMP_DATASET}; -use sled_storage::disk::Disk; use slog::Logger; use slog::debug; use slog::error; use slog::info; -use slog::o; use slog::trace; use slog::warn; use slog_error_chain::InlineErrorChain; @@ -215,12 +214,7 @@ use std::sync::Arc; use std::time::{Duration, SystemTime, SystemTimeError, UNIX_EPOCH}; use tokio::sync::mpsc::Receiver; use tokio::sync::oneshot; -use zone::{Zone, ZoneError}; - -// Names of ZFS dataset properties. These are stable and documented in zfs(1M). - -const ZFS_PROP_USED: &str = "used"; -const ZFS_PROP_AVAILABLE: &str = "available"; +use zone::ZoneError; // Parameters related to management of storage on debug datasets @@ -245,21 +239,21 @@ const ARCHIVAL_INTERVAL: Duration = Duration::from_secs(300); /// /// This is generally a slice on an internal (M.2) device. #[derive(AsRef, Clone, Debug, Eq, From, Hash, Ord, PartialEq, PartialOrd)] -struct DumpSlicePath(Utf8PathBuf); +pub(super) struct DumpSlicePath(Utf8PathBuf); /// Filesystem path to the mountpoint of a ZFS dataset that's intended for /// storing debug data for the long term /// /// This is generally on a removable (U.2) device. #[derive(AsRef, Clone, Debug, Eq, From, Hash, Ord, PartialEq, PartialOrd)] -struct DebugDataset(Utf8PathBuf); +pub(super) struct DebugDataset(Utf8PathBuf); /// Filesystem path to the mountpoint of a ZFS dataset that's available as the /// first place that user process core dumps get written /// /// This is generally on an internal (M.2) device. 
#[derive(AsRef, Clone, Debug, Eq, From, Hash, Ord, PartialEq, PartialOrd)] -struct CoreDataset(Utf8PathBuf); +pub(super) struct CoreDataset(Utf8PathBuf); // Types that describe ZFS datasets that are used to store different kinds of // debug data @@ -274,8 +268,8 @@ struct CoreDataset(Utf8PathBuf); /// This pool is generally on an M.2 device. #[derive(AsRef, Clone, Debug, From)] pub(super) struct CoreZpool { - mount_config: Arc, - name: ZpoolName, + pub(super) mount_config: Arc, + pub(super) name: ZpoolName, } /// Identifies a ZFS dataset that's intended for long-term storage of debug data @@ -288,8 +282,8 @@ pub(super) struct CoreZpool { /// This pool is generally on an M.2 device. #[derive(AsRef, Clone, Debug, From)] pub(super) struct DebugZpool { - mount_config: Arc, - name: ZpoolName, + pub(super) mount_config: Arc, + pub(super) name: ZpoolName, } impl GetMountpoint for DebugZpool { @@ -338,7 +332,7 @@ trait GetMountpoint: AsRef { /// /// These are sent from different places. #[derive(Debug)] -enum DebugCollectorCmd { +pub(super) enum DebugCollectorCmd { /// Archive logs and other debug data from directory `zone_root`, which /// corresponds to the root of a previously-running zone called `zone_name`. /// Reply on `completion_tx` when finished. @@ -369,7 +363,7 @@ enum DebugCollectorCmd { /// /// This runs in its own tokio task. More precisely, poll_file_archival() runs /// in its own tokio task and does all these things. -struct DebugCollectorWorker { +pub(super) struct DebugCollectorWorker { /// list of ZFS datasets that can be used for saving process core dumps core_dataset_names: Vec, /// list of ZFS datasets that can be used for long-term storage of @@ -407,377 +401,6 @@ struct DebugCollectorWorker { zone_invoker: Box, } -/// "External" handle to the debug collector -/// -/// The DebugCollectorTask (a tiny task that passes information from the rest of -/// sled agent to this subystem) has this handle and uses it to send commands to -/// the DebugCollectorWorker. -pub struct DebugCollector { - tx: tokio::sync::mpsc::Sender, - mount_config: Arc, - _poller: tokio::task::JoinHandle<()>, - log: Logger, -} - -impl DebugCollector { - pub fn new(log: &Logger, mount_config: Arc) -> Self { - let (tx, rx) = tokio::sync::mpsc::channel(16); - let worker = DebugCollectorWorker::new( - Box::new(RealCoreDumpAdm {}), - Box::new(RealZfs {}), - Box::new(RealZone {}), - log.new(o!("component" => "DebugCollector-worker")), - rx, - ); - let _poller = - tokio::spawn(async move { worker.poll_file_archival().await }); - let log = log.new(o!("component" => "DebugCollector")); - Self { tx, mount_config, _poller, log } - } - - /// Given the set of all managed disks, updates the dump device location - /// for logs and dumps. - /// - /// This function returns only once this request has been handled, which - /// can be used as a signal by callers that any "old disks" are no longer - /// being used by [DebugCollector]. 
- pub async fn update_dumpdev_setup( - &self, - disks: impl Iterator, - ) { - let log = &self.log; - let mut m2_dump_slices = Vec::new(); - let mut u2_debug_datasets = Vec::new(); - let mut m2_core_datasets = Vec::new(); - let mount_config = self.mount_config.clone(); - for disk in disks { - match disk.variant() { - DiskVariant::M2 => { - // We only setup dump devices on real disks - if !disk.is_synthetic() { - match disk.dump_device_devfs_path(false) { - Ok(path) => { - m2_dump_slices.push(DumpSlicePath(path)) - } - Err(err) => { - warn!( - log, - "Error getting dump device devfs path: \ - {err:?}" - ); - } - } - } - let name = disk.zpool_name(); - if let Ok(info) = - illumos_utils::zpool::Zpool::get_info(&name.to_string()) - .await - { - if info.health() == ZpoolHealth::Online { - m2_core_datasets.push(CoreZpool { - mount_config: mount_config.clone(), - name: *name, - }); - } else { - warn!( - log, - "Zpool {name:?} not online, won't attempt to \ - save process core dumps there" - ); - } - } - } - DiskVariant::U2 => { - let name = disk.zpool_name(); - if let Ok(info) = - illumos_utils::zpool::Zpool::get_info(&name.to_string()) - .await - { - if info.health() == ZpoolHealth::Online { - u2_debug_datasets.push(DebugZpool { - mount_config: mount_config.clone(), - name: *name, - }); - } else { - warn!( - log, - "Zpool {name:?} not online, won't attempt to \ - save kernel core dumps there" - ); - } - } - } - } - } - - let (tx, rx) = oneshot::channel(); - if let Err(err) = self - .tx - .send(DebugCollectorCmd::UpdateDumpdevSetup { - dump_slices: m2_dump_slices, - debug_datasets: u2_debug_datasets, - core_datasets: m2_core_datasets, - update_complete_tx: tx, - }) - .await - { - error!(log, "DebugCollector channel closed: {:?}", err.0); - }; - - if let Err(err) = rx.await { - error!(log, "DebugCollector failed to await update"; "err" => ?err); - } - } - - /// Request archive of logs from the specified directory, which is assumed - /// to correspond to the root filesystem of a zone that is no longer - /// running. - /// - /// Unlike typical log file archival, this includes non-rotated log files. - /// - /// This makes a best-effort and logs failures rather than reporting them to - /// the caller. - /// - /// When this future completes, the request has only been enqueued. To know - /// when archival has completed, you must wait on the receive side of - /// `completion_tx`. - pub async fn archive_former_zone_root( - &self, - zone_root: &Utf8Path, - completion_tx: oneshot::Sender<()>, - ) { - let log = self.log.new(o!("zone_root" => zone_root.to_string())); - - // Validate the path that we were given. We're only ever given zone - // root filesystems, whose basename is always a zonename, and we always - // prefix our zone names with `oxz_`. If that's not what we find here, - // log an error and bail out. These error cases should be impossible to - // hit in practice. 
- let Some(file_name) = zone_root.file_name() else { - error!( - log, - "cannot archive former zone root"; - "error" => "path has no filename part", - ); - return; - }; - - if !file_name.starts_with("oxz_") { - error!( - log, - "cannot archive former zone root"; - "error" => "filename does not start with \"oxz_\"", - ); - return; - } - - info!(log, "requesting archive of former zone root"); - let zone_root = zone_root.to_owned(); - let zone_name = file_name.to_string(); - let cmd = DebugCollectorCmd::ArchiveFormerZoneRoot { - zone_root, - zone_name, - completion_tx, - }; - if let Err(_) = self.tx.send(cmd).await { - error!( - log, - "failed to request archive of former zone root"; - "error" => "DebugCollector channel closed" - ); - } - } -} - -#[derive(Debug, thiserror::Error)] -enum ZfsGetError { - #[error("Error executing 'zfs get' command: {0}")] - IoError(#[from] std::io::Error), - #[error( - "Output of 'zfs get' was not only not an integer string, it wasn't \ - even UTF-8: {0}" - )] - Utf8(#[from] std::string::FromUtf8Error), - #[error("Error parsing output of 'zfs get' command as integer: {0}")] - Parse(#[from] std::num::ParseIntError), -} - -/// Helper for invoking coreadm(8) and dumpadm(8), abstracted out for tests -#[async_trait] -trait CoreDumpAdmInvoker { - fn coreadm(&self, core_dir: &Utf8PathBuf) -> Result<(), ExecutionError>; - async fn dumpadm( - &self, - dump_slice: &Utf8PathBuf, - savecore_dir: Option<&Utf8PathBuf>, - ) -> Result, ExecutionError>; -} - -/// Helper for interacting with ZFS filesystems, abstracted out for tests -trait ZfsInvoker { - fn zfs_get_prop( - &self, - mountpoint_or_name: &str, - property: &str, - ) -> Result; - - fn zfs_get_integer( - &self, - mountpoint_or_name: &str, - property: &str, - ) -> Result { - self.zfs_get_prop(mountpoint_or_name, property)? - .parse() - .map_err(Into::into) - } - - fn below_thresh( - &self, - mountpoint: &Utf8PathBuf, - percent: u64, - ) -> Result<(bool, u64), ZfsGetError> { - let used = self.zfs_get_integer(mountpoint.as_str(), ZFS_PROP_USED)?; - let available = - self.zfs_get_integer(mountpoint.as_str(), ZFS_PROP_AVAILABLE)?; - let capacity = used + available; - let below = (used * 100) / capacity < percent; - Ok((below, used)) - } - - fn mountpoint( - &self, - mount_config: &MountConfig, - zpool: &ZpoolName, - mountpoint: &'static str, - ) -> Utf8PathBuf; -} - -/// Helper for listing currently-running zones on the system -#[async_trait] -trait ZoneInvoker { - async fn get_zones(&self) -> Result, ArchiveLogsError>; -} - -struct RealCoreDumpAdm {} -struct RealZfs {} -struct RealZone {} - -#[async_trait] -impl CoreDumpAdmInvoker for RealCoreDumpAdm { - fn coreadm(&self, core_dir: &Utf8PathBuf) -> Result<(), ExecutionError> { - let mut cmd = CoreAdm::new(); - - // disable per-process core patterns - cmd.disable(CoreFileOption::Process); - cmd.disable(CoreFileOption::ProcSetid); - - // use the global core pattern - cmd.enable(CoreFileOption::Global); - cmd.enable(CoreFileOption::GlobalSetid); - - // set the global pattern to place all cores into core_dir, - // with filenames of "core.[zone-name].[exe-filename].[pid].[time]" - cmd.global_pattern(core_dir.join("core.%z.%f.%p.%t")); - - // also collect DWARF data from the exe and its library deps - cmd.global_contents("default+debug"); - - cmd.execute() - } - - // Invokes `dumpadm(8)` to configure the kernel to dump core into the given - // `dump_slice` block device in the event of a panic. 
If a core is already - // present in that block device, and a `savecore_dir` is provided, this - // function also invokes `savecore(8)` to save it into that directory. - // On success, returns Ok(Some(stdout)) if `savecore(8)` was invoked, or - // Ok(None) if it wasn't. - async fn dumpadm( - &self, - dump_slice: &Utf8PathBuf, - savecore_dir: Option<&Utf8PathBuf>, - ) -> Result, ExecutionError> { - let savecore_dir_cloned = if let Some(dir) = savecore_dir.cloned() { - dir - } else { - // if we don't have a savecore destination yet, still create and use - // a tmpfs path (rather than the default location under /var/crash, - // which is in the ramdisk pool), because dumpadm refuses to do what - // we ask otherwise. - let tmp_crash = "/tmp/crash"; - tokio::fs::create_dir_all(tmp_crash).await.map_err(|err| { - ExecutionError::ExecutionStart { - command: format!("mkdir {tmp_crash:?}"), - err, - } - })?; - Utf8PathBuf::from(tmp_crash) - }; - - // Use the given block device path for dump storage: - let mut cmd = DumpAdm::new(dump_slice.to_owned(), savecore_dir_cloned); - - // Include memory from the current process if there is one for the panic - // context, in addition to kernel memory: - cmd.content_type(DumpContentType::CurProc); - - // Compress crash dumps: - cmd.compress(true); - - // Do not run savecore(8) automatically on boot (irrelevant anyhow, as - // the config file being mutated by dumpadm won't survive reboots on - // gimlets). The sled-agent will invoke it manually instead. - cmd.no_boot_time_savecore(); - - cmd.execute()?; - - // do we have a destination for the saved dump - if savecore_dir.is_some() { - // and does the dump slice have one to save off - if let Ok(true) = - illumos_utils::dumpadm::dump_flag_is_valid(dump_slice).await - { - return illumos_utils::dumpadm::SaveCore.execute(); - } - } - Ok(None) - } -} - -impl ZfsInvoker for RealZfs { - fn zfs_get_prop( - &self, - mountpoint_or_name: &str, - property: &str, - ) -> Result { - let mut cmd = std::process::Command::new(illumos_utils::zfs::ZFS); - cmd.arg("get").arg("-Hpo").arg("value"); - cmd.arg(property); - cmd.arg(mountpoint_or_name); - let output = cmd.output()?; - Ok(String::from_utf8(output.stdout)?.trim().to_string()) - } - - fn mountpoint( - &self, - mount_config: &MountConfig, - zpool: &ZpoolName, - mountpoint: &'static str, - ) -> Utf8PathBuf { - zpool.dataset_mountpoint(&mount_config.root, mountpoint) - } -} - -#[async_trait] -impl ZoneInvoker for RealZone { - async fn get_zones(&self) -> Result, ArchiveLogsError> { - Ok(zone::Adm::list() - .await? - .into_iter() - .filter(|z| z.global() || z.name().starts_with(ZONE_PREFIX)) - .collect::>()) - } -} - /// Returns whether a given file on a debug dataset can safely be deleted as /// part of cleaning up that dataset fn safe_to_delete(path: &Utf8Path, meta: &std::fs::Metadata) -> bool { @@ -800,7 +423,7 @@ fn safe_to_delete(path: &Utf8Path, meta: &std::fs::Metadata) -> bool { } impl DebugCollectorWorker { - fn new( + pub(super) fn new( coredumpadm_invoker: Box, zfs_invoker: Box, zone_invoker: Box, @@ -826,7 +449,7 @@ impl DebugCollectorWorker { } /// Runs the body of the DebugCollector - async fn poll_file_archival(mut self) { + pub(super) async fn poll_file_archival(mut self) { info!(self.log, "DebugCollector poll loop started."); // A oneshot which helps callers track when updates have propagated. 
@@ -1691,6 +1314,7 @@ struct CleanupDirInfo { #[cfg(test)] mod tests { use super::*; + use async_trait::async_trait; use camino::Utf8Path; use camino_tempfile::Utf8TempDir; use illumos_utils::dumpadm::{ @@ -1700,6 +1324,7 @@ mod tests { use std::collections::HashMap; use std::str::FromStr; use tokio::io::AsyncWriteExt; + use zone::Zone; impl Clone for ZfsGetError { fn clone(&self) -> Self { @@ -1792,7 +1417,7 @@ mod tests { } #[async_trait] impl ZoneInvoker for FakeZone { - async fn get_zones(&self) -> Result, ArchiveLogsError> { + async fn get_zones(&self) -> Result, ZoneError> { Ok(self.zones.clone()) } } From 9e5a97ec97763ae7bfb262da6e9d6142796fd8aa Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 9 Dec 2025 08:58:28 -0800 Subject: [PATCH 7/7] update comment to reflect support bundles are directories --- sled-agent/config-reconciler/src/debug_collector/worker.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sled-agent/config-reconciler/src/debug_collector/worker.rs b/sled-agent/config-reconciler/src/debug_collector/worker.rs index ee4999fcaf9..c092a5a4843 100644 --- a/sled-agent/config-reconciler/src/debug_collector/worker.rs +++ b/sled-agent/config-reconciler/src/debug_collector/worker.rs @@ -103,7 +103,8 @@ //! //! * `core.[zone-name].[exe-filename].[pid].[time]`: process core dumps //! * `unix.[0-9]+`, `bounds`: files associated with kernel crash dumps -//! * `$UUID`: support bundles (wholly unrelated to the DebugCollector) +//! * `$UUID`: directories related to support bundles (wholly unrelated to +//! the DebugCollector) //! * `oxz_[zone-type]_[zone-uuid]`: directory containing all the log files //! for the corresponding zone. //!