From 8951c578beaac5f6ea39a3057c4e6249bb7b9617 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 08:38:59 +0100 Subject: [PATCH 01/24] s3_scrubber: remote atty dependency --- Cargo.lock | 21 --------------------- s3_scrubber/Cargo.toml | 2 -- s3_scrubber/src/lib.rs | 1 - 3 files changed, 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 98cf23c62043..99bae89bcb35 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -213,17 +213,6 @@ dependencies = [ "critical-section", ] -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi 0.1.19", - "libc", - "winapi", -] - [[package]] name = "autocfg" version = "1.1.0" @@ -1786,15 +1775,6 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" -[[package]] -name = "hermit-abi" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] - [[package]] name = "hermit-abi" version = "0.2.6" @@ -3738,7 +3718,6 @@ name = "s3_scrubber" version = "0.1.0" dependencies = [ "anyhow", - "atty", "aws-config", "aws-sdk-s3", "aws-smithy-http", diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index 47668eb4aafb..a6e7ccce40bf 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -34,6 +34,4 @@ pageserver = {path="../pageserver"} tracing.workspace = true tracing-subscriber.workspace = true clap.workspace = true - -atty = "0.2" tracing-appender = "0.2" \ No newline at end of file diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index ea1338cf111a..d7882e05b63d 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -140,7 +140,6 @@ pub fn init_logging(binary_name: &str, dry_run: bool, node_kind: &str) -> Worker .with_writer(file_writer); let stdout_logs = fmt::Layer::new() .with_target(false) - .with_ansi(atty::is(atty::Stream::Stdout)) .with_writer(std::io::stdout); tracing_subscriber::registry() .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) From 87421aa542374637694c13e15c666a0d7fd1b807 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 18 Aug 2023 17:10:14 +0100 Subject: [PATCH 02/24] scrubber: add scan-metadata command This provides the same analysis done at the end of `tidy`, but in a standalone command that uses Stream-based listing helpers with parallel execution to provide a faster result when one is interested in the contents of a bucket, but does not want to check the control plane state to learn which items in the bucket correspond to active/non-deleted tenants/timelines. 
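An illustrative invocation (a sketch only: the account id, region and bucket values below are placeholders) uses the same environment variables that `BucketConfig::from_env()` reads, and exits non-zero when fatal errors are detected:

```shell
# Hypothetical example values; scan-metadata reads its target bucket from the
# environment and prints a summary of index versions, sizes and error counts.
SSO_ACCOUNT_ID=123456789012 \
REGION=eu-central-1 \
BUCKET=my-pageserver-bucket \
  cargo run --release -p s3_scrubber -- scan-metadata
```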
--- Cargo.lock | 15 ++ s3_scrubber/Cargo.toml | 8 +- s3_scrubber/src/checks.rs | 191 +++++++------- .../delete_batch_producer/timeline_batch.rs | 2 +- s3_scrubber/src/lib.rs | 117 +++++++-- s3_scrubber/src/main.rs | 96 +++----- s3_scrubber/src/metadata_stream.rs | 106 ++++++++ s3_scrubber/src/s3_deletion.rs | 8 +- s3_scrubber/src/scan_metadata.rs | 233 ++++++++++++++++++ 9 files changed, 596 insertions(+), 180 deletions(-) create mode 100644 s3_scrubber/src/metadata_stream.rs create mode 100644 s3_scrubber/src/scan_metadata.rs diff --git a/Cargo.lock b/Cargo.lock index 99bae89bcb35..503c944affab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1805,6 +1805,16 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" +[[package]] +name = "histogram" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e673d137229619d5c2c8903b6ed5852b43636c0017ff2e66b1aafb8ccf04b80b" +dependencies = [ + "serde", + "thiserror", +] + [[package]] name = "hmac" version = "0.12.1" @@ -3718,6 +3728,7 @@ name = "s3_scrubber" version = "0.1.0" dependencies = [ "anyhow", + "async-stream", "aws-config", "aws-sdk-s3", "aws-smithy-http", @@ -3728,7 +3739,10 @@ dependencies = [ "clap", "crc32c", "either", + "futures-util", "hex", + "histogram", + "itertools", "pageserver", "rand", "reqwest", @@ -3738,6 +3752,7 @@ dependencies = [ "thiserror", "tokio", "tokio-rustls", + "tokio-stream", "tracing", "tracing-appender", "tracing-subscriber", diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index a6e7ccce40bf..82cd105d4f1c 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -22,6 +22,10 @@ serde_json.workspace = true serde_with.workspace = true workspace_hack.workspace = true utils.workspace = true +async-stream.workspace = true +tokio-stream.workspace = true +futures-util.workspace = true +itertools.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } chrono = { workspace = true, default-features = false, features = ["clock", "serde"] } @@ -30,8 +34,8 @@ aws-config = { workspace = true, default-features = false, features = ["rustls", pageserver = {path="../pageserver"} - tracing.workspace = true tracing-subscriber.workspace = true clap.workspace = true -tracing-appender = "0.2" \ No newline at end of file +tracing-appender = "0.2" +histogram = "0.7" \ No newline at end of file diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index c52a40ee9453..becb2af8ad87 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -4,13 +4,12 @@ use std::time::Duration; use anyhow::Context; use aws_sdk_s3::Client; -use tokio::io::AsyncReadExt; use tokio::task::JoinSet; use tracing::{error, info, info_span, warn, Instrument}; use crate::cloud_admin_api::{BranchData, CloudAdminApiClient, ProjectId}; use crate::delete_batch_producer::DeleteProducerStats; -use crate::{list_objects_with_retries, RootTarget, MAX_RETRIES}; +use crate::{download_object_with_retries, list_objects_with_retries, RootTarget, MAX_RETRIES}; use pageserver::tenant::storage_layer::LayerFileName; use pageserver::tenant::IndexPart; use utils::id::TenantTimelineId; @@ -92,9 +91,9 @@ pub async fn validate_pageserver_active_tenant_and_timelines( branch_checks.spawn( async move { let check_errors = branch_cleanup_and_check_errors( - id, + &id, &s3_root, - &s3_active_branch, + Some(&s3_active_branch), console_branch, s3_data, ) @@ -107,13 
+106,13 @@ pub async fn validate_pageserver_active_tenant_and_timelines( } let mut total_stats = BranchCheckStats::default(); - while let Some((id, branch_check_errors)) = branch_checks + while let Some((id, analysis)) = branch_checks .join_next() .await .transpose() .context("branch check task join")? { - total_stats.add(id, branch_check_errors); + total_stats.add(id, analysis.errors); } Ok(total_stats) } @@ -160,35 +159,59 @@ impl BranchCheckStats { } } -async fn branch_cleanup_and_check_errors( - id: TenantTimelineId, +pub struct TimelineAnalysis { + /// Anomalies detected + pub errors: Vec, + + /// Healthy-but-noteworthy, like old-versioned structures that are readable but + /// worth reporting for awareness that we must not remove that old version decoding + /// yet. + pub warnings: Vec, + + /// Keys not referenced in metadata: candidates for removal + pub garbage_keys: Vec, +} + +impl TimelineAnalysis { + fn new() -> Self { + Self { + errors: Vec::new(), + warnings: Vec::new(), + garbage_keys: Vec::new(), + } + } +} + +pub async fn branch_cleanup_and_check_errors( + id: &TenantTimelineId, s3_root: &RootTarget, - s3_active_branch: &BranchData, + s3_active_branch: Option<&BranchData>, console_branch: Option, s3_data: Option, -) -> Vec { - info!( - "Checking timeline for branch branch {:?}/{:?}", - s3_active_branch.project_id, s3_active_branch.id - ); - let mut branch_check_errors = Vec::new(); - - match console_branch { - Some(console_active_branch) => { - if console_active_branch.deleted { - branch_check_errors.push(format!("Timeline has deleted branch data in the console (id = {:?}, project_id = {:?}), recheck whether if it got removed during the check", - s3_active_branch.id, s3_active_branch.project_id)) - } - }, - None => branch_check_errors.push(format!("Timeline has no branch data in the console (id = {:?}, project_id = {:?}), recheck whether if it got removed during the check", +) -> TimelineAnalysis { + let mut result = TimelineAnalysis::new(); + + info!("Checking timeline {id}"); + + if let Some(s3_active_branch) = s3_active_branch { + info!( + "Checking console status for timeline for branch branch {:?}/{:?}", + s3_active_branch.project_id, s3_active_branch.id + ); + match console_branch { + Some(_) => {result.errors.push(format!("Timeline has deleted branch data in the console (id = {:?}, project_id = {:?}), recheck whether if it got removed during the check", + s3_active_branch.id, s3_active_branch.project_id)) + }, + None => { + result.errors.push(format!("Timeline has no branch data in the console (id = {:?}, project_id = {:?}), recheck whether if it got removed during the check", s3_active_branch.id, s3_active_branch.project_id)) + } + }; } - let mut keys_to_remove = Vec::new(); - match s3_data { Some(s3_data) => { - keys_to_remove.extend(s3_data.keys_to_remove); + result.garbage_keys.extend(s3_data.keys_to_remove); match s3_data.blob_data { BlobDataParseResult::Parsed { @@ -196,16 +219,23 @@ async fn branch_cleanup_and_check_errors( mut s3_layers, } => { if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) { - branch_check_errors.push(format!( + result.errors.push(format!( "index_part.json version: {}", index_part.get_version() )) } + if &index_part.get_version() != IndexPart::KNOWN_VERSIONS.last().unwrap() { + result.warnings.push(format!( + "index_part.json version is not latest: {}", + index_part.get_version() + )) + } + if index_part.metadata.disk_consistent_lsn() != index_part.get_disk_consistent_lsn() { - branch_check_errors.push(format!( + 
result.errors.push(format!( "Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})", index_part.metadata.disk_consistent_lsn(), index_part.get_disk_consistent_lsn(), @@ -220,13 +250,13 @@ async fn branch_cleanup_and_check_errors( for (layer, metadata) in index_part.layer_metadata { if metadata.file_size == 0 { - branch_check_errors.push(format!( + result.errors.push(format!( "index_part.json contains a layer {} that has 0 size in its layer metadata", layer.file_name(), )) } if !s3_layers.remove(&layer) { - branch_check_errors.push(format!( + result.errors.push(format!( "index_part.json contains a layer {} that is not present in S3", layer.file_name(), )) @@ -234,55 +264,66 @@ async fn branch_cleanup_and_check_errors( } if !s3_layers.is_empty() { - branch_check_errors.push(format!( + result.errors.push(format!( "index_part.json does not contain layers from S3: {:?}", s3_layers .iter() .map(|layer_name| layer_name.file_name()) .collect::>(), )); - keys_to_remove.extend(s3_layers.iter().map(|layer_name| { - let mut key = s3_root.timeline_root(id).prefix_in_bucket; - let delimiter = s3_root.delimiter(); - if !key.ends_with(delimiter) { - key.push_str(delimiter); - } - key.push_str(&layer_name.file_name()); - key - })); + result + .garbage_keys + .extend(s3_layers.iter().map(|layer_name| { + let mut key = s3_root.timeline_root(id).prefix_in_bucket; + let delimiter = s3_root.delimiter(); + if !key.ends_with(delimiter) { + key.push_str(delimiter); + } + key.push_str(&layer_name.file_name()); + key + })); } } - BlobDataParseResult::Incorrect(parse_errors) => branch_check_errors.extend( + BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend( parse_errors .into_iter() .map(|error| format!("parse error: {error}")), ), } } - None => branch_check_errors.push("Timeline has no data on S3 at all".to_string()), + None => result + .errors + .push("Timeline has no data on S3 at all".to_string()), } - if branch_check_errors.is_empty() { + if result.errors.is_empty() { info!("No check errors found"); } else { - warn!("Found check errors: {branch_check_errors:?}"); + warn!("Timeline metadata errors: {0:?}", result.errors); } - if !keys_to_remove.is_empty() { - error!("The following keys should be removed from S3: {keys_to_remove:?}") + if !result.warnings.is_empty() { + warn!("Timeline metadata warnings: {0:?}", result.warnings); } - branch_check_errors + if !result.garbage_keys.is_empty() { + error!( + "The following keys should be removed from S3: {0:?}", + result.garbage_keys + ) + } + + result } #[derive(Debug)] -struct S3TimelineBlobData { - blob_data: BlobDataParseResult, - keys_to_remove: Vec, +pub struct S3TimelineBlobData { + pub blob_data: BlobDataParseResult, + pub keys_to_remove: Vec, } #[derive(Debug)] -enum BlobDataParseResult { +pub enum BlobDataParseResult { Parsed { index_part: IndexPart, s3_layers: HashSet, @@ -290,7 +331,7 @@ enum BlobDataParseResult { Incorrect(Vec), } -async fn list_timeline_blobs( +pub async fn list_timeline_blobs( s3_client: &Client, id: TenantTimelineId, s3_root: &RootTarget, @@ -298,7 +339,7 @@ async fn list_timeline_blobs( let mut s3_layers = HashSet::new(); let mut index_part_object = None; - let timeline_dir_target = s3_root.timeline_root(id); + let timeline_dir_target = s3_root.timeline_root(&id); let mut continuation_token = None; let mut errors = Vec::new(); @@ -394,45 +435,3 @@ async fn list_timeline_blobs( keys_to_remove, }) } - -async fn download_object_with_retries( - s3_client: &Client, - bucket_name: &str, - key: 
&str, -) -> anyhow::Result> { - for _ in 0..MAX_RETRIES { - let mut body_buf = Vec::new(); - let response_stream = match s3_client - .get_object() - .bucket(bucket_name) - .key(key) - .send() - .await - { - Ok(response) => response, - Err(e) => { - error!("Failed to download object for key {key}: {e}"); - tokio::time::sleep(Duration::from_secs(1)).await; - continue; - } - }; - - match response_stream - .body - .into_async_read() - .read_to_end(&mut body_buf) - .await - { - Ok(bytes_read) => { - info!("Downloaded {bytes_read} bytes for object object with key {key}"); - return Ok(body_buf); - } - Err(e) => { - error!("Failed to stream object body for key {key}: {e}"); - tokio::time::sleep(Duration::from_secs(1)).await; - } - } - } - - anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") -} diff --git a/s3_scrubber/src/delete_batch_producer/timeline_batch.rs b/s3_scrubber/src/delete_batch_producer/timeline_batch.rs index 0a0b31ae8709..2ad522d3fb52 100644 --- a/s3_scrubber/src/delete_batch_producer/timeline_batch.rs +++ b/s3_scrubber/src/delete_batch_producer/timeline_batch.rs @@ -41,7 +41,7 @@ pub async fn schedule_cleanup_deleted_timelines( let new_stats = async move { let tenant_id_to_check = project_to_check.tenant; - let check_target = check_root.timelines_root(tenant_id_to_check); + let check_target = check_root.timelines_root(&tenant_id_to_check); let stats = super::process_s3_target_recursively( &check_s3_client, &check_target, diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index d7882e05b63d..f834bbc4c689 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -1,12 +1,15 @@ pub mod checks; pub mod cloud_admin_api; pub mod delete_batch_producer; +pub mod metadata_stream; mod s3_deletion; +pub mod scan_metadata; use std::env; use std::fmt::Display; use std::time::Duration; +use anyhow::Context; use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; @@ -14,7 +17,9 @@ use aws_config::sso::SsoCredentialsProvider; use aws_sdk_s3::config::Region; use aws_sdk_s3::{Client, Config}; +use reqwest::Url; pub use s3_deletion::S3Deleter; +use tokio::io::AsyncReadExt; use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; @@ -23,6 +28,8 @@ use utils::id::{TenantId, TenantTimelineId}; const MAX_RETRIES: usize = 20; const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; +pub const CLI_NAME: &str = "s3-scrubber"; + #[derive(Debug, Clone)] pub struct S3Target { pub bucket_name: String, @@ -69,19 +76,19 @@ impl RootTarget { } } - pub fn tenant_root(&self, tenant_id: TenantId) -> S3Target { + pub fn tenant_root(&self, tenant_id: &TenantId) -> S3Target { self.tenants_root().with_sub_segment(&tenant_id.to_string()) } - pub fn timelines_root(&self, tenant_id: TenantId) -> S3Target { + pub fn timelines_root(&self, tenant_id: &TenantId) -> S3Target { match self { Self::Pageserver(_) => self.tenant_root(tenant_id).with_sub_segment("timelines"), Self::Safekeeper(_) => self.tenant_root(tenant_id), } } - pub fn timeline_root(&self, id: TenantTimelineId) -> S3Target { - self.timelines_root(id.tenant_id) + pub fn timeline_root(&self, id: &TenantTimelineId) -> S3Target { + self.timelines_root(&id.tenant_id) .with_sub_segment(&id.timeline_id.to_string()) } @@ -100,6 +107,48 @@ impl RootTarget { } } +pub struct BucketConfig { + pub region: String, + pub 
bucket: String, + pub sso_account_id: String, +} + +impl Display for BucketConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}/{}/{}", self.sso_account_id, self.region, self.bucket) + } +} + +impl BucketConfig { + pub fn from_env() -> anyhow::Result { + let sso_account_id = + env::var("SSO_ACCOUNT_ID").context("'SSO_ACCOUNT_ID' param retrieval")?; + let region = env::var("REGION").context("'REGION' param retrieval")?; + let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?; + + Ok(Self { + region, + bucket, + sso_account_id, + }) + } +} + +pub struct ConsoleConfig { + pub admin_api_url: Url, +} + +impl ConsoleConfig { + pub fn from_env() -> anyhow::Result { + let admin_api_url: Url = env::var("CLOUD_ADMIN_API_URL") + .context("'CLOUD_ADMIN_API_URL' param retrieval")? + .parse() + .context("'CLOUD_ADMIN_API_URL' param parsing")?; + + Ok(Self { admin_api_url }) + } +} + pub fn get_cloud_admin_api_token_or_exit() -> String { match env::var(CLOUD_ADMIN_API_TOKEN_ENV_VAR) { Ok(token) => token, @@ -114,23 +163,7 @@ pub fn get_cloud_admin_api_token_or_exit() -> String { } } -pub fn init_logging(binary_name: &str, dry_run: bool, node_kind: &str) -> WorkerGuard { - let file_name = if dry_run { - format!( - "{}_{}_{}__dry.log", - binary_name, - node_kind, - chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") - ) - } else { - format!( - "{}_{}_{}.log", - binary_name, - node_kind, - chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") - ) - }; - +pub fn init_logging(file_name: &str) -> WorkerGuard { let (file_writer, guard) = tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name)); @@ -201,3 +234,45 @@ async fn list_objects_with_retries( anyhow::bail!("Failed to list objects {MAX_RETRIES} times") } + +async fn download_object_with_retries( + s3_client: &Client, + bucket_name: &str, + key: &str, +) -> anyhow::Result> { + for _ in 0..MAX_RETRIES { + let mut body_buf = Vec::new(); + let response_stream = match s3_client + .get_object() + .bucket(bucket_name) + .key(key) + .send() + .await + { + Ok(response) => response, + Err(e) => { + error!("Failed to download object for key {key}: {e}"); + tokio::time::sleep(Duration::from_secs(1)).await; + continue; + } + }; + + match response_stream + .body + .into_async_read() + .read_to_end(&mut body_buf) + .await + { + Ok(bytes_read) => { + tracing::info!("Downloaded {bytes_read} bytes for object object with key {key}"); + return Ok(body_buf); + } + Err(e) => { + error!("Failed to stream object body for key {key}: {e}"); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + + anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") +} diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index 7004bcad5167..6bb08681614b 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -1,17 +1,16 @@ use std::collections::HashMap; -use std::env; use std::fmt::Display; use std::num::NonZeroUsize; use std::sync::Arc; use anyhow::Context; use aws_sdk_s3::config::Region; -use reqwest::Url; use s3_scrubber::cloud_admin_api::CloudAdminApiClient; use s3_scrubber::delete_batch_producer::DeleteBatchProducer; +use s3_scrubber::scan_metadata::scan_metadata; use s3_scrubber::{ - checks, get_cloud_admin_api_token_or_exit, init_logging, init_s3_client, RootTarget, S3Deleter, - S3Target, TraversingDepth, + checks, get_cloud_admin_api_token_or_exit, init_logging, init_s3_client, BucketConfig, + ConsoleConfig, RootTarget, S3Deleter, S3Target, 
TraversingDepth, CLI_NAME, }; use tracing::{info, info_span, warn}; @@ -59,48 +58,7 @@ enum Command { #[arg(short, long, default_value_t = false)] skip_validation: bool, }, -} - -struct BucketConfig { - region: String, - bucket: String, - sso_account_id: String, -} - -impl Display for BucketConfig { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}/{}/{}", self.sso_account_id, self.region, self.bucket) - } -} - -impl BucketConfig { - fn from_env() -> anyhow::Result { - let sso_account_id = - env::var("SSO_ACCOUNT_ID").context("'SSO_ACCOUNT_ID' param retrieval")?; - let region = env::var("REGION").context("'REGION' param retrieval")?; - let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?; - - Ok(Self { - region, - bucket, - sso_account_id, - }) - } -} - -struct ConsoleConfig { - admin_api_url: Url, -} - -impl ConsoleConfig { - fn from_env() -> anyhow::Result { - let admin_api_url: Url = env::var("CLOUD_ADMIN_API_URL") - .context("'CLOUD_ADMIN_API_URL' param retrieval")? - .parse() - .context("'CLOUD_ADMIN_API_URL' param parsing")?; - - Ok(Self { admin_api_url }) - } + ScanMetadata {}, } async fn tidy( @@ -111,13 +69,25 @@ async fn tidy( depth: TraversingDepth, skip_validation: bool, ) -> anyhow::Result<()> { - let binary_name = env::args() - .next() - .context("binary name in not the first argument")?; - let dry_run = !cli.delete; - let _guard = init_logging(&binary_name, dry_run, node_kind.as_str()); - let _main_span = info_span!("tidy", binary = %binary_name, %dry_run).entered(); + let file_name = if dry_run { + format!( + "{}_{}_{}__dry.log", + CLI_NAME, + node_kind, + chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") + ) + } else { + format!( + "{}_{}_{}.log", + CLI_NAME, + node_kind, + chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") + ) + }; + + let _guard = init_logging(&file_name); + let _main_span = info_span!("tidy", %dry_run).entered(); if dry_run { info!("Dry run, not removing items for real"); @@ -247,7 +217,7 @@ async fn main() -> anyhow::Result<()> { let bucket_config = BucketConfig::from_env()?; - match &cli.command { + match cli.command { Command::Tidy { node_kind, depth, @@ -258,11 +228,25 @@ async fn main() -> anyhow::Result<()> { &cli, bucket_config, console_config, - *node_kind, - *depth, - *skip_validation, + node_kind, + depth, + skip_validation, ) .await } + Command::ScanMetadata {} => match scan_metadata(bucket_config).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) + } + Ok(summary) => { + println!("{}", summary.summary_string()); + if summary.is_fatal() { + Err(anyhow::anyhow!("Fatal scrub errors detected")) + } else { + Ok(()) + } + } + }, } } diff --git a/s3_scrubber/src/metadata_stream.rs b/s3_scrubber/src/metadata_stream.rs new file mode 100644 index 000000000000..4e500a96cf71 --- /dev/null +++ b/s3_scrubber/src/metadata_stream.rs @@ -0,0 +1,106 @@ +use anyhow::Context; +use async_stream::{stream, try_stream}; +use aws_sdk_s3::Client; +use tokio_stream::Stream; + +use crate::{list_objects_with_retries, RootTarget, TenantId}; +use utils::id::{TenantTimelineId, TimelineId}; + +/// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 +pub fn stream_tenants<'a>( + s3_client: &'a Client, + target: &'a RootTarget, +) -> impl Stream> + 'a { + try_stream! 
{ + let mut continuation_token = None; + loop { + let tenants_target = target.tenants_root(); + let fetch_response = + list_objects_with_retries(s3_client, tenants_target, continuation_token.clone()).await?; + + let new_entry_ids = fetch_response + .common_prefixes() + .unwrap_or_default() + .iter() + .filter_map(|prefix| prefix.prefix()) + .filter_map(|prefix| -> Option<&str> { + prefix + .strip_prefix(&tenants_target.prefix_in_bucket)? + .strip_suffix('/') + }).map(|entry_id_str| { + entry_id_str + .parse() + .with_context(|| format!("Incorrect entry id str: {entry_id_str}")) + }); + + for i in new_entry_ids { + yield i?; + } + + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + } +} + +/// Given a TenantId, output a stream of the timelines within that tenant, discovered +/// using ListObjectsv2. The listing is done before the stream is built, so that this +/// function can be used to generate concurrency on a stream using buffer_unordered. +pub async fn stream_tenant_timelines<'a>( + s3_client: &'a Client, + target: &'a RootTarget, + tenant: TenantId, +) -> anyhow::Result> + 'a> { + let mut timeline_ids: Vec> = Vec::new(); + let mut continuation_token = None; + let timelines_target = target.timelines_root(&tenant); + + loop { + tracing::info!("Listing in {}", tenant); + let fetch_response = + list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone()) + .await; + let fetch_response = match fetch_response { + Err(e) => { + timeline_ids.push(Err(e)); + break; + } + Ok(r) => r, + }; + + let new_entry_ids = fetch_response + .common_prefixes() + .unwrap_or_default() + .iter() + .filter_map(|prefix| prefix.prefix()) + .filter_map(|prefix| -> Option<&str> { + prefix + .strip_prefix(&timelines_target.prefix_in_bucket)? + .strip_suffix('/') + }) + .map(|entry_id_str| { + entry_id_str + .parse::() + .with_context(|| format!("Incorrect entry id str: {entry_id_str}")) + }); + + for i in new_entry_ids { + timeline_ids.push(i); + } + + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + tracing::info!("Yielding for {}", tenant); + Ok(stream! 
{ + for i in timeline_ids { + let id = i?; + yield Ok(TenantTimelineId::new(tenant, id)); + } + }) +} diff --git a/s3_scrubber/src/s3_deletion.rs b/s3_scrubber/src/s3_deletion.rs index 716443790b9a..a03cc65c8913 100644 --- a/s3_scrubber/src/s3_deletion.rs +++ b/s3_scrubber/src/s3_deletion.rs @@ -164,7 +164,7 @@ async fn delete_tenants_batch( s3_target, s3_client, dry_run, - |root_target, tenant_to_delete| root_target.tenant_root(tenant_to_delete), + |root_target, tenant_to_delete| root_target.tenant_root(&tenant_to_delete), ) .await?; @@ -215,7 +215,7 @@ async fn delete_timelines_batch( s3_target, s3_client, dry_run, - |root_target, timeline_to_delete| root_target.timeline_root(timeline_to_delete), + |root_target, timeline_to_delete| root_target.timeline_root(&timeline_to_delete), ) .await?; @@ -386,7 +386,7 @@ async fn ensure_tenant_batch_deleted( for &tenant_id in batch { let fetch_response = - list_objects_with_retries(s3_client, &s3_target.tenant_root(tenant_id), None).await?; + list_objects_with_retries(s3_client, &s3_target.tenant_root(&tenant_id), None).await?; if fetch_response.is_truncated() || fetch_response.contents().is_some() @@ -415,7 +415,7 @@ async fn ensure_timeline_batch_deleted( for &id in batch { let fetch_response = - list_objects_with_retries(s3_client, &s3_target.timeline_root(id), None).await?; + list_objects_with_retries(s3_client, &s3_target.timeline_root(&id), None).await?; if fetch_response.is_truncated() || fetch_response.contents().is_some() diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs new file mode 100644 index 000000000000..301d6a018cd6 --- /dev/null +++ b/s3_scrubber/src/scan_metadata.rs @@ -0,0 +1,233 @@ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use crate::checks::{ + branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData, + TimelineAnalysis, +}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; +use crate::{init_logging, init_s3_client, BucketConfig, RootTarget, S3Target, CLI_NAME}; +use aws_sdk_s3::Client; +use aws_types::region::Region; +use futures_util::{pin_mut, StreamExt, TryStreamExt}; +use histogram::Histogram; +use pageserver::tenant::IndexPart; +use utils::id::TenantTimelineId; + +pub struct MetadataSummary { + count: usize, + with_errors: HashSet, + with_warnings: HashSet, + with_garbage: HashSet, + indices_by_version: HashMap, + + layer_count: MinMaxHisto, + timeline_size_bytes: MinMaxHisto, + layer_size_bytes: MinMaxHisto, +} + +/// A histogram plus minimum and maximum tracking +struct MinMaxHisto { + histo: Histogram, + min: u64, + max: u64, +} + +impl MinMaxHisto { + fn new() -> Self { + Self { + histo: histogram::Histogram::builder() + .build() + .expect("Bad histogram params"), + min: u64::MAX, + max: 0, + } + } + + fn sample(&mut self, v: u64) -> Result<(), histogram::Error> { + self.min = std::cmp::min(self.min, v); + self.max = std::cmp::max(self.max, v); + let r = self.histo.increment(v, 1); + + if r.is_err() { + tracing::warn!("Bad histogram sample: {v}"); + } + + r + } + + fn oneline(&self) -> String { + let percentiles = match self.histo.percentiles(&[1.0, 10.0, 50.0, 90.0, 99.0]) { + Ok(p) => p, + Err(e) => return format!("No data: {}", e), + }; + + let percentiles: Vec = percentiles + .iter() + .map(|p| p.bucket().low() + p.bucket().high() / 2) + .collect(); + + format!( + "min {}, 1% {}, 10% {}, 50% {}, 90% {}, 99% {}, max {}", + self.min, + percentiles[0], + percentiles[1], + percentiles[2], + 
percentiles[3], + percentiles[4], + self.max, + ) + } +} + +impl MetadataSummary { + fn new() -> Self { + Self { + count: 0, + with_errors: HashSet::new(), + with_warnings: HashSet::new(), + with_garbage: HashSet::new(), + indices_by_version: HashMap::new(), + layer_count: MinMaxHisto::new(), + timeline_size_bytes: MinMaxHisto::new(), + layer_size_bytes: MinMaxHisto::new(), + } + } + + fn update_histograms(&mut self, index_part: &IndexPart) -> Result<(), histogram::Error> { + self.layer_count + .sample(index_part.layer_metadata.len() as u64)?; + let mut total_size: u64 = 0; + for meta in index_part.layer_metadata.values() { + total_size += meta.file_size; + self.layer_size_bytes.sample(meta.file_size)?; + } + self.timeline_size_bytes.sample(total_size)?; + + Ok(()) + } + + fn update_data(&mut self, data: &S3TimelineBlobData) { + self.count += 1; + if let BlobDataParseResult::Parsed { + index_part, + s3_layers: _, + } = &data.blob_data + { + *self + .indices_by_version + .entry(index_part.get_version()) + .or_insert(0) += 1; + + if let Err(e) = self.update_histograms(index_part) { + // Value out of range? Warn that the results are untrustworthy + tracing::warn!( + "Error updating histograms, summary stats may be wrong: {}", + e + ); + } + } + } + + fn update_analysis(&mut self, id: &TenantTimelineId, analysis: &TimelineAnalysis) { + if !analysis.errors.is_empty() { + self.with_errors.insert(*id); + } + + if !analysis.warnings.is_empty() { + self.with_warnings.insert(*id); + } + } + + /// Long-form output for printing at end of a scan + pub fn summary_string(&self) -> String { + let version_summary: String = itertools::join( + self.indices_by_version + .iter() + .map(|(k, v)| format!("{k}: {v}")), + ", ", + ); + + format!( + "Timelines: {0} +With errors: {1} +With warnings: {2} +With garbage: {3} +Index versions: {version_summary} +Timeline size bytes: {4} +Layer size bytes: {5} +Timeline layer count: {6} +", + self.count, + self.with_errors.len(), + self.with_warnings.len(), + self.with_garbage.len(), + self.timeline_size_bytes.oneline(), + self.layer_size_bytes.oneline(), + self.layer_count.oneline(), + ) + } + + pub fn is_fatal(&self) -> bool { + !self.with_errors.is_empty() + } +} + +/// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics. +pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result { + let file_name = format!( + "{}_scan_metadata_{}_{}.log", + CLI_NAME, + bucket_config.bucket, + chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") + ); + + let _guard = init_logging(&file_name); + + let s3_client = Arc::new(init_s3_client( + bucket_config.sso_account_id, + Region::new(bucket_config.region), + )); + let delimiter = "/"; + let target = RootTarget::Pageserver(S3Target { + bucket_name: bucket_config.bucket.to_string(), + prefix_in_bucket: ["pageserver", "v1", "tenants", ""].join(delimiter), + delimiter: delimiter.to_string(), + }); + + let tenants = stream_tenants(&s3_client, &target); + + // How many tenants to process in parallel. The higher this is, the harder we hit S3. 
+ const CONCURRENCY: usize = 32; + + // Generate a stream of TenantTimelineId + let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t)); + let timelines = timelines.try_buffer_unordered(CONCURRENCY); + let timelines = timelines.try_flatten(); + + // Generate a stream of S3TimelineBlobData + async fn report_on_timeline( + s3_client: &Client, + target: &RootTarget, + ttid: TenantTimelineId, + ) -> anyhow::Result<(TenantTimelineId, S3TimelineBlobData)> { + let data = list_timeline_blobs(s3_client, ttid, target).await?; + Ok((ttid, data)) + } + let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid)); + let timelines = timelines.try_buffer_unordered(CONCURRENCY); + + let mut summary = MetadataSummary::new(); + pin_mut!(timelines); + while let Some(i) = timelines.next().await { + let (ttid, data) = i?; + summary.update_data(&data); + + let analysis = + branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data)).await; + + summary.update_analysis(&ttid, &analysis); + } + + Ok(summary) +} From f544cdc98fda4038987ed924efe343d2a39a2e0b Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 09:22:33 +0100 Subject: [PATCH 03/24] tests: enable scrubbing at end of test --- test_runner/fixtures/neon_fixtures.py | 70 ++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b2cd0fe96833..ff84bf2c85f9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -454,6 +454,7 @@ def __init__( self.preserve_database_files = preserve_database_files self.initial_tenant = initial_tenant or TenantId.generate() self.initial_timeline = initial_timeline or TimelineId.generate() + self.scrub_on_exit = False def init_configs(self) -> NeonEnv: # Cannot create more than one environment from one builder @@ -483,6 +484,23 @@ def init_start(self, initial_tenant_conf: Optional[Dict[str, str]] = None) -> Ne return env + def enable_scrub_on_exit(self): + """ + Call this if you would like the fixture to automatically run + s3_scrubber at the end of the test, as a bidirectional test + that the scrubber is working properly, and that the code within + the test didn't produce any invalid remote state. + """ + + if not isinstance(self.remote_storage, S3Storage): + # The scrubber can't talk to e.g. LocalFS -- it needs + # an HTTP endpoint (mock is fine) to connect to. 
+ raise RuntimeError( + "Cannot scrub with remote_storage={self.remote_storage}, require an S3 endpoint" + ) + + self.scrub_on_exit = True + def enable_remote_storage( self, remote_storage_kind: RemoteStorageKind, @@ -714,11 +732,20 @@ def __exit__( self.env.pageserver.stop(immediate=True) cleanup_error = None + + if self.scrub_on_exit: + try: + S3Scrubber(self, self.remote_storage).scan_metadata() + except Exception as e: + log.error(f"Error during remote storage scrub: {e}") + cleanup_error = e + try: self.cleanup_remote_storage() except Exception as e: log.error(f"Error during remote storage cleanup: {e}") - cleanup_error = e + if cleanup_error is not None: + cleanup_error = e try: self.cleanup_local_storage() @@ -2724,6 +2751,47 @@ def get_metrics(self) -> SafekeeperMetrics: return metrics +class S3Scrubber: + def __init__(self, env, remote_storage): + self.env = env + self.remote_storage = remote_storage + + def scrubber_cli(self, args, timeout): + s3_storage = self.env.remote_storage + assert isinstance(s3_storage, S3Storage) + + env = { + "REGION": s3_storage.bucket_region, + "BUCKET": s3_storage.bucket_name, + } + env.update(s3_storage.access_env_vars()) + + if s3_storage.endpoint is not None: + env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint}) + + base_args = [self.env.neon_binpath / "s3_scrubber"] + args = base_args + args + r = subprocess.run( + args, + env=env, + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=timeout, + ) + if r.returncode: + log.warning(f"Scrub command {args} failed") + log.warning(f"Scrub environment: {env}") + log.warning(f"Scrub stdout: {r.stdout}") + log.warning(f"Scrub stderr: {r.stderr}") + + raise RuntimeError("Remote storage scrub failed") + + def scan_metadata(self): + self.scrubber_cli(["scan-metadata"], timeout=30) + + def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: """Compute the working directory for an individual test.""" test_name = request.node.name From 0f0cd9c853463dc9b113fbf5347a07eee0e274df Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 09:35:46 +0100 Subject: [PATCH 04/24] scrubber: make SSO_ACCOUNT_ID optional --- s3_scrubber/src/lib.rs | 46 +++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index f834bbc4c689..4ee20d304715 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -110,19 +110,31 @@ impl RootTarget { pub struct BucketConfig { pub region: String, pub bucket: String, - pub sso_account_id: String, + + /// Use SSO if this is set, else rely on AWS_* environment vars + pub sso_account_id: Option, } impl Display for BucketConfig { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}/{}/{}", self.sso_account_id, self.region, self.bucket) + write!( + f, + "{}/{}/{}", + self.sso_account_id + .as_ref() + .map(|s| s.as_str()) + .unwrap_or(""), + self.region, + self.bucket + ) } } impl BucketConfig { pub fn from_env() -> anyhow::Result { - let sso_account_id = - env::var("SSO_ACCOUNT_ID").context("'SSO_ACCOUNT_ID' param retrieval")?; + let sso_account_id = env::var("SSO_ACCOUNT_ID") + .context("'SSO_ACCOUNT_ID' param retrieval") + .ok(); let region = env::var("REGION").context("'REGION' param retrieval")?; let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?; @@ -183,22 +195,32 @@ pub fn init_logging(file_name: &str) -> WorkerGuard { guard } -pub fn 
init_s3_client(account_id: String, bucket_region: Region) -> Client { +pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Client { let credentials_provider = { // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new()) - // uses sso - .or_else( + let chain = CredentialsProviderChain::first_try( + "env", + EnvironmentVariableCredentialsProvider::new(), + ); + + // Use SSO if we were given an account ID + match account_id { + Some(sso_account) => chain.or_else( "sso", SsoCredentialsProvider::builder() - .account_id(account_id) + .account_id(&sso_account) .role_name("PowerUserAccess") .start_url("https://neondb.awsapps.com/start") .region(Region::from_static("eu-central-1")) .build(), - ) - // uses imds v2 - .or_else("imds", ImdsCredentialsProvider::builder().build()) + ), + None => chain, + } + .or_else( + // Finally try IMDS + "imds", + ImdsCredentialsProvider::builder().build(), + ) }; let config = Config::builder() From 5cdecb53a7c58ac0d9fb1c6d5e764f75cacdbc4a Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 10:18:05 +0100 Subject: [PATCH 05/24] scrubber: respect AWS_ENDPOINT_URL --- s3_scrubber/src/lib.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 4ee20d304715..d7bf9c21ecf2 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -223,12 +223,15 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie ) }; - let config = Config::builder() + let mut builder = Config::builder() .region(bucket_region) - .credentials_provider(credentials_provider) - .build(); + .credentials_provider(credentials_provider); - Client::from_conf(config) + if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") { + builder = builder.endpoint_url(endpoint) + } + + Client::from_conf(builder.build()) } async fn list_objects_with_retries( From f9836adc5640d862dbffa367f3006f3a9e401e4a Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 10:20:29 +0100 Subject: [PATCH 06/24] tests: enable remote storage & scrub in pageserver restart & chaos tests --- test_runner/fixtures/remote_storage.py | 13 +++++++++++++ test_runner/regress/test_pageserver_restart.py | 11 +++++++++++ 2 files changed, 24 insertions(+) diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 320e658639c2..a68257bbace5 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -88,6 +88,19 @@ def available_s3_storages() -> List[RemoteStorageKind]: return remote_storages +def s3_storage() -> RemoteStorageKind: + """ + For tests that require a remote storage impl that exposes an S3 + endpoint, but don't want to parametrize over multiple storage types. 
+ + Use real S3 if available, else use MockS3 + """ + if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None: + return RemoteStorageKind.REAL_S3 + else: + return RemoteStorageKind.MOCK_S3 + + @dataclass class LocalFsStorage: root: Path diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 1e41ebd15b59..20da112031cd 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -3,11 +3,17 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.remote_storage import s3_storage # Test restarting page server, while safekeeper and compute node keep # running. def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_remote_storage( + remote_storage_kind=s3_storage(), test_name="test_pageserver_restart" + ) + neon_env_builder.enable_scrub_on_exit() + env = neon_env_builder.init_start() env.neon_cli.create_branch("test_pageserver_restart") @@ -109,6 +115,11 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # safekeeper and compute node keep running. @pytest.mark.timeout(540) def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_remote_storage( + remote_storage_kind=s3_storage(), test_name="test_pageserver_restart" + ) + neon_env_builder.enable_scrub_on_exit() + env = neon_env_builder.init_start() # Use a tiny checkpoint distance, to create a lot of layers quickly. From bc685687fee868922997385defe51b44491a5abb Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 17:14:51 +0100 Subject: [PATCH 07/24] Update s3_scrubber/src/scan_metadata.rs Co-authored-by: Joonas Koivunen --- s3_scrubber/src/scan_metadata.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index 301d6a018cd6..69cd60d137a8 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -197,7 +197,8 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result Date: Fri, 1 Sep 2023 12:59:19 +0100 Subject: [PATCH 08/24] Wait for custom extensions build before deploy (#5170) ## Problem Currently, the `deploy` job doesn't wait for the custom extension job (in another repo) and can be started even with failed extensions build. This PR adds another job that polls the status of the extension build job and fails if the extension build fails. ## Summary of changes - Add `wait-for-extensions-build` job, which waits for a custom extension build in another repo. 
--- .github/workflows/build_and_test.yml | 50 +++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1ec2a65a89dc..144a96910e9d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -899,7 +899,7 @@ jobs: - name: Cleanup ECR folder run: rm -rf ~/.ecr - build-private-extensions: + trigger-custom-extensions-build: runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned @@ -908,8 +908,7 @@ jobs: steps: - name: Set PR's status to pending and request a remote CI test run: | - COMMIT_SHA=${{ github.event.pull_request.head.sha }} - COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} + COMMIT_SHA=${{ github.event.pull_request.head.sha || github.sha }} REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions" curl -f -X POST \ @@ -939,10 +938,53 @@ jobs: } }" + wait-for-extensions-build: + runs-on: ubuntu-latest + needs: [ trigger-custom-extensions-build ] + + steps: + - name: Wait for extension build to finish + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + TIMEOUT=600 # 10 minutes, currently it takes ~2-3 minutes + INTERVAL=15 # try each N seconds + + last_status="" # a variable to carry the last status of the "build-and-upload-extensions" context + + for ((i=0; i <= $TIMEOUT; i+=$INTERVAL)); do + sleep $INTERVAL + + # Get statuses for the latest commit in the PR / branch + gh api \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha || github.sha }}" > statuses.json + + # Get the latest status for the "build-and-upload-extensions" context + last_status=$(jq --raw-output '[.[] | select(.context == "build-and-upload-extensions")] | sort_by(.created_at)[-1].state' statuses.json) + if [ "${last_status}" = "pending" ]; then + # Extension build is still in progress. + continue + elif [ "${last_status}" = "success" ]; then + # Extension build is successful. + exit 0 + else + # Status is neither "pending" nor "success", exit the loop and fail the job. + break + fi + done + + # Extension build failed, print `statuses.json` for debugging and fail the job. + jq '.' statuses.json + + echo >&2 "Status of extension build is '${last_status}' != 'success'" + exit 1 + deploy: runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - needs: [ promote-images, tag, regress-tests ] + needs: [ promote-images, tag, regress-tests, wait-for-extensions-build ] if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' steps: - name: Fix git ownership From 06053ddd7d63a7bfbe8af33bf552ecb17de0296a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 1 Sep 2023 15:24:04 +0200 Subject: [PATCH 09/24] remote_timeline_client: tests: run upload ops on the tokio::test runtime (#5177) The `remote_timeline_client` tests use `#[tokio::test]` and rely on the fact that the test runtime that is set up by this macro is single-threaded. In PR https://github.com/neondatabase/neon/pull/5164, we observed interesting flakiness with the `upload_scheduling` test case: it would observe the upload of the third layer (`layer_file_name_3`) before we did `wait_completion`. 
Under the single-threaded-runtime assumption, that wouldn't be possible, because the test code doesn't await inbetween scheduling the upload and calling `wait_completion`. However, RemoteTimelineClient was actually using `BACKGROUND_RUNTIME`. That means there was parallelism where the tests didn't expect it, leading to flakiness such as execution of an UploadOp task before the test calls `wait_completion`. The most confusing scenario is code like this: ``` schedule upload(A); wait_completion.await; // B schedule_upload(C); wait_completion.await; // D ``` On a single-threaded executor, it is guaranteed that the upload up C doesn't run before D, because we (the test) don't relinquish control to the executor before D's `await` point. However, RemoteTimelineClient actually scheduled onto the BACKGROUND_RUNTIME, so, `A` could start running before `B` and `C` could start running before `D`. This would cause flaky tests when making assertions about the state manipulated by the operations. The concrete issue that led to discover of this bug was an assertion about `remote_fs_dir` state in #5164. --- pageserver/src/tenant/remote_timeline_client.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 50bb8b43dee9..7e7007d7e29d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -342,7 +342,12 @@ impl RemoteTimelineClient { ) -> RemoteTimelineClient { RemoteTimelineClient { conf, - runtime: BACKGROUND_RUNTIME.handle().to_owned(), + runtime: if cfg!(test) { + // remote_timeline_client.rs tests rely on current-thread runtime + tokio::runtime::Handle::current() + } else { + BACKGROUND_RUNTIME.handle().clone() + }, tenant_id, timeline_id, generation, From 94ad50423e43065d9fb30afe32cdf093e2b81cf6 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 1 Sep 2023 15:24:58 +0200 Subject: [PATCH 10/24] rfc: Crash-Consistent Layer Map Updates By Leveraging index_part.json (#5086) This RFC describes a simple scheme to make layer map updates crash consistent by leveraging the index_part.json in remote storage. Without such a mechanism, crashes can induce certain edge cases in which broadly held assumptions about system invariants don't hold. --- ...consistent-layer-map-through-index-part.md | 281 ++++++++++++++++++ 1 file changed, 281 insertions(+) create mode 100644 docs/rfcs/027-crash-consistent-layer-map-through-index-part.md diff --git a/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md new file mode 100644 index 000000000000..2c6b46eabeb6 --- /dev/null +++ b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md @@ -0,0 +1,281 @@ + +# Crash-Consistent Layer Map Updates By Leveraging `index_part.json` + +* Created on: Aug 23, 2023 +* Author: Christian Schwarz + +## Summary + +This RFC describes a simple scheme to make layer map updates crash consistent by leveraging the `index_part.json` in remote storage. +Without such a mechanism, crashes can induce certain edge cases in which broadly held assumptions about system invariants don't hold. + +## Motivation + +### Background + +We can currently easily make complex, atomic updates to the layer map by means of an RwLock. +If we crash or restart pageserver, we reconstruct the layer map from: +1. local timeline directory contents +2. remote `index_part.json` contents. 
+ +The function that is responsible for this is called `Timeline::load_layer_map()`. +The reconciliation process's behavior is the following: +* local-only files will become part of the layer map as local-only layers and rescheduled for upload +* For a file name that, by its name, is present locally and in the remote `index_part.json`, but where the local file has a different size (future: checksum) than the remote file, we will delete the local file and leave the remote file as a `RemoteLayer` in the layer map. + +### The Problem + +There are are cases where we need to make an atomic update to the layer map that involves **more than one layer**. +The best example is compaction, where we need to insert the L1 layers generated from the L0 layers, and remove the L0 layers. +As stated above, making the update to the layer map in atomic way is trivial. +But, there is no system call API to make an atomic update to a directory that involves more than one file rename and deletion. +Currently, we issue the system calls one by one and hope we don't crash. + +What happens if we crash and restart in the middle of that system call sequence? +We will reconstruct the layer map according to the reconciliation process, taking as input whatever transitory state the timeline directory ended up in. + +We cannot roll back or complete the timeline directory update during which we crashed, because we keep no record of the changes we plan to make. + +### Problem's Implications For Compaction + +The implications of the above are primarily problematic for compaction. +Specifically, the part of it that compacts L0 layers into L1 layers. + +Remember that compaction takes a set of L0 layers and reshuffles the delta records in them into L1 layer files. +Once the L1 layer files are written to disk, it atomically removes the L0 layers from the layer map and adds the L1 layers to the layer map. +It then deletes the L0 layers locally, and schedules an upload of the L1 layers and and updated index part. + +If we crash before deleting L0s, but after writing out L1s, the next compaction after restart will re-digest the L0s and produce new L1s. +This means the compaction after restart will **overwrite** the previously written L1s. +Currently we also schedule an S3 upload of the overwritten L1. + +If the compaction algorithm doesn't change between the two compaction runs, is deterministic, and uses the same set of L0s as input, then the second run will produce identical L1s and the overwrites will go unnoticed. + +*However*: +1. the file size of the overwritten L1s may not be identical, and +2. the bit pattern of the overwritten L1s may not be identical, and, +3. in the future, we may want to make the compaction code non-determinstic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite + +The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted). + +For example, if an unresponsive node A becomes active again after control plane has relocated the tenant to a new node B, the node A may overwrite some L1s. +But node B based its world view on the version of node A's `index_part.json` from _before_ the overwrite. +That earlier `index_part.json`` contained the file size of the pre-overwrite L1. 
+If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1. +Effectively, the data in the L1 has become inaccessible to node B. +If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same probem. + +If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems. + +In case of (1) and (2), where we know that the logical content of the layers is still the same, we can recover by manually patching the `index_part.json` of the new node to the overwritten L1's file size / checksum. + +But if (3) ever happens, the logical content may be different, and, we could have truly lost data. + +Given the above considerations, we should avoid making correctness of split-brain protection dependent on overwrites preserving _logical_ layer file contents. +**It is a much cleaner separation of concerns to require that layer files are truly immutable in S3, i.e., PUT once and then only DELETEd, never overwritten (overPUTted).** + +## Design + +Instead of reconciling a layer map from local timeline directory contents and remote index part, this RFC proposes to view the remote index part as authoritative during timeline load. +Local layer files will be recognized if they match what's listed in remote index part, and removed otherwise. + +During **timeline load**, the only thing that matters is the remote index part content. +Essentially, timeline load becomes much like attach, except we don't need to prefix-list the remote timelines. +The local timeline dir's `metadata` file does not matter. +The layer files in the local timeline dir are seen as a nice-to-have cache of layer files that are in the remote index part. +Any layer files in the local timeline dir that aren't in the remote index part are removed during startup. +The `Timeline::load_layer_map()` no longer "merges" local timeline dir contents with the remote index part. +Instead, it treats the remote index part as the authoritative layer map. +If the local timeline dir contains a layer that is in the remote index part, that's nice, and we'll re-use it if file size (and in the future, check sum) match what's stated in the index part. +If it doesn't match, we remove the file from the local timeline dir. + +After load, **at runtime**, nothing changes compared to what we did before this RFC. +The procedure for single- and multi-object changes is reproduced here for reference: +* For any new layers that the change adds: + * Write them to a temporary location. + * While holding layer map lock: + * Move them to the final location. + * Insert into layer map. +* Make the S3 changes. + We won't reproduce the remote timeline client method calls here because these are subject to change. + Instead we reproduce the sequence of s3 changes that must result for a given single-/multi-object change: + * PUT layer files inserted by the change. + * PUT an index part that has insertions and deletions of the change. + * DELETE the layer files that are deleted by the change. + +Note that it is safe for the DELETE to be deferred arbitrarily. +* If it never happens, we leak the object, but, that's not a correctness concern. +* As of #4938, we don't schedule the remote timeline client operation for deletion immediately, but, only when we drop the `LayerInner`. 
+* With the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919), the deletions will be written to the deletion queue for processing when it's safe to do so (see the RFC for details).
+
+## How This Solves The Problem
+
+If we crash before we've finished the S3 changes, then timeline load will reset the layer map to the state that's in the S3 index part.
+The S3 change sequence above is obviously crash-consistent.
+If we crash before the index part PUT, then we leak the inserted layer files to S3.
+If we crash after the index part PUT, we leak the to-be-DELETEd layer files to S3.
+Leaking is fine; it's a pre-existing condition and not addressed in this RFC.
+
+Multi-object changes that previously created and removed files in the timeline dir are now atomic because the layer map updates are atomic and crash consistent:
+* atomic layer map update at runtime, currently by using an RwLock in write mode
+* atomic `index_part.json` update in S3, as per the guarantee that S3 PUT is atomic
+* local timeline dir state:
+  * irrelevant for layer map content => irrelevant for atomic updates / crash consistency
+  * if we crash after the index part PUT, local layer files will be used, so no on-demand downloads are needed for them
+  * if we crash before the index part PUT, local layer files will be deleted
+
+## Trade-Offs
+
+### Fundamental
+
+If we crash before finishing the index part PUT, we lose all the work that hasn't reached the S3 `index_part.json`:
+* wal ingest: we lose not-yet-uploaded L0s; load on the **safekeepers** + work for pageserver
+* compaction: we lose the entire compaction iteration's work; we need to re-do it
+* gc: no change to what we have today
+
+If the work is still deemed necessary after restart, the restarted pageserver will re-do this work.
+The amount of work to be re-done is capped by how far the S3 state lags behind the local changes.
+Assuming the upload queue allows for unlimited queue depth (that's what it does today), this means:
+* on-demand downloads that were needed to do the work: are likely still present, not lost
+* wal ingest: currently unbounded
+* L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O()`
+  * Compaction threshold is 10 L0s and each L0 can be up to 256M in size. Target size for L1 is 128M.
+  * In practice, most L0s are tiny due to the 10-minute `DEFAULT_CHECKPOINT_TIMEOUT`.
+* image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))`
+  * I have no intuition how expensive / long-running it is in reality.
+* gc: `update_gc_info` work (not substantial, AFAIK)
+
+To limit the amount of lost upload work and ingest work, we can limit the upload queue depth (see suggestions in the next sub-section).
+However, to limit the amount of lost CPU work, we would need a way to make the compaction/image-layer-generation algorithms interruptible & resumable.
+We aren't there yet; the need for it is tracked by [#4580](https://github.com/neondatabase/neon/issues/4580).
+However, this RFC is not constraining the design space either.
+
+### Practical
+
+#### Pageserver Restarts
+
+Pageserver crashes are very rare; it would likely be acceptable to re-do the lost work in that case.
+However, regular pageserver restarts happen frequently, e.g., during weekly deploys.
+
+In general, pageserver restart faces the problem of tenants that "take too long" to shut down.
+They are a problem because other tenants that shut down quickly are unavailable while we wait for the slow tenants to shut down.
+We currently allot 10 seconds for graceful shutdown before we SIGKILL the pageserver process (as per the `pageserver.service` unit file).
+A longer budget would expose tenants that are done early to a longer downtime.
+A short budget would risk throwing away more work that'd have to be re-done after restart.
+
+In the context of this RFC, killing the process would mean losing the work that hasn't made it to S3.
+We can mitigate this problem as follows:
+0. initially, by accepting that we need to do the work again
+1. short-term, by introducing measures to cap the amount of in-flight work:
+
+   - cap the upload queue length and use backpressure to slow down compaction
+   - disable compaction/image-layer-generation X minutes before `systemctl restart pageserver`
+   - introduce a read-only shutdown state for tenants that are fast to shut down;
+     that state would be equivalent to the state of a tenant in hot standby / readonly mode.
+
+2. mid-term, by not restarting the pageserver in place, but using [*seamless tenant migration*](https://github.com/neondatabase/neon/pull/5029) to drain a pageserver's tenants before we restart it.
+
+#### `disk_consistent_lsn` can go backwards
+
+`disk_consistent_lsn` can go backwards across restarts if we crash before we've finished the index part PUT.
+Nobody should care, because the only thing that matters is `remote_consistent_lsn`.
+Compute certainly doesn't care about `disk_consistent_lsn`.
+
+## Side-Effects Of This Design
+
+* local `metadata` is basically reduced to a cache of which timelines exist for this tenant; i.e., we can avoid a `ListObjects` request for a tenant's timelines during tenant load.
+
+## Limitations
+
+Multi-object changes that span multiple timelines aren't covered by this RFC.
+That's fine because we currently don't need them, as evidenced by the absence
+of a Pageserver operation that holds multiple timelines' layer map lock at a time.
+
+## Impacted components
+
+Primarily pageservers.
+
+Safekeepers will experience more load when we need to re-ingest WAL because we've thrown away work.
+No changes to safekeepers are needed.
+
+## Alternatives considered
+
+### Alternative 1: WAL
+
+We could have a local WAL for timeline dir changes, as proposed in https://github.com/neondatabase/neon/issues/4418 and partially implemented in https://github.com/neondatabase/neon/pull/4422.
+The WAL would be used to
+1. make multi-object changes atomic
+2. replace `reconcile_with_remote()` reconciliation: scheduling of layer upload would be part of WAL replay.
+
+The WAL is appealing in a local-first world, but it's much more complex than the design described above:
+* New on-disk state to get right.
+* Forward- and backward-compatibility development costs in the future.
+
+### Alternative 2: Flow Everything Through `index_part.json`
+
+We could have gone to the other extreme and **only** update the layer map whenever we've PUT `index_part.json`.
+I.e., the layer map would always be the last-persisted S3 state.
+That's axiomatically beautiful, not least because it fully separates the layer file production and consumption path (=> [layer file spreading proposal](https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=4)).
+And it might make hot standbys / read-only pageservers less of a special case in the future.
+
+But I have some uncertainties with regard to WAL ingestion, because it needs to be able to do some reads for the logical size feedback to safekeepers.
+
+And it's silly that we wouldn't be able to use the results of compaction or image layer generation before we're done with the upload.
+
+Lastly, a temporarily clogged-up upload queue (e.g. S3 is down) shouldn't immediately render ingestion unavailable.
+
+### Alternative 3: Sequence Numbers For Layers
+
+Instead of what's proposed in this RFC, we could use unique numbers to identify layer files:
+
+```
+# before
+tenants/$tenant/timelines/$timeline/$key_and_lsn_range
+# after
+tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range
+```
+
+To guarantee uniqueness, the unique number is a sequence number, stored in `index_part.json`.
+
+This alternative does not solve atomic layer map updates.
+In our crash-during-compaction scenario above, the compaction run after the crash will not overwrite the L1s, but write/PUT new files with new sequence numbers.
+In fact, this alternative makes things worse, because the data is now duplicated in the not-overwritten and overwritten L1 layer files.
+We'd need to write a deduplication pass that checks whether perfectly overlapping layers have identical contents.
+
+However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC.
+
+So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3).
+But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact makes them much more acute.
+The proposed design in this RFC addresses both.
+
+So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top.
+That way, we avoid a phase where the crash-during-compaction problem is acute.
+
+## Related issues
+
+- https://github.com/neondatabase/neon/issues/4749
+- https://github.com/neondatabase/neon/issues/4418
+  - https://github.com/neondatabase/neon/pull/4422
+- https://github.com/neondatabase/neon/issues/5077
+- https://github.com/neondatabase/neon/issues/4088
+  - (re)resolutions:
+    - https://github.com/neondatabase/neon/pull/4696
+    - https://github.com/neondatabase/neon/pull/4094
+    - https://neondb.slack.com/archives/C033QLM5P7D/p1682519017949719
+
+Note that the test case introduced in https://github.com/neondatabase/neon/pull/4696/files#diff-13114949d1deb49ae394405d4c49558adad91150ba8a34004133653a8a5aeb76 will produce L1s with the same logical content, but, as outlined in the last paragraph of the _Problem Statement_ section above, we don't want to make that assumption in order to fix the problem.
+
+## Implementation Plan
+
+1. Remove support for `remote_storage=None`, because we now rely on the existence of an index part.
+
+   - The nasty part here is to fix all the tests that fiddle with the local timeline directory.
+     Possibly they are just irrelevant with this change, but each case will require inspection.
+
+2. Implement the design above.
+
+   - Initially, ship without the mitigations for restart and accept that we will do some work twice.
+   - Measure the impact and implement one of the mitigations.
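+
+## Appendix: Load-Time Reconciliation Sketch
+
+To make the Design section above more concrete, here is a minimal, self-contained sketch of the proposed load-time behavior: the remote index part is the single source of truth, and local files are only a cache of it.
+All type, field, and function names below are illustrative stand-ins, not the real pageserver API; the actual logic lives in `Timeline::load_layer_map()` and its callers and may differ in detail.
+
+```rust
+use std::collections::HashMap;
+
+// Illustrative stand-ins, not the pageserver's real types.
+#[derive(Clone)]
+struct LayerMetadata {
+    file_size: u64,
+}
+
+#[derive(Default)]
+struct LayerMap {
+    // Layers we can serve from local disk right away.
+    resident: HashMap<String, LayerMetadata>,
+    // Layers that exist only remotely and will be downloaded on demand.
+    remote_only: HashMap<String, LayerMetadata>,
+}
+
+// Build the layer map from the remote index part alone; local files only
+// decide which of those layers are already resident. Returns the layer map
+// plus the list of local files that must be deleted.
+fn load_layer_map(
+    local_files: &HashMap<String, u64>,            // layer file name -> size on disk
+    remote_index: &HashMap<String, LayerMetadata>, // from index_part.json
+) -> (LayerMap, Vec<String>) {
+    let mut map = LayerMap::default();
+    let mut to_delete = Vec::new();
+
+    for (name, meta) in remote_index {
+        match local_files.get(name) {
+            // Local copy matches the remote metadata: reuse it.
+            Some(size) if *size == meta.file_size => {
+                map.resident.insert(name.clone(), meta.clone());
+            }
+            // Mismatching local copy: drop it, keep the layer as remote-only.
+            Some(_) => {
+                to_delete.push(name.clone());
+                map.remote_only.insert(name.clone(), meta.clone());
+            }
+            // Not present locally: remote-only, downloaded on demand later.
+            None => {
+                map.remote_only.insert(name.clone(), meta.clone());
+            }
+        }
+    }
+
+    // Local files the remote index part doesn't know about are removed.
+    for name in local_files.keys() {
+        if !remote_index.contains_key(name) {
+            to_delete.push(name.clone());
+        }
+    }
+
+    (map, to_delete)
+}
+```
+
+Note that nothing in the local timeline dir can add a layer to the map; the local state only affects residency, which is exactly what makes the crash behavior described above easy to reason about.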
+ From 6455f0da15faa818a5c89d285ecaec1f3c01eb5f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 1 Sep 2023 17:30:22 +0200 Subject: [PATCH 11/24] FileBlockReader is never used (#5181) part of #4743 preliminary to #5180 --- pageserver/ctl/src/layers.rs | 2 +- pageserver/src/tenant/block_io.rs | 28 ++++++------------- .../src/tenant/storage_layer/delta_layer.rs | 2 +- .../src/tenant/storage_layer/image_layer.rs | 2 +- 4 files changed, 11 insertions(+), 23 deletions(-) diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 2af54902f795..608b3cecd65c 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -68,7 +68,7 @@ async fn read_delta_file(path: impl AsRef) -> Result<()> { }, ) .await?; - let cursor = BlockCursor::new_fileblockreader_virtual(&file); + let cursor = BlockCursor::new_fileblockreader(&file); for (k, v) in all { let value = cursor.read_blob(v.pos()).await?; println!("key:{} value_len:{}", k, value.len()); diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 69d5b49c6d41..93b79d211dfc 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -7,7 +7,6 @@ use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ}; use crate::virtual_file::VirtualFile; use bytes::Bytes; -use std::fs::File; use std::ops::{Deref, DerefMut}; use std::os::unix::fs::FileExt; @@ -73,8 +72,7 @@ impl<'a> Deref for BlockLease<'a> { /// /// Unlike traits, we also support the read function to be async though. pub(crate) enum BlockReaderRef<'a> { - FileBlockReaderVirtual(&'a FileBlockReader), - FileBlockReaderFile(&'a FileBlockReader), + FileBlockReaderVirtual(&'a FileBlockReader), EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), #[cfg(test)] @@ -87,7 +85,6 @@ impl<'a> BlockReaderRef<'a> { use BlockReaderRef::*; match self { FileBlockReaderVirtual(r) => r.read_blk(blknum).await, - FileBlockReaderFile(r) => r.read_blk(blknum).await, EphemeralFile(r) => r.read_blk(blknum).await, Adapter(r) => r.read_blk(blknum).await, #[cfg(test)] @@ -105,7 +102,7 @@ impl<'a> BlockReaderRef<'a> { /// /// ```no_run /// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader}; -/// # let reader: FileBlockReader = unimplemented!("stub"); +/// # let reader: FileBlockReader = unimplemented!("stub"); /// let cursor = reader.block_cursor(); /// let buf = cursor.read_blk(1); /// // do stuff with 'buf' @@ -122,7 +119,7 @@ impl<'a> BlockCursor<'a> { BlockCursor { reader } } // Needed by cli - pub fn new_fileblockreader_virtual(reader: &'a FileBlockReader) -> Self { + pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self { BlockCursor { reader: BlockReaderRef::FileBlockReaderVirtual(reader), } @@ -143,18 +140,15 @@ impl<'a> BlockCursor<'a> { /// /// The file is assumed to be immutable. This doesn't provide any functions /// for modifying the file, nor for invalidating the cache if it is modified. -pub struct FileBlockReader { - pub file: F, +pub struct FileBlockReader { + pub file: VirtualFile, /// Unique ID of this file, used as key in the page cache. 
file_id: page_cache::FileId, } -impl FileBlockReader -where - F: FileExt, -{ - pub fn new(file: F) -> Self { +impl FileBlockReader { + pub fn new(file: VirtualFile) -> Self { let file_id = page_cache::next_file_id(); FileBlockReader { file_id, file } @@ -196,13 +190,7 @@ where } } -impl BlockReader for FileBlockReader { - fn block_cursor(&self) -> BlockCursor<'_> { - BlockCursor::new(BlockReaderRef::FileBlockReaderFile(self)) - } -} - -impl BlockReader for FileBlockReader { +impl BlockReader for FileBlockReader { fn block_cursor(&self) -> BlockCursor<'_> { BlockCursor::new(BlockReaderRef::FileBlockReaderVirtual(self)) } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index d9df346a1440..f9a8c52c2f0f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -219,7 +219,7 @@ pub struct DeltaLayerInner { index_root_blk: u32, /// Reader object for reading blocks from the file. - file: FileBlockReader, + file: FileBlockReader, } impl AsRef for DeltaLayerInner { diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index b1fc25709277..32f20e6227f6 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -155,7 +155,7 @@ pub struct ImageLayerInner { lsn: Lsn, /// Reader object for reading blocks from the file. - file: FileBlockReader, + file: FileBlockReader, } impl std::fmt::Debug for ImageLayerInner { From 9b91c07fcf3497cfc4af48ba0e2e073f2a80ab4c Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 1 Sep 2023 18:10:40 +0200 Subject: [PATCH 12/24] pageserver: run all Rust tests with remote storage enabled (#5164) For [#5086](https://github.com/neondatabase/neon/pull/5086#issuecomment-1701331777) we will require remote storage to be configured in pageserver. This PR enables `localfs`-based storage for all Rust unit tests. Changes: - In `TenantHarness`, set up localfs remote storage for the tenant. - `create_test_timeline` should mimic what real timeline creation does, and real timeline creation waits for the timeline to reach remote storage. With this PR, `create_test_timeline` now does that as well. - All the places that create the harness tenant twice need to shut down the tenant before the re-create through a second call to `try_load` or `load`. - Without shutting down, upload tasks initiated by/through the first incarnation of the harness tenant might still be ongoing when the second incarnation of the harness tenant is `try_load`/`load`ed. That doesn't make sense in the tests that do that, they generally try to set up a scenario similar to pageserver stop & start. - There was one test that recreates a timeline, not the tenant. For that case, I needed to create a `Timeline::shutdown` method. It's a refactoring of the existing `Tenant::shutdown` method. - The remote_timeline_client tests previously set up their own `GenericRemoteStorage` and `RemoteTimelineClient`. Now they re-use the one that's pre-created by the TenantHarness. Some adjustments to the assertions were needed because the assertions now need to account for the initial image layer that's created by `create_test_timeline` to be present. 
--- pageserver/src/tenant.rs | 158 ++++++++---------- .../src/tenant/remote_timeline_client.rs | 142 ++++++---------- pageserver/src/tenant/timeline.rs | 75 +++++---- test_runner/regress/test_tenant_delete.py | 4 +- test_runner/regress/test_timeline_delete.py | 6 +- 5 files changed, 177 insertions(+), 208 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c32fb6c7f69a..ce8520853773 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -195,7 +195,7 @@ pub struct Tenant { walredo_mgr: Arc, // provides access to timeline data sitting in the remote storage - remote_storage: Option, + pub(crate) remote_storage: Option, /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`]. cached_logical_sizes: tokio::sync::Mutex>, @@ -1517,6 +1517,15 @@ impl Tenant { tline.maybe_spawn_flush_loop(); tline.freeze_and_flush().await.context("freeze_and_flush")?; + // Make sure the freeze_and_flush reaches remote storage. + tline + .remote_client + .as_ref() + .unwrap() + .wait_completion() + .await + .unwrap(); + let tl = uninit_tl.finish_creation()?; // The non-test code would call tl.activate() here. tl.set_state(TimelineState::Active); @@ -1693,65 +1702,6 @@ impl Tenant { Ok(()) } - /// Flush all in-memory data to disk and remote storage, if any. - /// - /// Used at graceful shutdown. - async fn freeze_and_flush_on_shutdown(&self) { - let mut js = tokio::task::JoinSet::new(); - - // execute on each timeline on the JoinSet, join after. - let per_timeline = |timeline_id: TimelineId, timeline: Arc| { - async move { - debug_assert_current_span_has_tenant_and_timeline_id(); - - match timeline.freeze_and_flush().await { - Ok(()) => {} - Err(e) => { - warn!("failed to freeze and flush: {e:#}"); - return; - } - } - - let res = if let Some(client) = timeline.remote_client.as_ref() { - // if we did not wait for completion here, it might be our shutdown process - // didn't wait for remote uploads to complete at all, as new tasks can forever - // be spawned. - // - // what is problematic is the shutting down of RemoteTimelineClient, because - // obviously it does not make sense to stop while we wait for it, but what - // about corner cases like s3 suddenly hanging up? - client.wait_completion().await - } else { - Ok(()) - }; - - if let Err(e) = res { - warn!("failed to await for frozen and flushed uploads: {e:#}"); - } - } - .instrument(tracing::info_span!("freeze_and_flush_on_shutdown", %timeline_id)) - }; - - { - let timelines = self.timelines.lock().unwrap(); - timelines - .iter() - .map(|(id, tl)| (*id, Arc::clone(tl))) - .for_each(|(timeline_id, timeline)| { - js.spawn(per_timeline(timeline_id, timeline)); - }) - }; - - while let Some(res) = js.join_next().await { - match res { - Ok(()) => {} - Err(je) if je.is_cancelled() => unreachable!("no cancelling used"), - Err(je) if je.is_panic() => { /* logged already */ } - Err(je) => warn!("unexpected JoinError: {je:?}"), - } - } - } - pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } @@ -1882,19 +1832,22 @@ impl Tenant { } }; - if freeze_and_flush { - // walreceiver has already began to shutdown with TenantState::Stopping, but we need to - // await for them to stop. - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(self.tenant_id), - None, - ) - .await; - - // this will wait for uploads to complete; in the past, it was done outside tenant - // shutdown in pageserver::shutdown_pageserver. 
- self.freeze_and_flush_on_shutdown().await; + let mut js = tokio::task::JoinSet::new(); + { + let timelines = self.timelines.lock().unwrap(); + timelines.values().for_each(|timeline| { + let timeline = Arc::clone(timeline); + let span = Span::current(); + js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await }); + }) + }; + while let Some(res) = js.join_next().await { + match res { + Ok(()) => {} + Err(je) if je.is_cancelled() => unreachable!("no cancelling used"), + Err(je) if je.is_panic() => { /* logged already */ } + Err(je) => warn!("unexpected JoinError: {je:?}"), + } } // shutdown all tenant and timeline tasks: gc, compaction, page service @@ -3467,6 +3420,8 @@ pub mod harness { pub tenant_conf: TenantConf, pub tenant_id: TenantId, pub generation: Generation, + remote_storage: GenericRemoteStorage, + pub remote_fs_dir: PathBuf, } static LOG_HANDLE: OnceCell<()> = OnceCell::new(); @@ -3504,29 +3459,39 @@ pub mod harness { fs::create_dir_all(conf.tenant_path(&tenant_id))?; fs::create_dir_all(conf.timelines_path(&tenant_id))?; + use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; + let remote_fs_dir = conf.workdir.join("localfs"); + std::fs::create_dir_all(&remote_fs_dir).unwrap(); + let config = RemoteStorageConfig { + // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, + max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(), + // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, + max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(), + storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + }; + let remote_storage = GenericRemoteStorage::from_config(&config).unwrap(); + Ok(Self { conf, tenant_conf, tenant_id, generation: Generation::new(0xdeadbeef), + remote_storage, + remote_fs_dir, }) } pub async fn load(&self) -> (Arc, RequestContext) { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); ( - self.try_load(&ctx, None) + self.try_load(&ctx) .await .expect("failed to load test tenant"), ctx, ) } - pub async fn try_load( - &self, - ctx: &RequestContext, - remote_storage: Option, - ) -> anyhow::Result> { + pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result> { let walredo_mgr = Arc::new(TestRedoManager); let tenant = Arc::new(Tenant::new( @@ -3536,7 +3501,7 @@ pub mod harness { walredo_mgr, self.tenant_id, self.generation, - remote_storage, + Some(self.remote_storage.clone()), )); tenant .load(None, ctx) @@ -4004,6 +3969,13 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx) .await?; make_some_layers(tline.as_ref(), Lsn(0x8000)).await?; + // so that all uploads finish & we can call harness.load() below again + tenant + .shutdown(Default::default(), true) + .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id)) + .await + .ok() + .unwrap(); } let (tenant, _ctx) = harness.load().await; @@ -4037,6 +4009,14 @@ mod tests { .expect("Should have a local timeline"); make_some_layers(newtline.as_ref(), Lsn(0x60)).await?; + + // so that all uploads finish & we can call harness.load() below again + tenant + .shutdown(Default::default(), true) + .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id)) + .await + .ok() + .unwrap(); } // check that both of them are initially unloaded @@ -4089,6 +4069,13 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; drop(tline); + // so that all uploads finish & we can call harness.try_load() below 
again + tenant + .shutdown(Default::default(), true) + .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id)) + .await + .ok() + .unwrap(); drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -4100,11 +4087,7 @@ mod tests { metadata_bytes[8] ^= 1; std::fs::write(metadata_path, metadata_bytes)?; - let err = harness - .try_load(&ctx, None) - .await - .err() - .expect("should fail"); + let err = harness.try_load(&ctx).await.err().expect("should fail"); // get all the stack with all .context, not only the last one let message = format!("{err:#}"); let expected = "failed to load metadata"; @@ -4558,6 +4541,11 @@ mod tests { let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; // Keeps uninit mark in place + let raw_tline = tline.raw_timeline().unwrap(); + raw_tline + .shutdown(false) + .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id)) + .await; std::mem::forget(tline); } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 7e7007d7e29d..13f3fac41c7d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1468,11 +1468,8 @@ mod tests { }, DEFAULT_PG_VERSION, }; - use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; - use std::{ - collections::HashSet, - path::{Path, PathBuf}, - }; + + use std::{collections::HashSet, path::Path}; use utils::lsn::Lsn; pub(super) fn dummy_contents(name: &str) -> Vec { @@ -1529,8 +1526,6 @@ mod tests { tenant: Arc, timeline: Arc, tenant_ctx: RequestContext, - remote_fs_dir: PathBuf, - client: Arc, } impl TestSetup { @@ -1540,52 +1535,15 @@ mod tests { let harness = TenantHarness::create(test_name)?; let (tenant, ctx) = harness.load().await; - // create an empty timeline directory let timeline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; - let remote_fs_dir = harness.conf.workdir.join("remote_fs"); - std::fs::create_dir_all(remote_fs_dir)?; - let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?; - - let storage_config = RemoteStorageConfig { - max_concurrent_syncs: std::num::NonZeroUsize::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, - ) - .unwrap(), - max_sync_errors: std::num::NonZeroU32::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, - ) - .unwrap(), - storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), - }; - - let generation = Generation::new(0xdeadbeef); - - let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); - - let client = Arc::new(RemoteTimelineClient { - conf: harness.conf, - runtime: tokio::runtime::Handle::current(), - tenant_id: harness.tenant_id, - timeline_id: TIMELINE_ID, - generation, - storage_impl: storage, - upload_queue: Mutex::new(UploadQueue::Uninitialized), - metrics: Arc::new(RemoteTimelineClientMetrics::new( - &harness.tenant_id, - &TIMELINE_ID, - )), - }); - Ok(Self { harness, tenant, timeline, tenant_ctx: ctx, - remote_fs_dir, - client, }) } } @@ -1610,26 +1568,37 @@ mod tests { let TestSetup { harness, tenant: _tenant, - timeline: _timeline, + timeline, tenant_ctx: _tenant_ctx, - remote_fs_dir, - client, } = TestSetup::new("upload_scheduling").await.unwrap(); + let client = timeline.remote_client.as_ref().unwrap(); + + // Download back the index.json, and check that the list of files is correct + let initial_index_part = match 
client.download_index_file().await.unwrap() { + MaybeDeletedIndexPart::IndexPart(index_part) => index_part, + MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"), + }; + let initial_layers = initial_index_part + .layer_metadata + .keys() + .map(|f| f.to_owned()) + .collect::>(); + let initial_layer = { + assert!(initial_layers.len() == 1); + initial_layers.into_iter().next().unwrap() + }; + let timeline_path = harness.timeline_path(&TIMELINE_ID); println!("workdir: {}", harness.conf.workdir.display()); - let remote_timeline_dir = - remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap()); + let remote_timeline_dir = harness + .remote_fs_dir + .join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap()); println!("remote_timeline_dir: {}", remote_timeline_dir.display()); - let metadata = dummy_metadata(Lsn(0x10)); - client - .init_upload_queue_for_empty_remote(&metadata) - .unwrap(); - - let generation = Generation::new(0xdeadbeef); + let generation = harness.generation; // Create a couple of dummy files, schedule upload for them let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); @@ -1710,6 +1679,7 @@ mod tests { .map(|f| f.to_owned()) .collect(), &[ + &initial_layer.file_name(), &layer_file_name_1.file_name(), &layer_file_name_2.file_name(), ], @@ -1739,6 +1709,7 @@ mod tests { } assert_remote_files( &[ + &initial_layer.file_name(), &layer_file_name_1.file_name(), &layer_file_name_2.file_name(), "index_part.json", @@ -1752,6 +1723,7 @@ mod tests { assert_remote_files( &[ + &initial_layer.file_name(), &layer_file_name_2.file_name(), &layer_file_name_3.file_name(), "index_part.json", @@ -1768,16 +1740,10 @@ mod tests { let TestSetup { harness, tenant: _tenant, - timeline: _timeline, - client, + timeline, .. 
} = TestSetup::new("metrics").await.unwrap(); - - let metadata = dummy_metadata(Lsn(0x10)); - client - .init_upload_queue_for_empty_remote(&metadata) - .unwrap(); - + let client = timeline.remote_client.as_ref().unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); @@ -1788,11 +1754,20 @@ mod tests { ) .unwrap(); - #[derive(Debug, PartialEq)] + #[derive(Debug, PartialEq, Clone, Copy)] struct BytesStartedFinished { started: Option, finished: Option, } + impl std::ops::Add for BytesStartedFinished { + type Output = Self; + fn add(self, rhs: Self) -> Self::Output { + Self { + started: self.started.map(|v| v + rhs.started.unwrap_or(0)), + finished: self.finished.map(|v| v + rhs.finished.unwrap_or(0)), + } + } + } let get_bytes_started_stopped = || { let started = client .metrics @@ -1809,47 +1784,38 @@ mod tests { }; // Test + tracing::info!("now doing actual test"); - let generation = Generation::new(0xdeadbeef); - - let init = get_bytes_started_stopped(); + let actual_a = get_bytes_started_stopped(); client .schedule_layer_file_upload( &layer_file_name_1, - &LayerFileMetadata::new(content_1.len() as u64, generation), + &LayerFileMetadata::new(content_1.len() as u64, harness.generation), ) .unwrap(); - let pre = get_bytes_started_stopped(); + let actual_b = get_bytes_started_stopped(); client.wait_completion().await.unwrap(); - let post = get_bytes_started_stopped(); + let actual_c = get_bytes_started_stopped(); // Validate - assert_eq!( - init, - BytesStartedFinished { - started: None, - finished: None - } - ); - assert_eq!( - pre, - BytesStartedFinished { + let expected_b = actual_a + + BytesStartedFinished { started: Some(content_1.len()), // assert that the _finished metric is created eagerly so that subtractions work on first sample finished: Some(0), - } - ); - assert_eq!( - post, - BytesStartedFinished { + }; + assert_eq!(actual_b, expected_b); + + let expected_c = actual_a + + BytesStartedFinished { started: Some(content_1.len()), - finished: Some(content_1.len()) - } - ); + finished: Some(content_1.len()), + }; + assert_eq!(actual_c, expected_c); } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f0ae38580665..7e8c0391f4b1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -90,6 +90,7 @@ use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::config::TenantConf; +use super::debug_assert_current_span_has_tenant_and_timeline_id; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; use super::storage_layer::{ @@ -933,6 +934,48 @@ impl Timeline { self.launch_eviction_task(background_jobs_can_start); } + #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] + pub async fn shutdown(self: &Arc, freeze_and_flush: bool) { + debug_assert_current_span_has_tenant_and_timeline_id(); + + // prevent writes to the InMemoryLayer + task_mgr::shutdown_tasks( + Some(TaskKind::WalReceiverManager), + Some(self.tenant_id), + Some(self.timeline_id), + ) + .await; + + // now all writers to InMemory layer are gone, do the final flush if requested + if freeze_and_flush { + match self.freeze_and_flush().await { + Ok(()) => {} + Err(e) => { + warn!("failed to freeze and flush: {e:#}"); + return; // TODO: should probably drain remote timeline 
client anyways? + } + } + + // drain the upload queue + let res = if let Some(client) = self.remote_client.as_ref() { + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? + client.wait_completion().await + } else { + Ok(()) + }; + + if let Err(e) = res { + warn!("failed to await for frozen and flushed uploads: {e:#}"); + } + } + } + pub fn set_state(&self, new_state: TimelineState) { match (self.current_state(), new_state) { (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { @@ -4742,22 +4785,8 @@ mod tests { let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap(); - let remote_storage = { - // this is never used for anything, because of how the create_test_timeline works, but - // it is with us in spirit and a Some. - use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind}; - let path = harness.conf.workdir.join("localfs"); - std::fs::create_dir_all(&path).unwrap(); - let config = RemoteStorageConfig { - max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(), - max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(), - storage: RemoteStorageKind::LocalFs(path), - }; - GenericRemoteStorage::from_config(&config).unwrap() - }; - let ctx = any_context(); - let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap(); + let tenant = harness.try_load(&ctx).await.unwrap(); let timeline = tenant .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await @@ -4807,22 +4836,8 @@ mod tests { async fn layer_eviction_aba_fails() { let harness = TenantHarness::create("layer_eviction_aba_fails").unwrap(); - let remote_storage = { - // this is never used for anything, because of how the create_test_timeline works, but - // it is with us in spirit and a Some. 
- use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind}; - let path = harness.conf.workdir.join("localfs"); - std::fs::create_dir_all(&path).unwrap(); - let config = RemoteStorageConfig { - max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(), - max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(), - storage: RemoteStorageKind::LocalFs(path), - }; - GenericRemoteStorage::from_config(&config).unwrap() - }; - let ctx = any_context(); - let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap(); + let tenant = harness.try_load(&ctx).await.unwrap(); let timeline = tenant .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 448dcfaff740..7519e5bf2392 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -192,7 +192,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints( # allow errors caused by failpoints f".*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # We may leave some upload tasks in the queue. They're likely deletes. # For uploads we explicitly wait with `last_flush_lsn_upload` below. # So by ignoring these instead of waiting for empty upload queue @@ -338,7 +338,7 @@ def test_tenant_delete_is_resumed_on_attach( # From deletion polling f".*NotFound: tenant {env.initial_tenant}.*", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # error from http response is also logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*', diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 916c0111f77c..6ed437d23bab 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -231,7 +231,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}") # It appears when we stopped flush loop during deletion and then pageserver is stopped env.pageserver.allowed_errors.append( - ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited" + ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", ) # This happens when we fail before scheduling background operation. # Timeline is left in stopping state and retry tries to stop it again. 
@@ -449,7 +449,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ) # this happens, because the stuck timeline is visible to shutdown env.pageserver.allowed_errors.append( - ".*freeze_and_flush_on_shutdown.+: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited" + ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", ) ps_http = env.pageserver.http_client() @@ -881,7 +881,7 @@ def test_timeline_delete_resumed_on_attach( # allow errors caused by failpoints f".*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # error from http response is also logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", # Polling after attach may fail with this From 3112aa95413bbba1c246815fd4ca41c9e639f5fe Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 1 Sep 2023 17:21:33 +0100 Subject: [PATCH 13/24] proxy: error typo (#5187) ## Problem https://github.com/neondatabase/neon/pull/5162#discussion_r1311853491 --- proxy/src/console/provider.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index ea37c3ade772..7d587ff1ec67 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -64,7 +64,7 @@ pub mod errors { } http::StatusCode::LOCKED => { // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded. - format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contract our support") + format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support") } _ => REQUEST_FAILED.to_owned(), }, From b7f3ca6d1f6cee81530def74e8ff07caf8f64c03 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 17:29:38 +0100 Subject: [PATCH 14/24] tests: get test name automatically for remote storage (#5184) ## Problem Tests using remote storage have manually entered `test_name` parameters, which: - Are easy to accidentally duplicate when copying code to make a new test - Omit parameters, so don't actually create unique S3 buckets when running many tests concurrently. ## Summary of changes - Use the `request` fixture in neon_env_builder fixture to get the test name, then munge that into an S3 compatible bucket name. 
- Remove the explicit `test_name` parameters to enable_remote_storage --- test_runner/fixtures/neon_fixtures.py | 16 ++++++++++++--- .../regress/test_attach_tenant_config.py | 2 -- .../regress/test_disk_usage_eviction.py | 2 +- .../regress/test_download_extensions.py | 4 ---- test_runner/regress/test_gc_aggressive.py | 1 - test_runner/regress/test_layer_eviction.py | 2 -- test_runner/regress/test_metric_collection.py | 1 - test_runner/regress/test_ondemand_download.py | 6 ------ test_runner/regress/test_remote_storage.py | 7 ------- test_runner/regress/test_tenant_conf.py | 2 -- test_runner/regress/test_tenant_delete.py | 6 +----- test_runner/regress/test_tenant_detach.py | 9 --------- test_runner/regress/test_tenant_relocation.py | 2 -- test_runner/regress/test_tenants.py | 2 -- .../test_tenants_with_remote_storage.py | 3 --- .../regress/test_threshold_based_eviction.py | 3 +-- test_runner/regress/test_timeline_delete.py | 12 ++--------- test_runner/regress/test_timeline_size.py | 20 +++++-------------- test_runner/regress/test_wal_acceptor.py | 2 -- 19 files changed, 23 insertions(+), 79 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ff84bf2c85f9..6adba21ecf04 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -414,6 +414,7 @@ def __init__( neon_binpath: Path, pg_distrib_dir: Path, pg_version: PgVersion, + test_name: str, remote_storage: Optional[RemoteStorage] = None, remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER, pageserver_config_override: Optional[str] = None, @@ -456,6 +457,11 @@ def __init__( self.initial_timeline = initial_timeline or TimelineId.generate() self.scrub_on_exit = False + assert test_name.startswith( + "test_" + ), "Unexpectedly instantiated from outside a test function" + self.test_name = test_name + def init_configs(self) -> NeonEnv: # Cannot create more than one environment from one builder assert self.env is None, "environment already initialized" @@ -504,23 +510,24 @@ def enable_scrub_on_exit(self): def enable_remote_storage( self, remote_storage_kind: RemoteStorageKind, - test_name: str, force_enable: bool = True, enable_remote_extensions: bool = False, ): + bucket_name = re.sub(r"[_\[\]]", "-", self.test_name)[:63] + if remote_storage_kind == RemoteStorageKind.NOOP: return elif remote_storage_kind == RemoteStorageKind.LOCAL_FS: self.enable_local_fs_remote_storage(force_enable=force_enable) elif remote_storage_kind == RemoteStorageKind.MOCK_S3: self.enable_mock_s3_remote_storage( - bucket_name=test_name, + bucket_name=bucket_name, force_enable=force_enable, enable_remote_extensions=enable_remote_extensions, ) elif remote_storage_kind == RemoteStorageKind.REAL_S3: self.enable_real_s3_remote_storage( - test_name=test_name, + test_name=bucket_name, force_enable=force_enable, enable_remote_extensions=enable_remote_extensions, ) @@ -976,6 +983,7 @@ def _shared_simple_env( pg_version=pg_version, run_id=run_id, preserve_database_files=pytestconfig.getoption("--preserve-database-files"), + test_name=request.node.name, ) as builder: env = builder.init_start() @@ -1011,6 +1019,7 @@ def neon_env_builder( pg_version: PgVersion, default_broker: NeonBroker, run_id: uuid.UUID, + request: FixtureRequest, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. 
@@ -1039,6 +1048,7 @@ def neon_env_builder( broker=default_broker, run_id=run_id, preserve_database_files=pytestconfig.getoption("--preserve-database-files"), + test_name=request.node.name, ) as builder: yield builder diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index bc6afa84a1ab..1acf429c52e2 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -16,7 +16,6 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv: neon_env_builder.enable_remote_storage( remote_storage_kind=RemoteStorageKind.LOCAL_FS, - test_name="test_attach_tenant_config", ) env = neon_env_builder.init_start() @@ -39,7 +38,6 @@ class NegativeTests: def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, None, None]: neon_env_builder.enable_remote_storage( remote_storage_kind=RemoteStorageKind.LOCAL_FS, - test_name="test_attach_tenant_config", ) env = neon_env_builder.init_start() assert isinstance(env.remote_storage, LocalFsStorage) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 182069315e0e..cdbd02de03e9 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -135,7 +135,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev log.info(f"setting up eviction_env for test {request.node.name}") - neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}") + neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS) # initial tenant will not be present on this pageserver env = neon_env_builder.init_configs() diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index b208616345e9..54f51414bbfd 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -90,7 +90,6 @@ def test_remote_extensions( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_remote_extensions", enable_remote_extensions=True, ) env = neon_env_builder.init_start() @@ -157,7 +156,6 @@ def test_remote_library( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_remote_library", enable_remote_extensions=True, ) env = neon_env_builder.init_start() @@ -218,7 +216,6 @@ def test_multiple_extensions_one_archive( ): neon_env_builder.enable_remote_storage( remote_storage_kind=RemoteStorageKind.REAL_S3, - test_name="test_multiple_extensions_one_archive", enable_remote_extensions=True, ) env = neon_env_builder.init_start() @@ -266,7 +263,6 @@ def test_extension_download_after_restart( neon_env_builder.enable_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, - test_name="test_extension_download_after_restart", enable_remote_extensions=True, ) env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index be817521cd24..8d7a42a8056b 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -102,7 +102,6 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_gc_index_upload", ) env = neon_env_builder.init_start() diff --git 
a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 8f627defb5ef..c939ace803f5 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -21,7 +21,6 @@ def test_basic_eviction( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_download_remote_layers_api", ) env = neon_env_builder.init_start( @@ -157,7 +156,6 @@ def test_basic_eviction( def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_remote_storage( remote_storage_kind=RemoteStorageKind.LOCAL_FS, - test_name="test_gc_of_remote_layers", ) env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index 3f4b42707a01..e4641cff055d 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -96,7 +96,6 @@ def test_metric_collection( neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_metric_collection", ) log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}") diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index a4e86e051946..0ca6a7a595da 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -54,7 +54,6 @@ def test_ondemand_download_large_rel( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_ondemand_download_large_rel", ) # thinking about using a shared environment? the test assumes that global @@ -157,7 +156,6 @@ def test_ondemand_download_timetravel( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_ondemand_download_timetravel", ) # thinking about using a shared environment? 
the test assumes that global @@ -319,7 +317,6 @@ def test_download_remote_layers_api( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_download_remote_layers_api", ) ##### First start, insert data and upload it to the remote storage @@ -481,7 +478,6 @@ def test_compaction_downloads_on_demand_without_image_creation( """ neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_compaction_downloads_on_demand_without_image_creation", ) conf = { @@ -569,7 +565,6 @@ def test_compaction_downloads_on_demand_with_image_creation( """ neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_compaction_downloads_on_demand", ) conf = { @@ -670,7 +665,6 @@ def test_ondemand_download_failure_to_replace( neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_ondemand_download_failure_to_replace", ) # disable gc and compaction via default tenant config because config is lost while detaching diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 0bd365efaab2..8b75d352005d 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -62,7 +62,6 @@ def test_remote_storage_backup_and_restore( neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_remote_storage_backup_and_restore", ) # Exercise retry code path by making all uploads and downloads fail for the @@ -225,7 +224,6 @@ def test_remote_storage_upload_queue_retries( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_remote_storage_upload_queue_retries", ) env = neon_env_builder.init_start() @@ -381,7 +379,6 @@ def test_remote_timeline_client_calls_started_metric( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_remote_timeline_client_metrics", ) # thinking about using a shared environment? 
the test assumes that global @@ -524,7 +521,6 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_timeline_deletion_with_files_stuck_in_upload_queue", ) env = neon_env_builder.init_start( @@ -642,7 +638,6 @@ def test_empty_branch_remote_storage_upload( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_empty_branch_remote_storage_upload", ) env = neon_env_builder.init_start() @@ -694,7 +689,6 @@ def test_empty_branch_remote_storage_upload_on_restart( """ neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_empty_branch_remote_storage_upload_on_restart", ) env = neon_env_builder.init_start() @@ -792,7 +786,6 @@ def test_compaction_delete_before_upload( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_compaction_delete_before_upload", ) env = neon_env_builder.init_start( diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 60ec532db497..93ba5477a61d 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -294,7 +294,6 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_remote_storage( remote_storage_kind=RemoteStorageKind.LOCAL_FS, - test_name="test_creating_tenant_conf_after_attach", ) env = neon_env_builder.init_start() @@ -339,7 +338,6 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( ): neon_env_builder.enable_remote_storage( remote_storage_kind=RemoteStorageKind.LOCAL_FS, - test_name="test_live_reconfig_get_evictions_low_residence_duration_metric_threshold", ) env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 7519e5bf2392..c7324cac83cc 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -43,7 +43,6 @@ def test_tenant_delete_smoke( neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_tenant_delete_smoke", ) env = neon_env_builder.init_start() @@ -177,9 +176,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints( if simulate_failures: neon_env_builder.pageserver_config_override = "test_remote_failures=1" - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_delete_tenant_exercise_crash_safety_failpoints" - ) + neon_env_builder.enable_remote_storage(remote_storage_kind) env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) @@ -300,7 +297,6 @@ def test_tenant_delete_is_resumed_on_attach( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_deleted_tenant_ignored_on_attach", ) env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index d2302ca1aff6..20526cd0a971 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -46,7 +46,6 @@ def test_tenant_reattach( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_tenant_reattach", ) # Exercise retry code path by making all uploads and 
downloads fail for the @@ -231,7 +230,6 @@ def test_tenant_reattach_while_busy( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_tenant_reattach_while_busy", ) env = neon_env_builder.init_start() @@ -453,7 +451,6 @@ def test_detach_while_attaching( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_detach_while_attaching", ) ##### First start, insert secret data and upload it to the remote storage @@ -537,7 +534,6 @@ def test_ignored_tenant_reattach( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_ignored_tenant_reattach", ) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -609,7 +605,6 @@ def test_ignored_tenant_download_missing_layers( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_ignored_tenant_download_and_attach", ) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -675,7 +670,6 @@ def test_ignored_tenant_stays_broken_without_metadata( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_ignored_tenant_stays_broken_without_metadata", ) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -719,7 +713,6 @@ def test_load_attach_negatives( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_load_attach_negatives", ) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -764,7 +757,6 @@ def test_ignore_while_attaching( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_ignore_while_attaching", ) env = neon_env_builder.init_start() @@ -868,7 +860,6 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_metrics_while_ignoring_broken_tenant_and_reloading", ) env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 32ad5381b45b..6a81ff498e0d 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -526,7 +526,6 @@ def test_emergency_relocate_with_branches_slow_replay( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_emergency_relocate_with_branches_slow_replay", ) env = neon_env_builder.init_start() @@ -683,7 +682,6 @@ def test_emergency_relocate_with_branches_createdb( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_emergency_relocate_with_branches_createdb", ) env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 19bc3ed37c8f..985bd63b24ba 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -244,7 +244,6 @@ def test_pageserver_metrics_removed_after_detach( neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_pageserver_metrics_removed_after_detach", ) neon_env_builder.num_safekeepers = 3 @@ -305,7 +304,6 @@ def test_pageserver_with_empty_tenants( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_pageserver_with_empty_tenants", ) 
env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 2925f8c2da2c..6a541c8a3742 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -64,7 +64,6 @@ async def all_tenants_workload(env: NeonEnv, tenants_endpoints): def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_tenants_many", ) env = neon_env_builder.init_start() @@ -117,7 +116,6 @@ def test_tenants_attached_after_download( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="remote_storage_kind", ) data_id = 1 @@ -232,7 +230,6 @@ def test_tenant_redownloads_truncated_file_on_startup( # since we now store the layer file length metadata, we notice on startup that a layer file is of wrong size, and proceed to redownload it. neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_tenant_redownloads_truncated_file_on_startup", ) env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index a0e423e7ffb5..739a9a5b747c 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -16,13 +16,12 @@ def test_threshold_based_eviction( - request, httpserver: HTTPServer, httpserver_listen_address, pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, ): - neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}") + neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS) # Start with metrics collection enabled, so that the eviction task # imitates its accesses. We'll use a non-existent endpoint to make it fail. diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 6ed437d23bab..0ce714d1853e 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -191,9 +191,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( 8. Retry or restart without the failpoint and check the result. 
""" - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints" - ) + neon_env_builder.enable_remote_storage(remote_storage_kind) env = neon_env_builder.init_start( initial_tenant_conf={ @@ -350,7 +348,6 @@ def test_timeline_resurrection_on_attach( neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_timeline_resurrection_on_attach", ) ##### First start, insert data and upload it to the remote storage @@ -438,7 +435,6 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild neon_env_builder.enable_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, - test_name="test_timeline_delete_fail_before_local_delete", ) env = neon_env_builder.init_start() @@ -558,7 +554,6 @@ def test_concurrent_timeline_delete_stuck_on( neon_env_builder.enable_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, - test_name=f"concurrent_timeline_delete_stuck_on_{stuck_failpoint}", ) env = neon_env_builder.init_start() @@ -636,7 +631,6 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): """ neon_env_builder.enable_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, - test_name="test_delete_timeline_client_hangup", ) env = neon_env_builder.init_start() @@ -706,7 +700,6 @@ def test_timeline_delete_works_for_remote_smoke( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_timeline_delete_works_for_remote_smoke", ) env = neon_env_builder.init_start() @@ -780,7 +773,7 @@ def test_delete_orphaned_objects( pg_bin: PgBin, ): remote_storage_kind = RemoteStorageKind.LOCAL_FS - neon_env_builder.enable_remote_storage(remote_storage_kind, "test_delete_orphaned_objects") + neon_env_builder.enable_remote_storage(remote_storage_kind) env = neon_env_builder.init_start( initial_tenant_conf={ @@ -844,7 +837,6 @@ def test_timeline_delete_resumed_on_attach( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_deleted_tenant_ignored_on_attach", ) env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index f6e4a667a489..d754ce0aa072 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -306,9 +306,7 @@ def test_timeline_physical_size_init( neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] ): if remote_storage_kind is not None: - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_timeline_physical_size_init" - ) + neon_env_builder.enable_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() @@ -349,9 +347,7 @@ def test_timeline_physical_size_post_checkpoint( neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] ): if remote_storage_kind is not None: - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_timeline_physical_size_init" - ) + neon_env_builder.enable_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() @@ -382,9 +378,7 @@ def test_timeline_physical_size_post_compaction( neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] ): if remote_storage_kind is not None: - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_timeline_physical_size_init" - ) + 
neon_env_builder.enable_remote_storage(remote_storage_kind) # Disable background compaction as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed @@ -437,9 +431,7 @@ def test_timeline_physical_size_post_gc( neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] ): if remote_storage_kind is not None: - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_timeline_physical_size_init" - ) + neon_env_builder.enable_remote_storage(remote_storage_kind) # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed @@ -572,9 +564,7 @@ def test_tenant_physical_size( random.seed(100) if remote_storage_kind is not None: - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_timeline_physical_size_init" - ) + neon_env_builder.enable_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 8ca93845b216..119a597d434b 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -439,7 +439,6 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_safekeepers_wal_backup", ) neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER @@ -491,7 +490,6 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_s3_wal_replay", ) neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER From 80c942fb12b591b532e863a1bc733a789099e8d9 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Sep 2023 17:49:54 +0100 Subject: [PATCH 15/24] drop vestigial test_name arguments --- test_runner/regress/test_pageserver_restart.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 20da112031cd..567200ba1f75 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -9,9 +9,7 @@ # Test restarting page server, while safekeeper and compute node keep # running. def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_remote_storage( - remote_storage_kind=s3_storage(), test_name="test_pageserver_restart" - ) + neon_env_builder.enable_remote_storage(remote_storage_kind=s3_storage()) neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start() @@ -115,9 +113,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # safekeeper and compute node keep running. 
@pytest.mark.timeout(540) def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_remote_storage( - remote_storage_kind=s3_storage(), test_name="test_pageserver_restart" - ) + neon_env_builder.enable_remote_storage(remote_storage_kind=s3_storage()) neon_env_builder.enable_scrub_on_exit() env = neon_env_builder.init_start() From 826aafb58ade89183104f84d4f598b49355fb505 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 4 Sep 2023 09:18:28 +0100 Subject: [PATCH 16/24] pageserver: define "/tenants/" in a constant --- pageserver/src/config.rs | 5 +++-- pageserver/src/tenant.rs | 3 +++ s3_scrubber/src/scan_metadata.rs | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 5394f17398e7..ab485c969d04 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -32,7 +32,8 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; use crate::tenant::{ - TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, + TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, + TIMELINES_SEGMENT_NAME, }; use crate::{ IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, @@ -563,7 +564,7 @@ impl PageServerConf { // pub fn tenants_path(&self) -> PathBuf { - self.workdir.join("tenants") + self.workdir.join(TENANTS_SEGMENT_NAME) } pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ce8520853773..652cdbcda194 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -144,6 +144,9 @@ pub use crate::tenant::metadata::save_metadata; // re-export for use in walreceiver pub use crate::tenant::timeline::WalReceiverInfo; +/// The "tenants" part of "tenants//timelines..." +pub const TENANTS_SEGMENT_NAME: &str = "tenants"; + /// Parts of the `.neon/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index 69cd60d137a8..f75d7645a88b 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -11,7 +11,7 @@ use aws_sdk_s3::Client; use aws_types::region::Region; use futures_util::{pin_mut, StreamExt, TryStreamExt}; use histogram::Histogram; -use pageserver::tenant::IndexPart; +use pageserver::tenant::{IndexPart, TENANTS_SEGMENT_NAME}; use utils::id::TenantTimelineId; pub struct MetadataSummary { @@ -191,7 +191,7 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result Date: Mon, 4 Sep 2023 10:15:06 +0100 Subject: [PATCH 17/24] Make all references to "tenants" and "timelines" use constants --- pageserver/ctl/src/layer_map_analyzer.rs | 5 +++-- pageserver/ctl/src/layers.rs | 9 +++++---- pageserver/src/virtual_file.rs | 3 ++- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 29bd6ce5986c..32d0d1bed286 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -3,6 +3,7 @@ //! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.
use anyhow::Result; +use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use std::cmp::Ordering; use std::collections::BinaryHeap; use std::ops::Range; @@ -142,12 +143,12 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let mut total_delta_layers = 0usize; let mut total_image_layers = 0usize; let mut total_excess_layers = 0usize; - for tenant in fs::read_dir(storage_path.join("tenants"))? { + for tenant in fs::read_dir(storage_path.join(TENANTS_SEGMENT_NAME))? { let tenant = tenant?; if !tenant.file_type()?.is_dir() { continue; } - for timeline in fs::read_dir(tenant.path().join("timelines"))? { + for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? { let timeline = timeline?; if !timeline.file_type()?.is_dir() { continue; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 608b3cecd65c..ff2044653a79 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -5,6 +5,7 @@ use clap::Subcommand; use pageserver::tenant::block_io::BlockCursor; use pageserver::tenant::disk_btree::DiskBtreeReader; use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary}; +use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::{page_cache, virtual_file}; use pageserver::{ repository::{Key, KEY_SIZE}, @@ -80,13 +81,13 @@ async fn read_delta_file(path: impl AsRef) -> Result<()> { pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { match cmd { LayerCmd::List { path } => { - for tenant in fs::read_dir(path.join("tenants"))? { + for tenant in fs::read_dir(path.join(TENANTS_SEGMENT_NAME))? { let tenant = tenant?; if !tenant.file_type()?.is_dir() { continue; } println!("tenant {}", tenant.file_name().to_string_lossy()); - for timeline in fs::read_dir(tenant.path().join("timelines"))? { + for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? { let timeline = timeline?; if !timeline.file_type()?.is_dir() { continue; @@ -101,9 +102,9 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { timeline, } => { let timeline_path = path - .join("tenants") + .join(TENANTS_SEGMENT_NAME) .join(tenant) - .join("timelines") + .join(TIMELINES_SEGMENT_NAME) .join(timeline); let mut idx = 0; for layer in fs::read_dir(timeline_path)? { diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index a86b8fa2a6fa..66c099fa32bb 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -11,6 +11,7 @@ //! src/backend/storage/file/fd.c //! 
use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME}; +use crate::tenant::TENANTS_SEGMENT_NAME; use once_cell::sync::OnceCell; use std::fs::{self, File, OpenOptions}; use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write}; @@ -200,7 +201,7 @@ impl VirtualFile { let parts = path_str.split('/').collect::>(); let tenant_id; let timeline_id; - if parts.len() > 5 && parts[parts.len() - 5] == "tenants" { + if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME { tenant_id = parts[parts.len() - 4].to_string(); timeline_id = parts[parts.len() - 2].to_string(); } else { From 40a809f0e34ded078a0fe47a8226a57879f37afc Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 4 Sep 2023 09:21:47 +0100 Subject: [PATCH 18/24] test: stash test_output_dir on NeonEnvBuilder --- test_runner/fixtures/neon_fixtures.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6adba21ecf04..4fd1dd39eca3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -415,6 +415,7 @@ def __init__( pg_distrib_dir: Path, pg_version: PgVersion, test_name: str, + test_output_dir: Path, remote_storage: Optional[RemoteStorage] = None, remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER, pageserver_config_override: Optional[str] = None, @@ -456,6 +457,7 @@ def __init__( self.initial_tenant = initial_tenant or TenantId.generate() self.initial_timeline = initial_timeline or TimelineId.generate() self.scrub_on_exit = False + self.test_output_dir = test_output_dir assert test_name.startswith( "test_" @@ -956,6 +958,7 @@ def _shared_simple_env( default_broker: NeonBroker, run_id: uuid.UUID, top_output_dir: Path, + test_output_dir: Path, neon_binpath: Path, pg_distrib_dir: Path, pg_version: PgVersion, @@ -984,6 +987,7 @@ def _shared_simple_env( run_id=run_id, preserve_database_files=pytestconfig.getoption("--preserve-database-files"), test_name=request.node.name, + test_output_dir=test_output_dir, ) as builder: env = builder.init_start() @@ -1011,7 +1015,7 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: @pytest.fixture(scope="function") def neon_env_builder( pytestconfig: Config, - test_output_dir: str, + test_output_dir: Path, port_distributor: PortDistributor, mock_s3_server: MockS3Server, neon_binpath: Path, @@ -1049,6 +1053,7 @@ def neon_env_builder( run_id=run_id, preserve_database_files=pytestconfig.getoption("--preserve-database-files"), test_name=request.node.name, + test_output_dir=test_output_dir, ) as builder: yield builder From 25ceb876544cc50c79390a46c015e13977b4a16a Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 4 Sep 2023 09:50:57 +0100 Subject: [PATCH 19/24] tests: improve subprocess_capture --- test_runner/fixtures/neon_fixtures.py | 29 +++++----- test_runner/fixtures/utils.py | 80 ++++++++++++++++++++++++--- 2 files changed, 85 insertions(+), 24 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4fd1dd39eca3..4ea1d7970578 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -744,7 +744,7 @@ def __exit__( if self.scrub_on_exit: try: - S3Scrubber(self, self.remote_storage).scan_metadata() + S3Scrubber(self.test_output_dir, self).scan_metadata() except Exception as e: log.error(f"Error during remote storage scrub: {e}") cleanup_error = e @@ -1760,7 +1760,10 @@ def run_capture( self._fixpath(command) 
log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) - return subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs) + base_path, _, _ = subprocess_capture( + self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs + ) + return base_path @pytest.fixture(scope="function") @@ -2767,13 +2770,13 @@ def get_metrics(self) -> SafekeeperMetrics: class S3Scrubber: - def __init__(self, env, remote_storage): + def __init__(self, log_dir: Path, env: NeonEnvBuilder): self.env = env - self.remote_storage = remote_storage + self.log_dir = log_dir def scrubber_cli(self, args, timeout): + assert isinstance(self.env.remote_storage, S3Storage) s3_storage = self.env.remote_storage - assert isinstance(s3_storage, S3Storage) env = { "REGION": s3_storage.bucket_region, @@ -2786,20 +2789,14 @@ def scrubber_cli(self, args, timeout): base_args = [self.env.neon_binpath / "s3_scrubber"] args = base_args + args - r = subprocess.run( - args, - env=env, - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - timeout=timeout, + + (output_path, _, status_code) = subprocess_capture( + self.log_dir, args, echo_stderr=True, echo_stdout=True, env=env, check=False ) - if r.returncode: + if status_code: log.warning(f"Scrub command {args} failed") log.warning(f"Scrub environment: {env}") - log.warning(f"Scrub stdout: {r.stdout}") - log.warning(f"Scrub stderr: {r.stderr}") + log.warning(f"Output at: {output_path}") raise RuntimeError("Remote storage scrub failed") diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index d03d2e7595f4..775a6a5b56aa 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -4,9 +4,10 @@ import re import subprocess import tarfile +import threading import time from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, TypeVar +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, TypeVar from urllib.parse import urlencode import allure @@ -26,34 +27,97 @@ def get_self_dir() -> Path: return Path(__file__).resolve().parent -def subprocess_capture(capture_dir: Path, cmd: List[str], **kwargs: Any) -> str: - """Run a process and capture its output +def subprocess_capture( + capture_dir: Path, + cmd: List[str], + *, + check=False, + echo_stderr=False, + echo_stdout=False, + capture_stdout=False, + **kwargs: Any, +) -> Tuple[str, Optional[str], int]: + """Run a process and bifurcate its outout to files and log - Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" + stderr and stdout are always captured in files. They are also optionally + echoed to the log (echo_stderr, echo_stdout), and/or captured and returned + (capture_stdout). + + File output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" where "cmd" is the name of the program and NNN is an incrementing counter. If those files already exist, we will overwrite them. - Returns basepath for files with captured output. + + Returns 3-tuple of: + - The base path for output files + - Captured stdout, or None + - The exit status of the process """ assert isinstance(cmd, list) - base = f"{os.path.basename(cmd[0])}_{global_counter()}" + base_cmd = os.path.basename(cmd[0]) + base = f"{base_cmd}_{global_counter()}" basepath = os.path.join(capture_dir, base) stdout_filename = f"{basepath}.stdout" stderr_filename = f"{basepath}.stderr" + # Since we will stream stdout and stderr concurrently, need to do it in a thread. 
+ class OutputHandler(threading.Thread): + def __init__(self, in_file, out_file, echo: bool, capture: bool): + super().__init__() + self.in_file = in_file + self.out_file = out_file + self.echo = echo + self.capture = capture + self.captured = "" + + def run(self): + for line in self.in_file: + if self.echo: + log.info(line) + + if self.capture: + self.captured += line + + self.out_file.write(line) + + captured = None try: with open(stdout_filename, "w") as stdout_f: with open(stderr_filename, "w") as stderr_f: log.info(f'Capturing stdout to "{base}.stdout" and stderr to "{base}.stderr"') - subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) + + p = subprocess.Popen( + cmd, + **kwargs, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout_handler = OutputHandler( + p.stdout, stdout_f, echo=echo_stdout, capture=capture_stdout + ) + stdout_handler.start() + stderr_handler = OutputHandler(p.stderr, stderr_f, echo=echo_stderr, capture=False) + stderr_handler.start() + + r = p.wait() + + stdout_handler.join() + stderr_handler.join() + + if check and r != 0: + raise subprocess.CalledProcessError(r, " ".join(cmd)) + + if capture_stdout: + captured = stdout_handler.captured finally: # Remove empty files if there is no output for filename in (stdout_filename, stderr_filename): if os.stat(filename).st_size == 0: os.remove(filename) - return basepath + return (basepath, captured, r) _global_counter = 0 From 177645c3c1f52329ad7ad00d35115f329772a1ea Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 4 Sep 2023 10:12:07 +0100 Subject: [PATCH 20/24] s3_scrubber: remove main.rs span --- s3_scrubber/src/main.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index 6bb08681614b..3c60723f888b 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -12,7 +12,7 @@ use s3_scrubber::{ checks, get_cloud_admin_api_token_or_exit, init_logging, init_s3_client, BucketConfig, ConsoleConfig, RootTarget, S3Deleter, S3Target, TraversingDepth, CLI_NAME, }; -use tracing::{info, info_span, warn}; +use tracing::{info, warn}; use clap::{Parser, Subcommand, ValueEnum}; @@ -87,7 +87,6 @@ async fn tidy( }; let _guard = init_logging(&file_name); - let _main_span = info_span!("tidy", %dry_run).entered(); if dry_run { info!("Dry run, not removing items for real"); From 21c489d44e6e95764aeaf5fc58f206af091e1ecf Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 4 Sep 2023 10:56:19 +0100 Subject: [PATCH 21/24] clippy --- s3_scrubber/src/lib.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index d7bf9c21ecf2..405ce627ff24 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -120,10 +120,7 @@ impl Display for BucketConfig { write!( f, "{}/{}/{}", - self.sso_account_id - .as_ref() - .map(|s| s.as_str()) - .unwrap_or(""), + self.sso_account_id.as_deref().unwrap_or(""), self.region, self.bucket ) @@ -208,7 +205,7 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie Some(sso_account) => chain.or_else( "sso", SsoCredentialsProvider::builder() - .account_id(&sso_account) + .account_id(sso_account) .role_name("PowerUserAccess") .start_url("https://neondb.awsapps.com/start") .region(Region::from_static("eu-central-1")) From 36c57b5e09a98d924e5b82e3c075450a1ef41c80 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 4 Sep 2023 12:40:35 +0100 Subject: [PATCH 22/24] Fix a doc string with angle 
brackets outside `` --- pageserver/src/tenant.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7deea8abc76e..90491bda3f7c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -141,7 +141,7 @@ pub use crate::tenant::metadata::save_metadata; // re-export for use in walreceiver pub use crate::tenant::timeline::WalReceiverInfo; -/// The "tenants" part of "tenants//timelines..." +/// The "tenants" part of `tenants//timelines...` pub const TENANTS_SEGMENT_NAME: &str = "tenants"; /// Parts of the `.neon/tenants//timelines/` directory prefix. From a6fef130bbbc39af371660e4d5d9ff5228b165d1 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Sep 2023 09:14:20 +0100 Subject: [PATCH 23/24] typos --- s3_scrubber/src/checks.rs | 6 +++--- s3_scrubber/src/lib.rs | 4 +--- test_runner/fixtures/utils.py | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index becb2af8ad87..e86c381985fb 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -195,15 +195,15 @@ pub async fn branch_cleanup_and_check_errors( if let Some(s3_active_branch) = s3_active_branch { info!( - "Checking console status for timeline for branch branch {:?}/{:?}", + "Checking console status for timeline for branch {:?}/{:?}", s3_active_branch.project_id, s3_active_branch.id ); match console_branch { - Some(_) => {result.errors.push(format!("Timeline has deleted branch data in the console (id = {:?}, project_id = {:?}), recheck whether if it got removed during the check", + Some(_) => {result.errors.push(format!("Timeline has deleted branch data in the console (id = {:?}, project_id = {:?}), recheck whether it got removed during the check", s3_active_branch.id, s3_active_branch.project_id)) }, None => { - result.errors.push(format!("Timeline has no branch data in the console (id = {:?}, project_id = {:?}), recheck whether if it got removed during the check", + result.errors.push(format!("Timeline has no branch data in the console (id = {:?}, project_id = {:?}), recheck whether it got removed during the check", s3_active_branch.id, s3_active_branch.project_id)) } }; diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 405ce627ff24..f19a55efacb8 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -129,9 +129,7 @@ impl Display for BucketConfig { impl BucketConfig { pub fn from_env() -> anyhow::Result { - let sso_account_id = env::var("SSO_ACCOUNT_ID") - .context("'SSO_ACCOUNT_ID' param retrieval") - .ok(); + let sso_account_id = env::var("SSO_ACCOUNT_ID").ok(); let region = env::var("REGION").context("'REGION' param retrieval")?; let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?; diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 775a6a5b56aa..b07ca0805eeb 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -37,7 +37,7 @@ def subprocess_capture( capture_stdout=False, **kwargs: Any, ) -> Tuple[str, Optional[str], int]: - """Run a process and bifurcate its outout to files and log + """Run a process and bifurcate its output to files and the `log` logger stderr and stdout are always captured in files.
They are also optionally echoed to the log (echo_stderr, echo_stdout), and/or captured and returned From 8ac049b474321fdc72ddcb56d7165153a1a900e8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Sep 2023 10:49:16 +0100 Subject: [PATCH 24/24] Tolerate non-unicode postgres log output --- test_runner/fixtures/utils.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index b07ca0805eeb..46ab446f99c6 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -73,24 +73,27 @@ def __init__(self, in_file, out_file, echo: bool, capture: bool): def run(self): for line in self.in_file: - if self.echo: - log.info(line) + # Only bother decoding if we are going to do something more than stream to a file + if self.echo or self.capture: + string = line.decode(encoding="utf-8", errors="replace") - if self.capture: - self.captured += line + if self.echo: + log.info(string) + + if self.capture: + self.captured += string self.out_file.write(line) captured = None try: - with open(stdout_filename, "w") as stdout_f: - with open(stderr_filename, "w") as stderr_f: + with open(stdout_filename, "wb") as stdout_f: + with open(stderr_filename, "wb") as stderr_f: log.info(f'Capturing stdout to "{base}.stdout" and stderr to "{base}.stderr"') p = subprocess.Popen( cmd, **kwargs, - universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, )