From 981881544c0634705d745f793783442d14fa14fe Mon Sep 17 00:00:00 2001 From: Alessandro Decina Date: Thu, 21 Mar 2024 14:28:23 +1100 Subject: [PATCH] runtime: do fewer syscalls in remap_append_vec_file (#336) * runtime: do fewer syscalls in remap_append_vec_file Use renameat2(src, dest, NOREPLACE) as an atomic version of if statx(dest).is_err() { rename(src, dest) }. We have high inode contention during storage rebuild and this saves 1 fs syscall for each appendvec. * Address review feedback --- Cargo.lock | 1 + programs/sbf/Cargo.lock | 1 + runtime/Cargo.toml | 1 + runtime/src/serde_snapshot.rs | 89 ++++++++++++++++++++++++----- runtime/src/serde_snapshot/tests.rs | 63 +++++++++++++++++++- 5 files changed, 137 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 06d28868c2bcff..2fb61646b042b1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6997,6 +6997,7 @@ dependencies = [ "index_list", "itertools", "lazy_static", + "libc", "libsecp256k1", "log", "lru", diff --git a/programs/sbf/Cargo.lock b/programs/sbf/Cargo.lock index 3204776825622b..c211c81696541a 100644 --- a/programs/sbf/Cargo.lock +++ b/programs/sbf/Cargo.lock @@ -5695,6 +5695,7 @@ dependencies = [ "index_list", "itertools", "lazy_static", + "libc", "log", "lru", "lz4", diff --git a/runtime/Cargo.toml b/runtime/Cargo.toml index 02553d4215909d..49451aa02eed26 100644 --- a/runtime/Cargo.toml +++ b/runtime/Cargo.toml @@ -28,6 +28,7 @@ im = { workspace = true, features = ["rayon", "serde"] } index_list = { workspace = true } itertools = { workspace = true } lazy_static = { workspace = true } +libc = { workspace = true } log = { workspace = true } lru = { workspace = true } lz4 = { workspace = true } diff --git a/runtime/src/serde_snapshot.rs b/runtime/src/serde_snapshot.rs index f866f32577f38e..998fa82e2326d1 100644 --- a/runtime/src/serde_snapshot.rs +++ b/runtime/src/serde_snapshot.rs @@ -1,3 +1,5 @@ +#[cfg(target_os = "linux")] +use std::ffi::{CStr, CString}; use { crate::{ bank::{builtins::BuiltinPrototype, Bank, BankFieldsToDeserialize, BankRc}, @@ -655,30 +657,55 @@ pub(crate) fn reconstruct_single_storage( ))) } -fn remap_append_vec_file( +// Remap the AppendVec ID to handle any duplicate IDs that may previously existed +// due to full snapshots and incremental snapshots generated from different +// nodes +pub(crate) fn remap_append_vec_file( slot: Slot, old_append_vec_id: SerializedAppendVecId, append_vec_path: &Path, next_append_vec_id: &AtomicAppendVecId, num_collisions: &AtomicUsize, ) -> io::Result<(AppendVecId, PathBuf)> { - // Remap the AppendVec ID to handle any duplicate IDs that may previously existed - // due to full snapshots and incremental snapshots generated from different nodes + #[cfg(target_os = "linux")] + let append_vec_path_cstr = cstring_from_path(append_vec_path)?; + + let mut remapped_append_vec_path = append_vec_path.to_path_buf(); + + // Break out of the loop in the following situations: + // 1. The new ID is the same as the original ID. This means we do not need to + // rename the file, since the ID is the "correct" one already. + // 2. There is not a file already at the new path. This means it is safe to + // rename the file to this new path. let (remapped_append_vec_id, remapped_append_vec_path) = loop { let remapped_append_vec_id = next_append_vec_id.fetch_add(1, Ordering::AcqRel); + + // this can only happen in the first iteration of the loop + if old_append_vec_id == remapped_append_vec_id as SerializedAppendVecId { + break (remapped_append_vec_id, remapped_append_vec_path); + } + let remapped_file_name = AccountsFile::file_name(slot, remapped_append_vec_id); - let remapped_append_vec_path = append_vec_path.parent().unwrap().join(remapped_file_name); - - // Break out of the loop in the following situations: - // 1. The new ID is the same as the original ID. This means we do not need to - // rename the file, since the ID is the "correct" one already. - // 2. There is not a file already at the new path. This means it is safe to - // rename the file to this new path. - // **DEVELOPER NOTE:** Keep this check last so that it can short-circuit if - // possible. - if old_append_vec_id == remapped_append_vec_id as SerializedAppendVecId - || std::fs::metadata(&remapped_append_vec_path).is_err() + remapped_append_vec_path = append_vec_path.parent().unwrap().join(remapped_file_name); + + #[cfg(target_os = "linux")] { + let remapped_append_vec_path_cstr = cstring_from_path(&remapped_append_vec_path)?; + + // On linux we use renameat2(NO_REPLACE) instead of IF metadata(path).is_err() THEN + // rename() in order to save a statx() syscall. + match rename_no_replace(&append_vec_path_cstr, &remapped_append_vec_path_cstr) { + // If the file was successfully renamed, break out of the loop + Ok(_) => break (remapped_append_vec_id, remapped_append_vec_path), + // If there's already a file at the new path, continue so we try + // the next ID + Err(e) if e.kind() == io::ErrorKind::AlreadyExists => {} + Err(e) => return Err(e), + } + } + + #[cfg(not(target_os = "linux"))] + if std::fs::metadata(&remapped_append_vec_path).is_err() { break (remapped_append_vec_id, remapped_append_vec_path); } @@ -686,7 +713,10 @@ fn remap_append_vec_file( // and try again. num_collisions.fetch_add(1, Ordering::Relaxed); }; - // Only rename the file if the new ID is actually different from the original. + + // Only rename the file if the new ID is actually different from the original. In the target_os + // = linux case, we have already renamed if necessary. + #[cfg(not(target_os = "linux"))] if old_append_vec_id != remapped_append_vec_id as SerializedAppendVecId { std::fs::rename(append_vec_path, &remapped_append_vec_path)?; } @@ -953,3 +983,32 @@ where ReconstructedAccountsDbInfo { accounts_data_len }, )) } + +// Rename `src` to `dest` only if `dest` doesn't already exist. +#[cfg(target_os = "linux")] +fn rename_no_replace(src: &CStr, dest: &CStr) -> io::Result<()> { + let ret = unsafe { + libc::renameat2( + libc::AT_FDCWD, + src.as_ptr() as *const _, + libc::AT_FDCWD, + dest.as_ptr() as *const _, + libc::RENAME_NOREPLACE, + ) + }; + if ret == -1 { + return Err(io::Error::last_os_error()); + } + + Ok(()) +} + +#[cfg(target_os = "linux")] +fn cstring_from_path(path: &Path) -> io::Result { + // It is better to allocate here than use the stack. Jemalloc is going to give us a chunk of a + // preallocated small arena anyway. Instead if we used the stack since PATH_MAX=4096 it would + // result in LLVM inserting a stack probe, see + // https://docs.rs/compiler_builtins/latest/compiler_builtins/probestack/index.html. + CString::new(path.as_os_str().as_encoded_bytes()) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e)) +} diff --git a/runtime/src/serde_snapshot/tests.rs b/runtime/src/serde_snapshot/tests.rs index 510069c92662fc..2e5393a3a5bf49 100644 --- a/runtime/src/serde_snapshot/tests.rs +++ b/runtime/src/serde_snapshot/tests.rs @@ -3,8 +3,8 @@ mod serde_snapshot_tests { use { crate::{ serde_snapshot::{ - newer, reconstruct_accountsdb_from_fields, SerdeStyle, SerializableAccountsDb, - SnapshotAccountsDbFields, TypeContext, + newer, reconstruct_accountsdb_from_fields, remap_append_vec_file, SerdeStyle, + SerializableAccountsDb, SnapshotAccountsDbFields, TypeContext, }, snapshot_utils::{get_storages_to_serialize, StorageAndNextAppendVecId}, }, @@ -34,12 +34,17 @@ mod serde_snapshot_tests { rent_collector::RentCollector, }, std::{ + fs::File, io::{BufReader, Cursor, Read, Write}, ops::RangeFull, path::{Path, PathBuf}, - sync::{atomic::Ordering, Arc}, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, }, tempfile::TempDir, + test_case::test_case, }; fn linear_ancestors(end_slot: u64) -> Ancestors { @@ -845,4 +850,56 @@ mod serde_snapshot_tests { ); } } + + // no remap needed + #[test_case(456, 456, 456, 0, |_| {})] + // remap from 456 to 457, no collisions + #[test_case(456, 457, 457, 0, |_| {})] + // attempt to remap from 456 to 457, but there's a collision, so we get 458 + #[test_case(456, 457, 458, 1, |tmp| { + File::create(tmp.join("123.457")).unwrap(); + })] + fn test_remap_append_vec_file( + old_id: usize, + next_id: usize, + expected_remapped_id: usize, + expected_collisions: usize, + become_ungovernable: impl FnOnce(&Path), + ) { + let tmp = tempfile::tempdir().unwrap(); + let old_path = tmp.path().join(format!("123.{old_id}")); + let expected_remapped_path = tmp.path().join(format!("123.{expected_remapped_id}")); + File::create(&old_path).unwrap(); + + become_ungovernable(tmp.path()); + + let next_append_vec_id = AtomicAppendVecId::new(next_id as u32); + let num_collisions = AtomicUsize::new(0); + let (remapped_id, remapped_path) = + remap_append_vec_file(123, old_id, &old_path, &next_append_vec_id, &num_collisions) + .unwrap(); + assert_eq!(remapped_id as usize, expected_remapped_id); + assert_eq!(&remapped_path, &expected_remapped_path); + assert_eq!(num_collisions.load(Ordering::Relaxed), expected_collisions); + } + + #[test] + #[should_panic(expected = "No such file or directory")] + fn test_remap_append_vec_file_error() { + let tmp = tempfile::tempdir().unwrap(); + let original_path = tmp.path().join("123.456"); + + // In remap_append_vec() we want to handle EEXIST (collisions), but we want to return all + // other errors + let next_append_vec_id = AtomicAppendVecId::new(457); + let num_collisions = AtomicUsize::new(0); + remap_append_vec_file( + 123, + 456, + &original_path, + &next_append_vec_id, + &num_collisions, + ) + .unwrap(); + } }