From e10b324efbf3af95a3b1195308f15416a29f5309 Mon Sep 17 00:00:00 2001 From: Sergei Shulepov Date: Tue, 5 Nov 2024 20:04:51 +0100 Subject: [PATCH] feat: conditional preallocate Allow the user to choose whether to preallocate the space for the hashtable file. Preallocation is enabled by default. --- nomt/src/bitbox/ht_file.rs | 67 ++++++++++++++++++++++++-------------- nomt/src/options.rs | 16 +++++++++ nomt/src/store/mod.rs | 2 +- nomt/src/sys/linux.rs | 36 ++++++++++++++++++++ 4 files changed, 96 insertions(+), 25 deletions(-) diff --git a/nomt/src/bitbox/ht_file.rs b/nomt/src/bitbox/ht_file.rs index be7a77bd..8e9b3c1e 100644 --- a/nomt/src/bitbox/ht_file.rs +++ b/nomt/src/bitbox/ht_file.rs @@ -62,8 +62,8 @@ pub fn open( /// Creates the store file. Fails if store file already exists. /// -/// Lays out the meta page, and fills the file with zeroes. -pub fn create(path: PathBuf, num_pages: u32) -> std::io::Result<()> { +/// Lays out the meta page. If `preallocate` is true, preallocates the blocks for the file. +pub fn create(path: PathBuf, num_pages: u32, preallocate: bool) -> std::io::Result<()> { let start = std::time::Instant::now(); let ht_path = path.join("ht"); let ht_file = OpenOptions::new().write(true).create(true).open(ht_path)?; @@ -71,9 +71,9 @@ pub fn create(path: PathBuf, num_pages: u32) -> std::io::Result<()> { // number of pages + pages required for meta bits. 
let page_count = num_pages + num_meta_byte_pages(num_pages); let len = page_count as usize * PAGE_SIZE; - ht_file.set_len(len as u64)?; - zero_file(&ht_file, len)?; + resize_and_prealloc(&ht_file, len as u64, preallocate)?; + ht_file.sync_all()?; drop(ht_file); @@ -90,30 +90,50 @@ pub fn create(path: PathBuf, num_pages: u32) -> std::io::Result<()> { Ok(()) } -#[cfg(target_os = "linux")] -fn zero_file(file: &File, len: usize) -> std::io::Result<()> { - let res = unsafe { - use std::os::fd::AsRawFd; - - libc::fallocate( - file.as_raw_fd(), - libc::FALLOC_FL_ZERO_RANGE, - 0 as _, - len as _, - ) - }; - - if res == -1 { - Err(std::io::Error::last_os_error()) - } else { - Ok(()) +/// Sets the file size and attempts to preallocate the file if `preallocate` is true. +/// +/// Returns an error if setting the file size fails. File preallocation is done on a best-effort basis +/// and may silently fall back to regular allocation. +/// +/// After this call, if successful, the file size is set to `len` bytes. +fn resize_and_prealloc(ht_file: &File, len: u64, preallocate: bool) -> std::io::Result<()> { + if !preallocate { + // If not preallocating, just set the file size and return. + ht_file.set_len(len)?; + return Ok(()); + } + + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + // To preallocate on Linux systems, try using fallocate with ZERO_RANGE first as it's more + // efficient. fallocate sets the file size as well, so ftruncate (aka file.set_len()) is + // not needed. + if crate::sys::linux::tmpfs_check(ht_file) { + // Skip preallocation for tmpfs. It doesn't support fallocate and it's + // memory-backed anyway. ftruncate and bail. + ht_file.set_len(len)?; + return Ok(()); + } + if let Err(_) = crate::sys::linux::falloc_zero_file(ht_file, len) { + // If fallocate fails, fall back to zeroing the file with write. 
+ resize_and_zero_file(ht_file, len)?; + } + return Ok(()); + } else { + resize_and_zero_file(ht_file, len)?; + } } } -#[cfg(not(target_os = "linux"))] -fn zero_file(mut file: &File, len: usize) -> std::io::Result<()> { +// Fallback method for allocating extents for the file: just incrementally write zeroes to the file. +fn resize_and_zero_file(mut file: &File, len: u64) -> std::io::Result<()> { use std::io::Write; + // Set the file size first. + file.set_len(len)?; + + // Zero the file. + let len = len as usize; let buf = [0u8; PAGE_SIZE * 4]; let mut remaining = len; while remaining > 0 { @@ -121,6 +141,5 @@ fn zero_file(mut file: &File, len: usize) -> std::io::Result<()> { file.write_all(&buf[..len])?; remaining -= len; } - Ok(()) } diff --git a/nomt/src/options.rs b/nomt/src/options.rs index 011da0e4..0dc84624 100644 --- a/nomt/src/options.rs +++ b/nomt/src/options.rs @@ -19,6 +19,8 @@ pub struct Options { pub(crate) warm_up: bool, /// The number of threads to use for fetching prior values. pub(crate) rollback_tp_size: usize, + /// Whether to preallocate the hashtable file. + pub(crate) preallocate_ht: bool, } impl Options { @@ -40,6 +42,7 @@ impl Options { max_rollback_log_len: 100, warm_up: false, rollback_tp_size: 4, + preallocate_ht: true, } } @@ -121,4 +124,17 @@ impl Options { pub fn rollback_tp_size(&mut self, rollback_tp_size: usize) { self.rollback_tp_size = rollback_tp_size; } + + /// Sets whether to preallocate the hashtable file. + /// + /// Many filesystems don't handle sparse files well. If the `preallocate_ht` option is set to + /// `true`, NOMT will try to make sure that the file is fully allocated. + /// + /// If set to `false` this won't allocate the disk space for the hashtable file upfront, but can + /// lead to fragmentation later. + /// + /// Default: `true`. 
+ pub fn preallocate_ht(&mut self, preallocate_ht: bool) { + self.preallocate_ht = preallocate_ht; + } } diff --git a/nomt/src/store/mod.rs b/nomt/src/store/mod.rs index b7e7611c..741fe3d7 100644 --- a/nomt/src/store/mod.rs +++ b/nomt/src/store/mod.rs @@ -326,7 +326,7 @@ fn create(o: &crate::Options) -> anyhow::Result<()> { meta_fd.sync_all()?; drop(meta_fd); - bitbox::create(o.path.clone(), o.bitbox_num_pages)?; + bitbox::create(o.path.clone(), o.bitbox_num_pages, o.preallocate_ht)?; beatree::create(&o.path)?; // As the last step, sync the directory. diff --git a/nomt/src/sys/linux.rs b/nomt/src/sys/linux.rs index 8d4c2fcc..9053414a 100644 --- a/nomt/src/sys/linux.rs +++ b/nomt/src/sys/linux.rs @@ -1 +1,37 @@ //! Linux-specific code. + +use super::unix::cvt_r; +use std::fs::File; +use std::os::fd::AsRawFd; + +/// Returns true if the file is on a tmpfs filesystem. +/// False if it's not or the check fails. +pub fn tmpfs_check(file: &File) -> bool { + unsafe { + // SAFETY: unsafe because ffi call. This should be IO-safe because the file is passed + // by reference. This should be memory-safe because the `statfs` struct is + // zeroed and the `f_type` field should be set by the ffi call. + let mut stat: libc::statfs = std::mem::zeroed(); + cvt_r(|| libc::fstatfs(file.as_raw_fd(), &mut stat)) + .map(|_| stat.f_type == libc::TMPFS_MAGIC) + .unwrap_or(false) + } +} + +/// fallocate grows the file to the given length if the current size is smaller than that. +/// If the file is already larger than the given length, it is not truncated. +/// +/// Doesn't work on tmpfs. +pub fn falloc_zero_file(file: &File, len: u64) -> std::io::Result<()> { + cvt_r(|| unsafe { + // SAFETY: unsafe because ffi call. This should be IO-safe because the file is passed + // by reference. + libc::fallocate( + file.as_raw_fd(), + libc::FALLOC_FL_ZERO_RANGE, + 0 as _, + len as _, + ) + }) + .map(drop) +}