From 6dd8582ef631d29f35ac25e2975edb7b52bae510 Mon Sep 17 00:00:00 2001
From: MigeljanImeri
Date: Thu, 25 Jan 2024 09:43:16 -0700
Subject: [PATCH] Allow bypassing the vdev queue on SSDs

Allow bypassing the vdev queue on SSDs if the vdev queue is less than
zfs_vdev_queue_bypass_pct percent full. This can lead to an over 2x
IOPS speed-up on some benchmarks. The intention behind this property
is to improve performance when using O_DIRECT.

Signed-off-by: MigeljanImeri
---
 include/sys/vdev_impl.h |  1 +
 include/sys/zio.h       | 19 +++++++++++++++++
 man/man4/zfs.4          |  7 ++++++
 module/zfs/vdev.c       |  2 ++
 module/zfs/vdev_queue.c | 47 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 76 insertions(+)

diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index abd66b8abc96..3be570afb978 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -151,6 +151,7 @@ struct vdev_queue {
 	uint32_t	vq_ia_active;	/* Active interactive I/Os. */
 	uint32_t	vq_nia_credit;	/* Non-interactive I/Os credit. */
 	list_t		vq_active_list;	/* List of active I/Os. */
+	kmutex_t	vq_active_list_lock;
 	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	hrtime_t	vq_io_delta_ts;
 	zio_t		vq_io_search;	/* used as local for stack reduction */
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 46f5d68aed4a..15cd171fde2d 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -249,6 +249,24 @@ typedef uint64_t zio_flag_t;
 #define	ZIO_CHILD_BIT(x)		(1U << (x))
 #define	ZIO_CHILD_BIT_IS_SET(val, x)	((val) & (1U << (x)))
 
+
+/*
+ * ZIOs that are ZIO_FLAG_IMPORTANT are always queued so that they never get
+ * starved out. This allows us to bypass the queue for "normal" reads and
+ * writes when the queues are low for better IOPS. If the queues get too high
+ * then we go back to queuing the "normal" reads/writes so as not to starve
+ * out more important IOs like scrub/resilver/retry. See
+ * zfs_vdev_queue_bypass_pct for details.
+ */
+
+#define	ZIO_FLAG_IMPORTANT	(			\
+	ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL |	\
+	ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB |		\
+	ZIO_FLAG_IO_RETRY)
+
+#define	ZIO_IS_NORMAL(zio)	\
+	!((zio)->io_flags & (ZIO_FLAG_IMPORTANT))
+
 enum zio_child {
 	ZIO_CHILD_VDEV = 0,
 	ZIO_CHILD_GANG,
@@ -449,6 +467,7 @@ enum zio_qstate {
 	ZIO_QS_NONE = 0,
 	ZIO_QS_QUEUED,
 	ZIO_QS_ACTIVE,
+	ZIO_QS_BYPASS,
 };
 
 struct zio {
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 7078a5ba8373..d1c4ad15e060 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1528,6 +1528,13 @@ Default queue depth for each vdev IO allocator.
 Higher values allow for better coalescing of sequential writes before sending
 them to the disk, but can increase transaction commit times.
 .
+.It Sy zfs_vdev_queue_bypass_pct Ns = Ns Sy 10 Pq uint
+Allow bypassing the vdev's queue if the vdev queue is less than
+zfs_vdev_queue_bypass_pct percent full.
+This only applies to SSDs (non-rotational drives).
+Only "normal" (read/write) ZIOs can bypass the queue.
+Use 0 to always queue I/Os and 100 to never queue I/Os.
+.
 .It Sy zfs_vdev_failfast_mask Ns = Ns Sy 1 Pq uint
 Defines if the driver should retire on a given error type.
 The following options may be bitwise-ored together:
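
As a rough illustration of how an administrator might drive the tunable
documented above: a small userspace sketch, not part of the patch, which
assumes the parameter surfaces at the usual Linux module-parameter path
(/sys/module/zfs/parameters/zfs_vdev_queue_bypass_pct); the value 25 is
illustrative.

/*
 * Illustrative only: raise the bypass window from the default 10% to
 * 25% at runtime. Assumes Linux and root privileges. Equivalent to:
 *   echo 25 > /sys/module/zfs/parameters/zfs_vdev_queue_bypass_pct
 */
#include <stdio.h>

int
main(void)
{
	const char *path =
	    "/sys/module/zfs/parameters/zfs_vdev_queue_bypass_pct";
	FILE *fp = fopen(path, "w");

	if (fp == NULL) {
		perror(path);
		return (1);
	}
	fprintf(fp, "25\n");
	return (fclose(fp) == 0 ? 0 : 1);
}

Because the parameter is ZMOD_RW, a change like this takes effect on the
next I/O submission without reimporting the pool.
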
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 5df2f77e5780..a25d4b00f026 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -5634,7 +5634,9 @@ vdev_deadman(vdev_t *vd, const char *tag)
 		 * if any I/O has been outstanding for longer than
 		 * the spa_deadman_synctime invoke the deadman logic.
 		 */
+		mutex_enter(&vq->vq_active_list_lock);
 		fio = list_head(&vq->vq_active_list);
+		mutex_exit(&vq->vq_active_list_lock);
 		delta = gethrtime() - fio->io_timestamp;
 		if (delta > spa_deadman_synctime(spa))
 			zio_deadman(fio, tag);
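
The deadman hunk above and the vdev_queue.c hunks below depend on the new
dedicated vq_active_list_lock. A minimal pthreads model of that split, an
assumption-laden sketch with illustrative names rather than patch code: the
bypass fast path takes only the list lock, so bypassed ZIOs enter and leave
the active list without ever touching vq_lock, and anything that walks the
list (such as the deadman) must therefore hold the list lock as well.

/*
 * Userspace model of the two-lock split: vq_lock serializes the
 * queueing machinery, while vq_active_list_lock covers only the
 * active list.
 */
#include <pthread.h>

struct model_vq {
	pthread_mutex_t vq_lock;		/* queue state */
	pthread_mutex_t vq_active_list_lock;	/* active list only */
	int vq_active_len;			/* stand-in for vq_active_list */
};

static struct model_vq vq = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, 0
};

/* Bypass fast path: never touches vq_lock. */
static void
bypass_add(struct model_vq *q)
{
	pthread_mutex_lock(&q->vq_active_list_lock);
	q->vq_active_len++;
	pthread_mutex_unlock(&q->vq_active_list_lock);
}

/* Queued path: bookkeeping under vq_lock, list under its own lock. */
static void
queued_add(struct model_vq *q)
{
	pthread_mutex_lock(&q->vq_lock);
	/* class/priority accounting would happen here */
	pthread_mutex_lock(&q->vq_active_list_lock);
	q->vq_active_len++;
	pthread_mutex_unlock(&q->vq_active_list_lock);
	pthread_mutex_unlock(&q->vq_lock);
}

int
main(void)
{
	bypass_add(&vq);
	queued_add(&vq);
	return (0);
}

Note the consistent lock order (vq_lock, then vq_active_list_lock) on the
queued path; the bypass path takes only the inner lock, which is what keeps
it cheap.
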
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 092b3f375be0..91093233e361 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -228,6 +228,12 @@ uint_t zfs_vdev_queue_depth_pct = 300;
  */
 uint_t zfs_vdev_def_queue_depth = 32;
 
+/*
+ * Allow io to bypass the queue depending on how full the queue is.
+ * 0 = never bypass, 100 = always bypass.
+ */
+uint_t zfs_vdev_queue_bypass_pct = 10;
+
 static int
 vdev_queue_offset_compare(const void *x1, const void *x2)
 {
@@ -502,6 +508,7 @@ vdev_queue_init(vdev_t *vd)
 	list_create(&vq->vq_active_list, sizeof (struct zio),
 	    offsetof(struct zio, io_queue_node.l));
 	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vq->vq_active_list_lock, NULL, MUTEX_DEFAULT, NULL);
 }
 
 void
@@ -520,6 +527,7 @@ vdev_queue_fini(vdev_t *vd)
 	list_destroy(&vq->vq_active_list);
 
 	mutex_destroy(&vq->vq_lock);
+	mutex_destroy(&vq->vq_active_list_lock);
 }
 
 static void
@@ -572,7 +580,9 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 		vq->vq_nia_credit--;
 	}
 	zio->io_queue_state = ZIO_QS_ACTIVE;
+	mutex_enter(&vq->vq_active_list_lock);
 	list_insert_tail(&vq->vq_active_list, zio);
+	mutex_exit(&vq->vq_active_list_lock);
 }
 
 static void
@@ -589,7 +599,9 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 		vq->vq_nia_credit = zfs_vdev_nia_credit;
 	} else if (vq->vq_ia_active == 0)
 		vq->vq_nia_credit++;
+	mutex_enter(&vq->vq_active_list_lock);
 	list_remove(&vq->vq_active_list, zio);
+	mutex_exit(&vq->vq_active_list_lock);
 	zio->io_queue_state = ZIO_QS_NONE;
 }
 
@@ -946,6 +958,30 @@ vdev_queue_io(zio_t *zio)
 		zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
 	zio->io_timestamp = gethrtime();
 
+	/*
+	 * Bypass queue if certain conditions are met. Queue bypassing requires
+	 * a non-rotational device. Reads/writes will attempt to bypass the
+	 * queue, depending on how full the queue is. Other operations will
+	 * always queue. Bypassing the queue can lead to a 2x IOPS speed-up on
+	 * some benchmarks. If the queue is too full (due to a scrub or
+	 * resilver) then go back to queuing normal reads/writes so as not to
+	 * starve out the more important IOs.
+	 */
+	if (zio->io_vd->vdev_nonrot && ZIO_IS_NORMAL(zio)) {
+
+		int bypass = vdev_queue_length(vq->vq_vdev) <
+		    (zfs_vdev_max_active * zfs_vdev_queue_bypass_pct) / 100 ?
+		    1 : 0;
+
+		if (bypass) {
+			zio->io_queue_state = ZIO_QS_BYPASS;
+			mutex_enter(&vq->vq_active_list_lock);
+			list_insert_tail(&vq->vq_active_list, zio);
+			mutex_exit(&vq->vq_active_list_lock);
+			return (zio);
+		}
+	}
+
 	mutex_enter(&vq->vq_lock);
 	vdev_queue_io_add(vq, zio);
 	nio = vdev_queue_io_to_issue(vq);
@@ -978,6 +1014,14 @@ vdev_queue_io_done(zio_t *zio)
 	vq->vq_io_complete_ts = now;
 	vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;
 
+	if (zio->io_queue_state == ZIO_QS_BYPASS) {
+		mutex_enter(&vq->vq_active_list_lock);
+		list_remove(&vq->vq_active_list, zio);
+		mutex_exit(&vq->vq_active_list_lock);
+		zio->io_queue_state = ZIO_QS_NONE;
+		return;
+	}
+
 	mutex_enter(&vq->vq_lock);
 	vdev_queue_pending_remove(vq, zio);
 
@@ -1163,3 +1207,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW,
 	"Default queue depth for each allocator");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_bypass_pct, UINT, ZMOD_RW,
	"Queue bypass percentage per vdev");
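
To make the bypass policy concrete, here is a standalone model of the
decision implemented in vdev_queue_io() above. This is not patch code: the
flag names are stand-ins, and zfs_vdev_max_active is assumed to be at its
upstream default of 1000. With the default 10 percent, a normal read/write
on an SSD bypasses the queue only while fewer than 100 I/Os are outstanding
on the vdev.

#include <stdio.h>

typedef struct {
	unsigned int io_flags;
	int vdev_nonrot;	/* 1 if the backing vdev is an SSD */
} model_zio_t;

/* Illustrative stand-ins for the ZIO_FLAG_IMPORTANT flags. */
#define	M_FLAG_IO_REPAIR	(1U << 0)
#define	M_FLAG_SELF_HEAL	(1U << 1)
#define	M_FLAG_RESILVER		(1U << 2)
#define	M_FLAG_SCRUB		(1U << 3)
#define	M_FLAG_IO_RETRY		(1U << 4)
#define	M_FLAG_IMPORTANT	(M_FLAG_IO_REPAIR | M_FLAG_SELF_HEAL | \
	M_FLAG_RESILVER | M_FLAG_SCRUB | M_FLAG_IO_RETRY)

static unsigned int max_active = 1000;	/* default zfs_vdev_max_active */
static unsigned int bypass_pct = 10;	/* default zfs_vdev_queue_bypass_pct */

/* 1 = skip the queue, 0 = enqueue; mirrors the vdev_queue_io() test. */
static int
may_bypass(const model_zio_t *zio, unsigned int outstanding)
{
	if (!zio->vdev_nonrot)			/* rotational: always queue */
		return (0);
	if (zio->io_flags & M_FLAG_IMPORTANT)	/* scrub etc.: always queue */
		return (0);
	return (outstanding < (max_active * bypass_pct) / 100);
}

int
main(void)
{
	model_zio_t normal = { 0, 1 };
	model_zio_t scrub = { M_FLAG_SCRUB, 1 };

	printf("normal, shallow queue: %d\n", may_bypass(&normal, 50));	/* 1 */
	printf("normal, deep queue:    %d\n", may_bypass(&normal, 200));	/* 0 */
	printf("scrub, shallow queue:  %d\n", may_bypass(&scrub, 50));	/* 0 */
	return (0);
}

The integer division rounds the threshold down, so a bypass_pct of 0 never
bypasses, matching the man page entry added earlier in the patch.
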