From 6dd8582ef631d29f35ac25e2975edb7b52bae510 Mon Sep 17 00:00:00 2001
From: MigeljanImeri
Date: Thu, 25 Jan 2024 09:43:16 -0700
Subject: [PATCH] Allow bypassing the vdev queue on SSDs

Allow bypassing the vdev queue on SSDs if the vdev queue is less than
zfs_vdev_queue_bypass_pct percent full. This can lead to an over 2x
IOPS speed-up on some benchmarks. The intention behind this property
is to improve performance when using O_DIRECT.

Signed-off-by: MigeljanImeri
---
 include/sys/vdev_impl.h |  1 +
 include/sys/zio.h       | 19 +++++++++++++++++
 man/man4/zfs.4          |  7 ++++++
 module/zfs/vdev.c       |  2 ++
 module/zfs/vdev_queue.c | 47 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 76 insertions(+)

diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index abd66b8abc96..3be570afb978 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -151,6 +151,7 @@ struct vdev_queue {
 	uint32_t	vq_ia_active;	/* Active interactive I/Os. */
 	uint32_t	vq_nia_credit;	/* Non-interactive I/Os credit. */
 	list_t		vq_active_list;	/* List of active I/Os. */
+	kmutex_t	vq_active_list_lock;
 	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	hrtime_t	vq_io_delta_ts;
 	zio_t		vq_io_search;	/* used as local for stack reduction */
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 46f5d68aed4a..15cd171fde2d 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -249,6 +249,24 @@ typedef uint64_t zio_flag_t;
 #define	ZIO_CHILD_BIT(x)		(1U << (x))
 #define	ZIO_CHILD_BIT_IS_SET(val, x)	((val) & (1U << (x)))
 
+
+/*
+ * ZIOs that are ZIO_FLAG_IMPORTANT are always queued so that they never get
+ * starved out. This allows us to bypass the queue for "normal" reads and
+ * writes when the queues are low for better IOPS. If the queues get too high
+ * then we go back to queuing the "normal" reads/writes so as not to starve
+ * out more important IOs like scrub/resilver/retry. See
+ * zfs_vdev_queue_bypass_pct for details.
+ */
+
+#define	ZIO_FLAG_IMPORTANT	(			\
+	ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL |	\
+	ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB |		\
+	ZIO_FLAG_IO_RETRY)
+
+#define	ZIO_IS_NORMAL(zio)	\
+	!((zio)->io_flags & (ZIO_FLAG_IMPORTANT))
+
 enum zio_child {
 	ZIO_CHILD_VDEV = 0,
 	ZIO_CHILD_GANG,
@@ -449,6 +467,7 @@ enum zio_qstate {
 	ZIO_QS_NONE = 0,
 	ZIO_QS_QUEUED,
 	ZIO_QS_ACTIVE,
+	ZIO_QS_BYPASS,
 };
 
 struct zio {
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 7078a5ba8373..d1c4ad15e060 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -1528,6 +1528,13 @@ Default queue depth for each vdev IO allocator.
 Higher values allow for better coalescing of sequential writes before sending
 them to the disk, but can increase transaction commit times.
 .
+.It Sy zfs_vdev_queue_bypass_pct Ns = Ns Sy 10 Pq uint
+Allow bypassing the vdev's queue if the vdev queue is less than
+zfs_vdev_queue_bypass_pct percent full.
+This only applies to SSDs (non-rotational drives).
+Only "normal" (read/write) ZIOs can bypass the queue.
+Use 0 to always queue I/Os and 100 to never queue I/Os.
+.
 .It Sy zfs_vdev_failfast_mask Ns = Ns Sy 1 Pq uint
 Defines if the driver should retire on a given error type.
 The following options may be bitwise-ored together:
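
As a rough illustration of how an administrator might drive the tunable
documented above: a small userspace sketch, not part of the patch, which
assumes the parameter surfaces at the usual Linux module-parameter path
(/sys/module/zfs/parameters/zfs_vdev_queue_bypass_pct); the value 25 is
illustrative.

/*
 * Illustrative only: raise the bypass window from the default 10% to
 * 25% at runtime. Assumes Linux and root privileges. Equivalent to:
 *   echo 25 > /sys/module/zfs/parameters/zfs_vdev_queue_bypass_pct
 */
#include <stdio.h>

int
main(void)
{
	const char *path =
	    "/sys/module/zfs/parameters/zfs_vdev_queue_bypass_pct";
	FILE *fp = fopen(path, "w");

	if (fp == NULL) {
		perror(path);
		return (1);
	}
	fprintf(fp, "25\n");
	return (fclose(fp) == 0 ? 0 : 1);
}

Because the parameter is ZMOD_RW, a change like this takes effect on the
next I/O submission without reimporting the pool.
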
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 5df2f77e5780..a25d4b00f026 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -5634,7 +5634,9 @@ vdev_deadman(vdev_t *vd, const char *tag)
 		 * if any I/O has been outstanding for longer than
 		 * the spa_deadman_synctime invoke the deadman logic.
 		 */
+		mutex_enter(&vq->vq_active_list_lock);
 		fio = list_head(&vq->vq_active_list);
+		mutex_exit(&vq->vq_active_list_lock);
 		delta = gethrtime() - fio->io_timestamp;
 		if (delta > spa_deadman_synctime(spa))
 			zio_deadman(fio, tag);
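
The deadman hunk above and the vdev_queue.c hunks below depend on the new
dedicated vq_active_list_lock. A minimal pthreads model of that split, an
assumption-laden sketch with illustrative names rather than patch code: the
bypass fast path takes only the list lock, so bypassed ZIOs enter and leave
the active list without ever touching vq_lock, and anything that walks the
list (such as the deadman) must therefore hold the list lock as well.

/*
 * Userspace model of the two-lock split: vq_lock serializes the
 * queueing machinery, while vq_active_list_lock covers only the
 * active list.
 */
#include <pthread.h>

struct model_vq {
	pthread_mutex_t vq_lock;		/* queue state */
	pthread_mutex_t vq_active_list_lock;	/* active list only */
	int vq_active_len;			/* stand-in for vq_active_list */
};

static struct model_vq vq = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, 0
};

/* Bypass fast path: never touches vq_lock. */
static void
bypass_add(struct model_vq *q)
{
	pthread_mutex_lock(&q->vq_active_list_lock);
	q->vq_active_len++;
	pthread_mutex_unlock(&q->vq_active_list_lock);
}

/* Queued path: bookkeeping under vq_lock, list under its own lock. */
static void
queued_add(struct model_vq *q)
{
	pthread_mutex_lock(&q->vq_lock);
	/* class/priority accounting would happen here */
	pthread_mutex_lock(&q->vq_active_list_lock);
	q->vq_active_len++;
	pthread_mutex_unlock(&q->vq_active_list_lock);
	pthread_mutex_unlock(&q->vq_lock);
}

int
main(void)
{
	bypass_add(&vq);
	queued_add(&vq);
	return (0);
}

Note the consistent lock order (vq_lock, then vq_active_list_lock) on the
queued path; the bypass path takes only the inner lock, which is what keeps
it cheap.
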
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 092b3f375be0..91093233e361 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -228,6 +228,12 @@ uint_t zfs_vdev_queue_depth_pct = 300;
  */
 uint_t zfs_vdev_def_queue_depth = 32;
 
+/*
+ * Allow io to bypass the queue depending on how full the queue is.
+ * 0 = never bypass, 100 = always bypass.
+ */
+uint_t zfs_vdev_queue_bypass_pct = 10;
+
 static int
 vdev_queue_offset_compare(const void *x1, const void *x2)
 {
@@ -502,6 +508,7 @@ vdev_queue_init(vdev_t *vd)
 	list_create(&vq->vq_active_list, sizeof (struct zio),
 	    offsetof(struct zio, io_queue_node.l));
 	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vq->vq_active_list_lock, NULL, MUTEX_DEFAULT, NULL);
 }
 
 void
@@ -520,6 +527,7 @@ vdev_queue_fini(vdev_t *vd)
 	list_destroy(&vq->vq_active_list);
 
 	mutex_destroy(&vq->vq_lock);
+	mutex_destroy(&vq->vq_active_list_lock);
 }
 
 static void
@@ -572,7 +580,9 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 		vq->vq_nia_credit--;
 	}
 	zio->io_queue_state = ZIO_QS_ACTIVE;
+	mutex_enter(&vq->vq_active_list_lock);
 	list_insert_tail(&vq->vq_active_list, zio);
+	mutex_exit(&vq->vq_active_list_lock);
 }
 
 static void
@@ -589,7 +599,9 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 		vq->vq_nia_credit = zfs_vdev_nia_credit;
 	} else if (vq->vq_ia_active == 0)
 		vq->vq_nia_credit++;
+	mutex_enter(&vq->vq_active_list_lock);
 	list_remove(&vq->vq_active_list, zio);
+	mutex_exit(&vq->vq_active_list_lock);
 	zio->io_queue_state = ZIO_QS_NONE;
 }
 
@@ -946,6 +958,30 @@ vdev_queue_io(zio_t *zio)
 		zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
 	zio->io_timestamp = gethrtime();
 
+	/*
+	 * Bypass queue if certain conditions are met. Queue bypassing requires
+	 * a non-rotational device. Reads/writes will attempt to bypass the
+	 * queue, depending on how full the queue is. Other operations will
+	 * always queue. Bypassing the queue can lead to a 2x IOPS speed-up on
+	 * some benchmarks. If the queue is too full (due to a scrub or
+	 * resilver) then go back to queuing normal reads/writes so as not to
+	 * starve out the more important IOs.
+	 */
+	if (zio->io_vd->vdev_nonrot && ZIO_IS_NORMAL(zio)) {
+
+		int bypass = vdev_queue_length(vq->vq_vdev) <
+		    (zfs_vdev_max_active * zfs_vdev_queue_bypass_pct) / 100 ?
+		    1 : 0;
+
+		if (bypass) {
+			zio->io_queue_state = ZIO_QS_BYPASS;
+			mutex_enter(&vq->vq_active_list_lock);
+			list_insert_tail(&vq->vq_active_list, zio);
+			mutex_exit(&vq->vq_active_list_lock);
+			return (zio);
+		}
+	}
+
 	mutex_enter(&vq->vq_lock);
 	vdev_queue_io_add(vq, zio);
 	nio = vdev_queue_io_to_issue(vq);
@@ -978,6 +1014,14 @@ vdev_queue_io_done(zio_t *zio)
 	vq->vq_io_complete_ts = now;
 	vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;
 
+	if (zio->io_queue_state == ZIO_QS_BYPASS) {
+		mutex_enter(&vq->vq_active_list_lock);
+		list_remove(&vq->vq_active_list, zio);
+		mutex_exit(&vq->vq_active_list_lock);
+		zio->io_queue_state = ZIO_QS_NONE;
+		return;
+	}
+
 	mutex_enter(&vq->vq_lock);
 	vdev_queue_pending_remove(vq, zio);
 
@@ -1163,3 +1207,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW,
 	"Default queue depth for each allocator");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_bypass_pct, UINT, ZMOD_RW,
	"Queue bypass percentage per vdev");
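
To make the bypass policy concrete, here is a standalone model of the
decision implemented in vdev_queue_io() above. This is not patch code: the
flag names are stand-ins, and zfs_vdev_max_active is assumed to be at its
upstream default of 1000. With the default 10 percent, a normal read/write
on an SSD bypasses the queue only while fewer than 100 I/Os are outstanding
on the vdev.

#include <stdio.h>

typedef struct {
	unsigned int io_flags;
	int vdev_nonrot;	/* 1 if the backing vdev is an SSD */
} model_zio_t;

/* Illustrative stand-ins for the ZIO_FLAG_IMPORTANT flags. */
#define	M_FLAG_IO_REPAIR	(1U << 0)
#define	M_FLAG_SELF_HEAL	(1U << 1)
#define	M_FLAG_RESILVER		(1U << 2)
#define	M_FLAG_SCRUB		(1U << 3)
#define	M_FLAG_IO_RETRY		(1U << 4)
#define	M_FLAG_IMPORTANT	(M_FLAG_IO_REPAIR | M_FLAG_SELF_HEAL | \
	M_FLAG_RESILVER | M_FLAG_SCRUB | M_FLAG_IO_RETRY)

static unsigned int max_active = 1000;	/* default zfs_vdev_max_active */
static unsigned int bypass_pct = 10;	/* default zfs_vdev_queue_bypass_pct */

/* 1 = skip the queue, 0 = enqueue; mirrors the vdev_queue_io() test. */
static int
may_bypass(const model_zio_t *zio, unsigned int outstanding)
{
	if (!zio->vdev_nonrot)			/* rotational: always queue */
		return (0);
	if (zio->io_flags & M_FLAG_IMPORTANT)	/* scrub etc.: always queue */
		return (0);
	return (outstanding < (max_active * bypass_pct) / 100);
}

int
main(void)
{
	model_zio_t normal = { 0, 1 };
	model_zio_t scrub = { M_FLAG_SCRUB, 1 };

	printf("normal, shallow queue: %d\n", may_bypass(&normal, 50));	/* 1 */
	printf("normal, deep queue:    %d\n", may_bypass(&normal, 200));	/* 0 */
	printf("scrub, shallow queue:  %d\n", may_bypass(&scrub, 50));	/* 0 */
	return (0);
}

The integer division rounds the threshold down, so a bypass_pct of 0 never
bypasses, matching the man page entry added earlier in the patch.
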