Compare commits

...

252 Commits

Author SHA1 Message Date
Konrad Sztyber
4a4c905b32 test/bdev: extend chaining test with bdev layer ENOMEM case
The test already checked ENOMEM handling, but it only used bdevs that
support chaining (crypto, malloc), so the bdev layer never needed to
execute any accel operations itself. To force the bdev layer to do that,
a passthru bdev, which doesn't support chaining, was added.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I322a65ccebb0f144c759692fff285cfd44bbab4b
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17766
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-05-09 05:35:39 +00:00
Konrad Sztyber
dd06b35ed8 bdev: remove handle_no_mem from push/seq cb
The IOs are never completed with NOMEM from the push/sequence callbacks,
and NOMEM IOs are already retried in the internal callbacks, so there's
no point in calling _bdev_io_handle_no_mem().

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Iecc2a41f2a394836f62d541e6235277f333f226b
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17765
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-05-09 05:35:39 +00:00
Konrad Sztyber
b059b49bdf bdev: rename (pull|push)_done callbacks
The functions that were passed as callbacks for the memory domain
pull/push calls were prefixed with an underscore, which doesn't really
explain how they differ from the corresponding functions without an
underscore. They are now renamed to *_and_track() to emphasize that
they are additionally responsible for tracking IOs.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Ia9e56230fe244d2c64d729e97445fae105418a76
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17931
Community-CI: Mellanox Build Bot
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-09 05:35:39 +00:00
Konrad Sztyber
f8a33650d2 bdev: retry IOs on ENOMEM when pushing bounce data/md
Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Ia7634b570eb7d04c22003337a46630d152171157
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17764
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-09 05:35:39 +00:00
Konrad Sztyber
fafb7d4741 bdev: enqueue IOs on the memory domain queue only when pushing
The IOs don't need to be put on the io_memory_domain queue if there's
no need for a memory domain push. This makes push_data consistent with
the other memory domain operations (pull_data, pull_md, push_md).

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I85d95f6ce580a15b23f56ab5101e49236f341cb1
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17763
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-09 05:35:39 +00:00
Jim Harris
6a0d4e5ed8 nvmf: use iterator APIs in nvmf_tgt_destroy_cb
Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I27b1b851fc8f47150670636cb65ccba40d1a57d6
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17961
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Community-CI: Mellanox Build Bot
2023-05-08 13:50:02 +00:00
Jim Harris
820e7c59bf nvmf: refactor nvmf_tgt_destroy_cb
This preps for some upcoming patches and also
removes two levels of indentation.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I4f685c1e44ec4aa261e68af1786cfc110f451ed5
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17960
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-05-08 13:50:02 +00:00
Jim Harris
516639cf37 nvmf: use iterator APIs in nvmf_tgt_create_poll_group()
Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I4d9a5dd4655edb8315503e7551aec1926d1cc017
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17959
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-05-08 13:50:02 +00:00
Jim Harris
8d2e6b6711 nvmf: use iterator APIs to generate discovery log
Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: Iedd1c0a92e8b5f839ad4905d8063a04ec47f3d9b
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17938
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-05-08 13:50:02 +00:00
Jim Harris
5cf6cd5f1b examples/nvme: fix reconnect memory leaks
1) In submit_single_io(), if an I/O fails to submit,
   we need to free the associated task structure,
   otherwise it gets leaked.
2) When draining I/O, always call check_io() instead
   of doing so only when current_queue_depth > 0.
   This is the simplest way of ensuring that we clean up
   the ns_ctx (including freeing the IO qpairs and
   the qpair pointer array) if the current_queue_depth
   is already 0 when starting to drain.

Fixes issue #2995.
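A minimal sketch of the two fixes, using hypothetical names loosely modeled on perf's flow (not the actual perf code):

```c
#include <stdlib.h>

struct ns_ctx {
	int current_queue_depth;
	int cleaned_up;
};

struct perf_task {
	int dummy;
};

/* Fix 1: if submission fails, free the task instead of leaking it. */
static int
submit_single_io(struct ns_ctx *ctx, struct perf_task *task, int will_fail)
{
	if (will_fail) {
		free(task);
		return -1;
	}
	ctx->current_queue_depth++;
	return 0;
}

/* Completes outstanding I/O; frees per-namespace resources once idle. */
static void
check_io(struct ns_ctx *ctx)
{
	if (ctx->current_queue_depth > 0) {
		ctx->current_queue_depth--;
	}
	if (ctx->current_queue_depth == 0) {
		ctx->cleaned_up = 1;
	}
}

/* Fix 2: always call check_io() at least once while draining, so cleanup
 * happens even when current_queue_depth is already 0. */
static void
drain_io(struct ns_ctx *ctx)
{
	do {
		check_io(ctx);
	} while (ctx->current_queue_depth > 0);
}
```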

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I53f336c6a11ff63782dc81c087a58feca0e8a5d7
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17873
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
2023-05-08 13:38:35 +00:00
Jim Harris
3fefff7218 nvme: remove unnecessary initialization value
spdk_nvme_trid_populate_transport() initializes
trstring to an empty string, but that value
is never used - it always gets overwritten by
a different value before it gets used.

Found by scan-build.

Fixes issue #3003.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I2f5f9bedd39fc540df758ad3e6719ba992552896
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17872
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-05-08 13:38:35 +00:00
Jim Harris
7ad55b80fa nvme: remove deprecated spdk_nvme_ctrlr_prepare_for_reset()
Note that the prepare_for_reset flag in spdk_nvme_ctrlr is
still needed - it is now just set in the nvme_ctrlr_disconnect
path instead of by this deprecated and now removed API.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I0a6aa1c72767eb67a84b8928a986e06cbac88240
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17936
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-05-08 13:31:09 +00:00
Jim Harris
3616be85f2 examples/nvme/perf: connect io qpairs asynchronously
This significantly speeds up testing with high connection-count
workloads (i.e. -P 64), especially with TCP. We already
set async_mode=true all of the time for the bdev/nvme
module, so there's no reason we shouldn't do it in
perf too.

After allocating all of the IO qpairs, busy poll the
poll group, using the new spdk_nvme_poll_group_all_connected()
API to ensure the qpairs are all connected before proceeding
with I/O.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: If0c3c944cd5f3d87170a5bbf7d766ac1a4dcef7c
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17578
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-05-08 10:12:21 +00:00
Jim Harris
366aabdf69 nvme: add spdk_nvme_poll_group_all_connected
Performance tools such as nvme-perf may want to
create lots of qpairs to measure scaling, and then
set async_mode = true to amortize the
connection cost across the group of connections.

But we don't want connections still connecting
in the background while we are doing I/O. So add
a new API, spdk_nvme_poll_group_all_connected(), to
check whether all of the qpairs are connected.
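The intended "busy-poll until connected" pattern can be sketched with a toy model (illustrative names only, not the SPDK API):

```c
#include <stddef.h>

struct toy_qpair {
	int connected;
};

struct toy_poll_group {
	struct toy_qpair *qpairs;
	size_t num_qpairs;
};

/* Each poll call makes connection progress on one pending qpair. */
static int
toy_poll_group_process(struct toy_poll_group *pg)
{
	for (size_t i = 0; i < pg->num_qpairs; i++) {
		if (!pg->qpairs[i].connected) {
			pg->qpairs[i].connected = 1;
			return 1;
		}
	}
	return 0;
}

/* Returns 1 only once every qpair has finished connecting. */
static int
toy_poll_group_all_connected(struct toy_poll_group *pg)
{
	for (size_t i = 0; i < pg->num_qpairs; i++) {
		if (!pg->qpairs[i].connected) {
			return 0;
		}
	}
	return 1;
}

/* Busy-poll until all async connections complete, then start I/O. */
static void
toy_wait_all_connected(struct toy_poll_group *pg)
{
	while (!toy_poll_group_all_connected(pg)) {
		toy_poll_group_process(pg);
	}
}
```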

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I109f9ee96b6d6d3263e20dc2d3b3e11a475d246d
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17637
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
2023-05-08 10:12:21 +00:00
Jim Harris
be79373a97 fio: set FIO_DISKLESSIO flag for spdk engines
This tells fio not to try to use POSIX calls on
the "files" specified for an SPDK engine.

Note that without the DISKLESSIO option set, fio would
figure out that "*" wasn't a real file. With this
option set, we now need to explicitly set its
real_file_size to 0 to tell fio to ignore it.

Found by Karol Latecki - he noticed that when
specifying lvols in the form "lvs/lvol",
fio would create an empty "lvs" directory. Adding
this flag prevents problems like this from happening.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I5d457631b122ba5eb480813ab9d8aa6578f38277
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17937
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-05-05 23:11:57 +00:00
Amir Haroush
9c274912d0 bdev/ocf: fix possible memory leak in ctx_data_alloc
Signed-off-by: Amir Haroush <amir.haroush@huawei.com>
Signed-off-by: Shai Fultheim <shai.fultheim@huawei.com>
Change-Id: I8b33e62bd6e0f297e6fc325942c501100855fd6c
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17939
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Community-CI: Mellanox Build Bot
2023-05-05 07:55:06 +00:00
Shuhei Matsumoto
559a97aa7c bdev/nvme: Change if->else to if->return for failover_trid()
This refactoring will significantly reduce the size of the next patch.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I2eb7ec62e6c559d9e69334e73de49e8bf97a35dd
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17652
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-05-05 00:10:11 +00:00
Shuhei Matsumoto
681a5aa459 bdev/nvme: Reset I/O disables retry when destroying I/O qpairs
As the RBD bdev module does, the upper layer wants the reset command
to abort or complete all I/Os submitted before it.

To satisfy this requirement, return the I/Os aborted by deleting the I/O
qpairs to the upper layer without retrying them. To do so, enable DNR on
the I/O qpairs. Since these qpairs are deleted and recreated anyway, we
do not have to disable DNR again afterwards.

No more I/O arrives during a reset I/O because the generic bdev layer
already blocks I/O submission. However, some I/Os may still be queued for
retry even after the I/O qpairs are deleted. Hence, abort all I/Os queued
for the bdev before completing the reset I/O.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I9830026ef5f2b9c28aee92e6ce4018ed8541c808
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16836
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-05-05 00:10:11 +00:00
Shuhei Matsumoto
49d3a5e47c nvme: The upper layer controls DNR dynamically for I/O aborts
When I/O error resiliency was added, most DNR parameters of the internal
APIs were cleared. However, in some cases, especially for the reset I/O
command, the upper layer wants the NVMe driver to return I/O errors
immediately without retrying, even if the upper layer enables I/O error
retry.

To satisfy such a requirement, add an abort_dnr variable to the
spdk_nvme_qpair structure and have the internal abort APIs use it. A
public API, spdk_nvme_qpair_set_abort_dnr(), can change abort_dnr
dynamically.

The public spdk_nvme_transport_ops structure is left unchanged to avoid
premature changes.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I486a1b3ad8411f9fa261a2bf3a45aea9da292e9c
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17099
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-05-05 00:10:11 +00:00
Shuhei Matsumoto
0ba9ba5c40 bdev/nvme: Reset I/O cancels reconnect timer and starts reconnection
Previously, if a reconnect timer was registered when a reset request
came in, the reset request failed with -EBUSY. This meant the reset
request was effectively queued for a long time, until the reconnect
timer expired.

When a reconnect timer is registered, a reset is not actually in
progress. Hence, a new reset request can cancel the reconnect timer and
start reconnection safely.

Add a unit test case to verify this change.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: Ied8dd0ad822d2fd6829d88cd56cb36bd4fad13f9
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16823
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-05-05 00:10:11 +00:00
Amir Haroush
6b79f76769 bdev/ocf: add bdev_ocf_reset_stats RPC
Signed-off-by: Amir Haroush <amir.haroush@huawei.com>
Signed-off-by: Shai Fultheim <shai.fultheim@huawei.com>
Change-Id: Ife91df62099e14d328a767b1bbb3ddd3ded57264
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17916
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-05-04 14:05:43 +00:00
Kamil Godzwon
e236311e39 test/autobuild: Update llvm_precompile function to handle newer CLANG versions
The llvm_precompile function checks the CLANG version available on the machine
using a bash regex and searches for fuzzer libraries in a path based on the full
CLANG version number (e.g. /usr/lib64/clang/15.0.3/...).

However, on the newest Fedora distribution the path has changed and the fuzzer
libraries couldn't be found: the CLANG libraries path now contains only the
major version number (/usr/lib64/clang/16).

To address this issue, the function has been updated to search for only the
major CLANG version number instead of the full version number. Instead of
clang_version, the function now uses clang_num, because every Fedora
distribution has a directory or symlink that points to the right CLANG version.

e.g. symlinks
/usr/lib64/clang/13 -> /usr/lib64/clang/13.0.1
/usr/lib64/clang/15 -> /usr/lib64/clang/15.0.3

or directory:
/usr/lib64/clang/16

Fixes #3000
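The major-version extraction can be sketched in bash as follows (illustrative; variable names mirror the commit's clang_version/clang_num, the version string is a stand-in for clang's actual output):

```shell
# Derive the major version ("clang_num") from a full clang version string
# and build the library search path from it.
clang_version="16.0.6"           # as reported by clang --version
clang_num="${clang_version%%.*}" # keep only the major version number
fuzzer_lib_dir="/usr/lib64/clang/${clang_num}"
echo "$fuzzer_lib_dir"
```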

Signed-off-by: Kamil Godzwon <kamilx.godzwon@intel.com>
Change-Id: Iaf0dedc2bb3956cf06796e2eb60a5fa6f492b780
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17907
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-04 14:04:04 +00:00
Amir Haroush
72e058bba3 test/setup: Fix dm_mount test for slow hosts
On some hosts it might take a second or two for the mapper device to
appear under /dev, in which case the test fails because we check for the
device immediately. Giving the device a chance to come up lets the test
pass.
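A sketch of the retry in bash (illustrative helper, not the test's actual code): poll for up to ~2 seconds instead of checking once.

```shell
# Return 0 as soon as the path exists, 1 if it never shows up.
wait_for_dev() {
	local path=$1 i
	for ((i = 0; i < 20; i++)); do
		[[ -e $path ]] && return 0
		sleep 0.1
	done
	return 1
}
```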

Signed-off-by: Amir Haroush <amir.haroush@huawei.com>
Signed-off-by: Shai Fultheim <shai.fultheim@huawei.com>
Change-Id: I996d84861333d29d5c9370a2c5a471e7962d91b1
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17912
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-05-04 14:02:08 +00:00
Mike Gerdts
6828ed1807 lvol: add spdk_lvol_is_degraded
This is mostly a wrapper around spdk_blob_is_degraded(), but it also
performs a NULL check on lvol->blob. Since an lvol without a blob cannot
perform IO, a NULL blob is reported as degraded (true).

The two callers of spdk_blob_is_degraded() in vbdev_lvol.c have been
updated to use spdk_lvol_is_degraded().
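The wrapper's shape can be sketched with simplified stand-in types (the real structs and functions live in SPDK's lvol/blob code):

```c
#include <stdbool.h>
#include <stddef.h>

/* Simplified stand-ins for SPDK's lvol and blob types. */
struct toy_blob {
	bool degraded;
};

struct toy_lvol {
	struct toy_blob *blob;
};

static bool
toy_blob_is_degraded(const struct toy_blob *blob)
{
	return blob->degraded;
}

/* An lvol without a blob cannot perform IO, so treat it as degraded. */
static bool
toy_lvol_is_degraded(const struct toy_lvol *lvol)
{
	if (lvol->blob == NULL) {
		return true;
	}
	return toy_blob_is_degraded(lvol->blob);
}
```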

Change-Id: I11dc682a26d971c8854aeab280c8199fced358c3
Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17896
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-05-03 17:42:53 +00:00
Mike Gerdts
08650f8629 lvol: lvol destruction race leads to null deref
As an lvolstore is being destroyed, _vbdev_lvs_remove() starts an
interation through the lvols to delete each one, ultimately leading to
the destruction of the lvolstore with a call to lvs_free(). The callback
passed to vbdev_lvs_destruct() is always called asynchronously via
spdk_io_device_unregister() in bs_free().

When the lvolstore resides on bdevs that perform async IO (i.e. most
bdevs other than malloc), this gives a small window when the lvol bdev
is not registered but a lookup with spdk_lvol_get_by_uuid() or
spdk_lvol_get_by_names() will succeed. If rpc_bdev_lvol_delete() runs
during this window, it can get a reference to an lvol that has just been
unregistered and lvol->blob may be NULL. This lvol is then passed to
vbdev_lvol_destroy().

Before this fix, vbdev_lvol_destroy() would call:

   spdk_blob_is_degraded(lvol->blob);

Which would then lead to a NULL pointer dereference, as
spdk_blob_is_degraded() assumes a valid blob is passed. While a NULL
check would avoid this particular problem, a NULL blob is not
necessarily caused by the condition described above. It would be better to
flag the lvolstore's destruction before returning from
vbdev_lvs_destruct() and use that flag to prevent operations on the
lvolstore that is being deleted. Such a flag already exists in the form
of 'lvs_bdev->req != NULL', but that is set too late to close this race.

This fix introduces lvs_bdev->removal_in_progress which is set prior to
returning from vbdev_lvs_unload() and vbdev_lvs_destruct(). It is
checked by vbdev_lvol_destroy() before trying to destroy the lvol.  Now,
any lvol destruction initiated by something other than
vbdev_lvs_destruct() while an lvolstore unload or destroy is in progress
will fail with -ENODEV.

Fixes issue: #2998
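The essence of the fix can be sketched with a toy model (illustrative names; the real flag lives on SPDK's lvs_bdev structure): set the removal flag before the unload/destruct path returns, and check it before destroying an lvol.

```c
#include <stdbool.h>
#include <errno.h>

struct toy_lvs_bdev {
	bool removal_in_progress;
};

/* Set prior to returning from unload/destruct, closing the window in
 * which an async unregister has started but the lvol is still findable. */
static void
toy_lvs_destruct_begin(struct toy_lvs_bdev *lvs_bdev)
{
	lvs_bdev->removal_in_progress = true;
}

/* Refuse lvol destruction while the lvolstore itself is going away. */
static int
toy_lvol_destroy(struct toy_lvs_bdev *lvs_bdev)
{
	if (lvs_bdev->removal_in_progress) {
		return -ENODEV;
	}
	return 0;
}
```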

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I4d861879097703b0d8e3180e6de7ad6898f340fd
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17891
Community-CI: Mellanox Build Bot
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-05-03 17:42:53 +00:00
Mike Gerdts
aee609e17c test/lvol: unlink aio files at start of test
This automatically cleans up aio files left over from earlier aborted
runs. This helps streamline development of new tests and should have no
impact on CI.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: Id65f60cdfc9969fda1dcdd17e60643ad87f45de7
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17898
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-05-03 17:42:53 +00:00
Amir Haroush
8b05f7bea6 bdev/ocf: add missing name to bdev_ocf_get_stats example
Signed-off-by: Amir Haroush <amir.haroush@huawei.com>
Signed-off-by: Shai Fultheim <shai.fultheim@huawei.com>
Change-Id: I538d5de79529fff3567e9fe89eb6739bf3f21e8c
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17917
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-05-03 17:42:39 +00:00
Mike Gerdts
f7cc6174ef blob: log blob ID as hex, again
This is a follow-up to commit f4dc558245,
which strove to log blob IDs as hex to make small blob IDs more
recognizable. That commit missed a few cases where the blob ID is still
logged as decimal.
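The formatting change amounts to using a hex conversion for the 64-bit ID, e.g. (illustrative helper, not SPDK's logging code):

```c
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

/* Format a blob ID as hex; small IDs such as 0x100000000 stay
 * recognizable, unlike their decimal form (4294967296). */
static void
format_blob_id(char *buf, size_t len, uint64_t blob_id)
{
	snprintf(buf, len, "blob 0x%" PRIx64, blob_id);
}
```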

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I75d1b5973ee7e812f7caf0e826d3edbcba126743
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17641
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
2023-05-03 17:42:02 +00:00
Amir Haroush
fdeb57c0a1 OCF: fix compilation dependencies
We don't have dependency files for the OCF sources/headers, so, for
example, if someone runs 'touch metadata_collision.h', nothing gets
recompiled. With this fix, all the relevant files are recompiled.

Signed-off-by: Amir Haroush <amir.haroush@huawei.com>
Signed-off-by: Shai Fultheim <shai.fultheim@huawei.com>
Change-Id: I35b1c1f80a60f4be59cdca95f68bbafc7a212774
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17914
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-03 17:41:37 +00:00
Amir Haroush
04bc3962ad markdownlint: set indent to 2 for rule MD007
The default indent is 3, so we must set it to 2,
as our md files are all indented with 2.
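Assuming the ruby markdownlint (`mdl`) style-file syntax, the override might look like this (illustrative fragment; the file name and surrounding rules are assumptions):

```ruby
# hypothetical excerpt of an mdl style file
all
rule 'MD007', :indent => 2   # unordered-list indentation of 2 spaces
```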

Signed-off-by: Amir Haroush <amir.haroush@huawei.com>
Signed-off-by: Shai Fultheim <shai.fultheim@huawei.com>
Change-Id: I76c501311b6a4443dc6fc655894487b762d67abb
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17913
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-03 17:41:17 +00:00
Sebastian Brzezinka
d11222e239 app/fuzz: discard randoms of insufficient length
LLVMFuzzerRunDriver does not allow specifying a minimum input length,
so return immediately when the data is insufficient.
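The guard can be sketched with an LLVMFuzzerTestOneInput-style callback (toy version; the minimum length and the nonzero "processed" return value are illustrative, not libFuzzer conventions):

```c
#include <stdint.h>
#include <stddef.h>

#define TOY_MIN_INPUT_LEN 8  /* illustrative minimum */

/* Since the driver cannot enforce a minimum input length itself,
 * discard too-short inputs up front. */
static int
toy_fuzz_one(const uint8_t *data, size_t size)
{
	(void)data;
	if (size < TOY_MIN_INPUT_LEN) {
		return 0; /* input discarded */
	}
	/* ... process the input ... */
	return 1; /* input processed (illustrative) */
}
```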

Signed-off-by: Sebastian Brzezinka <sebastian.brzezinka@intel.com>
Change-Id: I306e1774b17b04108f2454b2fdaadb4d912bd274
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17884
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-05-03 17:39:52 +00:00
Shuhei Matsumoto
479ad83ebe bdev: Use unified split logic for write_zeroes command fallback
The write_zeroes command fallback used its own split logic, and the
multiple writes were serialized.

Use the unified split logic for the write_zeroes command fallback as well.

This not only improves the performance but also simplifies the code.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I955870947ae036482871453b4870f06f6f7f947b
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17902
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
2023-05-03 16:01:48 +00:00
Shuhei Matsumoto
af92c28b9c bdev: Calculate max_write_zeroes once at bdev registration for fallback case
As with the copy command, calculating the max write_zeroes size for the
fallback case involves division and is costly. The result is constant
for each bdev, so we can calculate it only once and store it into
bdev->max_write_zeroes at bdev registration. However, in unit tests,
bdev->blocklen and bdev->md_len can be changed dynamically. Hence,
adjust bdev->max_write_zeroes for such changes.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I16e4980e7a283caa6c995a7dc61f7e77585d464e
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17911
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-03 16:01:48 +00:00
Shuhei Matsumoto
2dcaa3851f bdev: Fix max write_zeroes calculation for fallback case
ZERO_BUFFER_SIZE is in bytes, but it is easier to first calculate the
max write_zeroes in blocks and then take the minimum of that and
remaining_num_blocks, rather than converting remaining_num_blocks to
bytes. This also makes it possible to store the result into
bdev->max_write_zeroes for the fallback case.

This patch includes one small fix: as we recently fixed in
bdev_io_get_max_buf_len(), spdk_bdev_get_buf_align() - 1 is the correct
value for getting the aligned length.
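The block-based calculation can be sketched as follows (the buffer-size constant is an assumed illustrative value, not SPDK's actual ZERO_BUFFER_SIZE):

```c
#include <stdint.h>

#define TOY_ZERO_BUFFER_SIZE (64 * 1024) /* bytes; illustrative value */

static uint64_t
min_u64(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

/* Work in blocks first, then clamp to what remains to be written,
 * instead of converting the remaining block count to bytes. */
static uint64_t
max_write_zeroes_blocks(uint32_t blocklen, uint64_t remaining_num_blocks)
{
	uint64_t max_blocks = TOY_ZERO_BUFFER_SIZE / blocklen;

	return min_u64(max_blocks, remaining_num_blocks);
}
```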

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I104bc837c9eee1303664bfdb3559b0e840d6f0e5
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17910
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-05-03 16:01:48 +00:00
Shuhei Matsumoto
0c1df53e7a bdev: Copy command fallback supports split to make copy size unlimited
The generic bdev layer has a fallback mechanism for the copy command,
used when the backend bdev module does not support it. However, its max
size is limited. To remove that limitation, the fallback now supports
split by using the unified split logic rather than following the
write_zeroes command.

bdev_copy_should_split() and bdev_copy_split() use spdk_bdev_get_max_copy()
rather than referring to bdev->max_copy directly, to cover the fallback
case.

Then, spdk_bdev_copy_blocks() does the following:

If the copy size is large and should be split, use the generic split
logic regardless of whether copy is supported or not. Otherwise, if copy
is supported, send the copy request; if copy is not supported, emulate it
using regular read and write requests.

Add a unit test case to verify this addition.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: Iaf51db56bb4b95f99a0ea7a0237d8fa8ae039a54
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17073
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-03 16:01:48 +00:00
Shuhei Matsumoto
bf8f5afa44 bdev: Small clean up for copy command fallback
As a name suffix, _done has been used more often than _complete for
fallback function names, and a limit of 100 chars per line is implicitly
suggested.

Do both of these small cleanups in this patch.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: Id14dd3f09be8fd49b947b7a8f8b87108fb56c346
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17900
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
2023-05-03 16:01:48 +00:00
Shuhei Matsumoto
1ce7786f42 bdev: Calculate max_copy once at bdev registration for fallback case
Calculating the max copy size for the fallback case involves division
and is costly. The result is constant for each bdev, so we can calculate
it only once and store it into bdev->max_copy at bdev registration.
This calculation is almost the same as the max write_zeroes calculation
for the fallback case. To reuse it, the helper function is named
bdev_get_max_write() and takes a num_bytes parameter.
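A sketch of such a shared helper (the name mirrors the commit; the rounding-to-write-unit behavior is an assumption for illustration):

```c
#include <stdint.h>

/* Convert a byte budget into a block count, rounded down to a multiple
 * of the write unit size so that emitted writes stay aligned. */
static uint64_t
toy_get_max_write(uint64_t num_bytes, uint32_t blocklen, uint32_t write_unit_size)
{
	uint64_t max_blocks = num_bytes / blocklen;

	max_blocks -= max_blocks % write_unit_size;
	return max_blocks;
}
```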

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: Iac83a1f16b908d8b36b51d9c51782de40313b6c8
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17909
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
2023-05-03 16:01:48 +00:00
Shuhei Matsumoto
cec70a601f ut/bdev: Configure bdev size and iobuf for all test cases
The following patches will change spdk_bdev_register() to access iobuf
and the bdev's blocklen and blockcnt. Hence, we have to configure these
correctly for all test cases.

Move ut_init/fini_bdev() up in the file. Add missing ut_init/fini_bdev()
and allocate/free_bdev() calls for some test cases. Add blockcnt and
blocklen to allocate_vbdev().

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: Iccbb1cfe4dcdc4496f15304b5362d76d5296607f
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17908
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-05-03 16:01:48 +00:00
Shuhei Matsumoto
5bced73616 bdev: Fix spdk_bdev_get_max_copy() for fallback case
As we recently fixed in bdev_io_get_max_buf_len(),
spdk_bdev_get_buf_align() - 1 is the correct value for getting the
aligned length.

_bdev_get_block_size_with_md() handles both the interleaved metadata and
separate metadata cases, so it is simpler to use it here.

The copy command fallback uses the write command, so, as the write_zeroes
fallback does, it should take bdev->write_unit_size into account.

Fix all of these in this patch.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I88fe1b250289f2bab7b541523e8be931eeb8150c
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17899
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-05-03 16:01:48 +00:00
Mike Gerdts
c9f3613fcd thread: detect spinlocks that are not initialized
If spdk_spin_lock() is called on an uninitialized spinlock, it will
deadlock. This commit detects whether a lock is initialized and aborts
instead of deadlocking.
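The detection can be sketched with a simplified model (the real implementation lives in SPDK's thread library; the magic value and the bool return in place of abort() are illustrative):

```c
#include <pthread.h>
#include <stdbool.h>

#define TOY_SPINLOCK_INITIALIZED 0x5D1DEADBu /* illustrative magic value */

struct toy_spinlock {
	pthread_mutex_t mutex;
	unsigned int initialized; /* magic set by init, zero otherwise */
};

static void
toy_spin_init(struct toy_spinlock *lock)
{
	pthread_mutex_init(&lock->mutex, NULL);
	lock->initialized = TOY_SPINLOCK_INITIALIZED;
}

/* Returns false (where the real code would abort()) on an uninitialized
 * lock, instead of deadlocking on garbage mutex state. */
static bool
toy_spin_lock(struct toy_spinlock *lock)
{
	if (lock->initialized != TOY_SPINLOCK_INITIALIZED) {
		return false;
	}
	pthread_mutex_lock(&lock->mutex);
	return true;
}
```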

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: Ie7497633091edd4127c06ca0530e9a1dff530d1b
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16002
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-05-02 22:32:01 +00:00
Mike Gerdts
3d9395c69e thread: spinlock aborts print stack traces
Debug builds have information about when each spinlock was initialized,
last locked and last unlocked. This commit logs that information when
a spinlock operation aborts.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I11232f4000f04d222dcaaed44c46303b7ea6cf6b
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16001
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-02 22:32:01 +00:00
Mike Gerdts
adc2ca50e9 scripts: gdb needs a pretty printer for spinlocks
In debug builds, SPDK spinlocks will have stack traces that track where
they were allocated, last locked, and last unlocked. This adds gdb
pretty printers to make that information easily visible. See the updates
in doc/gdb_macros.md for details.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I4f903c588d9384c4005eec01348fa5c2d3cab5db
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16000
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-05-02 22:32:01 +00:00
Mike Gerdts
531258aa51 thread: get debug stack traces on spinlocks
To help debug spinlocks, capture stack traces as spinlocks are used.
Future commits in this series will make debugging with these stack
traces easier.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I597b730ca771ea3c5b831f5ba4058d359215f7f6
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15998
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-05-02 22:32:01 +00:00
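Capturing a call stack per lock operation can be sketched with glibc's
backtrace(); the struct and frame limit here are illustrative, not the
commit's actual implementation.

```c
#include <assert.h>
#include <execinfo.h>

#define DEMO_MAX_FRAMES 32

/* One recorded call site; debug builds would keep one of these per
 * init/last-lock/last-unlock event. */
struct demo_stack_trace {
	void *frames[DEMO_MAX_FRAMES];
	int depth;
};

/* Record the current call stack.  backtrace() fills the array with
 * return addresses and reports how many frames it captured. */
static void demo_capture_trace(struct demo_stack_trace *t)
{
	t->depth = backtrace(t->frames, DEMO_MAX_FRAMES);
}
```

The saved addresses can later be symbolized (e.g. with backtrace_symbols())
when a spinlock operation aborts.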
Amir Haroush
268078c128 CHANGELOG: remove OCF deprecation notice as Huawei takes ownership
Signed-off-by: Amir Haroush <amir.haroush@huawei.com>
Signed-off-by: Shai Fultheim <shai.fultheim@huawei.com>
Change-Id: I81a5445320d90e2ece1c8154508c2739a6a82444
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17895
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-05-02 22:31:16 +00:00
Amir Haroush
b4d441fd22 Revert "deprecation: remove Open CAS Framework"
This reverts commit 32908cbfc8.

The OCF deprecation notice has been removed, as
Huawei is picking up support for the OCF project.

Signed-off-by: Amir Haroush <amir.haroush@huawei.com>
Signed-off-by: Shai Fultheim <shai.fultheim@huawei.com>
Change-Id: I007e80bc74dc50cfa9b8cde97fc6fdc9608d7ebd
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17894
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-05-02 22:31:16 +00:00
Amir Haroush
10db58ef77 Revert "ocf: clarify deprecation notice"
This reverts commit c5224a96ae.

The OCF deprecation notice has been removed, as
Huawei is picking up support for the OCF project.

Signed-off-by: Amir Haroush <amir.haroush@huawei.com>
Signed-off-by: Shai Fultheim <shai.fultheim@huawei.com>
Change-Id: I80ebfe75eaa1a9b96249ed578fcaff6e9576928f
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17893
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
2023-05-02 22:31:16 +00:00
Mike Gerdts
ff12a5ed6a bdev_gpt: use unique partition GUID as bdev UUID
In releases of SPDK prior to v23.01, GPT bdevs had a random UUID. This
ended with commit a1c7ae2d3f, which is OK
because a non-persistent UUID is not all that useful.

Per Table 5.6 in Section 5.3.3 of UEFI Spec 2.3, each partition has a
16-byte UniquePartitionGUID:

  GUID that is unique for every partition entry. Every partition ever
  created will have a unique GUID. This GUID must be assigned when the
  GPT Partition Entry is created.  The GPT Partition Entry is created
  whenever the NumberOfPartitionEntries in the GPT Header is increased
  to include a larger range of addresses.

With this change, GPT bdevs use this unique partition GUID as the bdev's
UUID.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: Id8e8aa9e7903d31f199e8cfdb487e45ce1524d7b
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17351
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
2023-05-02 18:59:58 +00:00
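Per the UEFI layout quoted above, UniquePartitionGUID sits at offset 16 of the
128-byte GPT partition entry. A minimal sketch of that layout (field names
follow the spec; the struct itself is illustrative, not SPDK's definition):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* 128-byte GPT Partition Entry per UEFI Spec 2.3, Table 5.6. */
#pragma pack(push, 1)
struct demo_gpt_partition_entry {
	uint8_t  partition_type_guid[16];   /* offset 0  */
	uint8_t  unique_partition_guid[16]; /* offset 16: used as bdev UUID */
	uint64_t starting_lba;              /* offset 32 */
	uint64_t ending_lba;                /* offset 40 */
	uint64_t attributes;                /* offset 48 */
	uint16_t partition_name[36];        /* offset 56: UTF-16LE name */
};
#pragma pack(pop)
```

The 16 bytes at offset 16 are copied into the bdev's UUID, giving GPT bdevs a
persistent identity across restarts.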
Mike Gerdts
54db60cdb3 bdev_part: allow UUID to be specified
This introduces spdk_bdev_part_construct_ext(), which takes an options
structure as an optional parameter. The options structure has one
option: uuid.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I5e9fdc8e88b78b303e60a0e721d7a74854ac37a9
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17835
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-05-02 18:59:58 +00:00
Alexey Marchuk
a347d3e747 accel/dpdk_cryptodev: Fix use of uninitialized variable
rc might not be initialized, so it was incorrect to
use it in this place.

Fixes 6b7cca1542 accel/dpdk_cryptodev: Handle OP_STATUS_SUCCESS

Signed-off-by: Alexey Marchuk <alexeymar@nvidia.com>
Change-Id: Ifd2b3032afd6830bd851adb61f68ae4fa9621d33
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17656
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-05-02 18:50:41 +00:00
Alexey Marchuk
d7b2f5b96e bdev/crypto: Put accel buffer when write completes
The accel buffer is released when the encrypt operation
completes, but that doesn't mean that the base
bdev has finished writing the encrypted data. As a result,
the accel buffer might be reused by another IO, which
leads to data corruption.

Signed-off-by: Alexey Marchuk <alexeymar@nvidia.com>
Change-Id: I1acf7c30da2f92989ecc44e96b00f7609058ec5a
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17655
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
2023-05-02 18:50:41 +00:00
Konrad Sztyber
599aee6003 bdev: add extra function when pushing bounce data
This is done in preparation for retrying IOs on ENOMEM when pushing
bounce data.  Also, rename md_buffer to md_buf to keep the naming
consistent with other code which uses this abbreviation.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I014f178a45a2a751ecca40d119f45bf323f37d0c
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17762
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-05-02 18:48:27 +00:00
Konrad Sztyber
28bcf6a760 bdev: retry IOs on ENOMEM from pull/append_copy
The IOs will now be retried after ENOMEM is received when doing memory
domain pull or appending an accel copy.  The retries are performed using
the mechanism that's already in place for IOs completed with
SPDK_BDEV_IO_STATUS_NOMEM.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I284643bf9971338094e14617974f7511f745f24e
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17761
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-02 18:48:27 +00:00
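The retry mechanism above, combined with the io_outstanding accounting in the
next commit, can be sketched as a tiny bookkeeping model: an ENOMEM submission
is queued, and completions drain it once outstanding IOs fall to the
threshold. All names and limits here are hypothetical, not SPDK's actual bdev
channel fields.

```c
#include <assert.h>
#include <errno.h>
#include <stdint.h>

/* Illustrative channel state; not spdk_bdev_channel. */
struct demo_channel {
	uint32_t io_outstanding;   /* includes pull/push/seq-finish ops */
	uint32_t nomem_queued;     /* IOs waiting to be retried */
	uint32_t nomem_threshold;  /* retry once outstanding drops here */
	uint32_t device_limit;     /* resources the "device" can hold */
};

/* Returns 0 on success, -ENOMEM if the IO was queued for retry. */
static int demo_submit(struct demo_channel *ch)
{
	if (ch->io_outstanding >= ch->device_limit) {
		ch->nomem_queued++;
		return -ENOMEM;
	}
	ch->io_outstanding++;
	return 0;
}

/* On completion, resubmit queued IOs once enough IOs have drained. */
static void demo_complete(struct demo_channel *ch)
{
	ch->io_outstanding--;
	while (ch->nomem_queued > 0 &&
	       ch->io_outstanding <= ch->nomem_threshold &&
	       ch->io_outstanding < ch->device_limit) {
		ch->nomem_queued--;
		ch->io_outstanding++;
	}
}
```

Counting pull/push/sequence-finish operations in io_outstanding is what makes
the threshold comparison meaningful for those ENOMEM sources too.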
Konrad Sztyber
7952ef88e0 bdev: count push/pull/seq_finish as io_outstanding
The IOs with an outstanding memory domain push/pull or accel sequence
finish operation are now added to the io_outstanding counter.  It'll be
necessary to correctly calculate nomem_threshold when handling ENOMEM
from those operations.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Ice1fb94f1c9054a3a96312a0960ac5085d0b21bc
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17760
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-05-02 18:48:27 +00:00
Konrad Sztyber
6ed8bdf7d7 bdev: remove leading underscore from _bdev_io_(inc|dec)rement_outstanding
A leading underscore usually indicates a function that provides the
actual implementation for something that's called from some other
wrapper function without the leading underscore.  That is not the case
for these functions, so this patch removes the leading underscores.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I6e1186b156116249ee53a3845ae99ba87db5122b
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17868
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-05-02 18:48:27 +00:00
Konrad Sztyber
7cb6475ab1 bdev: add _bdev_io_increment_outstanding()
In the next patches we'll need to increment the io_outstanding from a
few more places, so it'll be good to have a dedicated function for that.
Also, move _bdev_io_decrement_outstanding() up, so that both functions
are near each other.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I1af5dbe288f7f701c8ba5e85406f02330ae21a39
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17759
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
2023-05-02 18:48:27 +00:00
Konrad Sztyber
7c528fdbe1 bdev: add common sequence finish callback
There are some common operations that need to be done each time a
sequence is executed (and more will be added in the following patches),
so it makes sense to have a common callback. data_transfer_cpl is used
for executing user's callbacks since it's unused at this point.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I4570acbdbe158512d13c31c0ee0c7bb7bf62d18c
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17678
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-02 18:48:27 +00:00
Konrad Sztyber
d704e6a025 bdev: keep IOs on the io_memory_domain queue during pull/push
The IOs are now kept on the io_memory_domain queue only if they have an
outstanding pull/push operation.  It'll make it easier to support
retrying pull/push in case of ENOMEM.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: If5a54fac532206ee8472bacf364a5ef6cde8edea
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17677
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-02 18:48:27 +00:00
Konrad Sztyber
168bc2673e bdev: allow different ways of handling nomem IOs
This is a preparation for reusing the code handling nomem_io for
other types of NOMEM errors (e.g. from pull/push/append_copy).  This
patch doesn't actually change anything functionally - only IOs completed
by a module with SPDK_BDEV_IO_STATUS_NOMEM status are retried.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I12ecb2efcf2d2cdf75b302f9f766b4c16ac99f3e
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17676
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
2023-05-02 18:48:27 +00:00
Konrad Sztyber
252aea5fad bdev: move adding IOs to the nomem_io queue to functions
Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I0da93c55371652c5725da6cf602fa40391670da3
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17867
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-02 18:48:27 +00:00
Konrad Sztyber
f6339ffdb7 bdev: push bounce data only for successful IOs
The actual memory domain push already only happened for successfully
completed requests, but the code would still go through
_bdev_io_push_bounce_data_buffer(), which could cause issues for IOs
completed with NOMEM, because the bounce buffer would be released in
_bdev_io_complete_push_bounce_done().

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Id1af1e31cb416e91bf11101a5ce7919530245e1e
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17866
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-02 18:48:27 +00:00
Konrad Sztyber
13b801bf37 bdev: use parent_io when executing sequence for split IOs
The sequence is associated with the parent IO, so that's the IO that
should be used when executing a sequence.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Ifcdb06094b38a5eaee1691e5aa8de1c8dc9d01a6
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17865
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-02 18:48:27 +00:00
Konrad Sztyber
f20fbfe65b bdev: move pulling md_buf to a function
Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I935983a14bedc386ffe31abacc8fa200cd79f750
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17675
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-05-02 18:48:27 +00:00
Konrad Sztyber
72a6fff8bb bdev: move pulling data to bounce buffer to a function
Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Idbabcd5bd812cede6f5159ba0691b2dc28a4022a
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17674
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-02 18:48:27 +00:00
Konrad Sztyber
eb8f9bbc99 bdev: move resubmitting nomem IOs to a function
Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I9f91af30ee1dd5f2568d9f76a30f00497aff6bbc
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17673
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-05-02 18:48:27 +00:00
Jim Harris
62c399ffaf test: clarify GPT related comment in blockdev.sh
After we create the GPT, we change the partition type
GUID to the associated SPDK value.  The current
comment just says "change the GUID" which is
ambiguous because there are multiple GUIDs associated
with each partition.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: Id821c5c5bbd7a72d84d5ddf4d91d633307f2235b
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17855
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-05-02 18:46:00 +00:00
Jim Harris
2e56512236 nvmf: fix comparison in nvmf_stop_listen_disconnect_qpairs
This function disconnects any qpairs that match both
the listen trid and the subsystem pointer.  If the
specified subsystem is NULL, it will just disconnect
all qpairs matching the listen trid.

But there are cases where a qpair doesn't yet have an
associated subsystem - for example, before a CONNECT
is received.

Currently we would always disconnect such a qpair, even
if a subsystem pointer is passed.  Presumably this check
was added to ensure we don't dereference qpair->ctrlr
when it is NULL, but it was added incorrectly.

Also while here, move and improve the comment about
skipping qpairs.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I8b7988b22799de2a069be692f4a5b4da59c2bad4
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17854
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
2023-05-02 18:45:32 +00:00
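The corrected matching behavior described above can be distilled into a small
predicate. This is a hypothetical sketch of the decision logic, not the actual
nvmf_stop_listen_disconnect_qpairs code.

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

/* Decide whether a qpair on a matching listen trid should be
 * disconnected.  qpair_subsys is NULL before a CONNECT is received. */
static bool demo_should_disconnect(bool trid_match, const void *subsystem,
				   const void *qpair_subsys)
{
	if (!trid_match) {
		return false;
	}
	if (subsystem == NULL) {
		/* No subsystem filter: drop every qpair on this trid. */
		return true;
	}
	if (qpair_subsys == NULL) {
		/* No CONNECT yet, so the qpair's subsystem is unknown;
		 * skip it instead of always disconnecting (the old bug). */
		return false;
	}
	return subsystem == qpair_subsys;
}
```

The fix is the third branch: a pre-CONNECT qpair is left alone when a specific
subsystem was requested.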
Konrad Sztyber
4c7b504620 accel_perf: add shutdown callback
Otherwise, it's impossible to stop the app before its run time expires,
because the accel library waits until its IO channels are released,
which would only happen at the end.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I7438b474f4f6d6bcb4bf6aad02ccae9f511f1b51
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17768
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-05-02 18:44:12 +00:00
Jim Harris
7c30df4ece usdt: add SPDK_DTRACE_PROBE variants that don't collect ticks
While userspace probes have a high overhead when enabled due
to the trap, it is still cleaner and slightly more efficient
to not have all of the SPDK_DTRACE_PROBE macros implicitly
capture the tsc counter as an argument.

So rename the existing SPDK_DTRACE_PROBE macros to
SPDK_DTRACE_PROBE_TICKS, and create new SPDK_DTRACE_PROBE
macros without the implicit ticks argument.

Note this does cause slight breakage if there is any
out-of-tree code that was using SPDK_DTRACE_PROBE previously;
programs written against those probes would need to
adjust their arguments.  But the likelihood of such code
existing is practically nil, so I'm just renaming the
macros to their ideal state.

All of the nvmf SPDK_DTRACE_PROBE calls are changed to
use the new _TICKS variants.  The event one is left
without _TICKS - we have no in-tree scripts that use
the tsc for that event.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: Icb965b7b8f13c23d671263326029acb88c82d9df
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17669
Community-CI: Mellanox Build Bot
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Mike Gerdts <mgerdts@nvidia.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
2023-05-02 18:43:44 +00:00
Alexey Marchuk
4045068a32 lib/nvmf: Defer port removal while qpairs exist in poll group
The following heap-use-after-free may happen when RDMA listener
is removed:
1. At least 2 listeners exist, at least 1 qpair is created
on each listening port
2. Listener A is removed; in nvmf_stop_listen_disconnect_qpairs
we iterate over all qpairs (let's say A1 and B1) and check whether each
qpair's source trid matches the listener's trid by calling
nvmf_transport_qpair_get_listen_trid. The trid is retrieved from
qpair->listen_id, which points to listener A's cmid. Assume that
qpair A1's trid matches, so A1 starts the disconnect process
3. After iterating all qpairs on step 2 we switch to the next
IO channel and then complete port removal on RDMA transport
layer where we destroy cmid of the listener A
4. Qpair A1 still has IO submitted to the bdev, so its destruction is
postponed
5. Listener B is removed; in nvmf_stop_listen_disconnect_qpairs
we iterate over all qpairs (A1 and B1) and try to check A1's listen
trid. But listener A is already destroyed, so the RDMA qpair->listen_id
points to a freed memory chunk

To fix this issue, nvmf_stop_listen_disconnect_qpairs was modified
to ensure that no qpairs with listen_trid == removed_trid exist
before destroying the listener.

Fixes issue #2948

Signed-off-by: Alexey Marchuk <alexeymar@nvidia.com>
Change-Id: Iba263981ff02726f0c850bea90264118289e500c
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17287
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-02 18:42:44 +00:00
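The fix amounts to deferring destruction of the listener until the last qpair
still referencing its listen trid is gone. A reference-counted sketch of that
pattern (all names hypothetical, not the RDMA transport's actual code):

```c
#include <assert.h>
#include <stdbool.h>

struct demo_listener {
	int qpair_refs;   /* qpairs whose listen trid matches */
	bool removing;    /* stop_listen was requested */
	bool destroyed;   /* cmid torn down; listen_id would now dangle */
};

/* Request removal: destroy immediately only if no qpair still holds
 * a reference to this listener's cmid. */
static void demo_stop_listen(struct demo_listener *l)
{
	l->removing = true;
	if (l->qpair_refs == 0) {
		l->destroyed = true;
	}
}

/* The last matching qpair going away completes the deferred removal. */
static void demo_qpair_fini(struct demo_listener *l)
{
	if (--l->qpair_refs == 0 && l->removing) {
		l->destroyed = true;
	}
}
```

With this ordering, step 5 of the scenario above can never read a freed
listen_id, because the cmid outlives every qpair that points at it.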
Ankit Kumar
86bc8a5f13 nvme/fio_plugin: add fdp support to fio plugin
This adds support for FDP device described by TP4146.

spdk_fio_fdp_fetch_ruhs() fetches the reclaim unit handle
descriptors, used by fio for placement identifiers. This function
also informs fio whether device has fdp capability or not.

spdk_fio_queue() has been modified to submit writes with
extended IO arguments. This can only work if sgl is enabled.

Note, a guard FIO_HAS_FDP checks for the required io-engine ops
version.

Change-Id: I91d0d02d3147357a66a831ef9fb82e6b7250be3d
Signed-off-by: Ankit Kumar <ankit.kumar@samsung.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17605
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-02 18:39:34 +00:00
Mike Gerdts
76f4b77726 lvol: esnap clones must end on cluster boundary
When regular lvols are created, their size is rounded up to the next
cluster boundary. This is not acceptable for esnap clones, as it means
the clone may silently be grown larger than its external snapshot. This
can cause a variety of problems for the consumer of an esnap clone lvol.

While the better long-term solution is to allow lvol sizes to fall on
any block boundary, the implementation of that needs to be surprisingly
complex to support creation and deletion of snapshots and clones of
esnap clones, inflation, and backward compatibility.

For now, it is best to put in a restriction on the esnap clone size
during creation so as to not hit problems long after creation. Since
lvols are generally expected to be large relative to the cluster size,
it is somewhat unlikely that this restriction will be a significant
limitation.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: Id7a628f852a40c8ec2b7146504183943d723deba
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17607
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-02 18:32:19 +00:00
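The size rules above reduce to: regular lvols round up to the next cluster
boundary, while esnap clones must land exactly on one. A hedged sketch of
those checks (helper names are made up for illustration):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Regular lvol: round the requested size up to whole clusters. */
static uint64_t demo_round_up_clusters(uint64_t size, uint64_t cluster_sz)
{
	return (size + cluster_sz - 1) / cluster_sz * cluster_sz;
}

/* Esnap clone: rounding up would silently grow the clone past its
 * external snapshot, so the size must already be cluster-aligned. */
static bool demo_esnap_size_valid(uint64_t size, uint64_t cluster_sz)
{
	return size > 0 && size % cluster_sz == 0;
}
```

Creation of an esnap clone whose size fails the alignment check is rejected
up front rather than producing a clone larger than its snapshot.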
Mike Gerdts
54d4e7a631 vbdev_lvol: esnap memdomain support
Return the total number of memory domains supported by the blobstore and
any external snapshot bdev.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I2f8afba6b31e689b8f942e2cf36906a0a30f38c8
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16430
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-05-02 18:32:19 +00:00
Mateusz Kozlowski
ca0c4dcde8 lib/ftl: Give correct type for seq_id variables/return types
Change-Id: I7d2fd31620481cf66f5f4400e6de4fc736ee3dad
Signed-off-by: Mateusz Kozlowski <mateusz.kozlowski@solidigm.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17608
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-28 09:48:18 +00:00
Anil Veerabhadrappa
831773b220 nvmf/fc: delegate memory object free to LLD
The 'args' object in nvmf_fc_adm_evnt_i_t_delete() is actually allocated
in the FC LLD driver and passed to nvmf/fc in the
nvmf_fc_main_enqueue_event() call, so it should be freed in the LLD's
callback function.

Change-Id: I04eb0510ad7dd4bef53fc4e0f299f7226b303748
Signed-off-by: Anil Veerabhadrappa <anil.veerabhadrappa@broadcom.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17836
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-04-28 09:08:48 +00:00
Michal Berger
abf212ea6a test/spdkcli: Wait until spdkcli_clear_config settled
Similarly to https://review.spdk.io/gerrit/c/spdk/spdk/+/17559, this
part may also be affected by:

Error in cmd: load_config /root/spdk/test/spdkcli/config.json (
load_config /root/spdk/test/spdkcli/config.json
request:
{
  "action_on_timeout": "none",
  "timeout_us": 0,
  "timeout_admin_us": 0,
  "keep_alive_timeout_ms": 10000,
  "transport_retry_count": 4,
  "arbitration_burst": 0,
  "low_priority_weight": 0,
  "medium_priority_weight": 0,
  "high_priority_weight": 0,
  "nvme_adminq_poll_period_us": 10000,
  "nvme_ioq_poll_period_us": 0,
  "io_queue_requests": 1024,
  "delay_cmd_submit": true,
  "bdev_retry_count": 3,
  "transport_ack_timeout": 0,
  "ctrlr_loss_timeout_sec": 0,
  "reconnect_delay_sec": 0,
  "fast_io_fail_timeout_sec": 0,
  "generate_uuids": false,
  "transport_tos": 0,
  "io_path_stat": false,
  "method": "bdev_nvme_set_options",
  "req_id": 31
}
Got JSON-RPC error response
response:
{
  "code": -1,
  "message": "RPC not permitted with nvme controllers already attached"
}

so make sure the nvme controller was fully detached.

Change-Id: Iaebed7b640d96fbabfc694dfebc5a725902caad2
Signed-off-by: Michal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17850
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-28 09:05:36 +00:00
Michal Berger
18dfb63389 test/nvme: Lock FDP test to FDP-capable nvme only
Change-Id: I199394c54914c99448153134ceb6e67a9c003f94
Signed-off-by: Michal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17773
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-28 09:03:36 +00:00
Michal Berger
258b7fbff2 test/nvme: Add helper functions to detect FDP-capable nvme
Change-Id: I817ffbfcb3bca154cad86ca70465a923610cbabb
Signed-off-by: Michal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17772
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-28 09:03:36 +00:00
Michal Berger
29ba5b1b43 scripts/vagrant: Add support for configuring FDP per nvme
Change-Id: Id647a02b82f7ede25496bbbbc420ef7d13d8f9af
Signed-off-by: Michal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17771
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-28 09:03:36 +00:00
Michal Berger
8b8a7a80f6 pkgdep/git: Bump QEMU to latest 8.0.0 release
Needed for testing latest FDP feature.

Change-Id: I83a0b46c716d6658efa4f2723c4c40e617f40cf7
Signed-off-by: Michal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17770
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-28 09:03:36 +00:00
Michal Berger
e9d44aa39b scripts/vagrant: Remove support for spdk-5.0.0 fork ns config
But leave the shortcut for configuring nvme with a single namespace.

Change-Id: I0e5745db481b24ab813ec1e98426d709cde216fd
Signed-off-by: Michal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17769
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Pawel Piatek <pawelx.piatek@intel.com>
2023-04-28 09:03:36 +00:00
Michal Berger
fa3f818b4e tests: Skip block devices marked as hidden
These devices don't come with their major:minor dev, hence they won't
pop up under /dev, i.e. they are not really usable.

Change-Id: I49b39ccbedcdd1bfe37964819e15b769af22cab6
Signed-off-by: Michal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17774
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-04-28 09:03:36 +00:00
Ankit Kumar
c976353be8 test/nvme: add test application to verify fdp functionality
TP4146 introduced support for flexible data placement, which is
a data placement directive.

This application will test the new I/O management commands,
write with directives, log pages and set/get features.

Signed-off-by: Ankit Kumar <ankit.kumar@samsung.com>
Change-Id: I2d68625d9a180afb5a6e85e59738c2713ce965a8
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16521
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Michal Berger <michal.berger@intel.com>
2023-04-28 09:03:36 +00:00
Richael Zhuang
df4600f4c9 nvme_tcp: fix memory leak when resetting controller
In the failover test, a memory leak of tqpair->stats is reported when
detaching a TCP controller and failing over to the other controller.

During a controller reset, we first disconnect the controller
and then reconnect. When disconnecting, the adminq is not
freed, which means the corresponding tqpair and tqpair->stats are not
freed. But when reconnecting, nvme_tcp_ctrlr_connect_qpair
allocates memory for tqpair->stats again, which causes the memory leak.

So this patch fixes the bug by not reallocating memory for
tqpair->stats if it's not NULL. We keep the old stats because, from the
user's perspective, the spdk_nvme_qpair is the same one.

Besides, when destroying a qpair, qpair->poll_group is set to
NULL, which means that if qpair->poll_group is not NULL, it must be a
new qpair. So there's no need to check whether stats is NULL when
qpair->poll_group is not NULL; the if...else... in
_nvme_pcie_ctrlr_create_io_qpair is adjusted accordingly.

Change-Id: I4108a980aeffe4797e5bca5b1a8ea89f7457162b
Signed-off-by: Richael Zhuang <richael.zhuang@arm.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17718
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-27 11:00:03 +00:00
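The leak fix boils down to allocating the stats only on first connect and
reusing them across reconnects. An illustrative sketch (types and fields are
hypothetical, not the nvme_tcp driver's actual structures):

```c
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

struct demo_qpair_stats {
	uint64_t polls;
};

struct demo_tqpair {
	struct demo_qpair_stats *stats;
};

/* Connect path: reallocating stats on every reconnect leaked the old
 * block.  Reuse it if it already exists, since the spdk_nvme_qpair is
 * still the same object from the user's perspective. */
static int demo_tqpair_connect(struct demo_tqpair *t)
{
	if (t->stats == NULL) {
		t->stats = calloc(1, sizeof(*t->stats));
		if (t->stats == NULL) {
			return -ENOMEM;
		}
	}
	return 0;
}
```

A reconnect after a disconnect-without-free now keeps both the allocation and
the accumulated counters.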
Ziye Yang
cb97b86081 env_dpdk: optimizing spdk_call_unaffinitized
Purpose: Reduce unnecessary affinity setting.

For some use cases, the app does not use the SPDK
framework and already calls spdk_unaffinitize_thread()
after calling spdk_env_init().

Change-Id: I5fa8349913c4567ab63c5a01271e7b2755e53257
Signed-off-by: Ziye Yang <ziye.yang@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17720
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
2023-04-27 09:38:49 +00:00
Ziye Yang
c441e27023 bdev/rbd: Do not submit IOs through thread sending.
Currently, we send IOs to the main_td thread. This is not
needed, because all the read/write functions provided by librbd are
thread safe, so we can eliminate the thread-send-messaging policy for
read/write related functions.

And with this patch, users can observe the load balance
distribution of I/Os on each CPU core owned by spdk applications
through spdk_top tool.

In this patch, we did the following work:

1 Move rbd_open when create the bdev since we will create once.
2 Simplify the channel management.
3 Do not use thread send messaging to do the read/write I/Os.

According to our experiment results showed in
https://github.com/spdk/spdk/issues/2204

There will be more than 15% performance improvment in IOPS aspect
for different write I/O patterns, and it also addresses the I/O Load
balance issues.

Fixes issue: #2204

Change-Id: I9d2851c3d772261c131f9678f4b1bf722328aabb
Signed-off-by: Ziye Yang <ziye.yang@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17644
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-27 09:35:04 +00:00
Anil Veerabhadrappa
3a5ebfb06d ut/fc: Cleanup transport cleanup tests
This change fixes the following assert in FC UT,

   transport.c:329: spdk_nvmf_transport_create: Assertion `nvmf_get_transport_ops(transport_name) && nvmf_get_transport_ops(transport_name)->create' failed.

Change-Id: I57a9c6e83f07656b207d74bbadeb82e5efb5fa32
Signed-off-by: Anil Veerabhadrappa <anil.veerabhadrappa@broadcom.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17717
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-27 09:33:39 +00:00
Anil Veerabhadrappa
665e3805f8 ut/fc: add missing spdk_mempool_lookup stub
This patch fixes the following error:
        fc_ut.o: In function `nvmf_transport_create_async_done':
        spdk/lib/nvmf/transport.c:203: undefined reference to `spdk_mempool_lookup'
        collect2: error: ld returned 1 exit status

Change-Id: I6e81a8d62cfcc70bed6efe6ac807739d77ef89aa
Signed-off-by: Anil Veerabhadrappa <anil.veerabhadrappa@broadcom.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17716
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-27 09:33:39 +00:00
Krzysztof Karas
3edc534216 vhost_blk: make sure to_blk_dev() return value is not NULL
Assert that the pointer returned by to_blk_dev() is not NULL
before dereferencing it.

Change-Id: I15adeac0926f23f84fdb3af88fc15ac07c580d91
Signed-off-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17536
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
2023-04-27 09:30:42 +00:00
Krzysztof Karas
50e3b7bf31 nvme_transport: return NULL if transport does not exist
spdk_nvme_ctrlr_get_registers() calls nvme_get_transport()
to get a reference to the transport whose registers should
be returned, but nvme_get_transport() explicitly returns
NULL if the transport does not exist. This would result
in dereferencing a NULL pointer on line 862.

To remedy that, return NULL if no transport was found.

Additionally, change "THis" to "This" on line 46.
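The shape of the fix can be sketched like this (illustrative lookup table and field names, not the real SPDK symbols):

```c
#include <assert.h>
#include <stddef.h>
#include <string.h>

/* Minimal sketch: propagate NULL instead of dereferencing when the
 * named transport is not registered. */
struct transport {
	const char *name;
	int regs;       /* stand-in for the registers the caller wants */
};

static struct transport g_transports[] = { { "PCIE", 42 } };

static struct transport *
find_transport(const char *name)
{
	for (size_t i = 0; i < sizeof(g_transports) / sizeof(g_transports[0]); i++) {
		if (strcmp(g_transports[i].name, name) == 0) {
			return &g_transports[i];
		}
	}

	return NULL;
}

static int *
get_registers(const char *name)
{
	struct transport *t = find_transport(name);

	if (t == NULL) {
		return NULL;    /* previously t->regs would crash here */
	}

	return &t->regs;
}
```

A caller asking for an unregistered transport now gets NULL back instead of a crash.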

Change-Id: I3944925659991e9424e2177b5c940b2e2626d1f4
Signed-off-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17532
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-04-27 09:29:59 +00:00
Jim Harris
4fafd3fe65 test/nvmf: fully re-enable host/timeout.sh
Now that the bug with the remove_listener path has been
fixed, we can re-enable this part of the test.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I249011b20ffe468ed499766e4333e7bf9007a962
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17797
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: <sebastian.brzezinka@intel.com>
2023-04-27 09:28:38 +00:00
Jim Harris
baf250e5e4 nvmf: initialize trid param in get_***_trid paths
When removing a listener, for example with
nvmf_subsystem_remove_listener RPC, we use the concept of a
"listen trid" to determine which existing connections
should be disconnected.

This listen trid has the trtype, adrfam, traddr and trsvcid
defined, but *not* the subnqn.  We use the subsystem pointer
itself to match the subsystem.

nvmf_stop_listen_disconnect_qpairs gets the listen trid
for each qpair, compares it to the trid passed by the
RPC, and if it matches, then it compares the subsystem
pointers and will disconnect the qpair if it matches.

The problem is that the spdk_nvmf_qpair_get_listen_trid
path does not initialize the subnqn to an empty string,
and in this case the caller does not initialize it either.
So sometimes the subnqn on the stack used to get the
qpair's listen trid ends up with some garbage as the subnqn
string, which causes the transport_id_compare to fail, and
then the qpair won't get disconnected even if the other
trid fields and subsystem pointers match.

For the failover.sh test, this means that the qpair doesn't
get disconnected, so we never go down the reset path
on the initiator side and don't see the "Resetting" strings
expected in the log.

This similarly impacts the host/timeout.sh test, which is
also fixed by this patch.  There were multiple failing
signatures, all related to remove_listener not working
correctly due to this bug.

While the get_listen_trid path is the one that caused
these bugs, the get_local_trid and get_peer_trid paths
have similar problems, so they are similarly fixed in
this patch.
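The effect of the missing initialization can be illustrated with a tiny stand-in for the transport ID struct (hypothetical fields, not the full spdk_nvme_transport_id):

```c
#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical mini trid: the fix is to zero the whole struct so
 * subnqn compares as an empty string instead of stack garbage. */
struct trid {
	char traddr[64];
	char subnqn[64];
};

static void
get_listen_trid(struct trid *out)
{
	memset(out, 0, sizeof(*out));   /* the missing initialization */
	snprintf(out->traddr, sizeof(out->traddr), "10.0.0.1");
	/* subnqn intentionally left empty, like a listen trid */
}
```

Without the memset, two calls with garbage-filled stack buffers could produce trids that fail a byte-wise compare even though every assigned field matches.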

Fixes issue #2862.
Fixes issue #2595.
Fixes issue #2865.
Fixes issue #2864.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I36eb519cd1f434d50eebf724ecd6dbc2528288c3
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17788
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Mike Gerdts <mgerdts@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: <sebastian.brzezinka@intel.com>
2023-04-27 09:24:18 +00:00
Mike Gerdts
c0ea96cf5e vbdev_lvol: allow degraded lvols to be deleted
An esnap clone is now deletable when its external snapshot is missing.
Likewise, the tree of degraded lvols rooted at a degraded esnap clone
can also be deleted, subject to the normal restrictions.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I711ae25d57f5625a955d1f4cdb2839dd0a6cb095
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17549
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
5b250c0836 vbdev_lvol: load esnaps via examine_config
This introduces an examine_config callback that triggers hotplug of
missing esnap devices.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I5ced2ff26bfd393d2df4fd4718700be30eb48063
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16626
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
5e79e84e78 include: add libgen.h to stdinc.h
A subsequent patch will need to use dirname(3), declared in libgen.h.
Because libgen.h is a POSIX header, the SPDK build requires that it be
included via spdk/stdinc.h, not in the file that needs it.

libgen.h also declares basename(), which has a conflicting declaration
in string.h. A small change is required in bdev_uring_read_sysfs_attr()
to accommodate this.
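The accommodation amounts to something like the following sketch: POSIX basename() (from libgen.h) may modify its argument, so the path is copied into a writable buffer first. The helper name is illustrative, not the real bdev_uring code:

```c
#include <assert.h>
#include <libgen.h>
#include <stdio.h>
#include <string.h>

/* Copy the path into a mutable buffer before calling POSIX basename(),
 * which is allowed to modify its argument. */
static const char *
path_basename(const char *path, char *buf, size_t len)
{
	snprintf(buf, len, "%s", path);
	return basename(buf);
}
```

This also sidesteps the GNU basename() declaration in string.h, since the libgen.h version takes precedence once that header is included.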

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: Ib4ded2097881668aabdfd9f1683f933ce418db2e
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17557
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
d453aaa360 vbdev_lvol: degraded open of esnap clones
If an esnap clone is missing its snapshot, the lvol should still open
in degraded mode. A degraded lvol will not have a bdev registered and
as such cannot perform any IO.

Change-Id: I736194650dfcf1eb78214c8896c31acc7a946b54
Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16425
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
a045d8d2fc vbdev_lvol: early return in _vbdev_lvs_remove
This replaces nested if statements with equivalent logic that uses
early returns. Now the code fits in 100 columns and will allow the next
patch in this series to avoid adding a fifth level of indentation.
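The refactor pattern can be sketched generically (illustrative error codes, not the actual _vbdev_lvs_remove logic):

```c
#include <assert.h>

/* The nested form
 *   if (a == 0) { if (b == 0) { if (c == 0) { ...work... } } }
 * becomes flat early returns, freeing a level of indentation for the
 * next change in the series. */
static int
remove_lvs(int open_rc, int busy_rc, int io_rc)
{
	if (open_rc != 0) {
		return open_rc;
	}
	if (busy_rc != 0) {
		return busy_rc;
	}
	if (io_rc != 0) {
		return io_rc;
	}

	return 0;   /* all checks passed: proceed with removal */
}
```

Each failure path exits immediately, so the success path reads top to bottom at one indentation level.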

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: Ief74d9fd166b2fe1042c78e12fe79d5f325aa502
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17548
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
f3c14b8dee vbdev_lvol: add bdev_lvol_get_lvols RPC
This provides information about logical volumes without providing
information about the bdevs. It is useful for listing the lvols
associated with specific lvol stores and for listing lvols that are
degraded and have no associated bdev.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I795161ac88d9707831d9fcd2079635c7e46ecc42
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17547
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
a67e0eb37e vbdev_lvol: external snapshot rpc interface
Add RPC interfaces for creation of esnap clone lvols. This also
exercises esnap clone creation and various operations involving
snapshots and clones of esnap clones to ensure that bdev_get_bdevs
reports state correctly.

Change-Id: Ib87d01026ef6e45203c4d9451759885a7be02d87
Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14978
Reviewed-by: Michal Berger <michal.berger@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
3f52d2659e test/common: allow tests to use set -u
Now autotest_common.sh is tolerant of tests that use "set -u" so that
they quickly generate useful errors when variables are used but not set.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I5d7709f3029fa8f52affecf68a4b9da97a84589d
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16703
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Michal Berger <michal.berger@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
9b8f2ef354 test/lvol: test esnap clones with real bdevs
This adds test/lvol/esnap for functional tests of lvol esnap clone
bdevs without RPCs or reactors.

Change-Id: If62b1bde2b19343af51ba4c11599623556484b0d
Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16705
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
54b4f4dd4b vbdev_lvol: allow creation of esnap clones
This adds the ability to create esnap clone lvol bdevs.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: Ifeef983430153d84d896d282fe914c6671283762
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16590
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
0c31b86a6f vbdev_lvol: create esnap blobstore device
Register an spdk_bs_esnap_dev_create callback when initializing or
loading an lvstore. This is the first of several commits required to
enable lvol bdevs to support external snapshots and esnap clones.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I35c4e61fdbe5b93d65b9374e0ad91cb7fb94d1f4
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16589
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
0cea6b57f6 lvol: add spdk_lvol_get_by_* API
spdk_lvol_get_by_uuid() allows lookup of lvols by the lvol's uuid.

spdk_lvol_get_by_names() allows lookup of lvols by the lvol's lvstore
name and lvol name.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: Id165a3d17b76e5dde0616091dee5dff8327f44d0
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17546
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
b7d84562cb lvol: add spdk_lvol_iter_immediate_clones()
Add an iterator that calls a callback for each clone of a snapshot
volume. This follows the typical pattern of stopping iteration when the
callback returns non-zero.
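The iteration contract can be sketched with illustrative types (not the real spdk_lvol structures):

```c
#include <assert.h>
#include <stddef.h>

/* Invoke the callback once per clone; stop as soon as it returns
 * non-zero and propagate that value, following the typical pattern. */
typedef int (*clone_iter_cb)(void *ctx, int clone_id);

static int
iter_immediate_clones(const int *clones, size_t n, clone_iter_cb cb, void *ctx)
{
	for (size_t i = 0; i < n; i++) {
		int rc = cb(ctx, clones[i]);

		if (rc != 0) {
			return rc;      /* callback requested stop */
		}
	}

	return 0;
}

/* Example callback: count visits, stop when clone 2 is seen. */
static int
count_until_two(void *ctx, int clone_id)
{
	int *count = ctx;

	(*count)++;
	return clone_id == 2 ? -1 : 0;
}
```

A callback returning 0 for every clone lets the iterator run to completion and return 0 itself.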

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: If88ad769b72a19ba0993303e89da107db8a6adfc
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17545
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
30399f312c lvol_ut: test esnap hotplug
This exercises spdk_lvs_esnap_notify_hotplug() under a variety of happy
and not-so-happy paths.

Change-Id: I1f4101a082b113dacc7d03f81ca16069acfb458d
Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17602
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
712f9aa452 lvol: hotplug of missing esnaps
This introduces spdk_lvs_notify_hotplug() to trigger the lvstore to
call its esnap_bs_dev_create() callback for each esnap clone lvol that
is missing the device identified by esnap_id.

Change-Id: I0e2eb26375c62043b0f895197b24d6e056905aa2
Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16428
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
f2dbb50516 lvol: keep track of missing external snapshots
If an lvol is opened in degraded mode, keep track of the missing esnap
IDs and which lvols need them. A future commit will make use of this
information to bring lvols out of degraded mode when their external
snapshot device appears.

Change-Id: I55c16ad042a73e46e225369bfff2631958a2ed46
Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16427
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
87666f5286 blob: esnap clones are not clones
spdk_blob_is_clone() should return true only for normal clones. To
detect esnap clones, use spdk_blob_is_esnap_clone(). This also clarifies
documentation of spdk_blob_is_esnap_clone() to match the implementation.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I9993ab60c1a097531a46fb6760124a632f6857cd
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17544
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
8b3dcd6191 blob: add is_degraded() to spdk_blob_bs_dev
The health of clones of esnap clones depends on the health of the esnap
clone. This allows recursion through a chain of clones so that degraded
state propagates up from any back_bs_dev that is degraded.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: Iadd879d589f6ce4d0b654945db065d304b0c8357
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17517
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
09bf2b2092 blob: add spdk_blob_is_degraded()
In preparation for supporting degraded lvols, spdk_blob_is_degraded() is
added. To support this, bs_dev gains an optional is_degraded() callback.
spdk_blob_is_degraded() returns false so long as no bs_dev that the blob
depends on is degraded. Depended upon bs_devs include the blobstore's
device and the blob's back_bs_dev.
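The dependency walk can be sketched as follows (illustrative types, not the real spdk_blob/bs_dev structures): a blob is degraded if any device in its chain reports degraded, and the is_degraded() callback is optional.

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct bs_dev {
	struct bs_dev *back;                    /* back_bs_dev, may be NULL */
	bool (*is_degraded)(struct bs_dev *);   /* optional callback */
};

/* Walk the chain of devices the blob depends on; degraded state
 * propagates up from any device that reports it. */
static bool
dev_chain_degraded(struct bs_dev *dev)
{
	for (; dev != NULL; dev = dev->back) {
		if (dev->is_degraded != NULL && dev->is_degraded(dev)) {
			return true;
		}
	}

	return false;
}

/* Example callback for a device with a missing external snapshot. */
static bool
report_degraded(struct bs_dev *d)
{
	(void)d;
	return true;
}
```

A clone sitting on top of a degraded esnap clone inherits the degraded state through the chain.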

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: Ib02227f5735b00038ed30923813e1d5b57deb1ab
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17516
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-26 17:32:13 +00:00
Mike Gerdts
1db33a8f74 blob: add spdk_blob_get_esnap_bs_dev()
While getting memory domains, vbdev_lvol will need to be able to access
the bdev that acts as the lvol's external snapshot. The introduction of
spdk_blob_get_esnap_bs_dev() facilitates this access.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I604c957a468392d40b824c3d2afb00cbfe89cd21
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16429
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-26 17:32:13 +00:00
Konrad Sztyber
e3babb2be1 accel_perf: use accel stats when dumping results
Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Iae1128ce01c16731bced8f97c08f44e1b0bc83f2
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17626
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-26 11:15:40 +00:00
Konrad Sztyber
55d6cc0eae accel: add method for getting per-channel opcode stats
Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Ic3cc0ddc5907e113b6d9d752c9bff0f526458a11
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17625
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
2023-04-26 11:15:40 +00:00
Konrad Sztyber
d7b29fb9d5 accel: collect stats on the number of processed bytes
For operations that have differently sized input/output buffers (e.g.
compress, decompress), the size of the src buffer is recorded.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I1ee47a2e678ac1b5172ad3d8da6ab548e1aa3631
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17624
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-26 11:15:40 +00:00
Konrad Sztyber
7c621ff206 accel: specify number of events when updating stats
Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I5b611c8978b581ac504b033e1f335a2e10a9315b
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17623
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-26 11:15:40 +00:00
Konrad Sztyber
0de931dc6b accel: move accel_get_iovlen() up
Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I6117057a1e3812386a0fb7a10e07978415a48261
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17622
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
2023-04-26 11:15:40 +00:00
Konrad Sztyber
9a377ecb22 accel: append support for crc32c
It is now possible to append an operation calculating crc32c to an accel
sequence.  A crc32c operation needs special care when it's part of a
sequence, because it doesn't have a destination buffer.  It means that
we can remove copy operations following crc32c only when it's possible
to change the dst buffer of the operation preceding crc32c.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I29204ce52d635162d2202136609f8f8f33db312d
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17427
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-26 11:15:40 +00:00
Konrad Sztyber
2b1ad70c4c accel: check operation type in accel_task_set_dstbuf()
This reduces the number of changes in the following patch, which
makes this function recursive.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: If8da6ae52d78358b66b2d9303413a9723687a767
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17568
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-26 11:15:40 +00:00
Mike Gerdts
b0c93eb3fb accel: destroy g_stats_lock during finish
g_stats_lock is an spdk_spin_lock that is initialized as the module is
loading. With this change, it is destroyed as the module finishes.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I5263547f6d0e8981765d59665bd826cf07a6f83e
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17681
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-26 11:06:02 +00:00
Konrad Sztyber
bade2d8db5 accel: delay finish until all IO channels are released
This ensures that there are no more outstanding operations, so we can
safely free any global resources.

Fixes #2987

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Iac423b4f2a1183278d1db20f96c1a3b1bb657f85
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17767
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Mike Gerdts <mgerdts@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-26 11:06:02 +00:00
Jim Harris
e407385e03 env_dpdk: add ERRLOGs to help debug issue #2983
Issue #2983 shows a case where we seem to get a
device remove notification from DPDK (via the vfio
path) after we have already detached the device
explicitly in SPDK.

This issue has proven difficult to reproduce
outside of the one observed failure so far, so
add a couple of ERRLOGs in this path to help
confirm this theory should it happen again.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I0fda4229fe150ca17417b227e8587cd7fbda6692
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17631
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-25 16:54:59 +00:00
Michal Berger
aadd13f444 scripts/pkgdep: Add support for rocky|centos 9
Also, shuffle DAOS pieces a bit to keep repo handling in one place,
and switch the ceph repo to an actively supported release that is
common and available for both centos|rocky 8|9 (i.e. pacific).

Change-Id: Idb19e4a5ff80770c7d6f9e6db85f983e163958e6
Signed-off-by: Michal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17661
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
2023-04-25 11:26:35 +00:00
Pawel Piatek
64c27c8dcc scripts/vagrant: upload optional directories
Sometimes we need to copy additional directories with
sources into the VM. Currently, two cases are known:
- spdk-abi
- dpdk (for CI vs-dpdk jobs)

Signed-off-by: Pawel Piatek <pawelx.piatek@intel.com>
Change-Id: I242838364d649b29a5a9dc720c6920493b061fa7
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17645
Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Michal Berger <michal.berger@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-25 10:13:18 +00:00
Denis Barakhtanov
b16a4c22c4 bdev/daos: using SPDK_CONTAINEROF instead of container_of
DAOS bdev was implicitly expecting `container_of` to be in daos_event.h
With upcoming DAOS release the location of `container_of` has changed.
`SPDK_CONTAINEROF` is now used in the module.

Signed-off-by: Denis Barakhtanov <denis.barahtanov@croit.io>
Change-Id: Ia88365322fef378af6b1708b8704827bca1b828d
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17719
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-25 10:10:48 +00:00
Karol Latecki
4870695014 test/vhost: increase memory in virtio tests
Increase the memory for spdk virtio initiator
processes using "-s" option.

See https://review.spdk.io/gerrit/c/spdk/spdk/+/17371
22fa84f77a

Signed-off-by: Karol Latecki <karol.latecki@intel.com>
Change-Id: I2f425cb547e72e1ac6748e777158427dcf57b9f0
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17662
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-24 09:33:32 +00:00
Richael Zhuang
953b74b9b0 bdev_nvme: fix heap-use-after-free when detaching controller
There is a heap-use-after-free error when detaching a controller
with the "io_path_stat" option set to true.
(If spdk is built without asan/ubsan, the error is free(): corrupted
unsorted chunks.)

It's because io_path is accessed in bdev_nvme_io_complete_nvme_status
after the io_path is freed.

io_path is freed when we detach the controller in
_bdev_nvme_delete_io_path, which executes steps 1 and 2 below. Before
step 4 is executed, step 3 may run and access io_path.

1. spdk_put_io_channel() is called; bdev_nvme_destroy_ctrlr_channel_cb
has not been called yet.
2. free(io_path->stat); free(io_path);
3. bdev_nvme_poll; nbdev_io1 succeeds; bdev_nvme_io_complete_nvme_status()
accesses nbdev_io1->io_path.
4. bdev_nvme_destroy_ctrlr_channel_cb disconnects the qpair and aborts
nbdev_io1.

This patch fixes the error by moving step 2 after step 4: io_path is no
longer freed in _bdev_nvme_delete_io_path, it is only removed from the
nbdev_ch->io_path_list.

The steps to reproduce the error:
target: run nvmf_tgt
initiator: (build spdk with asan,ubsan enabled)
sudo ./build/examples/bdevperf --json bdevperf-multipath-rdma-active-active.json  -r tmp.sock -q 128 -o 4096  -w randrw -M 50 -t 120
sudo ./scripts/rpc.py -s tmp.sock  bdev_nvme_detach_controller -t rdma -a 10.10.10.10 -f IPv4 -s 4420 -n nqn.2016-06.io.spdk:cnode1 NVMe0

========
bdevperf-multipath-rdma-active-active.json

{
  "subsystems": [
    {
      "subsystem": "bdev",
      "config": [
        {
          "method": "bdev_nvme_attach_controller",
          "params": {
            "name": "NVMe0",
            "trtype": "tcp",
            "traddr": "10.169.204.201",
            "trsvcid": "4420",
            "subnqn": "nqn.2016-06.io.spdk:cnode1",
            "hostnqn": "nqn.2016-06.io.spdk:init",
            "adrfam": "IPv4"
          }
        },
        {
          "method": "bdev_nvme_attach_controller",
          "params": {
            "name": "NVMe0",
            "trtype": "rdma",
            "traddr": "10.10.10.10",
            "trsvcid": "4420",
            "subnqn": "nqn.2016-06.io.spdk:cnode1",
            "hostnqn": "nqn.2016-06.io.spdk:init",
            "adrfam": "IPv4",
            "multipath": "multipath"
          }
        },
        {
          "method": "bdev_nvme_set_multipath_policy",
          "params": {
            "name": "NVMe0n1",
            "policy": "active_active"
          }
        },
        {
          "method": "bdev_nvme_set_options",
          "params": {
            "io_path_stat": true
          }
        }
      ]
    }
  ]
}
======

Change-Id: I8f4f9dc7195f49992a5ba9798613b64d44266e5e
Signed-off-by: Richael Zhuang <richael.zhuang@arm.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17581
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-04-24 09:20:33 +00:00
Ben Walker
e351b19055 sock/posix: Fix sendmsg_idx rollover for zcopy
If the idx gets to UINT32_MAX we need to ensure it doesn't wrap around
before we check if we're done iterating.
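A hypothetical sketch of a wrap-safe range check (not the actual posix sock code): computing the distance modulo 2^32 keeps the comparison correct when the index rolls over UINT32_MAX, where a plain `idx <= last` would not.

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* True if idx lies in the window [first, last], where the window may
 * span the UINT32_MAX -> 0 rollover. Unsigned subtraction is defined
 * modulo 2^32, which makes the distance computation wrap-safe. */
static bool
idx_in_window(uint32_t first, uint32_t last, uint32_t idx)
{
	return (uint32_t)(idx - first) <= (uint32_t)(last - first);
}
```

The same distance trick is the standard way to compare sequence numbers that are allowed to wrap.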

Fixes #2892

Change-Id: I2c57ed2a6f6eda16e2d1faa63e587dca0b380a17
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17687
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-24 09:00:35 +00:00
Jim Harris
1922700ea7 test/unit: disable sock unit tests on FreeBSD
There are several failing signatures observed as
part of issue #2943.  So disable the unit tests for
now until they are debugged.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: Iae54f8bfcd7883c02152abee37410a998da81dd7
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17573
Reviewed-by: Pawel Piatek <pawelx.piatek@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
2023-04-24 08:30:10 +00:00
Ben Walker
fb37b8d941 idxd: In perf tool, correctly pass fill pattern as a uint64_t
The pattern is 64 bits but we were only passing in 8.

Fixes #2821

Change-Id: I4a4c3f7c18bcb610df9c37edee549255f93f2632
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17686
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-24 08:29:32 +00:00
Sebastian Brzezinka
737667e155 lib/env_ocf: place allocator variable on hugepages
When using `__lsan_do_recoverable_leak_check` (e.g. when fuzzing) to
check for leaks at runtime, the leak sanitizer cannot follow
references to memory that is allocated on the heap (e.g. via calloc)
and then stored on a hugepage, causing lsan to incorrectly report a
direct leak.

Fixes #2967

Signed-off-by: Sebastian Brzezinka <sebastian.brzezinka@intel.com>
Change-Id: I3511e117a07ca8daa96f19bf1437c0d788b64cb1
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17682
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Amir Haroush <amir.haroush@huawei.com>
2023-04-21 23:49:28 +00:00
Shuhei Matsumoto
26b9be752b bdev/nvme: Add max_bdevs parameter for attach_controller RPC
The target subsystem may expose more than 128 namespaces. To support
such a subsystem, add a new parameter, max_bdevs, to the
bdev_nvme_attach_controller RPC.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I8fab20b9c4d52818205e05de6a31dbe0d31a10fe
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17651
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-20 11:33:14 +00:00
Shuhei Matsumoto
f0a2538c04 bdev/nvme: Alloc bdev name array dynamically for attach_controller RPC
Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I9c1822421563210f6a656553355e29e75c8b0c21
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17650
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-20 11:33:14 +00:00
Shuhei Matsumoto
d33d418742 bdev/nvme: Aggregate req and ctx free for attach_controller RPC
Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: Iba2091f67a97a59ecad7f0c853491d9cfcad736d
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17649
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-20 11:33:14 +00:00
Jim Harris
0ca5304550 examples/nvme/perf: increase opts.num_io_queues when needed
By default we specify 1024 max_io_queues per controller.
But it's possible we need more for high connection count
use cases (e.g. -c 0xFF -P 512, which is 8 * 512 = 4096).
So dynamically configure opts.num_io_queues based on
the corresponding values.

Note: we have to change a couple of globals from int to
uint32_t to avoid signed v. unsigned comparison warnings.
Let's just do that in this patch instead of a separate
one.
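The sizing logic described above can be sketched as follows (hypothetical helper with illustrative names): the controller needs at least num_cores * qpairs_per_ns IO queues, so the option is raised above the default only when the requested connection count exceeds it.

```c
#include <assert.h>
#include <stdint.h>

/* Pick an IO queue count large enough for the requested connections. */
static uint32_t
required_io_queues(uint32_t num_cores, uint32_t qpairs_per_ns,
		   uint32_t default_num_io_queues)
{
	uint32_t needed = num_cores * qpairs_per_ns;

	return needed > default_num_io_queues ? needed : default_num_io_queues;
}
```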

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: Iba2d670c224a91e50377e622b154ce43eed94002
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17621
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
2023-04-20 10:56:42 +00:00
Jim Harris
982ae8f46c examples/nvme/perf: pick num_requests based on qpairs per ns
If we want to test something like 512 qpairs, with qd = 8
for each, we need to specify -q 4096 -P 512.  Those 4096
I/Os are then spread across the 512 qpairs, to get qd = 8
for each qpair.

But currently it ends up also allocating 4096 num_io_requests
for each qpair, which is a huge waste.  We need to instead
base num_io_requests on the effective queue depth of
each qpair.
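The arithmetic above can be sketched with a hypothetical helper (illustrative names, not the actual perf tool code): each qpair's request pool is sized from the effective per-qpair queue depth, i.e. the global depth divided across the qpairs, rather than the full -q value.

```c
#include <assert.h>
#include <stdint.h>

/* Size a qpair's request pool from the effective per-qpair queue depth. */
static uint32_t
requests_per_qpair(uint32_t global_queue_depth, uint32_t num_qpairs)
{
	uint32_t qd = global_queue_depth / num_qpairs;

	return qd > 0 ? qd : 1; /* never size a pool to zero requests */
}
```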

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I3ec0f4d9ab94388bf980c0b0439790847161ec12
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17620
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-04-20 10:56:42 +00:00
Jim Harris
672710c8fc nvme/tcp: increase timeout for async icreq response
This was arbitrarily picked as 2 seconds in commit
0e3dbd. But for extremely high connection count
use cases, such as nvme-perf with several cores
and high connection count per core, this 2 second
time window can get exceeded.

So increase this to 10 seconds, but only for qpairs
that are being connected asynchronously.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I906ca9e6561b778613c80b739a20bd72c807216c
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17619
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
2023-04-20 10:56:42 +00:00
Jim Harris
46cfc0484f nvme: fix async_mode comment
async_mode is now supported on PCIe, RDMA and TCP
transports.  So remove the comment about it only
being supported on PCIe transport.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I059e226aa98e702c9caa2886a10ec1212b6f1ada
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17577
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-20 10:56:42 +00:00
Marcin Spiewak
73293d73eb ./configure: add 'detect' value to --max-lcores
This patch adds support for a 'detect' option in SPDK's
./configure, allowing DPDK to be configured to detect the
current number of cores during SPDK compilation.
This is done by passing --max-lcores=detect to ./configure,
which triggers setting of '-Dmax_lcores=detect' in the
DPDK_OPTS passed to dpdkbuild/Makefile.
DPDK then detects the number of cores in the system during
compilation and sets RTE_MAX_LCORE to that value. The Meson
build system also prints a message with the number of cores
detected. E.g. for my system:
"
Message: Found 72 cores
"

Example usages:
1) use default value for RTE_MAX_LCORE:
	./configure
2) detect the core number:
	./configure --max-lcores=detect
3) Set RTE_MAX_LCORE to 256:
	./configure --max-lcores=256

Change-Id: I2103c2d917f210aee4d1ef43584b1bd40dbfe43b
Signed-off-by: Marcin Spiewak <marcin.spiewak@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17555
Community-CI: Mellanox Build Bot
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-19 09:59:45 +00:00
Michal Berger
962671f711 test/vhost: Create wrapper around wipefs
Call sync each time, as an extra step, to make sure all writes to the underlying device have completed. This is needed because, on occasion, parted (called right after wipefs) fails to create a partition table, complaining that the target device (and its partitions) is still in use.

Change-Id: I959d9b36a1588ec3754335995e3e8bc5057bfeb7
Signed-off-by: Michal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17498
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
2023-04-19 09:52:25 +00:00
Marcin Spiewak
9ab5c8b67a lvol_ut: add test for invalid options
Add unit test for calling spdk_lvs_load_ext()/lvs_load()
with invalid options (opts_size is 0).

Change-Id: I9c48b972066cf977304e3efa936827d1ef1b5250
Signed-off-by: Marcin Spiewak <marcin.spiewak@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17584
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: Mike Gerdts <mgerdts@nvidia.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-19 06:37:29 +00:00
Marcin Spiewak
324e3261e6 lib/lvol: lvs_load() shall return, if options are invalid
The lvs_load() function verifies that the options passed to
it are valid, but didn't return if they are not (only an
error was logged and the callback was called with -EINVAL).
This is now corrected: the function returns after the error
is reported.

Change-Id: I19b0b22466b6980345477f62084d27ef13414752
Signed-off-by: Marcin Spiewak <marcin.spiewak@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17582
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Mike Gerdts <mgerdts@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
2023-04-19 06:37:29 +00:00
Konrad Sztyber
c5efdd55c2 accel: move merging dst buffer to a function
Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I62b73f1802a9de35767b72c2cc4ee115e895c538
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17426
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-19 06:36:20 +00:00
Konrad Sztyber
3e0e077939 accel: copy memory domain context when merging tasks
When changing src/dst buffers, we copied the memory domain
pointers, but we didn't copy the memory domain context, which
is obviously incorrect.  It was probably missed because we
never append a copy with a non-NULL memory domain.  Added a
unit test case to verify this behavior.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Ic174e0e72c33d3f437f0faddd3405638049f0c74
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17425
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-19 06:36:20 +00:00
Konrad Sztyber
5d2d59be8d accel: move accel_module.h to include/spdk
This file should be external to enable out-of-tree accel modules.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I2e973d0e88d7145d0fc9714f56db48486b00f3b7
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17419
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jacek Kalwas <jacek.kalwas@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-19 06:36:20 +00:00
Konrad Sztyber
3824f6e39b bdev/crypto: complete IOs on ENOMEM from accel
spdk_bdev_queue_io_wait() can only be used when one of bdev submission
functions returns ENOMEM (i.e. there are no more spdk_bdev_ios on that
IO channel).  Using it in any other case, e.g. on spdk_accel_append_*()
returning ENOMEM, will most likely result in failure.  Therefore, to
avoid that, the IOs are completed with NOMEM status relying on the bdev
layer to retry them.
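The decision described above can be sketched as follows (illustrative types and helper, not the actual bdev_crypto code): an ENOMEM from an accel append cannot go through spdk_bdev_queue_io_wait(), so the IO is completed with NOMEM status and the bdev layer retries it.

```c
#include <assert.h>
#include <errno.h>

enum bdev_io_status { BDEV_IO_PENDING, BDEV_IO_SUCCESS, BDEV_IO_NOMEM, BDEV_IO_FAILED };

/* Map the return code of an accel append call to an IO status. */
static enum bdev_io_status
handle_append_rc(int rc)
{
	if (rc == 0) {
		return BDEV_IO_PENDING;  /* append queued, completion comes later */
	}
	if (rc == -ENOMEM) {
		return BDEV_IO_NOMEM;    /* the bdev layer will retry this IO */
	}
	return BDEV_IO_FAILED;
}
```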

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Ie0f03496e5d3180c481815b3f1b021e74ae2f46d
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17319
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-19 06:36:20 +00:00
Konrad Sztyber
31bfcb45b7 accel: make number of tasks/seqs/bufs configurable
Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I07ebf37ff31ddb888e68e98cf7b9b425c7a4d128
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17318
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-19 06:36:20 +00:00
Konrad Sztyber
97ce07c261 bdev/malloc: report accel sequence support
This actually allows the malloc bdev to chain multiple accel
operations together.  And, since the last operation will
always be a copy, accel should remove that copy by modifying
the previous operation's dst/src.

On my system, it improved bdevperf performance (single core, qd=4,
bs=128k, bdev_crypto on top of bdev_malloc, crypto_sw):

randread: 5668M/s -> 8201M/s
randwrite: 5148M/s -> 7856M/s

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I5b9173fa70a42ee56f56c496a34037d46d2f420f
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17202
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-19 06:36:20 +00:00
Konrad Sztyber
9276019077 bdev/malloc: report memory domain support
Because the copying is handled by accel, which will do push/pull when
necessary, we can report support for each registered memory domain.

Also, since verifying PI information would require doing a push/pull, we
don't report support for memory domains if bdev has DIF enabled.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Id80f82aaac68e9dec2a6cae81d96a460105161d6
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17201
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-19 06:36:20 +00:00
Konrad Sztyber
06fd87e4e9 bdev/malloc: use appends for write requests
Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Ief6c873a5f65274a25b67bc3f2811d8f3e4a33b3
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17200
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-19 06:36:20 +00:00
Konrad Sztyber
ad154aaef2 bdev/malloc: pass bdev_io to bdev_malloc_writev()
Same reason for the change as in bdev_malloc_readv().

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Id52d8639df6a488342346283c90f12a2ba6f5736
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17199
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-19 06:36:20 +00:00
Konrad Sztyber
560164d8da bdev/malloc: use appends for read requests
This only changes the interface bdev_malloc uses for
scheduling the copy to appends; it won't chain those copies
onto an existing sequence, as bdev_malloc doesn't report
support for accel sequences yet.  That will be changed in
one of the subsequent patches.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I6db2c79b15cb96a1b07c6cf5514004c76b9d2e92
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17198
Community-CI: Mellanox Build Bot
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-19 06:36:20 +00:00
Konrad Sztyber
7ce4205ab0 bdev/malloc: pass bdev_io to bdev_malloc_readv()
It reduces the size of the parameter list, which was already pretty
long, and will make it easier to use other bdev_io's fields (e.g. memory
domain, accel sequence).

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I43a9d3a7cbb77915c00879c43540c9ec725c52d2
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17197
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-19 06:36:20 +00:00
Konrad Sztyber
5d25e3fcc8 bdev/malloc: don't retry failed requests
If a request was marked as failed, we don't want to retry it, so we
shouldn't override its status with NOMEM.
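The rule above can be sketched with illustrative types (not the actual bdev_malloc code): a task is only downgraded to NOMEM for retry when it hasn't already been marked as failed.

```c
#include <assert.h>

enum task_status { TASK_STATUS_SUCCESS = 0, TASK_STATUS_FAILED, TASK_STATUS_NOMEM };

/* A hard failure must not be overridden with a retryable NOMEM status. */
static enum task_status
mark_for_retry(enum task_status current)
{
	return current == TASK_STATUS_FAILED ? TASK_STATUS_FAILED : TASK_STATUS_NOMEM;
}
```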

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I85a522a7934d2d6f415620b9a323effefb91f299
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17196
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-19 06:36:20 +00:00
Konrad Sztyber
d8f63f392a bdev/malloc: declare malloc task/disk variables
It gets rid of lots of casts to malloc_task/malloc_disk and makes the
code more readable.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Id50f0cbfa18adf5e7baafd58da03d290d6ba62c6
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17195
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-19 06:36:20 +00:00
Konrad Sztyber
63524340a3 accel: make spdk_accel_sequence_finish() void
It always returns 0 and any errors are reported in the callback.  Making
it void simplifies error handling.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I0d4299a2789a688eae38d76de46d1baf27cbbd8f
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17194
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
2023-04-19 06:36:20 +00:00
Konrad Sztyber
6060669e1a test/bdev: accel chaining test
This test sends several read/write requests and verifies the expected
number of accel operations have been executed.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Idda46ef00dc5bcc0a176d3dfb39f3f3861964741
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17193
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-19 06:36:20 +00:00
Michal Berger
fc0214d539 test/packaging: Export LD_LIBRARY_PATH, PKG_CONFIG_PATH setup
095f40630e missed the autobuild dependencies while enabling the rpm
test against the external DPDK build. Without them, DPDK is not able
to properly configure itself against the ipsec and isa-l libs.

Change-Id: Ia4307f0d0f9c1f82f6f80ca06113a5289c2916ed
Signed-off-by: Michal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17576
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: Kamil Godzwon <kamilx.godzwon@intel.com>
2023-04-18 08:42:27 +00:00
Michal Berger
6732946d0a test/spdkcli: Include errors from a failed command
Dump them to stdout to make debugging easier.

Change-Id: I9b13d0a77e45aa84ec2a55b7b982225592f2566d
Signed-off-by: Michal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17560
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-18 08:42:09 +00:00
Michal Berger
162bf435cb test/spdkcli: Wait long enough for the nvme ctrl to be gone
Some nvmes need more time to attach or detach, so a static
sleep is not ideal; the time needed depends on which nvme
was picked up for the test.  Instead, simply wait until the
list of nvme ctrls is empty after the cleanup.

Change-Id: I2fc2630020436d0e1f6b01a5ce60aea56e7bf8ec
Signed-off-by: Michal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17559
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
2023-04-18 08:42:09 +00:00
Jim Harris
0fa85da9c4 test: wait_for_examine and delay in reap_unregistered_poller.sh
We need to give the thread library some time to reap
the unregistered poller - it is a bit of a delayed
process.  We have to wait for examine to finish on
the aio bdev, then the poll group gets destroyed and
the pollers unregistered.  This marks the pollers as
UNREGISTERED, but they aren't actually reaped until
next time the thread is polled.

Fixes issue #2980.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I1e32c50ea9b28ea2d5560ddc9b2f68fa81e708d9
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17575
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Mike Gerdts <mgerdts@nvidia.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-18 04:57:02 +00:00
Artur Paszkiewicz
72672d4982 module/raid: specify memory domain support per raid module
Not all raid modules may support memory domains - raid5f currently does
not. Add a parameter to struct raid_bdev_module to specify that.

Change-Id: I3285c118db846d290837606b3f85ac4b5277de97
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17601
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
2023-04-17 09:36:34 +00:00
Marcin Spiewak
1a526000d0 libreduce: removing deprecation messages for pmem
The deprecation notice for pmem was removed, as libreduce
will still use pmem for as long as it is supported.

Change-Id: I7555dbf20a408a67fac8a6e7b2eaa23edf985eec
Signed-off-by: Marcin Spiewak <marcin.spiewak@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17538
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
2023-04-17 09:36:07 +00:00
Konrad Sztyber
ee06693c3d accel: keep track of destroyed channels' statistics
To make sure we don't lose statistics of destroyed channels, they're now
added to a global stats structure when a channel is destroyed.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Ic3b4d285b83267ac06fad1e83721c1b15cc8ec8a
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17567
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-04-14 17:05:40 +00:00
Konrad Sztyber
f2459a2f26 accel: add accel_get_stats
The RPC allows the user to retrieve accel framework's statistics.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I5cd1b45686504c08eda50513ad1dae2f8d65013b
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17191
Community-CI: Mellanox Build Bot
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-14 17:05:40 +00:00
Konrad Sztyber
f135f5ff7f accel: collect statistics
This patch adds support for collecting statistics in accel framework.
Currently, it counts the following events:
 1. The number and the type of executed/failed operations.
 2. The number of executed/failed accel sequences.

For now, these statistics are only collected and there's no way of
retrieving (or resetting) them - that will be added in the following
patches.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Id211067eb810e7b7d30c756a01b35eb5019c57e7
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17190
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-14 17:05:40 +00:00
Konrad Sztyber
f61e421b05 accel: extract submitting task to a function
Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I7d24ab571fb3217917aee53276ccd3d13e1e76c4
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17189
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-14 17:05:40 +00:00
Konrad Sztyber
688f7fb810 accel: add accel_set_options
It'll allow for setting accel-specific options.  For now, it makes the
size of iobuf caches configurable.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Iaf505cc5e98dc6411453d9964250a4ba22267d79
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17188
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-14 17:05:40 +00:00
Michal Berger
4f19ab4d4b scripts/qat_setup: Add support for dh895xcc devices
These can be found under CYP platform.

To that end, refactor qat_setup.sh so it can support devices based on
their dedicated driver rather than the specific device ID - this will
allow for easier addition of new devices in the future.

Also, configure the number of VFs based on the total number
a given ctrl supports - this is exactly what qat_service does
when enabling VFs.

Drop the warning about an old bug in the uio_pci_generic
driver - in practice we haven't hit it under CI for quite a
long time.

Slap errexit on top to make sure we exit immediately when writing
to sysfs fails.

Last but not least, run blockdev_crypto_qat test without any top
condition - if qat_setup.sh succeeds, the test should be able to pass.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: I37c4fd319ad7002017f9baf9fdcf3890429aac75
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17086
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Pawel Piatek <pawelx.piatek@intel.com>
2023-04-14 17:04:37 +00:00
Michal Berger
0945b976df test/check_so_deps: Adjust printout when $SPDK_ABI_DIR is not set
There's no default defined so it doesn't have to be available in the
env at all. Adjust the echo so we don't include an empty string.

Change-Id: Icaa75915544f9da1adcdcdeafce29f5ae97149ab
Signed-off-by: Michal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17428
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Pawel Piatek <pawelx.piatek@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-14 17:03:42 +00:00
Michal Berger
8ad609857f autopackage: Move packaging test to autobuild
Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: Ifbe4d98f3d7a4b9970f923acd6d299d9cc02d350
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17206
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-14 17:03:42 +00:00
Michal Berger
095f40630e test/packaging: Move tests out of nightly
Packaging tests will be done under a separate docker job, hence there
will be plenty of time to run them together. Keep DPDK-related builds
in nightly as they are quite sensitive to any changes (especially API
related), hence not very fit for per-patch testing.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: Ia1af5b0e86a503f540c32d2e030088d8a24f8847
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16046
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: Pawel Piatek <pawelx.piatek@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-14 17:03:42 +00:00
Michal Berger
6a4b678402 test/autobuild: Source $spdk_conf before autotest_common.sh
This is to make sure we export all SPDK_* with proper values.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: I2f01af1a051edcec6a75f99b25b765080abf2a5d
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17212
Reviewed-by: Konrad Sztyber <konrad.sztyber@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Pawel Piatek <pawelx.piatek@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-14 17:03:42 +00:00
Michal Berger
535543d4f1 lib/env_dpdk: Make sure linker finds $DPDK_LIB_DIR
In case SPDK is built with shared libraries and there's no
LD_LIBRARY_PATH around, the linker will complain about
missing .sos, similar to:

/usr/bin/ld.bfd: warning: librte_meter.so.23, needed by
/root/spdk/dpdk/build/lib/librte_ethdev.so, not found (try using -rpath
or -rpath-link)

We can't see that under CI since autotest_common.sh always makes sure
the LD_LIBRARY_PATH is properly set.

Add the -rpath to make the build less spammy.

Change-Id: I1d9d1775b2aa24e65cc4b776c2549457b0d7aac3
Signed-off-by: Michal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17492
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-13 21:41:09 +00:00
Michal Berger
54a78f41d2 mk/spdk.common: Use -flto=auto for the LTO builds
This tells lto-wrapper to either use make's jobserver or fall
back to auto-guessing the number of cpu threads used for the
build. Mainly, this should silence the following warning:

lto-wrapper: warning: using serial compilation of N LTRANS jobs
lto-wrapper: note: see the ‘-flto’ option documentation for more
information

Change-Id: Ib848319c858f4371b94f9264d22449535d25d6da
Signed-off-by: Michal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17491
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-13 21:41:09 +00:00
Michal Berger
8ad5671fbb test/common: Silence vhost/commo.sh during cleanup
It's too verbose and may emit messages that are confusing in
the context of the actual cleanup.

Change-Id: I9e86e20afcf567fb54fec3a6cfb9008ad2080a12
Signed-off-by: Michal Berger <michal.berger@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17485
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-13 21:39:59 +00:00
Changpeng Liu
544e2a273b lib/vhost: register VQ interrupt handler when enable VQ
Since commit 23baa67, we start the virtio device only once
and update the VQ's information in the SET_VRING_KICK message
context.  As a result, when multiple queues are enabled, SPDK
doesn't register the VQ's interrupt handler, so register it
here when the VQ is enabled.

Fix issue #2940.

Change-Id: I29dbd7bf0b81b23c2e47e37c467952cc5887b5bf
Signed-off-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17354
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-13 21:39:33 +00:00
Jim Harris
89188e84f1 bdev: assert that internal status is PENDING for completed IO
bdev modules should never call spdk_bdev_io_complete twice
for the same IO.  We can help find cases where this happens
by adding an assert in spdk_bdev_io_complete, confirming
that the current status is still PENDING before changing
it to the status passed by the caller.
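The guard described above can be sketched with illustrative types (not SPDK's spdk_bdev_io): the first completion moves the status away from PENDING, so a second completion of the same IO trips the assert.

```c
#include <assert.h>

enum io_status { IO_STATUS_PENDING = 0, IO_STATUS_SUCCESS, IO_STATUS_FAILED };

struct io {
	enum io_status status;
};

/* Complete an IO, asserting it has not already been completed. */
static void
io_complete(struct io *io, enum io_status status)
{
	assert(io->status == IO_STATUS_PENDING); /* catch double completion */
	io->status = status;
}
```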

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: Id8a044a94113f1ac5e3c8d86e426654bfa8d5c5a
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17330
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-04-13 21:38:10 +00:00
Jim Harris
42567ba294 bdev: reset status immediately to PENDING for nomem_io queue
Reset the status for a bdev_io that fails with NOMEM status
back to PENDING immediately when it is put on the nomem_io
list, instead of waiting until it gets submitted again.

This helps keep the bdev_io states consistent, so that if
we need to complete these IOs for abort reasons later,
we aren't completing IOs that already have a non-PENDING
state.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I9532095141209ed6f7af362b52c689da62e755ce
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17335
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Community-CI: Mellanox Build Bot
2023-04-13 21:38:10 +00:00
Alexey Marchuk
79e1c3f298 test/nvmf: Add more Nvidia NIC IDs
Even though these NICs are not used by the Community CI,
all tests fail if run on a system with
CX6 Dx, CX7, BF2 or BF3.

Signed-off-by: Alexey Marchuk <alexeymar@nvidia.com>
Change-Id: I23aaf8ddbc5b165f0a4372108d1f4b34f0b2ccf7
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17166
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-13 21:35:30 +00:00
Alexey Marchuk
0de1c21570 lib/nvmf: Deprecate cb_fn in spdk_nvmf_qpair_disconnect
Handling this callback is quite complex and may lead to
various problems. In most places, the actual moment
when a qpair is disconnected is not important for the
app logic. Only in the shutdown path do we need to be sure
that all qpairs are disconnected; this can be achieved
by checking the poll_group::qpairs list.

Signed-off-by: Alexey Marchuk <alexeymar@nvidia.com>
Change-Id: I453961299f67342c1193dc622685aefb46bfceb6
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17165
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Michael Haeuptle <michaelhaeuptle@gmail.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-13 21:35:30 +00:00
Alexey Marchuk
d478b20ddf lib/nvmf: Update spdk_nvmf_qpair_disconnect return value
If the qpair is already in the process of disconnect,
the spdk_nvmf_qpair_disconnect API now returns -EINPROGRESS
and doesn't call the callback passed by the user.

Signed-off-by: Alexey Marchuk <alexeymar@nvidia.com>
Change-Id: If996b0496bf15729654d18771756b736e41812ae
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17164
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Michael Haeuptle <michaelhaeuptle@gmail.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-13 21:35:30 +00:00
Alexey Marchuk
0496a2af3b lib/nvmf: Do not use cb_fn in spdk_nvmf_qpair_disconnect
The current implementation of spdk_nvmf_qpair_disconnect
saves and calls the user's callback correctly only on
the first call. If this function is called when the
qpair is already in the process of disconnect, the
cb_fn is called immediately, which may lead to stack
overflow.

In most places this function is called with
cb_fn = NULL, which means that the moment of actual
qpair disconnect is not important for the app logic.
Only in a few places (the nvmf tgt shutdown flow) is it
important to wait for all qpairs to be disconnected.

Taking into account the complexity related to possible stack
overflow, do not pass the cb_fn to spdk_nvmf_qpair_disconnect.
Instead, wait until the list of qpairs is empty in the shutdown path.

Subsequent patches will change the spdk_nvmf_qpair_disconnect
behaviour when disconnect is in progress and deprecate the cb_fn
and ctx parameters.

Fixes issue #2765

Signed-off-by: Alexey Marchuk <alexeymar@nvidia.com>
Change-Id: Ie8d49c88cc009b774b45adab3e37c4dde4395549
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17163
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Michael Haeuptle <michaelhaeuptle@gmail.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-13 21:35:30 +00:00
Alexey Marchuk
bcd0ea8c1c nvmf/vfio_user: Post SQ delete cpl when qpair is destroyed
This patch removes usage of cb_fn argument of
spdk_nvmf_qpair_disconnect API. Instead of relying
on the callback, post a completion on delete SQ
command when transport qpair_fini is called.

Signed-off-by: Alexey Marchuk <alexeymar@nvidia.com>
Change-Id: I68dec97ea94e89f48a8667da82f88b5e24fc0d88
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17168
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-04-13 21:35:30 +00:00
Michal Berger
6693862f9e autotest: Consider processes from deleted workspaces during cleanup
Common practice is to purge the workspace on the Jenkins side when
the job is done. When that happens, stuck processes may still linger,
but readlink -f will fail to resolve the exe link as the target binary
won't exist anymore. Instead, just see what the link points at
and include it in the list.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: I437a720e12e43e33fbf04345a6b77987167864fe
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17050
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-13 21:27:08 +00:00
Alexey Marchuk
6b7cca1542 accel/dpdk_cryptodev: Handle OP_STATUS_SUCCESS
A SW PMD might process a crypto operation but fail
to submit it to the completions ring.
Such an operation can't be retried if the crypto
operation is in-place, so handle it as completed.
Verified by integrating the rte openssl driver and
adding additional logs to check that the SUCCESS
status is received and completed as expected.

Signed-off-by: Alexey Marchuk <alexeymar@nvidia.com>
Change-Id: Ida161cec045167af752ebd5b57f41b2bbfe8b97c
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16995
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-13 21:26:20 +00:00
Sebastian Brzezinka
56eced4280 fuzz/llvm: move coverage data to llvm/coverage
There is no access to fuzzer logs if `index.html` is in the same dir,
so move coverage to `$output_dir/llvm/coverage`.

Signed-off-by: Sebastian Brzezinka <sebastian.brzezinka@intel.com>
Change-Id: I139a6d780754aaf5b1333a2e5b0183bd24488bfa
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16341
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Michal Berger <michal.berger@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-13 21:23:07 +00:00
Sebastian Brzezinka
7cc7d52830 fuzz/llvm: provide a prefix to use when saving artifacts
Save crash files and other artifacts in `$output_dir/llvm/`

Signed-off-by: Sebastian Brzezinka <sebastian.brzezinka@intel.com>
Change-Id: I2ff82b414592cc492b79c9178b7257b2e87440b5
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15827
Reviewed-by: Michal Berger <michal.berger@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-13 21:23:07 +00:00
Sebastian Brzezinka
1fa3b4f72d llvm_nvme_fuzz: enable running llvm nvmf test in parallel
Signed-off-by: Sebastian Brzezinka <sebastian.brzezinka@intel.com>
Change-Id: Iad129c1bc62116a93701a5f68c78351f01a4c878
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16249
Reviewed-by: Michal Berger <michal.berger@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-13 21:23:07 +00:00
Sebastian Brzezinka
4f7ab50650 llvm_vfio_fuzz: start fuzzer tests in parallel
With corpus files persisting between weekend fuzzer runs, it may be
better to run all tests for a fraction of the time instead of a
different test every week.

Remove `poll_groups_mask` from the config; this patch runs every test
on a single core, so there is no need to specify another mask.

Signed-off-by: Sebastian Brzezinka <sebastian.brzezinka@intel.com>
Change-Id: I4448724801bdf1a3c496f829fd168b840c2efa67
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15384
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Michal Berger <michal.berger@intel.com>
2023-04-13 21:23:07 +00:00
Jaylyn Ren
6b6101c1e7 spdk_top: fix the cpu usage display issue in thread tab
Fix the issue where the CPU usage in the thread tab shows empty when the CPUMASK does not start from zero.

Signed-off-by: Jaylyn Ren <jaylyn.ren@arm.com>
Change-Id: Ifd22feefd22a5dd0f87b20ff6c47bd196eb1a39a
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17289
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-13 12:19:37 +00:00
Krzysztof Karas
ed1b4c926c bdev: delete UUID generation from ephemeral bdevs
Ensure no ephemeral bdev will generate its own UUID
unless this value has been specified via RPC.
Generation is now done by the bdev layer itself.

Change-Id: I11efe819a28a137b738959a96a7bdf8c79cfaf64
Signed-off-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17109
Community-CI: Mellanox Build Bot
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-13 12:12:58 +00:00
Krzysztof Karas
1db41324f7 bdev/raid: add RPC option to provide UUID
Make sure UUID can be passed to raid bdev type during
its creation.

Change-Id: I5fa9ca2d18d435fa882e1cb388b2e1918d821540
Signed-off-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17136
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-04-13 12:12:58 +00:00
Krzysztof Karas
91ea8102b6 bdev/error: add option to provide UUID for error bdev
Make sure UUID can be passed to error bdev type during
its creation.

Change-Id: I80b9c1b938a464c0cc8c61f871ae2044d8e09dfd
Signed-off-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17107
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
2023-04-13 12:12:58 +00:00
Krzysztof Karas
11dc297c1b bdev: always generate UUIDs
Make sure a UUID is present for every bdev, even ephemeral ones.
Furthermore, this change removes the assumption that a bdev UUID
may remain empty.

Change-Id: I924c1ba9dedfe88a05044bb1073f28085735b1c1
Signed-off-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17106
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-13 12:12:58 +00:00
Sebastian Brzezinka
0cd5af7143 fuzz/llvm: add common.sh for llvm fuzzers
`common.sh` - add common functions to start fuzzers in
parallel and to do a quick sequential run.

Add `get_testn` - get the number of tests to run in parallel.

Signed-off-by: Sebastian Brzezinka <sebastian.brzezinka@intel.com>
Change-Id: I7c70b5221887c29b495a1632545877ca7cca0945
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16323
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-13 12:09:53 +00:00
Sebastian Brzezinka
c019eb4d67 llvm_vfio_fuzz: handle thread create failure
If `pthread_create` or `spdk_thread_create` fails, stop the
spdk app with a `-1` error code.

Signed-off-by: Sebastian Brzezinka <sebastian.brzezinka@intel.com>
Change-Id: Id5d0f6716917f42e06fbda7db9285deb320e309a
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16338
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-13 12:09:53 +00:00
Jim Harris
943806efab init: rewrite subsystem_sort
Commit aaba5d introduced a build warning with some
compilers. While fixing it, I realized the function was
difficult to immediately understand. So in addition to fixing
the build warning, I also made the following changes:

* Improved names for local variables
* Use TAILQ_INIT for local TAILQ instead of TAILQ_HEAD_INITIALIZER.
* Add comments explaining more clearly what the nested loops are
  doing.
* Use TAILQ_SWAP instead of a FOREACH + REMOVE + INSERT.

Fixes: aaba5d ("subsystem: Gather list changed conditions.")
Fixes issue #2978.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: Ic8740b5706537938d62a0acfac62625b2424b85f
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17496
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Mike Gerdts <mgerdts@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-04-13 04:58:05 +00:00
Jacek Kalwas
de2609a241 env_dpdk: put rte_rcu on DPDK_LIB_LIST unconditionally
rte_rcu is available on all versions of DPDK supported by SPDK. It is
also required by quite a few DPDK libraries. So just include
it always; it's a small library, so let's not over-complicate things
by trying to figure out exactly when it's needed.

This change fixes a linking issue when crypto is enabled (and vhost is not).

Signed-off-by: Jacek Kalwas <jacek.kalwas@intel.com>
Change-Id: Ibdd6acb5a25c401b462022bbd94bd380690640d0
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17514
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-12 16:54:05 +00:00
Jacek Kalwas
13a2c5589c bdev: fix return value of bdev_io_get_max_buf_len
The fixed function is used to determine whether it is possible to get
an iobuf from the pool. To make sure the buf size alignment
requirement is satisfied, the returned value shall include the
alignment value minus one.

e.g.
transaction size length = 64k
buffer alignment = 1 byte (no alignment requirement)
metadata length = 0

Without the fix the function returned 64k + 1; now it returns 64k,
which is the correct behavior and allows further command processing
to proceed (if the max buffer size limit is set to 64k only).

Signed-off-by: Jacek Kalwas <jacek.kalwas@intel.com>
Change-Id: I09104ad21b3652ba1aa5c3805a04b1c6549d04ac
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17513
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-04-12 16:54:05 +00:00
Karol Latecki
9754119ac9 test/vfio-user: reduce spdk_tgt memory allocation
Limit spdk_tgt app to 512MB of memory. This should
be sufficient for tests in this suite provided we
also reduce the size of created malloc bdevs.

Signed-off-by: Karol Latecki <karol.latecki@intel.com>
Change-Id: Iaaba1e13899d37232f7acf842b7deed05935f78f
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17365
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Jaroslaw Chachulski <jaroslawx.chachulski@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-12 16:48:13 +00:00
Karol Latecki
22fa84f77a test/vhost: increase memory in virtio tests
Increase the memory for spdk processes using the
"-s" option. When built with additional options
(like --with-ocf), the processes have higher memory
requirements.

See:
https://review.spdk.io/gerrit/c/spdk/spdk/+/17265
https://github.com/spdk/spdk/issues/2951
for details.

Signed-off-by: Karol Latecki <karol.latecki@intel.com>
Change-Id: Ia4fc37787861e2aef28392eaddf389f27bdf7200
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17371
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-12 16:48:13 +00:00
Krzysztof Karas
7d5f0ade61 spdk_top: move core_busy_period and core_idle_period
Move these two variables below the check for the core_num boundary.
This ensures core_num's value can be used as an index into the
g_cores_info array.

Change-Id: I118a4b3a3ec61c9ccd818f3f3bd2ff013d2d7b14
Signed-off-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17175
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-12 16:45:03 +00:00
Krzysztof Karas
6d7b690584 bdevperf: avoid writing outside "out" array boundary
Currently, variables "i" and "k" in the config_filename_next()
function may increase at the same rate. When iterating the
"for" loop at line 1862, both "i" and "k" are incremented:
 + i by the for loop,
 + k by the "out[k++]" instruction.
This means the loop may end on the
"i < BDEVPERF_CONFIG_MAX_FILENAME" condition with "i" equal
to BDEVPERF_CONFIG_MAX_FILENAME while "k" is also equal to
BDEVPERF_CONFIG_MAX_FILENAME, because after writing the
out[BDEVPERF_CONFIG_MAX_FILENAME - 1] element we increment
"k" one last time.
This results in writing a "0" value at line 1873 to memory outside
the "out" array boundary.
To fix this problem, compare k against
BDEVPERF_CONFIG_MAX_FILENAME instead of i.

Change-Id: Ia45778c1f267d2b9dcd676cd9b6c662d09f6f94e
Signed-off-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17176
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-12 16:44:42 +00:00
Michal Berger
982a1bb7ed test/vhost: Make sure $disk_map exists
Also, simplify the way it's read. As a benefit, this gets rid of
xargs complaining about a NUL byte being sent to its input, which
was quite verbose.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: Iaeb09298c2255404273bb3fc6910bc6b93c2d7eb
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16892
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-12 16:44:21 +00:00
Michal Berger
47c2ee6435 test/vhost: Remove cpuset limit enforced upon vhost cpus
This limit didn't do much in the first place. It created a separate
cgroup with mem nodes already set to nodes 0-1 - in practice these are
all the NUMA nodes available for mem allocation by the very default.
Regarding cpus, vhost is already limited by its own cpumask's affinity,
hence there's no need to enforce this limit via a dedicated cgroup.
Lastly, this was not taking into consideration the fact that other
processes may still be scheduled on the vhost cpus, as the cgroups
they belong to were not modified in any way (as in the case of the
scheduler tests, for instance). That said, the amount of jitter coming
from these 3rd party processes would not have much bearing on vhost
anyway - the only processes that could be more noisy are QEMU's, but
each VM instance is already put into a separate cgroup (see
test/vhost/common.sh).

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: I1de24bfc9e24f8f6391207e579cc599ea5c82094
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16890
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-04-12 16:44:21 +00:00
Michal Berger
98d98ecb57 test/vhost: Switch from msdos to gpt
Disks used under the vhost benchmarks can be > 2TB, so the msdos
partition table is not very suitable here. Use something more robust
like gpt.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: I3e98bcb655c2f55a515f4000b0668b26d71c8fca
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16889
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
2023-04-12 16:44:21 +00:00
Michal Berger
905c4dcf6f test/vhost: Make sure all TCP ports allocated for QEMU are available
This may become problematic with a bigger number of VMs. In
particular, it was noticed that the vnc port may overlap with ssh's
X forwarding ports (starting at 6010). To make sure QEMU does not
fail while attempting to bind to an already used port, we first
check if the target port is in use.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: I525aa2a1cc52c6aa1d8d4ade8924ad684fe8af50
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16337
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-12 16:44:21 +00:00
Michal Berger
ce6550a935 test/vhost: Add VM's id to fio config's description
Since we are sending fio configuration to potentially dozens of
VMs, a proper description allows identifying final results on a
per-VM basis - this is helpful during debugging.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: Ifc38d9cb60879f8b7f6e178f23e3f451a73765f0
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15895
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-04-12 16:44:21 +00:00
Michal Berger
c1d2bdfeb3 test/vhost: Add perf collection support
Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: I059658c477be4122e7b04f33a796f732746b7c90
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15603
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-12 16:44:21 +00:00
Michal Berger
7c764edf85 test/vhost: Gather IRQ stats from the VM
Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: I4351d812b9b9da127b6daf46b0f44ce237e33ee9
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15460
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-12 16:44:21 +00:00
Michal Berger
f8a085a2d5 test/vhost: Add helper functions for extracting IRQ data
Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: If75aeca0c44667ef02b72f2e4a9141da4057d291
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15459
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Kamil Godzwon <kamilx.godzwon@intel.com>
2023-04-12 16:44:21 +00:00
Ben Walker
78df9be449 nvmf/tcp: Wait for PDUs to release when closing a qpair
In the presence of hardware offload (for data digest) we may not be
able to immediately release all PDUs to free a connection. Add a
state to wait for them to finish.

Fixes #2862

Change-Id: I5ecbdad394c0296af6f5c2310d7867dd9de154cb
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16637
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-10 17:58:48 +00:00
Marcin Spiewak
8247bd4041 configure: added --max-lcores option to ./configure
This patch adds support for the --max-lcores configuration
option to the ./configure script. This option can be
used to change the value of DPDK's RTE_MAX_LCORE
(which is by default set to 128 for the x86 architecture).
If specified, DPDK will be configured to use
the value provided by the user instead of
the default one. The option can be useful
in systems where the number of physical CPUs is
larger than 128.
When RTE_MAX_LCORE is increased, it is possible
to specify cores with identifiers larger than
128 in SPDK's CPU mask.
If the option is not specified, DPDK will use
the default value of RTE_MAX_LCORE.
The --max-lcores range is [1..1024].
Example usage:
./configure --max-lcores=256
./configure --max-lcores=16

Change-Id: I47d321ba394c9acf27eaa91619aeaad28db6de34
Signed-off-by: Marcin Spiewak <marcin.spiewak@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17453
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Community-CI: Mellanox Build Bot
2023-04-10 17:58:20 +00:00
Marcin Spiewak
b3a5763436 app: use --lcores to map ids greater than 128
Fixes #2812

This patch adds support for the '--lcores <map_list>'
parameter in spdk.
This parameter allows mapping of lcores
to CPU IDs if the system contains CPUs with IDs
greater than or equal to 128 (RTE_MAX_LCORE). Such CPUs
cannot be directly included in the core mask
specified in the '-m <mask>' parameter, as dpdk
rejects cores whose IDs are greater than 127.
The only way to use them in spdk is to map an lcore
to a CPU using the --lcores parameter specified
on the command line.
The --lcores and -m parameters are mutually
exclusive; please use only one of them.
Examples:
build/bin/nvmf_tgt --lcores 0@130
build/bin/nvmf_tgt --lcores 0@150,1@151
build/bin/nvmf_tgt --lcores "(5-7)@(10-12)"
build/bin/nvmf_tgt --lcores "(5-7)@(136,138,140)"

Change-Id: Ia92be4499c8daaa936b1a4357d52ae303d6f3eb1
Signed-off-by: Marcin Spiewak <marcin.spiewak@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17403
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
2023-04-10 17:58:20 +00:00
Marcin Spiewak
cc54140080 env: added support for lcore map
This patch adds support for an lcore mapping list, which
is needed by spdk if someone wants to use CPUs with IDs
greater than or equal to RTE_MAX_LCORE (128). Such CPUs
cannot be included in the core mask (passed
to dpdk as '-c <mask>'), as dpdk doesn't allow
IDs greater than or equal to RTE_MAX_LCORE. Therefore they
must be mapped to lower lcore values using
'--lcores <mapping_list>' passed to dpdk.

Change-Id: If68f15cef2bca9e42a3457bf35477793b58ec53d
Signed-off-by: Marcin Spiewak <marcin.spiewak@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17399
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-10 17:58:20 +00:00
Jim Harris
b1a2319686 nvmf: retry QID check if duplicate detected
A host will consider a QID as reusable once it disconnects
from the target.  But our target does not immediately
free the QID's bit from the ctrlr->qpair_mask - it waits
until after a message is sent to the ctrlr's thread.

So this opens up a small window where the host makes
a valid connection with a recently freed QID, but the
target rejects it.

When this happens, we will now start a 100us poller, and
recheck again.  This will give those messages time to
execute in this case, and avoid unnecessarily rejecting
the CONNECT command.

Tested with local patch that injects 10us delay before
clearing bit in qpair_mask, along with fused_ordering
test that allocates and frees qpair in quick succession.
Also tested with unit tests added in this patch.

Fixes issue #2955.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I850b895c29d86be9c5070a0e6126657e7a0578fe
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17362
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-07 17:47:13 +00:00
yidong0635
aaba5d9c9e subsystem: Gather list changed conditions.
Just remove the duplicated code and gather the
conditions for the g_subsystems list and the
subsystems_list together.

Signed-off-by: yidong0635 <dongx.yi@intel.com>
Change-Id: I011b550b83d32580bfd25130dab9e44bcbdc1daf
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/13753
Reviewed-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-07 17:35:08 +00:00
Karol Latecki
4bf72c9921 doc: add NVMe-oF TCP CVL 23.01 performance report link
Signed-off-by: Karol Latecki <karol.latecki@intel.com>
Change-Id: I9e6fe094bfb139a7030ed625e9dcd0e6320e4289
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17488
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: <sebastian.brzezinka@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-07 16:32:52 +00:00
Alexey Marchuk
9c636a02f9 accel/dpdk_cryptodev: Remove queued_cry_ops
If we were not able to submit all configured
crypto ops, then we can just release the crypto_ops
and mbuf objects of these crypto ops and save
the actual number of submitted operations in
the accel task. Once all submitted operations
complete, the poller will call the
accel_dpdk_cryptodev_process_task func to submit
crypto operations for the remaining data blocks.
If no crypto ops were submitted, then the task
will be placed in the channel's queued_tasks
and the poller will try to resubmit the task.
That in theory should increase performance,
since we previously attempted to resubmit queued ops
with burst size == 1, which is not efficient.

Fixes issue #2907

Signed-off-by: Alexey Marchuk <alexeymar@nvidia.com>
Change-Id: I4d17e8ed1ad5383848e4d09c46009c6cb2834360
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16784
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-06 23:30:49 +00:00
Alexey Marchuk
8f4d98bb40 accel/dpdk_cryptodev: Fix sgl init with offset
When an accel task is processed in several
iterations (submit part of the cryops, wait
for completion, submit the next part), the
sgl is initialized with an offset to exclude
previously processed blocks. However, there was
a bug: spdk_iov_sgl_init() doesn't advance the
iovs, so when we compute
sgl->iov->iov_len - sgl->iov_offset,
we may get an unsigned integer underflow.
The fix is to init the sgl with a zero offset
and then advance it by the offset.
Modified the unit test and added an assert in
the code to verify this fix.
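The underflow can be reproduced with a minimal model of the sgl (the struct and helper names here are illustrative stand-ins, not the exact spdk_iov_sgl API):

```c
#include <stdint.h>
#include <sys/uio.h>

/* Minimal stand-in for SPDK's struct spdk_iov_sgl. */
struct iov_sgl {
	struct iovec *iov;
	int iovcnt;
	uint32_t iov_offset;
};

/* Roughly what spdk_iov_sgl_advance() does: move the cursor forward,
 * stepping to the next iovec whenever one is fully consumed. */
static void
sgl_advance(struct iov_sgl *s, uint32_t step)
{
	s->iov_offset += step;
	while (s->iovcnt > 0 && s->iov_offset >= s->iov->iov_len) {
		s->iov_offset -= s->iov->iov_len;
		s->iov++;
		s->iovcnt--;
	}
}

/* The buggy pattern stored the raw offset without advancing the iovs.
 * If iov_offset > iov->iov_len, the "remaining bytes in the current
 * iovec" computation wraps around as unsigned arithmetic. */
static uint32_t
remaining_in_iov(const struct iov_sgl *s)
{
	return (uint32_t)(s->iov->iov_len - s->iov_offset);
}
```

Initializing with offset 0 and then calling the advance helper keeps `iov` and `iov_offset` consistent, so the subtraction can never go negative.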

Signed-off-by: Alexey Marchuk <alexeymar@nvidia.com>
Change-Id: Ib53ff30f0c90d521f2cf6b3ec847b0d06869c2b5
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17456
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-06 23:30:49 +00:00
Alexey Marchuk
a6545ae311 test/blockdev: Use regular RPC socket for mlx5 config
When the RPC server is used for configuration,
the rpc_cmd function waits 15 seconds to read
all replies. If the mlx5 DPDK driver is used on
slow machines or in a container, the
framework_start_init RPC may take more than 15
seconds to execute. As a result, rpc_cmd exits
early and the output of some commands remains in
the pipe. The next call of rpc_cmd may then read
stale data, which leads to a malformed JSON
config. To avoid this problem, redirect RPCs to
a regular RPC socket.

Signed-off-by: Alexey Marchuk <alexeymar@nvidia.com>
Change-Id: Ibfcf56bb0a7f84f69394846d83746c91a4024b9e
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16389
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-06 23:30:23 +00:00
Jim Harris
db6297b501 env_dpdk: omit huge-related options when --no-huge specified
If the user passes --no-huge as part of
env_context, do not add other huge-related
options to the EAL command line. Instead, emit
an error message and return failure if any of
them were specified explicitly.

Fixes c833f6aa ("env_dpdk: unlink hugepages if shm_id is not specified")
Fixes issue #2973.

Signed-off-by: Jim Harris <james.r.harris@intel.com>
Change-Id: I7aa49e4af5f3c333fa1e7dec4e3f5b4b92e7d414
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17483
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-06 21:32:03 +00:00
Shuhei Matsumoto
c2e288a625 iscsi: Return if conn->sock is NULL when updating connection params
An iSCSI connection closes its socket when it is
terminated. After the socket is closed, the
connection must not access it. However, the iSCSI
fuzz test terminated a connection while it was
processing a text command. The connection aborted
the text command, and the corresponding
completion callback accessed the closed socket,
causing a NULL pointer dereference.

Add a check to iscsi_conn_params_update() so that
it returns early when conn->sock is NULL. Most
iSCSI library functions return void; here it is
enough to return any non-zero value, so use
-ENXIO to indicate that no socket is available.
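The guard can be sketched as follows (the struct shapes are hypothetical stand-ins for the real iSCSI types in lib/iscsi):

```c
#include <errno.h>
#include <stddef.h>

/* Trimmed-down stand-ins for the iSCSI library types. */
struct spdk_sock;

struct iscsi_conn {
	struct spdk_sock *sock;
};

/* Sketch of the guard added to iscsi_conn_params_update(): bail out
 * when the connection's socket has already been closed.  Any non-zero
 * value would do for the callers; -ENXIO just names the condition. */
static int
conn_params_update(struct iscsi_conn *conn)
{
	if (conn->sock == NULL) {
		return -ENXIO;
	}
	/* ... negotiate and apply parameters over conn->sock ... */
	return 0;
}
```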

Fixes issue #2958

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I2c1f58a63ee0a40561a17f81d4b4264061f411f6
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17353
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Michal Berger <michal.berger@intel.com>
2023-04-06 21:30:38 +00:00
Mike Gerdts
712ab983df blob: set rc to -EINVAL when esnap len too long
When bs_create_blob() is creating the internal xattr for the esnap ID,
it errors out if the ID is too long. This error path neglected to set
the return value. It now returns -EINVAL in this case.
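A minimal sketch of the corrected error path (the constant and function name are illustrative, not the real blobstore code):

```c
#include <errno.h>
#include <stddef.h>

/* Illustrative limit; not the real blobstore constant. */
#define ESNAP_ID_MAX 64

/* Previously rc kept its initial value (0) when the esnap ID was too
 * long, so the failure looked like success to the caller.  The branch
 * now sets -EINVAL explicitly. */
static int
validate_esnap_id(const void *id, size_t id_len)
{
	int rc = 0;

	(void)id;
	if (id_len > ESNAP_ID_MAX) {
		rc = -EINVAL; /* the assignment the commit adds */
	}
	return rc;
}
```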

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I6d756da47f41fb554cd6782add63378e81735118
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17292
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-06 21:30:03 +00:00
Mike Gerdts
6a55d0dbfa blob_bdev: fix doc for spdk_bs_bdev_claim
The documentation for spdk_bs_bdev_claim() errantly referred to
spdk_bdev_create_bs_dev_ro() when it should refer to
spdk_bdev_create_bs_dev(). This has been corrected.

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I1b19bedb93aa553e6cc319ebba64e62f2b80d2c1
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17291
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-06 21:30:03 +00:00
Michal Berger
53c5691fdf test/common: Merge pkgdep/dnf into pkgdep/yum
There's no point in keeping these separate, as
dnf-aware distros also support yum and there are
no plans to drop it anytime soon. In fact, since
the actual list of packages differed between dnf
and yum, centos7 was not provisioned to the full
extent.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: Ieec6796bf457d37b2618a1c2756d281f4af0c5b4
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16931
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Pawel Piatek <pawelx.piatek@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-06 21:27:49 +00:00
Michal Berger
64aa7a5c16 test/common: Rename vm_setup.sh to autotest_setup.sh
The new name puts more emphasis on the script's
main purpose, as it does not really touch
anything VM-related.

The vm_setup.sh is preserved as a symlink available for a transition
period.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: I968a52cc069706f4c5e1b8a871988809e701a3fe
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16928
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-06 21:27:49 +00:00
Michal Berger
e951b7689e test/common: Simplify README
Most of the information gathered there is
outdated and generic, and falls out of scope of
what vm_setup.sh/pkgdep actually do. Simply
mention the main purpose of the script, leaving
the actual configuration to the user.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: I126515dea019e7f1cd76c8be1339aea080d2a2b0
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16927
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-06 21:27:49 +00:00
Michal Berger
ed1571eece pkgdep: Remove unsupported pieces
We no longer support ubuntu's Xenial and Bionic flavors so they can
be removed.

swupd, Clearlinux's package manager, is also no longer supported.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: I526a89f4d3b3078949f235e46f8bb3a39b2a24b6
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16926
Reviewed-by: Pawel Piatek <pawelx.piatek@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-06 21:27:49 +00:00
Michal Berger
5b03ae4ec0 test/common: vm_setup.sh cleanup
Some minor code shuffling plus removal of the
autorun-spdk.conf creation. Creating this config
makes little sense, as some of these flags cannot
be used together anyway - it basically serves as
a dump of all supported flags, which we usually
have a hard time keeping up to date. That said,
autotest_common.sh (and get_config_params())
gives a better view of which flags are actually
supported and how they are used in practice.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: Ib223ec90be58e68ecab69176d213c353df530498
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16925
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-06 21:27:49 +00:00
Tomasz Zawadzki
66e0ed8e8d deprecation: add the md to documentation Makefile
Originally deprecation.md was pulled verbatim
into the built documentation. This resulted in
very weird paths on spdk.io:
https://spdk.io/doc/md__home_sys_sgsw_oss_spdk_github_io_spdk_deprecation.html#deprecation

Use the same approach as the changelog: copy the
file and add the appropriate section links.

Now only the Doxygen version will contain the
section links, while deprecation.md in the
project root will not. This improves readability.

Change-Id: Ic5c1caf7603b847b3c7445bde76e277ba1ccb740
Signed-off-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16574
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-06 21:16:39 +00:00
Swapnil Ingle
94e395631e nvmf/vfio_user: move cq_is_full() closer to caller
Move cq_is_full() closer to its caller
post_completion(), and fix comments along the
way.

Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com>
Change-Id: I93262d1805f0f9f075c6946ed97cd3006ffba130
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16415
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-06 21:14:38 +00:00
Tim Zhang
edfd7ff2d0 bdev_nvme: add hdgst and ddgst in nvme_ctrlr_config_json
This adds the hdgst and ddgst options to the
output generated when the save_config RPC is
executed.

Signed-off-by: Tim Zhang <hgmz371@gmail.com>
Change-Id: Ib465dc424beb691e86425878588bb732574fc9b4
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16097
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-06 21:08:38 +00:00
KanKuo
b17e84e51e UT/vhost/vhost.c:add the test of spdk_blk_construct
Signed-off-by: KanKuo <kuox.kan@intel.com>
Change-Id: Ib5b132020845c3f3e961b65590c100ad4f1567c3
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15873
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-06 21:05:52 +00:00
KanKuo
f46d3a2476 UT/bdev/bdev.c:add bdev_compare test
Signed-off-by: Kuo Kan <kuox.kan@intel.com>
Change-Id: Ib3d33cefc78f543e157ea552ee88f0514e305054
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15795
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-06 21:05:30 +00:00
Michal Berger
53f57b5dff test/nvmf: Reload irdma driver only when e810 test was requested
Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: I313d65b01c9214a6bde5775488fb32c70cefa4d6
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15357
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-06 21:02:06 +00:00
Michal Berger
bb9cd3b467 test/make: Check for config.mk instead of spdk.common.mk
make actually depends on this file to perform cleanup and that's
the file that's actually created by configure.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: Ic16d4f6268241e5e3cd845a579cd4b7ff885bbb8
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15355
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-06 20:59:49 +00:00
Michal Berger
099bdaf3b5 scripts/vagrant: Replace lsb_release with os-release check
lsb_release is not shipped under latest fedora distros, hence failing
this check. Use /etc/os-release instead.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: Id74095ff5dd5d43f7a97e4c5d026ac13da26d815
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15107
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Kamil Godzwon <kamilx.godzwon@intel.com>
2023-04-06 20:56:08 +00:00
Michal Berger
a66276d9b7 scripts/bash-completion: Adjustments for older Bash versions
Older versions of Bash don't handle -v option in array context very
well. Also, some of the compopt options are missing in older versions
so make sure stderr stays silent.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: I81989940e8b25e2dbeed91f97fed5aa65e7df656
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14130
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Paul Luse <paul.e.luse@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com>
2023-04-06 20:54:43 +00:00
Michal Berger
75bfce75c2 scripts/bash-completion: Extract all rpc methods
Currently we extract these methods from rpc.py's --help or from
rpc_get_methods() in case there's a SPDK application running
in the background. This, however, results in a list missing some
basic methods that rpc_get_methods() simply doesn't include, e.g.
save_subsystem_config().

To make sure we always have a complete list, use
both --help and rpc_get_methods() together.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: Ie73917b74860cac13056bea9babc7f7b57e39b3a
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14115
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com>
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-06 20:54:43 +00:00
Michal Berger
ae3ae309db module/scheduler: Silence warning about rte_power under clean target
This warning is emitted regardless of whether
the rte_power libs were present, as the clean
target always removes them prior to this check.

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: I45bd350d434ec1fbb6504c7df05c4d27946d4f9b
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/13562
Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-06 20:53:01 +00:00
Michal Berger
86ad46db55 autobuild: Put DPDK's kernel drivers at proper location
This is the location freebsd_update_contigmem_mod()
looks at when copying the modules into the right
/boot directories.

Signed-off-by: Michal Berger <michallinuxstuff@gmail.com>
Change-Id: Ic5919cc6382433c641c4c7a8b1100a50abfc246a
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12925
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-06 20:52:07 +00:00
Krishna Kanth Reddy
92141ccf21 bdev/uring: Unset write_cache
Unset the write_cache as the uring bdev does not support Flush I/O.

Signed-off-by: Krishna Kanth Reddy <krish.reddy@samsung.com>
Change-Id: I8e6fce26b12176a7c77c40a1c9102be5cb72e358
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12900
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-06 20:50:48 +00:00
yidong0635
ce3ffc233b blobstore: Add assert in blob_id_cmp.
From the issue report in #2507, the blob being
compared may be NULL. Add an assert so that CI
may catch this issue, and add the same assert to
the other functions as well.

Signed-off-by: yidong0635 <dongx.yi@intel.com>
Change-Id: I98179ec76f2b6785b6921c37373204021c0669b6
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12737
Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
2023-04-06 20:49:00 +00:00
Ben Walker
cb9e0db853 sock: Do aligned allocations for the pipes
Use posix_memalign to ensure aligned allocations.
In reality we'd get 64-byte alignment even with
calloc, but this makes sure of it.
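The pattern is small enough to sketch (the function name is illustrative, not the actual lib/util/pipe.c code):

```c
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* posix_memalign() guarantees the 64-byte (cache-line) alignment that
 * the pipe buffers previously only happened to get from calloc(). */
static void *
alloc_pipe_buf(size_t sz)
{
	void *buf = NULL;

	if (posix_memalign(&buf, 64, sz) != 0) {
		return NULL;
	}
	memset(buf, 0, sz); /* preserve calloc()-style zeroing */
	return buf;
}
```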

Change-Id: I6066e57c95b0f42cff439d452e4aed853189a523
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17508
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-06 20:20:19 +00:00
Ben Walker
30f52282f4 util/pipe: Simplify some null checks
Several null checks are not actually necessary.

Change-Id: I6827e3d4147ed0b9fb22b2148656cba87be5e18c
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17507
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
2023-04-06 20:20:19 +00:00
Ben Walker
4c0b2d1684 util/pipe: Fix documentation on spdk_pipe_create
The pipe can now be entirely filled.

Change-Id: Ib3ec7057224c9239800c1f2877f0441d29c64374
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17506
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-06 20:20:19 +00:00
Ben Walker
4bb9dcdb7d test: Add test_iobuf.c to mock the iobuf library
Use it in all of the places that were previously hooking
spdk_mempool_get.

Change-Id: I311f75fb9601b4f987b106160eb0a0014d3327cd
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16329
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
2023-04-06 20:16:49 +00:00
Ben Walker
9bc7d6b34f thread: Move get/put calls into .c file
This will make it much easier to mock this library for use in unit
tests.

Change-Id: I7dc835865f75f9e29e8b709a634d30053ada2055
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16296
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris <james.r.harris@intel.com>
2023-04-06 20:16:49 +00:00
Ben Walker
a9bcb7f261 thread: Move iobuf code to a separate compilation unit.
This makes it much easier to mock this code in unit tests without having
to mock up the entire thread library.

Change-Id: Ic3d9cb826ae71af780a06f88669c37cef2c9a4ae
Signed-off-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16173
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-06 20:16:49 +00:00
Konrad Sztyber
297182a083 env_dpdk: add support for DPDK main branch
Now that DPDK v23.03.0 has been released, the version on the main branch
points to the next release, v23.07.0-rc0, so we need to adjust the
version check to enable testing against the main branch.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: I37d165111c446612d573c19199e4ace6aa24d191
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17480
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Community-CI: Mellanox Build Bot
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
2023-04-06 20:14:27 +00:00
Konrad Sztyber
4282294b8a env_dpdk: add support for DPDK v23.03.0
Since there were no ABI changes in the interfaces used by SPDK, the
v22.11 functions are reused.

Signed-off-by: Konrad Sztyber <konrad.sztyber@intel.com>
Change-Id: Iff73405eec197f7ed1752366b6b38c28710a73ec
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17479
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Community-CI: Mellanox Build Bot
2023-04-06 20:14:27 +00:00
236 changed files with 12599 additions and 3727 deletions

View File

@ -17,6 +17,22 @@ multiple readers.
New function `spdk_env_get_main_core` was added.
### gpt
GPT bdevs now use the GPT Unique Partition ID as the bdev's UUID.
### lvol
New API `spdk_lvol_iter_immediate_clones` was added to iterate the clones of an lvol.
New APIs `spdk_lvol_get_by_uuid` and `spdk_lvol_get_by_names` to get lvols by the lvol's UUID or
lvstore and lvol names.
New `bdev_lvol_get_lvols` RPC to list logical volumes. This provides information about logical
volumes without providing information about the bdevs. It is useful for listing the lvols
associated with specific lvol stores and for listing lvols that are degraded and have no
associated bdev.
### nvmf
New `spdk_nvmf_request_copy_to/from_buf()` APIs have been added, which support
@ -27,6 +43,9 @@ Two functions related to Asynchronous Event and error handling have been made pu
- `spdk_nvmf_ctrlr_async_event_error_event`,
- `spdk_nvmf_ctrlr_abort_aer`.
Parameters `cb_fn` and `ctx` of `spdk_nvmf_qpair_disconnect` API are deprecated. These parameters
will be removed in 23.09 release.
### nvme
New API `spdk_nvme_ns_get_format_index` was added to calculate the exact format index, that
@ -38,6 +57,10 @@ receive and send the I/O management commands.
New `spdk_nvmf_transport_create_async` was added, it accepts a callback and callback argument.
`spdk_nvmf_transport_create` is marked deprecated.
### part
New API `spdk_bdev_part_construct_ext` is added and allows the bdev's UUID to be specified.
### examples
`examples/nvme/perf` application now accepts `--use-every-core` parameter that changes
@ -1175,6 +1198,8 @@ Updated OCF submodule to v20.12.2
Added `bdev_ocf_set_cache_mode` RPC to dynamically switch cache mode of OCF bdev.
OCF deprecation notice has removed as Huawei is picking up support for the OCF project.
### opal
Removed the `spdk_opal_supported` API.

3
CONFIG
View File

@ -217,3 +217,6 @@ CONFIG_SMA=n
# Build with Avahi support
CONFIG_AVAHI=n
# Setup DPDK's RTE_MAX_LCORES
CONFIG_MAX_LCORES=

View File

@ -202,6 +202,7 @@ struct rpc_thread_info {
char *name;
uint64_t id;
int core_num;
int core_idx;
char *cpumask;
uint64_t busy;
uint64_t last_busy;
@ -651,9 +652,9 @@ subsort_threads(enum column_threads_type sort_column, const void *p1, const void
break;
case COL_THREADS_CPU_USAGE:
count1 = get_cpu_usage(thread_info1.busy - thread_info1.last_busy,
g_cores_info[thread_info1.core_num].busy + g_cores_info[thread_info1.core_num].idle);
g_cores_info[thread_info1.core_idx].busy + g_cores_info[thread_info1.core_idx].idle);
count2 = get_cpu_usage(thread_info2.busy - thread_info2.last_busy,
g_cores_info[thread_info2.core_num].busy + g_cores_info[thread_info2.core_num].idle);
g_cores_info[thread_info2.core_idx].busy + g_cores_info[thread_info2.core_idx].idle);
break;
case COL_THREADS_NONE:
default:
@ -767,6 +768,7 @@ get_thread_data(void)
thread = &g_threads_info[k];
if (thread->id == core_info->threads.thread[j].id) {
thread->core_num = core_info->lcore;
thread->core_idx = i;
break;
}
}
@ -1313,7 +1315,7 @@ draw_thread_tab_row(uint64_t current_row, uint8_t item_index)
{
struct col_desc *col_desc = g_col_desc[THREADS_TAB];
uint16_t col = TABS_DATA_START_COL;
int core_num, color_attr = COLOR_PAIR(6);
int core_idx, color_attr = COLOR_PAIR(6);
char pollers_number[MAX_POLLER_COUNT_STR_LEN], idle_time[MAX_TIME_STR_LEN],
busy_time[MAX_TIME_STR_LEN], core_str[MAX_CORE_MASK_STR_LEN],
cpu_usage[MAX_CPU_STR_LEN], *status_str;
@ -1383,10 +1385,10 @@ draw_thread_tab_row(uint64_t current_row, uint8_t item_index)
}
if (!col_desc[COL_THREADS_CPU_USAGE].disabled) {
core_num = g_threads_info[current_row].core_num;
uint64_t core_busy_period = g_cores_info[core_num].busy - g_cores_info[core_num].last_busy;
uint64_t core_idle_period = g_cores_info[core_num].idle - g_cores_info[core_num].last_idle;
if (core_num >= 0 && core_num < RPC_MAX_CORES) {
core_idx = g_threads_info[current_row].core_idx;
if (core_idx >= 0 && core_idx < RPC_MAX_CORES) {
uint64_t core_busy_period = g_cores_info[core_idx].busy - g_cores_info[core_idx].last_busy;
uint64_t core_idle_period = g_cores_info[core_idx].idle - g_cores_info[core_idx].last_idle;
get_cpu_usage_str(busy_period, core_busy_period + core_idle_period, cpu_usage);
} else {
snprintf(cpu_usage, sizeof(cpu_usage), "n/a");

View File

@ -6,7 +6,6 @@
rootdir=$(readlink -f $(dirname $0))
source "$1"
source "$rootdir/test/common/autobuild_common.sh"
SPDK_TEST_AUTOBUILD=${SPDK_TEST_AUTOBUILD:-}

View File

@ -4,21 +4,8 @@
# All rights reserved.
#
set -e
# If the configuration of tests is not provided, no tests will be carried out.
if [[ ! -f $1 ]]; then
echo "ERROR: SPDK test configuration not specified"
exit 1
fi
source "$1"
rootdir=$(readlink -f $(dirname $0))
testdir=$rootdir # to get the storage space for tests
source "$rootdir/test/common/autotest_common.sh"
out=$PWD
source "$rootdir/test/common/autobuild_common.sh"
MAKEFLAGS=${MAKEFLAGS:--j16}
cd $rootdir
@ -36,7 +23,7 @@ fi
timing_exit porcelain_check
if [[ $SPDK_TEST_RELEASE_BUILD -eq 1 ]]; then
run_test "packaging" $rootdir/test/packaging/packaging.sh
build_packaging
$MAKE clean
fi

View File

@ -215,6 +215,9 @@ if [ $SPDK_RUN_FUNCTIONAL_TEST -eq 1 ]; then
if [[ $SPDK_TEST_NVME_CMB -eq 1 ]]; then
run_test "nvme_cmb" $rootdir/test/nvme/cmb/cmb.sh
fi
if [[ $SPDK_TEST_NVME_FDP -eq 1 ]]; then
run_test "nvme_fdp" test/nvme/nvme_fdp.sh
fi
if [[ $SPDK_TEST_NVME_ZNS -eq 1 ]]; then
run_test "nvme_zns" $rootdir/test/nvme/zns/zns.sh
@ -335,10 +338,8 @@ if [ $SPDK_RUN_FUNCTIONAL_TEST -eq 1 ]; then
if [ $SPDK_TEST_CRYPTO -eq 1 ]; then
run_test "blockdev_crypto_aesni" $rootdir/test/bdev/blockdev.sh "crypto_aesni"
run_test "blockdev_crypto_sw" $rootdir/test/bdev/blockdev.sh "crypto_sw"
# Proceed with the test only if QAT devices are in place
if [[ $(lspci -d:37c8) ]]; then
run_test "blockdev_crypto_qat" $rootdir/test/bdev/blockdev.sh "crypto_qat"
fi
run_test "chaining" $rootdir/test/bdev/chaining.sh
fi
if [[ $SPDK_TEST_SCHEDULER -eq 1 ]]; then

23
configure vendored
View File

@ -28,7 +28,10 @@ function usage() {
echo " --cross-prefix=prefix Prefix for cross compilation (default: none)"
echo " example: aarch64-linux-gnu"
echo " --libdir=path Configure installation path for the libraries (default: \$prefix/lib)"
echo ""
echo " --max-lcores=VAL DPDK configuration. VAL defines maximum number of lcores supported"
echo " by EAL, or enables autodetection if set to 'detect'. When 'detect'"
echo " is specified, DPDK will detect number of cores in the system during"
echo " compilation, and will set maximum number of lcores to this value"
echo " --enable-debug Configure for debug builds"
echo " --enable-werror Treat compiler warnings as errors"
echo " --enable-asan Enable address sanitizer"
@ -94,7 +97,7 @@ function usage() {
echo " --without-iscsi-initiator No path required."
echo " --with-vtune=DIR Required to profile I/O under Intel VTune Amplifier XE. (Deprecated)"
echo " --without-vtune example: /opt/intel/vtune_amplifier_xe_version"
echo " --with-ocf[=DIR] Build OCF library and bdev module. (Deprecated)"
echo " --with-ocf[=DIR] Build OCF library and bdev module."
echo " --without-ocf If argument is directory, interpret it as root of OCF repo"
echo " If argument is file, interpret it as compiled OCF lib"
echo " If no argument is specified, OCF git submodule is used by default"
@ -651,6 +654,15 @@ for i in "$@"; do
--without-avahi)
CONFIG[AVAHI]=n
;;
--max-lcores='')
echo "Must specify max number of lcores for --max-lcores"
usage
exit 1
;;
--max-lcores=*)
CONFIG[MAX_LCORES]="${i#*=}"
CONFIG["MAX_LCORES"]=${CONFIG["MAX_LCORES"],,}
;;
--)
break
;;
@ -1250,6 +1262,13 @@ if [[ "${CONFIG[AVAHI]}" = "y" ]]; then
fi
fi
if [[ -n ${CONFIG[MAX_LCORES]} ]]; then
if [[ ! ${CONFIG[MAX_LCORES]} =~ ^([1-9][0-9]*|detect)$ ]] || ((CONFIG[MAX_LCORES] > 1024)); then
echo "ERROR: Max number of lcores must be a decimal number in range [1..1024] or 'detect' (given: ${CONFIG[MAX_LCORES]})"
exit 1
fi
fi
# For ARM Neoverse-N1 platform, debug build needs gcc version newer than 8.4
if [[ "${CONFIG[DEBUG]}" = "y" && $arch = aarch64* && "$CC_TYPE" = "gcc" ]]; then
GCC_VERSION=$($CC -dumpfullversion)

View File

@ -1,6 +1,6 @@
# Deprecation
## ABI and API Deprecation {#deprecation}
## ABI and API Deprecation
This document details the policy for maintaining stability of SPDK ABI and API.
@ -17,41 +17,25 @@ Deprecated code paths must be registered with `SPDK_DEPRECATION_REGISTER()` and
log at the warn level when `SPDK_LOG_DEPRECATED()` is called, subject to rate limits.
The tags can be matched with the level 4 headers below.
## Deprecation Notices {#deprecation-notices}
## Deprecation Notices
### PMDK
PMDK is no longer supported and integrations with it in SPDK are now deprecated, and will be removed in SPDK 23.05.
Please see: [UPDATE ON PMDK AND OUR LONG TERM SUPPORT STRATEGY](https://pmem.io/blog/2022/11/update-on-pmdk-and-our-long-term-support-strategy/).
#### `libreduce_pm_file`
Reduce library will no longer depend on libpmem. `pm_file_dir` parameter in `spdk_reduce_vol_init()`
will no longer point to pmem device or pmem file. Instead it will be possible to operate on a file,
without the benefits of persistency.
### VTune
#### `vtune_support`
VTune integration is in now deprecated and will be removed in SPDK 23.05.
### OCF
### nvmf
#### `bdev_ocf`
#### `spdk_nvmf_qpair_disconnect`
The Open CAS Framework (OCF) integration via bdev module and env_ocf is currently marked
deprecated due to discontinued support from Intel. However, Huawei is working to pick up
support for these components, and the OCF project in general. This code will definitely
remain in SPDK 23.05 release, in fact it is likely deprecation notice will be removed by
then.
### nvme
#### `nvme_ctrlr_prepare_for_reset`
Deprecated `spdk_nvme_ctrlr_prepare_for_reset` API, which will be removed in SPDK 22.01.
For PCIe transport, `spdk_nvme_ctrlr_disconnect` should be used before freeing I/O qpairs.
Parameters `cb_fn` and `ctx` of `spdk_nvmf_qpair_disconnect` API are deprecated. These parameters
will be removed in 23.09 release.
### gpt

3
doc/.gitignore vendored
View File

@ -1,3 +1,4 @@
# changelog.md is generated by Makefile
# changelog.md and deprecation.md is generated by Makefile
changelog.md
deprecation.md
output/

View File

@ -813,7 +813,7 @@ INPUT += \
compression.md \
concurrency.md \
containers.md \
../deprecation.md \
deprecation.md \
distributions.md \
event.md \
ftl.md \


@ -13,6 +13,10 @@ all: doc
doc: output
deprecation.md: ../deprecation.md
$(Q)sed -e 's/^# Deprecation/# Deprecation {#deprecation}/' \
< $< > $@
changelog.md: ../CHANGELOG.md
$(Q)sed -e 's/^# Changelog/# Changelog {#changelog}/' \
-e 's/^##/#/' \
@ -20,9 +24,9 @@ changelog.md: ../CHANGELOG.md
-e '/# v..\...:/s/\./-/2' \
< $< > $@
output: Doxyfile changelog.md $(wildcard *.md) $(wildcard ../include/spdk/*.h)
output: Doxyfile changelog.md deprecation.md $(wildcard *.md) $(wildcard ../include/spdk/*.h)
$(Q)rm -rf $@
$(Q)doxygen Doxyfile
clean:
$(Q)rm -rf output changelog.md
$(Q)rm -rf output changelog.md deprecation.md


@ -125,6 +125,54 @@ nqn "nqn.2016-06.io.spdk.umgmt:cnode1", '\000' <repeats 191 times>
ID 1
~~~
Printing SPDK spinlocks:
In this example, the spinlock has been initialized and locked but has never been unlocked.
After it is unlocked for the first time, the `Last unlocked at` stack will be populated and the
`Locked by spdk_thread` line will say `not locked`.
~~~{.sh}
Breakpoint 2, spdk_spin_unlock (sspin=0x655110 <g_bdev_mgr+80>) at thread.c:2915
2915 struct spdk_thread *thread = spdk_get_thread();
(gdb) print *sspin
$2 = struct spdk_spinlock:
Locked by spdk_thread: 0x658080
Initialized at:
0x43e677 <spdk_spin_init+213> thread.c:2878
0x404feb <_bdev_init+16> /build/spdk/spdk-review-public/lib/bdev/bdev.c:116
0x44483d <__libc_csu_init+77>
0x7ffff62c9d18 <__libc_start_main+120>
0x40268e <_start+46>
Last locked at:
0x43e936 <spdk_spin_lock+436> thread.c:2909
0x40ca9c <bdev_name_add+129> /build/spdk/spdk-review-public/lib/bdev/bdev.c:3855
0x411a3c <bdev_register+641> /build/spdk/spdk-review-public/lib/bdev/bdev.c:6660
0x412e1e <spdk_bdev_register+24> /build/spdk/spdk-review-public/lib/bdev/bdev.c:7171
0x417895 <num_blocks_test+119> bdev_ut.c:878
0x7ffff7bc38cb <run_single_test.constprop+379>
0x7ffff7bc3b61 <run_single_suite.constprop+433>
0x7ffff7bc3f76 <CU_run_all_tests+118>
0x43351f <main+1439> bdev_ut.c:6295
0x7ffff62c9d85 <__libc_start_main+229>
0x40268e <_start+46>
Last unlocked at:
~~~
Print a single spinlock stack:
~~~{.sh}
(gdb) print sspin->internal.lock_stack
$1 = struct sspin_stack:
0x40c6a1 <spdk_spin_lock+436> /build/spdk/spdk-review-public/lib/thread/thread.c:2909
0x413f48 <spdk_spin+552> thread_ut.c:1831
0x7ffff7bc38cb <run_single_test.constprop+379>
0x7ffff7bc3b61 <run_single_suite.constprop+433>
0x7ffff7bc3f76 <CU_run_all_tests+118>
0x4148fa <main+547> thread_ut.c:1948
0x7ffff62c9d85 <__libc_start_main+229>
0x40248e <_start+46>
~~~
## Loading The gdb Macros
Copy the gdb macros to the host where you are about to debug.


@ -0,0 +1,673 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
width="181.24mm"
height="79.375mm"
version="1.1"
viewBox="0 0 181.24 79.375"
id="svg172"
sodipodi:docname="lvol_esnap_clone.svg"
inkscape:version="1.2.2 (b0a8486541, 2022-12-01)"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns="http://www.w3.org/2000/svg"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<sodipodi:namedview
id="namedview174"
pagecolor="#ffffff"
bordercolor="#000000"
borderopacity="0.25"
inkscape:showpageshadow="2"
inkscape:pageopacity="0.0"
inkscape:pagecheckerboard="0"
inkscape:deskcolor="#d1d1d1"
inkscape:document-units="mm"
showgrid="false"
inkscape:zoom="1.7926966"
inkscape:cx="338.59607"
inkscape:cy="148.93764"
inkscape:window-width="1351"
inkscape:window-height="930"
inkscape:window-x="762"
inkscape:window-y="134"
inkscape:window-maximized="0"
inkscape:current-layer="g170" />
<title
id="title2">Thin Provisioning</title>
<defs
id="defs28">
<marker
id="marker2036"
overflow="visible"
orient="auto">
<path
transform="matrix(-.4 0 0 -.4 -4 0)"
d="m0 0 5-5-17.5 5 17.5 5z"
fill-rule="evenodd"
stroke="#000"
stroke-width="1pt"
id="path4" />
</marker>
<marker
id="marker1960"
overflow="visible"
orient="auto">
<path
transform="matrix(-.4 0 0 -.4 -4 0)"
d="m0 0 5-5-17.5 5 17.5 5z"
fill-rule="evenodd"
stroke="#000"
stroke-width="1pt"
id="path7" />
</marker>
<marker
id="marker1890"
overflow="visible"
orient="auto">
<path
transform="matrix(-.4 0 0 -.4 -4 0)"
d="m0 0 5-5-17.5 5 17.5 5z"
fill-rule="evenodd"
stroke="#000"
stroke-width="1pt"
id="path10" />
</marker>
<marker
id="marker1826"
overflow="visible"
orient="auto">
<path
transform="matrix(-.4 0 0 -.4 -4 0)"
d="m0 0 5-5-17.5 5 17.5 5z"
fill-rule="evenodd"
stroke="#000"
stroke-width="1pt"
id="path13" />
</marker>
<marker
id="marker1816"
overflow="visible"
orient="auto">
<path
transform="matrix(-.4 0 0 -.4 -4 0)"
d="m0 0 5-5-17.5 5 17.5 5z"
fill-rule="evenodd"
stroke="#000"
stroke-width="1pt"
id="path16" />
</marker>
<marker
id="Arrow1Mend"
overflow="visible"
orient="auto">
<path
transform="matrix(-.4 0 0 -.4 -4 0)"
d="m0 0 5-5-17.5 5 17.5 5z"
fill-rule="evenodd"
stroke="#000"
stroke-width="1pt"
id="path19" />
</marker>
<marker
id="marker11771-4-9"
overflow="visible"
orient="auto">
<path
transform="matrix(-.4 0 0 -.4 -4 0)"
d="m0 0 5-5-17.5 5 17.5 5z"
fill="#f00"
fill-rule="evenodd"
stroke="#ff2a2a"
stroke-width="1pt"
id="path22" />
</marker>
<marker
id="marker1826-2-4-7-1-7"
overflow="visible"
orient="auto">
<path
transform="matrix(-.4 0 0 -.4 -4 0)"
d="m0 0 5-5-17.5 5 17.5 5z"
fill="#00f"
fill-rule="evenodd"
stroke="#00f"
stroke-width="1pt"
id="path25" />
</marker>
</defs>
<metadata
id="metadata30">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title>Thin Provisioning</dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<g
transform="translate(2.6458 2.3956)"
id="g34">
<rect
x="-2.6458"
y="-2.3956"
width="181.24"
height="79.375"
fill="#fffffe"
stroke-width=".26458"
id="rect32" />
</g>
<g
transform="translate(-3.9688 -4.6356)"
id="g170">
<g
stroke="#000"
id="g52">
<g
stroke-width=".26458"
id="g48">
<rect
x="44.979"
y="32.417"
width="22.49"
height="6.6146"
fill="none"
stroke-dasharray="0.52916663, 0.52916663"
id="rect36" />
<rect
x="67.469"
y="32.417"
width="22.49"
height="6.6146"
fill="#d7d7f4"
id="rect38" />
<rect
x="89.958"
y="32.417"
width="22.49"
height="6.6146"
fill="#d7d7f4"
id="rect40" />
<rect
x="112.45"
y="32.417"
width="22.49"
height="6.6146"
fill="none"
stroke-dasharray="0.52916663, 0.52916663"
id="rect42" />
<rect
x="134.94"
y="32.417"
width="22.49"
height="6.6146"
fill="none"
stroke-dasharray="0.52916663, 0.52916663"
id="rect44" />
<rect
x="157.43"
y="32.417"
width="22.49"
height="6.6146"
fill="#d7d7f4"
id="rect46" />
</g>
<rect
x="44.979"
y="46.969"
width="22.49"
height="6.6146"
fill="#f4d7d7"
stroke-dasharray="0.52999997, 0.26499999"
stroke-width=".265"
id="rect50" />
</g>
<text
x="56.412949"
y="51.598957"
fill="#000000"
font-family="sans-serif"
font-size="10.583px"
letter-spacing="0px"
stroke-width="0.26458"
word-spacing="0px"
style="line-height:1.25"
xml:space="preserve"
id="text56"><tspan
x="56.412949"
y="51.598957"
font-family="sans-serif"
font-size="3.5278px"
stroke-width="0.26458"
text-align="center"
text-anchor="middle"
style="font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal"
id="tspan54">26f9a7...</tspan></text>
<rect
x="67.469"
y="46.969"
width="22.49"
height="6.6146"
fill="#f4d7d7"
stroke="#000"
stroke-dasharray="0.52999997, 0.26499999"
stroke-width=".265"
id="rect58" />
<text
x="78.902527"
y="51.598961"
fill="#000000"
font-family="sans-serif"
font-size="10.583px"
letter-spacing="0px"
stroke-width="0.26458"
word-spacing="0px"
style="line-height:1.25"
xml:space="preserve"
id="text62"><tspan
x="78.902527"
y="51.598961"
font-family="sans-serif"
font-size="3.5278px"
stroke-width="0.26458"
text-align="center"
text-anchor="middle"
style="font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal"
id="tspan60">b44ab3...</tspan></text>
<rect
x="89.958"
y="46.969"
width="22.49"
height="6.6146"
fill="#f4d7d7"
stroke="#000"
stroke-dasharray="0.52999997, 0.26499999"
stroke-width=".265"
id="rect64" />
<text
x="101.39211"
y="51.598961"
fill="#000000"
font-family="sans-serif"
font-size="10.583px"
letter-spacing="0px"
stroke-width="0.26458"
word-spacing="0px"
style="line-height:1.25"
xml:space="preserve"
id="text68"><tspan
x="101.39211"
y="51.598961"
font-family="sans-serif"
font-size="3.5278px"
stroke-width="0.26458"
text-align="center"
text-anchor="middle"
style="font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal"
id="tspan66">ee5593...</tspan></text>
<rect
x="112.45"
y="46.969"
width="22.49"
height="6.6146"
fill="#f4d7d7"
stroke="#000"
stroke-dasharray="0.52999997, 0.26499999"
stroke-width=".265"
id="rect70" />
<text
x="123.88169"
y="51.598961"
fill="#000000"
font-family="sans-serif"
font-size="10.583px"
letter-spacing="0px"
stroke-width="0.26458"
word-spacing="0px"
style="line-height:1.25"
xml:space="preserve"
id="text74"><tspan
x="123.88169"
y="51.598961"
font-family="sans-serif"
font-size="3.5278px"
stroke-width="0.26458"
text-align="center"
text-anchor="middle"
style="font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal"
id="tspan72">7a3bfe...</tspan></text>
<rect
x="134.94"
y="46.969"
width="22.49"
height="6.6146"
fill="#f4d7d7"
stroke="#000"
stroke-dasharray="0.52999997, 0.26499999"
stroke-width=".265"
id="rect76" />
<text
x="146.37128"
y="51.598957"
fill="#000000"
font-family="sans-serif"
font-size="10.583px"
letter-spacing="0px"
stroke-width="0.26458"
word-spacing="0px"
style="line-height:1.25"
xml:space="preserve"
id="text80"><tspan
x="146.37128"
y="51.598957"
font-family="sans-serif"
font-size="3.5278px"
stroke-width="0.26458"
text-align="center"
text-anchor="middle"
style="font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal"
id="tspan78">8f4e15...</tspan></text>
<rect
x="157.43"
y="46.969"
width="22.49"
height="6.6146"
fill="#f4d7d7"
stroke="#000"
stroke-dasharray="0.52999997, 0.26499999"
stroke-width=".265"
id="rect82" />
<g
font-family="sans-serif"
letter-spacing="0px"
stroke-width=".26458"
word-spacing="0px"
id="g98">
<text
x="168.86086"
y="51.598961"
font-size="10.583px"
style="line-height:1.25"
xml:space="preserve"
id="text86"><tspan
x="168.86086"
y="51.598961"
font-family="sans-serif"
font-size="3.5278px"
stroke-width="0.26458"
text-align="center"
text-anchor="middle"
style="font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal"
id="tspan84">40c285...</tspan></text>
<text
x="6.6430736"
y="51.680019"
font-size="3.5278px"
style="line-height:1.25;font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal"
xml:space="preserve"
id="text90"><tspan
x="6.6430736"
y="51.680019"
stroke-width="0.26458"
id="tspan88">read-only bdev</tspan></text>
<text
x="6.6296382"
y="12.539818"
font-size="3.5278px"
style="line-height:1.25;font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal"
xml:space="preserve"
id="text96"><tspan
sodipodi:role="line"
id="tspan436"
x="6.6296382"
y="12.539818">esnap clone</tspan><tspan
sodipodi:role="line"
x="6.6296382"
y="16.949568"
id="tspan440">Volume</tspan><tspan
sodipodi:role="line"
id="tspan438"
x="6.6296382"
y="21.359318" /></text>
</g>
<g
stroke="#000"
id="g118">
<path
d="m6.6146 24.479 173.3 1e-6"
fill="none"
stroke-dasharray="1.59, 1.59"
stroke-width=".265"
id="path100" />
<g
fill="#f4d7d7"
stroke-dasharray="0.52916663, 0.26458332"
stroke-width=".26458"
id="g108">
<rect
x="44.979"
y="9.9271"
width="22.49"
height="6.6146"
id="rect102" />
<rect
x="112.45"
y="9.9271"
width="22.49"
height="6.6146"
id="rect104" />
<rect
x="134.94"
y="9.9271"
width="22.49"
height="6.6146"
id="rect106" />
</g>
<g
fill="#d7d7f4"
stroke-width=".26458"
id="g116">
<rect
x="67.469"
y="9.9271"
width="22.49"
height="6.6146"
id="rect110" />
<rect
x="89.958"
y="9.9271"
width="22.49"
height="6.6146"
id="rect112" />
<rect
x="157.43"
y="9.9271"
width="22.49"
height="6.6146"
id="rect114" />
</g>
</g>
<text
x="6.614583"
y="37.708332"
fill="#000000"
font-family="sans-serif"
font-size="3.5278px"
letter-spacing="0px"
stroke-width=".26458"
word-spacing="0px"
style="font-feature-settings:normal;font-variant-caps:normal;font-variant-ligatures:normal;font-variant-numeric:normal;line-height:1.25"
xml:space="preserve"
id="text122"><tspan
x="6.614583"
y="37.708332"
stroke-width=".26458"
id="tspan120">active clusters</tspan></text>
<rect
x="37.042"
y="7.2812"
width="145.52"
height="11.906"
ry="1.3229"
fill="none"
stroke="#999"
stroke-width=".5"
id="rect124" />
<rect
x="37.042"
y="29.771"
width="145.52"
height="26.458"
ry="1.3229"
fill="none"
stroke="#999"
stroke-width=".5"
id="rect126" />
<g
fill="#00f"
stroke="#00f"
id="g144">
<g
stroke-width=".26458"
id="g140">
<path
d="m78.052 16.542v15.875"
marker-end="url(#marker1960)"
id="path128" />
<path
d="m55.562 16.542v30.427"
marker-end="url(#marker2036)"
id="path130" />
<path
d="m100.54 16.542v15.875"
marker-end="url(#marker1890)"
id="path132" />
<path
d="m169.33 16.542v15.875"
marker-end="url(#Arrow1Mend)"
id="path134" />
<path
d="m124.35 16.542v30.427"
marker-end="url(#marker1826)"
id="path136" />
<path
d="m146.84 16.542v30.427"
marker-end="url(#marker1816)"
id="path138" />
</g>
<path
d="m132.29 61.521 10.583 1e-5"
marker-end="url(#marker1826-2-4-7-1-7)"
stroke-width=".265"
id="path142" />
</g>
<path
d="m132.29 66.813h10.583"
fill="#f00"
marker-end="url(#marker11771-4-9)"
stroke="#ff2a2a"
stroke-width=".265"
id="path146" />
<g
stroke-width=".26458"
id="g162">
<text
x="145.52083"
y="62.843975"
fill="#000000"
font-family="sans-serif"
font-size="3.5278px"
letter-spacing="0px"
word-spacing="0px"
style="font-feature-settings:normal;font-variant-caps:normal;font-variant-ligatures:normal;font-variant-numeric:normal;line-height:1.25"
xml:space="preserve"
id="text150"><tspan
x="145.52083"
y="62.843975"
font-family="sans-serif"
font-size="2.8222px"
stroke-width=".26458"
style="font-feature-settings:normal;font-variant-caps:normal;font-variant-ligatures:normal;font-variant-numeric:normal"
id="tspan148">read</tspan></text>
<text
x="145.52083"
y="68.135651"
fill="#000000"
font-family="sans-serif"
font-size="3.5278px"
letter-spacing="0px"
word-spacing="0px"
style="font-feature-settings:normal;font-variant-caps:normal;font-variant-ligatures:normal;font-variant-numeric:normal;line-height:1.25"
xml:space="preserve"
id="text154"><tspan
x="145.52083"
y="68.135651"
font-family="sans-serif"
font-size="2.8222px"
stroke-width=".26458"
style="font-feature-settings:normal;font-variant-caps:normal;font-variant-ligatures:normal;font-variant-numeric:normal"
id="tspan152">allocate and copy cluster</tspan></text>
<rect
x="132.29"
y="70.781"
width="10.583"
height="2.6458"
fill="none"
stroke="#000"
stroke-dasharray="0.52916664, 0.52916664"
id="rect156" />
<text
x="145.52083"
y="73.427307"
fill="#000000"
font-family="sans-serif"
font-size="3.5278px"
letter-spacing="0px"
word-spacing="0px"
style="line-height:1.25;font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal"
xml:space="preserve"
id="text160"><tspan
x="145.52083"
y="73.427307"
font-family="sans-serif"
font-size="2.8222px"
stroke-width="0.26458"
style="font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal"
id="tspan158">external snapshot cluster</tspan></text>
</g>
<rect
x="132.29"
y="76.073"
width="10.583"
height="2.6458"
fill="none"
stroke="#000"
stroke-width=".265"
id="rect164" />
<text
x="145.52083"
y="78.718971"
fill="#000000"
font-family="sans-serif"
font-size="3.5278px"
letter-spacing="0px"
stroke-width=".26458"
word-spacing="0px"
style="font-feature-settings:normal;font-variant-caps:normal;font-variant-ligatures:normal;font-variant-numeric:normal;line-height:1.25"
xml:space="preserve"
id="text168"><tspan
x="145.52083"
y="78.718971"
font-family="sans-serif"
font-size="2.8222px"
stroke-width=".26458"
style="font-feature-settings:normal;font-variant-caps:normal;font-variant-ligatures:normal;font-variant-numeric:normal"
id="tspan166">allocated cluster</tspan></text>
</g>
</svg>



@ -441,6 +441,7 @@ Example response:
"framework_get_subsystems",
"framework_monitor_context_switch",
"spdk_kill_instance",
"accel_set_options",
"accel_set_driver",
"accel_crypto_key_create",
"accel_crypto_key_destroy",
@ -1943,6 +1944,97 @@ Example response:
}
~~~
### accel_set_options {#rpc_accel_set_options}
Set accel framework's options.
#### Parameters
Name | Optional | Type | Description
----------------------- |----------| ----------- | -----------------
small_cache_size | Optional | number | Size of the small iobuf cache
large_cache_size | Optional | number | Size of the large iobuf cache
task_count | Optional | number | Maximum number of tasks per IO channel
sequence_count | Optional | number | Maximum number of sequences per IO channel
buf_count | Optional | number | Maximum number of accel buffers per IO channel
#### Example
Example request:
~~~json
{
"jsonrpc": "2.0",
"method": "accel_set_options",
"id": 1,
"params": {
"small_cache_size": 128,
"large_cache_size": 32
}
}
~~~
Example response:
~~~json
{
"jsonrpc": "2.0",
"id": 1,
"result": true
}
~~~
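As a sketch of how such a request could be issued without `scripts/rpc.py`, the JSON-RPC payload can be written to SPDK's Unix-domain RPC socket directly. The default socket path and the helper names below are assumptions for illustration, not part of the RPC itself:

```python
import json
import socket

def build_request(method, params=None, req_id=1):
    """Build a JSON-RPC 2.0 request body like the example above."""
    req = {"jsonrpc": "2.0", "method": method, "id": req_id}
    if params is not None:
        req["params"] = params
    return json.dumps(req)

def call_rpc(method, params=None, sock_path="/var/tmp/spdk.sock"):
    """Send one request over SPDK's Unix-domain RPC socket, read one reply."""
    with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
        sock.connect(sock_path)
        sock.sendall(build_request(method, params).encode())
        return json.loads(sock.recv(65536).decode())

# Usage against a running SPDK app (typically started with --wait-for-rpc,
# since these options take effect at framework initialization):
#   call_rpc("accel_set_options",
#            {"small_cache_size": 128, "large_cache_size": 32})
```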
### accel_get_stats {#rpc_accel_get_stats}
Retrieve accel framework's statistics. Statistics for opcodes that have never been executed (i.e.
all their stats are at 0) aren't included in the `operations` array.
#### Parameters
None.
#### Example
Example request:
~~~json
{
"jsonrpc": "2.0",
"method": "accel_get_stats",
"id": 1
}
~~~
Example response:
~~~json
{
"jsonrpc": "2.0",
"id": 1,
"result": {
"sequence_executed": 256,
"sequence_failed": 0,
"operations": [
{
"opcode": "copy",
"executed": 256,
"failed": 0
},
{
"opcode": "encrypt",
"executed": 128,
"failed": 0
},
{
"opcode": "decrypt",
"executed": 128,
"failed": 0
}
]
}
}
~~~
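A caller consuming this response might aggregate the per-opcode counters; a minimal sketch (the `example` dict mirrors the example response above, and the helper name is an assumption):

```python
def summarize_accel_stats(stats):
    """Aggregate per-opcode counters from an accel_get_stats result.
    Opcodes with all-zero stats are already omitted by the RPC itself."""
    executed = sum(op["executed"] for op in stats.get("operations", []))
    failed = sum(op["failed"] for op in stats.get("operations", []))
    return {
        "total_executed": executed,
        "total_failed": failed,
        "failure_rate": failed / executed if executed else 0.0,
        "sequence_executed": stats.get("sequence_executed", 0),
    }

example = {
    "sequence_executed": 256,
    "sequence_failed": 0,
    "operations": [
        {"opcode": "copy", "executed": 256, "failed": 0},
        {"opcode": "encrypt", "executed": 128, "failed": 0},
        {"opcode": "decrypt", "executed": 128, "failed": 0},
    ],
}
```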
### compressdev_scan_accel_module {#rpc_compressdev_scan_accel_module}
Set config and enable compressdev accel module offload.
@ -2956,6 +3048,9 @@ Example request:
~~~json
{
"params": {
"name": "ocf0"
},
"jsonrpc": "2.0",
"method": "bdev_ocf_get_stats",
"id": 1
@ -3140,6 +3235,45 @@ Example response:
}
~~~
### bdev_ocf_reset_stats {#rpc_bdev_ocf_reset_stats}
Reset the statistics of a chosen OCF block device.
#### Parameters
Name | Optional | Type | Description
----------------------- | -------- | ----------- | -----------
name | Required | string | Block device name
#### Response
Completion status of the reset statistics operation, returned as a boolean.
#### Example
Example request:
~~~json
{
"params": {
"name": "ocf0"
},
"jsonrpc": "2.0",
"method": "bdev_ocf_reset_stats",
"id": 1
}
~~~
Example response:
~~~json
{
"jsonrpc": "2.0",
"id": 1,
"result": true
}
~~~
### bdev_ocf_get_bdevs {#rpc_bdev_ocf_get_bdevs}
Get list of OCF devices including unregistered ones.
@ -3847,11 +3981,12 @@ hdgst | Optional | bool | Enable TCP header digest
ddgst | Optional | bool | Enable TCP data digest
fabrics_connect_timeout_us | Optional | bool | Timeout for fabrics connect (in microseconds)
multipath | Optional | string | Multipathing behavior: disable, failover, multipath. Default is failover.
num_io_queues | Optional | uint32_t | The number of IO queues to request during initialization. Range: (0, UINT16_MAX + 1], Default is 1024.
num_io_queues | Optional | number | The number of IO queues to request during initialization. Range: (0, UINT16_MAX + 1], Default is 1024.
ctrlr_loss_timeout_sec | Optional | number | Time to wait until ctrlr is reconnected before deleting ctrlr. -1 means infinite reconnects. 0 means no reconnect.
reconnect_delay_sec | Optional | number | Time to delay a reconnect trial. 0 means no reconnect.
fast_io_fail_timeout_sec | Optional | number | Time to wait until ctrlr is reconnected before failing I/O to ctrlr. 0 means no such timeout.
psk | Optional | string | PSK in hexadecimal digits, e.g. 1234567890ABCDEF (Enables SSL socket implementation for TCP)
max_bdevs | Optional | number | The size of the name array for newly created bdevs. Default is 128.
#### Example
@ -5208,6 +5343,7 @@ Construct error bdev.
Name | Optional | Type | Description
----------------------- | -------- | ----------- | -----------
base_name | Required | string | Base bdev name
uuid | Optional | string | UUID for this bdev
#### Example
@ -9542,6 +9678,55 @@ Example response:
}
~~~
### bdev_lvol_clone_bdev {#rpc_bdev_lvol_clone_bdev}
Create a logical volume based on an external snapshot bdev. The external snapshot bdev
is a bdev that will not be written to by any consumer and must not be an lvol in the
same lvstore as the clone.
Regardless of whether the bdev is specified by name or UUID, the bdev UUID will be stored
in the logical volume's metadata for use while the lvolstore is loading. For this reason,
it is important that the bdev chosen has a static UUID.
#### Parameters
Name | Optional | Type | Description
----------------------- | -------- | ----------- | -----------
bdev | Required | string | Name or UUID for bdev that acts as the external snapshot
lvs_name | Required | string | logical volume store name
clone_name | Required | string | Name for the logical volume to create
#### Response
UUID of the created logical volume clone is returned.
#### Example
Example request:
~~~json
{
"jsonrpc": "2.0",
"method": "bdev_lvol_clone_bdev",
"id": 1,
"params": {
"bdev_uuid": "e4b40d8b-f623-416d-8234-baf5a4c83cbd",
"lvs_name": "lvs1",
"clone_name": "clone2"
}
}
~~~
Example response:
~~~json
{
"jsonrpc": "2.0",
"id": 1,
"result": "336f662b-08e5-4006-8e06-e2023f7f9886"
}
~~~
### bdev_lvol_rename {#rpc_bdev_lvol_rename}
Rename a logical volume. New name will rename only the alias of the logical volume.
@ -9760,6 +9945,58 @@ Example response:
}
~~~
### bdev_lvol_get_lvols {#rpc_bdev_lvol_get_lvols}
Get a list of logical volumes. This list can be limited by lvol store and will display volumes even if
they are degraded. Degraded lvols do not have an associated bdev, thus this RPC call may return lvols
not returned by `bdev_get_bdevs`.
#### Parameters
Name | Optional | Type | Description
----------------------- | -------- | ----------- | -----------
lvs_uuid | Optional | string | Only show volumes in the logical volume store with this UUID
lvs_name | Optional | string | Only show volumes in the logical volume store with this name
Either lvs_uuid or lvs_name may be specified, but not both.
If both lvs_uuid and lvs_name are omitted, information about lvols in all logical volume stores is returned.
#### Example
Example request:
~~~json
{
"jsonrpc": "2.0",
"method": "bdev_lvol_get_lvols",
"id": 1,
"params": {
"lvs_name": "lvs_test"
}
}
~~~
Example response:
~~~json
[
{
"alias": "lvs_test/lvol1",
"uuid": "b335c368-851d-4099-81e0-018cc494fdf6",
"name": "lvol1",
"is_thin_provisioned": false,
"is_snapshot": false,
"is_clone": false,
"is_esnap_clone": false,
"is_degraded": false,
"lvs": {
"name": "lvs_test",
"uuid": "a1c8d950-5715-4558-936d-ab9e6eca0794"
}
}
]
~~~
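Since degraded lvols appear here but not in `bdev_get_bdevs`, a monitoring script might scan this response for them; a sketch over a response shaped like the example above (the second, degraded entry is hypothetical):

```python
def degraded_lvols(lvols):
    """Return the aliases of lvols flagged as degraded.  Degraded lvols have
    no associated bdev, so they only show up in bdev_lvol_get_lvols."""
    return [lv["alias"] for lv in lvols if lv.get("is_degraded")]

response = [
    {"alias": "lvs_test/lvol1", "is_degraded": False},
    {"alias": "lvs_test/lvol2", "is_degraded": True},   # hypothetical degraded lvol
]
```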
## RAID
### bdev_raid_get_bdevs {#rpc_bdev_raid_get_bdevs}


@ -74,6 +74,18 @@ A snapshot can be removed only if there is a single clone on top of it. The rela
The cluster map of clone and snapshot will be merged and entries for unallocated clusters in the clone will be updated with
addresses from the snapshot cluster map. The entire operation modifies metadata only - no data is copied during this process.
### External Snapshots
With the external snapshots feature, clones can be made of any bdev. These clones are commonly called *esnap clones*.
Esnap clones work very similarly to thin provisioning: rather than the back device being a zeroes device, the external
snapshot bdev is used as the back device.
![Clone of External Snapshot](lvol_esnap_clone.svg)
A bdev that is used as an external snapshot cannot be opened for writing by anything else so long as an esnap clone exists.
A bdev may have multiple esnap clones and esnap clones can themselves be snapshotted and cloned.
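The analogy to thin provisioning can be sketched at cluster granularity: reads of unallocated clusters fall through to the external snapshot, while writes allocate clusters in the clone. This toy model is illustrative only (real lvols copy the affected cluster from the snapshot before a partial write):

```python
class EsnapClone:
    """Toy model of an esnap clone's cluster map: reads of unallocated
    clusters fall through to the read-only external snapshot bdev;
    writes allocate a cluster in the clone."""

    def __init__(self, esnap_clusters):
        self.esnap = list(esnap_clusters)  # read-only external snapshot
        self.cluster_map = {}              # allocated clusters only

    def read(self, cluster):
        if cluster in self.cluster_map:
            return self.cluster_map[cluster]
        return self.esnap[cluster]         # back device, as in thin provisioning

    def write(self, cluster, data):
        self.cluster_map[cluster] = data   # allocate-and-copy happens here

clone = EsnapClone(["a", "b", "c"])
clone.write(1, "B")                        # only cluster 1 gets allocated
```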
### Inflation {#lvol_inflation}
Blobs can be inflated to copy data from backing devices (e.g. snapshots) and allocate all remaining clusters. As a result of this
@ -138,6 +150,12 @@ bdev_lvol_create [-h] [-u UUID] [-l LVS_NAME] [-t] [-c CLEAR_METHOD] lvol_name s
optional arguments:
-h, --help show help
-c, --clear-method specify data clusters clear method "none", "unmap" (default), "write_zeroes"
bdev_lvol_get_lvols [-h] [-u LVS_UUID] [-l LVS_NAME]
Display logical volume list, including those that do not have associated bdevs.
optional arguments:
-h, --help show help
-u LVS_UUID, --lvs_uuid UUID show volumes only in the specified lvol store
-l LVS_NAME, --lvs_name LVS_NAME show volumes only in the specified lvol store
bdev_get_bdevs [-h] [-b NAME]
User can view created bdevs using this call including those created on top of lvols.
optional arguments:
@ -155,6 +173,10 @@ bdev_lvol_clone [-h] snapshot_name clone_name
Create a clone with clone_name of a given lvol snapshot.
optional arguments:
-h, --help show help
bdev_lvol_clone_bdev [-h] bdev_name_or_uuid lvs_name clone_name
Create a clone with clone_name of a bdev. The bdev must not be an lvol in the lvs_name lvstore.
optional arguments:
-h, --help show help
bdev_lvol_rename [-h] old_name new_name
Change lvol bdev name
optional arguments:


@ -5,6 +5,7 @@
- [SPDK 23.01 NVMe Bdev Performance Report](https://ci.spdk.io/download/performance-reports/SPDK_nvme_bdev_perf_report_2301.pdf)
- [SPDK 23.01 Vhost Performance Report](https://ci.spdk.io/download/performance-reports/SPDK_vhost_perf_report_2301.pdf)
- [SPDK 23.01 NVMe-oF TCP Performance Report (Mellanox ConnectX-5)](https://ci.spdk.io/download/performance-reports/SPDK_tcp_mlx_perf_report_2301.pdf)
- [SPDK 23.01 NVMe-oF TCP Performance Report (Intel E810-CQDA2)](https://ci.spdk.io/download/performance-reports/SPDK_tcp_cvl_perf_report_2301.pdf)
- [SPDK 23.01 NVMe-oF RDMA Performance Report (Mellanox ConnectX-5)](https://ci.spdk.io/download/performance-reports/SPDK_rdma_mlx_perf_report_2301.pdf)
- [SPDK 23.01 NVMe-oF RDMA Performance Report (Intel E810-CQDA2 iWARP)](https://ci.spdk.io/download/performance-reports/SPDK_rdma_cvl_iwarp_perf_report_2301.pdf)
- [SPDK 23.01 NVMe-oF RDMA Performance Report (Intel E810-CQDA2 RoCEv2)](https://ci.spdk.io/download/performance-reports/SPDK_rdma_cvl_roce_perf_report_2301.pdf)


@ -103,6 +103,10 @@ endif
DPDK_CFLAGS += -fPIC
ifneq ($(CONFIG_MAX_LCORES),)
DPDK_OPTS += -Dmax_lcores=$(CONFIG_MAX_LCORES)
endif
ifeq ($(CONFIG_WERROR),y)
DPDK_CFLAGS += -Werror
else


@ -87,7 +87,7 @@ struct ap_task {
struct worker_thread {
struct spdk_io_channel *ch;
uint64_t xfer_completed;
struct spdk_accel_opcode_stats stats;
uint64_t xfer_failed;
uint64_t injected_miscompares;
uint64_t current_queue_depth;
@ -270,6 +270,8 @@ unregister_worker(void *arg1)
{
struct worker_thread *worker = arg1;
spdk_accel_get_opcode_stats(worker->ch, worker->workload,
&worker->stats, sizeof(worker->stats));
free(worker->task_base);
spdk_put_io_channel(worker->ch);
spdk_thread_exit(spdk_get_thread());
@ -668,7 +670,6 @@ accel_done(void *arg1, int status)
worker->xfer_failed++;
}
worker->xfer_completed++;
worker->current_queue_depth--;
if (!worker->is_draining && status == 0) {
@ -693,11 +694,11 @@ dump_result(void)
printf("------------------------------------------------------------------------\n");
while (worker != NULL) {
uint64_t xfer_per_sec = worker->xfer_completed / g_time_in_sec;
uint64_t bw_in_MiBps = (worker->xfer_completed * g_xfer_size_bytes) /
uint64_t xfer_per_sec = worker->stats.executed / g_time_in_sec;
uint64_t bw_in_MiBps = worker->stats.num_bytes /
(g_time_in_sec * 1024 * 1024);
total_completed += worker->xfer_completed;
total_completed += worker->stats.executed;
total_failed += worker->xfer_failed;
total_miscompared += worker->injected_miscompares;
@ -1114,6 +1115,26 @@ error_end:
spdk_app_stop(rc);
}
static void
worker_shutdown(void *ctx)
{
_worker_stop(ctx);
}
static void
shutdown_cb(void)
{
struct worker_thread *worker;
pthread_mutex_lock(&g_workers_lock);
worker = g_workers;
while (worker) {
spdk_thread_send_msg(worker->thread, worker_shutdown, worker);
worker = worker->next;
}
pthread_mutex_unlock(&g_workers_lock);
}
int
main(int argc, char **argv)
{
@ -1123,6 +1144,7 @@ main(int argc, char **argv)
spdk_app_opts_init(&g_opts, sizeof(g_opts));
g_opts.name = "accel_perf";
g_opts.reactor_mask = "0x1";
g_opts.shutdown_cb = shutdown_cb;
if (spdk_app_parse_args(argc, argv, &g_opts, "a:C:o:q:t:yw:P:f:T:l:x:", NULL, parse_args,
usage) != SPDK_APP_PARSE_ARGS_SUCCESS) {
g_rc = -1;


@ -1865,7 +1865,7 @@ config_filename_next(const char *filename, char *out)
for (i = 0, k = 0;
filename[i] != '\0' &&
filename[i] != ':' &&
i < BDEVPERF_CONFIG_MAX_FILENAME;
k < BDEVPERF_CONFIG_MAX_FILENAME;
i++) {
if (filename[i] == ' ' || filename[i] == '\t') {
continue;
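The one-character fix above changes the loop bound from the input index `i` to the output index `k`. Because whitespace is skipped (advancing `i` without writing anything), `i` runs ahead of `k`, so bounding on `i` could truncate a whitespace-padded filename early; the corrected bound counts only characters actually written. A Python sketch of the fixed logic (the tiny buffer size is illustrative, not the real constant):

```python
BDEVPERF_CONFIG_MAX_FILENAME = 8  # illustrative bound; the real value differs

def config_filename_next(filename):
    """Sketch of the fixed loop: copy characters up to the next ':' or the
    end of input, skipping spaces/tabs, bounding on the number of characters
    *written* (k in the C code), not the number scanned (i)."""
    out = []
    for ch in filename:
        if ch in ("\0", ":"):
            break
        if len(out) >= BDEVPERF_CONFIG_MAX_FILENAME:
            break
        if ch in (" ", "\t"):
            continue   # skipped whitespace advances i but not k
        out.append(ch)
    return "".join(out)
```

With the old `i` bound, `"   abcdefghij"` would lose its tail because the three leading spaces consume part of the scan budget; bounding on the output index copies the full eight characters.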


@ -619,6 +619,10 @@ spdk_fio_setup_oat(void *_ctx)
struct spdk_bdev *bdev;
if (strcmp(f->file_name, "*") == 0) {
/* Explicitly set file size to 0 here to make sure fio doesn't try to
* actually send I/O to this "*" file.
*/
f->real_file_size = 0;
continue;
}
@ -1436,7 +1440,7 @@ static struct fio_option options[] = {
struct ioengine_ops ioengine = {
.name = "spdk_bdev",
.version = FIO_IOOPS_VERSION,
.flags = FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN,
.flags = FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN | FIO_DISKLESSIO,
.setup = spdk_fio_setup,
.init = spdk_fio_init,
/* .prep = unused, */


@ -523,7 +523,7 @@ _submit_single(struct idxd_chan_entry *t, struct idxd_task *task)
/* For fill use the first byte of the task->dst buffer */
diov.iov_base = task->dst;
diov.iov_len = g_xfer_size_bytes;
rc = spdk_idxd_submit_fill(t->ch, &diov, 1, *(uint8_t *)task->src,
rc = spdk_idxd_submit_fill(t->ch, &diov, 1, *(uint64_t *)task->src,
flags, idxd_done, task);
break;
case IDXD_CRC32C:


@ -200,3 +200,17 @@ smalloc: OOM. Consider using --alloc-size to increase the shared memory availabl
This is because fio needs to allocate memory for the zone-report, that is, to retrieve the state of
zones on the device including auxiliary accounting information. To solve this, you can follow
fio's advice and increase ``--alloc-size``.
## FDP
To use an FDP-enabled device, build and run the io-engine against fio version >= 3.34 and add:
```bash
fdp=1
```
to your fio script. Also have a look at the example script provided with fio:
```bash
fio/examples/uring-cmd-fdp.fio
```
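A minimal job sketch combining these pieces might look like the following. The device address, the `enable_sgl` option (suggested by the plugin's requirement that write directives use SGL), and the sizes are placeholders; consult fio's bundled example above for authoritative options:

```bash
; hypothetical fio job for the SPDK nvme plugin with FDP enabled
[global]
ioengine=spdk           ; SPDK external io-engine
thread=1
fdp=1                   ; requires fio >= 3.34
enable_sgl=1            ; write directives require SGL in this plugin
rw=write
bs=4k

[fdp-job]
filename=trtype=PCIe traddr=0000.04.00.0 ns=1   ; placeholder PCIe address
size=1G
```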


@ -11,6 +11,7 @@
#include "spdk/env.h"
#include "spdk/string.h"
#include "spdk/log.h"
#include "spdk/likely.h"
#include "spdk/endian.h"
#include "spdk/dif.h"
#include "spdk/util.h"
@ -21,8 +22,10 @@
#ifdef for_each_rw_ddir
#define FIO_HAS_ZBD (FIO_IOOPS_VERSION >= 26)
#define FIO_HAS_FDP (FIO_IOOPS_VERSION >= 32)
#else
#define FIO_HAS_ZBD (0)
#define FIO_HAS_FDP (0)
#endif
/* FreeBSD is missing CLOCK_MONOTONIC_RAW,
@ -1020,6 +1023,9 @@ spdk_fio_queue(struct thread_data *td, struct io_u *io_u)
struct spdk_nvme_ns *ns = NULL;
void *md_buf = NULL;
struct spdk_dif_ctx *dif_ctx = &fio_req->dif_ctx;
#if FIO_HAS_FDP
struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;
#endif
uint32_t block_size;
uint64_t lba;
uint32_t lba_count;
@ -1039,6 +1045,15 @@ spdk_fio_queue(struct thread_data *td, struct io_u *io_u)
lba = io_u->offset / block_size;
lba_count = io_u->xfer_buflen / block_size;
#if FIO_HAS_FDP
/* Only SGL support for write command with directives */
if (io_u->ddir == DDIR_WRITE && io_u->dtype && !g_spdk_enable_sgl) {
log_err("spdk/nvme: queue() directives require SGL to be enabled\n");
io_u->error = -EINVAL;
return FIO_Q_COMPLETED;
}
#endif
/* TODO: considering situations that fio will randomize and verify io_u */
if (fio_qpair->nvme_pi_enabled) {
if (fio_qpair->extended_lba) {
@ -1081,6 +1096,19 @@ spdk_fio_queue(struct thread_data *td, struct io_u *io_u)
}
} else {
if (!fio_qpair->zone_append_enabled) {
#if FIO_HAS_FDP
if (spdk_unlikely(io_u->dtype)) {
ext_opts.io_flags = fio_qpair->io_flags | (io_u->dtype << 20);
ext_opts.metadata = md_buf;
ext_opts.cdw13 = (io_u->dspec << 16);
ext_opts.apptag = dif_ctx->app_tag;
ext_opts.apptag_mask = dif_ctx->apptag_mask;
rc = spdk_nvme_ns_cmd_writev_ext(ns, fio_qpair->qpair, lba, lba_count,
spdk_fio_completion_cb, fio_req,
spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, &ext_opts);
break;
}
#endif
rc = spdk_nvme_ns_cmd_writev_with_md(ns, fio_qpair->qpair, lba,
lba_count, spdk_fio_completion_cb, fio_req, fio_qpair->io_flags,
spdk_nvme_io_reset_sgl, spdk_nvme_io_next_sge, md_buf,
@ -1419,6 +1447,58 @@ spdk_fio_get_max_open_zones(struct thread_data *td, struct fio_file *f,
}
#endif
#if FIO_HAS_FDP
static int
spdk_fio_fdp_fetch_ruhs(struct thread_data *td, struct fio_file *f,
struct fio_ruhs_info *fruhs_info)
{
struct spdk_fio_thread *fio_thread = td->io_ops_data;
struct spdk_fio_qpair *fio_qpair = NULL;
struct spdk_nvme_qpair *tmp_qpair;
struct {
struct spdk_nvme_fdp_ruhs ruhs;
struct spdk_nvme_fdp_ruhs_desc desc[128];
} fdp_ruhs;
uint16_t idx;
int completed = 0, err;
fio_qpair = get_fio_qpair(fio_thread, f);
if (!fio_qpair) {
log_err("spdk/nvme: no ns/qpair or file_name: '%s'\n", f->file_name);
return -ENODEV;
}
/* qpair has not been allocated yet (it gets allocated in spdk_fio_open()).
* Create a temporary qpair in order to fetch the reclaim unit handle status (RUHS).
*/
assert(!fio_qpair->qpair);
tmp_qpair = spdk_nvme_ctrlr_alloc_io_qpair(fio_qpair->fio_ctrlr->ctrlr, NULL, 0);
if (!tmp_qpair) {
log_err("spdk/nvme: cannot allocate a temporary qpair\n");
return -EIO;
}
err = spdk_nvme_ns_cmd_io_mgmt_recv(fio_qpair->ns, tmp_qpair, &fdp_ruhs, sizeof(fdp_ruhs),
SPDK_NVME_FDP_IO_MGMT_RECV_RUHS, 0, pcu_cb, &completed);
if (err || pcu(tmp_qpair, &completed) || completed < 0) {
log_err("spdk/nvme: fetch_ruhs(): err: %d, cpl: %d\n", err, completed);
err = err ? err : -EIO;
goto exit;
}
fruhs_info->nr_ruhs = fdp_ruhs.ruhs.nruhsd;
for (idx = 0; idx < fdp_ruhs.ruhs.nruhsd; idx++) {
fruhs_info->plis[idx] = fdp_ruhs.desc[idx].pid;
}
exit:
spdk_nvme_ctrlr_free_io_qpair(tmp_qpair);
return err;
}
#endif
static void
spdk_fio_cleanup(struct thread_data *td)
{
@ -1723,7 +1803,10 @@ struct ioengine_ops ioengine = {
#if FIO_IOOPS_VERSION >= 30
.get_max_open_zones = spdk_fio_get_max_open_zones,
#endif
.flags = FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN,
#if FIO_HAS_FDP
.fdp_fetch_ruhs = spdk_fio_fdp_fetch_ruhs,
#endif
.flags = FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN | FIO_DISKLESSIO,
.options = options,
.option_struct_size = sizeof(struct spdk_fio_options),
};
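For context on the `ext_opts` packing above: in an NVMe write command, DTYPE occupies bits 23:20 of CDW12 (merged here into `io_flags`) and DSPEC occupies bits 31:16 of CDW13. A small self-contained sketch of that packing (helper names are illustrative, not part of the plugin):

```c
#include <assert.h>
#include <stdint.h>

/* DTYPE -> CDW12 bits 23:20, ORed into the existing io_flags. */
static uint32_t
pack_io_flags(uint32_t base_flags, uint32_t dtype)
{
	return base_flags | (dtype << 20);
}

/* DSPEC -> CDW13 bits 31:16. */
static uint32_t
pack_cdw13(uint16_t dspec)
{
	return (uint32_t)dspec << 16;
}
```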

View File

@ -207,9 +207,9 @@ static bool g_vmd;
static const char *g_workload_type;
static TAILQ_HEAD(, ctrlr_entry) g_controllers = TAILQ_HEAD_INITIALIZER(g_controllers);
static TAILQ_HEAD(, ns_entry) g_namespaces = TAILQ_HEAD_INITIALIZER(g_namespaces);
static int g_num_namespaces;
static uint32_t g_num_namespaces;
static TAILQ_HEAD(, worker_thread) g_workers = TAILQ_HEAD_INITIALIZER(g_workers);
static int g_num_workers = 0;
static uint32_t g_num_workers = 0;
static bool g_use_every_core = false;
static uint32_t g_main_core;
static pthread_barrier_t g_worker_sync_barrier;
@ -983,7 +983,8 @@ nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
struct ns_entry *entry = ns_ctx->entry;
struct spdk_nvme_poll_group *group;
struct spdk_nvme_qpair *qpair;
int i;
uint64_t poll_timeout_tsc;
int i, rc;
ns_ctx->u.nvme.num_active_qpairs = g_nr_io_queues_per_ns;
ns_ctx->u.nvme.num_all_qpairs = g_nr_io_queues_per_ns + g_nr_unused_io_queues;
@ -998,6 +999,7 @@ nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
}
opts.delay_cmd_submit = true;
opts.create_only = true;
opts.async_mode = true;
ns_ctx->u.nvme.group = spdk_nvme_poll_group_create(NULL, NULL);
if (ns_ctx->u.nvme.group == NULL) {
@ -1027,7 +1029,22 @@ nvme_init_ns_worker_ctx(struct ns_worker_ctx *ns_ctx)
}
}
/* Busy poll here until all qpairs are connected - this ensures once we start
* I/O we aren't still waiting for some qpairs to connect. Limit the poll to
* 10 seconds though.
*/
poll_timeout_tsc = spdk_get_ticks() + 10 * spdk_get_ticks_hz();
rc = -EAGAIN;
while (spdk_get_ticks() < poll_timeout_tsc && rc == -EAGAIN) {
spdk_nvme_poll_group_process_completions(group, 0, perf_disconnect_cb);
rc = spdk_nvme_poll_group_all_connected(group);
if (rc == 0) {
return 0;
}
}
/* If we reach here, it means we either timed out, or some connection failed. */
assert(spdk_get_ticks() > poll_timeout_tsc || rc == -EIO);
qpair_failed:
for (; i > 0; --i) {
@ -1265,7 +1282,7 @@ register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
entry->fn_table = &nvme_fn_table;
entry->u.nvme.ctrlr = ctrlr;
entry->u.nvme.ns = ns;
entry->num_io_requests = g_queue_depth * entries;
entry->num_io_requests = entries * spdk_divide_round_up(g_queue_depth, g_nr_io_queues_per_ns);
entry->size_in_ios = ns_size / g_io_size_bytes;
entry->io_size_blocks = g_io_size_bytes / sector_size;
@ -2843,6 +2860,9 @@ probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
memcpy(opts->hostnqn, trid_entry->hostnqn, sizeof(opts->hostnqn));
opts->transport_tos = g_transport_tos;
if (opts->num_io_queues < g_num_workers * g_nr_io_queues_per_ns) {
opts->num_io_queues = g_num_workers * g_nr_io_queues_per_ns;
}
return true;
}
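The new connect-wait code above busy-polls with a TSC deadline, returning early once all qpairs connect. A self-contained sketch of that deadline pattern (a fake tick counter and connection state stand in for the SPDK calls, which are not reimplemented here):

```c
#include <assert.h>
#include <errno.h>
#include <stdint.h>

static uint64_t g_ticks;                      /* fake TSC */
static uint64_t ticks(void) { return g_ticks++; }

struct group { int polls_until_connected; };  /* stand-in for a poll group */

/* Each poll makes some progress toward connecting. */
static void
process_completions(struct group *g)
{
	if (g->polls_until_connected > 0) {
		g->polls_until_connected--;
	}
}

/* 0 when connected, -EAGAIN while still connecting. */
static int
all_connected(struct group *g)
{
	return g->polls_until_connected == 0 ? 0 : -EAGAIN;
}

/* Busy-poll until connected or the deadline passes, mirroring the loop above. */
static int
wait_all_connected(struct group *g, uint64_t timeout_ticks)
{
	uint64_t deadline = ticks() + timeout_ticks;
	int rc = -EAGAIN;

	while (ticks() < deadline && rc == -EAGAIN) {
		process_completions(g);
		rc = all_connected(g);
		if (rc == 0) {
			return 0;
		}
	}
	/* Timed out (-EAGAIN) or a connection failed (another errno). */
	return rc;
}
```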

View File

@ -411,6 +411,8 @@ submit_single_io(struct perf_task *task)
if (spdk_unlikely(rc != 0)) {
fprintf(stderr, "starting I/O failed\n");
spdk_dma_free(task->iov.iov_base);
free(task);
} else {
ns_ctx->current_queue_depth++;
}
@ -537,7 +539,6 @@ work_fn(void *arg)
ns_ctx->is_draining = true;
}
if (ns_ctx->current_queue_depth > 0) {
check_io(ns_ctx);
if (ns_ctx->current_queue_depth == 0) {
nvme_cleanup_ns_worker_ctx(ns_ctx);
@ -545,7 +546,6 @@ work_fn(void *arg)
unfinished_ns_ctx++;
}
}
}
} while (unfinished_ns_ctx > 0);
return 0;

View File

@ -471,6 +471,27 @@ int spdk_accel_append_decrypt(struct spdk_accel_sequence **seq, struct spdk_io_c
uint64_t iv, uint32_t block_size, int flags,
spdk_accel_step_cb cb_fn, void *cb_arg);
/**
* Append a crc32c operation to a sequence.
*
* \param seq Sequence object. If NULL, a new sequence object will be created.
* \param ch I/O channel.
* \param dst Destination to write the calculated value.
* \param iovs Source I/O vector array.
* \param iovcnt Size of the `iovs` array.
* \param domain Memory domain to which the source buffers belong.
* \param domain_ctx Source buffer domain context.
* \param seed Initial value.
* \param cb_fn Callback to be executed once this operation is completed.
* \param cb_arg Argument to be passed to `cb_fn`.
*
* \return 0 if operation was successfully added to the sequence, negative errno otherwise.
*/
int spdk_accel_append_crc32c(struct spdk_accel_sequence **seq, struct spdk_io_channel *ch,
uint32_t *dst, struct iovec *iovs, uint32_t iovcnt,
struct spdk_memory_domain *domain, void *domain_ctx,
uint32_t seed, spdk_accel_step_cb cb_fn, void *cb_arg);
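The `seed` parameter is what lets a CRC be carried across buffers or chained operations. A minimal bitwise CRC-32C sketch (not SPDK's implementation) showing how passing the previous result as the seed continues the computation:

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Bitwise CRC-32C (Castagnoli, reflected polynomial 0x82F63B78).
 * Pass 0 as the seed for a fresh CRC, or a previous result to
 * continue the computation across multiple buffers. */
static uint32_t
crc32c_update(uint32_t seed, const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t crc = ~seed;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++) {
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
		}
	}
	return ~crc;
}
```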
/**
* Finish a sequence and execute all its operations. After the completion callback is executed, the
* sequence object is automatically freed.
@ -478,10 +499,8 @@ int spdk_accel_append_decrypt(struct spdk_accel_sequence **seq, struct spdk_io_c
* \param seq Sequence to finish.
* \param cb_fn Completion callback to be executed once all operations are executed.
* \param cb_arg Argument to be passed to `cb_fn`.
*
* \return 0 on success, negative errno otherwise.
*/
int spdk_accel_sequence_finish(struct spdk_accel_sequence *seq,
void spdk_accel_sequence_finish(struct spdk_accel_sequence *seq,
spdk_accel_completion_cb cb_fn, void *cb_arg);
/**
@ -641,6 +660,57 @@ int spdk_accel_set_driver(const char *name);
*/
struct spdk_memory_domain *spdk_accel_get_memory_domain(void);
struct spdk_accel_opts {
/** Size of this structure */
size_t size;
/** Size of the small iobuf cache */
uint32_t small_cache_size;
/** Size of the large iobuf cache */
uint32_t large_cache_size;
/** Maximum number of tasks per IO channel */
uint32_t task_count;
/** Maximum number of sequences per IO channel */
uint32_t sequence_count;
/** Maximum number of accel buffers per IO channel */
uint32_t buf_count;
} __attribute__((packed));
/**
* Set the options for the accel framework.
*
* \param opts Accel options.
*
* \return 0 on success, negative errno otherwise.
*/
int spdk_accel_set_opts(const struct spdk_accel_opts *opts);
/**
* Get the options for the accel framework.
*
* \param opts Accel options.
*/
void spdk_accel_get_opts(struct spdk_accel_opts *opts);
struct spdk_accel_opcode_stats {
/** Number of executed operations */
uint64_t executed;
/** Number of failed operations */
uint64_t failed;
/** Number of processed bytes */
uint64_t num_bytes;
} __attribute__((packed));
/**
* Retrieve opcode statistics for a given IO channel.
*
* \param ch I/O channel.
* \param opcode Operation to retrieve statistics for.
* \param stats Per-channel statistics.
* \param size Size of the `stats` structure.
*/
void spdk_accel_get_opcode_stats(struct spdk_io_channel *ch, enum accel_opcode opcode,
struct spdk_accel_opcode_stats *stats, size_t size);
#ifdef __cplusplus
}
#endif

View File

@ -65,6 +65,7 @@ struct spdk_accel_task {
void *src_domain_ctx;
struct spdk_memory_domain *dst_domain;
void *dst_domain_ctx;
uint64_t nbytes;
union {
struct {
struct iovec *iovs; /* iovs passed by the caller */

View File

@ -509,7 +509,7 @@ struct spdk_bdev {
/**
* UUID for this bdev.
*
* Fill with zeroes if no uuid is available.
* If not provided, it will be generated by the bdev layer.
*/
struct spdk_uuid uuid;
@ -953,6 +953,9 @@ struct spdk_bdev_io {
/** Indicates whether the IO is split */
bool split;
/** Retry state (resubmit, re-pull, re-push, etc.) */
uint8_t retry_state;
/** bdev allocated memory associated with this request */
void *buf;
@ -1460,6 +1463,24 @@ int spdk_bdev_part_base_construct_ext(const char *bdev_name,
spdk_io_channel_destroy_cb ch_destroy_cb,
struct spdk_bdev_part_base **base);
/** Options used when constructing a part bdev. */
struct spdk_bdev_part_construct_opts {
/* Size of this structure in bytes */
uint64_t opts_size;
/** UUID of the bdev */
struct spdk_uuid uuid;
};
SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_part_construct_opts) == 24, "Incorrect size");
/**
* Initialize options that will be passed to spdk_bdev_part_construct_ext().
*
* \param opts Options structure to initialize
* \param size Size of opts structure.
*/
void spdk_bdev_part_construct_opts_init(struct spdk_bdev_part_construct_opts *opts, uint64_t size);
/**
* Create a logical spdk_bdev_part on top of a base.
*
@ -1477,6 +1498,25 @@ int spdk_bdev_part_construct(struct spdk_bdev_part *part, struct spdk_bdev_part_
char *name, uint64_t offset_blocks, uint64_t num_blocks,
char *product_name);
/**
* Create a logical spdk_bdev_part on top of a base with a non-NULL bdev UUID
*
* \param part The part object allocated by the user.
* \param base The base from which to create the part.
* \param name The name of the new spdk_bdev_part.
* \param offset_blocks The offset into the base bdev at which this part begins.
* \param num_blocks The number of blocks that this part will span.
* \param product_name Unique name for this type of block device.
* \param opts Additional options.
*
* \return 0 on success.
* \return -1 if the base's underlying bdev cannot be claimed by the current module.
*/
int spdk_bdev_part_construct_ext(struct spdk_bdev_part *part, struct spdk_bdev_part_base *base,
char *name, uint64_t offset_blocks, uint64_t num_blocks,
char *product_name,
const struct spdk_bdev_part_construct_opts *opts);
/**
* Forwards I/O from an spdk_bdev_part to the underlying base bdev.
*

View File

@ -244,6 +244,8 @@ struct spdk_bs_dev {
uint64_t dst_lba, uint64_t src_lba, uint64_t lba_count,
struct spdk_bs_dev_cb_args *cb_args);
bool (*is_degraded)(struct spdk_bs_dev *dev);
uint64_t blockcnt;
uint32_t blocklen; /* In bytes */
};
@ -687,11 +689,13 @@ bool spdk_blob_is_read_only(struct spdk_blob *blob);
bool spdk_blob_is_snapshot(struct spdk_blob *blob);
/**
* Check if blob is a clone.
* Check if blob is a clone of a blob.
*
* Clones of external snapshots will return false. See spdk_blob_is_esnap_clone.
*
* \param blob Blob.
*
* \return true if blob is a clone.
* \return true if blob is a clone of a blob.
*/
bool spdk_blob_is_clone(struct spdk_blob *blob);
@ -705,7 +709,7 @@ bool spdk_blob_is_clone(struct spdk_blob *blob);
bool spdk_blob_is_thin_provisioned(struct spdk_blob *blob);
/**
* Check if blob is a clone of an external bdev.
* Check if blob is a clone of an external snapshot.
*
* \param blob Blob.
*
@ -1134,6 +1138,24 @@ void spdk_bs_set_bstype(struct spdk_blob_store *bs, struct spdk_bs_type bstype);
void spdk_blob_set_esnap_bs_dev(struct spdk_blob *blob, struct spdk_bs_dev *back_bs_dev,
spdk_blob_op_complete cb_fn, void *cb_arg);
/**
* Get the existing external snapshot device
*
* \param blob A blob that is an esnap clone
*
* \return NULL if the blob is not an esnap clone, else the current external snapshot device.
*/
struct spdk_bs_dev *spdk_blob_get_esnap_bs_dev(const struct spdk_blob *blob);
/**
* Determine if the blob is degraded. A degraded blob cannot perform IO.
*
* \param blob A blob
*
* \return true if the blob or any snapshots upon which it depends are degraded, else false.
*/
bool spdk_blob_is_degraded(const struct spdk_blob *blob);
#ifdef __cplusplus
}
#endif

View File

@ -67,7 +67,7 @@ int spdk_bdev_create_bs_dev(const char *bdev_name, bool write,
* Claim the bdev module for the given blobstore.
*
* If bs_dev was opened read-write using spdk_bdev_create_bs_dev_ext(), a read-write-once claim is
* taken. If bs_dev was opened read-only using spdk_bdev_create_bs_dev_ro(), a read-only-many claim
* taken. If bs_dev was opened read-only using spdk_bdev_create_bs_dev(), a read-only-many claim
* is taken.
*
* \param bs_dev Blobstore block device.

View File

@ -47,6 +47,7 @@ extern "C" {
struct spdk_env_opts {
const char *name;
const char *core_mask;
const char *lcore_map;
int shm_id;
int mem_channel;
int main_core;

View File

@ -66,7 +66,6 @@ struct spdk_app_opts {
/* Hole at bytes 17-23. */
uint8_t reserved17[7];
const char *rpc_addr; /* Can be UNIX domain socket path or IP address + TCP port */
const char *reactor_mask;
const char *tpoint_group_mask;
@ -163,8 +162,14 @@ struct spdk_app_opts {
* The vf_token is an UUID that shared between SR-IOV PF and VF.
*/
const char *vf_token;
/**
* Used to store the lcore to CPU mapping to pass to DPDK
*/
const char *lcore_map; /* lcore mapping */
} __attribute__((packed));
SPDK_STATIC_ASSERT(sizeof(struct spdk_app_opts) == 216, "Incorrect size");
SPDK_STATIC_ASSERT(sizeof(struct spdk_app_opts) == 224, "Incorrect size");
/**
* Initialize the default value of opts

View File

@ -13,6 +13,7 @@
#include "spdk/stdinc.h"
#include "spdk/blob.h"
#include "spdk/uuid.h"
#ifdef __cplusplus
extern "C" {
@ -115,6 +116,15 @@ typedef void (*spdk_lvol_op_with_handle_complete)(void *cb_arg, struct spdk_lvol
*/
typedef void (*spdk_lvol_op_complete)(void *cb_arg, int lvolerrno);
/**
* Callback definition for spdk_lvol_iter_immediate_clones.
*
* \param lvol An iterated lvol.
* \param cb_arg Opaque context passed to spdk_lvol_iter_immediate_clones().
* \return 0 to continue iterating, any other value to stop iterating.
*/
typedef int (*spdk_lvol_iter_cb)(void *cb_arg, struct spdk_lvol *lvol);
/**
* Initialize lvolstore on given bs_bdev.
*
@ -214,11 +224,15 @@ void spdk_lvol_create_clone(struct spdk_lvol *lvol, const char *clone_name,
*
* \param esnap_id The identifier that will be passed to the spdk_bs_esnap_dev_create callback.
* \param id_len The length of esnap_id, in bytes.
* \param size_bytes The size of the external snapshot device, in bytes.
* \param size_bytes The size of the external snapshot device, in bytes. This must be an integer
* multiple of the lvolstore's cluster size. See \c cluster_sz in \struct spdk_lvs_opts.
* \param lvs Handle to lvolstore.
* \param clone_name Name of created clone.
* \param cb_fn Completion callback.
* \param cb_arg Completion callback custom arguments.
* \return 0 if parameters pass verification checks and the esnap creation is started, in which case
* the \c cb_fn will be used to report the completion status. If an error is encountered, a negative
* errno will be returned and \c cb_fn will not be called.
*/
int spdk_lvol_create_esnap_clone(const void *esnap_id, uint32_t id_len, uint64_t size_bytes,
struct spdk_lvol_store *lvs, const char *clone_name,
@ -260,6 +274,35 @@ void spdk_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void
*/
void spdk_lvol_close(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg);
/**
* Iterate clones of an lvol.
*
* Iteration stops if cb_fn(cb_arg, clone_lvol) returns non-zero.
*
* \param lvol Handle to lvol.
* \param cb_fn Function to call for each lvol that clones this lvol.
* \param cb_arg Context to pass with cb_fn.
* \return -ENOMEM if memory allocation failed, non-zero return from cb_fn(), or 0.
*/
int spdk_lvol_iter_immediate_clones(struct spdk_lvol *lvol, spdk_lvol_iter_cb cb_fn, void *cb_arg);
/**
* Get the lvol that has a particular UUID.
*
* \param uuid The lvol's UUID.
* \return A pointer to the requested lvol on success, else NULL.
*/
struct spdk_lvol *spdk_lvol_get_by_uuid(const struct spdk_uuid *uuid);
/**
* Get the lvol that has the specified name in the specified lvolstore.
*
* \param lvs_name Name of the lvolstore.
* \param lvol_name Name of the lvol.
* \return A pointer to the requested lvol on success, else NULL.
*/
struct spdk_lvol *spdk_lvol_get_by_names(const char *lvs_name, const char *lvol_name);
/**
* Get I/O channel of bdev associated with specified lvol.
*
@ -330,6 +373,14 @@ void spdk_lvol_inflate(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void
*/
void spdk_lvol_decouple_parent(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg);
/**
* Determine if an lvol is degraded. A degraded lvol cannot perform IO.
*
* \param lvol Handle to lvol
* \return true if the lvol has no open blob or the lvol's blob is degraded, else false.
*/
bool spdk_lvol_is_degraded(const struct spdk_lvol *lvol);
#ifdef __cplusplus
}
#endif

View File

@ -1095,18 +1095,6 @@ void spdk_nvme_ctrlr_set_remove_cb(struct spdk_nvme_ctrlr *ctrlr,
*/
int spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr);
/**
* Inform the driver that the application is preparing to reset the specified NVMe controller.
* (Deprecated, please use spdk_nvme_ctrlr_disconnect() before freeing I/O qpairs instead.)
*
* This function allows the driver to make decisions knowing that a reset is about to happen.
* For example, the pcie transport in this case could skip sending DELETE_CQ and DELETE_SQ
* commands to the controller if an io qpair is freed after this function is called.
*
* \param ctrlr Opaque handle to NVMe controller.
*/
void spdk_nvme_ctrlr_prepare_for_reset(struct spdk_nvme_ctrlr *ctrlr);
/**
* Disconnect the given NVMe controller.
*
@ -1602,9 +1590,7 @@ struct spdk_nvme_io_qpair_opts {
/**
* This flag if set to true enables the creation of submission and completion queue
* asynchronously. This mode is currently supported at PCIe layer and tracks the
* qpair creation with state machine and returns to the user.Default mode is set to
* false to create io qpair synchronously.
* asynchronously. Default mode is set to false to create io qpair synchronously.
*/
bool async_mode;
@ -1874,6 +1860,16 @@ int32_t spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair,
*/
spdk_nvme_qp_failure_reason spdk_nvme_qpair_get_failure_reason(struct spdk_nvme_qpair *qpair);
/**
* Control if DNR is set or not for aborted commands.
*
* The default value is false.
*
* \param qpair The qpair to set.
* \param dnr Set the DNR bit to 1 if true or 0 if false for aborted commands.
*/
void spdk_nvme_qpair_set_abort_dnr(struct spdk_nvme_qpair *qpair, bool dnr);
/**
* Send the given admin command to the NVMe controller.
*
@ -2685,6 +2681,24 @@ int spdk_nvme_poll_group_destroy(struct spdk_nvme_poll_group *group);
int64_t spdk_nvme_poll_group_process_completions(struct spdk_nvme_poll_group *group,
uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb);
/**
* Check if all qpairs in the poll group are connected.
*
* This function allows the caller to check if all qpairs in a poll group are
* connected. This API is generally only suitable during application startup,
* to check when a large number of async connections have completed.
*
* It is useful for applications like benchmarking tools that create
* a large number of qpairs and need to ensure they are all fully connected
* before proceeding with I/O.
*
* \param group The group on which to poll connecting qpairs.
*
* \return 0 if all qpairs are in the CONNECTED state, -EIO if any qpairs failed to connect, -EAGAIN if
* any qpairs are still trying to connect.
*/
int spdk_nvme_poll_group_all_connected(struct spdk_nvme_poll_group *group);
/**
* Retrieve the user context for this specific poll group.
*

View File

@ -1,7 +1,7 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (C) 2016 Intel Corporation. All rights reserved.
* Copyright (c) 2018-2021 Mellanox Technologies LTD. All rights reserved.
* Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021, 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*/
/** \file
@ -291,11 +291,12 @@ typedef void (*nvmf_qpair_disconnect_cb)(void *ctx);
* Disconnect an NVMe-oF qpair
*
* \param qpair The NVMe-oF qpair to disconnect.
* \param cb_fn The function to call upon completion of the disconnect.
* \param ctx The context to pass to the callback function.
* \param cb_fn Deprecated, will be removed in v23.09. The function to call upon completion of the disconnect.
* \param ctx Deprecated, will be removed in v23.09. The context to pass to the callback function.
*
* \return 0 upon success.
* \return -ENOMEM if the function specific context could not be allocated.
* \return -EINPROGRESS if the qpair is already in the process of disconnect.
*/
int spdk_nvmf_qpair_disconnect(struct spdk_nvmf_qpair *qpair, nvmf_qpair_disconnect_cb cb_fn,
void *ctx);
@ -303,6 +304,11 @@ int spdk_nvmf_qpair_disconnect(struct spdk_nvmf_qpair *qpair, nvmf_qpair_disconn
/**
* Get the peer's transport ID for this queue pair.
*
* This function will first zero the trid structure, and then fill
* in the relevant trid fields to identify the listener. The relevant
* fields will depend on the transport, but the subnqn will never
* be a relevant field for purposes of this function.
*
* \param qpair The NVMe-oF qpair
* \param trid Output parameter that will contain the transport id.
*
@ -315,6 +321,11 @@ int spdk_nvmf_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
/**
* Get the local transport ID for this queue pair.
*
* This function will first zero the trid structure, and then fill
* in the relevant trid fields to identify the listener. The relevant
* fields will depend on the transport, but the subnqn will never
* be a relevant field for purposes of this function.
*
* \param qpair The NVMe-oF qpair
* \param trid Output parameter that will contain the transport id.
*
@ -327,6 +338,11 @@ int spdk_nvmf_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
/**
* Get the associated listener transport ID for this queue pair.
*
* This function will first zero the trid structure, and then fill
* in the relevant trid fields to identify the listener. The relevant
* fields will depend on the transport, but the subnqn will never
* be a relevant field for purposes of this function.
*
* \param qpair The NVMe-oF qpair
* \param trid Output parameter that will contain the transport id.
*
@ -1165,8 +1181,12 @@ spdk_nvmf_transport_stop_listen(struct spdk_nvmf_transport *transport,
* qpairs that are connected to the specified listener. Because
* this function disconnects the qpairs, it has to be asynchronous.
*
* The subsystem is matched using the subsystem parameter, not the
* subnqn field in the trid.
*
* \param transport The transport associated with the listen address.
* \param trid The address to stop listening at.
* \param trid The address to stop listening at. subnqn must be an empty
* string.
* \param subsystem The subsystem to match for qpairs with the specified
* trid. If NULL, it will disconnect all qpairs with the
* specified trid.

View File

@ -126,7 +126,10 @@ struct spdk_nvmf_qpair {
bool connect_received;
bool disconnect_started;
union {
struct spdk_nvmf_request *first_fused_req;
struct spdk_nvmf_request *connect_req;
};
TAILQ_HEAD(, spdk_nvmf_request) outstanding;
TAILQ_ENTRY(spdk_nvmf_qpair) link;

View File

@ -23,9 +23,6 @@ struct spdk_pipe;
* Construct a pipe around the given memory buffer. The pipe treats the memory
* buffer as a circular ring of bytes.
*
* The available size for writing will be one less byte than provided. A single
* byte must be reserved to distinguish queue full from queue empty conditions.
*
* \param buf The data buffer that backs this pipe.
* \param sz The size of the data buffer.
*

View File

@ -1,6 +1,7 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (C) 2017 Intel Corporation.
* All rights reserved.
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*/
/** \file
@ -39,6 +40,7 @@ extern "C" {
#include <fcntl.h>
#include <glob.h>
#include <ifaddrs.h>
#include <libgen.h>
#include <netdb.h>
#include <poll.h>
#include <pthread.h>

View File

@ -904,6 +904,9 @@ bool spdk_interrupt_mode_is_enabled(void);
struct spdk_spinlock {
pthread_spinlock_t spinlock;
struct spdk_thread *thread;
struct spdk_spinlock_internal *internal;
bool initialized;
bool destroyed;
};
/**
@ -1103,39 +1106,8 @@ void spdk_iobuf_entry_abort(struct spdk_iobuf_channel *ch, struct spdk_iobuf_ent
*
* \return pointer to a buffer or NULL if no buffers are currently available.
*/
static inline void *
spdk_iobuf_get(struct spdk_iobuf_channel *ch, uint64_t len,
struct spdk_iobuf_entry *entry, spdk_iobuf_get_cb cb_fn)
{
struct spdk_iobuf_pool *pool;
void *buf;
assert(spdk_io_channel_get_thread(ch->parent) == spdk_get_thread());
if (len <= ch->small.bufsize) {
pool = &ch->small;
} else {
assert(len <= ch->large.bufsize);
pool = &ch->large;
}
buf = (void *)STAILQ_FIRST(&pool->cache);
if (buf) {
STAILQ_REMOVE_HEAD(&pool->cache, stailq);
assert(pool->cache_count > 0);
pool->cache_count--;
} else {
buf = spdk_mempool_get(pool->pool);
if (!buf) {
STAILQ_INSERT_TAIL(pool->queue, entry, stailq);
entry->module = ch->module;
entry->cb_fn = cb_fn;
return NULL;
}
}
return (char *)buf;
}
void *spdk_iobuf_get(struct spdk_iobuf_channel *ch, uint64_t len, struct spdk_iobuf_entry *entry,
spdk_iobuf_get_cb cb_fn);
/**
* Release a buffer back to the iobuf pool. If there are outstanding requests waiting for a buffer,
@ -1145,32 +1117,7 @@ spdk_iobuf_get(struct spdk_iobuf_channel *ch, uint64_t len,
* \param buf Buffer to release
* \param len Length of the buffer (must be the exact same value as specified in `spdk_iobuf_get()`).
*/
static inline void
spdk_iobuf_put(struct spdk_iobuf_channel *ch, void *buf, uint64_t len)
{
struct spdk_iobuf_entry *entry;
struct spdk_iobuf_pool *pool;
assert(spdk_io_channel_get_thread(ch->parent) == spdk_get_thread());
if (len <= ch->small.bufsize) {
pool = &ch->small;
} else {
pool = &ch->large;
}
if (STAILQ_EMPTY(pool->queue)) {
if (pool->cache_count < pool->cache_size) {
STAILQ_INSERT_HEAD(&pool->cache, (struct spdk_iobuf_buffer *)buf, stailq);
pool->cache_count++;
} else {
spdk_mempool_put(pool->pool, buf);
}
} else {
entry = STAILQ_FIRST(pool->queue);
STAILQ_REMOVE_HEAD(pool->queue, stailq);
entry->cb_fn(entry, buf);
}
}
void spdk_iobuf_put(struct spdk_iobuf_channel *ch, void *buf, uint64_t len);
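The bodies removed above (now compiled out of line) implement a cache-then-pool lookup: choose the small or large pool by length, try the per-channel cache first, fall back to the shared pool, and otherwise queue the waiter. A simplified, self-contained sketch of that pattern (fixed-size freelists and all names are illustrative, not SPDK's):

```c
#include <assert.h>
#include <stddef.h>

#define CACHE_SLOTS 4

struct pool {
	size_t bufsize;
	void *cache[CACHE_SLOTS];   /* per-channel cache */
	int cache_count;
	void *shared[CACHE_SLOTS];  /* shared pool fallback */
	int shared_count;
};

struct chan { struct pool small, large; };

static void *
buf_get(struct chan *ch, size_t len)
{
	struct pool *p = (len <= ch->small.bufsize) ? &ch->small : &ch->large;

	if (p->cache_count > 0) {
		return p->cache[--p->cache_count];
	}
	if (p->shared_count > 0) {
		return p->shared[--p->shared_count];
	}
	return NULL; /* caller would queue an spdk_iobuf_entry here */
}

static void
buf_put(struct chan *ch, void *buf, size_t len)
{
	struct pool *p = (len <= ch->small.bufsize) ? &ch->small : &ch->large;

	if (p->cache_count < CACHE_SLOTS) {
		p->cache[p->cache_count++] = buf;  /* refill the cache first */
	} else {
		p->shared[p->shared_count++] = buf;
	}
}

/* Exercise the put/get round trip on both pools. */
static int
demo(void)
{
	static char small_buf, large_buf;
	struct chan ch = {
		.small = { .bufsize = 64 },
		.large = { .bufsize = 4096 },
	};

	buf_put(&ch, &small_buf, 32);     /* lands in the small cache */
	buf_put(&ch, &large_buf, 1024);   /* lands in the large cache */

	return buf_get(&ch, 32) == &small_buf &&
	       buf_get(&ch, 1024) == &large_buf &&
	       buf_get(&ch, 32) == NULL;  /* empty: caller would queue */
}
```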
#ifdef __cplusplus
}

View File

@ -10,6 +10,7 @@
#include "spdk/blob.h"
#include "spdk/lvol.h"
#include "spdk/queue.h"
#include "spdk/tree.h"
#include "spdk/uuid.h"
/* Default size of blobstore cluster */
@ -38,6 +39,8 @@ struct spdk_lvol_req {
spdk_lvol_op_complete cb_fn;
void *cb_arg;
struct spdk_lvol *lvol;
/* Only set while lvol is being deleted and has a clone. */
struct spdk_lvol *clone_lvol;
size_t sz;
struct spdk_io_channel *channel;
char name[SPDK_LVOL_NAME_MAX];
@ -62,8 +65,11 @@ struct spdk_lvol_with_handle_req {
spdk_lvol_op_with_handle_complete cb_fn;
void *cb_arg;
struct spdk_lvol *lvol;
struct spdk_lvol *origlvol;
};
struct spdk_lvs_degraded_lvol_set;
struct spdk_lvol_store {
struct spdk_bs_dev *bs_dev;
struct spdk_blob_store *blobstore;
@ -81,6 +87,8 @@ struct spdk_lvol_store {
char name[SPDK_LVS_NAME_MAX];
char new_name[SPDK_LVS_NAME_MAX];
spdk_bs_esnap_dev_create esnap_bs_dev_create;
RB_HEAD(degraded_lvol_sets_tree, spdk_lvs_degraded_lvol_set) degraded_lvol_sets_tree;
struct spdk_thread *thread;
};
struct spdk_lvol {
@ -96,6 +104,8 @@ struct spdk_lvol {
bool action_in_progress;
enum blob_clear_method clear_method;
TAILQ_ENTRY(spdk_lvol) link;
struct spdk_lvs_degraded_lvol_set *degraded_set;
TAILQ_ENTRY(spdk_lvol) degraded_link;
};
struct lvol_store_bdev *vbdev_lvol_store_first(void);
@ -107,4 +117,10 @@ void spdk_lvol_resize(struct spdk_lvol *lvol, uint64_t sz, spdk_lvol_op_complete
void spdk_lvol_set_read_only(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn,
void *cb_arg);
int spdk_lvs_esnap_missing_add(struct spdk_lvol_store *lvs, struct spdk_lvol *lvol,
const void *esnap_id, uint32_t id_len);
void spdk_lvs_esnap_missing_remove(struct spdk_lvol *lvol);
bool spdk_lvs_notify_hotplug(const void *esnap_id, uint32_t id_len,
spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg);
#endif /* SPDK_INTERNAL_LVOLSTORE_H */

View File

@ -120,6 +120,9 @@ enum nvme_tcp_pdu_recv_state {
/* Active tqpair waiting for payload */
NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD,
/* Active tqpair waiting for all outstanding PDUs to complete */
NVME_TCP_PDU_RECV_STATE_QUIESCING,
/* Active tqpair does not wait for payload */
NVME_TCP_PDU_RECV_STATE_ERROR,
};

View File

@ -17,11 +17,21 @@
#include <sys/sdt.h>
#define SPDK_DTRACE_PROBE(name) DTRACE_PROBE1(spdk,name,spdk_get_ticks())
#define SPDK_DTRACE_PROBE1(name,a1) DTRACE_PROBE2(spdk,name,spdk_get_ticks(),a1)
#define SPDK_DTRACE_PROBE2(name,a1,a2) DTRACE_PROBE3(spdk,name,spdk_get_ticks(),a1,a2)
#define SPDK_DTRACE_PROBE3(name,a1,a2,a3) DTRACE_PROBE4(spdk,name,spdk_get_ticks(),a1,a2,a3)
#define SPDK_DTRACE_PROBE4(name,a1,a2,a3,a4) DTRACE_PROBE5(spdk,name,spdk_get_ticks(),a1,a2,a3,a4)
#define SPDK_DTRACE_PROBE(name) DTRACE_PROBE1(spdk,name,0)
#define SPDK_DTRACE_PROBE1(name,a1) DTRACE_PROBE2(spdk,name,0,a1)
#define SPDK_DTRACE_PROBE2(name,a1,a2) DTRACE_PROBE3(spdk,name,0,a1,a2)
#define SPDK_DTRACE_PROBE3(name,a1,a2,a3) DTRACE_PROBE4(spdk,name,0,a1,a2,a3)
#define SPDK_DTRACE_PROBE4(name,a1,a2,a3,a4) DTRACE_PROBE5(spdk,name,0,a1,a2,a3,a4)
/* These variants implicitly add a TSC argument at the front of the caller's arguments.
* These are useful for scripts that require an exact timestamp for correlating
* USDT events with those captured by the lower-overhead SPDK tracing framework.
*/
#define SPDK_DTRACE_PROBE_TICKS(name) DTRACE_PROBE1(spdk,name,spdk_get_ticks())
#define SPDK_DTRACE_PROBE1_TICKS(name,a1) DTRACE_PROBE2(spdk,name,spdk_get_ticks(),a1)
#define SPDK_DTRACE_PROBE2_TICKS(name,a1,a2) DTRACE_PROBE3(spdk,name,spdk_get_ticks(),a1,a2)
#define SPDK_DTRACE_PROBE3_TICKS(name,a1,a2,a3) DTRACE_PROBE4(spdk,name,spdk_get_ticks(),a1,a2,a3)
#define SPDK_DTRACE_PROBE4_TICKS(name,a1,a2,a3,a4) DTRACE_PROBE5(spdk,name,spdk_get_ticks(),a1,a2,a3,a4)
#else
@ -31,6 +41,12 @@
#define SPDK_DTRACE_PROBE3(...)
#define SPDK_DTRACE_PROBE4(...)
#define SPDK_DTRACE_PROBE_TICKS(...)
#define SPDK_DTRACE_PROBE1_TICKS(...)
#define SPDK_DTRACE_PROBE2_TICKS(...)
#define SPDK_DTRACE_PROBE3_TICKS(...)
#define SPDK_DTRACE_PROBE4_TICKS(...)
#endif
#endif /* SPDK_INTERNAL_USDT_H */
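The diff above layers one probe macro on top of another so that an extra first argument (either a timestamp or a literal `0`) is injected ahead of whatever the caller passes. A minimal sketch of that layering, using hypothetical stand-ins for the `DTRACE_PROBEn()` macros so it can run without SystemTap headers:

```c
#include <stdint.h>

/* Hypothetical stand-in for DTRACE_PROBE2(): records its arguments in globals
 * instead of emitting a real USDT probe. */
static uint64_t g_probe_args[2];

#define FAKE_PROBE2(provider, name, a1, a2) \
	do { g_probe_args[0] = (a1); g_probe_args[1] = (a2); } while (0)

static uint64_t fake_get_ticks(void) { return 42; }

/* Mirrors the SPDK pattern above: the plain variant passes 0 where the
 * timestamp used to go, while the _TICKS variant prepends the tick count. */
#define MY_PROBE1(name, a1)       FAKE_PROBE2(spdk, name, 0, (a1))
#define MY_PROBE1_TICKS(name, a1) FAKE_PROBE2(spdk, name, fake_get_ticks(), (a1))
```

Because the extra argument is prepended by the wrapper macro, callers never see it; only the probe consumer decides whether slot 0 holds a timestamp or a placeholder.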


@ -6,7 +6,7 @@
#include "spdk/stdinc.h"
#include "spdk_internal/accel_module.h"
#include "spdk/accel_module.h"
#include "accel_internal.h"
@ -62,6 +62,15 @@ static struct accel_module g_modules_opc[ACCEL_OPC_LAST] = {};
static char *g_modules_opc_override[ACCEL_OPC_LAST] = {};
TAILQ_HEAD(, spdk_accel_driver) g_accel_drivers = TAILQ_HEAD_INITIALIZER(g_accel_drivers);
static struct spdk_accel_driver *g_accel_driver;
static struct spdk_accel_opts g_opts = {
.small_cache_size = ACCEL_SMALL_CACHE_SIZE,
.large_cache_size = ACCEL_LARGE_CACHE_SIZE,
.task_count = MAX_TASKS_PER_CHANNEL,
.sequence_count = MAX_TASKS_PER_CHANNEL,
.buf_count = MAX_TASKS_PER_CHANNEL,
};
static struct accel_stats g_stats;
static struct spdk_spinlock g_stats_lock;
static const char *g_opcode_strings[ACCEL_OPC_LAST] = {
"copy", "fill", "dualcast", "compare", "crc32c", "copy_crc32c",
@ -134,6 +143,7 @@ struct accel_io_channel {
TAILQ_HEAD(, spdk_accel_sequence) seq_pool;
TAILQ_HEAD(, accel_buffer) buf_pool;
struct spdk_iobuf_channel iobuf;
struct accel_stats stats;
};
TAILQ_HEAD(accel_sequence_tasks, spdk_accel_task);
@ -151,6 +161,14 @@ struct spdk_accel_sequence {
TAILQ_ENTRY(spdk_accel_sequence) link;
};
#define accel_update_stats(ch, event, v) \
do { \
(ch)->stats.event += (v); \
} while (0)
#define accel_update_task_stats(ch, task, event, v) \
accel_update_stats(ch, operations[(task)->op_code].event, v)
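The two macros above rely on token pasting: the `event` argument is a member-access expression, so one macro covers every counter, and the per-opcode variant simply prefixes it with `operations[op_code].`. A self-contained model with hypothetical names:

```c
#include <stdint.h>

enum my_opcode { MY_OPC_COPY, MY_OPC_FILL, MY_OPC_LAST };

struct my_op_stats { uint64_t executed, failed, num_bytes; };

struct my_channel {
	struct { struct my_op_stats operations[MY_OPC_LAST]; } stats;
};

/* `event` is spliced into the member access, so executed/failed/num_bytes
 * (and nested paths like operations[i].executed) all go through one macro. */
#define my_update_stats(ch, event, v) \
	do { (ch)->stats.event += (v); } while (0)

#define my_update_op_stats(ch, opc, event, v) \
	my_update_stats(ch, operations[(opc)].event, v)
```

The `do { } while (0)` wrapper keeps the macro usable as a single statement, e.g. inside an unbraced `if`.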
static inline void
accel_sequence_set_state(struct spdk_accel_sequence *seq, enum accel_sequence_state state)
{
@ -253,6 +271,12 @@ spdk_accel_task_complete(struct spdk_accel_task *accel_task, int status)
*/
TAILQ_INSERT_HEAD(&accel_ch->task_pool, accel_task, link);
accel_update_task_stats(accel_ch, accel_task, executed, 1);
accel_update_task_stats(accel_ch, accel_task, num_bytes, accel_task->nbytes);
if (spdk_unlikely(status != 0)) {
accel_update_task_stats(accel_ch, accel_task, failed, 1);
}
cb_fn(cb_arg, status);
}
@ -279,6 +303,34 @@ _get_task(struct accel_io_channel *accel_ch, spdk_accel_completion_cb cb_fn, voi
return accel_task;
}
static inline int
accel_submit_task(struct accel_io_channel *accel_ch, struct spdk_accel_task *task)
{
struct spdk_io_channel *module_ch = accel_ch->module_ch[task->op_code];
struct spdk_accel_module_if *module = g_modules_opc[task->op_code].module;
int rc;
rc = module->submit_tasks(module_ch, task);
if (spdk_unlikely(rc != 0)) {
accel_update_task_stats(accel_ch, task, failed, 1);
}
return rc;
}
static inline uint64_t
accel_get_iovlen(struct iovec *iovs, uint32_t iovcnt)
{
uint64_t result = 0;
uint32_t i;
for (i = 0; i < iovcnt; ++i) {
result += iovs[i].iov_len;
}
return result;
}
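This helper is what lets the patch set `nbytes` for scatter-gather submissions (e.g. `crc32cv`, `decompress`): it just sums `iov_len` across the vector. A standalone copy of the logic, for illustration:

```c
#include <stdint.h>
#include <sys/uio.h>

/* Standalone copy of the accel_get_iovlen() logic: total payload size of an
 * I/O vector is the sum of its elements' lengths. */
static uint64_t
my_iovlen(const struct iovec *iovs, uint32_t iovcnt)
{
	uint64_t result = 0;
	uint32_t i;

	for (i = 0; i < iovcnt; ++i) {
		result += iovs[i].iov_len;
	}

	return result;
}
```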
/* Accel framework public API for copy function */
int
spdk_accel_submit_copy(struct spdk_io_channel *ch, void *dst, void *src,
@ -286,8 +338,6 @@ spdk_accel_submit_copy(struct spdk_io_channel *ch, void *dst, void *src,
{
struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
struct spdk_accel_task *accel_task;
struct spdk_accel_module_if *module = g_modules_opc[ACCEL_OPC_COPY].module;
struct spdk_io_channel *module_ch = accel_ch->module_ch[ACCEL_OPC_COPY];
accel_task = _get_task(accel_ch, cb_fn, cb_arg);
if (accel_task == NULL) {
@ -302,13 +352,14 @@ spdk_accel_submit_copy(struct spdk_io_channel *ch, void *dst, void *src,
accel_task->s.iovs[0].iov_base = src;
accel_task->s.iovs[0].iov_len = nbytes;
accel_task->s.iovcnt = 1;
accel_task->nbytes = nbytes;
accel_task->op_code = ACCEL_OPC_COPY;
accel_task->flags = flags;
accel_task->src_domain = NULL;
accel_task->dst_domain = NULL;
accel_task->step_cb_fn = NULL;
return module->submit_tasks(module_ch, accel_task);
return accel_submit_task(accel_ch, accel_task);
}
/* Accel framework public API for dual cast copy function */
@ -319,8 +370,6 @@ spdk_accel_submit_dualcast(struct spdk_io_channel *ch, void *dst1,
{
struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
struct spdk_accel_task *accel_task;
struct spdk_accel_module_if *module = g_modules_opc[ACCEL_OPC_DUALCAST].module;
struct spdk_io_channel *module_ch = accel_ch->module_ch[ACCEL_OPC_DUALCAST];
if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) {
SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n");
@ -344,13 +393,14 @@ spdk_accel_submit_dualcast(struct spdk_io_channel *ch, void *dst1,
accel_task->s.iovs[0].iov_base = src;
accel_task->s.iovs[0].iov_len = nbytes;
accel_task->s.iovcnt = 1;
accel_task->nbytes = nbytes;
accel_task->flags = flags;
accel_task->op_code = ACCEL_OPC_DUALCAST;
accel_task->src_domain = NULL;
accel_task->dst_domain = NULL;
accel_task->step_cb_fn = NULL;
return module->submit_tasks(module_ch, accel_task);
return accel_submit_task(accel_ch, accel_task);
}
/* Accel framework public API for compare function */
@ -361,8 +411,6 @@ spdk_accel_submit_compare(struct spdk_io_channel *ch, void *src1,
{
struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
struct spdk_accel_task *accel_task;
struct spdk_accel_module_if *module = g_modules_opc[ACCEL_OPC_COMPARE].module;
struct spdk_io_channel *module_ch = accel_ch->module_ch[ACCEL_OPC_COMPARE];
accel_task = _get_task(accel_ch, cb_fn, cb_arg);
if (accel_task == NULL) {
@ -377,12 +425,13 @@ spdk_accel_submit_compare(struct spdk_io_channel *ch, void *src1,
accel_task->s2.iovs[0].iov_base = src2;
accel_task->s2.iovs[0].iov_len = nbytes;
accel_task->s2.iovcnt = 1;
accel_task->nbytes = nbytes;
accel_task->op_code = ACCEL_OPC_COMPARE;
accel_task->src_domain = NULL;
accel_task->dst_domain = NULL;
accel_task->step_cb_fn = NULL;
return module->submit_tasks(module_ch, accel_task);
return accel_submit_task(accel_ch, accel_task);
}
/* Accel framework public API for fill function */
@ -393,8 +442,6 @@ spdk_accel_submit_fill(struct spdk_io_channel *ch, void *dst,
{
struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
struct spdk_accel_task *accel_task;
struct spdk_accel_module_if *module = g_modules_opc[ACCEL_OPC_FILL].module;
struct spdk_io_channel *module_ch = accel_ch->module_ch[ACCEL_OPC_FILL];
accel_task = _get_task(accel_ch, cb_fn, cb_arg);
if (accel_task == NULL) {
@ -405,6 +452,7 @@ spdk_accel_submit_fill(struct spdk_io_channel *ch, void *dst,
accel_task->d.iovs[0].iov_base = dst;
accel_task->d.iovs[0].iov_len = nbytes;
accel_task->d.iovcnt = 1;
accel_task->nbytes = nbytes;
memset(&accel_task->fill_pattern, fill, sizeof(uint64_t));
accel_task->flags = flags;
accel_task->op_code = ACCEL_OPC_FILL;
@ -412,7 +460,7 @@ spdk_accel_submit_fill(struct spdk_io_channel *ch, void *dst,
accel_task->dst_domain = NULL;
accel_task->step_cb_fn = NULL;
return module->submit_tasks(module_ch, accel_task);
return accel_submit_task(accel_ch, accel_task);
}
/* Accel framework public API for CRC-32C function */
@ -423,8 +471,6 @@ spdk_accel_submit_crc32c(struct spdk_io_channel *ch, uint32_t *crc_dst,
{
struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
struct spdk_accel_task *accel_task;
struct spdk_accel_module_if *module = g_modules_opc[ACCEL_OPC_CRC32C].module;
struct spdk_io_channel *module_ch = accel_ch->module_ch[ACCEL_OPC_CRC32C];
accel_task = _get_task(accel_ch, cb_fn, cb_arg);
if (accel_task == NULL) {
@ -435,6 +481,7 @@ spdk_accel_submit_crc32c(struct spdk_io_channel *ch, uint32_t *crc_dst,
accel_task->s.iovs[0].iov_base = src;
accel_task->s.iovs[0].iov_len = nbytes;
accel_task->s.iovcnt = 1;
accel_task->nbytes = nbytes;
accel_task->crc_dst = crc_dst;
accel_task->seed = seed;
accel_task->op_code = ACCEL_OPC_CRC32C;
@ -442,7 +489,7 @@ spdk_accel_submit_crc32c(struct spdk_io_channel *ch, uint32_t *crc_dst,
accel_task->dst_domain = NULL;
accel_task->step_cb_fn = NULL;
return module->submit_tasks(module_ch, accel_task);
return accel_submit_task(accel_ch, accel_task);
}
/* Accel framework public API for chained CRC-32C function */
@ -453,8 +500,6 @@ spdk_accel_submit_crc32cv(struct spdk_io_channel *ch, uint32_t *crc_dst,
{
struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
struct spdk_accel_task *accel_task;
struct spdk_accel_module_if *module = g_modules_opc[ACCEL_OPC_CRC32C].module;
struct spdk_io_channel *module_ch = accel_ch->module_ch[ACCEL_OPC_CRC32C];
if (iov == NULL) {
SPDK_ERRLOG("iov should not be NULL");
@ -475,6 +520,7 @@ spdk_accel_submit_crc32cv(struct spdk_io_channel *ch, uint32_t *crc_dst,
accel_task->s.iovs = iov;
accel_task->s.iovcnt = iov_cnt;
accel_task->nbytes = accel_get_iovlen(iov, iov_cnt);
accel_task->crc_dst = crc_dst;
accel_task->seed = seed;
accel_task->op_code = ACCEL_OPC_CRC32C;
@ -482,7 +528,7 @@ spdk_accel_submit_crc32cv(struct spdk_io_channel *ch, uint32_t *crc_dst,
accel_task->dst_domain = NULL;
accel_task->step_cb_fn = NULL;
return module->submit_tasks(module_ch, accel_task);
return accel_submit_task(accel_ch, accel_task);
}
/* Accel framework public API for copy with CRC-32C function */
@ -493,8 +539,6 @@ spdk_accel_submit_copy_crc32c(struct spdk_io_channel *ch, void *dst,
{
struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
struct spdk_accel_task *accel_task;
struct spdk_accel_module_if *module = g_modules_opc[ACCEL_OPC_COPY_CRC32C].module;
struct spdk_io_channel *module_ch = accel_ch->module_ch[ACCEL_OPC_COPY_CRC32C];
accel_task = _get_task(accel_ch, cb_fn, cb_arg);
if (accel_task == NULL) {
@ -509,6 +553,7 @@ spdk_accel_submit_copy_crc32c(struct spdk_io_channel *ch, void *dst,
accel_task->s.iovs[0].iov_base = src;
accel_task->s.iovs[0].iov_len = nbytes;
accel_task->s.iovcnt = 1;
accel_task->nbytes = nbytes;
accel_task->crc_dst = crc_dst;
accel_task->seed = seed;
accel_task->flags = flags;
@ -517,7 +562,7 @@ spdk_accel_submit_copy_crc32c(struct spdk_io_channel *ch, void *dst,
accel_task->dst_domain = NULL;
accel_task->step_cb_fn = NULL;
return module->submit_tasks(module_ch, accel_task);
return accel_submit_task(accel_ch, accel_task);
}
/* Accel framework public API for chained copy + CRC-32C function */
@ -528,10 +573,7 @@ spdk_accel_submit_copy_crc32cv(struct spdk_io_channel *ch, void *dst,
{
struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
struct spdk_accel_task *accel_task;
struct spdk_accel_module_if *module = g_modules_opc[ACCEL_OPC_COPY_CRC32C].module;
struct spdk_io_channel *module_ch = accel_ch->module_ch[ACCEL_OPC_COPY_CRC32C];
uint64_t nbytes;
uint32_t i;
if (src_iovs == NULL) {
SPDK_ERRLOG("iov should not be NULL");
@ -550,17 +592,14 @@ spdk_accel_submit_copy_crc32cv(struct spdk_io_channel *ch, void *dst,
return -ENOMEM;
}
nbytes = 0;
for (i = 0; i < iov_cnt; i++) {
nbytes += src_iovs[i].iov_len;
}
nbytes = accel_get_iovlen(src_iovs, iov_cnt);
accel_task->d.iovs = &accel_task->aux_iovs[SPDK_ACCEL_AUX_IOV_DST];
accel_task->d.iovs[0].iov_base = dst;
accel_task->d.iovs[0].iov_len = nbytes;
accel_task->d.iovcnt = 1;
accel_task->s.iovs = src_iovs;
accel_task->s.iovcnt = iov_cnt;
accel_task->nbytes = nbytes;
accel_task->crc_dst = crc_dst;
accel_task->seed = seed;
accel_task->flags = flags;
@ -569,7 +608,7 @@ spdk_accel_submit_copy_crc32cv(struct spdk_io_channel *ch, void *dst,
accel_task->dst_domain = NULL;
accel_task->step_cb_fn = NULL;
return module->submit_tasks(module_ch, accel_task);
return accel_submit_task(accel_ch, accel_task);
}
int
@ -579,8 +618,6 @@ spdk_accel_submit_compress(struct spdk_io_channel *ch, void *dst, uint64_t nbyte
{
struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
struct spdk_accel_task *accel_task;
struct spdk_accel_module_if *module = g_modules_opc[ACCEL_OPC_COMPRESS].module;
struct spdk_io_channel *module_ch = accel_ch->module_ch[ACCEL_OPC_COMPRESS];
accel_task = _get_task(accel_ch, cb_fn, cb_arg);
if (accel_task == NULL) {
@ -594,13 +631,14 @@ spdk_accel_submit_compress(struct spdk_io_channel *ch, void *dst, uint64_t nbyte
accel_task->output_size = output_size;
accel_task->s.iovs = src_iovs;
accel_task->s.iovcnt = src_iovcnt;
accel_task->nbytes = nbytes;
accel_task->flags = flags;
accel_task->op_code = ACCEL_OPC_COMPRESS;
accel_task->src_domain = NULL;
accel_task->dst_domain = NULL;
accel_task->step_cb_fn = NULL;
return module->submit_tasks(module_ch, accel_task);
return accel_submit_task(accel_ch, accel_task);
}
int
@ -611,8 +649,6 @@ spdk_accel_submit_decompress(struct spdk_io_channel *ch, struct iovec *dst_iovs,
{
struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
struct spdk_accel_task *accel_task;
struct spdk_accel_module_if *module = g_modules_opc[ACCEL_OPC_DECOMPRESS].module;
struct spdk_io_channel *module_ch = accel_ch->module_ch[ACCEL_OPC_DECOMPRESS];
accel_task = _get_task(accel_ch, cb_fn, cb_arg);
if (accel_task == NULL) {
@ -624,13 +660,14 @@ spdk_accel_submit_decompress(struct spdk_io_channel *ch, struct iovec *dst_iovs,
accel_task->s.iovcnt = src_iovcnt;
accel_task->d.iovs = dst_iovs;
accel_task->d.iovcnt = dst_iovcnt;
accel_task->nbytes = accel_get_iovlen(src_iovs, src_iovcnt);
accel_task->flags = flags;
accel_task->op_code = ACCEL_OPC_DECOMPRESS;
accel_task->src_domain = NULL;
accel_task->dst_domain = NULL;
accel_task->step_cb_fn = NULL;
return module->submit_tasks(module_ch, accel_task);
return accel_submit_task(accel_ch, accel_task);
}
int
@ -642,8 +679,6 @@ spdk_accel_submit_encrypt(struct spdk_io_channel *ch, struct spdk_accel_crypto_k
{
struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
struct spdk_accel_task *accel_task;
struct spdk_accel_module_if *module = g_modules_opc[ACCEL_OPC_ENCRYPT].module;
struct spdk_io_channel *module_ch = accel_ch->module_ch[ACCEL_OPC_ENCRYPT];
if (spdk_unlikely(!dst_iovs || !dst_iovcnt || !src_iovs || !src_iovcnt || !key || !block_size)) {
return -EINVAL;
@ -659,6 +694,7 @@ spdk_accel_submit_encrypt(struct spdk_io_channel *ch, struct spdk_accel_crypto_k
accel_task->s.iovcnt = src_iovcnt;
accel_task->d.iovs = dst_iovs;
accel_task->d.iovcnt = dst_iovcnt;
accel_task->nbytes = accel_get_iovlen(src_iovs, src_iovcnt);
accel_task->iv = iv;
accel_task->block_size = block_size;
accel_task->flags = flags;
@ -667,7 +703,7 @@ spdk_accel_submit_encrypt(struct spdk_io_channel *ch, struct spdk_accel_crypto_k
accel_task->dst_domain = NULL;
accel_task->step_cb_fn = NULL;
return module->submit_tasks(module_ch, accel_task);
return accel_submit_task(accel_ch, accel_task);
}
int
@ -679,8 +715,6 @@ spdk_accel_submit_decrypt(struct spdk_io_channel *ch, struct spdk_accel_crypto_k
{
struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
struct spdk_accel_task *accel_task;
struct spdk_accel_module_if *module = g_modules_opc[ACCEL_OPC_DECRYPT].module;
struct spdk_io_channel *module_ch = accel_ch->module_ch[ACCEL_OPC_DECRYPT];
if (spdk_unlikely(!dst_iovs || !dst_iovcnt || !src_iovs || !src_iovcnt || !key || !block_size)) {
return -EINVAL;
@ -696,6 +730,7 @@ spdk_accel_submit_decrypt(struct spdk_io_channel *ch, struct spdk_accel_crypto_k
accel_task->s.iovcnt = src_iovcnt;
accel_task->d.iovs = dst_iovs;
accel_task->d.iovcnt = dst_iovcnt;
accel_task->nbytes = accel_get_iovlen(src_iovs, src_iovcnt);
accel_task->iv = iv;
accel_task->block_size = block_size;
accel_task->flags = flags;
@ -704,7 +739,7 @@ spdk_accel_submit_decrypt(struct spdk_io_channel *ch, struct spdk_accel_crypto_k
accel_task->dst_domain = NULL;
accel_task->step_cb_fn = NULL;
return module->submit_tasks(module_ch, accel_task);
return accel_submit_task(accel_ch, accel_task);
}
int
@ -713,8 +748,6 @@ spdk_accel_submit_xor(struct spdk_io_channel *ch, void *dst, void **sources, uin
{
struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
struct spdk_accel_task *accel_task;
struct spdk_accel_module_if *module = g_modules_opc[ACCEL_OPC_XOR].module;
struct spdk_io_channel *module_ch = accel_ch->module_ch[ACCEL_OPC_XOR];
accel_task = _get_task(accel_ch, cb_fn, cb_arg);
if (accel_task == NULL) {
@ -727,12 +760,13 @@ spdk_accel_submit_xor(struct spdk_io_channel *ch, void *dst, void **sources, uin
accel_task->d.iovs[0].iov_base = dst;
accel_task->d.iovs[0].iov_len = nbytes;
accel_task->d.iovcnt = 1;
accel_task->nbytes = nbytes;
accel_task->op_code = ACCEL_OPC_XOR;
accel_task->src_domain = NULL;
accel_task->dst_domain = NULL;
accel_task->step_cb_fn = NULL;
return module->submit_tasks(module_ch, accel_task);
return accel_submit_task(accel_ch, accel_task);
}
static inline struct accel_buffer *
@ -863,6 +897,7 @@ spdk_accel_append_copy(struct spdk_accel_sequence **pseq, struct spdk_io_channel
task->src_domain_ctx = src_domain_ctx;
task->s.iovs = src_iovs;
task->s.iovcnt = src_iovcnt;
task->nbytes = accel_get_iovlen(src_iovs, src_iovcnt);
task->flags = flags;
task->op_code = ACCEL_OPC_COPY;
@ -905,6 +940,7 @@ spdk_accel_append_fill(struct spdk_accel_sequence **pseq, struct spdk_io_channel
task->d.iovs[0].iov_base = buf;
task->d.iovs[0].iov_len = len;
task->d.iovcnt = 1;
task->nbytes = len;
task->src_domain = NULL;
task->dst_domain = domain;
task->dst_domain_ctx = domain_ctx;
@ -956,6 +992,7 @@ spdk_accel_append_decompress(struct spdk_accel_sequence **pseq, struct spdk_io_c
task->src_domain_ctx = src_domain_ctx;
task->s.iovs = src_iovs;
task->s.iovcnt = src_iovcnt;
task->nbytes = accel_get_iovlen(src_iovs, src_iovcnt);
task->flags = flags;
task->op_code = ACCEL_OPC_DECOMPRESS;
@ -1010,6 +1047,7 @@ spdk_accel_append_encrypt(struct spdk_accel_sequence **pseq, struct spdk_io_chan
task->dst_domain_ctx = dst_domain_ctx;
task->d.iovs = dst_iovs;
task->d.iovcnt = dst_iovcnt;
task->nbytes = accel_get_iovlen(src_iovs, src_iovcnt);
task->iv = iv;
task->block_size = block_size;
task->flags = flags;
@ -1066,6 +1104,7 @@ spdk_accel_append_decrypt(struct spdk_accel_sequence **pseq, struct spdk_io_chan
task->dst_domain_ctx = dst_domain_ctx;
task->d.iovs = dst_iovs;
task->d.iovcnt = dst_iovcnt;
task->nbytes = accel_get_iovlen(src_iovs, src_iovcnt);
task->iv = iv;
task->block_size = block_size;
task->flags = flags;
@ -1077,6 +1116,49 @@ spdk_accel_append_decrypt(struct spdk_accel_sequence **pseq, struct spdk_io_chan
return 0;
}
int
spdk_accel_append_crc32c(struct spdk_accel_sequence **pseq, struct spdk_io_channel *ch,
uint32_t *dst, struct iovec *iovs, uint32_t iovcnt,
struct spdk_memory_domain *domain, void *domain_ctx,
uint32_t seed, spdk_accel_step_cb cb_fn, void *cb_arg)
{
struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
struct spdk_accel_task *task;
struct spdk_accel_sequence *seq = *pseq;
if (seq == NULL) {
seq = accel_sequence_get(accel_ch);
if (spdk_unlikely(seq == NULL)) {
return -ENOMEM;
}
}
assert(seq->ch == accel_ch);
task = accel_sequence_get_task(accel_ch, seq, cb_fn, cb_arg);
if (spdk_unlikely(task == NULL)) {
if (*pseq == NULL) {
accel_sequence_put(seq);
}
return -ENOMEM;
}
task->s.iovs = iovs;
task->s.iovcnt = iovcnt;
task->src_domain = domain;
task->src_domain_ctx = domain_ctx;
task->nbytes = accel_get_iovlen(iovs, iovcnt);
task->crc_dst = dst;
task->seed = seed;
task->op_code = ACCEL_OPC_CRC32C;
task->dst_domain = NULL;
TAILQ_INSERT_TAIL(&seq->tasks, task, seq_link);
*pseq = seq;
return 0;
}
int
spdk_accel_get_buf(struct spdk_io_channel *ch, uint64_t len, void **buf,
struct spdk_memory_domain **domain, void **domain_ctx)
@ -1146,6 +1228,11 @@ accel_sequence_complete(struct spdk_accel_sequence *seq)
{
SPDK_DEBUGLOG(accel, "Completed sequence: %p with status: %d\n", seq, seq->status);
accel_update_stats(seq->ch, sequence_executed, 1);
if (spdk_unlikely(seq->status != 0)) {
accel_update_stats(seq->ch, sequence_failed, 1);
}
/* First, notify all users who appended operations to this sequence */
accel_sequence_complete_tasks(seq);
@ -1304,19 +1391,6 @@ spdk_accel_sequence_next_task(struct spdk_accel_task *task)
return TAILQ_NEXT(task, seq_link);
}
static inline uint64_t
accel_get_iovlen(struct iovec *iovs, uint32_t iovcnt)
{
uint64_t result = 0;
uint32_t i;
for (i = 0; i < iovcnt; ++i) {
result += iovs[i].iov_len;
}
return result;
}
static inline void
accel_set_bounce_buffer(struct spdk_accel_bounce_buffer *bounce, struct iovec **iovs,
uint32_t *iovcnt, struct spdk_memory_domain **domain, void **domain_ctx,
@ -1499,8 +1573,6 @@ static void
accel_process_sequence(struct spdk_accel_sequence *seq)
{
struct accel_io_channel *accel_ch = seq->ch;
struct spdk_accel_module_if *module;
struct spdk_io_channel *module_ch;
struct spdk_accel_task *task;
enum accel_sequence_state state;
int rc;
@ -1558,11 +1630,8 @@ accel_process_sequence(struct spdk_accel_sequence *seq)
SPDK_DEBUGLOG(accel, "Executing %s operation, sequence: %p\n",
g_opcode_strings[task->op_code], seq);
module = g_modules_opc[task->op_code].module;
module_ch = accel_ch->module_ch[task->op_code];
accel_sequence_set_state(seq, ACCEL_SEQUENCE_STATE_AWAIT_TASK);
rc = module->submit_tasks(module_ch, task);
rc = accel_submit_task(accel_ch, task);
if (spdk_unlikely(rc != 0)) {
SPDK_ERRLOG("Failed to submit %s operation, sequence: %p\n",
g_opcode_strings[task->op_code], seq);
@ -1715,6 +1784,57 @@ accel_compare_iovs(struct iovec *iova, uint32_t iovacnt, struct iovec *iovb, uin
return memcmp(iova, iovb, sizeof(*iova) * iovacnt) == 0;
}
static bool
accel_task_set_dstbuf(struct spdk_accel_task *task, struct spdk_accel_task *next)
{
struct spdk_accel_task *prev;
switch (task->op_code) {
case ACCEL_OPC_DECOMPRESS:
case ACCEL_OPC_FILL:
case ACCEL_OPC_ENCRYPT:
case ACCEL_OPC_DECRYPT:
if (task->dst_domain != next->src_domain) {
return false;
}
if (!accel_compare_iovs(task->d.iovs, task->d.iovcnt,
next->s.iovs, next->s.iovcnt)) {
return false;
}
task->d.iovs = next->d.iovs;
task->d.iovcnt = next->d.iovcnt;
task->dst_domain = next->dst_domain;
task->dst_domain_ctx = next->dst_domain_ctx;
break;
case ACCEL_OPC_CRC32C:
/* crc32 is special, because it doesn't have a dst buffer */
if (task->src_domain != next->src_domain) {
return false;
}
if (!accel_compare_iovs(task->s.iovs, task->s.iovcnt,
next->s.iovs, next->s.iovcnt)) {
return false;
}
/* We can only change crc32's buffer if we can also change the previous task's
 * buffer, since crc32 reads what that task wrote */
prev = TAILQ_PREV(task, accel_sequence_tasks, seq_link);
if (prev == NULL) {
return false;
}
if (!accel_task_set_dstbuf(prev, next)) {
return false;
}
task->s.iovs = next->d.iovs;
task->s.iovcnt = next->d.iovcnt;
task->src_domain = next->dst_domain;
task->src_domain_ctx = next->dst_domain_ctx;
break;
default:
return false;
}
return true;
}
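The core of the copy-elision rule above is: a producer task may write straight into a following copy's destination, but only when the copy's source matches the producer's destination exactly. A minimal model of that check, with hypothetical types in place of `spdk_accel_task`:

```c
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <sys/uio.h>

struct my_task {
	struct iovec *d_iovs, *s_iovs;
	uint32_t d_iovcnt, s_iovcnt;
};

/* Same idea as accel_compare_iovs(): the vectors must match element for
 * element (base and length), not merely describe the same bytes. */
static bool
my_compare_iovs(struct iovec *a, uint32_t acnt, struct iovec *b, uint32_t bcnt)
{
	return acnt == bcnt && memcmp(a, b, sizeof(*a) * acnt) == 0;
}

/* Retarget `task` to write directly into `copy`'s destination, eliding the
 * copy, when the copy reads exactly what `task` writes. */
static bool
my_set_dstbuf(struct my_task *task, struct my_task *copy)
{
	if (!my_compare_iovs(task->d_iovs, task->d_iovcnt, copy->s_iovs, copy->s_iovcnt)) {
		return false;
	}

	task->d_iovs = copy->d_iovs;
	task->d_iovcnt = copy->d_iovcnt;
	return true;
}
```

The CRC32C case in the real code adds a twist: since CRC has no destination buffer, retargeting its *source* requires recursively retargeting the previous task's destination as well.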
static void
accel_sequence_merge_tasks(struct spdk_accel_sequence *seq, struct spdk_accel_task *task,
struct spdk_accel_task **next_task)
@ -1731,7 +1851,8 @@ accel_sequence_merge_tasks(struct spdk_accel_sequence *seq, struct spdk_accel_ta
if (next->op_code != ACCEL_OPC_DECOMPRESS &&
next->op_code != ACCEL_OPC_COPY &&
next->op_code != ACCEL_OPC_ENCRYPT &&
next->op_code != ACCEL_OPC_DECRYPT) {
next->op_code != ACCEL_OPC_DECRYPT &&
next->op_code != ACCEL_OPC_CRC32C) {
break;
}
if (task->dst_domain != next->src_domain) {
@ -1744,6 +1865,7 @@ accel_sequence_merge_tasks(struct spdk_accel_sequence *seq, struct spdk_accel_ta
next->s.iovs = task->s.iovs;
next->s.iovcnt = task->s.iovcnt;
next->src_domain = task->src_domain;
next->src_domain_ctx = task->src_domain_ctx;
TAILQ_REMOVE(&seq->tasks, task, seq_link);
TAILQ_INSERT_TAIL(&seq->completed, task, seq_link);
break;
@ -1751,20 +1873,14 @@ accel_sequence_merge_tasks(struct spdk_accel_sequence *seq, struct spdk_accel_ta
case ACCEL_OPC_FILL:
case ACCEL_OPC_ENCRYPT:
case ACCEL_OPC_DECRYPT:
case ACCEL_OPC_CRC32C:
/* We can only merge tasks when one of them is a copy */
if (next->op_code != ACCEL_OPC_COPY) {
break;
}
if (task->dst_domain != next->src_domain) {
if (!accel_task_set_dstbuf(task, next)) {
break;
}
if (!accel_compare_iovs(task->d.iovs, task->d.iovcnt,
next->s.iovs, next->s.iovcnt)) {
break;
}
task->d.iovs = next->d.iovs;
task->d.iovcnt = next->d.iovcnt;
task->dst_domain = next->dst_domain;
/* We're removing next_task from the tasks queue, so we need to update its
 * pointer to keep the TAILQ_FOREACH_SAFE() loop below working correctly */
*next_task = TAILQ_NEXT(next, seq_link);
@ -1777,7 +1893,7 @@ accel_sequence_merge_tasks(struct spdk_accel_sequence *seq, struct spdk_accel_ta
}
}
int
void
spdk_accel_sequence_finish(struct spdk_accel_sequence *seq,
spdk_accel_completion_cb cb_fn, void *cb_arg)
{
@ -1795,8 +1911,6 @@ spdk_accel_sequence_finish(struct spdk_accel_sequence *seq,
seq->cb_arg = cb_arg;
accel_process_sequence(seq);
return 0;
}
void
@ -2097,19 +2211,20 @@ accel_create_channel(void *io_device, void *ctx_buf)
struct spdk_accel_sequence *seq;
struct accel_buffer *buf;
uint8_t *task_mem;
int i = 0, j, rc;
uint32_t i = 0, j;
int rc;
accel_ch->task_pool_base = calloc(MAX_TASKS_PER_CHANNEL, g_max_accel_module_size);
accel_ch->task_pool_base = calloc(g_opts.task_count, g_max_accel_module_size);
if (accel_ch->task_pool_base == NULL) {
return -ENOMEM;
}
accel_ch->seq_pool_base = calloc(MAX_TASKS_PER_CHANNEL, sizeof(struct spdk_accel_sequence));
accel_ch->seq_pool_base = calloc(g_opts.sequence_count, sizeof(struct spdk_accel_sequence));
if (accel_ch->seq_pool_base == NULL) {
goto err;
}
accel_ch->buf_pool_base = calloc(MAX_TASKS_PER_CHANNEL, sizeof(struct accel_buffer));
accel_ch->buf_pool_base = calloc(g_opts.buf_count, sizeof(struct accel_buffer));
if (accel_ch->buf_pool_base == NULL) {
goto err;
}
@ -2117,16 +2232,21 @@ accel_create_channel(void *io_device, void *ctx_buf)
TAILQ_INIT(&accel_ch->task_pool);
TAILQ_INIT(&accel_ch->seq_pool);
TAILQ_INIT(&accel_ch->buf_pool);
task_mem = accel_ch->task_pool_base;
for (i = 0 ; i < MAX_TASKS_PER_CHANNEL; i++) {
for (i = 0; i < g_opts.task_count; i++) {
accel_task = (struct spdk_accel_task *)task_mem;
seq = &accel_ch->seq_pool_base[i];
buf = &accel_ch->buf_pool_base[i];
TAILQ_INSERT_TAIL(&accel_ch->task_pool, accel_task, link);
TAILQ_INSERT_TAIL(&accel_ch->seq_pool, seq, link);
TAILQ_INSERT_TAIL(&accel_ch->buf_pool, buf, link);
task_mem += g_max_accel_module_size;
}
for (i = 0; i < g_opts.sequence_count; i++) {
seq = &accel_ch->seq_pool_base[i];
TAILQ_INSERT_TAIL(&accel_ch->seq_pool, seq, link);
}
for (i = 0; i < g_opts.buf_count; i++) {
buf = &accel_ch->buf_pool_base[i];
TAILQ_INSERT_TAIL(&accel_ch->buf_pool, buf, link);
}
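The channel-creation code above pre-allocates each pool as one contiguous block and threads every object onto a TAILQ free list (the task pool additionally strides by `g_max_accel_module_size` to leave room for module-private context). A sketch of the same pattern with hypothetical names:

```c
#include <stdint.h>
#include <stdlib.h>
#include <sys/queue.h>

struct my_obj {
	TAILQ_ENTRY(my_obj) link;
};

TAILQ_HEAD(my_pool, my_obj);

/* Carve one contiguous allocation into `count` objects of `obj_size` bytes
 * each and push them all onto the free list. Returns the base allocation,
 * which the caller later releases with a single free(). */
static void *
my_pool_init(struct my_pool *pool, uint32_t count, size_t obj_size)
{
	uint8_t *base, *mem;
	uint32_t i;

	base = calloc(count, obj_size);
	if (base == NULL) {
		return NULL;
	}

	TAILQ_INIT(pool);
	for (mem = base, i = 0; i < count; i++, mem += obj_size) {
		TAILQ_INSERT_TAIL(pool, (struct my_obj *)mem, link);
	}

	return base;
}
```

Splitting the single `MAX_TASKS_PER_CHANNEL` loop into three, as the diff does, is what allows `task_count`, `sequence_count`, and `buf_count` to be tuned independently.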
/* Assign modules and get IO channels for each */
for (i = 0; i < ACCEL_OPC_LAST; i++) {
@ -2137,8 +2257,8 @@ accel_create_channel(void *io_device, void *ctx_buf)
}
}
rc = spdk_iobuf_channel_init(&accel_ch->iobuf, "accel", ACCEL_SMALL_CACHE_SIZE,
ACCEL_LARGE_CACHE_SIZE);
rc = spdk_iobuf_channel_init(&accel_ch->iobuf, "accel", g_opts.small_cache_size,
g_opts.large_cache_size);
if (rc != 0) {
SPDK_ERRLOG("Failed to initialize iobuf accel channel\n");
goto err;
@ -2152,9 +2272,24 @@ err:
free(accel_ch->task_pool_base);
free(accel_ch->seq_pool_base);
free(accel_ch->buf_pool_base);
return -ENOMEM;
}
static void
accel_add_stats(struct accel_stats *total, struct accel_stats *stats)
{
int i;
total->sequence_executed += stats->sequence_executed;
total->sequence_failed += stats->sequence_failed;
for (i = 0; i < ACCEL_OPC_LAST; ++i) {
total->operations[i].executed += stats->operations[i].executed;
total->operations[i].failed += stats->operations[i].failed;
total->operations[i].num_bytes += stats->operations[i].num_bytes;
}
}
/* Framework level channel destroy callback. */
static void
accel_destroy_channel(void *io_device, void *ctx_buf)
@ -2170,6 +2305,11 @@ accel_destroy_channel(void *io_device, void *ctx_buf)
accel_ch->module_ch[i] = NULL;
}
/* Update the global stats to make sure the channel's stats aren't lost after the channel is gone */
spdk_spin_lock(&g_stats_lock);
accel_add_stats(&g_stats, &accel_ch->stats);
spdk_spin_unlock(&g_stats_lock);
free(accel_ch->task_pool_base);
free(accel_ch->seq_pool_base);
free(accel_ch->buf_pool_base);
@ -2217,6 +2357,7 @@ spdk_accel_initialize(void)
}
spdk_spin_init(&g_keyring_spin);
spdk_spin_init(&g_stats_lock);
g_modules_started = true;
accel_module_initialize();
@ -2341,6 +2482,21 @@ _accel_crypto_key_write_config_json(struct spdk_json_write_ctx *w,
spdk_json_write_object_end(w);
}
static void
accel_write_options(struct spdk_json_write_ctx *w)
{
spdk_json_write_object_begin(w);
spdk_json_write_named_string(w, "method", "accel_set_options");
spdk_json_write_named_object_begin(w, "params");
spdk_json_write_named_uint32(w, "small_cache_size", g_opts.small_cache_size);
spdk_json_write_named_uint32(w, "large_cache_size", g_opts.large_cache_size);
spdk_json_write_named_uint32(w, "task_count", g_opts.task_count);
spdk_json_write_named_uint32(w, "sequence_count", g_opts.sequence_count);
spdk_json_write_named_uint32(w, "buf_count", g_opts.buf_count);
spdk_json_write_object_end(w);
spdk_json_write_object_end(w);
}
static void
_accel_crypto_keys_write_config_json(struct spdk_json_write_ctx *w, bool full_dump)
{
@ -2369,11 +2525,9 @@ spdk_accel_write_config_json(struct spdk_json_write_ctx *w)
struct spdk_accel_module_if *accel_module;
int i;
/*
* The accel fw has no config, there may be some in
* the modules though.
*/
spdk_json_write_array_begin(w);
accel_write_options(w);
TAILQ_FOREACH(accel_module, &spdk_accel_module_list, tailq) {
if (accel_module->write_config_json) {
accel_module->write_config_json(w);
@ -2401,6 +2555,7 @@ spdk_accel_module_finish(void)
if (!g_accel_module) {
spdk_spin_destroy(&g_keyring_spin);
spdk_spin_destroy(&g_stats_lock);
accel_module_finish_cb();
return;
}
@ -2412,17 +2567,12 @@ spdk_accel_module_finish(void)
}
}
void
spdk_accel_finish(spdk_accel_fini_cb cb_fn, void *cb_arg)
static void
accel_io_device_unregister_cb(void *io_device)
{
struct spdk_accel_crypto_key *key, *key_tmp;
enum accel_opcode op;
assert(cb_fn != NULL);
g_fini_cb_fn = cb_fn;
g_fini_cb_arg = cb_arg;
spdk_spin_lock(&g_keyring_spin);
TAILQ_FOREACH_SAFE(key, &g_keyring, link, key_tmp) {
accel_crypto_key_destroy_unsafe(key);
@ -2437,10 +2587,20 @@ spdk_accel_finish(spdk_accel_fini_cb cb_fn, void *cb_arg)
g_modules_opc[op].module = NULL;
}
spdk_io_device_unregister(&spdk_accel_module_list, NULL);
spdk_accel_module_finish();
}
void
spdk_accel_finish(spdk_accel_fini_cb cb_fn, void *cb_arg)
{
assert(cb_fn != NULL);
g_fini_cb_fn = cb_fn;
g_fini_cb_arg = cb_arg;
spdk_io_device_unregister(&spdk_accel_module_list, accel_io_device_unregister_cb);
}
static struct spdk_accel_driver *
accel_find_driver(const char *name)
{
@ -2483,4 +2643,98 @@ spdk_accel_driver_register(struct spdk_accel_driver *driver)
TAILQ_INSERT_TAIL(&g_accel_drivers, driver, tailq);
}
int
spdk_accel_set_opts(const struct spdk_accel_opts *opts)
{
if (opts->size > sizeof(*opts)) {
return -EINVAL;
}
memcpy(&g_opts, opts, opts->size);
return 0;
}
void
spdk_accel_get_opts(struct spdk_accel_opts *opts)
{
size_t size = opts->size;
assert(size <= sizeof(*opts));
memcpy(opts, &g_opts, spdk_min(sizeof(*opts), size));
opts->size = size;
}
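`spdk_accel_get_opts()` implements the size-versioned options idiom: the caller sets `opts->size` to the struct size it was compiled against, and only that prefix is copied, so old binaries keep working as fields are appended. A sketch with a hypothetical struct:

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct my_opts {
	size_t size;        /* set by the caller before the get call */
	uint32_t task_count;
	uint32_t buf_count; /* newer field an old caller doesn't know about */
};

static struct my_opts g_my_opts = { sizeof(struct my_opts), 2048, 1024 };

/* Copy only the prefix the caller knows about, then restore the caller's
 * declared size, mirroring spdk_accel_get_opts(). */
static void
my_get_opts(struct my_opts *opts)
{
	size_t size = opts->size;

	memcpy(opts, &g_my_opts, size < sizeof(g_my_opts) ? size : sizeof(g_my_opts));
	opts->size = size;
}
```

The matching setter does the inverse, copying `opts->size` bytes over defaults so unknown tail fields keep their default values.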
struct accel_get_stats_ctx {
struct accel_stats stats;
accel_get_stats_cb cb_fn;
void *cb_arg;
};
static void
accel_get_channel_stats_done(struct spdk_io_channel_iter *iter, int status)
{
struct accel_get_stats_ctx *ctx = spdk_io_channel_iter_get_ctx(iter);
ctx->cb_fn(&ctx->stats, ctx->cb_arg);
free(ctx);
}
static void
accel_get_channel_stats(struct spdk_io_channel_iter *iter)
{
struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(iter);
struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
struct accel_get_stats_ctx *ctx = spdk_io_channel_iter_get_ctx(iter);
accel_add_stats(&ctx->stats, &accel_ch->stats);
spdk_for_each_channel_continue(iter, 0);
}
int
accel_get_stats(accel_get_stats_cb cb_fn, void *cb_arg)
{
struct accel_get_stats_ctx *ctx;
ctx = calloc(1, sizeof(*ctx));
if (ctx == NULL) {
return -ENOMEM;
}
spdk_spin_lock(&g_stats_lock);
accel_add_stats(&ctx->stats, &g_stats);
spdk_spin_unlock(&g_stats_lock);
ctx->cb_fn = cb_fn;
ctx->cb_arg = cb_arg;
spdk_for_each_channel(&spdk_accel_module_list, accel_get_channel_stats, ctx,
accel_get_channel_stats_done);
return 0;
}
void
spdk_accel_get_opcode_stats(struct spdk_io_channel *ch, enum accel_opcode opcode,
struct spdk_accel_opcode_stats *stats, size_t size)
{
struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
#define FIELD_OK(field) \
offsetof(struct spdk_accel_opcode_stats, field) + sizeof(stats->field) <= size
#define SET_FIELD(field, value) \
if (FIELD_OK(field)) { \
stats->field = value; \
}
SET_FIELD(executed, accel_ch->stats.operations[opcode].executed);
SET_FIELD(failed, accel_ch->stats.operations[opcode].failed);
SET_FIELD(num_bytes, accel_ch->stats.operations[opcode].num_bytes);
#undef FIELD_OK
#undef SET_FIELD
}
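The FIELD_OK/SET_FIELD macros above implement field-granular truncation: a field is written only if it fits entirely within the size the caller declared, so an older caller with a shorter struct is never overrun. A standalone illustration of the same macros (hypothetical `op_stats` struct and values):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct op_stats {
	uint64_t executed;
	uint64_t failed;
	uint64_t num_bytes;
};

/* Fill only fields that lie completely within the caller's `size`. */
static void
get_op_stats(struct op_stats *stats, size_t size)
{
#define FIELD_OK(field) \
	(offsetof(struct op_stats, field) + sizeof(stats->field) <= size)
#define SET_FIELD(field, value) \
	do { if (FIELD_OK(field)) { stats->field = (value); } } while (0)
	SET_FIELD(executed, 100);
	SET_FIELD(failed, 3);
	SET_FIELD(num_bytes, 4096);
#undef FIELD_OK
#undef SET_FIELD
}
```

Unlike the bulk-memcpy variant, this works even when new fields are appended: each assignment is individually bounds-checked against the caller's struct size.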
SPDK_LOG_REGISTER_COMPONENT(accel)

View File

@ -20,10 +20,24 @@ struct module_info {
uint32_t num_ops;
};
struct accel_operation_stats {
uint64_t executed;
uint64_t failed;
uint64_t num_bytes;
};
struct accel_stats {
struct accel_operation_stats operations[ACCEL_OPC_LAST];
uint64_t sequence_executed;
uint64_t sequence_failed;
};
typedef void (*_accel_for_each_module_fn)(struct module_info *info);
void _accel_for_each_module(struct module_info *info, _accel_for_each_module_fn fn);
int _accel_get_opc_name(enum accel_opcode opcode, const char **opcode_name);
void _accel_crypto_key_dump_param(struct spdk_json_write_ctx *w, struct spdk_accel_crypto_key *key);
void _accel_crypto_keys_dump_param(struct spdk_json_write_ctx *w);
typedef void (*accel_get_stats_cb)(struct accel_stats *stats, void *cb_arg);
int accel_get_stats(accel_get_stats_cb cb_fn, void *cb_arg);
#endif

View File

@ -5,7 +5,7 @@
*/
#include "accel_internal.h"
#include "spdk_internal/accel_module.h"
#include "spdk/accel_module.h"
#include "spdk/rpc.h"
#include "spdk/util.h"
@ -354,3 +354,102 @@ cleanup:
free_rpc_accel_set_driver(&req);
}
SPDK_RPC_REGISTER("accel_set_driver", rpc_accel_set_driver, SPDK_RPC_STARTUP)
struct rpc_accel_opts {
uint32_t small_cache_size;
uint32_t large_cache_size;
uint32_t task_count;
uint32_t sequence_count;
uint32_t buf_count;
};
static const struct spdk_json_object_decoder rpc_accel_set_options_decoders[] = {
{"small_cache_size", offsetof(struct rpc_accel_opts, small_cache_size), spdk_json_decode_uint32, true},
{"large_cache_size", offsetof(struct rpc_accel_opts, large_cache_size), spdk_json_decode_uint32, true},
{"task_count", offsetof(struct rpc_accel_opts, task_count), spdk_json_decode_uint32, true},
{"sequence_count", offsetof(struct rpc_accel_opts, sequence_count), spdk_json_decode_uint32, true},
{"buf_count", offsetof(struct rpc_accel_opts, buf_count), spdk_json_decode_uint32, true},
};
static void
rpc_accel_set_options(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params)
{
struct spdk_accel_opts opts = { .size = sizeof(opts) };
struct rpc_accel_opts rpc_opts;
int rc;
/* We can't pass spdk_accel_opts directly to spdk_json_decode_object(), because that
* structure is packed, leading to undefined behavior due to misaligned pointer access */
spdk_accel_get_opts(&opts);
rpc_opts.small_cache_size = opts.small_cache_size;
rpc_opts.large_cache_size = opts.large_cache_size;
rpc_opts.task_count = opts.task_count;
rpc_opts.sequence_count = opts.sequence_count;
rpc_opts.buf_count = opts.buf_count;
if (spdk_json_decode_object(params, rpc_accel_set_options_decoders,
SPDK_COUNTOF(rpc_accel_set_options_decoders), &rpc_opts)) {
spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_PARSE_ERROR,
"spdk_json_decode_object failed");
return;
}
opts.small_cache_size = rpc_opts.small_cache_size;
opts.large_cache_size = rpc_opts.large_cache_size;
opts.task_count = rpc_opts.task_count;
opts.sequence_count = rpc_opts.sequence_count;
opts.buf_count = rpc_opts.buf_count;
rc = spdk_accel_set_opts(&opts);
if (rc != 0) {
spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
return;
}
spdk_jsonrpc_send_bool_response(request, true);
}
SPDK_RPC_REGISTER("accel_set_options", rpc_accel_set_options, SPDK_RPC_STARTUP)
static void
rpc_accel_get_stats_done(struct accel_stats *stats, void *cb_arg)
{
struct spdk_jsonrpc_request *request = cb_arg;
struct spdk_json_write_ctx *w;
const char *name;
int i;
w = spdk_jsonrpc_begin_result(request);
spdk_json_write_object_begin(w);
spdk_json_write_named_uint64(w, "sequence_executed", stats->sequence_executed);
spdk_json_write_named_uint64(w, "sequence_failed", stats->sequence_failed);
spdk_json_write_named_array_begin(w, "operations");
for (i = 0; i < ACCEL_OPC_LAST; ++i) {
if (stats->operations[i].executed + stats->operations[i].failed == 0) {
continue;
}
_accel_get_opc_name(i, &name);
spdk_json_write_object_begin(w);
spdk_json_write_named_string(w, "opcode", name);
spdk_json_write_named_uint64(w, "executed", stats->operations[i].executed);
spdk_json_write_named_uint64(w, "failed", stats->operations[i].failed);
spdk_json_write_named_uint64(w, "num_bytes", stats->operations[i].num_bytes);
spdk_json_write_object_end(w);
}
spdk_json_write_array_end(w);
spdk_json_write_object_end(w);
spdk_jsonrpc_end_result(request, w);
}
static void
rpc_accel_get_stats(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params)
{
int rc;
rc = accel_get_stats(rpc_accel_get_stats_done, request);
if (rc != 0) {
spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
}
}
SPDK_RPC_REGISTER("accel_get_stats", rpc_accel_get_stats, SPDK_RPC_RUNTIME)

View File

@ -6,7 +6,7 @@
#include "spdk/stdinc.h"
#include "spdk_internal/accel_module.h"
#include "spdk/accel_module.h"
#include "accel_internal.h"
#include "spdk/env.h"

View File

@ -26,6 +26,7 @@
spdk_accel_append_decompress;
spdk_accel_append_encrypt;
spdk_accel_append_decrypt;
spdk_accel_append_crc32c;
spdk_accel_sequence_finish;
spdk_accel_sequence_abort;
spdk_accel_sequence_reverse;
@ -36,6 +37,9 @@
spdk_accel_crypto_key_get;
spdk_accel_set_driver;
spdk_accel_get_memory_domain;
spdk_accel_set_opts;
spdk_accel_get_opts;
spdk_accel_get_opcode_stats;
# functions needed by modules
spdk_accel_module_list_add;

File diff suppressed because it is too large

View File

@ -663,6 +663,7 @@ rpc_dump_bdev_info(void *ctx, struct spdk_bdev *bdev)
struct spdk_bdev_alias *tmp;
uint64_t qos_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
struct spdk_memory_domain **domains;
char uuid_str[SPDK_UUID_STRING_LEN];
int i, rc;
spdk_json_write_object_begin(w);
@ -683,12 +684,8 @@ rpc_dump_bdev_info(void *ctx, struct spdk_bdev *bdev)
spdk_json_write_named_uint64(w, "num_blocks", spdk_bdev_get_num_blocks(bdev));
if (!spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) {
char uuid_str[SPDK_UUID_STRING_LEN];
spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid);
spdk_json_write_named_string(w, "uuid", uuid_str);
}
if (spdk_bdev_get_md_size(bdev) != 0) {
spdk_json_write_named_uint32(w, "md_size", spdk_bdev_get_md_size(bdev));

View File

@ -497,13 +497,68 @@ spdk_bdev_part_base_construct_ext(const char *bdev_name,
return 0;
}
void
spdk_bdev_part_construct_opts_init(struct spdk_bdev_part_construct_opts *opts, uint64_t size)
{
if (opts == NULL) {
SPDK_ERRLOG("opts should not be NULL\n");
assert(opts != NULL);
return;
}
if (size == 0) {
SPDK_ERRLOG("size should not be zero\n");
assert(size != 0);
return;
}
memset(opts, 0, size);
opts->opts_size = size;
}
static void
part_construct_opts_copy(const struct spdk_bdev_part_construct_opts *src,
struct spdk_bdev_part_construct_opts *dst)
{
if (src->opts_size == 0) {
SPDK_ERRLOG("size should not be zero\n");
assert(false);
}
memset(dst, 0, sizeof(*dst));
dst->opts_size = src->opts_size;
#define FIELD_OK(field) \
offsetof(struct spdk_bdev_part_construct_opts, field) + sizeof(src->field) <= src->opts_size
#define SET_FIELD(field) \
if (FIELD_OK(field)) { \
dst->field = src->field; \
} \
SET_FIELD(uuid);
/* You should not remove this statement, but need to update the assert statement
* if you add a new field, and also add a corresponding SET_FIELD statement */
SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_part_construct_opts) == 24, "Incorrect size");
#undef FIELD_OK
#undef SET_FIELD
}
int
spdk_bdev_part_construct(struct spdk_bdev_part *part, struct spdk_bdev_part_base *base,
spdk_bdev_part_construct_ext(struct spdk_bdev_part *part, struct spdk_bdev_part_base *base,
char *name, uint64_t offset_blocks, uint64_t num_blocks,
char *product_name)
char *product_name, const struct spdk_bdev_part_construct_opts *_opts)
{
int rc;
bool first_claimed = false;
struct spdk_bdev_part_construct_opts opts;
if (_opts == NULL) {
spdk_bdev_part_construct_opts_init(&opts, sizeof(opts));
} else {
part_construct_opts_copy(_opts, &opts);
}
part->internal.bdev.blocklen = base->bdev->blocklen;
part->internal.bdev.blockcnt = num_blocks;
@ -535,6 +590,8 @@ spdk_bdev_part_construct(struct spdk_bdev_part *part, struct spdk_bdev_part_base
return -1;
}
spdk_uuid_copy(&part->internal.bdev.uuid, &opts.uuid);
base->ref++;
part->internal.base = base;
@ -575,3 +632,12 @@ spdk_bdev_part_construct(struct spdk_bdev_part *part, struct spdk_bdev_part_base
return rc;
}
int
spdk_bdev_part_construct(struct spdk_bdev_part *part, struct spdk_bdev_part_base *base,
char *name, uint64_t offset_blocks, uint64_t num_blocks,
char *product_name)
{
return spdk_bdev_part_construct_ext(part, base, name, offset_blocks, num_blocks,
product_name, NULL);
}

View File

@ -152,7 +152,9 @@
spdk_bdev_part_free;
spdk_bdev_part_base_hotremove;
spdk_bdev_part_base_construct_ext;
spdk_bdev_part_construct_opts_init;
spdk_bdev_part_construct;
spdk_bdev_part_construct_ext;
spdk_bdev_part_submit_request;
spdk_bdev_part_submit_request_ext;
spdk_bdev_part_get_bdev;

View File

@ -154,6 +154,14 @@ blob_bs_translate_lba(struct spdk_bs_dev *dev, uint64_t lba, uint64_t *base_lba)
base_lba);
}
static bool
blob_bs_is_degraded(struct spdk_bs_dev *dev)
{
struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)dev;
return spdk_blob_is_degraded(b->blob);
}
struct spdk_bs_dev *
bs_create_blob_bs_dev(struct spdk_blob *blob)
{
@ -180,6 +188,7 @@ bs_create_blob_bs_dev(struct spdk_blob *blob)
b->bs_dev.unmap = blob_bs_dev_unmap;
b->bs_dev.is_zeroes = blob_bs_is_zeroes;
b->bs_dev.translate_lba = blob_bs_translate_lba;
b->bs_dev.is_degraded = blob_bs_is_degraded;
b->blob = blob;
return &b->bs_dev;

View File

@ -68,6 +68,7 @@ blob_is_esnap_clone(const struct spdk_blob *blob)
static int
blob_id_cmp(struct spdk_blob *blob1, struct spdk_blob *blob2)
{
assert(blob1 != NULL && blob2 != NULL);
return (blob1->id < blob2->id ? -1 : blob1->id > blob2->id);
}
@ -892,8 +893,8 @@ blob_parse(const struct spdk_blob_md_page *pages, uint32_t page_count,
* happen for example if a bogus blobid is passed in through open.
*/
if (blob->id != pages[0].id) {
SPDK_ERRLOG("Blobid (%" PRIu64 ") doesn't match what's in metadata (%" PRIu64 ")\n",
blob->id, pages[0].id);
SPDK_ERRLOG("Blobid (0x%" PRIx64 ") doesn't match what's in metadata "
"(0x%" PRIx64 ")\n", blob->id, pages[0].id);
return -ENOENT;
}
@ -1596,7 +1597,7 @@ blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
}
if (bserrno) {
SPDK_ERRLOG("Metadata page %d read failed for blobid %" PRIu64 ": %d\n",
SPDK_ERRLOG("Metadata page %d read failed for blobid 0x%" PRIx64 ": %d\n",
current_page, blob->id, bserrno);
blob_load_final(ctx, bserrno);
return;
@ -1605,7 +1606,7 @@ blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
page = &ctx->pages[ctx->num_pages - 1];
crc = blob_md_page_calc_crc(page);
if (crc != page->crc) {
SPDK_ERRLOG("Metadata page %d crc mismatch for blobid %" PRIu64 "\n",
SPDK_ERRLOG("Metadata page %d crc mismatch for blobid 0x%" PRIx64 "\n",
current_page, blob->id);
blob_load_final(ctx, -EINVAL);
return;
@ -5985,7 +5986,7 @@ bs_create_blob(struct spdk_blob_store *bs,
id = bs_page_to_blobid(page_idx);
SPDK_DEBUGLOG(blob, "Creating blob with id %" PRIu64 " at page %u\n", id, page_idx);
SPDK_DEBUGLOG(blob, "Creating blob with id 0x%" PRIx64 " at page %u\n", id, page_idx);
spdk_blob_opts_init(&opts_local, sizeof(opts_local));
if (opts) {
@ -6028,6 +6029,7 @@ bs_create_blob(struct spdk_blob_store *bs,
if (opts_local.esnap_id_len > UINT16_MAX) {
SPDK_ERRLOG("esnap id length %" PRIu64 " is too long\n",
opts_local.esnap_id_len);
rc = -EINVAL;
goto error;
}
@ -6487,8 +6489,8 @@ bs_snapshot_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno
ctx->original.blob = _blob;
if (_blob->data_ro || _blob->md_ro) {
SPDK_DEBUGLOG(blob, "Cannot create snapshot from read only blob with id %" PRIu64 "\n",
_blob->id);
SPDK_DEBUGLOG(blob, "Cannot create snapshot from read only blob with id 0x%"
PRIx64 "\n", _blob->id);
ctx->bserrno = -EINVAL;
spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
return;
@ -8476,7 +8478,8 @@ spdk_blob_is_clone(struct spdk_blob *blob)
{
assert(blob != NULL);
if (blob->parent_id != SPDK_BLOBID_INVALID) {
if (blob->parent_id != SPDK_BLOBID_INVALID &&
blob->parent_id != SPDK_BLOBID_EXTERNAL_SNAPSHOT) {
assert(spdk_blob_is_thin_provisioned(blob));
return true;
}
@ -9102,5 +9105,29 @@ spdk_blob_set_esnap_bs_dev(struct spdk_blob *blob, struct spdk_bs_dev *back_bs_d
blob_freeze_io(blob, blob_frozen_destroy_esnap_channels, ctx);
}
struct spdk_bs_dev *
spdk_blob_get_esnap_bs_dev(const struct spdk_blob *blob)
{
if (!blob_is_esnap_clone(blob)) {
SPDK_ERRLOG("blob 0x%" PRIx64 ": not an esnap clone\n", blob->id);
return NULL;
}
return blob->back_bs_dev;
}
bool
spdk_blob_is_degraded(const struct spdk_blob *blob)
{
if (blob->bs->dev->is_degraded != NULL && blob->bs->dev->is_degraded(blob->bs->dev)) {
return true;
}
if (blob->back_bs_dev == NULL || blob->back_bs_dev->is_degraded == NULL) {
return false;
}
return blob->back_bs_dev->is_degraded(blob->back_bs_dev);
}
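spdk_blob_is_degraded() above treats `is_degraded` as an optional vtable method: a NULL function pointer means the device can never be degraded, and the check cascades from the blobstore's own device to the optional backing device. A self-contained sketch of that pattern (hypothetical `struct dev`, not the real spdk_bs_dev):

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

/* Hypothetical device with an optional is_degraded() method;
 * NULL means "never degraded", mirroring struct spdk_bs_dev above. */
struct dev {
	bool (*is_degraded)(struct dev *dev);
	bool degraded;
};

static bool
dev_degraded(struct dev *d)
{
	return d->degraded;
}

/* Degraded if the primary device or the (optional) backing device says so. */
static bool
blob_degraded(struct dev *dev, struct dev *back_dev)
{
	if (dev->is_degraded != NULL && dev->is_degraded(dev)) {
		return true;
	}
	if (back_dev == NULL || back_dev->is_degraded == NULL) {
		return false;
	}
	return back_dev->is_degraded(back_dev);
}
```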
SPDK_LOG_REGISTER_COMPONENT(blob)
SPDK_LOG_REGISTER_COMPONENT(blob_esnap)

View File

@ -66,7 +66,9 @@
spdk_xattr_names_free;
spdk_bs_get_bstype;
spdk_bs_set_bstype;
spdk_blob_get_esnap_bs_dev;
spdk_blob_set_esnap_bs_dev;
spdk_blob_is_degraded;
local: *;
};

View File

@ -18,6 +18,8 @@
#include <rte_memzone.h>
#include <rte_version.h>
static __thread bool g_is_thread_unaffinitized;
static uint64_t
virt_to_phys(void *vaddr)
{
@ -353,6 +355,10 @@ spdk_unaffinitize_thread(void)
rte_cpuset_t new_cpuset;
long num_cores, i;
if (g_is_thread_unaffinitized) {
return;
}
CPU_ZERO(&new_cpuset);
num_cores = sysconf(_SC_NPROCESSORS_CONF);
@ -363,6 +369,7 @@ spdk_unaffinitize_thread(void)
}
rte_thread_set_affinity(&new_cpuset);
g_is_thread_unaffinitized = true;
}
void *
@ -375,13 +382,17 @@ spdk_call_unaffinitized(void *cb(void *arg), void *arg)
return NULL;
}
if (g_is_thread_unaffinitized) {
ret = cb(arg);
} else {
rte_thread_get_affinity(&orig_cpuset);
spdk_unaffinitize_thread();
ret = cb(arg);
rte_thread_set_affinity(&orig_cpuset);
g_is_thread_unaffinitized = false;
}
return ret;
}

View File

@ -33,7 +33,7 @@ endif
DPDK_INC := -I$(DPDK_INC_DIR)
DPDK_LIB_LIST = rte_eal rte_mempool rte_ring rte_mbuf rte_bus_pci rte_pci rte_mempool_ring
DPDK_LIB_LIST += rte_telemetry rte_kvargs
DPDK_LIB_LIST += rte_telemetry rte_kvargs rte_rcu
DPDK_POWER=n
@ -116,9 +116,6 @@ endif
ifeq ($(LINK_HASH),y)
DPDK_LIB_LIST += rte_hash
ifneq (, $(wildcard $(DPDK_LIB_DIR)/librte_rcu.*))
DPDK_LIB_LIST += rte_rcu
endif
endif
@ -126,7 +123,7 @@ DPDK_LIB_LIST_SORTED = $(sort $(DPDK_LIB_LIST))
DPDK_SHARED_LIB = $(DPDK_LIB_LIST_SORTED:%=$(DPDK_LIB_DIR)/lib%.so)
DPDK_STATIC_LIB = $(DPDK_LIB_LIST_SORTED:%=$(DPDK_LIB_DIR)/lib%.a)
DPDK_SHARED_LIB_LINKER_ARGS = $(call add_no_as_needed,$(DPDK_SHARED_LIB))
DPDK_SHARED_LIB_LINKER_ARGS = $(call add_no_as_needed,$(DPDK_SHARED_LIB)) -Wl,-rpath=$(DPDK_LIB_DIR)
DPDK_STATIC_LIB_LINKER_ARGS = $(call add_whole_archive,$(DPDK_STATIC_LIB))
ENV_CFLAGS = $(DPDK_INC) -DALLOW_EXPERIMENTAL_API

View File

@ -225,6 +225,22 @@ build_eal_cmdline(const struct spdk_env_opts *opts)
}
}
/* Either lcore_map or core_mask must be set. If both, or none specified, fail */
if ((opts->core_mask == NULL) == (opts->lcore_map == NULL)) {
if (opts->core_mask && opts->lcore_map) {
fprintf(stderr,
"Both lcore map and core mask are provided, but only one can be set\n");
} else {
fprintf(stderr, "Core mask or lcore map must be specified\n");
}
free_args(args, argcount);
return -1;
}
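The `(opts->core_mask == NULL) == (opts->lcore_map == NULL)` comparison above is a compact "exactly one of two" test: it is true (and thus an error) when both options are set or both are absent. The truth table, checked standalone:

```c
#include <assert.h>
#include <stddef.h>

/* Returns 1 (error) unless exactly one of the two pointers is set,
 * mirroring the core_mask/lcore_map validation above. */
static int
both_or_neither(const char *core_mask, const char *lcore_map)
{
	return (core_mask == NULL) == (lcore_map == NULL);
}
```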
if (opts->lcore_map) {
/* If lcore list is set, generate --lcores parameter */
args = push_arg(args, &argcount, _sprintf_alloc("--lcores=%s", opts->lcore_map));
} else if (opts->core_mask[0] == '-') {
/*
* Set the coremask:
*
@ -237,7 +253,6 @@ build_eal_cmdline(const struct spdk_env_opts *opts)
* - otherwise, it's a CPU mask of the form "0xff.." as expected by the
* -c option.
*/
if (opts->core_mask[0] == '-') {
args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->core_mask));
} else if (opts->core_mask[0] == '[') {
char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1);
@ -291,6 +306,13 @@ build_eal_cmdline(const struct spdk_env_opts *opts)
}
}
if (opts->env_context && strstr(opts->env_context, "--no-huge") != NULL) {
if (opts->hugepage_single_segments || opts->unlink_hugepage || opts->hugedir) {
fprintf(stderr, "--no-huge invalid with other hugepage options\n");
free_args(args, argcount);
return -1;
}
} else {
/* create just one hugetlbfs file */
if (opts->hugepage_single_segments) {
args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments"));
@ -320,6 +342,7 @@ build_eal_cmdline(const struct spdk_env_opts *opts)
return -1;
}
}
}
if (opts->num_pci_addr) {
size_t i;

View File

@ -233,6 +233,13 @@ pci_device_rte_dev_event(const char *device_name,
TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
struct rte_pci_device *rte_dev = dev->dev_handle;
/* Note: these ERRLOGs are useful for triaging issue #2983. */
if (dev->internal.pending_removal || dev->internal.removed) {
SPDK_ERRLOG("Received event for device SPDK already tried to remove\n");
SPDK_ERRLOG("pending_removal=%d removed=%d\n", dev->internal.pending_removal,
dev->internal.removed);
}
if (strcmp(dpdk_pci_device_get_name(rte_dev), device_name) == 0 &&
!dev->internal.pending_removal) {
can_detach = !dev->internal.attached;

View File

@ -32,15 +32,15 @@ dpdk_pci_init(void)
* Only DPDK in development has additional suffix past minor version.
*/
if (strlen(release) != 0) {
if (year == 23 && month == 3 && minor == 0) {
if (year == 23 && month == 7 && minor == 0) {
g_dpdk_fn_table = &fn_table_2211;
SPDK_NOTICELOG("DPDK version 23.03.0 not supported yet. Enabled only for validation.\n");
SPDK_NOTICELOG("DPDK version 23.07.0 not supported yet. Enabled only for validation.\n");
return 0;
}
}
/* Anything 23.x or higher is not supported. */
if (year > 22) {
/* Anything 24.x or higher is not supported. */
if (year > 23) {
SPDK_ERRLOG("DPDK version %d.%02d.%d not supported.\n", year, month, minor);
return -EINVAL;
}
@ -57,6 +57,14 @@ dpdk_pci_init(void)
return -EINVAL;
}
g_dpdk_fn_table = &fn_table_2211;
} else if (year == 23) {
/* Only 23.03.0 is supported */
if (month != 3 || minor != 0) {
SPDK_ERRLOG("DPDK version 23.%02d.%d is not supported.\n", month, minor);
return -EINVAL;
}
/* There were no changes between 22.11 and 23.03, so use the 22.11 implementation */
g_dpdk_fn_table = &fn_table_2211;
} else {
/* Everything else we use the 22.07 implementation. */
g_dpdk_fn_table = &fn_table_2207;

View File

@ -16,7 +16,7 @@ include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
LIBNAME := ocfenv
CFLAGS += $(ENV_CFLAGS) -I$(CURDIR) -I$(CURDIR)/include -w
CFLAGS += $(ENV_CFLAGS) -I$(CURDIR) -I$(CURDIR)/include -w -MMD
C_SRCS = $(shell find -name \*.c)
LIB = $(call spdk_lib_list_to_static_libs,$(LIBNAME))
@ -64,6 +64,8 @@ clean: ocf_distclean
$(LIB): $(OBJS)
$(LIB_C)
-include $(OBJS:.o=.d)
install:
uninstall:

View File

@ -62,7 +62,7 @@ env_allocator_create_extended(uint32_t size, const char *name, int limit, bool z
snprintf(qualified_name, OCF_ALLOCATOR_NAME_MAX, "ocf_env_%d:%s",
env_atomic_inc_return(&g_env_allocator_index), name);
allocator = calloc(1, sizeof(*allocator));
allocator = env_zalloc(sizeof(*allocator), ENV_MEM_NOIO);
if (!allocator) {
return NULL;
}
@ -101,7 +101,7 @@ env_allocator_destroy(env_allocator *allocator)
}
spdk_mempool_free(allocator->mempool);
free(allocator);
env_free(allocator);
}
}
/* *** CRC *** */

View File

@ -129,6 +129,8 @@ static const struct option g_cmdline_options[] = {
{"vfio-vf-token", required_argument, NULL, ENV_VF_TOKEN_OPT_IDX},
#define MSG_MEMPOOL_SIZE_OPT_IDX 270
{"msg-mempool-size", required_argument, NULL, MSG_MEMPOOL_SIZE_OPT_IDX},
#define LCORES_OPT_IDX 271
{"lcores", required_argument, NULL, LCORES_OPT_IDX},
};
static void
@ -203,7 +205,6 @@ spdk_app_opts_init(struct spdk_app_opts *opts, size_t opts_size)
SET_FIELD(mem_size, SPDK_APP_DPDK_DEFAULT_MEM_SIZE);
SET_FIELD(main_core, SPDK_APP_DPDK_DEFAULT_MAIN_CORE);
SET_FIELD(mem_channel, SPDK_APP_DPDK_DEFAULT_MEM_CHANNEL);
SET_FIELD(reactor_mask, SPDK_APP_DPDK_DEFAULT_CORE_MASK);
SET_FIELD(base_virtaddr, SPDK_APP_DPDK_DEFAULT_BASE_VIRTADDR);
SET_FIELD(print_level, SPDK_APP_DEFAULT_LOG_PRINT_LEVEL);
SET_FIELD(rpc_addr, SPDK_DEFAULT_RPC_ADDR);
@ -326,6 +327,7 @@ app_setup_env(struct spdk_app_opts *opts)
env_opts.name = opts->name;
env_opts.core_mask = opts->reactor_mask;
env_opts.lcore_map = opts->lcore_map;
env_opts.shm_id = opts->shm_id;
env_opts.mem_channel = opts->mem_channel;
env_opts.main_core = opts->main_core;
@ -497,6 +499,7 @@ app_copy_opts(struct spdk_app_opts *opts, struct spdk_app_opts *opts_user, size_
SET_FIELD(json_config_ignore_errors);
SET_FIELD(rpc_addr);
SET_FIELD(reactor_mask);
SET_FIELD(lcore_map);
SET_FIELD(tpoint_group_mask);
SET_FIELD(shm_id);
SET_FIELD(shutdown_cb);
@ -525,7 +528,7 @@ app_copy_opts(struct spdk_app_opts *opts, struct spdk_app_opts *opts_user, size_
/* You should not remove this statement, but need to update the assert statement
* if you add a new field, and also add a corresponding SET_FIELD statement */
SPDK_STATIC_ASSERT(sizeof(struct spdk_app_opts) == 216, "Incorrect size");
SPDK_STATIC_ASSERT(sizeof(struct spdk_app_opts) == 224, "Incorrect size");
#undef SET_FIELD
}
@ -669,6 +672,11 @@ spdk_app_start(struct spdk_app_opts *opts_user, spdk_msg_fn start_fn,
return 1;
}
if (!(opts->lcore_map || opts->reactor_mask)) {
/* Set default CPU mask */
opts->reactor_mask = SPDK_APP_DPDK_DEFAULT_CORE_MASK;
}
tty = ttyname(STDERR_FILENO);
if (opts->print_level > SPDK_LOG_WARN &&
isatty(STDERR_FILENO) &&
@ -860,6 +868,13 @@ usage(void (*app_usage)(void))
printf(" -h, --help show this usage\n");
printf(" -i, --shm-id <id> shared memory ID (optional)\n");
printf(" -m, --cpumask <mask or list> core mask (like 0xF) or core list of '[]' embraced (like [0,1,10]) for DPDK\n");
printf(" --lcores <list> lcore to CPU mapping list. The list is in the format:\n");
printf(" <lcores[@CPUs]>[<,lcores[@CPUs]>...]\n");
printf("                                 lcores and cpus list are grouped by '(' and ')', e.g. '--lcores \"(5-7)@(10-12)\"'\n");
printf(" Within the group, '-' is used for range separator,\n");
printf(" ',' is used for single number separator.\n");
printf(" '( )' can be omitted for single element group,\n");
printf(" '@' can be omitted if cpus and lcores have the same value\n");
printf(" -n, --mem-channels <num> channel number of memory channels used for DPDK\n");
printf(" -p, --main-core <id> main (primary) core for DPDK\n");
printf(" -r, --rpc-socket <path> RPC listen address (default %s)\n", SPDK_DEFAULT_RPC_ADDR);
@ -1001,8 +1016,19 @@ spdk_app_parse_args(int argc, char **argv, struct spdk_app_opts *opts,
}
break;
case CPUMASK_OPT_IDX:
if (opts->lcore_map) {
SPDK_ERRLOG("lcore map and core mask can't be set simultaneously\n");
goto out;
}
opts->reactor_mask = optarg;
break;
case LCORES_OPT_IDX:
if (opts->reactor_mask) {
SPDK_ERRLOG("lcore map and core mask can't be set simultaneously\n");
goto out;
}
opts->lcore_map = optarg;
break;
case DISABLE_CPUMASK_LOCKS_OPT_IDX:
g_disable_cpumask_locks = true;
break;

View File

@ -141,7 +141,7 @@ ftl_p2l_validate_ckpt(struct ftl_band *band)
}
#endif
int ftl_mngt_p2l_ckpt_get_seq_id(struct spdk_ftl_dev *dev, int md_region);
uint64_t ftl_mngt_p2l_ckpt_get_seq_id(struct spdk_ftl_dev *dev, int md_region);
int ftl_mngt_p2l_ckpt_restore(struct ftl_band *band, uint32_t md_region, uint64_t seq_id);

View File

@ -352,7 +352,7 @@ ftl_mngt_persist_bands_p2l(struct ftl_mngt_process *mngt)
ftl_mngt_persist_band_p2l(mngt, ctx);
}
int
uint64_t
ftl_mngt_p2l_ckpt_get_seq_id(struct spdk_ftl_dev *dev, int md_region)
{
struct ftl_layout *layout = &dev->layout;

View File

@ -233,9 +233,9 @@ void
ftl_recover_max_seq(struct spdk_ftl_dev *dev)
{
struct ftl_band *band;
size_t band_close_seq_id = 0, band_open_seq_id = 0;
size_t chunk_close_seq_id = 0, chunk_open_seq_id = 0;
size_t max = 0;
uint64_t band_close_seq_id = 0, band_open_seq_id = 0;
uint64_t chunk_close_seq_id = 0, chunk_open_seq_id = 0;
uint64_t max = 0;
TAILQ_FOREACH(band, &dev->shut_bands, queue_entry) {
band_open_seq_id = spdk_max(band_open_seq_id, band->md->seq);

View File

@ -91,42 +91,47 @@ subsystem_get_next_depend(struct spdk_subsystem_depend *cur_depend)
static void
subsystem_sort(void)
{
bool depends_on, depends_on_sorted;
bool has_dependency, all_dependencies_met;
struct spdk_subsystem *subsystem, *subsystem_tmp;
struct spdk_subsystem_depend *subsystem_dep;
struct spdk_subsystem_list sorted_list;
struct spdk_subsystem_list subsystems_list = TAILQ_HEAD_INITIALIZER(subsystems_list);
TAILQ_INIT(&sorted_list);
/* We will move subsystems from the original g_subsystems TAILQ to the temporary
* sorted_list one at a time. We can only move a subsystem if it either (a) has no
* dependencies, or (b) all of its dependencies have already been moved to the
* sorted_list.
*
* Once all of the subsystems have been moved to the temporary list, we will move
* the list as-is back to the original g_subsystems TAILQ - they will now be sorted
* in the order which they must be initialized.
*/
while (!TAILQ_EMPTY(&g_subsystems)) {
TAILQ_FOREACH_SAFE(subsystem, &g_subsystems, tailq, subsystem_tmp) {
depends_on = false;
has_dependency = false;
all_dependencies_met = true;
TAILQ_FOREACH(subsystem_dep, &g_subsystems_deps, tailq) {
if (strcmp(subsystem->name, subsystem_dep->name) == 0) {
depends_on = true;
depends_on_sorted = !!_subsystem_find(&subsystems_list, subsystem_dep->depends_on);
if (depends_on_sorted) {
continue;
}
has_dependency = true;
if (!_subsystem_find(&sorted_list, subsystem_dep->depends_on)) {
/* We found a dependency that isn't in the sorted_list yet.
* Clear the flag and break from the inner loop, we know
* we can't move this subsystem to the sorted_list yet.
*/
all_dependencies_met = false;
break;
}
}
if (depends_on == false) {
TAILQ_REMOVE(&g_subsystems, subsystem, tailq);
TAILQ_INSERT_TAIL(&subsystems_list, subsystem, tailq);
} else {
if (depends_on_sorted == true) {
TAILQ_REMOVE(&g_subsystems, subsystem, tailq);
TAILQ_INSERT_TAIL(&subsystems_list, subsystem, tailq);
}
if (!has_dependency || all_dependencies_met) {
TAILQ_REMOVE(&g_subsystems, subsystem, tailq);
TAILQ_INSERT_TAIL(&sorted_list, subsystem, tailq);
}
}
}
TAILQ_FOREACH_SAFE(subsystem, &subsystems_list, tailq, subsystem_tmp) {
TAILQ_REMOVE(&subsystems_list, subsystem, tailq);
TAILQ_INSERT_TAIL(&g_subsystems, subsystem, tailq);
}
TAILQ_SWAP(&sorted_list, &g_subsystems, spdk_subsystem, tailq);
}
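The rewritten subsystem_sort() above repeatedly moves any subsystem whose dependencies are already in the sorted list — a simple quadratic topological sort. A self-contained sketch of the same loop over a small hypothetical dependency table (it assumes the graph is acyclic, as the original does):

```c
#include <assert.h>
#include <stdbool.h>
#include <string.h>

#define N 4

/* names[i] depends on deps[i]; "" means no dependency (hypothetical table). */
static const char *names[N] = { "nvmf", "bdev", "accel", "iscsi" };
static const char *deps[N]  = { "bdev", "accel", "",      "bdev" };

/* Repeatedly move any entry whose dependency is already in `sorted`. */
static void
sort_by_deps(const char *sorted[])
{
	bool placed[N] = { false };
	int count = 0;

	while (count < N) {
		for (int i = 0; i < N; i++) {
			bool dep_met;
			int j;

			if (placed[i]) {
				continue;
			}
			dep_met = (deps[i][0] == '\0');
			for (j = 0; j < count && !dep_met; j++) {
				dep_met = (strcmp(sorted[j], deps[i]) == 0);
			}
			if (dep_met) {
				sorted[count++] = names[i];
				placed[i] = true;
			}
		}
	}
}
```

With the table above, "accel" (no dependency) is placed first, then "bdev", then its dependents.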
void

View File

@ -1077,6 +1077,11 @@ iscsi_conn_params_update(struct spdk_iscsi_conn *conn)
}
}
if (conn->sock == NULL) {
SPDK_INFOLOG(iscsi, "socket is already closed.\n");
return -ENXIO;
}
/* The socket receive buffer may need to be adjusted based on the new parameters */
/* Don't allow the recv buffer to be 0 or very large. */

View File

@ -9,6 +9,7 @@
#include "spdk/string.h"
#include "spdk/thread.h"
#include "spdk/blob_bdev.h"
#include "spdk/tree.h"
#include "spdk/util.h"
/* Default blob channel opts for lvol */
@ -18,6 +19,14 @@
SPDK_LOG_REGISTER_COMPONENT(lvol)
struct spdk_lvs_degraded_lvol_set {
struct spdk_lvol_store *lvol_store;
const void *esnap_id;
uint32_t id_len;
TAILQ_HEAD(degraded_lvols, spdk_lvol) lvols;
RB_ENTRY(spdk_lvs_degraded_lvol_set) node;
};
static TAILQ_HEAD(, spdk_lvol_store) g_lvol_stores = TAILQ_HEAD_INITIALIZER(g_lvol_stores);
static pthread_mutex_t g_lvol_stores_mutex = PTHREAD_MUTEX_INITIALIZER;
@ -25,6 +34,11 @@ static inline int lvs_opts_copy(const struct spdk_lvs_opts *src, struct spdk_lvs
static int lvs_esnap_bs_dev_create(void *bs_ctx, void *blob_ctx, struct spdk_blob *blob,
const void *esnap_id, uint32_t id_len,
struct spdk_bs_dev **_bs_dev);
static struct spdk_lvol *lvs_get_lvol_by_blob_id(struct spdk_lvol_store *lvs, spdk_blob_id blob_id);
static void lvs_degraded_lvol_set_add(struct spdk_lvs_degraded_lvol_set *degraded_set,
struct spdk_lvol *lvol);
static void lvs_degraded_lvol_set_remove(struct spdk_lvs_degraded_lvol_set *degraded_set,
struct spdk_lvol *lvol);
static int
add_lvs_to_list(struct spdk_lvol_store *lvs)
@ -63,6 +77,8 @@ lvs_alloc(void)
TAILQ_INIT(&lvs->retry_open_lvols);
lvs->load_esnaps = false;
RB_INIT(&lvs->degraded_lvol_sets_tree);
lvs->thread = spdk_get_thread();
return lvs;
}
@ -76,6 +92,8 @@ lvs_free(struct spdk_lvol_store *lvs)
}
pthread_mutex_unlock(&g_lvol_stores_mutex);
assert(RB_EMPTY(&lvs->degraded_lvol_sets_tree));
free(lvs);
}
@ -321,7 +339,7 @@ lvs_read_uuid(void *cb_arg, struct spdk_blob *blob, int lvolerrno)
rc = spdk_blob_get_xattr_value(blob, "uuid", (const void **)&attr, &value_len);
if (rc != 0 || value_len != SPDK_UUID_STRING_LEN || attr[SPDK_UUID_STRING_LEN - 1] != '\0') {
SPDK_INFOLOG(lvol, "missing or incorrect UUID\n");
SPDK_INFOLOG(lvol, "degraded_set or incorrect UUID\n");
req->lvserrno = -EINVAL;
spdk_blob_close(blob, close_super_blob_with_error_cb, req);
return;
@ -336,7 +354,7 @@ lvs_read_uuid(void *cb_arg, struct spdk_blob *blob, int lvolerrno)
rc = spdk_blob_get_xattr_value(blob, "name", (const void **)&attr, &value_len);
if (rc != 0 || value_len > SPDK_LVS_NAME_MAX) {
SPDK_INFOLOG(lvol, "missing or invalid name\n");
SPDK_INFOLOG(lvol, "degraded_set or invalid name\n");
req->lvserrno = -EINVAL;
spdk_blob_close(blob, close_super_blob_with_error_cb, req);
return;
@ -422,6 +440,7 @@ lvs_load(struct spdk_bs_dev *bs_dev, const struct spdk_lvs_opts *_lvs_opts,
if (lvs_opts_copy(_lvs_opts, &lvs_opts) != 0) {
SPDK_ERRLOG("Invalid options\n");
cb_fn(cb_arg, NULL, -EINVAL);
return;
}
}
@ -877,6 +896,7 @@ spdk_lvs_unload(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn,
}
TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) {
spdk_lvs_esnap_missing_remove(lvol);
TAILQ_REMOVE(&lvs->lvols, lvol, link);
lvol_free(lvol);
}
@ -1001,6 +1021,7 @@ lvol_delete_blob_cb(void *cb_arg, int lvolerrno)
{
struct spdk_lvol_req *req = cb_arg;
struct spdk_lvol *lvol = req->lvol;
struct spdk_lvol *clone_lvol = req->clone_lvol;
if (lvolerrno < 0) {
SPDK_ERRLOG("Could not remove blob on lvol gracefully - forced removal\n");
@ -1008,6 +1029,22 @@ lvol_delete_blob_cb(void *cb_arg, int lvolerrno)
SPDK_INFOLOG(lvol, "Lvol %s deleted\n", lvol->unique_id);
}
if (lvol->degraded_set != NULL) {
if (clone_lvol != NULL) {
/*
* A degraded esnap clone that has a blob clone has been deleted. clone_lvol
* becomes an esnap clone and needs to be associated with the
* spdk_lvs_degraded_lvol_set.
*/
struct spdk_lvs_degraded_lvol_set *degraded_set = lvol->degraded_set;
lvs_degraded_lvol_set_remove(degraded_set, lvol);
lvs_degraded_lvol_set_add(degraded_set, clone_lvol);
} else {
spdk_lvs_esnap_missing_remove(lvol);
}
}
TAILQ_REMOVE(&lvol->lvol_store->lvols, lvol, link);
lvol_free(lvol);
req->cb_fn(req->cb_arg, lvolerrno);
@ -1071,6 +1108,18 @@ lvol_create_cb(void *cb_arg, spdk_blob_id blobid, int lvolerrno)
opts.esnap_ctx = req->lvol;
bs = req->lvol->lvol_store->blobstore;
if (req->origlvol != NULL && req->origlvol->degraded_set != NULL) {
/*
* A snapshot was created from a degraded esnap clone. The new snapshot is now a
* degraded esnap clone. The previous clone is now a regular clone of a blob. Update
* the set of directly-related clones to the missing external snapshot.
*/
struct spdk_lvs_degraded_lvol_set *degraded_set = req->origlvol->degraded_set;
lvs_degraded_lvol_set_remove(degraded_set, req->origlvol);
lvs_degraded_lvol_set_add(degraded_set, req->lvol);
}
spdk_bs_open_blob_ext(bs, blobid, &opts, lvol_create_open_cb, req);
}
@ -1189,6 +1238,7 @@ spdk_lvol_create_esnap_clone(const void *esnap_id, uint32_t id_len, uint64_t siz
struct spdk_blob_store *bs;
struct spdk_lvol *lvol;
struct spdk_blob_opts opts;
uint64_t cluster_sz;
char *xattr_names[] = {LVOL_NAME, "uuid"};
int rc;
@ -1204,6 +1254,14 @@ spdk_lvol_create_esnap_clone(const void *esnap_id, uint32_t id_len, uint64_t siz
bs = lvs->blobstore;
cluster_sz = spdk_bs_get_cluster_size(bs);
if ((size_bytes % cluster_sz) != 0) {
SPDK_ERRLOG("Cannot create '%s/%s': size %" PRIu64 " is not an integer multiple of "
"cluster size %" PRIu64 "\n", lvs->name, clone_name, size_bytes,
cluster_sz);
return -EINVAL;
}
req = calloc(1, sizeof(*req));
if (!req) {
SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n");
@ -1224,7 +1282,7 @@ spdk_lvol_create_esnap_clone(const void *esnap_id, uint32_t id_len, uint64_t siz
opts.esnap_id = esnap_id;
opts.esnap_id_len = id_len;
opts.thin_provision = true;
opts.num_clusters = spdk_divide_round_up(size_bytes, spdk_bs_get_cluster_size(bs));
opts.num_clusters = spdk_divide_round_up(size_bytes, cluster_sz);
opts.clear_method = lvol->clear_method;
opts.xattrs.count = SPDK_COUNTOF(xattr_names);
opts.xattrs.names = xattr_names;
@ -1289,6 +1347,7 @@ spdk_lvol_create_snapshot(struct spdk_lvol *origlvol, const char *snapshot_name,
snapshot_xattrs.names = xattr_names;
snapshot_xattrs.get_value = lvol_get_xattr_value;
req->lvol = newlvol;
req->origlvol = origlvol;
req->cb_fn = cb_fn;
req->cb_arg = cb_arg;
@ -1495,6 +1554,10 @@ spdk_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_
{
struct spdk_lvol_req *req;
struct spdk_blob_store *bs;
struct spdk_lvol_store *lvs = lvol->lvol_store;
spdk_blob_id clone_id;
size_t count = 1;
int rc;
assert(cb_fn != NULL);
@ -1524,6 +1587,18 @@ spdk_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_
req->lvol = lvol;
bs = lvol->lvol_store->blobstore;
rc = spdk_blob_get_clones(lvs->blobstore, lvol->blob_id, &clone_id, &count);
if (rc == 0 && count == 1) {
req->clone_lvol = lvs_get_lvol_by_blob_id(lvs, clone_id);
} else if (rc == -ENOMEM) {
SPDK_INFOLOG(lvol, "lvol %s: cannot destroy: has %" PRIu64 " clones\n",
lvol->unique_id, count);
free(req);
assert(count > 1);
cb_fn(cb_arg, -EBUSY);
return;
}
spdk_bs_delete_blob(bs, lvol->blob_id, lvol_delete_blob_cb, req);
}
@ -1757,3 +1832,379 @@ lvs_esnap_bs_dev_create(void *bs_ctx, void *blob_ctx, struct spdk_blob *blob,
return lvs->esnap_bs_dev_create(lvs, lvol, blob, esnap_id, id_len, bs_dev);
}
/*
* The theory of missing external snapshots
*
* The lvs->esnap_bs_dev_create() callback may be unable to create an external snapshot bs_dev when
 * it is called. This can happen, for instance, when the device containing the lvolstore is
* examined prior to spdk_bdev_register() being called on a bdev that acts as an external snapshot.
* In such a case, the esnap_bs_dev_create() callback will call spdk_lvs_esnap_missing_add().
*
* Missing external snapshots are tracked in a per-lvolstore tree, lvs->degraded_lvol_sets_tree.
* Each tree node (struct spdk_lvs_degraded_lvol_set) contains a tailq of lvols that are missing
* that particular external snapshot.
*
* When a potential missing snapshot becomes available, spdk_lvs_notify_hotplug() may be called to
* notify this library that it is available. It will then iterate through the active lvolstores and
* search each lvs->degraded_lvol_sets_tree for a set of degraded lvols that are missing an external
* snapshot matching the id passed in the notification. The lvols in the tailq on each matching tree
* node are then asked to create an external snapshot bs_dev using the esnap_bs_dev_create()
* callback that the consumer registered with the lvolstore. If lvs->esnap_bs_dev_create() returns
* 0, the lvol is removed from the spdk_lvs_degraded_lvol_set's lvol tailq. When this tailq becomes
* empty, the degraded lvol set node for this missing external snapshot is removed.
*/
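The mechanism described above reduces to a per-key waiter list: lvols register interest in a missing esnap id, and a hotplug notification drains the matching list and discards it once empty. Below is a minimal self-contained sketch of that pattern using the same BSD queue macros; `missing_add()`, `notify_hotplug()`, and the structs are hypothetical stand-ins (not SPDK API) that mirror `spdk_lvs_esnap_missing_add()` and `spdk_lvs_notify_hotplug()`.

```c
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/queue.h>

/* Hypothetical stand-in for an lvol waiting on a missing external snapshot. */
struct waiter {
	const char *name;
	TAILQ_ENTRY(waiter) link;
};

/* Hypothetical stand-in for spdk_lvs_degraded_lvol_set: one node per missing id. */
struct missing_set {
	char id[32];                   /* key: the missing esnap id */
	TAILQ_HEAD(, waiter) waiters;  /* everything waiting for this id */
	LIST_ENTRY(missing_set) link;
};

static LIST_HEAD(, missing_set) g_missing = LIST_HEAD_INITIALIZER(g_missing);

/* Analogous to spdk_lvs_esnap_missing_add(): find-or-create the set, append. */
static int
missing_add(const char *id, struct waiter *w)
{
	struct missing_set *set;

	LIST_FOREACH(set, &g_missing, link) {
		if (strcmp(set->id, id) == 0) {
			break;
		}
	}
	if (set == NULL) {
		set = calloc(1, sizeof(*set));
		if (set == NULL) {
			return -1;
		}
		snprintf(set->id, sizeof(set->id), "%s", id);
		TAILQ_INIT(&set->waiters);
		LIST_INSERT_HEAD(&g_missing, set, link);
	}
	TAILQ_INSERT_TAIL(&set->waiters, w, link);
	return 0;
}

/* Analogous to spdk_lvs_notify_hotplug(): drain the waiters for this id and
 * free the now-empty set. Returns how many waiters were notified. */
static int
notify_hotplug(const char *id)
{
	struct missing_set *set;
	struct waiter *w;
	int n = 0;

	LIST_FOREACH(set, &g_missing, link) {
		if (strcmp(set->id, id) != 0) {
			continue;
		}
		while ((w = TAILQ_FIRST(&set->waiters)) != NULL) {
			TAILQ_REMOVE(&set->waiters, w, link);
			n++;  /* the real code would call esnap_bs_dev_create() here */
		}
		LIST_REMOVE(set, link);
		free(set);
		return n;
	}
	return 0;
}
```

The real implementation uses an RB tree keyed on (id_len, id) instead of a linear list, but the add/drain life cycle is the same.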
static int
lvs_esnap_name_cmp(struct spdk_lvs_degraded_lvol_set *m1, struct spdk_lvs_degraded_lvol_set *m2)
{
if (m1->id_len == m2->id_len) {
return memcmp(m1->esnap_id, m2->esnap_id, m1->id_len);
}
return (m1->id_len > m2->id_len) ? 1 : -1;
}
RB_GENERATE_STATIC(degraded_lvol_sets_tree, spdk_lvs_degraded_lvol_set, node, lvs_esnap_name_cmp)
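The comparator above orders keys by id length first and falls back to `memcmp()` only when the lengths match, so a lookup never reads past the end of the shorter id. A standalone illustration of that ordering (hypothetical `struct key`, same comparison logic):

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical key with the same shape as spdk_lvs_degraded_lvol_set's
 * (esnap_id, id_len) pair. */
struct key {
	const void *esnap_id;
	uint32_t id_len;
};

/* Same logic as lvs_esnap_name_cmp(): length first, then byte content. */
static int
key_cmp(const struct key *a, const struct key *b)
{
	if (a->id_len == b->id_len) {
		return memcmp(a->esnap_id, b->esnap_id, a->id_len);
	}
	return (a->id_len > b->id_len) ? 1 : -1;
}
```

Note the resulting order is not lexicographic ("abcd" sorts after "abd" because it is longer); that is fine for an RB tree, which only needs a consistent total order.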
static void
lvs_degraded_lvol_set_add(struct spdk_lvs_degraded_lvol_set *degraded_set, struct spdk_lvol *lvol)
{
assert(lvol->lvol_store->thread == spdk_get_thread());
lvol->degraded_set = degraded_set;
TAILQ_INSERT_TAIL(&degraded_set->lvols, lvol, degraded_link);
}
static void
lvs_degraded_lvol_set_remove(struct spdk_lvs_degraded_lvol_set *degraded_set,
struct spdk_lvol *lvol)
{
assert(lvol->lvol_store->thread == spdk_get_thread());
lvol->degraded_set = NULL;
TAILQ_REMOVE(&degraded_set->lvols, lvol, degraded_link);
/* degraded_set->lvols may be empty. Caller should check if not immediately adding a new
* lvol. */
}
/*
* Record in lvs->degraded_lvol_sets_tree that a bdev of the specified name is needed by the
* specified lvol.
*/
int
spdk_lvs_esnap_missing_add(struct spdk_lvol_store *lvs, struct spdk_lvol *lvol,
const void *esnap_id, uint32_t id_len)
{
struct spdk_lvs_degraded_lvol_set find, *degraded_set;
assert(lvs->thread == spdk_get_thread());
find.esnap_id = esnap_id;
find.id_len = id_len;
degraded_set = RB_FIND(degraded_lvol_sets_tree, &lvs->degraded_lvol_sets_tree, &find);
if (degraded_set == NULL) {
degraded_set = calloc(1, sizeof(*degraded_set));
if (degraded_set == NULL) {
SPDK_ERRLOG("lvol %s: cannot create degraded_set node: out of memory\n",
lvol->unique_id);
return -ENOMEM;
}
degraded_set->esnap_id = calloc(1, id_len);
if (degraded_set->esnap_id == NULL) {
free(degraded_set);
SPDK_ERRLOG("lvol %s: cannot create degraded_set node: out of memory\n",
lvol->unique_id);
return -ENOMEM;
}
memcpy((void *)degraded_set->esnap_id, esnap_id, id_len);
degraded_set->id_len = id_len;
degraded_set->lvol_store = lvs;
TAILQ_INIT(&degraded_set->lvols);
RB_INSERT(degraded_lvol_sets_tree, &lvs->degraded_lvol_sets_tree, degraded_set);
}
lvs_degraded_lvol_set_add(degraded_set, lvol);
return 0;
}
/*
 * Remove the record that the specified lvol is waiting for a missing external snapshot bdev.
*/
void
spdk_lvs_esnap_missing_remove(struct spdk_lvol *lvol)
{
struct spdk_lvol_store *lvs = lvol->lvol_store;
struct spdk_lvs_degraded_lvol_set *degraded_set = lvol->degraded_set;
assert(lvs->thread == spdk_get_thread());
if (degraded_set == NULL) {
return;
}
lvs_degraded_lvol_set_remove(degraded_set, lvol);
if (!TAILQ_EMPTY(&degraded_set->lvols)) {
return;
}
RB_REMOVE(degraded_lvol_sets_tree, &lvs->degraded_lvol_sets_tree, degraded_set);
free((char *)degraded_set->esnap_id);
free(degraded_set);
}
struct lvs_esnap_hotplug_req {
struct spdk_lvol *lvol;
spdk_lvol_op_with_handle_complete cb_fn;
void *cb_arg;
};
static void
lvs_esnap_hotplug_done(void *cb_arg, int bserrno)
{
struct lvs_esnap_hotplug_req *req = cb_arg;
struct spdk_lvol *lvol = req->lvol;
struct spdk_lvol_store *lvs = lvol->lvol_store;
if (bserrno != 0) {
SPDK_ERRLOG("lvol %s/%s: failed to hotplug blob_bdev due to error %d\n",
lvs->name, lvol->name, bserrno);
}
req->cb_fn(req->cb_arg, lvol, bserrno);
free(req);
}
static void
lvs_esnap_degraded_hotplug(struct spdk_lvs_degraded_lvol_set *degraded_set,
spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg)
{
struct spdk_lvol_store *lvs = degraded_set->lvol_store;
struct spdk_lvol *lvol, *tmp, *last_missing;
struct spdk_bs_dev *bs_dev;
const void *esnap_id = degraded_set->esnap_id;
uint32_t id_len = degraded_set->id_len;
struct lvs_esnap_hotplug_req *req;
int rc;
assert(lvs->thread == spdk_get_thread());
/*
 * When lvs->esnap_bs_dev_create() tries to load an external snapshot, it can encounter
 * errors that lead it to calling spdk_lvs_esnap_missing_add(). This function needs to be
 * sure that such modifications do not corrupt the degraded_set->lvols tailq or leave
 * references to memory that this function will free.
*
* While this function is running, no other thread can add items to degraded_set->lvols. If
* the list is mutated, it must have been done by this function or something in its call
* graph running on this thread.
*/
/* Remember the last lvol on the list. Iteration will stop once it has been processed. */
last_missing = TAILQ_LAST(&degraded_set->lvols, degraded_lvols);
TAILQ_FOREACH_SAFE(lvol, &degraded_set->lvols, degraded_link, tmp) {
req = calloc(1, sizeof(*req));
if (req == NULL) {
SPDK_ERRLOG("lvol %s: failed to create esnap bs_dev: out of memory\n",
lvol->unique_id);
cb_fn(cb_arg, lvol, -ENOMEM);
/* The next one likely won't succeed either, but keep going so that all the
* failed hotplugs are logged.
*/
goto next;
}
/*
* Remove the lvol from the tailq so that tailq corruption is avoided if
* lvs->esnap_bs_dev_create() calls spdk_lvs_esnap_missing_add(lvol).
*/
TAILQ_REMOVE(&degraded_set->lvols, lvol, degraded_link);
lvol->degraded_set = NULL;
bs_dev = NULL;
rc = lvs->esnap_bs_dev_create(lvs, lvol, lvol->blob, esnap_id, id_len, &bs_dev);
if (rc != 0) {
SPDK_ERRLOG("lvol %s: failed to create esnap bs_dev: error %d\n",
lvol->unique_id, rc);
lvol->degraded_set = degraded_set;
TAILQ_INSERT_TAIL(&degraded_set->lvols, lvol, degraded_link);
cb_fn(cb_arg, lvol, rc);
free(req);
goto next;
}
req->lvol = lvol;
req->cb_fn = cb_fn;
req->cb_arg = cb_arg;
spdk_blob_set_esnap_bs_dev(lvol->blob, bs_dev, lvs_esnap_hotplug_done, req);
next:
if (lvol == last_missing) {
/*
* Anything after last_missing was added due to some problem encountered
* while trying to create the esnap bs_dev.
*/
break;
}
}
if (TAILQ_EMPTY(&degraded_set->lvols)) {
RB_REMOVE(degraded_lvol_sets_tree, &lvs->degraded_lvol_sets_tree, degraded_set);
free((void *)degraded_set->esnap_id);
free(degraded_set);
}
}
/*
* Notify each lvstore created on this thread that is missing a bdev by the specified name or uuid
* that the bdev now exists.
*/
bool
spdk_lvs_notify_hotplug(const void *esnap_id, uint32_t id_len,
spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg)
{
struct spdk_lvs_degraded_lvol_set *found;
struct spdk_lvs_degraded_lvol_set find = { 0 };
struct spdk_lvol_store *lvs;
struct spdk_thread *thread = spdk_get_thread();
bool ret = false;
find.esnap_id = esnap_id;
find.id_len = id_len;
pthread_mutex_lock(&g_lvol_stores_mutex);
TAILQ_FOREACH(lvs, &g_lvol_stores, link) {
if (thread != lvs->thread) {
/*
* It is expected that this is called from vbdev_lvol's examine_config()
 * callback. The lvstore was likely loaded due to a creation happening as a
* result of an RPC call or opening of an existing lvstore via
* examine_disk() callback. RPC calls, examine_disk(), and examine_config()
* should all be happening only on the app thread. The "wrong thread"
* condition will only happen when an application is doing something weird.
*/
SPDK_NOTICELOG("Discarded examine for lvstore %s: wrong thread\n",
lvs->name);
continue;
}
found = RB_FIND(degraded_lvol_sets_tree, &lvs->degraded_lvol_sets_tree, &find);
if (found == NULL) {
continue;
}
ret = true;
lvs_esnap_degraded_hotplug(found, cb_fn, cb_arg);
}
pthread_mutex_unlock(&g_lvol_stores_mutex);
return ret;
}
int
spdk_lvol_iter_immediate_clones(struct spdk_lvol *lvol, spdk_lvol_iter_cb cb_fn, void *cb_arg)
{
struct spdk_lvol_store *lvs = lvol->lvol_store;
struct spdk_blob_store *bs = lvs->blobstore;
struct spdk_lvol *clone;
spdk_blob_id *ids;
size_t id_cnt = 0;
size_t i;
int rc;
rc = spdk_blob_get_clones(bs, lvol->blob_id, NULL, &id_cnt);
if (rc != -ENOMEM) {
/* -ENOMEM says id_cnt is valid, no other errors should be returned. */
assert(rc == 0);
return rc;
}
ids = calloc(id_cnt, sizeof(*ids));
if (ids == NULL) {
SPDK_ERRLOG("lvol %s: out of memory while iterating clones\n", lvol->unique_id);
return -ENOMEM;
}
rc = spdk_blob_get_clones(bs, lvol->blob_id, ids, &id_cnt);
if (rc != 0) {
SPDK_ERRLOG("lvol %s: unable to get clone blob IDs: %d\n", lvol->unique_id, rc);
free(ids);
return rc;
}
for (i = 0; i < id_cnt; i++) {
clone = lvs_get_lvol_by_blob_id(lvs, ids[i]);
if (clone == NULL) {
SPDK_NOTICELOG("lvol %s: unable to find clone lvol with blob id 0x%"
PRIx64 "\n", lvol->unique_id, ids[i]);
continue;
}
rc = cb_fn(cb_arg, clone);
if (rc != 0) {
SPDK_DEBUGLOG(lvol, "lvol %s: iteration stopped when lvol %s (blob 0x%"
PRIx64 ") returned %d\n", lvol->unique_id, clone->unique_id,
ids[i], rc);
break;
}
}
free(ids);
return rc;
}
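`spdk_blob_get_clones()` as used above follows a common two-call convention: the first call with a NULL buffer fails with -ENOMEM but reports the required element count, and the second call fills a right-sized buffer. A standalone sketch of an API with that shape (`get_ids()` and its backing data are hypothetical, not SPDK functions):

```c
#include <assert.h>
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical producer with a variable number of ids. */
static const uint64_t g_ids[] = { 10, 20, 30 };

/* Returns 0 and copies the ids when the buffer is big enough; otherwise
 * returns -ENOMEM with *count set to the required size, which is exactly
 * how the caller learns how much to allocate. */
static int
get_ids(uint64_t *ids, size_t *count)
{
	size_t needed = sizeof(g_ids) / sizeof(g_ids[0]);

	if (ids == NULL || *count < needed) {
		*count = needed;
		return -ENOMEM;
	}
	*count = needed;
	memcpy(ids, g_ids, needed * sizeof(*ids));
	return 0;
}
```

This is why `spdk_lvol_iter_immediate_clones()` treats -ENOMEM from the first call as the expected path and asserts that no other error occurs.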
struct spdk_lvol *
spdk_lvol_get_by_uuid(const struct spdk_uuid *uuid)
{
struct spdk_lvol_store *lvs;
struct spdk_lvol *lvol;
pthread_mutex_lock(&g_lvol_stores_mutex);
TAILQ_FOREACH(lvs, &g_lvol_stores, link) {
TAILQ_FOREACH(lvol, &lvs->lvols, link) {
if (spdk_uuid_compare(uuid, &lvol->uuid) == 0) {
pthread_mutex_unlock(&g_lvol_stores_mutex);
return lvol;
}
}
}
pthread_mutex_unlock(&g_lvol_stores_mutex);
return NULL;
}
struct spdk_lvol *
spdk_lvol_get_by_names(const char *lvs_name, const char *lvol_name)
{
struct spdk_lvol_store *lvs;
struct spdk_lvol *lvol;
pthread_mutex_lock(&g_lvol_stores_mutex);
TAILQ_FOREACH(lvs, &g_lvol_stores, link) {
if (strcmp(lvs_name, lvs->name) != 0) {
continue;
}
TAILQ_FOREACH(lvol, &lvs->lvols, link) {
if (strcmp(lvol_name, lvol->name) == 0) {
pthread_mutex_unlock(&g_lvol_stores_mutex);
return lvol;
}
}
}
pthread_mutex_unlock(&g_lvol_stores_mutex);
return NULL;
}
bool
spdk_lvol_is_degraded(const struct spdk_lvol *lvol)
{
struct spdk_blob *blob = lvol->blob;
if (blob == NULL) {
return true;
}
return spdk_blob_is_degraded(blob);
}


@ -22,10 +22,17 @@
spdk_lvol_inflate;
spdk_lvol_decouple_parent;
spdk_lvol_create_esnap_clone;
spdk_lvol_iter_immediate_clones;
spdk_lvol_get_by_uuid;
spdk_lvol_get_by_names;
spdk_lvol_is_degraded;
# internal functions
spdk_lvol_resize;
spdk_lvol_set_read_only;
spdk_lvs_esnap_missing_add;
spdk_lvs_esnap_missing_remove;
spdk_lvs_notify_hotplug;
local: *;
};


@ -1009,7 +1009,7 @@ void
spdk_nvme_trid_populate_transport(struct spdk_nvme_transport_id *trid,
enum spdk_nvme_transport_type trtype)
{
const char *trstring = "";
const char *trstring;
trid->trtype = trtype;
switch (trtype) {


@ -611,7 +611,7 @@ spdk_nvme_ctrlr_free_io_qpair(struct spdk_nvme_qpair *qpair)
* with that qpair, since the callbacks will also be foreign to this process.
*/
if (qpair->active_proc == nvme_ctrlr_get_current_process(ctrlr)) {
nvme_qpair_abort_all_queued_reqs(qpair, 0);
nvme_qpair_abort_all_queued_reqs(qpair);
}
nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
@ -1838,19 +1838,6 @@ spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr)
return rc;
}
SPDK_LOG_DEPRECATION_REGISTER(nvme_ctrlr_prepare_for_reset,
"spdk_nvme_ctrlr_prepare_for_reset() is deprecated",
"SPDK 22.01", 0);
void
spdk_nvme_ctrlr_prepare_for_reset(struct spdk_nvme_ctrlr *ctrlr)
{
SPDK_LOG_DEPRECATED(nvme_ctrlr_prepare_for_reset);
nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
ctrlr->prepare_for_reset = true;
nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
}
int
spdk_nvme_ctrlr_reset_subsystem(struct spdk_nvme_ctrlr *ctrlr)
{
@ -3819,7 +3806,7 @@ nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr)
* resubmitted while the controller is resetting and subsequent commands
* would get queued too.
*/
nvme_qpair_abort_queued_reqs(ctrlr->adminq, 0);
nvme_qpair_abort_queued_reqs(ctrlr->adminq);
break;
case NVME_QPAIR_DISCONNECTING:
assert(ctrlr->adminq->async == true);


@ -439,6 +439,8 @@ struct spdk_nvme_qpair {
uint8_t transport_failure_reason: 2;
uint8_t last_transport_failure_reason: 2;
uint8_t abort_dnr: 1;
enum spdk_nvme_transport_type trtype;
uint32_t num_outstanding_reqs;
@ -1200,9 +1202,9 @@ void nvme_qpair_deinit(struct spdk_nvme_qpair *qpair);
void nvme_qpair_complete_error_reqs(struct spdk_nvme_qpair *qpair);
int nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair,
struct nvme_request *req);
void nvme_qpair_abort_all_queued_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);
void nvme_qpair_abort_all_queued_reqs(struct spdk_nvme_qpair *qpair);
uint32_t nvme_qpair_abort_queued_reqs_with_cbarg(struct spdk_nvme_qpair *qpair, void *cmd_cb_arg);
void nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);
void nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair);
void nvme_qpair_resubmit_requests(struct spdk_nvme_qpair *qpair, uint32_t num_requests);
int nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr);
void nvme_ns_set_identify_data(struct spdk_nvme_ns *ns);
@ -1525,7 +1527,7 @@ void nvme_transport_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr,
void nvme_transport_ctrlr_disconnect_qpair_done(struct spdk_nvme_qpair *qpair);
int nvme_transport_ctrlr_get_memory_domains(const struct spdk_nvme_ctrlr *ctrlr,
struct spdk_memory_domain **domains, int array_size);
void nvme_transport_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);
void nvme_transport_qpair_abort_reqs(struct spdk_nvme_qpair *qpair);
int nvme_transport_qpair_reset(struct spdk_nvme_qpair *qpair);
int nvme_transport_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req);
int32_t nvme_transport_qpair_process_completions(struct spdk_nvme_qpair *qpair,


@ -530,7 +530,6 @@ _nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme
int rc;
/* Statistics may already be allocated in the case of controller reset */
if (!pqpair->stat) {
if (qpair->poll_group) {
struct nvme_pcie_poll_group *group = SPDK_CONTAINEROF(qpair->poll_group,
struct nvme_pcie_poll_group, group);
@ -538,6 +537,7 @@ _nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme
pqpair->stat = &group->stats;
pqpair->shared_stats = true;
} else {
if (pqpair->stat == NULL) {
pqpair->stat = calloc(1, sizeof(*pqpair->stat));
if (!pqpair->stat) {
SPDK_ERRLOG("Failed to allocate qpair statistics\n");
@ -547,7 +547,6 @@ _nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme
}
}
rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_create_cq_cb, qpair);
if (rc != 0) {


@ -140,6 +140,36 @@ spdk_nvme_poll_group_process_completions(struct spdk_nvme_poll_group *group,
return error_reason ? error_reason : num_completions;
}
int
spdk_nvme_poll_group_all_connected(struct spdk_nvme_poll_group *group)
{
struct spdk_nvme_transport_poll_group *tgroup;
struct spdk_nvme_qpair *qpair;
int rc = 0;
STAILQ_FOREACH(tgroup, &group->tgroups, link) {
if (!STAILQ_EMPTY(&tgroup->disconnected_qpairs)) {
/* Treat disconnected qpairs as highest priority for notification.
* This means we can just return immediately here.
*/
return -EIO;
}
STAILQ_FOREACH(qpair, &tgroup->connected_qpairs, poll_group_stailq) {
if (nvme_qpair_get_state(qpair) < NVME_QPAIR_CONNECTING) {
return -EIO;
} else if (nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING) {
rc = -EAGAIN;
/* Break so that we can check the remaining transport groups,
* in case any of them have a disconnected qpair.
*/
break;
}
}
}
return rc;
}
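The return-value precedence in `spdk_nvme_poll_group_all_connected()` (any disconnected qpair wins over any still-connecting one, which in turn wins over all-connected) can be expressed as a small aggregation over qpair states. A standalone model of just that logic, with hypothetical state values in place of the `NVME_QPAIR_*` enum:

```c
#include <assert.h>
#include <errno.h>
#include <stddef.h>

/* Hypothetical states mirroring the ordering the real check relies on. */
enum qp_state { QP_DISCONNECTED, QP_CONNECTING, QP_CONNECTED };

/* Mirrors the aggregation: -EIO if anything is disconnected, else -EAGAIN
 * if anything is still connecting, else 0. */
static int
all_connected(const enum qp_state *states, size_t n)
{
	int rc = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		if (states[i] == QP_DISCONNECTED) {
			return -EIO;   /* highest priority: report immediately */
		}
		if (states[i] == QP_CONNECTING) {
			rc = -EAGAIN;  /* keep scanning for disconnected qpairs */
		}
	}
	return rc;
}
```

A caller typically spins on -EAGAIN (connections still in progress) and treats -EIO as a hard failure requiring reconnect handling.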
void *
spdk_nvme_poll_group_get_ctx(struct spdk_nvme_poll_group *group)
{

View File

@ -565,7 +565,7 @@ nvme_qpair_manual_complete_request(struct spdk_nvme_qpair *qpair,
}
void
nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair)
{
struct nvme_request *req;
STAILQ_HEAD(, nvme_request) tmp;
@ -580,7 +580,7 @@ nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
SPDK_ERRLOG("aborting queued i/o\n");
}
nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC,
SPDK_NVME_SC_ABORTED_SQ_DELETION, dnr, true);
SPDK_NVME_SC_ABORTED_SQ_DELETION, qpair->abort_dnr, true);
}
}
@ -654,8 +654,8 @@ nvme_qpair_check_enabled(struct spdk_nvme_qpair *qpair)
*/
if (qpair->ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE &&
!qpair->is_new_qpair) {
nvme_qpair_abort_all_queued_reqs(qpair, 0);
nvme_transport_qpair_abort_reqs(qpair, 0);
nvme_qpair_abort_all_queued_reqs(qpair);
nvme_transport_qpair_abort_reqs(qpair);
}
nvme_qpair_set_state(qpair, NVME_QPAIR_ENABLED);
@ -760,8 +760,8 @@ spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_
nvme_qpair_get_state(qpair) != NVME_QPAIR_DISCONNECTING)) {
if (qpair->ctrlr->is_removed) {
nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING);
nvme_qpair_abort_all_queued_reqs(qpair, 0);
nvme_transport_qpair_abort_reqs(qpair, 0);
nvme_qpair_abort_all_queued_reqs(qpair);
nvme_transport_qpair_abort_reqs(qpair);
}
return -ENXIO;
}
@ -783,7 +783,7 @@ spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_
STAILQ_REMOVE(&qpair->err_req_head, req, nvme_request, stailq);
nvme_qpair_manual_complete_request(qpair, req,
req->cpl.status.sct,
req->cpl.status.sc, 0, true);
req->cpl.status.sc, qpair->abort_dnr, true);
}
}
}
@ -830,6 +830,12 @@ spdk_nvme_qpair_get_failure_reason(struct spdk_nvme_qpair *qpair)
return qpair->transport_failure_reason;
}
void
spdk_nvme_qpair_set_abort_dnr(struct spdk_nvme_qpair *qpair, bool dnr)
{
qpair->abort_dnr = dnr ? 1 : 0;
}
int
nvme_qpair_init(struct spdk_nvme_qpair *qpair, uint16_t id,
struct spdk_nvme_ctrlr *ctrlr,
@ -896,7 +902,7 @@ nvme_qpair_complete_error_reqs(struct spdk_nvme_qpair *qpair)
STAILQ_REMOVE_HEAD(&qpair->err_req_head, stailq);
nvme_qpair_manual_complete_request(qpair, req,
req->cpl.status.sct,
req->cpl.status.sc, 0, true);
req->cpl.status.sc, qpair->abort_dnr, true);
}
}
@ -905,7 +911,7 @@ nvme_qpair_deinit(struct spdk_nvme_qpair *qpair)
{
struct nvme_error_cmd *cmd, *entry;
nvme_qpair_abort_queued_reqs(qpair, 0);
nvme_qpair_abort_queued_reqs(qpair);
_nvme_qpair_complete_abort_queued_reqs(qpair);
nvme_qpair_complete_error_reqs(qpair);
@ -1111,10 +1117,10 @@ nvme_qpair_resubmit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *
}
void
nvme_qpair_abort_all_queued_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
nvme_qpair_abort_all_queued_reqs(struct spdk_nvme_qpair *qpair)
{
nvme_qpair_complete_error_reqs(qpair);
nvme_qpair_abort_queued_reqs(qpair, dnr);
nvme_qpair_abort_queued_reqs(qpair);
_nvme_qpair_complete_abort_queued_reqs(qpair);
if (nvme_qpair_is_admin_queue(qpair)) {
nvme_ctrlr_abort_queued_aborts(qpair->ctrlr);


@ -25,7 +25,12 @@
#include "spdk_internal/trace_defs.h"
#define NVME_TCP_RW_BUFFER_SIZE 131072
#define NVME_TCP_TIME_OUT_IN_SECONDS 2
/* For async connect workloads, allow more time since we are more likely
 * to be processing lots of ICREQs at once.
*/
#define ICREQ_TIMEOUT_SYNC 2 /* in seconds */
#define ICREQ_TIMEOUT_ASYNC 10 /* in seconds */
#define NVME_TCP_HPDA_DEFAULT 0
#define NVME_TCP_MAX_R2T_DEFAULT 1
@ -814,6 +819,11 @@ nvme_tcp_qpair_set_recv_state(struct nvme_tcp_qpair *tqpair,
tqpair, state);
return;
}
if (state == NVME_TCP_PDU_RECV_STATE_ERROR) {
assert(TAILQ_EMPTY(&tqpair->outstanding_reqs));
}
tqpair->recv_state = state;
}
@ -856,7 +866,7 @@ nvme_tcp_qpair_send_h2c_term_req(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_
/* Contains the header len of the wrongly received pdu */
h2c_term_req->common.plen = h2c_term_req->common.hlen + copy_len;
nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
nvme_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvme_tcp_qpair_send_h2c_term_req_complete, tqpair);
}
@ -1037,7 +1047,7 @@ nvme_tcp_c2h_term_req_payload_handle(struct nvme_tcp_qpair *tqpair,
struct nvme_tcp_pdu *pdu)
{
nvme_tcp_c2h_term_req_dump(&pdu->hdr.term_req);
nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
}
static void
@ -1630,7 +1640,7 @@ nvme_tcp_read_pdu(struct nvme_tcp_qpair *tqpair, uint32_t *reaped, uint32_t max_
sizeof(struct spdk_nvme_tcp_common_pdu_hdr) - pdu->ch_valid_bytes,
(uint8_t *)&pdu->hdr.common + pdu->ch_valid_bytes);
if (rc < 0) {
nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
break;
}
pdu->ch_valid_bytes += rc;
@ -1648,7 +1658,7 @@ nvme_tcp_read_pdu(struct nvme_tcp_qpair *tqpair, uint32_t *reaped, uint32_t max_
pdu->psh_len - pdu->psh_valid_bytes,
(uint8_t *)&pdu->hdr.raw + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) + pdu->psh_valid_bytes);
if (rc < 0) {
nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
break;
}
@ -1676,7 +1686,7 @@ nvme_tcp_read_pdu(struct nvme_tcp_qpair *tqpair, uint32_t *reaped, uint32_t max_
rc = nvme_tcp_read_payload_data(tqpair->sock, pdu);
if (rc < 0) {
nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
break;
}
@ -1689,6 +1699,11 @@ nvme_tcp_read_pdu(struct nvme_tcp_qpair *tqpair, uint32_t *reaped, uint32_t max_
/* All of this PDU has now been read from the socket. */
nvme_tcp_pdu_payload_handle(tqpair, reaped);
break;
case NVME_TCP_PDU_RECV_STATE_QUIESCING:
if (TAILQ_EMPTY(&tqpair->outstanding_reqs)) {
nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
}
break;
case NVME_TCP_PDU_RECV_STATE_ERROR:
memset(pdu, 0, sizeof(struct nvme_tcp_pdu));
return NVME_TCP_PDU_FATAL;
@ -1837,6 +1852,7 @@ nvme_tcp_qpair_icreq_send(struct nvme_tcp_qpair *tqpair)
{
struct spdk_nvme_tcp_ic_req *ic_req;
struct nvme_tcp_pdu *pdu;
uint32_t timeout_in_sec;
pdu = tqpair->send_pdu;
memset(tqpair->send_pdu, 0, sizeof(*tqpair->send_pdu));
@ -1853,7 +1869,8 @@ nvme_tcp_qpair_icreq_send(struct nvme_tcp_qpair *tqpair)
nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_send_icreq_complete, tqpair);
tqpair->icreq_timeout_tsc = spdk_get_ticks() + (NVME_TCP_TIME_OUT_IN_SECONDS * spdk_get_ticks_hz());
timeout_in_sec = tqpair->qpair.async ? ICREQ_TIMEOUT_ASYNC : ICREQ_TIMEOUT_SYNC;
tqpair->icreq_timeout_tsc = spdk_get_ticks() + (timeout_in_sec * spdk_get_ticks_hz());
return 0;
}
@ -2031,12 +2048,18 @@ nvme_tcp_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpa
tqpair->stats = &tgroup->stats;
tqpair->shared_stats = true;
} else {
/* When resetting a controller, we disconnect the adminq and then reconnect. The stats
* are not freed when disconnecting. So when reconnecting, don't allocate memory
* again.
*/
if (tqpair->stats == NULL) {
tqpair->stats = calloc(1, sizeof(*tqpair->stats));
if (!tqpair->stats) {
SPDK_ERRLOG("tcp stats memory allocation failed\n");
return -ENOMEM;
}
}
}
tqpair->maxr2t = NVME_TCP_MAX_R2T_DEFAULT;
/* Explicitly set the state and recv_state of tqpair */


@ -43,7 +43,7 @@ nvme_get_next_transport(const struct spdk_nvme_transport *transport)
/*
* Unfortunately, due to NVMe PCIe multiprocess support, we cannot store the
* transport object in either the controller struct or the admin qpair. THis means
* transport object in either the controller struct or the admin qpair. This means
* that a lot of admin related transport calls will have to call nvme_get_transport
* in order to know which functions to call.
* In the I/O path, we have the ability to store the transport struct in the I/O
@ -539,7 +539,7 @@ nvme_transport_ctrlr_disconnect_qpair_done(struct spdk_nvme_qpair *qpair)
{
if (qpair->active_proc == nvme_ctrlr_get_current_process(qpair->ctrlr) ||
nvme_qpair_is_admin_queue(qpair)) {
nvme_qpair_abort_all_queued_reqs(qpair, 0);
nvme_qpair_abort_all_queued_reqs(qpair);
}
nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED);
}
@ -559,17 +559,16 @@ nvme_transport_ctrlr_get_memory_domains(const struct spdk_nvme_ctrlr *ctrlr,
}
void
nvme_transport_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
nvme_transport_qpair_abort_reqs(struct spdk_nvme_qpair *qpair)
{
const struct spdk_nvme_transport *transport;
assert(dnr <= 1);
if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) {
qpair->transport->ops.qpair_abort_reqs(qpair, dnr);
qpair->transport->ops.qpair_abort_reqs(qpair, qpair->abort_dnr);
} else {
transport = nvme_get_transport(qpair->ctrlr->trid.trstring);
assert(transport != NULL);
transport->ops.qpair_abort_reqs(qpair, dnr);
transport->ops.qpair_abort_reqs(qpair, qpair->abort_dnr);
}
}
@ -859,6 +858,11 @@ spdk_nvme_ctrlr_get_registers(struct spdk_nvme_ctrlr *ctrlr)
{
const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring);
if (transport == NULL) {
/* Transport does not exist. */
return NULL;
}
if (transport->ops.ctrlr_get_registers) {
return transport->ops.ctrlr_get_registers(ctrlr);
}


@ -39,7 +39,6 @@
spdk_nvme_ctrlr_set_trid;
spdk_nvme_ctrlr_reset_subsystem;
spdk_nvme_ctrlr_reset;
spdk_nvme_ctrlr_prepare_for_reset;
spdk_nvme_ctrlr_reset_async;
spdk_nvme_ctrlr_reset_poll_async;
spdk_nvme_ctrlr_disconnect;
@ -124,6 +123,7 @@
spdk_nvme_poll_group_remove;
spdk_nvme_poll_group_destroy;
spdk_nvme_poll_group_process_completions;
spdk_nvme_poll_group_all_connected;
spdk_nvme_poll_group_get_ctx;
spdk_nvme_ns_get_data;
@ -185,6 +185,7 @@
spdk_nvme_qpair_print_completion;
spdk_nvme_qpair_get_id;
spdk_nvme_qpair_get_num_outstanding_reqs;
spdk_nvme_qpair_set_abort_dnr;
spdk_nvme_print_command;
spdk_nvme_print_completion;


@ -1,7 +1,7 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (C) 2017 Intel Corporation. All rights reserved.
* Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved.
* Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021, 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*/
#include "spdk/stdinc.h"
@ -31,6 +31,8 @@
#define NVMF_CTRLR_RESET_SHN_TIMEOUT_IN_MS (NVMF_CC_RESET_SHN_TIMEOUT_IN_MS + 5000)
#define DUPLICATE_QID_RETRY_US 100
/*
* Report the SPDK version as the firmware revision.
* SPDK_VERSION_STRING won't fit into FR (only 8 bytes), so try to fit the most important parts.
@ -129,11 +131,15 @@ _nvmf_ctrlr_disconnect_qpairs_on_pg(struct spdk_io_channel_iter *i, bool include
if (qpair->ctrlr == ctrlr && (include_admin || !nvmf_qpair_is_admin_queue(qpair))) {
rc = spdk_nvmf_qpair_disconnect(qpair, NULL, NULL);
if (rc) {
if (rc == -EINPROGRESS) {
rc = 0;
} else {
SPDK_ERRLOG("Qpair disconnect failed\n");
return rc;
}
}
}
}
return rc;
}
@ -209,6 +215,8 @@ nvmf_ctrlr_start_keep_alive_timer(struct spdk_nvmf_ctrlr *ctrlr)
}
}
static int _retry_qid_check(void *ctx);
static void
ctrlr_add_qpair_and_send_rsp(struct spdk_nvmf_qpair *qpair,
struct spdk_nvmf_ctrlr *ctrlr,
@ -219,10 +227,22 @@ ctrlr_add_qpair_and_send_rsp(struct spdk_nvmf_qpair *qpair,
assert(ctrlr->admin_qpair->group->thread == spdk_get_thread());
if (spdk_bit_array_get(ctrlr->qpair_mask, qpair->qid)) {
if (qpair->connect_req != NULL) {
SPDK_ERRLOG("Got I/O connect with duplicate QID %u\n", qpair->qid);
rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
rsp->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
qpair->connect_req = NULL;
qpair->ctrlr = NULL;
spdk_nvmf_request_complete(req);
} else {
SPDK_WARNLOG("Duplicate QID detected, re-check in %dus\n",
DUPLICATE_QID_RETRY_US);
qpair->connect_req = req;
/* Set qpair->ctrlr here so that we'll have it when the poller expires. */
qpair->ctrlr = ctrlr;
req->poller = SPDK_POLLER_REGISTER(_retry_qid_check, qpair,
DUPLICATE_QID_RETRY_US);
}
return;
}
@ -235,10 +255,22 @@ ctrlr_add_qpair_and_send_rsp(struct spdk_nvmf_qpair *qpair,
rsp->status_code_specific.success.cntlid);
spdk_nvmf_request_complete(req);
SPDK_DTRACE_PROBE4(nvmf_ctrlr_add_qpair, qpair, qpair->qid, ctrlr->subsys->subnqn,
SPDK_DTRACE_PROBE4_TICKS(nvmf_ctrlr_add_qpair, qpair, qpair->qid, ctrlr->subsys->subnqn,
ctrlr->hostnqn);
}
static int
_retry_qid_check(void *ctx)
{
struct spdk_nvmf_qpair *qpair = ctx;
struct spdk_nvmf_request *req = qpair->connect_req;
struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
spdk_poller_unregister(&req->poller);
ctrlr_add_qpair_and_send_rsp(qpair, ctrlr, req);
return SPDK_POLLER_BUSY;
}
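The duplicate-QID handling above retries the connect later (via a poller firing every DUPLICATE_QID_RETRY_US) rather than failing immediately, because the previous qpair with that QID may simply not have finished disconnecting. A standalone model of that retry decision, with a plain bit mask in place of `ctrlr->qpair_mask` and a bounded retry budget (the real code retries via the poller without such a bound; all names here are hypothetical):

```c
#include <assert.h>
#include <stdint.h>

/* Hypothetical stand-in for the controller's qpair_mask bit array. */
struct qid_mask {
	uint64_t bits;
};

/* Try to claim a qid; if it is still marked busy, invoke tick() (a stand-in
 * for waiting one DUPLICATE_QID_RETRY_US poller period) and re-check, up to
 * max_retries times. Returns 0 on success, -1 if the qid stays busy. */
static int
try_connect_qid(struct qid_mask *mask, unsigned qid, unsigned max_retries,
		void (*tick)(struct qid_mask *))
{
	unsigned attempt;

	for (attempt = 0; attempt <= max_retries; attempt++) {
		if ((mask->bits & (UINT64_C(1) << qid)) == 0) {
			mask->bits |= UINT64_C(1) << qid;  /* claim the qid */
			return 0;
		}
		tick(mask);  /* give the old qpair time to release the qid */
	}
	return -1;  /* still busy: reject with "invalid queue identifier" */
}

/* Example tick: the old qpair finishes disconnecting and releases qid 3. */
static void
release_qid3(struct qid_mask *mask)
{
	mask->bits &= ~(UINT64_C(1) << 3);
}

/* Example tick: the qid is never released. */
static void
tick_noop(struct qid_mask *mask)
{
	(void)mask;
}
```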
static void
_nvmf_ctrlr_add_admin_qpair(void *ctx)
{
@ -342,7 +374,7 @@ nvmf_ctrlr_create(struct spdk_nvmf_subsystem *subsystem,
ctrlr->cntlid = connect_data->cntlid;
}
SPDK_DTRACE_PROBE3(nvmf_ctrlr_create, ctrlr, subsystem->subnqn,
SPDK_DTRACE_PROBE3_TICKS(nvmf_ctrlr_create, ctrlr, subsystem->subnqn,
spdk_thread_get_id(req->qpair->group->thread));
STAILQ_INIT(&ctrlr->async_events);
@ -490,7 +522,7 @@ _nvmf_ctrlr_destruct(void *ctx)
struct spdk_nvmf_reservation_log *log, *log_tmp;
struct spdk_nvmf_async_event_completion *event, *event_tmp;
SPDK_DTRACE_PROBE3(nvmf_ctrlr_destruct, ctrlr, ctrlr->subsys->subnqn,
SPDK_DTRACE_PROBE3_TICKS(nvmf_ctrlr_destruct, ctrlr, ctrlr->subsys->subnqn,
spdk_thread_get_id(ctrlr->thread));
assert(spdk_get_thread() == ctrlr->thread);
@ -535,7 +567,7 @@ nvmf_ctrlr_add_io_qpair(void *ctx)
struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr;
struct spdk_nvmf_qpair *admin_qpair = ctrlr->admin_qpair;
SPDK_DTRACE_PROBE4(nvmf_ctrlr_add_io_qpair, ctrlr, req->qpair, req->qpair->qid,
SPDK_DTRACE_PROBE4_TICKS(nvmf_ctrlr_add_io_qpair, ctrlr, req->qpair, req->qpair->qid,
spdk_thread_get_id(ctrlr->thread));
/* Unit test will check qpair->ctrlr after calling spdk_nvmf_ctrlr_connect.
@ -973,7 +1005,7 @@ nvmf_ctrlr_association_remove(void *ctx)
if (ctrlr->admin_qpair) {
rc = spdk_nvmf_qpair_disconnect(ctrlr->admin_qpair, NULL, NULL);
if (rc < 0) {
if (rc < 0 && rc != -EINPROGRESS) {
SPDK_ERRLOG("Fail to disconnect admin ctrlr qpair\n");
assert(false);
}
@ -4186,7 +4218,7 @@ nvmf_ctrlr_process_io_cmd(struct spdk_nvmf_request *req)
}
if (spdk_likely(ctrlr->listener != NULL)) {
SPDK_DTRACE_PROBE3(nvmf_request_io_exec_path, req,
SPDK_DTRACE_PROBE3_TICKS(nvmf_request_io_exec_path, req,
ctrlr->listener->trid->traddr,
ctrlr->listener->trid->trsvcid);
}


@ -74,7 +74,6 @@ nvmf_generate_discovery_log(struct spdk_nvmf_tgt *tgt, const char *hostnqn, size
struct spdk_nvmf_discovery_log_page_entry *entry;
struct spdk_nvmf_discovery_log_page *disc_log;
size_t cur_size;
uint32_t sid;
SPDK_DEBUGLOG(nvmf, "Generating log page for genctr %" PRIu64 "\n",
tgt->discovery_genctr);
@ -86,10 +85,10 @@ nvmf_generate_discovery_log(struct spdk_nvmf_tgt *tgt, const char *hostnqn, size
return NULL;
}
for (sid = 0; sid < tgt->max_subsystems; sid++) {
subsystem = tgt->subsystems[sid];
if ((subsystem == NULL) ||
(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE) ||
for (subsystem = spdk_nvmf_subsystem_get_first(tgt);
subsystem != NULL;
subsystem = spdk_nvmf_subsystem_get_next(subsystem)) {
if ((subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE) ||
(subsystem->state == SPDK_NVMF_SUBSYSTEM_DEACTIVATING)) {
continue;
}


@ -3663,7 +3663,6 @@ out:
SPDK_DEBUGLOG(nvmf_fc_adm_api, "%s", log_str);
}
free(args);
free(arg);
}


@ -2,6 +2,7 @@
* Copyright (C) 2020 Intel Corporation.
* Copyright (c) 2018-2019 Broadcom. All Rights Reserved.
* The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries.
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*/
#include "spdk/env.h"
@ -1482,8 +1483,8 @@ nvmf_fc_poller_conn_abort_done(void *hwqp, int32_t status, void *cb_args)
if (!conn_args->backend_initiated && (fc_conn->qpair.state != SPDK_NVMF_QPAIR_DEACTIVATING)) {
/* disconnect qpair from nvmf controller */
spdk_nvmf_qpair_disconnect(&fc_conn->qpair,
nvmf_fc_disconnect_qpair_cb, &conn_args->cb_info);
spdk_nvmf_qpair_disconnect(&fc_conn->qpair, NULL, NULL);
nvmf_fc_disconnect_qpair_cb(&conn_args->cb_info);
} else {
nvmf_fc_poller_api_perform_cb(&conn_args->cb_info, SPDK_NVMF_FC_POLLER_API_SUCCESS);
}
@ -1542,8 +1543,8 @@ nvmf_fc_poller_api_del_connection(void *arg)
if (!conn_args->backend_initiated && (fc_conn->qpair.state != SPDK_NVMF_QPAIR_DEACTIVATING)) {
/* disconnect qpair from nvmf controller */
spdk_nvmf_qpair_disconnect(&fc_conn->qpair, nvmf_fc_disconnect_qpair_cb,
&conn_args->cb_info);
spdk_nvmf_qpair_disconnect(&fc_conn->qpair, NULL, NULL);
nvmf_fc_disconnect_qpair_cb(&conn_args->cb_info);
} else {
nvmf_fc_poller_api_perform_cb(&conn_args->cb_info, SPDK_NVMF_FC_POLLER_API_SUCCESS);
}


@ -1,7 +1,7 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (C) 2016 Intel Corporation. All rights reserved.
* Copyright (c) 2018-2019, 2021 Mellanox Technologies LTD. All rights reserved.
* Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021, 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*/
#include "spdk/stdinc.h"
@ -46,7 +46,6 @@ struct nvmf_qpair_disconnect_many_ctx {
struct spdk_nvmf_poll_group *group;
spdk_nvmf_poll_group_mod_done cpl_fn;
void *cpl_ctx;
uint32_t count;
};
static void
@ -127,7 +126,7 @@ nvmf_tgt_destroy_poll_group(void *io_device, void *ctx_buf)
struct spdk_nvmf_tgt *tgt = io_device;
struct spdk_nvmf_poll_group *group = ctx_buf;
SPDK_DTRACE_PROBE1(nvmf_destroy_poll_group, spdk_thread_get_id(group->thread));
SPDK_DTRACE_PROBE1_TICKS(nvmf_destroy_poll_group, spdk_thread_get_id(group->thread));
pthread_mutex_lock(&tgt->mutex);
TAILQ_REMOVE(&tgt->poll_groups, group, link);
@ -156,7 +155,8 @@ nvmf_poll_group_add_transport(struct spdk_nvmf_poll_group *group,
SPDK_ERRLOG("Unable to create poll group for transport\n");
return -1;
}
SPDK_DTRACE_PROBE2(nvmf_transport_poll_group_create, transport, spdk_thread_get_id(group->thread));
SPDK_DTRACE_PROBE2_TICKS(nvmf_transport_poll_group_create, transport,
spdk_thread_get_id(group->thread));
tgroup->group = group;
TAILQ_INSERT_TAIL(&group->tgroups, tgroup, link);
@ -170,8 +170,8 @@ nvmf_tgt_create_poll_group(void *io_device, void *ctx_buf)
struct spdk_nvmf_tgt *tgt = io_device;
struct spdk_nvmf_poll_group *group = ctx_buf;
struct spdk_nvmf_transport *transport;
struct spdk_nvmf_subsystem *subsystem;
struct spdk_thread *thread = spdk_get_thread();
uint32_t sid;
int rc;
group->tgt = tgt;
@ -182,7 +182,7 @@ nvmf_tgt_create_poll_group(void *io_device, void *ctx_buf)
group->poller = SPDK_POLLER_REGISTER(nvmf_poll_group_poll, group, 0);
SPDK_DTRACE_PROBE1(nvmf_create_poll_group, spdk_thread_get_id(thread));
SPDK_DTRACE_PROBE1_TICKS(nvmf_create_poll_group, spdk_thread_get_id(thread));
TAILQ_FOREACH(transport, &tgt->transports, link) {
rc = nvmf_poll_group_add_transport(group, transport);
@ -199,14 +199,9 @@ nvmf_tgt_create_poll_group(void *io_device, void *ctx_buf)
return -ENOMEM;
}
for (sid = 0; sid < tgt->max_subsystems; sid++) {
struct spdk_nvmf_subsystem *subsystem;
subsystem = tgt->subsystems[sid];
if (!subsystem) {
continue;
}
for (subsystem = spdk_nvmf_subsystem_get_first(tgt);
subsystem != NULL;
subsystem = spdk_nvmf_subsystem_get_next(subsystem)) {
if (nvmf_poll_group_add_subsystem(group, subsystem, NULL, NULL) != 0) {
nvmf_tgt_cleanup_poll_group(group);
return -1;
@ -222,26 +217,31 @@ nvmf_tgt_create_poll_group(void *io_device, void *ctx_buf)
}
static void
_nvmf_tgt_disconnect_next_qpair(void *ctx)
_nvmf_tgt_disconnect_qpairs(void *ctx)
{
struct spdk_nvmf_qpair *qpair;
struct spdk_nvmf_qpair *qpair, *qpair_tmp;
struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = ctx;
struct spdk_nvmf_poll_group *group = qpair_ctx->group;
struct spdk_io_channel *ch;
int rc = 0;
int rc;
qpair = TAILQ_FIRST(&group->qpairs);
if (qpair) {
rc = spdk_nvmf_qpair_disconnect(qpair, _nvmf_tgt_disconnect_next_qpair, ctx);
TAILQ_FOREACH_SAFE(qpair, &group->qpairs, link, qpair_tmp) {
rc = spdk_nvmf_qpair_disconnect(qpair, NULL, NULL);
if (rc && rc != -EINPROGRESS) {
break;
}
}
if (!qpair || rc != 0) {
if (TAILQ_EMPTY(&group->qpairs)) {
/* When the refcount from the channels reaches 0, nvmf_tgt_destroy_poll_group will be called. */
ch = spdk_io_channel_from_ctx(group);
spdk_put_io_channel(ch);
free(qpair_ctx);
return;
}
/* Some qpairs are in the process of being disconnected. Send a message and try to remove them again */
spdk_thread_send_msg(spdk_get_thread(), _nvmf_tgt_disconnect_qpairs, ctx);
}
static void
@ -249,7 +249,7 @@ nvmf_tgt_destroy_poll_group_qpairs(struct spdk_nvmf_poll_group *group)
{
struct nvmf_qpair_disconnect_many_ctx *ctx;
SPDK_DTRACE_PROBE1(nvmf_destroy_poll_group_qpairs, spdk_thread_get_id(group->thread));
SPDK_DTRACE_PROBE1_TICKS(nvmf_destroy_poll_group_qpairs, spdk_thread_get_id(group->thread));
ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_many_ctx));
if (!ctx) {
@ -258,7 +258,7 @@ nvmf_tgt_destroy_poll_group_qpairs(struct spdk_nvmf_poll_group *group)
}
ctx->group = group;
_nvmf_tgt_disconnect_next_qpair(ctx);
_nvmf_tgt_disconnect_qpairs(ctx);
}
struct spdk_nvmf_tgt *
@ -360,29 +360,36 @@ static void
nvmf_tgt_destroy_cb(void *io_device)
{
struct spdk_nvmf_tgt *tgt = io_device;
uint32_t i;
struct spdk_nvmf_subsystem *subsystem, *subsystem_next;
int rc;
if (tgt->subsystems) {
for (i = 0; i < tgt->max_subsystems; i++) {
if (tgt->subsystems[i]) {
nvmf_subsystem_remove_all_listeners(tgt->subsystems[i], true);
if (tgt->subsystems == NULL) {
_nvmf_tgt_destroy_next_transport(tgt);
return;
}
rc = spdk_nvmf_subsystem_destroy(tgt->subsystems[i], nvmf_tgt_destroy_cb, tgt);
/* We will be freeing subsystems in this loop, so we always need to get the next one
* ahead of time, since we can't call get_next() on a subsystem that's been freed.
*/
for (subsystem = spdk_nvmf_subsystem_get_first(tgt),
subsystem_next = spdk_nvmf_subsystem_get_next(subsystem);
subsystem != NULL;
subsystem = subsystem_next,
subsystem_next = spdk_nvmf_subsystem_get_next(subsystem_next)) {
nvmf_subsystem_remove_all_listeners(subsystem, true);
rc = spdk_nvmf_subsystem_destroy(subsystem, nvmf_tgt_destroy_cb, tgt);
if (rc) {
if (rc == -EINPROGRESS) {
/* If rc is -EINPROGRESS, nvmf_tgt_destroy_cb will be called again when this
* subsystem is destroyed, and will continue to destroy the remaining subsystems, if any */
return;
} else {
SPDK_ERRLOG("Failed to destroy subsystem %s, rc %d\n", tgt->subsystems[i]->subnqn, rc);
}
SPDK_ERRLOG("Failed to destroy subsystem %s, rc %d\n", subsystem->subnqn, rc);
}
}
}
free(tgt->subsystems);
}
_nvmf_tgt_destroy_next_transport(tgt);
}
@ -780,7 +787,7 @@ spdk_nvmf_tgt_add_transport(struct spdk_nvmf_tgt *tgt,
{
struct spdk_nvmf_tgt_add_transport_ctx *ctx;
SPDK_DTRACE_PROBE2(nvmf_tgt_add_transport, transport, tgt->name);
SPDK_DTRACE_PROBE2_TICKS(nvmf_tgt_add_transport, transport, tgt->name);
if (spdk_nvmf_tgt_get_transport(tgt, transport->ops->name)) {
cb_fn(cb_arg, -EEXIST);
@ -838,7 +845,7 @@ spdk_nvmf_tgt_pause_polling(struct spdk_nvmf_tgt *tgt, spdk_nvmf_tgt_pause_polli
{
struct nvmf_tgt_pause_ctx *ctx;
SPDK_DTRACE_PROBE2(nvmf_tgt_pause_polling, tgt, tgt->name);
SPDK_DTRACE_PROBE2_TICKS(nvmf_tgt_pause_polling, tgt, tgt->name);
switch (tgt->state) {
case NVMF_TGT_PAUSING:
@ -898,7 +905,7 @@ spdk_nvmf_tgt_resume_polling(struct spdk_nvmf_tgt *tgt, spdk_nvmf_tgt_resume_pol
{
struct nvmf_tgt_pause_ctx *ctx;
SPDK_DTRACE_PROBE2(nvmf_tgt_resume_polling, tgt, tgt->name);
SPDK_DTRACE_PROBE2_TICKS(nvmf_tgt_resume_polling, tgt, tgt->name);
switch (tgt->state) {
case NVMF_TGT_PAUSING:
@ -1076,7 +1083,7 @@ spdk_nvmf_poll_group_add(struct spdk_nvmf_poll_group *group,
/* We add the qpair to the group only if it is successfully added into the tgroup */
if (rc == 0) {
SPDK_DTRACE_PROBE2(nvmf_poll_group_add_qpair, qpair, spdk_thread_get_id(group->thread));
SPDK_DTRACE_PROBE2_TICKS(nvmf_poll_group_add_qpair, qpair, spdk_thread_get_id(group->thread));
TAILQ_INSERT_TAIL(&group->qpairs, qpair, link);
nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_ACTIVE);
}
@ -1152,7 +1159,7 @@ spdk_nvmf_poll_group_remove(struct spdk_nvmf_qpair *qpair)
struct spdk_nvmf_transport_poll_group *tgroup;
int rc;
SPDK_DTRACE_PROBE2(nvmf_poll_group_remove_qpair, qpair,
SPDK_DTRACE_PROBE2_TICKS(nvmf_poll_group_remove_qpair, qpair,
spdk_thread_get_id(qpair->group->thread));
nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_ERROR);
@ -1238,6 +1245,9 @@ _nvmf_qpair_disconnect_msg(void *ctx)
free(ctx);
}
SPDK_LOG_DEPRECATION_REGISTER(spdk_nvmf_qpair_disconnect, "cb_fn and ctx are deprecated", "v23.09",
0);
int
spdk_nvmf_qpair_disconnect(struct spdk_nvmf_qpair *qpair, nvmf_qpair_disconnect_cb cb_fn, void *ctx)
{
@ -1245,10 +1255,11 @@ spdk_nvmf_qpair_disconnect(struct spdk_nvmf_qpair *qpair, nvmf_qpair_disconnect_
struct nvmf_qpair_disconnect_ctx *qpair_ctx;
if (__atomic_test_and_set(&qpair->disconnect_started, __ATOMIC_RELAXED)) {
if (cb_fn) {
cb_fn(ctx);
return -EINPROGRESS;
}
return 0;
if (cb_fn || ctx) {
SPDK_LOG_DEPRECATED(spdk_nvmf_qpair_disconnect);
}
/* If we get a qpair in the uninitialized state, we can just destroy it immediately */
@ -1277,7 +1288,7 @@ spdk_nvmf_qpair_disconnect(struct spdk_nvmf_qpair *qpair, nvmf_qpair_disconnect_
return 0;
}
SPDK_DTRACE_PROBE2(nvmf_qpair_disconnect, qpair, spdk_thread_get_id(group->thread));
SPDK_DTRACE_PROBE2_TICKS(nvmf_qpair_disconnect, qpair, spdk_thread_get_id(group->thread));
assert(qpair->state == SPDK_NVMF_QPAIR_ACTIVE);
nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_DEACTIVATING);
@ -1294,7 +1305,7 @@ spdk_nvmf_qpair_disconnect(struct spdk_nvmf_qpair *qpair, nvmf_qpair_disconnect_
/* Check for outstanding I/O */
if (!TAILQ_EMPTY(&qpair->outstanding)) {
SPDK_DTRACE_PROBE2(nvmf_poll_group_drain_qpair, qpair, spdk_thread_get_id(group->thread));
SPDK_DTRACE_PROBE2_TICKS(nvmf_poll_group_drain_qpair, qpair, spdk_thread_get_id(group->thread));
qpair->state_cb = _nvmf_qpair_destroy;
qpair->state_cb_arg = qpair_ctx;
nvmf_qpair_abort_pending_zcopy_reqs(qpair);
@ -1311,6 +1322,7 @@ int
spdk_nvmf_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
struct spdk_nvme_transport_id *trid)
{
memset(trid, 0, sizeof(*trid));
return nvmf_transport_qpair_get_peer_trid(qpair, trid);
}
@ -1318,6 +1330,7 @@ int
spdk_nvmf_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
struct spdk_nvme_transport_id *trid)
{
memset(trid, 0, sizeof(*trid));
return nvmf_transport_qpair_get_local_trid(qpair, trid);
}
@ -1325,6 +1338,7 @@ int
spdk_nvmf_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
struct spdk_nvme_transport_id *trid)
{
memset(trid, 0, sizeof(*trid));
return nvmf_transport_qpair_get_listen_trid(qpair, trid);
}
@ -1528,7 +1542,7 @@ fini:
cb_fn(cb_arg, rc);
}
SPDK_DTRACE_PROBE2(nvmf_poll_group_add_subsystem, spdk_thread_get_id(group->thread),
SPDK_DTRACE_PROBE2_TICKS(nvmf_poll_group_add_subsystem, spdk_thread_get_id(group->thread),
subsystem->subnqn);
return rc;
@ -1574,23 +1588,6 @@ fini:
static void nvmf_poll_group_remove_subsystem_msg(void *ctx);
static void
remove_subsystem_qpair_cb(void *ctx)
{
struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = ctx;
assert(qpair_ctx->count > 0);
qpair_ctx->count--;
if (qpair_ctx->count == 0) {
/* All of the asynchronous callbacks for this context have been
* completed. Call nvmf_poll_group_remove_subsystem_msg() again
* to check if all associated qpairs for this subsystem have
* been removed from the poll group.
*/
nvmf_poll_group_remove_subsystem_msg(ctx);
}
}
static void
nvmf_poll_group_remove_subsystem_msg(void *ctx)
{
@ -1604,39 +1601,24 @@ nvmf_poll_group_remove_subsystem_msg(void *ctx)
group = qpair_ctx->group;
subsystem = qpair_ctx->subsystem;
/* Initialize count to 1. This acts like a ref count, to ensure that if spdk_nvmf_qpair_disconnect
* immediately invokes the callback (i.e. the qpairs is already in process of being disconnected)
* that we don't recursively call nvmf_poll_group_remove_subsystem_msg before we've iterated the
* full list of qpairs.
*/
qpair_ctx->count = 1;
TAILQ_FOREACH_SAFE(qpair, &group->qpairs, link, qpair_tmp) {
if ((qpair->ctrlr != NULL) && (qpair->ctrlr->subsys == subsystem)) {
qpairs_found = true;
qpair_ctx->count++;
rc = spdk_nvmf_qpair_disconnect(qpair, remove_subsystem_qpair_cb, ctx);
if (rc) {
rc = spdk_nvmf_qpair_disconnect(qpair, NULL, NULL);
if (rc && rc != -EINPROGRESS) {
break;
}
}
}
qpair_ctx->count--;
if (!qpairs_found) {
_nvmf_poll_group_remove_subsystem_cb(ctx, 0);
return;
}
if (qpair_ctx->count == 0 || rc) {
/* If count == 0, it means there were some qpairs in the poll group but they
* were already in process of being disconnected. So we send a message to this
* same thread so that this function executes again later. We won't actually
* invoke the remove_subsystem_cb until all of the qpairs are actually removed
* from the poll group.
*/
/* Some qpairs are in the process of being disconnected. Send a message and try to remove them again */
spdk_thread_send_msg(spdk_get_thread(), nvmf_poll_group_remove_subsystem_msg, ctx);
}
}
void
nvmf_poll_group_remove_subsystem(struct spdk_nvmf_poll_group *group,
@ -1647,7 +1629,7 @@ nvmf_poll_group_remove_subsystem(struct spdk_nvmf_poll_group *group,
struct nvmf_qpair_disconnect_many_ctx *ctx;
uint32_t i;
SPDK_DTRACE_PROBE3(nvmf_poll_group_remove_subsystem, group, spdk_thread_get_id(group->thread),
SPDK_DTRACE_PROBE3_TICKS(nvmf_poll_group_remove_subsystem, group, spdk_thread_get_id(group->thread),
subsystem->subnqn);
ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_many_ctx));


@ -263,6 +263,8 @@ struct spdk_nvmf_tcp_qpair {
TAILQ_HEAD(, spdk_nvmf_tcp_req) tcp_req_working_queue;
TAILQ_HEAD(, spdk_nvmf_tcp_req) tcp_req_free_queue;
SLIST_HEAD(, nvme_tcp_pdu) tcp_pdu_free_queue;
/* Number of working pdus */
uint32_t tcp_pdu_working_count;
/* Number of requests in each state */
uint32_t state_cntr[TCP_REQUEST_NUM_STATES];
@ -897,7 +899,7 @@ nvmf_tcp_qpair_disconnect(struct spdk_nvmf_tcp_qpair *tqpair)
if (tqpair->state <= NVME_TCP_QPAIR_STATE_RUNNING) {
nvmf_tcp_qpair_set_state(tqpair, NVME_TCP_QPAIR_STATE_EXITING);
nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_ERROR);
spdk_poller_unregister(&tqpair->timeout_poller);
/* This will end up calling nvmf_tcp_close_qpair */
@ -912,7 +914,7 @@ _mgmt_pdu_write_done(void *_tqpair, int err)
struct nvme_tcp_pdu *pdu = tqpair->mgmt_pdu;
if (spdk_unlikely(err != 0)) {
nvmf_tcp_qpair_disconnect(tqpair);
nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
return;
}
@ -937,7 +939,7 @@ _req_pdu_write_done(void *req, int err)
}
if (spdk_unlikely(err != 0)) {
nvmf_tcp_qpair_disconnect(tqpair);
nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
return;
}
@ -1154,6 +1156,7 @@ nvmf_tcp_qpair_init_mem_resource(struct spdk_nvmf_tcp_qpair *tqpair)
tqpair->mgmt_pdu->qpair = tqpair;
tqpair->pdu_in_progress = SLIST_FIRST(&tqpair->tcp_pdu_free_queue);
SLIST_REMOVE_HEAD(&tqpair->tcp_pdu_free_queue, slist);
tqpair->tcp_pdu_working_count = 1;
tqpair->recv_buf_size = (in_capsule_data_size + sizeof(struct spdk_nvme_tcp_cmd) + 2 *
SPDK_NVME_TCP_DIGEST_LEN) * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR;
@ -1457,6 +1460,17 @@ nvmf_tcp_qpair_set_recv_state(struct spdk_nvmf_tcp_qpair *tqpair,
return;
}
if (spdk_unlikely(state == NVME_TCP_PDU_RECV_STATE_QUIESCING)) {
if (tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH && tqpair->pdu_in_progress) {
SLIST_INSERT_HEAD(&tqpair->tcp_pdu_free_queue, tqpair->pdu_in_progress, slist);
tqpair->tcp_pdu_working_count--;
}
}
if (spdk_unlikely(state == NVME_TCP_PDU_RECV_STATE_ERROR)) {
assert(tqpair->tcp_pdu_working_count == 0);
}
if (tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_REQ) {
/* When leaving the await req state, move the qpair to the main list */
TAILQ_REMOVE(&tqpair->group->await_req, tqpair, link);
@ -1527,7 +1541,7 @@ nvmf_tcp_send_c2h_term_req(struct spdk_nvmf_tcp_qpair *tqpair, struct nvme_tcp_p
/* Contain the header of the wrong received pdu */
c2h_term_req->common.plen = c2h_term_req->common.hlen + copy_len;
nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
nvmf_tcp_qpair_write_mgmt_pdu(tqpair, nvmf_tcp_send_c2h_term_req_complete, tqpair);
}
@ -1556,7 +1570,7 @@ nvmf_tcp_capsule_cmd_hdr_handle(struct spdk_nvmf_tcp_transport *ttransport,
/* The host sent more commands than the maximum queue depth. */
SPDK_ERRLOG("Cannot allocate tcp_req on tqpair=%p\n", tqpair);
nvmf_tcp_qpair_disconnect(tqpair);
nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
return;
}
@ -1848,7 +1862,7 @@ nvmf_tcp_h2c_term_req_payload_handle(struct spdk_nvmf_tcp_qpair *tqpair,
struct spdk_nvme_tcp_term_req_hdr *h2c_term_req = &pdu->hdr.term_req;
nvmf_tcp_h2c_term_req_dump(h2c_term_req);
nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
}
static void
@ -1875,6 +1889,7 @@ _nvmf_tcp_pdu_payload_handle(struct spdk_nvmf_tcp_qpair *tqpair, struct nvme_tcp
break;
}
SLIST_INSERT_HEAD(&tqpair->tcp_pdu_free_queue, pdu, slist);
tqpair->tcp_pdu_working_count--;
}
static void
@ -2182,6 +2197,7 @@ nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
}
SLIST_REMOVE_HEAD(&tqpair->tcp_pdu_free_queue, slist);
tqpair->pdu_in_progress = pdu;
tqpair->tcp_pdu_working_count++;
}
memset(pdu, 0, offsetof(struct nvme_tcp_pdu, qpair));
nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH);
@ -2196,7 +2212,8 @@ nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
(void *)&pdu->hdr.common + pdu->ch_valid_bytes);
if (rc < 0) {
SPDK_DEBUGLOG(nvmf_tcp, "will disconnect tqpair=%p\n", tqpair);
return NVME_TCP_PDU_FATAL;
nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
break;
} else if (rc > 0) {
pdu->ch_valid_bytes += rc;
spdk_trace_record(TRACE_TCP_READ_FROM_SOCKET_DONE, tqpair->qpair.qid, rc, 0, tqpair);
@ -2215,7 +2232,8 @@ nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
pdu->psh_len - pdu->psh_valid_bytes,
(void *)&pdu->hdr.raw + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) + pdu->psh_valid_bytes);
if (rc < 0) {
return NVME_TCP_PDU_FATAL;
nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
break;
} else if (rc > 0) {
spdk_trace_record(TRACE_TCP_READ_FROM_SOCKET_DONE, tqpair->qpair.qid, rc, 0, tqpair);
pdu->psh_valid_bytes += rc;
@ -2248,7 +2266,8 @@ nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
rc = nvme_tcp_read_payload_data(tqpair->sock, pdu);
if (rc < 0) {
return NVME_TCP_PDU_FATAL;
nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
break;
}
pdu->rw_offset += rc;
@ -2261,17 +2280,24 @@ nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
spdk_dif_generate_stream(pdu->data_iov, pdu->data_iovcnt, 0, data_len,
pdu->dif_ctx) != 0) {
SPDK_ERRLOG("DIF generate failed\n");
return NVME_TCP_PDU_FATAL;
nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_QUIESCING);
break;
}
/* All of this PDU has now been read from the socket. */
nvmf_tcp_pdu_payload_handle(tqpair, pdu);
break;
case NVME_TCP_PDU_RECV_STATE_QUIESCING:
if (tqpair->tcp_pdu_working_count != 0) {
return NVME_TCP_PDU_IN_PROGRESS;
}
nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
break;
case NVME_TCP_PDU_RECV_STATE_ERROR:
if (!spdk_sock_is_connected(tqpair->sock)) {
return NVME_TCP_PDU_FATAL;
}
break;
return NVME_TCP_PDU_IN_PROGRESS;
default:
SPDK_ERRLOG("The state(%d) is invalid\n", tqpair->recv_state);
abort();
@ -3215,7 +3241,12 @@ nvmf_tcp_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
}
TAILQ_FOREACH_SAFE(tqpair, &tgroup->await_req, link, tqpair_tmp) {
nvmf_tcp_sock_process(tqpair);
rc = nvmf_tcp_sock_process(tqpair);
/* If there was a new socket error, disconnect */
if (rc < 0) {
nvmf_tcp_qpair_disconnect(tqpair);
}
}
return rc;


@ -1,6 +1,7 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (C) 2016 Intel Corporation. All rights reserved.
* Copyright (c) 2018-2019, 2021 Mellanox Technologies LTD. All rights reserved.
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*/
#include "spdk/stdinc.h"
@ -469,6 +470,14 @@ nvmf_stop_listen_fini(struct spdk_io_channel_iter *i, int status)
free(ctx);
}
static void nvmf_stop_listen_disconnect_qpairs(struct spdk_io_channel_iter *i);
static void
nvmf_stop_listen_disconnect_qpairs_msg(void *ctx)
{
nvmf_stop_listen_disconnect_qpairs((struct spdk_io_channel_iter *)ctx);
}
static void
nvmf_stop_listen_disconnect_qpairs(struct spdk_io_channel_iter *i)
{
@ -477,24 +486,33 @@ nvmf_stop_listen_disconnect_qpairs(struct spdk_io_channel_iter *i)
struct spdk_io_channel *ch;
struct spdk_nvmf_qpair *qpair, *tmp_qpair;
struct spdk_nvme_transport_id tmp_trid;
bool qpair_found = false;
ctx = spdk_io_channel_iter_get_ctx(i);
ch = spdk_io_channel_iter_get_channel(i);
group = spdk_io_channel_get_ctx(ch);
TAILQ_FOREACH_SAFE(qpair, &group->qpairs, link, tmp_qpair) {
/* skip qpairs that don't match the TRID. */
if (spdk_nvmf_qpair_get_listen_trid(qpair, &tmp_trid)) {
continue;
}
/* Skip qpairs that don't match the listen trid and subsystem pointer. If
* the ctx->subsystem is NULL, it means disconnect all qpairs that match
* the listen trid. */
if (!spdk_nvme_transport_id_compare(&ctx->trid, &tmp_trid)) {
if (ctx->subsystem == NULL || qpair->ctrlr == NULL ||
ctx->subsystem == qpair->ctrlr->subsys) {
if (ctx->subsystem == NULL ||
(qpair->ctrlr != NULL && ctx->subsystem == qpair->ctrlr->subsys)) {
spdk_nvmf_qpair_disconnect(qpair, NULL, NULL);
qpair_found = true;
}
}
}
if (qpair_found) {
spdk_thread_send_msg(spdk_get_thread(), nvmf_stop_listen_disconnect_qpairs_msg, i);
return;
}
spdk_for_each_channel_continue(i, 0);
}
@ -507,6 +525,11 @@ spdk_nvmf_transport_stop_listen_async(struct spdk_nvmf_transport *transport,
{
struct nvmf_stop_listen_ctx *ctx;
if (trid->subnqn[0] != '\0') {
SPDK_ERRLOG("subnqn should be empty, use subsystem pointer instead\n");
return -EINVAL;
}
ctx = calloc(1, sizeof(struct nvmf_stop_listen_ctx));
if (ctx == NULL) {
return -ENOMEM;


@ -1,7 +1,7 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (C) 2020 Intel Corporation.
* Copyright (c) 2019-2022, Nutanix Inc. All rights reserved.
* Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022, 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*/
/*
@ -318,6 +318,8 @@ struct nvmf_vfio_user_sq {
*/
struct spdk_nvme_cmd create_io_sq_cmd;
struct vfio_user_delete_sq_ctx *delete_ctx;
/* Currently unallocated reqs. */
TAILQ_HEAD(, nvmf_vfio_user_req) free_reqs;
/* Poll group entry */
@ -605,53 +607,6 @@ cq_tail_advance(struct nvmf_vfio_user_cq *cq)
}
}
static uint32_t
cq_free_slots(struct nvmf_vfio_user_cq *cq)
{
uint32_t free_slots;
assert(cq != NULL);
if (cq->tail == cq->last_head) {
free_slots = cq->size;
} else if (cq->tail > cq->last_head) {
free_slots = cq->size - (cq->tail - cq->last_head);
} else {
free_slots = cq->last_head - cq->tail;
}
assert(free_slots > 0);
return free_slots - 1;
}
/*
* As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow
* control: if there is no space in the CQ, we should wait until there is.
*
* In practice, we just fail the controller instead: as it happens, all host
* implementations we care about right-size the CQ: this is required anyway for
* NVMEoF support (see 3.3.2.8).
*
* Since reading the head doorbell is relatively expensive, we use the cached
* value, so we only have to read it for real if it appears that we are full.
*/
static inline bool
cq_is_full(struct nvmf_vfio_user_cq *cq)
{
uint32_t free_cq_slots;
assert(cq != NULL);
free_cq_slots = cq_free_slots(cq);
if (spdk_unlikely(free_cq_slots == 0)) {
cq->last_head = *cq_dbl_headp(cq);
free_cq_slots = cq_free_slots(cq);
}
return free_cq_slots == 0;
}
static bool
io_q_exists(struct nvmf_vfio_user_ctrlr *vu_ctrlr, const uint16_t qid, const bool is_cq)
{
@ -1724,6 +1679,46 @@ vfio_user_map_cmd(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvmf_request *
static int handle_cmd_req(struct nvmf_vfio_user_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
struct nvmf_vfio_user_sq *sq);
static uint32_t
cq_free_slots(struct nvmf_vfio_user_cq *cq)
{
uint32_t free_slots;
assert(cq != NULL);
if (cq->tail == cq->last_head) {
free_slots = cq->size;
} else if (cq->tail > cq->last_head) {
free_slots = cq->size - (cq->tail - cq->last_head);
} else {
free_slots = cq->last_head - cq->tail;
}
assert(free_slots > 0);
return free_slots - 1;
}
/*
* Since reading the head doorbell is relatively expensive, we use the cached
* value, so we only have to read it for real if it appears that we are full.
*/
static inline bool
cq_is_full(struct nvmf_vfio_user_cq *cq)
{
uint32_t free_cq_slots;
assert(cq != NULL);
free_cq_slots = cq_free_slots(cq);
if (spdk_unlikely(free_cq_slots == 0)) {
cq->last_head = *cq_dbl_headp(cq);
free_cq_slots = cq_free_slots(cq);
}
return free_cq_slots == 0;
}
/*
* Posts a CQE in the completion queue.
*
@ -1753,6 +1748,14 @@ post_completion(struct nvmf_vfio_user_ctrlr *ctrlr, struct nvmf_vfio_user_cq *cq
assert(spdk_get_thread() == cq->group->group->thread);
}
/*
* As per NVMe Base spec 3.3.1.2.1, we are supposed to implement CQ flow
* control: if there is no space in the CQ, we should wait until there is.
*
* In practice, we just fail the controller instead: as it happens, all host
* implementations we care about right-size the CQ: this is required anyway for
* NVMEoF support (see 3.3.2.8).
*/
if (cq_is_full(cq)) {
SPDK_ERRLOG("%s: cqid:%d full (tail=%d, head=%d)\n",
ctrlr_id(ctrlr), cq->qid, *cq_tailp(cq),
@ -2239,11 +2242,11 @@ out:
}
/* For ADMIN I/O DELETE SUBMISSION QUEUE the NVMf library will disconnect and free
* queue pair, so save the command in a context.
* queue pair, so save the command id and controller in a context.
*/
struct vfio_user_delete_sq_ctx {
struct nvmf_vfio_user_ctrlr *vu_ctrlr;
struct spdk_nvme_cmd delete_io_sq_cmd;
uint16_t cid;
};
static void
@ -2262,7 +2265,7 @@ vfio_user_qpair_delete_cb(void *cb_arg)
cb_arg);
} else {
post_completion(vu_ctrlr, admin_cq, 0, 0,
ctx->delete_io_sq_cmd.cid,
ctx->cid,
SPDK_NVME_SC_SUCCESS, SPDK_NVME_SCT_GENERIC);
free(ctx);
}
@ -2279,7 +2282,6 @@ handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
uint16_t sc = SPDK_NVME_SC_SUCCESS;
struct nvmf_vfio_user_sq *sq;
struct nvmf_vfio_user_cq *cq;
struct vfio_user_delete_sq_ctx *ctx;
SPDK_DEBUGLOG(nvmf_vfio, "%s: delete I/O %cqid:%d\n",
ctrlr_id(ctrlr), is_cq ? 'c' : 's',
@ -2308,21 +2310,20 @@ handle_del_io_q(struct nvmf_vfio_user_ctrlr *ctrlr,
* VM reboot or CC.EN change, so we have to delete it in all
* other cases.
*/
ctx = calloc(1, sizeof(*ctx));
if (!ctx) {
sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid];
sq->delete_ctx = calloc(1, sizeof(*sq->delete_ctx));
if (!sq->delete_ctx) {
sct = SPDK_NVME_SCT_GENERIC;
sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
goto out;
}
ctx->vu_ctrlr = ctrlr;
ctx->delete_io_sq_cmd = *cmd;
sq = ctrlr->sqs[cmd->cdw10_bits.delete_io_q.qid];
sq->delete_ctx->vu_ctrlr = ctrlr;
sq->delete_ctx->cid = cmd->cid;
sq->sq_state = VFIO_USER_SQ_DELETED;
assert(ctrlr->cqs[sq->cqid]->cq_ref);
ctrlr->cqs[sq->cqid]->cq_ref--;
spdk_nvmf_qpair_disconnect(&sq->qpair, vfio_user_qpair_delete_cb, ctx);
spdk_nvmf_qpair_disconnect(&sq->qpair, NULL, NULL);
return 0;
}
@ -5324,11 +5325,14 @@ nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair,
struct nvmf_vfio_user_sq *sq;
struct nvmf_vfio_user_ctrlr *vu_ctrlr;
struct nvmf_vfio_user_endpoint *endpoint;
struct vfio_user_delete_sq_ctx *del_ctx;
assert(qpair != NULL);
sq = SPDK_CONTAINEROF(qpair, struct nvmf_vfio_user_sq, qpair);
vu_ctrlr = sq->ctrlr;
endpoint = vu_ctrlr->endpoint;
del_ctx = sq->delete_ctx;
sq->delete_ctx = NULL;
pthread_mutex_lock(&endpoint->lock);
TAILQ_REMOVE(&vu_ctrlr->connected_sqs, sq, tailq);
@ -5347,6 +5351,10 @@ nvmf_vfio_user_close_qpair(struct spdk_nvmf_qpair *qpair,
}
pthread_mutex_unlock(&endpoint->lock);
if (del_ctx) {
vfio_user_qpair_delete_cb(del_ctx);
}
if (cb_fn) {
cb_fn(cb_arg);
}


@ -537,9 +537,6 @@ _allocate_bit_arrays(struct spdk_reduce_vol *vol)
return 0;
}
SPDK_LOG_DEPRECATION_REGISTER(libreduce_pm_file,
"PMDK libpmem reduce integration", "SPDK 23.05", 0);
void
spdk_reduce_vol_init(struct spdk_reduce_vol_params *params,
struct spdk_reduce_backing_dev *backing_dev,
@ -552,8 +549,6 @@ spdk_reduce_vol_init(struct spdk_reduce_vol_params *params,
size_t mapped_len;
int dir_len, max_dir_len, rc;
SPDK_LOG_DEPRECATED(libreduce_pm_file);
/* We need to append a path separator and the UUID to the supplied
* path.
*/


@ -9,7 +9,7 @@ include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
SO_VER := 8
SO_MINOR := 0
C_SRCS = thread.c
C_SRCS = thread.c iobuf.c
LIBNAME = thread
SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_thread.map)

lib/thread/iobuf.c (new file, 410 lines)

@ -0,0 +1,410 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (C) 2023 Intel Corporation.
* All rights reserved.
*/
#include "spdk/env.h"
#include "spdk/util.h"
#include "spdk/likely.h"
#include "spdk/log.h"
#include "spdk/thread.h"
#include "spdk/bdev.h"
#define IOBUF_MIN_SMALL_POOL_SIZE 8191
#define IOBUF_MIN_LARGE_POOL_SIZE 1023
#define IOBUF_ALIGNMENT 512
#define IOBUF_MIN_SMALL_BUFSIZE (SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + \
IOBUF_ALIGNMENT)
#define IOBUF_MIN_LARGE_BUFSIZE (SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + \
IOBUF_ALIGNMENT)
SPDK_STATIC_ASSERT(sizeof(struct spdk_iobuf_buffer) <= IOBUF_MIN_SMALL_BUFSIZE,
"Invalid data offset");
struct iobuf_channel {
spdk_iobuf_entry_stailq_t small_queue;
spdk_iobuf_entry_stailq_t large_queue;
};
struct iobuf_module {
char *name;
TAILQ_ENTRY(iobuf_module) tailq;
};
struct iobuf {
struct spdk_mempool *small_pool;
struct spdk_mempool *large_pool;
struct spdk_iobuf_opts opts;
TAILQ_HEAD(, iobuf_module) modules;
spdk_iobuf_finish_cb finish_cb;
void *finish_arg;
};
static struct iobuf g_iobuf = {
.modules = TAILQ_HEAD_INITIALIZER(g_iobuf.modules),
.opts = {
.small_pool_count = IOBUF_MIN_SMALL_POOL_SIZE,
.large_pool_count = IOBUF_MIN_LARGE_POOL_SIZE,
.small_bufsize = IOBUF_MIN_SMALL_BUFSIZE,
.large_bufsize = IOBUF_MIN_LARGE_BUFSIZE,
},
};
static int
iobuf_channel_create_cb(void *io_device, void *ctx)
{
struct iobuf_channel *ch = ctx;
STAILQ_INIT(&ch->small_queue);
STAILQ_INIT(&ch->large_queue);
return 0;
}
static void
iobuf_channel_destroy_cb(void *io_device, void *ctx)
{
struct iobuf_channel *ch __attribute__((unused)) = ctx;
assert(STAILQ_EMPTY(&ch->small_queue));
assert(STAILQ_EMPTY(&ch->large_queue));
}
int
spdk_iobuf_initialize(void)
{
struct spdk_iobuf_opts *opts = &g_iobuf.opts;
int rc = 0;
g_iobuf.small_pool = spdk_mempool_create("iobuf_small_pool", opts->small_pool_count,
opts->small_bufsize, 0, SPDK_ENV_SOCKET_ID_ANY);
if (!g_iobuf.small_pool) {
SPDK_ERRLOG("Failed to create small iobuf pool\n");
rc = -ENOMEM;
goto error;
}
g_iobuf.large_pool = spdk_mempool_create("iobuf_large_pool", opts->large_pool_count,
opts->large_bufsize, 0, SPDK_ENV_SOCKET_ID_ANY);
if (!g_iobuf.large_pool) {
SPDK_ERRLOG("Failed to create large iobuf pool\n");
rc = -ENOMEM;
goto error;
}
spdk_io_device_register(&g_iobuf, iobuf_channel_create_cb, iobuf_channel_destroy_cb,
sizeof(struct iobuf_channel), "iobuf");
return 0;
error:
spdk_mempool_free(g_iobuf.small_pool);
return rc;
}
static void
iobuf_unregister_cb(void *io_device)
{
struct iobuf_module *module;
while (!TAILQ_EMPTY(&g_iobuf.modules)) {
module = TAILQ_FIRST(&g_iobuf.modules);
TAILQ_REMOVE(&g_iobuf.modules, module, tailq);
free(module->name);
free(module);
}
if (spdk_mempool_count(g_iobuf.small_pool) != g_iobuf.opts.small_pool_count) {
SPDK_ERRLOG("small iobuf pool count is %zu, expected %"PRIu64"\n",
spdk_mempool_count(g_iobuf.small_pool), g_iobuf.opts.small_pool_count);
}
if (spdk_mempool_count(g_iobuf.large_pool) != g_iobuf.opts.large_pool_count) {
SPDK_ERRLOG("large iobuf pool count is %zu, expected %"PRIu64"\n",
spdk_mempool_count(g_iobuf.large_pool), g_iobuf.opts.large_pool_count);
}
spdk_mempool_free(g_iobuf.small_pool);
spdk_mempool_free(g_iobuf.large_pool);
if (g_iobuf.finish_cb != NULL) {
g_iobuf.finish_cb(g_iobuf.finish_arg);
}
}
void
spdk_iobuf_finish(spdk_iobuf_finish_cb cb_fn, void *cb_arg)
{
g_iobuf.finish_cb = cb_fn;
g_iobuf.finish_arg = cb_arg;
spdk_io_device_unregister(&g_iobuf, iobuf_unregister_cb);
}
int
spdk_iobuf_set_opts(const struct spdk_iobuf_opts *opts)
{
if (opts->small_pool_count < IOBUF_MIN_SMALL_POOL_SIZE) {
SPDK_ERRLOG("small_pool_count must be at least %" PRIu32 "\n",
IOBUF_MIN_SMALL_POOL_SIZE);
return -EINVAL;
}
if (opts->large_pool_count < IOBUF_MIN_LARGE_POOL_SIZE) {
SPDK_ERRLOG("large_pool_count must be at least %" PRIu32 "\n",
IOBUF_MIN_LARGE_POOL_SIZE);
return -EINVAL;
}
if (opts->small_bufsize < IOBUF_MIN_SMALL_BUFSIZE) {
SPDK_ERRLOG("small_bufsize must be at least %" PRIu32 "\n",
IOBUF_MIN_SMALL_BUFSIZE);
return -EINVAL;
}
if (opts->large_bufsize < IOBUF_MIN_LARGE_BUFSIZE) {
SPDK_ERRLOG("large_bufsize must be at least %" PRIu32 "\n",
IOBUF_MIN_LARGE_BUFSIZE);
return -EINVAL;
}
g_iobuf.opts = *opts;
return 0;
}
void
spdk_iobuf_get_opts(struct spdk_iobuf_opts *opts)
{
*opts = g_iobuf.opts;
}
int
spdk_iobuf_channel_init(struct spdk_iobuf_channel *ch, const char *name,
uint32_t small_cache_size, uint32_t large_cache_size)
{
struct spdk_io_channel *ioch;
struct iobuf_channel *iobuf_ch;
struct iobuf_module *module;
struct spdk_iobuf_buffer *buf;
uint32_t i;
TAILQ_FOREACH(module, &g_iobuf.modules, tailq) {
if (strcmp(name, module->name) == 0) {
break;
}
}
if (module == NULL) {
SPDK_ERRLOG("Couldn't find iobuf module: '%s'\n", name);
return -ENODEV;
}
ioch = spdk_get_io_channel(&g_iobuf);
if (ioch == NULL) {
SPDK_ERRLOG("Couldn't get iobuf IO channel\n");
return -ENOMEM;
}
iobuf_ch = spdk_io_channel_get_ctx(ioch);
ch->small.queue = &iobuf_ch->small_queue;
ch->large.queue = &iobuf_ch->large_queue;
ch->small.pool = g_iobuf.small_pool;
ch->large.pool = g_iobuf.large_pool;
ch->small.bufsize = g_iobuf.opts.small_bufsize;
ch->large.bufsize = g_iobuf.opts.large_bufsize;
ch->parent = ioch;
ch->module = module;
ch->small.cache_size = small_cache_size;
ch->large.cache_size = large_cache_size;
ch->small.cache_count = 0;
ch->large.cache_count = 0;
STAILQ_INIT(&ch->small.cache);
STAILQ_INIT(&ch->large.cache);
for (i = 0; i < small_cache_size; ++i) {
buf = spdk_mempool_get(g_iobuf.small_pool);
if (buf == NULL) {
SPDK_ERRLOG("Failed to populate iobuf small buffer cache. "
"You may need to increase spdk_iobuf_opts.small_pool_count\n");
goto error;
}
STAILQ_INSERT_TAIL(&ch->small.cache, buf, stailq);
ch->small.cache_count++;
}
for (i = 0; i < large_cache_size; ++i) {
buf = spdk_mempool_get(g_iobuf.large_pool);
if (buf == NULL) {
SPDK_ERRLOG("Failed to populate iobuf large buffer cache. "
"You may need to increase spdk_iobuf_opts.large_pool_count\n");
goto error;
}
STAILQ_INSERT_TAIL(&ch->large.cache, buf, stailq);
ch->large.cache_count++;
}
return 0;
error:
spdk_iobuf_channel_fini(ch);
return -ENOMEM;
}
void
spdk_iobuf_channel_fini(struct spdk_iobuf_channel *ch)
{
struct spdk_iobuf_entry *entry __attribute__((unused));
struct spdk_iobuf_buffer *buf;
/* Make sure none of the wait queue entries are coming from this module */
STAILQ_FOREACH(entry, ch->small.queue, stailq) {
assert(entry->module != ch->module);
}
STAILQ_FOREACH(entry, ch->large.queue, stailq) {
assert(entry->module != ch->module);
}
/* Release cached buffers back to the pool */
while (!STAILQ_EMPTY(&ch->small.cache)) {
buf = STAILQ_FIRST(&ch->small.cache);
STAILQ_REMOVE_HEAD(&ch->small.cache, stailq);
spdk_mempool_put(ch->small.pool, buf);
ch->small.cache_count--;
}
while (!STAILQ_EMPTY(&ch->large.cache)) {
buf = STAILQ_FIRST(&ch->large.cache);
STAILQ_REMOVE_HEAD(&ch->large.cache, stailq);
spdk_mempool_put(ch->large.pool, buf);
ch->large.cache_count--;
}
assert(ch->small.cache_count == 0);
assert(ch->large.cache_count == 0);
spdk_put_io_channel(ch->parent);
ch->parent = NULL;
}
int
spdk_iobuf_register_module(const char *name)
{
struct iobuf_module *module;
TAILQ_FOREACH(module, &g_iobuf.modules, tailq) {
if (strcmp(name, module->name) == 0) {
return -EEXIST;
}
}
module = calloc(1, sizeof(*module));
if (module == NULL) {
return -ENOMEM;
}
module->name = strdup(name);
if (module->name == NULL) {
free(module);
return -ENOMEM;
}
TAILQ_INSERT_TAIL(&g_iobuf.modules, module, tailq);
return 0;
}
int
spdk_iobuf_for_each_entry(struct spdk_iobuf_channel *ch, struct spdk_iobuf_pool *pool,
spdk_iobuf_for_each_entry_fn cb_fn, void *cb_ctx)
{
struct spdk_iobuf_entry *entry, *tmp;
int rc;
STAILQ_FOREACH_SAFE(entry, pool->queue, stailq, tmp) {
/* We only want to iterate over the entries requested by the module which owns ch */
if (entry->module != ch->module) {
continue;
}
rc = cb_fn(ch, entry, cb_ctx);
if (rc != 0) {
return rc;
}
}
return 0;
}
void
spdk_iobuf_entry_abort(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry,
uint64_t len)
{
struct spdk_iobuf_pool *pool;
if (len <= ch->small.bufsize) {
pool = &ch->small;
} else {
assert(len <= ch->large.bufsize);
pool = &ch->large;
}
STAILQ_REMOVE(pool->queue, entry, spdk_iobuf_entry, stailq);
}
void *
spdk_iobuf_get(struct spdk_iobuf_channel *ch, uint64_t len,
struct spdk_iobuf_entry *entry, spdk_iobuf_get_cb cb_fn)
{
struct spdk_iobuf_pool *pool;
void *buf;
assert(spdk_io_channel_get_thread(ch->parent) == spdk_get_thread());
if (len <= ch->small.bufsize) {
pool = &ch->small;
} else {
assert(len <= ch->large.bufsize);
pool = &ch->large;
}
buf = (void *)STAILQ_FIRST(&pool->cache);
if (buf) {
STAILQ_REMOVE_HEAD(&pool->cache, stailq);
assert(pool->cache_count > 0);
pool->cache_count--;
} else {
buf = spdk_mempool_get(pool->pool);
if (!buf) {
STAILQ_INSERT_TAIL(pool->queue, entry, stailq);
entry->module = ch->module;
entry->cb_fn = cb_fn;
return NULL;
}
}
return (char *)buf;
}
void
spdk_iobuf_put(struct spdk_iobuf_channel *ch, void *buf, uint64_t len)
{
struct spdk_iobuf_entry *entry;
struct spdk_iobuf_pool *pool;
assert(spdk_io_channel_get_thread(ch->parent) == spdk_get_thread());
if (len <= ch->small.bufsize) {
pool = &ch->small;
} else {
pool = &ch->large;
}
if (STAILQ_EMPTY(pool->queue)) {
if (pool->cache_count < pool->cache_size) {
STAILQ_INSERT_HEAD(&pool->cache, (struct spdk_iobuf_buffer *)buf, stailq);
pool->cache_count++;
} else {
spdk_mempool_put(pool->pool, buf);
}
} else {
entry = STAILQ_FIRST(pool->queue);
STAILQ_REMOVE_HEAD(pool->queue, stailq);
entry->cb_fn(entry, buf);
}
}
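The allocation strategy in `spdk_iobuf_get()`/`spdk_iobuf_put()` above — try the per-channel cache first, fall back to the shared pool, and park the request on a wait queue when both are empty, so a later put hands the freed buffer directly to the oldest waiter — can be sketched standalone. This is a sketch under stated assumptions: `buf_pool`, `pool_get`, `pool_put` and the fixed-size arrays standing in for the mempool are made up, and the waiter stores the buffer in a field where the real code invokes `entry->cb_fn`:

```c
#include <assert.h>
#include <stddef.h>
#include <sys/queue.h>

struct waiter {
	void *buf; /* filled in when a buffer is handed over (real code calls cb_fn) */
	STAILQ_ENTRY(waiter) stailq;
};

struct buf_pool {
	void *bufs[4];                 /* stand-in for the shared mempool */
	size_t count;
	void *cache[2];                /* per-channel cache */
	size_t cache_count;
	STAILQ_HEAD(, waiter) waiters; /* requests waiting for a free buffer */
};

/* Cache first, then shared pool; on exhaustion queue the waiter and return NULL. */
static void *
pool_get(struct buf_pool *p, struct waiter *w)
{
	if (p->cache_count > 0) {
		return p->cache[--p->cache_count];
	}
	if (p->count > 0) {
		return p->bufs[--p->count];
	}
	STAILQ_INSERT_TAIL(&p->waiters, w, stailq);
	return NULL;
}

/* Hand the buffer to the oldest waiter if any; otherwise refill cache, then pool. */
static void
pool_put(struct buf_pool *p, void *buf)
{
	struct waiter *w = STAILQ_FIRST(&p->waiters);

	if (w != NULL) {
		STAILQ_REMOVE_HEAD(&p->waiters, stailq);
		w->buf = buf;
	} else if (p->cache_count < 2) {
		p->cache[p->cache_count++] = buf;
	} else {
		p->bufs[p->count++] = buf;
	}
}
```

Handing the freed buffer straight to a waiter (instead of returning it to the pool and waking the waiter) is what keeps a starved channel from losing the buffer to another get racing on the shared pool.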

View File

@ -72,6 +72,8 @@
spdk_iobuf_register_module;
spdk_iobuf_for_each_entry;
spdk_iobuf_entry_abort;
spdk_iobuf_get;
spdk_iobuf_put;
# internal functions in spdk_internal/thread.h
spdk_poller_get_name;

View File

@ -7,7 +7,6 @@
#include "spdk/stdinc.h"
#include "spdk/env.h"
#include "spdk/bdev.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/string.h"
@ -26,6 +25,11 @@
#ifdef __linux__
#include <sys/timerfd.h>
#include <sys/eventfd.h>
#include <execinfo.h>
#endif
#ifdef __FreeBSD__
#include <execinfo.h>
#endif
#define SPDK_MSG_BATCH_SIZE 8
@ -33,13 +37,6 @@
#define SPDK_THREAD_EXIT_TIMEOUT_SEC 5
#define SPDK_MAX_POLLER_NAME_LEN 256
#define SPDK_MAX_THREAD_NAME_LEN 256
#define IOBUF_MIN_SMALL_POOL_SIZE 8191
#define IOBUF_MIN_LARGE_POOL_SIZE 1023
#define IOBUF_ALIGNMENT 512
#define IOBUF_MIN_SMALL_BUFSIZE (SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + \
IOBUF_ALIGNMENT)
#define IOBUF_MIN_LARGE_BUFSIZE (SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + \
IOBUF_ALIGNMENT)
static struct spdk_thread *g_app_thread;
@ -188,6 +185,11 @@ enum spin_error {
* deadlock when another SPDK thread on the same pthread tries to take that lock.
*/
SPIN_ERR_HOLD_DURING_SWITCH,
/* Trying to use a lock that was destroyed (but not re-initialized) */
SPIN_ERR_DESTROYED,
/* Trying to use a lock that is not initialized */
SPIN_ERR_NOT_INITIALIZED,
/* Must be last, not an actual error code */
SPIN_ERR_LAST
};
@ -201,6 +203,8 @@ static const char *spin_error_strings[] = {
[SPIN_ERR_LOCK_HELD] = "Destroying a held spinlock",
[SPIN_ERR_LOCK_COUNT] = "Lock count is invalid",
[SPIN_ERR_HOLD_DURING_SWITCH] = "Lock(s) held while SPDK thread going off CPU",
[SPIN_ERR_DESTROYED] = "Lock has been destroyed",
[SPIN_ERR_NOT_INITIALIZED] = "Lock has not been initialized",
};
#define SPIN_ERROR_STRING(err) (err < 0 || err >= SPDK_COUNTOF(spin_error_strings)) \
@ -215,18 +219,20 @@ __posix_abort(enum spin_error err)
typedef void (*spin_abort)(enum spin_error err);
spin_abort g_spin_abort_fn = __posix_abort;
#define SPIN_ASSERT_IMPL(cond, err, ret) \
#define SPIN_ASSERT_IMPL(cond, err, extra_log, ret) \
do { \
if (spdk_unlikely(!(cond))) { \
SPDK_ERRLOG("unrecoverable spinlock error %d: %s (%s)\n", err, \
SPIN_ERROR_STRING(err), #cond); \
extra_log; \
g_spin_abort_fn(err); \
ret; \
} \
} while (0)
#define SPIN_ASSERT_RETURN_VOID(cond, err) SPIN_ASSERT_IMPL(cond, err, return)
#define SPIN_ASSERT_RETURN(cond, err, ret) SPIN_ASSERT_IMPL(cond, err, return ret)
#define SPIN_ASSERT(cond, err) SPIN_ASSERT_IMPL(cond, err,)
#define SPIN_ASSERT_LOG_STACKS(cond, err, sspin) \
SPIN_ASSERT_IMPL(cond, err, sspin_stacks_print(sspin), return)
#define SPIN_ASSERT_RETURN(cond, err, ret) SPIN_ASSERT_IMPL(cond, err, , return ret)
#define SPIN_ASSERT(cond, err) SPIN_ASSERT_IMPL(cond, err, ,)
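The `extra_log` parameter threaded through `SPIN_ASSERT_IMPL` above lets one base macro optionally run an extra statement (here, printing the lock's saved stacks) before aborting, while the thin variants pass an empty argument. A minimal standalone version of that macro shape, with a counter standing in for `g_spin_abort_fn` and `check_positive` as a made-up user:

```c
#include <assert.h>
#include <stdio.h>

static int g_abort_count; /* stand-in for g_spin_abort_fn() so failures are observable */

#define CHECK_IMPL(cond, extra_log, ret)                              \
	do {                                                          \
		if (!(cond)) {                                        \
			fprintf(stderr, "check failed: %s\n", #cond); \
			extra_log;                                    \
			g_abort_count++;                              \
			ret;                                          \
		}                                                     \
	} while (0)

/* Variants mirroring SPIN_ASSERT_RETURN / SPIN_ASSERT / SPIN_ASSERT_LOG_STACKS */
#define CHECK_RETURN(cond, ret)  CHECK_IMPL(cond, , return ret)
#define CHECK(cond)              CHECK_IMPL(cond, ,)
#define CHECK_LOG(cond, stmt)    CHECK_IMPL(cond, stmt, return -1)

/* Returns v on success, -1 after logging and running the extra statement. */
static int
check_positive(int v, int *extra_ran)
{
	CHECK_LOG(v > 0, (*extra_ran)++);
	return v;
}
```

Empty macro arguments (as in `CHECK_IMPL(cond, , return ret)`) are valid C99, which is what allows a single implementation to back all three variants.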
struct io_device {
void *io_device;
@ -245,35 +251,6 @@ struct io_device {
bool unregistered;
};
struct iobuf_channel {
spdk_iobuf_entry_stailq_t small_queue;
spdk_iobuf_entry_stailq_t large_queue;
};
struct iobuf_module {
char *name;
TAILQ_ENTRY(iobuf_module) tailq;
};
struct iobuf {
struct spdk_mempool *small_pool;
struct spdk_mempool *large_pool;
struct spdk_iobuf_opts opts;
TAILQ_HEAD(, iobuf_module) modules;
spdk_iobuf_finish_cb finish_cb;
void *finish_arg;
};
static struct iobuf g_iobuf = {
.modules = TAILQ_HEAD_INITIALIZER(g_iobuf.modules),
.opts = {
.small_pool_count = IOBUF_MIN_SMALL_POOL_SIZE,
.large_pool_count = IOBUF_MIN_LARGE_POOL_SIZE,
.small_bufsize = IOBUF_MIN_SMALL_BUFSIZE,
.large_bufsize = IOBUF_MIN_LARGE_BUFSIZE,
},
};
static RB_HEAD(io_device_tree, io_device) g_io_devices = RB_INITIALIZER(g_io_devices);
static int
@ -2913,6 +2890,84 @@ spdk_interrupt_mode_is_enabled(void)
return g_interrupt_mode;
}
#define SSPIN_DEBUG_STACK_FRAMES 16
struct sspin_stack {
void *addrs[SSPIN_DEBUG_STACK_FRAMES];
uint32_t depth;
};
struct spdk_spinlock_internal {
struct sspin_stack init_stack;
struct sspin_stack lock_stack;
struct sspin_stack unlock_stack;
};
static void
sspin_init_internal(struct spdk_spinlock *sspin)
{
#ifdef DEBUG
sspin->internal = calloc(1, sizeof(*sspin->internal));
#endif
}
static void
sspin_fini_internal(struct spdk_spinlock *sspin)
{
#ifdef DEBUG
free(sspin->internal);
sspin->internal = NULL;
#endif
}
#ifdef DEBUG
#define SSPIN_GET_STACK(sspin, which) \
do { \
if (sspin->internal != NULL) { \
struct sspin_stack *stack = &sspin->internal->which ## _stack; \
stack->depth = backtrace(stack->addrs, SPDK_COUNTOF(stack->addrs)); \
} \
} while (0)
#else
#define SSPIN_GET_STACK(sspin, which) do { } while (0)
#endif
static void
sspin_stack_print(const char *title, const struct sspin_stack *sspin_stack)
{
char **stack;
size_t i;
stack = backtrace_symbols(sspin_stack->addrs, sspin_stack->depth);
if (stack == NULL) {
SPDK_ERRLOG("Out of memory while allocating stack for %s\n", title);
return;
}
SPDK_ERRLOG(" %s:\n", title);
for (i = 0; i < sspin_stack->depth; i++) {
/*
* This does not print line numbers. In gdb, use something like "list *0x444b6b" or
* "list *sspin_stack->addrs[0]". Or more conveniently, load the spdk gdb macros
* and use "print *sspin" or "print sspin->internal.lock_stack". See
* gdb_macros.md in the docs directory for details.
*/
SPDK_ERRLOG(" #%zu: %s\n", i, stack[i]);
}
free(stack);
}
static void
sspin_stacks_print(const struct spdk_spinlock *sspin)
{
if (sspin->internal == NULL) {
return;
}
SPDK_ERRLOG("spinlock %p\n", sspin);
sspin_stack_print("Lock initialized at", &sspin->internal->init_stack);
sspin_stack_print("Last locked at", &sspin->internal->lock_stack);
sspin_stack_print("Last unlocked at", &sspin->internal->unlock_stack);
}
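The stack capture behind `SSPIN_GET_STACK` relies on glibc's `backtrace()`/`backtrace_symbols()` from `<execinfo.h>` (the `#include <execinfo.h>` added earlier in this series; FreeBSD gets it via libexecinfo). A minimal standalone capture/print pair in the same shape — `stack_capture`/`stack_print` are illustrative names, and without linking with `-rdynamic` the symbols may print as bare addresses:

```c
#include <assert.h>
#include <execinfo.h>
#include <stdio.h>
#include <stdlib.h>

#define STACK_FRAMES 16

struct stack_trace {
	void *addrs[STACK_FRAMES];
	int depth;
};

/* Record the current call stack; returns the number of frames captured. */
static int
stack_capture(struct stack_trace *tr)
{
	tr->depth = backtrace(tr->addrs, STACK_FRAMES);
	return tr->depth;
}

/* Print the captured frames; backtrace_symbols() mallocs the array, caller frees. */
static void
stack_print(const struct stack_trace *tr)
{
	char **symbols = backtrace_symbols(tr->addrs, tr->depth);
	int i;

	if (symbols == NULL) {
		return;
	}
	for (i = 0; i < tr->depth; i++) {
		fprintf(stderr, "  #%d: %s\n", i, symbols[i]);
	}
	free(symbols);
}
```

Capturing only raw addresses at lock/unlock time (as `SSPIN_GET_STACK` does) is cheap; the expensive symbolization is deferred to the error path, the same split as `sspin_stack_print()` above.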
void
spdk_spin_init(struct spdk_spinlock *sspin)
{
@ -2920,7 +2975,10 @@ spdk_spin_init(struct spdk_spinlock *sspin)
memset(sspin, 0, sizeof(*sspin));
rc = pthread_spin_init(&sspin->spinlock, PTHREAD_PROCESS_PRIVATE);
SPIN_ASSERT_RETURN_VOID(rc == 0, SPIN_ERR_PTHREAD);
SPIN_ASSERT_LOG_STACKS(rc == 0, SPIN_ERR_PTHREAD, sspin);
sspin_init_internal(sspin);
SSPIN_GET_STACK(sspin, init);
sspin->initialized = true;
}
void
@ -2928,10 +2986,16 @@ spdk_spin_destroy(struct spdk_spinlock *sspin)
{
int rc;
SPIN_ASSERT_RETURN_VOID(sspin->thread == NULL, SPIN_ERR_LOCK_HELD);
SPIN_ASSERT_LOG_STACKS(!sspin->destroyed, SPIN_ERR_DESTROYED, sspin);
SPIN_ASSERT_LOG_STACKS(sspin->initialized, SPIN_ERR_NOT_INITIALIZED, sspin);
SPIN_ASSERT_LOG_STACKS(sspin->thread == NULL, SPIN_ERR_LOCK_HELD, sspin);
rc = pthread_spin_destroy(&sspin->spinlock);
SPIN_ASSERT_RETURN_VOID(rc == 0, SPIN_ERR_PTHREAD);
SPIN_ASSERT_LOG_STACKS(rc == 0, SPIN_ERR_PTHREAD, sspin);
sspin_fini_internal(sspin);
sspin->initialized = false;
sspin->destroyed = true;
}
void
@ -2940,14 +3004,18 @@ spdk_spin_lock(struct spdk_spinlock *sspin)
struct spdk_thread *thread = spdk_get_thread();
int rc;
SPIN_ASSERT_RETURN_VOID(thread != NULL, SPIN_ERR_NOT_SPDK_THREAD);
SPIN_ASSERT_RETURN_VOID(thread != sspin->thread, SPIN_ERR_DEADLOCK);
SPIN_ASSERT_LOG_STACKS(!sspin->destroyed, SPIN_ERR_DESTROYED, sspin);
SPIN_ASSERT_LOG_STACKS(sspin->initialized, SPIN_ERR_NOT_INITIALIZED, sspin);
SPIN_ASSERT_LOG_STACKS(thread != NULL, SPIN_ERR_NOT_SPDK_THREAD, sspin);
SPIN_ASSERT_LOG_STACKS(thread != sspin->thread, SPIN_ERR_DEADLOCK, sspin);
rc = pthread_spin_lock(&sspin->spinlock);
SPIN_ASSERT_RETURN_VOID(rc == 0, SPIN_ERR_PTHREAD);
SPIN_ASSERT_LOG_STACKS(rc == 0, SPIN_ERR_PTHREAD, sspin);
sspin->thread = thread;
sspin->thread->lock_count++;
SSPIN_GET_STACK(sspin, lock);
}
void
@ -2956,15 +3024,19 @@ spdk_spin_unlock(struct spdk_spinlock *sspin)
struct spdk_thread *thread = spdk_get_thread();
int rc;
SPIN_ASSERT_RETURN_VOID(thread != NULL, SPIN_ERR_NOT_SPDK_THREAD);
SPIN_ASSERT_RETURN_VOID(thread == sspin->thread, SPIN_ERR_WRONG_THREAD);
SPIN_ASSERT_LOG_STACKS(!sspin->destroyed, SPIN_ERR_DESTROYED, sspin);
SPIN_ASSERT_LOG_STACKS(sspin->initialized, SPIN_ERR_NOT_INITIALIZED, sspin);
SPIN_ASSERT_LOG_STACKS(thread != NULL, SPIN_ERR_NOT_SPDK_THREAD, sspin);
SPIN_ASSERT_LOG_STACKS(thread == sspin->thread, SPIN_ERR_WRONG_THREAD, sspin);
SPIN_ASSERT_RETURN_VOID(thread->lock_count > 0, SPIN_ERR_LOCK_COUNT);
SPIN_ASSERT_LOG_STACKS(thread->lock_count > 0, SPIN_ERR_LOCK_COUNT, sspin);
thread->lock_count--;
sspin->thread = NULL;
SSPIN_GET_STACK(sspin, unlock);
rc = pthread_spin_unlock(&sspin->spinlock);
SPIN_ASSERT_RETURN_VOID(rc == 0, SPIN_ERR_PTHREAD);
SPIN_ASSERT_LOG_STACKS(rc == 0, SPIN_ERR_PTHREAD, sspin);
}
bool
@ -2977,302 +3049,4 @@ spdk_spin_held(struct spdk_spinlock *sspin)
return sspin->thread == thread;
}
static int
iobuf_channel_create_cb(void *io_device, void *ctx)
{
struct iobuf_channel *ch = ctx;
STAILQ_INIT(&ch->small_queue);
STAILQ_INIT(&ch->large_queue);
return 0;
}
static void
iobuf_channel_destroy_cb(void *io_device, void *ctx)
{
struct iobuf_channel *ch __attribute__((unused)) = ctx;
assert(STAILQ_EMPTY(&ch->small_queue));
assert(STAILQ_EMPTY(&ch->large_queue));
}
int
spdk_iobuf_initialize(void)
{
struct spdk_iobuf_opts *opts = &g_iobuf.opts;
int rc = 0;
g_iobuf.small_pool = spdk_mempool_create("iobuf_small_pool", opts->small_pool_count,
opts->small_bufsize, 0, SPDK_ENV_SOCKET_ID_ANY);
if (!g_iobuf.small_pool) {
SPDK_ERRLOG("Failed to create small iobuf pool\n");
rc = -ENOMEM;
goto error;
}
g_iobuf.large_pool = spdk_mempool_create("iobuf_large_pool", opts->large_pool_count,
opts->large_bufsize, 0, SPDK_ENV_SOCKET_ID_ANY);
if (!g_iobuf.large_pool) {
SPDK_ERRLOG("Failed to create large iobuf pool\n");
rc = -ENOMEM;
goto error;
}
spdk_io_device_register(&g_iobuf, iobuf_channel_create_cb, iobuf_channel_destroy_cb,
sizeof(struct iobuf_channel), "iobuf");
return 0;
error:
spdk_mempool_free(g_iobuf.small_pool);
return rc;
}
static void
iobuf_unregister_cb(void *io_device)
{
struct iobuf_module *module;
while (!TAILQ_EMPTY(&g_iobuf.modules)) {
module = TAILQ_FIRST(&g_iobuf.modules);
TAILQ_REMOVE(&g_iobuf.modules, module, tailq);
free(module->name);
free(module);
}
if (spdk_mempool_count(g_iobuf.small_pool) != g_iobuf.opts.small_pool_count) {
SPDK_ERRLOG("small iobuf pool count is %zu, expected %"PRIu64"\n",
spdk_mempool_count(g_iobuf.small_pool), g_iobuf.opts.small_pool_count);
}
if (spdk_mempool_count(g_iobuf.large_pool) != g_iobuf.opts.large_pool_count) {
SPDK_ERRLOG("large iobuf pool count is %zu, expected %"PRIu64"\n",
spdk_mempool_count(g_iobuf.large_pool), g_iobuf.opts.large_pool_count);
}
spdk_mempool_free(g_iobuf.small_pool);
spdk_mempool_free(g_iobuf.large_pool);
if (g_iobuf.finish_cb != NULL) {
g_iobuf.finish_cb(g_iobuf.finish_arg);
}
}
void
spdk_iobuf_finish(spdk_iobuf_finish_cb cb_fn, void *cb_arg)
{
g_iobuf.finish_cb = cb_fn;
g_iobuf.finish_arg = cb_arg;
spdk_io_device_unregister(&g_iobuf, iobuf_unregister_cb);
}
int
spdk_iobuf_set_opts(const struct spdk_iobuf_opts *opts)
{
if (opts->small_pool_count < IOBUF_MIN_SMALL_POOL_SIZE) {
SPDK_ERRLOG("small_pool_count must be at least %" PRIu32 "\n",
IOBUF_MIN_SMALL_POOL_SIZE);
return -EINVAL;
}
if (opts->large_pool_count < IOBUF_MIN_LARGE_POOL_SIZE) {
SPDK_ERRLOG("large_pool_count must be at least %" PRIu32 "\n",
IOBUF_MIN_LARGE_POOL_SIZE);
return -EINVAL;
}
if (opts->small_bufsize < IOBUF_MIN_SMALL_BUFSIZE) {
SPDK_ERRLOG("small_bufsize must be at least %" PRIu32 "\n",
IOBUF_MIN_SMALL_BUFSIZE);
return -EINVAL;
}
if (opts->large_bufsize < IOBUF_MIN_LARGE_BUFSIZE) {
SPDK_ERRLOG("large_bufsize must be at least %" PRIu32 "\n",
IOBUF_MIN_LARGE_BUFSIZE);
return -EINVAL;
}
g_iobuf.opts = *opts;
return 0;
}
void
spdk_iobuf_get_opts(struct spdk_iobuf_opts *opts)
{
*opts = g_iobuf.opts;
}
int
spdk_iobuf_channel_init(struct spdk_iobuf_channel *ch, const char *name,
uint32_t small_cache_size, uint32_t large_cache_size)
{
struct spdk_io_channel *ioch;
struct iobuf_channel *iobuf_ch;
struct iobuf_module *module;
struct spdk_iobuf_buffer *buf;
uint32_t i;
TAILQ_FOREACH(module, &g_iobuf.modules, tailq) {
if (strcmp(name, module->name) == 0) {
break;
}
}
if (module == NULL) {
SPDK_ERRLOG("Couldn't find iobuf module: '%s'\n", name);
return -ENODEV;
}
ioch = spdk_get_io_channel(&g_iobuf);
if (ioch == NULL) {
SPDK_ERRLOG("Couldn't get iobuf IO channel\n");
return -ENOMEM;
}
iobuf_ch = spdk_io_channel_get_ctx(ioch);
ch->small.queue = &iobuf_ch->small_queue;
ch->large.queue = &iobuf_ch->large_queue;
ch->small.pool = g_iobuf.small_pool;
ch->large.pool = g_iobuf.large_pool;
ch->small.bufsize = g_iobuf.opts.small_bufsize;
ch->large.bufsize = g_iobuf.opts.large_bufsize;
ch->parent = ioch;
ch->module = module;
ch->small.cache_size = small_cache_size;
ch->large.cache_size = large_cache_size;
ch->small.cache_count = 0;
ch->large.cache_count = 0;
STAILQ_INIT(&ch->small.cache);
STAILQ_INIT(&ch->large.cache);
for (i = 0; i < small_cache_size; ++i) {
buf = spdk_mempool_get(g_iobuf.small_pool);
if (buf == NULL) {
SPDK_ERRLOG("Failed to populate iobuf small buffer cache. "
"You may need to increase spdk_iobuf_opts.small_pool_count\n");
goto error;
}
STAILQ_INSERT_TAIL(&ch->small.cache, buf, stailq);
ch->small.cache_count++;
}
for (i = 0; i < large_cache_size; ++i) {
buf = spdk_mempool_get(g_iobuf.large_pool);
if (buf == NULL) {
SPDK_ERRLOG("Failed to populate iobuf large buffer cache. "
"You may need to increase spdk_iobuf_opts.large_pool_count\n");
goto error;
}
STAILQ_INSERT_TAIL(&ch->large.cache, buf, stailq);
ch->large.cache_count++;
}
return 0;
error:
spdk_iobuf_channel_fini(ch);
return -ENOMEM;
}
void
spdk_iobuf_channel_fini(struct spdk_iobuf_channel *ch)
{
struct spdk_iobuf_entry *entry __attribute__((unused));
struct spdk_iobuf_buffer *buf;
/* Make sure none of the wait queue entries are coming from this module */
STAILQ_FOREACH(entry, ch->small.queue, stailq) {
assert(entry->module != ch->module);
}
STAILQ_FOREACH(entry, ch->large.queue, stailq) {
assert(entry->module != ch->module);
}
/* Release cached buffers back to the pool */
while (!STAILQ_EMPTY(&ch->small.cache)) {
buf = STAILQ_FIRST(&ch->small.cache);
STAILQ_REMOVE_HEAD(&ch->small.cache, stailq);
spdk_mempool_put(ch->small.pool, buf);
ch->small.cache_count--;
}
while (!STAILQ_EMPTY(&ch->large.cache)) {
buf = STAILQ_FIRST(&ch->large.cache);
STAILQ_REMOVE_HEAD(&ch->large.cache, stailq);
spdk_mempool_put(ch->large.pool, buf);
ch->large.cache_count--;
}
assert(ch->small.cache_count == 0);
assert(ch->large.cache_count == 0);
spdk_put_io_channel(ch->parent);
ch->parent = NULL;
}
int
spdk_iobuf_register_module(const char *name)
{
struct iobuf_module *module;
TAILQ_FOREACH(module, &g_iobuf.modules, tailq) {
if (strcmp(name, module->name) == 0) {
return -EEXIST;
}
}
module = calloc(1, sizeof(*module));
if (module == NULL) {
return -ENOMEM;
}
module->name = strdup(name);
if (module->name == NULL) {
free(module);
return -ENOMEM;
}
TAILQ_INSERT_TAIL(&g_iobuf.modules, module, tailq);
return 0;
}
int
spdk_iobuf_for_each_entry(struct spdk_iobuf_channel *ch, struct spdk_iobuf_pool *pool,
spdk_iobuf_for_each_entry_fn cb_fn, void *cb_ctx)
{
struct spdk_iobuf_entry *entry, *tmp;
int rc;
STAILQ_FOREACH_SAFE(entry, pool->queue, stailq, tmp) {
/* We only want to iterate over the entries requested by the module which owns ch */
if (entry->module != ch->module) {
continue;
}
rc = cb_fn(ch, entry, cb_ctx);
if (rc != 0) {
return rc;
}
}
return 0;
}
void
spdk_iobuf_entry_abort(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry,
uint64_t len)
{
struct spdk_iobuf_pool *pool;
if (len <= ch->small.bufsize) {
pool = &ch->small;
} else {
assert(len <= ch->large.bufsize);
pool = &ch->large;
}
STAILQ_REMOVE(pool->queue, entry, spdk_iobuf_entry, stailq);
}
SPDK_LOG_REGISTER_COMPONENT(thread)

View File

@ -73,7 +73,7 @@ spdk_pipe_writer_get_buffer(struct spdk_pipe *pipe, uint32_t requested_sz, struc
} else {
sz = spdk_min(requested_sz, read - write);
iovs[0].iov_base = (sz == 0) ? NULL : (pipe->buf + write);
iovs[0].iov_base = pipe->buf + write;
iovs[0].iov_len = sz;
iovs[1].iov_base = NULL;
iovs[1].iov_len = 0;
@ -156,7 +156,7 @@ spdk_pipe_reader_get_buffer(struct spdk_pipe *pipe, uint32_t requested_sz, struc
read = pipe->read;
write = pipe->write;
if (read == write && !pipe->full) {
if ((read == write && !pipe->full) || requested_sz == 0) {
iovs[0].iov_base = NULL;
iovs[0].iov_len = 0;
iovs[1].iov_base = NULL;
@ -164,14 +164,14 @@ spdk_pipe_reader_get_buffer(struct spdk_pipe *pipe, uint32_t requested_sz, struc
} else if (read < write) {
sz = spdk_min(requested_sz, write - read);
iovs[0].iov_base = (sz == 0) ? NULL : (pipe->buf + read);
iovs[0].iov_base = pipe->buf + read;
iovs[0].iov_len = sz;
iovs[1].iov_base = NULL;
iovs[1].iov_len = 0;
} else {
sz = spdk_min(requested_sz, pipe->sz - read);
iovs[0].iov_base = (sz == 0) ? NULL : (pipe->buf + read);
iovs[0].iov_base = pipe->buf + read;
iovs[0].iov_len = sz;
requested_sz -= sz;

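The change above makes `spdk_pipe_reader_get_buffer()` return empty iovecs when the pipe is empty or `requested_sz` is zero, instead of a zero-length iovec pointing into the buffer. The underlying wrap-around iovec computation can be sketched standalone; `ring_reader_iovs` and its parameter layout are illustrative, not the SPDK API:

```c
#include <assert.h>
#include <stddef.h>
#include <sys/uio.h>

#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif

/*
 * Fill up to two iovecs covering the readable region of a circular buffer.
 * full distinguishes read == write meaning "empty" from "completely full".
 * Returns the total number of readable bytes described.
 */
static size_t
ring_reader_iovs(char *buf, size_t bufsz, size_t read, size_t write, int full,
		 size_t requested_sz, struct iovec iovs[2])
{
	size_t sz;

	iovs[0].iov_base = NULL;
	iovs[0].iov_len = 0;
	iovs[1].iov_base = NULL;
	iovs[1].iov_len = 0;

	if ((read == write && !full) || requested_sz == 0) {
		return 0; /* empty pipe or nothing requested */
	}

	if (read < write) {
		/* contiguous region */
		sz = MIN(requested_sz, write - read);
		iovs[0].iov_base = buf + read;
		iovs[0].iov_len = sz;
		return sz;
	}

	/* region wraps: tail of the buffer first, then the head */
	sz = MIN(requested_sz, bufsz - read);
	iovs[0].iov_base = buf + read;
	iovs[0].iov_len = sz;
	requested_sz -= sz;

	if (requested_sz > 0 && write > 0) {
		iovs[1].iov_len = MIN(requested_sz, write);
		iovs[1].iov_base = buf;
	}
	return sz + iovs[1].iov_len;
}
```

With the empty/zero-request cases handled up front, the per-branch `(sz == 0) ? NULL : …` checks removed by this patch become unnecessary: each remaining branch can only produce a non-empty first iovec.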
View File

@ -1098,17 +1098,27 @@ enable_device_vq(struct spdk_vhost_session *vsession, uint16_t qid)
q->packed.used_phase = q->last_used_idx >> 15;
q->last_used_idx = q->last_used_idx & 0x7FFF;
if (!vsession->interrupt_mode) {
if (!spdk_interrupt_mode_is_enabled()) {
/* Disable I/O submission notifications, we'll be polling. */
q->vring.device_event->flags = VRING_PACKED_EVENT_FLAG_DISABLE;
} else {
/* Enable I/O submission notifications, we'll be interrupting. */
q->vring.device_event->flags = VRING_PACKED_EVENT_FLAG_ENABLE;
}
} else {
if (!vsession->interrupt_mode) {
if (!spdk_interrupt_mode_is_enabled()) {
/* Disable I/O submission notifications, we'll be polling. */
q->vring.used->flags = VRING_USED_F_NO_NOTIFY;
} else {
/* Enable I/O submission notifications, we'll be interrupting. */
q->vring.used->flags = 0;
}
}
if (spdk_interrupt_mode_is_enabled() && backend->register_vq_interrupt) {
backend->register_vq_interrupt(vsession, q);
}
q->packed.packed_ring = packed_ring;
vsession->max_queues = spdk_max(vsession->max_queues, qid + 1);
@ -1405,11 +1415,8 @@ void
vhost_user_session_set_interrupt_mode(struct spdk_vhost_session *vsession, bool interrupt_mode)
{
uint16_t i;
bool packed_ring;
int rc = 0;
packed_ring = ((vsession->negotiated_features & (1ULL << VIRTIO_F_RING_PACKED)) != 0);
for (i = 0; i < vsession->max_queues; i++) {
struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i];
uint64_t num_events = 1;
@ -1422,12 +1429,6 @@ vhost_user_session_set_interrupt_mode(struct spdk_vhost_session *vsession, bool
}
if (interrupt_mode) {
/* Enable I/O submission notifications, we'll be interrupting. */
if (packed_ring) {
* (volatile uint16_t *) &q->vring.device_event->flags = VRING_PACKED_EVENT_FLAG_ENABLE;
} else {
* (volatile uint16_t *) &q->vring.used->flags = 0;
}
/* In case of race condition, always kick vring when switch to intr */
rc = write(q->vring.kickfd, &num_events, sizeof(num_events));
@ -1437,19 +1438,12 @@ vhost_user_session_set_interrupt_mode(struct spdk_vhost_session *vsession, bool
vsession->interrupt_mode = true;
} else {
/* Disable I/O submission notifications, we'll be polling. */
if (packed_ring) {
* (volatile uint16_t *) &q->vring.device_event->flags = VRING_PACKED_EVENT_FLAG_DISABLE;
} else {
* (volatile uint16_t *) &q->vring.used->flags = VRING_USED_F_NO_NOTIFY;
}
vsession->interrupt_mode = false;
}
}
}
static int
extern_vhost_pre_msg_handler(int vid, void *_msg)
{

View File

@ -464,6 +464,8 @@ virtio_blk_process_request(struct spdk_vhost_dev *vdev, struct spdk_io_channel *
uint16_t iovcnt;
int rc;
assert(bvdev != NULL);
task->cb = cb;
task->cb_arg = cb_arg;
@ -1070,9 +1072,35 @@ vhost_blk_session_unregister_interrupts(struct spdk_vhost_blk_session *bvsession
}
}
static void
_vhost_blk_vq_register_interrupt(void *arg)
{
struct spdk_vhost_virtqueue *vq = arg;
struct spdk_vhost_session *vsession = vq->vsession;
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vsession->vdev);
if (bvdev->bdev) {
vq->intr = spdk_interrupt_register(vq->vring.kickfd, vdev_vq_worker, vq, "vdev_vq_worker");
} else {
vq->intr = spdk_interrupt_register(vq->vring.kickfd, no_bdev_vdev_vq_worker, vq,
"no_bdev_vdev_vq_worker");
}
if (vq->intr == NULL) {
SPDK_ERRLOG("Failed to register req notifier handler.\n");
assert(false);
}
}
static void
vhost_blk_vq_register_interrupt(struct spdk_vhost_session *vsession,
struct spdk_vhost_virtqueue *vq)
{
spdk_thread_send_msg(vsession->vdev->thread, _vhost_blk_vq_register_interrupt, vq);
}
static int
vhost_blk_session_register_interrupts(struct spdk_vhost_blk_session *bvsession,
spdk_interrupt_fn fn, const char *name)
vhost_blk_session_register_no_bdev_interrupts(struct spdk_vhost_blk_session *bvsession)
{
struct spdk_vhost_session *vsession = &bvsession->vsession;
struct spdk_vhost_virtqueue *vq = NULL;
@ -1083,19 +1111,18 @@ vhost_blk_session_register_interrupts(struct spdk_vhost_blk_session *bvsession,
vq = &vsession->virtqueue[i];
SPDK_DEBUGLOG(vhost_blk, "Registering vq[%d], kickfd %d\n",
i, vq->vring.kickfd);
vq->intr = spdk_interrupt_register(vq->vring.kickfd, fn, vq, name);
vq->intr = spdk_interrupt_register(vq->vring.kickfd, no_bdev_vdev_vq_worker, vq,
"no_bdev_vdev_vq_worker");
if (vq->intr == NULL) {
SPDK_ERRLOG("Failed to register req notifier handler.\n");
goto err;
}
}
return 0;
err:
vhost_blk_session_unregister_interrupts(bvsession);
return -1;
}
@ -1157,10 +1184,9 @@ vhost_user_session_bdev_remove_cb(struct spdk_vhost_dev *vdev,
bvsession = to_blk_session(vsession);
if (bvsession->requestq_poller) {
spdk_poller_unregister(&bvsession->requestq_poller);
if (vsession->virtqueue[0].intr) {
if (vsession->interrupt_mode) {
vhost_blk_session_unregister_interrupts(bvsession);
rc = vhost_blk_session_register_interrupts(bvsession, no_bdev_vdev_vq_worker,
"no_bdev_vdev_vq_worker");
rc = vhost_blk_session_register_no_bdev_interrupts(bvsession);
if (rc) {
SPDK_ERRLOG("%s: Interrupt register failed\n", vsession->name);
return rc;
@ -1209,6 +1235,8 @@ bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
struct spdk_vhost_dev *vdev = (struct spdk_vhost_dev *)event_ctx;
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
SPDK_DEBUGLOG(vhost_blk, "Bdev event: type %d, name %s\n",
type,
bdev->name);
@ -1292,7 +1320,7 @@ vhost_blk_start(struct spdk_vhost_dev *vdev,
{
struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
struct spdk_vhost_blk_dev *bvdev;
int i, rc = 0;
int i;
/* return if start is already in progress */
if (bvsession->requestq_poller) {
@ -1324,23 +1352,6 @@ vhost_blk_start(struct spdk_vhost_dev *vdev,
}
}
if (spdk_interrupt_mode_is_enabled()) {
if (bvdev->bdev) {
rc = vhost_blk_session_register_interrupts(bvsession,
vdev_vq_worker,
"vdev_vq_worker");
} else {
rc = vhost_blk_session_register_interrupts(bvsession,
no_bdev_vdev_vq_worker,
"no_bdev_vdev_vq_worker");
}
if (rc) {
SPDK_ERRLOG("%s: Interrupt register failed\n", vsession->name);
return rc;
}
}
if (bvdev->bdev) {
bvsession->requestq_poller = SPDK_POLLER_REGISTER(vdev_worker, bvsession, 0);
} else {
@ -1409,10 +1420,7 @@ vhost_blk_stop(struct spdk_vhost_dev *vdev,
}
spdk_poller_unregister(&bvsession->requestq_poller);
if (vsession->virtqueue[0].intr) {
vhost_blk_session_unregister_interrupts(bvsession);
}
/* vhost_user_session_send_event timeout is 3 seconds, here set retry within 4 seconds */
bvsession->vsession.stop_retry_count = 4000;
@ -1541,6 +1549,8 @@ vhost_blk_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
{
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
return bvdev->ops->set_coalescing(vdev, delay_base_us, iops_threshold);
}
@ -1550,6 +1560,8 @@ vhost_blk_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us,
{
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
bvdev->ops->get_coalescing(vdev, delay_base_us, iops_threshold);
}
@ -1558,6 +1570,7 @@ static const struct spdk_vhost_user_dev_backend vhost_blk_user_device_backend =
.start_session = vhost_blk_start,
.stop_session = vhost_blk_stop,
.alloc_vq_tasks = alloc_vq_task_pool,
.register_vq_interrupt = vhost_blk_vq_register_interrupt,
};
static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
@ -1577,6 +1590,8 @@ virtio_blk_construct_ctrlr(struct spdk_vhost_dev *vdev, const char *address,
{
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
return bvdev->ops->create_ctrlr(vdev, cpumask, address, params, (void *)user_backend);
}
@ -1666,6 +1681,8 @@ virtio_blk_destroy_ctrlr(struct spdk_vhost_dev *vdev)
{
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
return bvdev->ops->destroy_ctrlr(vdev);
}
@ -1702,6 +1719,8 @@ vhost_blk_get_io_channel(struct spdk_vhost_dev *vdev)
{
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
return spdk_bdev_get_io_channel(bvdev->bdev_desc);
}
@ -1759,6 +1778,8 @@ vhost_user_blk_create_ctrlr(struct spdk_vhost_dev *vdev, struct spdk_cpuset *cpu
struct rpc_vhost_blk req = {0};
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
assert(bvdev != NULL);
if (spdk_json_decode_object_relaxed(params, rpc_construct_vhost_blk,
SPDK_COUNTOF(rpc_construct_vhost_blk),
&req)) {


@ -224,6 +224,7 @@ struct spdk_vhost_user_dev_backend {
spdk_vhost_session_fn start_session;
spdk_vhost_session_fn stop_session;
int (*alloc_vq_tasks)(struct spdk_vhost_session *vsession, uint16_t qid);
void (*register_vq_interrupt)(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *vq);
};
enum vhost_backend_type {


@ -1,5 +1,6 @@
all
exclude_rule 'MD004'
rule 'MD007', :indent => 2
exclude_rule 'MD010'
rule 'MD013', :line_length => 170
exclude_rule 'MD024'


@ -97,8 +97,8 @@ COMMON_CFLAGS += -Werror
endif
ifeq ($(CONFIG_LTO),y)
COMMON_CFLAGS += -flto
LDFLAGS += -flto
COMMON_CFLAGS += -flto=auto
LDFLAGS += -flto=auto
endif
ifeq ($(CONFIG_PGO_CAPTURE),y)
@ -154,7 +154,7 @@ endif
SYS_LIBS =
ifeq ($(OS),FreeBSD)
SYS_LIBS += -L/usr/local/lib
SYS_LIBS += -lexecinfo -L/usr/local/lib
COMMON_CFLAGS += -I/usr/local/include
endif


@ -52,7 +52,7 @@ DEPDIRS-accel := log util thread json rpc jsonrpc dma
DEPDIRS-jsonrpc := log util json
DEPDIRS-virtio := log util json thread vfio_user
DEPDIRS-lvol := log util blob
DEPDIRS-lvol := log util blob thread
DEPDIRS-rpc := log util json jsonrpc
DEPDIRS-net := log util $(JSON_LIBS)
@ -140,7 +140,7 @@ DEPDIRS-bdev_compress := $(BDEV_DEPS_THREAD) reduce accel
DEPDIRS-bdev_crypto := $(BDEV_DEPS_THREAD) accel
DEPDIRS-bdev_delay := $(BDEV_DEPS_THREAD)
DEPDIRS-bdev_iscsi := $(BDEV_DEPS_THREAD)
DEPDIRS-bdev_malloc := $(BDEV_DEPS_THREAD) accel
DEPDIRS-bdev_malloc := $(BDEV_DEPS_THREAD) accel dma
DEPDIRS-bdev_null := $(BDEV_DEPS_THREAD)
DEPDIRS-bdev_nvme = $(BDEV_DEPS_THREAD) accel nvme trace
DEPDIRS-bdev_ocf := $(BDEV_DEPS_THREAD)


@ -5,7 +5,7 @@
*/
#include "accel_dpdk_compressdev.h"
#include "spdk_internal/accel_module.h"
#include "spdk/accel_module.h"
#include "spdk/stdinc.h"
#include "spdk/rpc.h"


@ -1,13 +1,13 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (C) 2018 Intel Corporation.
* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES.
* Copyright (c) 2022, 2023 NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
*/
#include "accel_dpdk_cryptodev.h"
#include "spdk/accel.h"
#include "spdk_internal/accel_module.h"
#include "spdk/accel_module.h"
#include "spdk/env.h"
#include "spdk/likely.h"
#include "spdk/thread.h"
@ -66,7 +66,6 @@
(ACCEL_DPDK_CRYPTODEV_DEFAULT_NUM_XFORMS * \
sizeof(struct rte_crypto_sym_xform)))
#define ACCEL_DPDK_CRYPTODEV_IV_LENGTH 16
#define ACCEL_DPDK_CRYPTODEV_QUEUED_OP_OFFSET (ACCEL_DPDK_CRYPTODEV_IV_OFFSET + ACCEL_DPDK_CRYPTODEV_IV_LENGTH)
/* Driver names */
#define ACCEL_DPDK_CRYPTODEV_AESNI_MB "crypto_aesni_mb"
@ -145,15 +144,6 @@ struct accel_dpdk_cryptodev_key_priv {
TAILQ_HEAD(, accel_dpdk_cryptodev_key_handle) dev_keys;
};
/* For queueing up crypto operations that we can't submit for some reason */
struct accel_dpdk_cryptodev_queued_op {
struct accel_dpdk_cryptodev_qp *qp;
struct rte_crypto_op *crypto_op;
struct accel_dpdk_cryptodev_task *task;
TAILQ_ENTRY(accel_dpdk_cryptodev_queued_op) link;
};
#define ACCEL_DPDK_CRYPTODEV_QUEUED_OP_LENGTH (sizeof(struct accel_dpdk_cryptodev_queued_op))
/* The crypto channel struct. It is allocated and freed on my behalf by the io channel code.
* We store things in here that are needed on per thread basis like the base_channel for this thread,
* and the poller for this thread.
@ -163,10 +153,11 @@ struct accel_dpdk_cryptodev_io_channel {
struct spdk_poller *poller;
/* Array of qpairs for each available device. The specific device will be selected depending on the crypto key */
struct accel_dpdk_cryptodev_qp *device_qp[ACCEL_DPDK_CRYPTODEV_DRIVER_LAST];
/* queued for re-submission to CryptoDev. Used when for some reason crypto op was not processed by the driver */
TAILQ_HEAD(, accel_dpdk_cryptodev_queued_op) queued_cry_ops;
/* Used to queue tasks when qpair is full. No crypto operation was submitted to the driver by the task */
/* Used to queue tasks when the qpair is full or only a part of the crypto ops was submitted to the PMD */
TAILQ_HEAD(, accel_dpdk_cryptodev_task) queued_tasks;
/* Used to queue tasks that were completed in the submission path, to avoid calling cpl_cb and
* possibly overflowing the call stack */
TAILQ_HEAD(, accel_dpdk_cryptodev_task) completed_tasks;
};
struct accel_dpdk_cryptodev_task {
@ -245,43 +236,6 @@ accel_dpdk_cryptodev_get_driver(void)
return g_driver_names[g_dpdk_cryptodev_driver];
}
static void
cancel_queued_crypto_ops(struct accel_dpdk_cryptodev_io_channel *crypto_ch,
struct accel_dpdk_cryptodev_task *task)
{
struct rte_mbuf *mbufs_to_free[2 * ACCEL_DPDK_CRYPTODEV_MAX_DEQUEUE_BURST_SIZE];
struct rte_crypto_op *cancelled_ops[ACCEL_DPDK_CRYPTODEV_MAX_DEQUEUE_BURST_SIZE];
struct accel_dpdk_cryptodev_queued_op *op_to_cancel, *tmp_op;
struct rte_crypto_op *crypto_op;
int num_mbufs = 0, num_dequeued_ops = 0;
/* Remove all ops from the failed IO. Since we don't know the
* order we have to check them all. */
TAILQ_FOREACH_SAFE(op_to_cancel, &crypto_ch->queued_cry_ops, link, tmp_op) {
/* Checking if this is our op. One IO contains multiple ops. */
if (task == op_to_cancel->task) {
crypto_op = op_to_cancel->crypto_op;
TAILQ_REMOVE(&crypto_ch->queued_cry_ops, op_to_cancel, link);
/* Populating lists for freeing mbufs and ops. */
mbufs_to_free[num_mbufs++] = (void *)crypto_op->sym->m_src;
if (crypto_op->sym->m_dst) {
mbufs_to_free[num_mbufs++] = (void *)crypto_op->sym->m_dst;
}
cancelled_ops[num_dequeued_ops++] = crypto_op;
}
}
/* Now bulk free both mbufs and crypto operations. */
if (num_dequeued_ops > 0) {
rte_mempool_put_bulk(g_crypto_op_mp, (void **)cancelled_ops,
num_dequeued_ops);
assert(num_mbufs > 0);
/* This also releases chained mbufs if any. */
rte_pktmbuf_free_bulk(mbufs_to_free, num_mbufs);
}
}
static inline uint16_t
accel_dpdk_cryptodev_poll_qp(struct accel_dpdk_cryptodev_qp *qp,
struct accel_dpdk_cryptodev_io_channel *crypto_ch)
@ -340,6 +294,11 @@ accel_dpdk_cryptodev_poll_qp(struct accel_dpdk_cryptodev_qp *qp,
if (rc == -ENOMEM) {
TAILQ_INSERT_TAIL(&crypto_ch->queued_tasks, task, link);
continue;
} else if (rc == -EALREADY) {
/* -EALREADY means that a task is completed, but it might be unsafe to complete
* it if we are in the submission path. Since we are in the poller context, we can
* complete the task immediately */
rc = 0;
}
spdk_accel_task_complete(&task->base, rc);
}
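The deferred-completion pattern introduced above (a submission path reports -EALREADY instead of invoking the completion callback, and the poller completes the task later) can be sketched in plain C. This is an illustrative model, not SPDK code; the names `task`, `channel`, `submit`, and `poller_drain` are invented for the example:

```c
/* Minimal sketch of deferred completion: tasks that finish synchronously in
 * the submit path are parked on a per-channel list instead of having their
 * callback invoked inline (which could recurse and overflow the stack).
 * A poller drains the list later, where calling the callback is safe. */
#include <assert.h>
#include <stddef.h>

struct task {
	struct task *next;
	void (*cb)(struct task *t, int status);
	int done;
};

struct channel {
	struct task *completed_head; /* tasks finished in the submit path */
};

/* Submit path: if the operation finished synchronously, defer the callback. */
static int
submit(struct channel *ch, struct task *t, int finished_now)
{
	if (finished_now) {
		/* analogous to -EALREADY: done, but unsafe to call cb here */
		t->next = ch->completed_head;
		ch->completed_head = t;
	}
	return 0;
}

/* Poller context: now it is safe to run user callbacks. */
static int
poller_drain(struct channel *ch)
{
	int n = 0;

	while (ch->completed_head != NULL) {
		struct task *t = ch->completed_head;

		ch->completed_head = t->next;
		t->cb(t, 0);
		n++;
	}
	return n;
}
```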
@ -369,10 +328,8 @@ accel_dpdk_cryptodev_poller(void *args)
struct accel_dpdk_cryptodev_io_channel *crypto_ch = args;
struct accel_dpdk_cryptodev_qp *qp;
struct accel_dpdk_cryptodev_task *task, *task_tmp;
struct accel_dpdk_cryptodev_queued_op *op_to_resubmit, *op_to_resubmit_tmp;
TAILQ_HEAD(, accel_dpdk_cryptodev_task) queued_tasks_tmp;
uint32_t num_dequeued_ops = 0, num_enqueued_ops = 0;
uint16_t enqueued;
uint32_t num_dequeued_ops = 0, num_enqueued_ops = 0, num_completed_tasks = 0;
int i, rc;
for (i = 0; i < ACCEL_DPDK_CRYPTODEV_DRIVER_LAST; i++) {
@ -383,42 +340,6 @@ accel_dpdk_cryptodev_poller(void *args)
}
}
/* Check if there are any queued crypto ops to process */
TAILQ_FOREACH_SAFE(op_to_resubmit, &crypto_ch->queued_cry_ops, link, op_to_resubmit_tmp) {
task = op_to_resubmit->task;
qp = op_to_resubmit->qp;
if (qp->num_enqueued_ops == qp->device->qp_desc_nr) {
continue;
}
enqueued = rte_cryptodev_enqueue_burst(qp->device->cdev_id,
qp->qp,
&op_to_resubmit->crypto_op,
1);
if (enqueued == 1) {
TAILQ_REMOVE(&crypto_ch->queued_cry_ops, op_to_resubmit, link);
qp->num_enqueued_ops++;
num_enqueued_ops++;
} else {
if (op_to_resubmit->crypto_op->status == RTE_CRYPTO_OP_STATUS_NOT_PROCESSED) {
/* If we couldn't get one, just break and try again later. */
break;
} else {
/* Something is really wrong with the op. Most probably the
* mbuf is broken or the HW is not able to process the request.
* Fail the IO and remove its ops from the queued ops list. */
task->is_failed = true;
cancel_queued_crypto_ops(crypto_ch, task);
task->cryop_completed++;
/* Fail the IO if there is nothing left on device. */
if (task->cryop_completed == task->cryop_submitted) {
spdk_accel_task_complete(&task->base, -EFAULT);
}
}
}
}
if (!TAILQ_EMPTY(&crypto_ch->queued_tasks)) {
TAILQ_INIT(&queued_tasks_tmp);
@ -431,8 +352,14 @@ accel_dpdk_cryptodev_poller(void *args)
/* Other queued tasks may belong to other qpairs,
* so process the whole list */
continue;
} else if (rc == -EALREADY) {
/* -EALREADY means that a task is completed, but it might be unsafe to complete
* it if we are in the submission path. Since we are in the poller context, we can
* complete the task immediately */
rc = 0;
}
spdk_accel_task_complete(&task->base, rc);
num_completed_tasks++;
} else {
num_enqueued_ops++;
}
@ -441,7 +368,13 @@ accel_dpdk_cryptodev_poller(void *args)
TAILQ_SWAP(&crypto_ch->queued_tasks, &queued_tasks_tmp, accel_dpdk_cryptodev_task, link);
}
return !!(num_dequeued_ops + num_enqueued_ops);
TAILQ_FOREACH_SAFE(task, &crypto_ch->completed_tasks, link, task_tmp) {
TAILQ_REMOVE(&crypto_ch->completed_tasks, task, link);
spdk_accel_task_complete(&task->base, 0);
num_completed_tasks++;
}
return !!(num_dequeued_ops + num_enqueued_ops + num_completed_tasks);
}
/* Allocate the new mbuf of @remainder size with data pointed by @addr and attach
@ -589,8 +522,10 @@ accel_dpdk_cryptodev_mbuf_add_single_block(struct spdk_iov_sgl *sgl, struct rte_
uint8_t *buf_addr;
uint64_t phys_len;
uint64_t remainder;
uint64_t buf_len = spdk_min(task->base.block_size, sgl->iov->iov_len - sgl->iov_offset);
uint64_t buf_len;
assert(sgl->iov->iov_len > sgl->iov_offset);
buf_len = spdk_min(task->base.block_size, sgl->iov->iov_len - sgl->iov_offset);
buf_addr = sgl->iov->iov_base + sgl->iov_offset;
phys_len = accel_dpdk_cryptodev_mbuf_attach_buf(task, mbuf, buf_addr, buf_len);
if (spdk_unlikely(phys_len == 0)) {
@ -626,6 +561,18 @@ accel_dpdk_cryptodev_op_set_iv(struct rte_crypto_op *crypto_op, uint64_t iv)
rte_memcpy(iv_ptr, &iv, sizeof(uint64_t));
}
static inline void
accel_dpdk_cryptodev_update_resources_from_pools(struct rte_crypto_op **crypto_ops,
struct rte_mbuf **src_mbufs, struct rte_mbuf **dst_mbufs,
uint32_t num_enqueued_ops, uint32_t cryop_cnt)
{
memmove(crypto_ops, &crypto_ops[num_enqueued_ops], sizeof(crypto_ops[0]) * cryop_cnt);
memmove(src_mbufs, &src_mbufs[num_enqueued_ops], sizeof(src_mbufs[0]) * cryop_cnt);
if (dst_mbufs) {
memmove(dst_mbufs, &dst_mbufs[num_enqueued_ops], sizeof(dst_mbufs[0]) * cryop_cnt);
}
}
static int
accel_dpdk_cryptodev_process_task(struct accel_dpdk_cryptodev_io_channel *crypto_ch,
struct accel_dpdk_cryptodev_task *task)
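The helper added above compacts each resource array after a partial enqueue: the first `num_enqueued_ops` entries now belong to the device, so the unsubmitted tail is shifted to the front and can then be freed (or retried) as one contiguous block. A standalone sketch of that `memmove` trick, with invented names:

```c
/* After a partial burst enqueue, drop the first num_enqueued entries (owned
 * by the device) by moving the remaining tail to the front of the array. */
#include <assert.h>
#include <stddef.h>
#include <string.h>

static void
compact_tail(int *arr, size_t num_enqueued, size_t remaining)
{
	/* regions may overlap, so memmove (not memcpy) is required */
	memmove(arr, &arr[num_enqueued], sizeof(arr[0]) * remaining);
}
```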
@ -637,7 +584,6 @@ accel_dpdk_cryptodev_process_task(struct accel_dpdk_cryptodev_io_channel *crypto
uint32_t sgl_offset;
uint32_t qp_capacity;
uint64_t iv_start;
struct accel_dpdk_cryptodev_queued_op *op_to_queue;
uint32_t i, crypto_index;
struct rte_crypto_op *crypto_ops[ACCEL_DPDK_CRYPTODEV_MAX_ENQUEUE_ARRAY_SIZE];
struct rte_mbuf *src_mbufs[ACCEL_DPDK_CRYPTODEV_MAX_ENQUEUE_ARRAY_SIZE];
@ -649,6 +595,7 @@ accel_dpdk_cryptodev_process_task(struct accel_dpdk_cryptodev_io_channel *crypto
struct accel_dpdk_cryptodev_device *dev;
struct spdk_iov_sgl src, dst = {};
int rc;
bool inplace = task->inplace;
if (spdk_unlikely(!task->base.crypto_key ||
task->base.crypto_key->module_if != &g_accel_dpdk_cryptodev_module)) {
@ -722,13 +669,11 @@ accel_dpdk_cryptodev_process_task(struct accel_dpdk_cryptodev_io_channel *crypto
return -EINVAL;
}
rc = accel_dpdk_cryptodev_task_alloc_resources(src_mbufs, task->inplace ? NULL : dst_mbufs,
rc = accel_dpdk_cryptodev_task_alloc_resources(src_mbufs, inplace ? NULL : dst_mbufs,
crypto_ops, cryop_cnt);
if (rc) {
return rc;
}
/* This value is used in the completion callback to determine when the accel task is complete. */
task->cryop_submitted += cryop_cnt;
/* As we don't support chaining because of a decision to use LBA as IV, construction
* of crypto operations is straightforward. We build both the op, the mbuf and the
@ -737,15 +682,17 @@ accel_dpdk_cryptodev_process_task(struct accel_dpdk_cryptodev_io_channel *crypto
* LBA sized chunk of memory will correspond 1:1 to a crypto operation and a single
* mbuf per crypto operation.
*/
spdk_iov_sgl_init(&src, task->base.s.iovs, task->base.s.iovcnt, sgl_offset);
if (!task->inplace) {
spdk_iov_sgl_init(&dst, task->base.d.iovs, task->base.d.iovcnt, sgl_offset);
spdk_iov_sgl_init(&src, task->base.s.iovs, task->base.s.iovcnt, 0);
spdk_iov_sgl_advance(&src, sgl_offset);
if (!inplace) {
spdk_iov_sgl_init(&dst, task->base.d.iovs, task->base.d.iovcnt, 0);
spdk_iov_sgl_advance(&dst, sgl_offset);
}
for (crypto_index = 0; crypto_index < cryop_cnt; crypto_index++) {
rc = accel_dpdk_cryptodev_mbuf_add_single_block(&src, src_mbufs[crypto_index], task);
if (spdk_unlikely(rc)) {
goto err_free_ops;
goto free_ops;
}
accel_dpdk_cryptodev_op_set_iv(crypto_ops[crypto_index], iv_start);
iv_start++;
@ -758,14 +705,14 @@ accel_dpdk_cryptodev_process_task(struct accel_dpdk_cryptodev_io_channel *crypto
/* link the mbuf to the crypto op. */
crypto_ops[crypto_index]->sym->m_src = src_mbufs[crypto_index];
if (task->inplace) {
if (inplace) {
crypto_ops[crypto_index]->sym->m_dst = NULL;
} else {
#ifndef __clang_analyzer__
/* scan-build thinks that dst_mbufs is not initialized */
rc = accel_dpdk_cryptodev_mbuf_add_single_block(&dst, dst_mbufs[crypto_index], task);
if (spdk_unlikely(rc)) {
goto err_free_ops;
goto free_ops;
}
crypto_ops[crypto_index]->sym->m_dst = dst_mbufs[crypto_index];
#endif
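The change above initializes each SGL at offset 0 and then advances it by `sgl_offset`, rather than seeding the offset at init time: an advance helper can walk across iovec boundaries, while a raw initial offset implicitly assumes it falls inside the first iovec. A simplified model of such an advance (illustrative types only, not the SPDK `spdk_iov_sgl` API):

```c
/* Walk a (iovec pointer, intra-iovec offset) cursor forward by `step` bytes,
 * crossing into subsequent iovecs as needed. The caller must not advance
 * past the end of the iovec array. */
#include <assert.h>
#include <stddef.h>
#include <sys/uio.h>

struct sgl_cursor {
	struct iovec *iov;
	size_t iov_offset;
};

static void
sgl_advance(struct sgl_cursor *s, size_t step)
{
	s->iov_offset += step;
	while (s->iov_offset >= s->iov->iov_len) {
		s->iov_offset -= s->iov->iov_len;
		s->iov++; /* move to the next iovec */
	}
}
```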
@ -776,25 +723,50 @@ accel_dpdk_cryptodev_process_task(struct accel_dpdk_cryptodev_io_channel *crypto
* configured the crypto device for.
*/
num_enqueued_ops = rte_cryptodev_enqueue_burst(dev->cdev_id, qp->qp, crypto_ops, cryop_cnt);
/* This value is used in the completion callback to determine when the accel task is complete. */
task->cryop_submitted += num_enqueued_ops;
qp->num_enqueued_ops += num_enqueued_ops;
/* We were unable to enqueue everything but did get some, so need to decide what
* to do based on the status of the last op.
*/
if (num_enqueued_ops < cryop_cnt) {
switch (crypto_ops[num_enqueued_ops]->status) {
case RTE_CRYPTO_OP_STATUS_NOT_PROCESSED:
/* Queue them up on a linked list to be resubmitted via the poller. */
for (crypto_index = num_enqueued_ops; crypto_index < cryop_cnt; crypto_index++) {
op_to_queue = (struct accel_dpdk_cryptodev_queued_op *)rte_crypto_op_ctod_offset(
crypto_ops[crypto_index],
uint8_t *, ACCEL_DPDK_CRYPTODEV_QUEUED_OP_OFFSET);
op_to_queue->qp = qp;
op_to_queue->crypto_op = crypto_ops[crypto_index];
op_to_queue->task = task;
TAILQ_INSERT_TAIL(&crypto_ch->queued_cry_ops, op_to_queue, link);
case RTE_CRYPTO_OP_STATUS_SUCCESS:
/* A crypto operation might be completed successfully, but enqueuing it to the completion ring might fail.
* That might happen with SW PMDs like openssl.
* We can't retry such an operation on the next turn, since if the crypto operation was inplace, we could
* encrypt/decrypt an already processed buffer. See github issue #2907 for more details.
* Handle this case as if the crypto op completed successfully: increment cryop_submitted and
* cryop_completed.
* We won't receive a completion for such an operation, so we need to clean up the mbufs and crypto_ops */
assert(task->cryop_total > task->cryop_completed);
task->cryop_completed++;
task->cryop_submitted++;
if (task->cryop_completed == task->cryop_total) {
assert(num_enqueued_ops == 0);
/* All crypto ops are completed. We can't complete the task immediately, since this function might be
* called in the scope of an spdk_accel_submit_* function, and the user's logic in the completion callback
* might lead to a stack overflow */
cryop_cnt -= num_enqueued_ops;
accel_dpdk_cryptodev_update_resources_from_pools(crypto_ops, src_mbufs, inplace ? NULL : dst_mbufs,
num_enqueued_ops, cryop_cnt);
rc = -EALREADY;
goto free_ops;
}
break;
/* fallthrough */
case RTE_CRYPTO_OP_STATUS_NOT_PROCESSED:
if (num_enqueued_ops == 0) {
/* Nothing was submitted. Free crypto ops and mbufs, treat this case as NOMEM */
rc = -ENOMEM;
goto free_ops;
}
/* Some of the crypto operations were not submitted; release their mbufs and crypto ops.
* The remaining crypto ops will be submitted again once the current batch is completed */
cryop_cnt -= num_enqueued_ops;
accel_dpdk_cryptodev_update_resources_from_pools(crypto_ops, src_mbufs, inplace ? NULL : dst_mbufs,
num_enqueued_ops, cryop_cnt);
rc = 0;
goto free_ops;
default:
/* For all other statuses, mark task as failed so that the poller will pick
* the failure up for the overall task status.
@ -805,7 +777,7 @@ accel_dpdk_cryptodev_process_task(struct accel_dpdk_cryptodev_io_channel *crypto
* busy, fail it now as the poller won't know anything about it.
*/
rc = -EINVAL;
goto err_free_ops;
goto free_ops;
}
break;
}
@ -814,8 +786,8 @@ accel_dpdk_cryptodev_process_task(struct accel_dpdk_cryptodev_io_channel *crypto
return 0;
/* Error cleanup paths. */
err_free_ops:
if (!task->inplace) {
free_ops:
if (!inplace) {
/* This also releases chained mbufs if any. */
rte_pktmbuf_free_bulk(dst_mbufs, cryop_cnt);
}
@ -935,10 +907,9 @@ _accel_dpdk_cryptodev_create_cb(void *io_device, void *ctx_buf)
return -EINVAL;
}
/* We use this to queue up crypto ops when the device is busy. */
TAILQ_INIT(&crypto_ch->queued_cry_ops);
/* We use this to queue tasks when qpair is full or no resources in pools */
TAILQ_INIT(&crypto_ch->queued_tasks);
TAILQ_INIT(&crypto_ch->completed_tasks);
return 0;
}
@ -991,9 +962,17 @@ accel_dpdk_cryptodev_submit_tasks(struct spdk_io_channel *_ch, struct spdk_accel
}
rc = accel_dpdk_cryptodev_process_task(ch, task);
if (spdk_unlikely(rc == -ENOMEM)) {
if (spdk_unlikely(rc)) {
if (rc == -ENOMEM) {
TAILQ_INSERT_TAIL(&ch->queued_tasks, task, link);
rc = 0;
} else if (rc == -EALREADY) {
/* -EALREADY means that a task is completed, but it might be unsafe to complete
* it if we are in the submission path. Hence put it into a dedicated queue and
* process it during polling */
TAILQ_INSERT_TAIL(&ch->completed_tasks, task, link);
rc = 0;
}
}
return rc;
@ -1244,7 +1223,7 @@ accel_dpdk_cryptodev_init(void)
g_crypto_op_mp = rte_crypto_op_pool_create("dpdk_crypto_op_mp",
RTE_CRYPTO_OP_TYPE_SYMMETRIC, ACCEL_DPDK_CRYPTODEV_NUM_MBUFS, ACCEL_DPDK_CRYPTODEV_POOL_CACHE_SIZE,
(ACCEL_DPDK_CRYPTODEV_DEFAULT_NUM_XFORMS * sizeof(struct rte_crypto_sym_xform)) +
ACCEL_DPDK_CRYPTODEV_IV_LENGTH + ACCEL_DPDK_CRYPTODEV_QUEUED_OP_LENGTH, rte_socket_id());
ACCEL_DPDK_CRYPTODEV_IV_LENGTH, rte_socket_id());
if (g_crypto_op_mp == NULL) {
SPDK_ERRLOG("Cannot create op pool\n");
rc = -ENOMEM;

Some files were not shown because too many files have changed in this diff.