lvol: lvol destruction race leads to null deref

As an lvolstore is being destroyed, _vbdev_lvs_remove() starts an
interation through the lvols to delete each one, ultimately leading to
the destruction of the lvolstore with a call to lvs_free(). The callback
passed to vbdev_lvs_destruct() is always called asynchronously via
spdk_io_device_unregister() in bs_free().

When the lvolstore resides on bdevs that perform async IO (i.e. most
bdevs other than malloc), this gives a small window when the lvol bdev
is not registered but a lookup with spdk_lvol_get_by_uuid() or
spdk_lvol_get_by_names() will succeed. If rpc_bdev_lvol_delete() runs
during this window, it can get a reference to an lvol that has just been
unregistered and lvol->blob may be NULL. This lvol is then passed to
vbdev_lvol_destroy().

Before this fix, vbdev_lvol_destroy() would call:

   spdk_blob_is_degraded(lvol->blob);

Which would then lead to a NULL pointer dereference, as
spdk_blob_is_degraded() assumes a valid blob is passed. While a NULL
check would avoid this particular problem, a NULL blob is not
necessarily caused by the condition described above. It would better to
flag the lvstore's destruction before returning from
vbdev_lvs_destruct() and use that flag to prevent operations on the
lvolstore that is being deleted. Such a flag already exists in the form
of 'lvs_bdev->req != NULL', but that is set too late to close this race.

This fix introduces lvs_bdev->removal_in_progress which is set prior to
returning from vbdev_lvs_unload() and vbdev_lvs_destruct(). It is
checked by vbdev_lvol_destroy() before trying to destroy the lvol.  Now,
any lvol destruction initiated by something other than
vbdev_lvs_destruct() while an lvolstore unload or destroy is in progress
will fail with -ENODEV.

Fixes issue: #2998

Signed-off-by: Mike Gerdts <mgerdts@nvidia.com>
Change-Id: I4d861879097703b0d8e3180e6de7ad6898f340fd
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/17891
Community-CI: Mellanox Build Bot
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
This commit is contained in:
Mike Gerdts 2023-05-01 13:02:47 -05:00 committed by David Ko
parent ee5ea993a0
commit b240c2b103
3 changed files with 129 additions and 7 deletions

View File

@ -41,6 +41,8 @@ struct spdk_bdev_module g_lvol_if = {
SPDK_BDEV_MODULE_REGISTER(lvol, &g_lvol_if) SPDK_BDEV_MODULE_REGISTER(lvol, &g_lvol_if)
static void _vbdev_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg);
struct lvol_store_bdev * struct lvol_store_bdev *
vbdev_get_lvs_bdev_by_lvs(struct spdk_lvol_store *lvs_orig) vbdev_get_lvs_bdev_by_lvs(struct spdk_lvol_store *lvs_orig)
{ {
@ -50,8 +52,11 @@ vbdev_get_lvs_bdev_by_lvs(struct spdk_lvol_store *lvs_orig)
while (lvs_bdev != NULL) { while (lvs_bdev != NULL) {
lvs = lvs_bdev->lvs; lvs = lvs_bdev->lvs;
if (lvs == lvs_orig) { if (lvs == lvs_orig) {
if (lvs_bdev->req != NULL) { if (lvs_bdev->removal_in_progress) {
/* We do not allow access to lvs that are being destroyed */ /* We do not allow access to lvs that are being unloaded or
* destroyed */
SPDK_DEBUGLOG(vbdev_lvol, "lvs %s: removal in progress\n",
lvs_orig->name);
return NULL; return NULL;
} else { } else {
return lvs_bdev; return lvs_bdev;
@ -120,8 +125,11 @@ vbdev_get_lvs_bdev_by_bdev(struct spdk_bdev *bdev_orig)
while (lvs_bdev != NULL) { while (lvs_bdev != NULL) {
if (lvs_bdev->bdev == bdev_orig) { if (lvs_bdev->bdev == bdev_orig) {
if (lvs_bdev->req != NULL) { if (lvs_bdev->removal_in_progress) {
/* We do not allow access to lvs that are being destroyed */ /* We do not allow access to lvs that are being unloaded or
* destroyed */
SPDK_DEBUGLOG(vbdev_lvol, "lvs %s: removal in progress\n",
lvs_bdev->lvs->name);
return NULL; return NULL;
} else { } else {
return lvs_bdev; return lvs_bdev;
@ -359,7 +367,7 @@ _vbdev_lvs_remove_lvol_cb(void *cb_arg, int lvolerrno)
lvol = TAILQ_FIRST(&lvs->lvols); lvol = TAILQ_FIRST(&lvs->lvols);
while (lvol != NULL) { while (lvol != NULL) {
if (spdk_lvol_deletable(lvol)) { if (spdk_lvol_deletable(lvol)) {
vbdev_lvol_destroy(lvol, _vbdev_lvs_remove_lvol_cb, lvs_bdev); _vbdev_lvol_destroy(lvol, _vbdev_lvs_remove_lvol_cb, lvs_bdev);
return; return;
} }
lvol = TAILQ_NEXT(lvol, link); lvol = TAILQ_NEXT(lvol, link);
@ -425,6 +433,8 @@ _vbdev_lvs_remove(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, void
return; return;
} }
lvs_bdev->removal_in_progress = true;
req->cb_fn = cb_fn; req->cb_fn = cb_fn;
req->cb_arg = cb_arg; req->cb_arg = cb_arg;
lvs_bdev->req = req; lvs_bdev->req = req;
@ -611,8 +621,8 @@ _vbdev_lvol_destroy_cb(void *cb_arg, int bdeverrno)
free(ctx); free(ctx);
} }
void static void
vbdev_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg) _vbdev_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg)
{ {
struct vbdev_lvol_destroy_ctx *ctx; struct vbdev_lvol_destroy_ctx *ctx;
size_t count; size_t count;
@ -620,6 +630,10 @@ vbdev_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb
assert(lvol != NULL); assert(lvol != NULL);
assert(cb_fn != NULL); assert(cb_fn != NULL);
/* Callers other than _vbdev_lvs_remove() must ensure the lvstore is not being removed. */
assert(cb_fn == _vbdev_lvs_remove_lvol_cb ||
vbdev_get_lvs_bdev_by_lvs(lvol->lvol_store) != NULL);
/* Check if it is possible to delete lvol */ /* Check if it is possible to delete lvol */
spdk_blob_get_clones(lvol->lvol_store->blobstore, lvol->blob_id, NULL, &count); spdk_blob_get_clones(lvol->lvol_store->blobstore, lvol->blob_id, NULL, &count);
if (count > 1) { if (count > 1) {
@ -647,6 +661,26 @@ vbdev_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb
spdk_bdev_unregister(lvol->bdev, _vbdev_lvol_destroy_cb, ctx); spdk_bdev_unregister(lvol->bdev, _vbdev_lvol_destroy_cb, ctx);
} }
void
vbdev_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg)
{
struct lvol_store_bdev *lvs_bdev;
/*
* During destruction of an lvolstore, _vbdev_lvs_unload() iterates through lvols until they
* are all deleted. There may be some IO required
*/
lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvol->lvol_store);
if (lvs_bdev == NULL) {
SPDK_DEBUGLOG(vbdev_lvol, "lvol %s: lvolstore is being removed\n",
lvol->unique_id);
cb_fn(cb_arg, -ENODEV);
return;
}
_vbdev_lvol_destroy(lvol, cb_fn, cb_arg);
}
static char * static char *
vbdev_lvol_find_name(struct spdk_lvol *lvol, spdk_blob_id blob_id) vbdev_lvol_find_name(struct spdk_lvol *lvol, spdk_blob_id blob_id)
{ {
@ -1086,6 +1120,7 @@ _create_lvol_disk(struct spdk_lvol *lvol, bool destroy)
lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvol->lvol_store); lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvol->lvol_store);
if (lvs_bdev == NULL) { if (lvs_bdev == NULL) {
SPDK_ERRLOG("No spdk lvs-bdev pair found for lvol %s\n", lvol->unique_id); SPDK_ERRLOG("No spdk lvs-bdev pair found for lvol %s\n", lvol->unique_id);
assert(false);
return -ENODEV; return -ENODEV;
} }

View File

@ -16,6 +16,7 @@ struct lvol_store_bdev {
struct spdk_lvol_store *lvs; struct spdk_lvol_store *lvs;
struct spdk_bdev *bdev; struct spdk_bdev *bdev;
struct spdk_lvs_req *req; struct spdk_lvs_req *req;
bool removal_in_progress;
TAILQ_ENTRY(lvol_store_bdev) lvol_stores; TAILQ_ENTRY(lvol_store_bdev) lvol_stores;
}; };

View File

@ -672,6 +672,91 @@ esnap_remove_degraded(void)
CU_ASSERT(rc == 0); CU_ASSERT(rc == 0);
} }
static void
late_delete(void)
{
char aiopath[PATH_MAX];
struct spdk_lvol_store *lvs = NULL;
const uint32_t bs_size_bytes = 10 * 1024 * 1024;
const uint32_t bs_block_size = 4096;
const uint32_t cluster_size = 32 * 1024;
struct op_with_handle_data owh_data;
struct lvol_bdev *lvol_bdev;
struct spdk_lvol *lvol;
int rc, rc2;
int count;
/* Create device for lvstore. This cannot be a malloc bdev because we need to sneak a
* deletion in while blob_persist() is in progress.
*/
rc = make_test_file(bs_size_bytes, aiopath, sizeof(aiopath), "late_delete.aio");
SPDK_CU_ASSERT_FATAL(rc == 0);
rc = create_aio_bdev("aio1", aiopath, bs_block_size, false);
SPDK_CU_ASSERT_FATAL(rc == 0);
poll_threads();
/* Create lvstore */
rc = vbdev_lvs_create("aio1", "lvs1", cluster_size, 0, 0,
lvs_op_with_handle_cb, clear_owh(&owh_data));
SPDK_CU_ASSERT_FATAL(rc == 0);
poll_error_updated(&owh_data.lvserrno);
SPDK_CU_ASSERT_FATAL(owh_data.lvserrno == 0);
SPDK_CU_ASSERT_FATAL(owh_data.u.lvs != NULL);
lvs = owh_data.u.lvs;
/* Create an lvol */
vbdev_lvol_create(lvs, "lvol", 1, true, LVOL_CLEAR_WITH_DEFAULT,
lvol_op_with_handle_cb, clear_owh(&owh_data));
poll_error_updated(&owh_data.lvserrno);
CU_ASSERT(owh_data.lvserrno == 0);
CU_ASSERT(owh_data.u.lvol != NULL);
/* Verify the lvol can be found both ways */
CU_ASSERT(spdk_bdev_get_by_name("lvs1/lvol") != NULL);
CU_ASSERT(spdk_lvol_get_by_names("lvs1", "lvol") != NULL);
/*
* Once the lvolstore deletion starts, it will briefly be possible to look up lvol bdevs and
* degraded lvols in that lvolstore but any attempt to delete them will fail with -ENODEV.
*/
rc = 0xbad;
count = 0;
vbdev_lvs_destruct(lvs, lvol_op_complete_cb, &rc);
do {
lvol_bdev = (struct lvol_bdev *)spdk_bdev_get_by_name("lvs1/lvol");
if (lvol_bdev != NULL) {
rc2 = 0xbad;
vbdev_lvol_destroy(lvol_bdev->lvol, lvol_op_complete_cb, &rc2);
CU_ASSERT(rc2 == -ENODEV);
}
lvol = spdk_lvol_get_by_names("lvs1", "lvol");
if (lvol != NULL) {
/* If we are here, we are likely reproducing #2998 */
rc2 = 0xbad;
vbdev_lvol_destroy(lvol, lvol_op_complete_cb, &rc2);
CU_ASSERT(rc2 == -ENODEV);
count++;
}
spdk_thread_poll(g_ut_threads[0].thread, 0, 0);
} while (rc == 0xbad);
/* Ensure that the test exercised the race */
CU_ASSERT(count != 0);
/* The lvol is now gone */
CU_ASSERT(spdk_bdev_get_by_name("lvs1/lvol") == NULL);
CU_ASSERT(spdk_lvol_get_by_names("lvs1", "lvol") == NULL);
/* Clean up */
rc = 0xbad;
bdev_aio_delete("aio1", unregister_cb, &rc);
poll_error_updated(&rc);
CU_ASSERT(rc == 0);
rc = unlink(aiopath);
CU_ASSERT(rc == 0);
}
static void static void
bdev_init_cb(void *arg, int rc) bdev_init_cb(void *arg, int rc)
{ {
@ -706,6 +791,7 @@ main(int argc, char **argv)
CU_ADD_TEST(suite, esnap_clone_io); CU_ADD_TEST(suite, esnap_clone_io);
CU_ADD_TEST(suite, esnap_hotplug); CU_ADD_TEST(suite, esnap_hotplug);
CU_ADD_TEST(suite, esnap_remove_degraded); CU_ADD_TEST(suite, esnap_remove_degraded);
CU_ADD_TEST(suite, late_delete);
allocate_threads(2); allocate_threads(2);
set_thread(0); set_thread(0);