rdma: Maintain per device PD which is persistent across reconnect

The SPDK NVMe RDMA initiator used the default PD of each RDMA device. The
default PD may change when all QPs for the RDMA device are destroyed and
created again.

For multipath, the RDMA zero copy feature requires the PD per RDMA device
to be persistent when all QPs for the RDMA device are destroyed and
created again.

Maintain such persistent PDs in this patch.

Add two APIs, spdk_rdma_get_pd() and spdk_rdma_put_pd().

On each call to either API, synchronize the RDMA device list with
rdma_get_devices().

A context may be deleted at any time by rdma-core. To prevent such deletion,
hold on to the array returned by rdma_get_devices().

Each RDMA device has a PD, a context, a ref. count, and a removed flag. If a
context is missing from the array returned by rdma_get_devices(), set the
removed flag to true. Then, if the ref. count becomes zero, free the PD and
the RDMA device.

The ref. count of a RDMA device is incremented when spdk_rdma_get_pd()
is called and decremented when spdk_rdma_put_pd() is called.

To simplify synchronization, sort the array returned by
rdma_get_devices().

To avoid resource leakage, add a destructor function and free all PDs
and related data at termination.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I093cb4ec2c7d8432642edfbffa270797ccf3e715
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/13769
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
This commit is contained in:
Shuhei Matsumoto 2022-07-25 14:32:44 +09:00 committed by Tomasz Zawadzki
parent 3403be99bb
commit b5f360c425
10 changed files with 497 additions and 1 deletions

View File

@ -273,4 +273,20 @@ spdk_rdma_memory_translation_get_rkey(struct spdk_rdma_memory_translation
translation->mr_or_key.mr->rkey : (uint32_t)translation->mr_or_key.key;
}
/**
 * Get a Protection Domain for an RDMA device context.
 *
 * Callers passing the same context receive the same Protection Domain. Every
 * successful call takes one reference on the underlying device; it is dropped
 * again by spdk_rdma_put_pd().
 *
 * \param context RDMA device context
 * \return Pointer to the Protection Domain associated with the context, or NULL
 * if the RDMA device list could not be synchronized or no usable device matches
 * the context.
 */
struct ibv_pd *
spdk_rdma_get_pd(struct ibv_context *context);
/**
 * Return a Protection Domain.
 *
 * Drops one reference previously taken by spdk_rdma_get_pd(). The Protection
 * Domain itself is only deallocated once its RDMA device has disappeared and no
 * reference remains.
 *
 * \param pd Pointer to the Protection Domain
 */
void spdk_rdma_put_pd(struct ibv_pd *pd);
#endif /* SPDK_RDMA_H */

View File

@ -7,7 +7,7 @@ SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
SO_VER := 4
SO_MINOR := 0
SO_MINOR := 1
SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_rdma.map)

View File

@ -13,6 +13,14 @@
#include "spdk_internal/rdma.h"
#include "spdk_internal/assert.h"
/* Book-keeping record for one RDMA device context; linked into g_dev_list. */
struct spdk_rdma_device {
/* Protection domain allocated for this context with ibv_alloc_pd(). */
struct ibv_pd *pd;
/* Device context this record was created for. */
struct ibv_context *context;
/* Number of outstanding spdk_rdma_get_pd() references. */
int ref;
/* Set once the context is no longer reported by rdma_get_devices(). */
bool removed;
TAILQ_ENTRY(spdk_rdma_device) tailq;
};
struct spdk_rdma_mem_map {
struct spdk_mem_map *map;
struct ibv_pd *pd;
@ -22,6 +30,10 @@ struct spdk_rdma_mem_map {
LIST_ENTRY(spdk_rdma_mem_map) link;
};
/* Held by spdk_rdma_get_pd()/spdk_rdma_put_pd() while g_ctx_list and g_dev_list are used. */
static pthread_mutex_t g_dev_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Most recent array returned by rdma_get_devices(), sorted by address; NULL until the first sync. */
static struct ibv_context **g_ctx_list = NULL;
/* All known RDMA devices, including removed ones that are still referenced. */
static TAILQ_HEAD(, spdk_rdma_device) g_dev_list = TAILQ_HEAD_INITIALIZER(g_dev_list);
static LIST_HEAD(, spdk_rdma_mem_map) g_rdma_mr_maps = LIST_HEAD_INITIALIZER(&g_rdma_mr_maps);
static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER;
@ -350,3 +362,209 @@ spdk_rdma_qp_flush_recv_wrs(struct spdk_rdma_qp *spdk_rdma_qp, struct ibv_recv_w
return rc;
}
/*
 * Create a device record for the given context, allocate its protection
 * domain and append the record to g_dev_list.
 *
 * Returns the new record, or NULL when either the record or the protection
 * domain cannot be allocated (nothing is added to the list in that case).
 */
static struct spdk_rdma_device *
rdma_add_dev(struct ibv_context *context)
{
	struct spdk_rdma_device *new_dev = calloc(1, sizeof(*new_dev));
	struct ibv_pd *new_pd;

	if (!new_dev) {
		SPDK_ERRLOG("Failed to allocate RDMA device object.\n");
		return NULL;
	}

	new_pd = ibv_alloc_pd(context);
	if (!new_pd) {
		SPDK_ERRLOG("ibv_alloc_pd() failed: %s (%d)\n", spdk_strerror(errno), errno);
		free(new_dev);
		return NULL;
	}

	/* Only fully initialized records ever become visible on the list. */
	new_dev->pd = new_pd;
	new_dev->context = context;
	TAILQ_INSERT_TAIL(&g_dev_list, new_dev, tailq);

	return new_dev;
}
/*
 * Release a device record, but only when it is both flagged as removed and
 * no longer referenced. In every other case the call is a no-op.
 */
static void
rdma_remove_dev(struct spdk_rdma_device *dev)
{
	bool releasable = dev->removed && dev->ref <= 0;

	if (releasable) {
		/* Unlink first, then drop the protection domain and the record itself. */
		TAILQ_REMOVE(&g_dev_list, dev, tailq);
		ibv_dealloc_pd(dev->pd);
		free(dev);
	}
}
/*
 * qsort() comparator that orders struct ibv_context pointers by address.
 *
 * _c1 and _c2 each point to an array element of type struct ibv_context *.
 * Returns -1 if the first pointer is lower, 1 if it is higher and 0 if both
 * are equal.
 */
static int
ctx_cmp(const void *_c1, const void *_c2)
{
	/* The elements are only read, so keep the const qualifier of the arguments. */
	const struct ibv_context *c1 = *(struct ibv_context *const *)_c1;
	const struct ibv_context *c2 = *(struct ibv_context *const *)_c2;

	if (c1 < c2) {
		return -1;
	}

	return c1 > c2;
}
/*
 * Reconcile g_dev_list with the contexts currently returned by
 * rdma_get_devices().
 *
 * Contexts seen for the first time get a device (and protection domain) via
 * rdma_add_dev(). Contexts that disappeared have their device flagged as
 * removed and passed to rdma_remove_dev(), which frees it only when it is no
 * longer referenced. The newly returned array then replaces g_ctx_list.
 *
 * Both callers in this file hold g_dev_mutex while calling this function.
 *
 * Returns 0 on success, or -ENODEV if rdma_get_devices() returned NULL or an
 * empty array; g_ctx_list and g_dev_list are left unchanged in that case.
 * The result of rdma_add_dev() is not checked, so a context whose device
 * could not be created simply has no entry in g_dev_list.
 */
static int
rdma_sync_dev_list(void)
{
struct ibv_context **new_ctx_list;
int i, j;
int num_devs = 0;
/*
 * rdma_get_devices() returns a NULL terminated array of opened RDMA devices,
 * and sets num_devs to the number of the returned devices.
 */
new_ctx_list = rdma_get_devices(&num_devs);
if (new_ctx_list == NULL) {
SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
return -ENODEV;
}
if (num_devs == 0) {
rdma_free_devices(new_ctx_list);
SPDK_ERRLOG("Returned RDMA device array was empty\n");
return -ENODEV;
}
/*
 * Sort new_ctx_list by addresses to update devices easily.
 */
qsort(new_ctx_list, num_devs, sizeof(struct ibv_context *), ctx_cmp);
if (g_ctx_list == NULL) {
/* If no old array, this is the first call. Add all devices. */
for (i = 0; new_ctx_list[i] != NULL; i++) {
rdma_add_dev(new_ctx_list[i]);
}
goto exit;
}
/*
 * Both arrays are sorted by address (g_ctx_list was sorted before it was
 * stored by an earlier call), so walk them in lockstep like a merge:
 * i indexes the new array, j indexes the old one.
 */
for (i = j = 0; new_ctx_list[i] != NULL || g_ctx_list[j] != NULL;) {
struct ibv_context *new_ctx = new_ctx_list[i];
struct ibv_context *old_ctx = g_ctx_list[j];
bool add = false, remove = false;
/*
 * If a context exists only in the new array, create a device for it,
 * or if a context exists only in the old array, try removing the
 * corresponding device.
 */
if (old_ctx == NULL) {
add = true;
} else if (new_ctx == NULL) {
remove = true;
} else if (new_ctx < old_ctx) {
add = true;
} else if (old_ctx < new_ctx) {
remove = true;
}
if (add) {
rdma_add_dev(new_ctx_list[i]);
i++;
} else if (remove) {
struct spdk_rdma_device *dev, *tmp;
/* The _SAFE variant is required because rdma_remove_dev() may free dev. */
TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
if (dev->context == g_ctx_list[j]) {
dev->removed = true;
rdma_remove_dev(dev);
}
}
j++;
} else {
/* The context is present in both arrays: nothing to update. */
i++;
j++;
}
}
/* Free the old array. */
rdma_free_devices(g_ctx_list);
exit:
/*
 * Keep the newly returned array so that allocated protection domains
 * are not freed unexpectedly.
 */
g_ctx_list = new_ctx_list;
return 0;
}
/*
 * Look up the protection domain that belongs to the given context and take a
 * reference on its device.
 *
 * The device list is synchronized first; devices flagged as removed are never
 * handed out. Returns NULL if the synchronization fails or no matching device
 * is found.
 */
struct ibv_pd *
spdk_rdma_get_pd(struct ibv_context *context)
{
	struct spdk_rdma_device *cur;
	struct ibv_pd *found_pd = NULL;

	pthread_mutex_lock(&g_dev_mutex);

	if (rdma_sync_dev_list() != 0) {
		pthread_mutex_unlock(&g_dev_mutex);
		SPDK_ERRLOG("Failed to sync RDMA device list\n");
		return NULL;
	}

	TAILQ_FOREACH(cur, &g_dev_list, tailq) {
		if (cur->removed || cur->context != context) {
			continue;
		}
		cur->ref++;
		found_pd = cur->pd;
		break;
	}

	pthread_mutex_unlock(&g_dev_mutex);

	/* cur is NULL only when the loop ran to the end without a match. */
	if (cur == NULL) {
		SPDK_ERRLOG("Failed to get PD\n");
		return NULL;
	}

	return found_pd;
}
/*
 * Drop one reference on the device that owns the given protection domain.
 *
 * The device list is synchronized after the reference has been dropped; the
 * result of that synchronization is not checked.
 */
void
spdk_rdma_put_pd(struct ibv_pd *pd)
{
	struct spdk_rdma_device *cur, *next;

	pthread_mutex_lock(&g_dev_mutex);

	/* rdma_remove_dev() may free the current entry, hence the safe iteration. */
	TAILQ_FOREACH_SAFE(cur, &g_dev_list, tailq, next) {
		if (cur->pd != pd) {
			continue;
		}
		assert(cur->ref > 0);
		cur->ref--;
		rdma_remove_dev(cur);
	}

	rdma_sync_dev_list();

	pthread_mutex_unlock(&g_dev_mutex);
}
__attribute__((destructor)) static void
_rdma_fini(void)
{
struct spdk_rdma_device *dev, *tmp;
TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
dev->removed = true;
dev->ref = 0;
rdma_remove_dev(dev);
}
if (g_ctx_list != NULL) {
rdma_free_devices(g_ctx_list);
g_ctx_list = NULL;
}
}

View File

@ -18,6 +18,8 @@
spdk_rdma_get_translation;
spdk_rdma_qp_queue_recv_wrs;
spdk_rdma_qp_flush_recv_wrs;
spdk_rdma_get_pd;
spdk_rdma_put_pd;
local: *;
};

View File

@ -13,6 +13,7 @@ DIRS-$(CONFIG_REDUCE) += reduce
ifeq ($(OS),Linux)
DIRS-$(CONFIG_VHOST) += vhost
DIRS-y += ftl
DIRS-$(CONFIG_RDMA) += rdma
endif
.PHONY: all clean $(DIRS-y)

View File

@ -0,0 +1,15 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
DIRS-y = common.c
.PHONY: all clean $(DIRS-y)
all: $(DIRS-y)
clean: $(DIRS-y)
include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk

View File

@ -0,0 +1 @@
common_ut

View File

@ -0,0 +1,9 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
TEST_FILE = common_ut.c
include $(SPDK_ROOT_DIR)/mk/spdk.unittest.mk

View File

@ -0,0 +1,233 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*/
#include "spdk/stdinc.h"
#include "spdk_cunit.h"
#include "spdk_internal/mock.h"
#include "common/lib/test_env.c"
#include "rdma/common.c"
/*
 * Stubs for the spdk_mem_map_* API; presumably required because the included
 * rdma/common.c references these functions. The test in this file does not
 * call them directly.
 */
DEFINE_STUB(spdk_mem_map_alloc, struct spdk_mem_map *, (uint64_t default_translation,
const struct spdk_mem_map_ops *ops, void *cb_ctx), NULL);
DEFINE_STUB_V(spdk_mem_map_free, (struct spdk_mem_map **pmap));
DEFINE_STUB(spdk_mem_map_set_translation, int, (struct spdk_mem_map *map, uint64_t vaddr,
uint64_t size, uint64_t translation), 0);
DEFINE_STUB(spdk_mem_map_clear_translation, int, (struct spdk_mem_map *map, uint64_t vaddr,
uint64_t size), 0);
DEFINE_STUB(spdk_mem_map_translate, uint64_t, (const struct spdk_mem_map *map, uint64_t vaddr,
uint64_t *size), 0);
/* Fake RDMA device used by the mocked rdma_get_devices() and ibv_alloc_pd(). */
struct ut_rdma_device {
/* Fake context value reported for this device. */
struct ibv_context *context;
/* When true, the mocks behave as if this device did not exist. */
bool removed;
TAILQ_ENTRY(ut_rdma_device) tailq;
};
/* All fake devices registered through ut_rdma_add_dev(). */
static TAILQ_HEAD(, ut_rdma_device) g_ut_dev_list = TAILQ_HEAD_INITIALIZER(g_ut_dev_list);
/*
 * Mock of rdma_get_devices(): build a freshly allocated, NULL terminated array
 * holding the contexts of all fake devices that are not flagged as removed.
 *
 * If num_devices is not NULL it receives the number of contexts in the array
 * (the terminator is not counted). The array is released with
 * rdma_free_devices().
 */
struct ibv_context **
rdma_get_devices(int *num_devices)
{
	struct ut_rdma_device *entry;
	struct ibv_context **list;
	int live = 0;
	int next = 0;

	/* First pass: count the visible devices to size the array. */
	TAILQ_FOREACH(entry, &g_ut_dev_list, tailq) {
		if (entry->removed) {
			continue;
		}
		live++;
	}

	list = malloc(sizeof(*list) * (live + 1));
	SPDK_CU_ASSERT_FATAL(list);

	/* Second pass: copy the visible contexts in list order. */
	TAILQ_FOREACH(entry, &g_ut_dev_list, tailq) {
		if (entry->removed) {
			continue;
		}
		list[next] = entry->context;
		next++;
	}
	list[next] = NULL;

	if (num_devices != NULL) {
		*num_devices = live;
	}

	return list;
}
/* Mock of rdma_free_devices(): release an array built by the mocked rdma_get_devices(). */
void
rdma_free_devices(struct ibv_context **list)
{
free(list);
}
/*
 * Mock of ibv_alloc_pd(): hand out a zeroed protection domain bound to the
 * given context, provided a fake device with that context exists and is not
 * flagged as removed. Returns NULL otherwise.
 */
struct ibv_pd *
ibv_alloc_pd(struct ibv_context *context)
{
	struct ut_rdma_device *entry;
	struct ibv_pd *new_pd;

	TAILQ_FOREACH(entry, &g_ut_dev_list, tailq) {
		if (entry->removed || entry->context != context) {
			continue;
		}

		/* First visible device with a matching context: allocate for it. */
		new_pd = calloc(1, sizeof(*new_pd));
		SPDK_CU_ASSERT_FATAL(new_pd);
		new_pd->context = context;
		return new_pd;
	}

	return NULL;
}
/* Mock of ibv_dealloc_pd(): free a protection domain from the mocked ibv_alloc_pd(); always returns 0. */
int
ibv_dealloc_pd(struct ibv_pd *pd)
{
free(pd);
return 0;
}
/*
 * Register a fake device with the given context at the tail of g_ut_dev_list.
 * Returns the new entry, or NULL if it cannot be allocated.
 */
static struct ut_rdma_device *
ut_rdma_add_dev(struct ibv_context *context)
{
	struct ut_rdma_device *entry = calloc(1, sizeof(*entry));

	if (entry != NULL) {
		entry->context = context;
		TAILQ_INSERT_TAIL(&g_ut_dev_list, entry, tailq);
	}

	return entry;
}
/* Unlink a fake device from g_ut_dev_list and free it. */
static void
ut_rdma_remove_dev(struct ut_rdma_device *ut_dev)
{
TAILQ_REMOVE(&g_ut_dev_list, ut_dev, tailq);
free(ut_dev);
}
/*
 * Test helper: return the first entry of g_dev_list whose context matches,
 * whether or not it is flagged as removed, or NULL if there is none.
 */
static struct spdk_rdma_device *
_rdma_get_dev(struct ibv_context *context)
{
	struct spdk_rdma_device *cur;

	TAILQ_FOREACH(cur, &g_dev_list, tailq) {
		if (cur->context == context) {
			return cur;
		}
	}

	return NULL;
}
/*
 * Exercise spdk_rdma_get_pd()/spdk_rdma_put_pd() while fake devices appear and
 * disappear, checking after each step which devices are tracked in g_dev_list.
 * Devices are hidden from and shown to the mocks by toggling their removed flag.
 */
static void
test_spdk_rdma_pd(void)
{
struct ut_rdma_device *ut_dev0, *ut_dev1, *ut_dev2;
struct ibv_pd *pd1, *pd1_1, *pd2;
ut_dev0 = ut_rdma_add_dev((struct ibv_context *)0xface);
SPDK_CU_ASSERT_FATAL(ut_dev0 != NULL);
ut_dev1 = ut_rdma_add_dev((struct ibv_context *)0xc0ffee);
SPDK_CU_ASSERT_FATAL(ut_dev1 != NULL);
ut_dev2 = ut_rdma_add_dev((struct ibv_context *)0xf00d);
SPDK_CU_ASSERT_FATAL(ut_dev2 != NULL);
/* Hide ut_dev2 so that only ut_dev0 and ut_dev1 are visible. */
ut_dev2->removed = true;
/* Call spdk_rdma_get_pd() to non-existent ut_dev2. */
pd2 = spdk_rdma_get_pd(ut_dev2->context);
/* Then, spdk_rdma_get_pd() should return NULL and g_dev_list should have dev0 and dev1. */
CU_ASSERT(pd2 == NULL);
CU_ASSERT(_rdma_get_dev(ut_dev0->context) != NULL);
CU_ASSERT(_rdma_get_dev(ut_dev1->context) != NULL);
CU_ASSERT(_rdma_get_dev(ut_dev2->context) == NULL);
/* Remove ut_dev0 and add ut_dev2. */
ut_dev0->removed = true;
ut_dev2->removed = false;
/* Call spdk_rdma_get_pd() to ut_dev1. */
pd1 = spdk_rdma_get_pd(ut_dev1->context);
/* Then, spdk_rdma_get_pd() should return pd1 and g_dev_list should have dev1 and dev2. */
CU_ASSERT(pd1 != NULL);
CU_ASSERT(_rdma_get_dev(ut_dev0->context) == NULL);
CU_ASSERT(_rdma_get_dev(ut_dev1->context) != NULL);
CU_ASSERT(_rdma_get_dev(ut_dev2->context) != NULL);
/* Remove ut_dev1. */
ut_dev1->removed = true;
/* Call spdk_rdma_get_pd() again to ut_dev1 which does not exist anymore. */
pd1_1 = spdk_rdma_get_pd(ut_dev1->context);
/*
 * Then, spdk_rdma_get_pd() should return NULL and g_dev_list should still
 * have dev1, because pd1 still holds a reference to it.
 */
CU_ASSERT(pd1_1 == NULL);
CU_ASSERT(_rdma_get_dev(ut_dev0->context) == NULL);
CU_ASSERT(_rdma_get_dev(ut_dev1->context) != NULL);
CU_ASSERT(_rdma_get_dev(ut_dev2->context) != NULL);
/* Call spdk_rdma_put_pd() to pd1. */
spdk_rdma_put_pd(pd1);
/* Then, dev1 should be removed from g_dev_list. */
CU_ASSERT(_rdma_get_dev(ut_dev0->context) == NULL);
CU_ASSERT(_rdma_get_dev(ut_dev1->context) == NULL);
CU_ASSERT(_rdma_get_dev(ut_dev2->context) != NULL);
/* Call spdk_rdma_get_pd() to ut_dev2. */
pd2 = spdk_rdma_get_pd(ut_dev2->context);
/* spdk_rdma_get_pd() should succeed and g_dev_list should still have dev2
 * even after spdk_rdma_put_pd() is called to pd2.
 */
CU_ASSERT(pd2 != NULL);
spdk_rdma_put_pd(pd2);
CU_ASSERT(_rdma_get_dev(ut_dev0->context) == NULL);
CU_ASSERT(_rdma_get_dev(ut_dev1->context) == NULL);
CU_ASSERT(_rdma_get_dev(ut_dev2->context) != NULL);
/* Run the library destructor by hand, then drop the fake devices. */
_rdma_fini();
ut_rdma_remove_dev(ut_dev0);
ut_rdma_remove_dev(ut_dev1);
ut_rdma_remove_dev(ut_dev2);
}
/*
 * Register the "rdma_common" suite with its single test, run it in verbose
 * basic mode and return the number of CUnit failures as the exit status.
 */
int
main(int argc, char **argv)
{
CU_pSuite suite = NULL;
unsigned int num_failures;
CU_set_error_action(CUEA_ABORT);
CU_initialize_registry();
suite = CU_add_suite("rdma_common", NULL, NULL);
CU_ADD_TEST(suite, test_spdk_rdma_pd);
CU_basic_set_mode(CU_BRM_VERBOSE);
CU_basic_run_tests();
/* Read the failure count before the registry is torn down. */
num_failures = CU_get_number_of_failures();
CU_cleanup_registry();
return num_failures;
}

View File

@ -231,6 +231,7 @@ run_test "unittest_lvol" $valgrind $testdir/lib/lvol/lvol.c/lvol_ut
if grep -q '#define SPDK_CONFIG_RDMA 1' $rootdir/include/spdk/config.h; then
run_test "unittest_nvme_rdma" $valgrind $testdir/lib/nvme/nvme_rdma.c/nvme_rdma_ut
run_test "unittest_nvmf_transport" $valgrind $testdir/lib/nvmf/transport.c/transport_ut
run_test "unittest_rdma" $valgrind $testdir/lib/rdma/common.c/common_ut
fi
if grep -q '#define SPDK_CONFIG_NVME_CUSE 1' $rootdir/include/spdk/config.h; then