nvmf/rdma: Factor out creating/destroying rdma resources into helper functions

The following patches will add support for dynamically creating/destroying resources for
IB devices. Make these resource management functions reusable.

Signed-off-by: sijie.sun <sijie.sun@smartx.com>
Change-Id: I471f7c9e1afd01d13936836b87d0ae72da5d98bc
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15614
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
Reviewed-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Author:    sijie.sun <sijie.sun@smartx.com>
Date:      2022-11-24 09:58:15 +08:00
Committer: Tomasz Zawadzki
parent     51124e5c35
commit     a5283034de
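
The helpers factored out below (create_ib_device(), destroy_ib_device(), generate_poll_fds(), free_poll_fds()) are written so that follow-up patches can call them outside of transport create/destroy. As a rough sketch of that intended reuse (not part of this commit; the function name and error-handling policy here are assumptions), a dynamic device-add path might look like:

/*
 * Illustrative sketch only, not part of this commit: a hypothetical
 * hot-add path built on the helpers factored out below. The function
 * name and error handling are assumptions about the follow-up patches.
 */
static int
nvmf_rdma_hot_add_device(struct spdk_nvmf_rdma_transport *rtransport,
			 struct ibv_context *context)
{
	struct spdk_nvmf_rdma_device *device = NULL;
	int rc;

	/* Allocates the device, queries its attributes, sets the async fd
	 * non-blocking, allocates the PD and memory map, and links the
	 * device into rtransport->devices. */
	rc = create_ib_device(rtransport, context, &device);
	if (rc != 0) {
		return rc;
	}

	/* Rebuild the pollfd array so the acceptor also polls the new
	 * device's async event fd. */
	free_poll_fds(rtransport);
	rc = generate_poll_fds(rtransport);
	if (rc != 0) {
		destroy_ib_device(rtransport, device);
		return rc;
	}

	return 0;
}

Device removal would mirror this by calling destroy_ib_device() and regenerating the poll fds.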


@@ -2362,13 +2362,136 @@ nvmf_rdma_is_rxe_device(struct spdk_nvmf_rdma_device *device)
 }
 
 static int nvmf_rdma_accept(void *ctx);
 
+static int
+create_ib_device(struct spdk_nvmf_rdma_transport *rtransport, struct ibv_context *context,
+		 struct spdk_nvmf_rdma_device **new_device)
+{
+	struct spdk_nvmf_rdma_device *device;
+	int flag = 0;
+	int rc = 0;
+
+	device = calloc(1, sizeof(*device));
+	if (!device) {
+		SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n");
+		return -ENOMEM;
+	}
+	device->context = context;
+	rc = ibv_query_device(device->context, &device->attr);
+	if (rc < 0) {
+		SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
+		free(device);
+		return rc;
+	}
+
+#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
+	if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) == 0) {
+		SPDK_WARNLOG("The libibverbs on this system supports SEND_WITH_INVALIDATE,");
+		SPDK_WARNLOG("but the device with vendor ID %u does not.\n", device->attr.vendor_id);
+	}
+
+	/**
+	 * The vendor ID is assigned by the IEEE and an ID of 0 implies Soft-RoCE.
+	 * The Soft-RoCE RXE driver does not currently support send with invalidate,
+	 * but incorrectly reports that it does. There are changes making their way
+	 * through the kernel now that will enable this feature. When they are merged,
+	 * we can conditionally enable this feature.
+	 *
+	 * TODO: enable this for versions of the kernel rxe driver that support it.
+	 */
+	if (nvmf_rdma_is_rxe_device(device)) {
+		device->attr.device_cap_flags &= ~(IBV_DEVICE_MEM_MGT_EXTENSIONS);
+	}
+#endif
+
+	/* set up device context async ev fd as NON_BLOCKING */
+	flag = fcntl(device->context->async_fd, F_GETFL);
+	rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK);
+	if (rc < 0) {
+		SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n");
+		free(device);
+		return rc;
+	}
+
+	TAILQ_INSERT_TAIL(&rtransport->devices, device, link);
+	SPDK_DEBUGLOG(rdma, "New device %p is added to RDMA transport\n", device);
+
+	if (g_nvmf_hooks.get_ibv_pd) {
+		device->pd = g_nvmf_hooks.get_ibv_pd(NULL, device->context);
+	} else {
+		device->pd = ibv_alloc_pd(device->context);
+	}
+
+	if (!device->pd) {
+		SPDK_ERRLOG("Unable to allocate protection domain.\n");
+		return -ENOMEM;
+	}
+
+	assert(device->map == NULL);
+
+	device->map = spdk_rdma_create_mem_map(device->pd, &g_nvmf_hooks, SPDK_RDMA_MEMORY_MAP_ROLE_TARGET);
+	if (!device->map) {
+		SPDK_ERRLOG("Unable to allocate memory map for listen address\n");
+		return -ENOMEM;
+	}
+
+	assert(device->map != NULL);
+	assert(device->pd != NULL);
+
+	if (new_device) {
+		*new_device = device;
+	}
+
+	return 0;
+}
+
+static void
+free_poll_fds(struct spdk_nvmf_rdma_transport *rtransport)
+{
+	if (rtransport->poll_fds) {
+		free(rtransport->poll_fds);
+		rtransport->poll_fds = NULL;
+	}
+
+	rtransport->npoll_fds = 0;
+}
+
+static int
+generate_poll_fds(struct spdk_nvmf_rdma_transport *rtransport)
+{
+	/* Set up poll descriptor array to monitor events from RDMA and IB
+	 * in a single poll syscall
+	 */
+	int device_count = 0;
+	int i = 0;
+	struct spdk_nvmf_rdma_device *device, *tmp;
+
+	TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) {
+		device_count++;
+	}
+
+	rtransport->npoll_fds = device_count + 1;
+
+	rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd));
+	if (rtransport->poll_fds == NULL) {
+		SPDK_ERRLOG("poll_fds allocation failed\n");
+		return -ENOMEM;
+	}
+
+	rtransport->poll_fds[i].fd = rtransport->event_channel->fd;
+	rtransport->poll_fds[i++].events = POLLIN;
+
+	TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) {
+		rtransport->poll_fds[i].fd = device->context->async_fd;
+		rtransport->poll_fds[i++].events = POLLIN;
+	}
+
+	return 0;
+}
+
 static struct spdk_nvmf_transport *
 nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
 {
 	int rc;
 	struct spdk_nvmf_rdma_transport *rtransport;
-	struct spdk_nvmf_rdma_device *device, *tmp;
+	struct spdk_nvmf_rdma_device *device;
 	struct ibv_context **contexts;
 	uint32_t i;
 	int flag;
@@ -2506,78 +2629,12 @@ nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
 	i = 0;
 	rc = 0;
 	while (contexts[i] != NULL) {
-		device = calloc(1, sizeof(*device));
-		if (!device) {
-			SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n");
-			rc = -ENOMEM;
-			break;
-		}
-		device->context = contexts[i];
-		rc = ibv_query_device(device->context, &device->attr);
+		rc = create_ib_device(rtransport, contexts[i], &device);
 		if (rc < 0) {
-			SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
-			free(device);
-			break;
-		}
-
-		max_device_sge = spdk_min(max_device_sge, device->attr.max_sge);
-
-#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
-		if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) == 0) {
-			SPDK_WARNLOG("The libibverbs on this system supports SEND_WITH_INVALIDATE,");
-			SPDK_WARNLOG("but the device with vendor ID %u does not.\n", device->attr.vendor_id);
-		}
-
-		/**
-		 * The vendor ID is assigned by the IEEE and an ID of 0 implies Soft-RoCE.
-		 * The Soft-RoCE RXE driver does not currently support send with invalidate,
-		 * but incorrectly reports that it does. There are changes making their way
-		 * through the kernel now that will enable this feature. When they are merged,
-		 * we can conditionally enable this feature.
-		 *
-		 * TODO: enable this for versions of the kernel rxe driver that support it.
-		 */
-		if (nvmf_rdma_is_rxe_device(device)) {
-			device->attr.device_cap_flags &= ~(IBV_DEVICE_MEM_MGT_EXTENSIONS);
-		}
-#endif
-
-		/* set up device context async ev fd as NON_BLOCKING */
-		flag = fcntl(device->context->async_fd, F_GETFL);
-		rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK);
-		if (rc < 0) {
-			SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n");
-			free(device);
 			break;
 		}
-
-		TAILQ_INSERT_TAIL(&rtransport->devices, device, link);
 		i++;
-
-		if (g_nvmf_hooks.get_ibv_pd) {
-			device->pd = g_nvmf_hooks.get_ibv_pd(NULL, device->context);
-		} else {
-			device->pd = ibv_alloc_pd(device->context);
-		}
-
-		if (!device->pd) {
-			SPDK_ERRLOG("Unable to allocate protection domain.\n");
-			rc = -ENOMEM;
-			break;
-		}
-
-		assert(device->map == NULL);
-
-		device->map = spdk_rdma_create_mem_map(device->pd, &g_nvmf_hooks, SPDK_RDMA_MEMORY_MAP_ROLE_TARGET);
-		if (!device->map) {
-			SPDK_ERRLOG("Unable to allocate memory map for listen address\n");
-			rc = -ENOMEM;
-			break;
-		}
-
-		assert(device->map != NULL);
-		assert(device->pd != NULL);
+		max_device_sge = spdk_min(max_device_sge, device->attr.max_sge);
 	}
 	rdma_free_devices(contexts);
@@ -2598,26 +2655,12 @@ nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
 		return NULL;
 	}
 
-	/* Set up poll descriptor array to monitor events from RDMA and IB
-	 * in a single poll syscall
-	 */
-	rtransport->npoll_fds = i + 1;
-	i = 0;
-	rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd));
-	if (rtransport->poll_fds == NULL) {
-		SPDK_ERRLOG("poll_fds allocation failed\n");
+	rc = generate_poll_fds(rtransport);
+	if (rc < 0) {
 		nvmf_rdma_destroy(&rtransport->transport, NULL, NULL);
 		return NULL;
 	}
 
-	rtransport->poll_fds[i].fd = rtransport->event_channel->fd;
-	rtransport->poll_fds[i++].events = POLLIN;
-
-	TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) {
-		rtransport->poll_fds[i].fd = device->context->async_fd;
-		rtransport->poll_fds[i++].events = POLLIN;
-	}
-
 	rtransport->accept_poller = SPDK_POLLER_REGISTER(nvmf_rdma_accept, &rtransport->transport,
 					 opts->acceptor_poll_rate);
 	if (!rtransport->accept_poller) {
@@ -2628,6 +2671,20 @@ nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
 	return &rtransport->transport;
 }
 
+static void
+destroy_ib_device(struct spdk_nvmf_rdma_transport *rtransport,
+		  struct spdk_nvmf_rdma_device *device)
+{
+	TAILQ_REMOVE(&rtransport->devices, device, link);
+	spdk_rdma_free_mem_map(&device->map);
+	if (device->pd) {
+		if (!g_nvmf_hooks.get_ibv_pd) {
+			ibv_dealloc_pd(device->pd);
+		}
+	}
+	free(device);
+}
+
 static void
 nvmf_rdma_dump_opts(struct spdk_nvmf_transport *transport, struct spdk_json_write_ctx *w)
 {
@@ -2660,23 +2717,14 @@ nvmf_rdma_destroy(struct spdk_nvmf_transport *transport,
 		free(port);
 	}
 
-	if (rtransport->poll_fds != NULL) {
-		free(rtransport->poll_fds);
-	}
+	free_poll_fds(rtransport);
 
 	if (rtransport->event_channel != NULL) {
 		rdma_destroy_event_channel(rtransport->event_channel);
 	}
 
 	TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) {
-		TAILQ_REMOVE(&rtransport->devices, device, link);
-		spdk_rdma_free_mem_map(&device->map);
-		if (device->pd) {
-			if (!g_nvmf_hooks.get_ibv_pd) {
-				ibv_dealloc_pd(device->pd);
-			}
-		}
-		free(device);
+		destroy_ib_device(rtransport, device);
 	}
 
 	if (rtransport->data_wr_pool != NULL) {
@@ -3390,6 +3438,86 @@ nvmf_rdma_discover(struct spdk_nvmf_transport *transport,
 	entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM;
 }
 
+static int
+nvmf_rdma_poller_create(struct spdk_nvmf_rdma_transport *rtransport,
+			struct spdk_nvmf_rdma_poll_group *rgroup, struct spdk_nvmf_rdma_device *device,
+			struct spdk_nvmf_rdma_poller **out_poller)
+{
+	struct spdk_nvmf_rdma_poller *poller;
+	struct spdk_rdma_srq_init_attr srq_init_attr;
+	struct spdk_nvmf_rdma_resource_opts opts;
+	int num_cqe;
+
+	poller = calloc(1, sizeof(*poller));
+	if (!poller) {
+		SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n");
+		return -1;
+	}
+
+	poller->device = device;
+	poller->group = rgroup;
+	*out_poller = poller;
+
+	RB_INIT(&poller->qpairs);
+	STAILQ_INIT(&poller->qpairs_pending_send);
+	STAILQ_INIT(&poller->qpairs_pending_recv);
+
+	TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link);
+	SPDK_DEBUGLOG(rdma, "Create poller %p on device %p in poll group %p.\n", poller, device, rgroup);
+	if (rtransport->rdma_opts.no_srq == false && device->num_srq < device->attr.max_srq) {
+		if ((int)rtransport->rdma_opts.max_srq_depth > device->attr.max_srq_wr) {
+			SPDK_WARNLOG("Requested SRQ depth %u, max supported by dev %s is %d\n",
+				     rtransport->rdma_opts.max_srq_depth, device->context->device->name, device->attr.max_srq_wr);
+		}
+		poller->max_srq_depth = spdk_min((int)rtransport->rdma_opts.max_srq_depth, device->attr.max_srq_wr);
+
+		device->num_srq++;
+		memset(&srq_init_attr, 0, sizeof(srq_init_attr));
+		srq_init_attr.pd = device->pd;
+		srq_init_attr.stats = &poller->stat.qp_stats.recv;
+		srq_init_attr.srq_init_attr.attr.max_wr = poller->max_srq_depth;
+		srq_init_attr.srq_init_attr.attr.max_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_RX_SGE);
+		poller->srq = spdk_rdma_srq_create(&srq_init_attr);
+		if (!poller->srq) {
+			SPDK_ERRLOG("Unable to create shared receive queue, errno %d\n", errno);
+			return -1;
+		}
+
+		opts.qp = poller->srq;
+		opts.map = device->map;
+		opts.qpair = NULL;
+		opts.shared = true;
+		opts.max_queue_depth = poller->max_srq_depth;
+		opts.in_capsule_data_size = rtransport->transport.opts.in_capsule_data_size;
+
+		poller->resources = nvmf_rdma_resources_create(&opts);
+		if (!poller->resources) {
+			SPDK_ERRLOG("Unable to allocate resources for shared receive queue.\n");
+			return -1;
+		}
+	}
+
+	/*
+	 * When using an srq, we can limit the completion queue at startup.
+	 * The following formula represents the calculation:
+	 * num_cqe = num_recv + num_data_wr + num_send_wr,
+	 * where num_recv = num_data_wr = num_send_wr = poller->max_srq_depth.
+	 */
+	if (poller->srq) {
+		num_cqe = poller->max_srq_depth * 3;
+	} else {
+		num_cqe = rtransport->rdma_opts.num_cqe;
+	}
+
+	poller->cq = ibv_create_cq(device->context, num_cqe, poller, NULL, 0);
+	if (!poller->cq) {
+		SPDK_ERRLOG("Unable to create completion queue\n");
+		return -1;
+	}
+	poller->num_cqe = num_cqe;
+	return 0;
+}
+
 static void nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group);
 
 static struct spdk_nvmf_transport_poll_group *
@@ -3400,9 +3528,7 @@ nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport,
 	struct spdk_nvmf_rdma_poll_group *rgroup;
 	struct spdk_nvmf_rdma_poller *poller;
 	struct spdk_nvmf_rdma_device *device;
-	struct spdk_rdma_srq_init_attr srq_init_attr;
-	struct spdk_nvmf_rdma_resource_opts opts;
-	int num_cqe;
+	int rc;
 
 	rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
@@ -3414,75 +3540,11 @@ nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport,
 	TAILQ_INIT(&rgroup->pollers);
 
 	TAILQ_FOREACH(device, &rtransport->devices, link) {
-		poller = calloc(1, sizeof(*poller));
-		if (!poller) {
-			SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n");
+		rc = nvmf_rdma_poller_create(rtransport, rgroup, device, &poller);
+		if (rc < 0) {
 			nvmf_rdma_poll_group_destroy(&rgroup->group);
 			return NULL;
 		}
-
-		poller->device = device;
-		poller->group = rgroup;
-
-		RB_INIT(&poller->qpairs);
-		STAILQ_INIT(&poller->qpairs_pending_send);
-		STAILQ_INIT(&poller->qpairs_pending_recv);
-
-		TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link);
-		if (rtransport->rdma_opts.no_srq == false && device->num_srq < device->attr.max_srq) {
-			if ((int)rtransport->rdma_opts.max_srq_depth > device->attr.max_srq_wr) {
-				SPDK_WARNLOG("Requested SRQ depth %u, max supported by dev %s is %d\n",
-					     rtransport->rdma_opts.max_srq_depth, device->context->device->name, device->attr.max_srq_wr);
-			}
-			poller->max_srq_depth = spdk_min((int)rtransport->rdma_opts.max_srq_depth, device->attr.max_srq_wr);
-
-			device->num_srq++;
-			memset(&srq_init_attr, 0, sizeof(srq_init_attr));
-			srq_init_attr.pd = device->pd;
-			srq_init_attr.stats = &poller->stat.qp_stats.recv;
-			srq_init_attr.srq_init_attr.attr.max_wr = poller->max_srq_depth;
-			srq_init_attr.srq_init_attr.attr.max_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_RX_SGE);
-			poller->srq = spdk_rdma_srq_create(&srq_init_attr);
-			if (!poller->srq) {
-				SPDK_ERRLOG("Unable to create shared receive queue, errno %d\n", errno);
-				nvmf_rdma_poll_group_destroy(&rgroup->group);
-				return NULL;
-			}
-
-			opts.qp = poller->srq;
-			opts.map = device->map;
-			opts.qpair = NULL;
-			opts.shared = true;
-			opts.max_queue_depth = poller->max_srq_depth;
-			opts.in_capsule_data_size = transport->opts.in_capsule_data_size;
-
-			poller->resources = nvmf_rdma_resources_create(&opts);
-			if (!poller->resources) {
-				SPDK_ERRLOG("Unable to allocate resources for shared receive queue.\n");
-				nvmf_rdma_poll_group_destroy(&rgroup->group);
-				return NULL;
-			}
-		}
-
-		/*
-		 * When using an srq, we can limit the completion queue at startup.
-		 * The following formula represents the calculation:
-		 * num_cqe = num_recv + num_data_wr + num_send_wr.
-		 * where num_recv=num_data_wr=and num_send_wr=poller->max_srq_depth
-		 */
-		if (poller->srq) {
-			num_cqe = poller->max_srq_depth * 3;
-		} else {
-			num_cqe = rtransport->rdma_opts.num_cqe;
-		}
-
-		poller->cq = ibv_create_cq(device->context, num_cqe, poller, NULL, 0);
-		if (!poller->cq) {
-			SPDK_ERRLOG("Unable to create completion queue\n");
-			nvmf_rdma_poll_group_destroy(&rgroup->group);
-			return NULL;
-		}
-		poller->num_cqe = num_cqe;
 	}
 
 	TAILQ_INSERT_TAIL(&rtransport->poll_groups, rgroup, link);
@@ -3566,12 +3628,34 @@ nvmf_rdma_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair)
 	return result;
 }
 
+static void
+nvmf_rdma_poller_destroy(struct spdk_nvmf_rdma_poller *poller)
+{
+	struct spdk_nvmf_rdma_qpair *qpair, *tmp_qpair;
+
+	RB_FOREACH_SAFE(qpair, qpairs_tree, &poller->qpairs, tmp_qpair) {
+		nvmf_rdma_qpair_destroy(qpair);
+	}
+
+	if (poller->srq) {
+		if (poller->resources) {
+			nvmf_rdma_resources_destroy(poller->resources);
+		}
+		spdk_rdma_srq_destroy(poller->srq);
+		SPDK_DEBUGLOG(rdma, "Destroyed RDMA shared queue %p\n", poller->srq);
+	}
+
+	if (poller->cq) {
+		ibv_destroy_cq(poller->cq);
+	}
+	free(poller);
+}
+
 static void
 nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
 {
 	struct spdk_nvmf_rdma_poll_group *rgroup, *next_rgroup;
 	struct spdk_nvmf_rdma_poller *poller, *tmp;
-	struct spdk_nvmf_rdma_qpair *qpair, *tmp_qpair;
 	struct spdk_nvmf_rdma_transport *rtransport;
 
 	rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
@@ -3581,24 +3665,7 @@ nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
 	TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) {
 		TAILQ_REMOVE(&rgroup->pollers, poller, link);
-
-		RB_FOREACH_SAFE(qpair, qpairs_tree, &poller->qpairs, tmp_qpair) {
-			nvmf_rdma_qpair_destroy(qpair);
-		}
-
-		if (poller->srq) {
-			if (poller->resources) {
-				nvmf_rdma_resources_destroy(poller->resources);
-			}
-			spdk_rdma_srq_destroy(poller->srq);
-			SPDK_DEBUGLOG(rdma, "Destroyed RDMA shared queue %p\n", poller->srq);
-		}
-
-		if (poller->cq) {
-			ibv_destroy_cq(poller->cq);
-		}
-		free(poller);
+		nvmf_rdma_poller_destroy(poller);
 	}
 
 	if (rgroup->group.transport == NULL) {