The SPDK NVMe RDMA initiator used the default PD per RDMA device. The default PD may change when all QPs for an RDMA device are destroyed and created again. For multipath, the RDMA zero copy feature requires the PD of each RDMA device to persist across such QP destruction and re-creation. This patch maintains such persistent PDs.

Add two APIs, spdk_rdma_get_pd() and spdk_rdma_put_pd(). Each call to these APIs synchronizes the RDMA device list with rdma_get_devices(). A context may be deleted at any time by rdma-core; to avoid such deletion, hold the array returned by rdma_get_devices(). An RDMA device holds a PD, a context, a reference count, and a removed flag. If a context is missing from rdma_get_devices(), set the removed flag to true. Then, once the reference count reaches zero, free the PD and the RDMA device. The reference count of an RDMA device is incremented by spdk_rdma_get_pd() and decremented by spdk_rdma_put_pd(). To simplify synchronization, sort the array returned by rdma_get_devices(). To avoid resource leakage, add a destructor function that frees all PDs and related data at termination.

Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Change-Id: I093cb4ec2c7d8432642edfbffa270797ccf3e715
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/13769
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Aleksey Marchuk <alexeymar@nvidia.com>
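For context, a minimal usage sketch of the two new APIs follows. It is not part of this patch: the rdma_cm setup (cm_id, qp_init_attr) is assumed for illustration and the helper name is hypothetical; only spdk_rdma_get_pd()/spdk_rdma_put_pd() come from this change.

/* Hypothetical helper, for illustration only: take a reference on the
 * device's persistent PD and create a QP on it. The PD stays valid even
 * if all QPs on this device are destroyed, as long as the reference is
 * held; release it with spdk_rdma_put_pd() when done. */
#include <errno.h>
#include <rdma/rdma_cma.h>
#include "spdk_internal/rdma.h"

static int
create_qp_with_persistent_pd(struct rdma_cm_id *cm_id, struct ibv_qp_init_attr *qp_attr,
			     struct ibv_pd **pd_out)
{
	struct ibv_pd *pd;

	pd = spdk_rdma_get_pd(cm_id->verbs);	/* increments the device's ref count */
	if (pd == NULL) {
		return -ENODEV;
	}

	if (rdma_create_qp(cm_id, pd, qp_attr) != 0) {
		spdk_rdma_put_pd(pd);	/* drop the reference on failure */
		return -errno;
	}

	*pd_out = pd;
	return 0;
}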
571 lines
12 KiB
C
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) Intel Corporation. All rights reserved.
 * Copyright (c) 2020, 2021 Mellanox Technologies LTD. All rights reserved.
 */

#include <rdma/rdma_cma.h>

#include "spdk/log.h"
#include "spdk/env.h"
#include "spdk/string.h"
#include "spdk/likely.h"

#include "spdk_internal/rdma.h"
#include "spdk_internal/assert.h"

struct spdk_rdma_device {
	struct ibv_pd *pd;
	struct ibv_context *context;
	int ref;
	bool removed;
	TAILQ_ENTRY(spdk_rdma_device) tailq;
};

struct spdk_rdma_mem_map {
	struct spdk_mem_map *map;
	struct ibv_pd *pd;
	struct spdk_nvme_rdma_hooks *hooks;
	uint32_t ref_count;
	enum spdk_rdma_memory_map_role role;
	LIST_ENTRY(spdk_rdma_mem_map) link;
};

static pthread_mutex_t g_dev_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct ibv_context **g_ctx_list = NULL;
static TAILQ_HEAD(, spdk_rdma_device) g_dev_list = TAILQ_HEAD_INITIALIZER(g_dev_list);

static LIST_HEAD(, spdk_rdma_mem_map) g_rdma_mr_maps = LIST_HEAD_INITIALIZER(&g_rdma_mr_maps);
static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER;

static int
rdma_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
		enum spdk_mem_map_notify_action action,
		void *vaddr, size_t size)
{
	struct spdk_rdma_mem_map *rmap = cb_ctx;
	struct ibv_pd *pd = rmap->pd;
	struct ibv_mr *mr;
	uint32_t access_flags = 0;
	int rc;

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (rmap->hooks && rmap->hooks->get_rkey) {
			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, rmap->hooks->get_rkey(pd, vaddr,
					size));
		} else {
			switch (rmap->role) {
			case SPDK_RDMA_MEMORY_MAP_ROLE_TARGET:
				access_flags = IBV_ACCESS_LOCAL_WRITE;
				if (pd->context->device->transport_type == IBV_TRANSPORT_IWARP) {
					/* IWARP requires REMOTE_WRITE permission for RDMA_READ operation */
					access_flags |= IBV_ACCESS_REMOTE_WRITE;
				}
				break;
			case SPDK_RDMA_MEMORY_MAP_ROLE_INITIATOR:
				access_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE;
				break;
			default:
				SPDK_UNREACHABLE();
			}
			mr = ibv_reg_mr(pd, vaddr, size, access_flags);
			if (mr == NULL) {
				SPDK_ERRLOG("ibv_reg_mr() failed\n");
				return -1;
			} else {
				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
			}
		}
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		if (rmap->hooks == NULL || rmap->hooks->get_rkey == NULL) {
			mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
			if (mr) {
				ibv_dereg_mr(mr);
			}
		}
		rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
{
	/* Two contiguous mappings will point to the same address which is the start of the RDMA MR. */
	return addr_1 == addr_2;
}

const struct spdk_mem_map_ops g_rdma_map_ops = {
	.notify_cb = rdma_mem_notify,
	.are_contiguous = rdma_check_contiguous_entries
};

static void
_rdma_free_mem_map(struct spdk_rdma_mem_map *map)
{
	assert(map);

	if (map->hooks) {
		spdk_free(map);
	} else {
		free(map);
	}
}

struct spdk_rdma_mem_map *
spdk_rdma_create_mem_map(struct ibv_pd *pd, struct spdk_nvme_rdma_hooks *hooks,
			 enum spdk_rdma_memory_map_role role)
{
	struct spdk_rdma_mem_map *map;

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	/* Look up existing mem map registration for this pd */
	LIST_FOREACH(map, &g_rdma_mr_maps, link) {
		if (map->pd == pd && map->role == role) {
			map->ref_count++;
			pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
			return map;
		}
	}

	if (hooks) {
		map = spdk_zmalloc(sizeof(*map), 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
	} else {
		map = calloc(1, sizeof(*map));
	}
	if (!map) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		SPDK_ERRLOG("Memory allocation failed\n");
		return NULL;
	}
	map->pd = pd;
	map->ref_count = 1;
	map->hooks = hooks;
	map->role = role;
	map->map = spdk_mem_map_alloc(0, &g_rdma_map_ops, map);
	if (!map->map) {
		SPDK_ERRLOG("Unable to create memory map\n");
		_rdma_free_mem_map(map);
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return NULL;
	}
	LIST_INSERT_HEAD(&g_rdma_mr_maps, map, link);

	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);

	return map;
}

void
spdk_rdma_free_mem_map(struct spdk_rdma_mem_map **_map)
{
	struct spdk_rdma_mem_map *map;

	if (!_map) {
		return;
	}

	map = *_map;
	if (!map) {
		return;
	}
	*_map = NULL;

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	assert(map->ref_count > 0);
	map->ref_count--;
	if (map->ref_count != 0) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return;
	}

	LIST_REMOVE(map, link);
	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
	if (map->map) {
		spdk_mem_map_free(&map->map);
	}
	_rdma_free_mem_map(map);
}

int
spdk_rdma_get_translation(struct spdk_rdma_mem_map *map, void *address,
			  size_t length, struct spdk_rdma_memory_translation *translation)
{
	uint64_t real_length = length;

	assert(map);
	assert(address);
	assert(translation);

	if (map->hooks && map->hooks->get_rkey) {
		translation->translation_type = SPDK_RDMA_TRANSLATION_KEY;
		translation->mr_or_key.key = spdk_mem_map_translate(map->map, (uint64_t)address, &real_length);
	} else {
		translation->translation_type = SPDK_RDMA_TRANSLATION_MR;
		translation->mr_or_key.mr = (struct ibv_mr *)spdk_mem_map_translate(map->map, (uint64_t)address,
					    &real_length);
		if (spdk_unlikely(!translation->mr_or_key.mr)) {
			SPDK_ERRLOG("No translation for ptr %p, size %zu\n", address, length);
			return -EINVAL;
		}
	}

	assert(real_length >= length);

	return 0;
}

struct spdk_rdma_srq *
spdk_rdma_srq_create(struct spdk_rdma_srq_init_attr *init_attr)
{
	assert(init_attr);
	assert(init_attr->pd);

	struct spdk_rdma_srq *rdma_srq = calloc(1, sizeof(*rdma_srq));

	if (!rdma_srq) {
		SPDK_ERRLOG("Can't allocate memory for SRQ handle\n");
		return NULL;
	}

	if (init_attr->stats) {
		rdma_srq->stats = init_attr->stats;
		rdma_srq->shared_stats = true;
	} else {
		rdma_srq->stats = calloc(1, sizeof(*rdma_srq->stats));
		if (!rdma_srq->stats) {
			SPDK_ERRLOG("SRQ statistics memory allocation failed");
			free(rdma_srq);
			return NULL;
		}
	}

	rdma_srq->srq = ibv_create_srq(init_attr->pd, &init_attr->srq_init_attr);
	if (!rdma_srq->srq) {
		if (!init_attr->stats) {
			free(rdma_srq->stats);
		}
		SPDK_ERRLOG("Unable to create SRQ, errno %d (%s)\n", errno, spdk_strerror(errno));
		free(rdma_srq);
		return NULL;
	}

	return rdma_srq;
}

int
spdk_rdma_srq_destroy(struct spdk_rdma_srq *rdma_srq)
{
	int rc;

	if (!rdma_srq) {
		return 0;
	}

	assert(rdma_srq->srq);

	if (rdma_srq->recv_wrs.first != NULL) {
		SPDK_WARNLOG("Destroying RDMA SRQ with queued recv WRs\n");
	}

	rc = ibv_destroy_srq(rdma_srq->srq);
	if (rc) {
		SPDK_ERRLOG("SRQ destroy failed with %d\n", rc);
	}

	if (!rdma_srq->shared_stats) {
		free(rdma_srq->stats);
	}

	free(rdma_srq);

	return rc;
}

static inline bool
rdma_queue_recv_wrs(struct spdk_rdma_recv_wr_list *recv_wrs, struct ibv_recv_wr *first,
		    struct spdk_rdma_wr_stats *recv_stats)
{
	struct ibv_recv_wr *last;

	recv_stats->num_submitted_wrs++;
	last = first;
	while (last->next != NULL) {
		last = last->next;
		recv_stats->num_submitted_wrs++;
	}

	if (recv_wrs->first == NULL) {
		recv_wrs->first = first;
		recv_wrs->last = last;
		return true;
	} else {
		recv_wrs->last->next = first;
		recv_wrs->last = last;
		return false;
	}
}

bool
spdk_rdma_srq_queue_recv_wrs(struct spdk_rdma_srq *rdma_srq, struct ibv_recv_wr *first)
{
	assert(rdma_srq);
	assert(first);

	return rdma_queue_recv_wrs(&rdma_srq->recv_wrs, first, rdma_srq->stats);
}

int
spdk_rdma_srq_flush_recv_wrs(struct spdk_rdma_srq *rdma_srq, struct ibv_recv_wr **bad_wr)
{
	int rc;

	if (spdk_unlikely(rdma_srq->recv_wrs.first == NULL)) {
		return 0;
	}

	rc = ibv_post_srq_recv(rdma_srq->srq, rdma_srq->recv_wrs.first, bad_wr);

	rdma_srq->recv_wrs.first = NULL;
	rdma_srq->stats->doorbell_updates++;

	return rc;
}

bool
spdk_rdma_qp_queue_recv_wrs(struct spdk_rdma_qp *spdk_rdma_qp, struct ibv_recv_wr *first)
{
	assert(spdk_rdma_qp);
	assert(first);

	return rdma_queue_recv_wrs(&spdk_rdma_qp->recv_wrs, first, &spdk_rdma_qp->stats->recv);
}

int
spdk_rdma_qp_flush_recv_wrs(struct spdk_rdma_qp *spdk_rdma_qp, struct ibv_recv_wr **bad_wr)
{
	int rc;

	if (spdk_unlikely(spdk_rdma_qp->recv_wrs.first == NULL)) {
		return 0;
	}

	rc = ibv_post_recv(spdk_rdma_qp->qp, spdk_rdma_qp->recv_wrs.first, bad_wr);

	spdk_rdma_qp->recv_wrs.first = NULL;
	spdk_rdma_qp->stats->recv.doorbell_updates++;

	return rc;
}

static struct spdk_rdma_device *
rdma_add_dev(struct ibv_context *context)
{
	struct spdk_rdma_device *dev;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		SPDK_ERRLOG("Failed to allocate RDMA device object.\n");
		return NULL;
	}

	dev->pd = ibv_alloc_pd(context);
	if (dev->pd == NULL) {
		SPDK_ERRLOG("ibv_alloc_pd() failed: %s (%d)\n", spdk_strerror(errno), errno);
		free(dev);
		return NULL;
	}

	dev->context = context;
	TAILQ_INSERT_TAIL(&g_dev_list, dev, tailq);

	return dev;
}

static void
rdma_remove_dev(struct spdk_rdma_device *dev)
{
	if (!dev->removed || dev->ref > 0) {
		return;
	}

	/* Deallocate protection domain only if the device is already removed and
	 * there is no reference.
	 */
	TAILQ_REMOVE(&g_dev_list, dev, tailq);
	ibv_dealloc_pd(dev->pd);
	free(dev);
}

static int
ctx_cmp(const void *_c1, const void *_c2)
{
	struct ibv_context *c1 = *(struct ibv_context **)_c1;
	struct ibv_context *c2 = *(struct ibv_context **)_c2;

	return c1 < c2 ? -1 : c1 > c2;
}

static int
rdma_sync_dev_list(void)
{
	struct ibv_context **new_ctx_list;
	int i, j;
	int num_devs = 0;

	/*
	 * rdma_get_devices() returns a NULL terminated array of opened RDMA devices,
	 * and sets num_devs to the number of the returned devices.
	 */
	new_ctx_list = rdma_get_devices(&num_devs);
	if (new_ctx_list == NULL) {
		SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
		return -ENODEV;
	}

	if (num_devs == 0) {
		rdma_free_devices(new_ctx_list);
		SPDK_ERRLOG("Returned RDMA device array was empty\n");
		return -ENODEV;
	}

	/*
	 * Sort new_ctx_list by addresses to update devices easily.
	 */
	qsort(new_ctx_list, num_devs, sizeof(struct ibv_context *), ctx_cmp);

	if (g_ctx_list == NULL) {
		/* If no old array, this is the first call. Add all devices. */
		for (i = 0; new_ctx_list[i] != NULL; i++) {
			rdma_add_dev(new_ctx_list[i]);
		}

		goto exit;
	}

	for (i = j = 0; new_ctx_list[i] != NULL || g_ctx_list[j] != NULL;) {
		struct ibv_context *new_ctx = new_ctx_list[i];
		struct ibv_context *old_ctx = g_ctx_list[j];
		bool add = false, remove = false;

		/*
		 * If a context exists only in the new array, create a device for it,
		 * or if a context exists only in the old array, try removing the
		 * corresponding device.
		 */

		if (old_ctx == NULL) {
			add = true;
		} else if (new_ctx == NULL) {
			remove = true;
		} else if (new_ctx < old_ctx) {
			add = true;
		} else if (old_ctx < new_ctx) {
			remove = true;
		}

		if (add) {
			rdma_add_dev(new_ctx_list[i]);
			i++;
		} else if (remove) {
			struct spdk_rdma_device *dev, *tmp;

			TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
				if (dev->context == g_ctx_list[j]) {
					dev->removed = true;
					rdma_remove_dev(dev);
				}
			}
			j++;
		} else {
			i++;
			j++;
		}
	}

	/* Free the old array. */
	rdma_free_devices(g_ctx_list);

exit:
	/*
	 * Keep the newly returned array so that allocated protection domains
	 * are not freed unexpectedly.
	 */
	g_ctx_list = new_ctx_list;
	return 0;
}

struct ibv_pd *
spdk_rdma_get_pd(struct ibv_context *context)
{
	struct spdk_rdma_device *dev;
	int rc;

	pthread_mutex_lock(&g_dev_mutex);

	rc = rdma_sync_dev_list();
	if (rc != 0) {
		pthread_mutex_unlock(&g_dev_mutex);

		SPDK_ERRLOG("Failed to sync RDMA device list\n");
		return NULL;
	}

	TAILQ_FOREACH(dev, &g_dev_list, tailq) {
		if (dev->context == context && !dev->removed) {
			dev->ref++;
			pthread_mutex_unlock(&g_dev_mutex);

			return dev->pd;
		}
	}

	pthread_mutex_unlock(&g_dev_mutex);

	SPDK_ERRLOG("Failed to get PD\n");
	return NULL;
}

void
spdk_rdma_put_pd(struct ibv_pd *pd)
{
	struct spdk_rdma_device *dev, *tmp;

	pthread_mutex_lock(&g_dev_mutex);

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		if (dev->pd == pd) {
			assert(dev->ref > 0);
			dev->ref--;

			rdma_remove_dev(dev);
		}
	}

	rdma_sync_dev_list();

	pthread_mutex_unlock(&g_dev_mutex);
}

__attribute__((destructor)) static void
_rdma_fini(void)
{
	struct spdk_rdma_device *dev, *tmp;

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		dev->removed = true;
		dev->ref = 0;
		rdma_remove_dev(dev);
	}

	if (g_ctx_list != NULL) {
		rdma_free_devices(g_ctx_list);
		g_ctx_list = NULL;
	}
}