From 3b26e2c594118d962e99ddc000a288840d885423 Mon Sep 17 00:00:00 2001 From: Evgeniy Kochetov Date: Thu, 7 Jul 2022 14:28:00 +0300 Subject: [PATCH] nvme/rdma: Create poller and CQ on demand Original implementation creates pollers and CQs for all discovered devices at poll group creation. Device (ibv_context) that has no references, i.e. has no QPs, may be removed from the system and ibv_context may be closed by rdma_cm. In this case we will have a CQ that refers to closed ibv_context and it may crash in ibv_poll_cq. With this patch pollers are created on demand when we create the first QP for a device. When there are no more QPs on the poller, we destroy the poller. This also helps to avoid polling CQs that don't have any QPs attached. Signed-off-by: Evgeniy Kochetov Change-Id: I46dd2c8b9b2902168dba24e139c904f51bd1b101 Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/13692 Tested-by: SPDK CI Jenkins Community-CI: Broadcom CI Community-CI: Mellanox Build Bot Reviewed-by: Shuhei Matsumoto Reviewed-by: Ben Walker --- lib/nvme/nvme_rdma.c | 123 +++++++++----- test/unit/lib/nvme/nvme_rdma.c/nvme_rdma_ut.c | 156 +++++++++++------- 2 files changed, 177 insertions(+), 102 deletions(-) diff --git a/lib/nvme/nvme_rdma.c b/lib/nvme/nvme_rdma.c index 49b9de72b..50043c5c0 100644 --- a/lib/nvme/nvme_rdma.c +++ b/lib/nvme/nvme_rdma.c @@ -137,6 +137,7 @@ struct nvme_rdma_poller_stats { struct nvme_rdma_poller { struct ibv_context *device; struct ibv_cq *cq; + uint32_t refcnt; int required_num_wc; int current_num_wc; struct nvme_rdma_poller_stats stats; @@ -295,6 +296,10 @@ static const char *rdma_cm_event_str[] = { struct nvme_rdma_qpair *nvme_rdma_poll_group_get_qpair_by_id(struct nvme_rdma_poll_group *group, uint32_t qp_num); +static struct nvme_rdma_poller *nvme_rdma_poll_group_get_poller(struct nvme_rdma_poll_group *group, + struct ibv_context *device); +static void nvme_rdma_poll_group_put_poller(struct nvme_rdma_poll_group *group, + struct nvme_rdma_poller *poller); static TAILQ_HEAD(, nvme_rdma_memory_domain) g_memory_domains = TAILQ_HEAD_INITIALIZER( g_memory_domains); @@ -711,22 +716,19 @@ nvme_rdma_poll_group_set_cq(struct spdk_nvme_qpair *qpair) assert(rqpair->cq == NULL); - STAILQ_FOREACH(poller, &group->pollers, link) { - if (poller->device == rqpair->cm_id->verbs) { - if (nvme_rdma_resize_cq(rqpair, poller)) { - return -EPROTO; - } - rqpair->cq = poller->cq; - rqpair->poller = poller; - break; - } - } - - if (rqpair->cq == NULL) { + poller = nvme_rdma_poll_group_get_poller(group, rqpair->cm_id->verbs); + if (!poller) { SPDK_ERRLOG("Unable to find a cq for qpair %p on poll group %p\n", qpair, qpair->poll_group); return -EINVAL; } + if (nvme_rdma_resize_cq(rqpair, poller)) { + nvme_rdma_poll_group_put_poller(group, poller); + return -EPROTO; + } + + rqpair->cq = poller->cq; + rqpair->poller = poller; return 0; } @@ -2155,6 +2157,13 @@ nvme_rdma_stale_conn_retry(struct nvme_rdma_qpair *rqpair) rqpair->stale_conn_retry_count, qpair->ctrlr->cntlid, qpair->id); if (qpair->poll_group) { + struct nvme_rdma_poll_group *group; + + group = nvme_rdma_poll_group(qpair->poll_group); + if (rqpair->poller) { + nvme_rdma_poll_group_put_poller(group, rqpair->poller); + rqpair->poller = NULL; + } rqpair->cq = NULL; } @@ -2816,7 +2825,7 @@ nvme_rdma_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) } } -static int +static struct nvme_rdma_poller * nvme_rdma_poller_create(struct nvme_rdma_poll_group *group, struct ibv_context *ctx) { struct nvme_rdma_poller *poller; @@ -2824,22 +2833,23 @@ nvme_rdma_poller_create(struct nvme_rdma_poll_group *group, struct ibv_context * poller = calloc(1, sizeof(*poller)); if (poller == NULL) { SPDK_ERRLOG("Unable to allocate poller.\n"); - return -ENOMEM; + return NULL; } poller->device = ctx; poller->cq = ibv_create_cq(poller->device, DEFAULT_NVME_RDMA_CQ_SIZE, group, NULL, 0); if (poller->cq == NULL) { + SPDK_ERRLOG("Unable to create CQ, errno %d.\n", errno); free(poller); - return -EINVAL; + return NULL; } STAILQ_INSERT_HEAD(&group->pollers, poller, link); group->num_pollers++; poller->current_num_wc = DEFAULT_NVME_RDMA_CQ_SIZE; poller->required_num_wc = 0; - return 0; + return poller; } static void @@ -2848,6 +2858,12 @@ nvme_rdma_poll_group_free_pollers(struct nvme_rdma_poll_group *group) struct nvme_rdma_poller *poller, *tmp_poller; STAILQ_FOREACH_SAFE(poller, &group->pollers, link, tmp_poller) { + assert(poller->refcnt == 0); + if (poller->refcnt) { + SPDK_WARNLOG("Destroying poller with non-zero ref count: poller %p, refcnt %d\n", + poller, poller->refcnt); + } + if (poller->cq) { ibv_destroy_cq(poller->cq); } @@ -2856,12 +2872,47 @@ nvme_rdma_poll_group_free_pollers(struct nvme_rdma_poll_group *group) } } +static struct nvme_rdma_poller * +nvme_rdma_poll_group_get_poller(struct nvme_rdma_poll_group *group, struct ibv_context *device) +{ + struct nvme_rdma_poller *poller = NULL; + + STAILQ_FOREACH(poller, &group->pollers, link) { + if (poller->device == device) { + break; + } + } + + if (!poller) { + poller = nvme_rdma_poller_create(group, device); + if (!poller) { + SPDK_ERRLOG("Failed to create a poller for device %p\n", device); + return NULL; + } + } + + poller->refcnt++; + return poller; +} + +static void +nvme_rdma_poll_group_put_poller(struct nvme_rdma_poll_group *group, struct nvme_rdma_poller *poller) +{ + assert(poller->refcnt > 0); + if (--poller->refcnt == 0) { + if (poller->cq) { + ibv_destroy_cq(poller->cq); + } + STAILQ_REMOVE(&group->pollers, poller, nvme_rdma_poller, link); + free(poller); + group->num_pollers--; + } +} + static struct spdk_nvme_transport_poll_group * nvme_rdma_poll_group_create(void) { struct nvme_rdma_poll_group *group; - struct ibv_context **contexts; - int i = 0; group = calloc(1, sizeof(*group)); if (group == NULL) { @@ -2870,26 +2921,6 @@ nvme_rdma_poll_group_create(void) } STAILQ_INIT(&group->pollers); - - contexts = rdma_get_devices(NULL); - if (contexts == NULL) { - SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno); - free(group); - return NULL; - } - - while (contexts[i] != NULL) { - if (nvme_rdma_poller_create(group, contexts[i])) { - nvme_rdma_poll_group_free_pollers(group); - free(group); - rdma_free_devices(contexts); - return NULL; - } - i++; - } - - rdma_free_devices(contexts); - return &group->group; } @@ -2927,7 +2958,13 @@ static int nvme_rdma_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) { struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct nvme_rdma_poll_group *group; + group = nvme_rdma_poll_group(qpair->poll_group); + if (rqpair->poller) { + nvme_rdma_poll_group_put_poller(group, rqpair->poller); + rqpair->poller = NULL; + } rqpair->cq = NULL; return 0; @@ -3003,7 +3040,9 @@ nvme_rdma_poll_group_process_completions(struct spdk_nvme_transport_poll_group * } completions_allowed = completions_per_qpair * num_qpairs; - completions_per_poller = spdk_max(completions_allowed / group->num_pollers, 1); + if (group->num_pollers) { + completions_per_poller = spdk_max(completions_allowed / group->num_pollers, 1); + } STAILQ_FOREACH(poller, &group->pollers, link) { poller_completions = 0; @@ -3086,6 +3125,12 @@ nvme_rdma_poll_group_get_stats(struct spdk_nvme_transport_poll_group *tgroup, } stats->trtype = SPDK_NVME_TRANSPORT_RDMA; stats->rdma.num_devices = group->num_pollers; + + if (stats->rdma.num_devices == 0) { + *_stats = stats; + return 0; + } + stats->rdma.device_stats = calloc(stats->rdma.num_devices, sizeof(*stats->rdma.device_stats)); if (!stats->rdma.device_stats) { SPDK_ERRLOG("Can't allocate memory for RDMA device stats\n"); diff --git a/test/unit/lib/nvme/nvme_rdma.c/nvme_rdma_ut.c b/test/unit/lib/nvme/nvme_rdma.c/nvme_rdma_ut.c index bebfa3be8..31948c309 100644 --- a/test/unit/lib/nvme/nvme_rdma.c/nvme_rdma_ut.c +++ b/test/unit/lib/nvme/nvme_rdma.c/nvme_rdma_ut.c @@ -601,20 +601,50 @@ static void test_nvme_rdma_poller_create(void) { struct nvme_rdma_poll_group group = {}; - struct ibv_context *contexts = (struct ibv_context *)0xDEADBEEF; + struct ibv_context context = { + .device = (struct ibv_device *)0xDEADBEEF + }; + struct ibv_context context_2 = { + .device = (struct ibv_device *)0xBAADBEEF + }; + struct nvme_rdma_poller *poller_1, *poller_2, *poller_3; /* Case: calloc and ibv not need to fail test */ STAILQ_INIT(&group.pollers); - group.num_pollers = 1; - int rc = nvme_rdma_poller_create(&group, contexts); - CU_ASSERT(rc == 0); - CU_ASSERT(group.num_pollers = 2); - CU_ASSERT(&group.pollers != NULL); - CU_ASSERT(group.pollers.stqh_first->device == contexts); - CU_ASSERT(group.pollers.stqh_first->cq == (struct ibv_cq *)0xFEEDBEEF); - CU_ASSERT(group.pollers.stqh_first->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE); - CU_ASSERT(group.pollers.stqh_first->required_num_wc == 0); + poller_1 = nvme_rdma_poll_group_get_poller(&group, &context); + SPDK_CU_ASSERT_FATAL(poller_1 != NULL); + CU_ASSERT(group.num_pollers == 1); + CU_ASSERT(STAILQ_FIRST(&group.pollers) == poller_1); + CU_ASSERT(poller_1->refcnt == 1); + CU_ASSERT(poller_1->device == &context); + CU_ASSERT(poller_1->cq == (struct ibv_cq *)0xFEEDBEEF); + CU_ASSERT(poller_1->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE); + CU_ASSERT(poller_1->required_num_wc == 0); + + poller_2 = nvme_rdma_poll_group_get_poller(&group, &context_2); + SPDK_CU_ASSERT_FATAL(poller_2 != NULL); + CU_ASSERT(group.num_pollers == 2); + CU_ASSERT(STAILQ_FIRST(&group.pollers) == poller_2); + CU_ASSERT(poller_2->refcnt == 1); + CU_ASSERT(poller_2->device == &context_2); + + poller_3 = nvme_rdma_poll_group_get_poller(&group, &context); + SPDK_CU_ASSERT_FATAL(poller_3 != NULL); + CU_ASSERT(poller_3 == poller_1); + CU_ASSERT(group.num_pollers == 2); + CU_ASSERT(poller_3->refcnt == 2); + + nvme_rdma_poll_group_put_poller(&group, poller_2); + CU_ASSERT(group.num_pollers == 1); + + nvme_rdma_poll_group_put_poller(&group, poller_1); + CU_ASSERT(group.num_pollers == 1); + CU_ASSERT(poller_3->refcnt == 1); + + nvme_rdma_poll_group_put_poller(&group, poller_3); + CU_ASSERT(STAILQ_EMPTY(&group.pollers)); + CU_ASSERT(group.num_pollers == 0); nvme_rdma_poll_group_free_pollers(&group); } @@ -1321,20 +1351,15 @@ test_nvme_rdma_poll_group_get_stats(void) /* Initialization */ STAILQ_INIT(&tgroup.pollers); - rc = nvme_rdma_poller_create(&tgroup, &contexts1); - CU_ASSERT(rc == 0); + tpoller2 = nvme_rdma_poller_create(&tgroup, &contexts1); + SPDK_CU_ASSERT_FATAL(tpoller2 != NULL); CU_ASSERT(tgroup.num_pollers == 1); - rc = nvme_rdma_poller_create(&tgroup, &contexts2); - CU_ASSERT(rc == 0); + tpoller1 = nvme_rdma_poller_create(&tgroup, &contexts2); + SPDK_CU_ASSERT_FATAL(tpoller1 != NULL); CU_ASSERT(tgroup.num_pollers == 2); CU_ASSERT(&tgroup.pollers != NULL); - tpoller1 = STAILQ_FIRST(&tgroup.pollers); - SPDK_CU_ASSERT_FATAL(tpoller1 != NULL); - tpoller2 = STAILQ_NEXT(tpoller1, link); - SPDK_CU_ASSERT_FATAL(tpoller2 != NULL); - CU_ASSERT(tpoller1->device == &contexts2); CU_ASSERT(tpoller2->device == &contexts1); CU_ASSERT(strcmp(tpoller1->device->device->name, "/dev/test2") == 0); @@ -1407,86 +1432,91 @@ test_nvme_rdma_poll_group_set_cq(void) int rc = -1; struct nvme_rdma_poll_group *group; struct spdk_nvme_transport_poll_group *tgroup; - struct nvme_rdma_poller *poller1; - struct nvme_rdma_poller *poller2; + struct nvme_rdma_poller *poller; struct nvme_rdma_qpair rqpair = {}; struct rdma_cm_id cm_id = {}; - /* Case1: Test function nvme_rdma_poll_group_create - Test1: Function rdma_get_devices failed */ - MOCK_SET(rdma_get_devices, NULL); - + /* Case1: Test function nvme_rdma_poll_group_create */ + /* Test1: Function nvme_rdma_poll_group_create success */ tgroup = nvme_rdma_poll_group_create(); - CU_ASSERT(tgroup == NULL); - - /* Test2: Function nvme_rdma_poller_create failed */ - MOCK_CLEAR(rdma_get_devices); - MOCK_SET(ibv_create_cq, NULL); - - tgroup = nvme_rdma_poll_group_create(); - CU_ASSERT(tgroup == NULL); - - /* Test3: Function nvme_rdma_poll_group_create success */ - MOCK_SET(ibv_create_cq, (struct ibv_cq *)0xFEEDBEEF); - - tgroup = nvme_rdma_poll_group_create(); - CU_ASSERT(tgroup != NULL); + SPDK_CU_ASSERT_FATAL(tgroup != NULL); group = nvme_rdma_poll_group(tgroup); CU_ASSERT(group != NULL); - - poller1 = STAILQ_FIRST(&group->pollers); - SPDK_CU_ASSERT_FATAL(poller1 != NULL); - poller2 = STAILQ_NEXT(poller1, link); - SPDK_CU_ASSERT_FATAL(poller2 != NULL); - - CU_ASSERT(poller1->device == (struct ibv_context *)0xFEEDBEEF); - CU_ASSERT(poller1->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE); - CU_ASSERT(poller1->required_num_wc == 0); - CU_ASSERT(poller2->device == (struct ibv_context *)0xDEADBEEF); - CU_ASSERT(poller2->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE); - CU_ASSERT(poller2->required_num_wc == 0); + CU_ASSERT(STAILQ_EMPTY(&group->pollers)); /* Case2: Test function nvme_rdma_poll_group_set_cq */ rqpair.qpair.poll_group = tgroup; rqpair.qpair.trtype = SPDK_NVME_TRANSPORT_RDMA; rqpair.cm_id = &cm_id; - /* Test1: Unable to find a cq for qpair on poll group */ + /* Test1: Function ibv_create_cq failed */ + cm_id.verbs = (void *)0xFEEDBEEF; + MOCK_SET(ibv_create_cq, NULL); + + rc = nvme_rdma_poll_group_set_cq(&rqpair.qpair); + CU_ASSERT(rc == -EINVAL); + CU_ASSERT(rqpair.cq == NULL); + CU_ASSERT(STAILQ_EMPTY(&group->pollers)); + + MOCK_CLEAR(ibv_create_cq); + + /* Test2: Unable to find a cq for qpair on poll group */ cm_id.verbs = NULL; rc = nvme_rdma_poll_group_set_cq(&rqpair.qpair); CU_ASSERT(rc == -EINVAL); CU_ASSERT(rqpair.cq == NULL); + CU_ASSERT(STAILQ_EMPTY(&group->pollers)); + + /* Test3: Match cq success, current_num_wc is enough */ + MOCK_SET(ibv_create_cq, (struct ibv_cq *)0xFEEDBEEF); - /* Test2: Match cq success, current_num_wc is enough */ cm_id.verbs = (void *)0xFEEDBEEF; rqpair.num_entries = 0; rc = nvme_rdma_poll_group_set_cq(&rqpair.qpair); CU_ASSERT(rc == 0); - CU_ASSERT(poller1->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE); - CU_ASSERT(poller1->required_num_wc == 0); + CU_ASSERT(rqpair.cq == (void *)0xFEEDBEEF); - /* Test3: Match cq success, function ibv_resize_cq failed */ + poller = STAILQ_FIRST(&group->pollers); + SPDK_CU_ASSERT_FATAL(poller != NULL); + CU_ASSERT(STAILQ_NEXT(poller, link) == NULL); + CU_ASSERT(poller->device == (struct ibv_context *)0xFEEDBEEF); + CU_ASSERT(poller->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE); + CU_ASSERT(poller->required_num_wc == 0); + CU_ASSERT(rqpair.poller == poller); + + rc = nvme_rdma_poll_group_disconnect_qpair(&rqpair.qpair); + CU_ASSERT(rc == 0); + CU_ASSERT(rqpair.cq == NULL); + CU_ASSERT(rqpair.poller == NULL); + CU_ASSERT(STAILQ_EMPTY(&group->pollers)); + + /* Test4: Match cq success, function ibv_resize_cq failed */ rqpair.cq = NULL; rqpair.num_entries = DEFAULT_NVME_RDMA_CQ_SIZE - 1; MOCK_SET(ibv_resize_cq, -1); rc = nvme_rdma_poll_group_set_cq(&rqpair.qpair); CU_ASSERT(rc == -EPROTO); - CU_ASSERT(poller1->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE); - CU_ASSERT(poller1->required_num_wc == 0); + CU_ASSERT(STAILQ_EMPTY(&group->pollers)); - /* Test4: Current_num_wc is not enough, resize success */ + /* Test5: Current_num_wc is not enough, resize success */ MOCK_SET(ibv_resize_cq, 0); rc = nvme_rdma_poll_group_set_cq(&rqpair.qpair); CU_ASSERT(rc == 0); - CU_ASSERT(poller1->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE * 2); - CU_ASSERT(poller1->required_num_wc == (DEFAULT_NVME_RDMA_CQ_SIZE - 1) * 2); - CU_ASSERT(rqpair.cq == poller1->cq); - CU_ASSERT(rqpair.poller == poller1); + + poller = STAILQ_FIRST(&group->pollers); + SPDK_CU_ASSERT_FATAL(poller != NULL); + CU_ASSERT(poller->current_num_wc == DEFAULT_NVME_RDMA_CQ_SIZE * 2); + CU_ASSERT(poller->required_num_wc == (DEFAULT_NVME_RDMA_CQ_SIZE - 1) * 2); + CU_ASSERT(rqpair.cq == poller->cq); + CU_ASSERT(rqpair.poller == poller); + + rc = nvme_rdma_poll_group_disconnect_qpair(&rqpair.qpair); + CU_ASSERT(rc == 0); rc = nvme_rdma_poll_group_destroy(tgroup); CU_ASSERT(rc == 0);