diff --git a/lib/nvmf/rdma.c b/lib/nvmf/rdma.c
index dde5097fe..584fd5389 100644
--- a/lib/nvmf/rdma.c
+++ b/lib/nvmf/rdma.c
@@ -450,11 +450,19 @@ struct spdk_nvmf_rdma_poll_group_stat {
 };
 
 struct spdk_nvmf_rdma_poll_group {
-	struct spdk_nvmf_transport_poll_group	group;
+	struct spdk_nvmf_transport_poll_group		group;
 
-	TAILQ_HEAD(, spdk_nvmf_rdma_poller)	pollers;
+	/* Requests that are waiting to obtain a data buffer */
 
-	struct spdk_nvmf_rdma_poll_group_stat	stat;
+	TAILQ_HEAD(, spdk_nvmf_rdma_poller)		pollers;
+
+	struct spdk_nvmf_rdma_poll_group_stat		stat;
+
+	/*
+	 * buffers which are split across multiple RDMA
+	 * memory regions cannot be used by this transport.
+	 */
+	STAILQ_HEAD(, spdk_nvmf_transport_pg_cache_buf)	retired_bufs;
 };
 
 /* Assuming rdma_cm uses just one protection domain per ibv_context. */
@@ -1456,6 +1464,34 @@ nvmf_request_alloc_wrs(struct spdk_nvmf_rdma_transport *rtransport,
 	return 0;
 }
 
+/* This function is used in the rare case that we have a buffer split over multiple memory regions. */
+static int
+nvmf_rdma_replace_buffer(struct spdk_nvmf_rdma_poll_group *rgroup, void **buf)
+{
+	struct spdk_nvmf_transport_poll_group *group = &rgroup->group;
+	struct spdk_nvmf_transport *transport = group->transport;
+	struct spdk_nvmf_transport_pg_cache_buf *old_buf;
+	void *new_buf;
+
+	if (!(STAILQ_EMPTY(&group->buf_cache))) {
+		group->buf_cache_count--;
+		new_buf = STAILQ_FIRST(&group->buf_cache);
+		STAILQ_REMOVE_HEAD(&group->buf_cache, link);
+		assert(*buf != NULL);
+	} else {
+		new_buf = spdk_mempool_get(transport->data_buf_pool);
+	}
+
+	if (new_buf == NULL) {
+		return -ENOMEM;
+	}
+
+	old_buf = *buf;
+	STAILQ_INSERT_HEAD(&rgroup->retired_bufs, old_buf, link);
+	*buf = new_buf;
+	return 0;
+}
+
 static int
 nvmf_rdma_fill_buffers(struct spdk_nvmf_rdma_transport *rtransport,
 		       struct spdk_nvmf_rdma_poll_group *rgroup,
@@ -1483,9 +1519,13 @@ nvmf_rdma_fill_buffers(struct spdk_nvmf_rdma_transport *rtransport,
 							(uint64_t)req->iov[req->iovcnt].iov_base,
 							&translation_len);
 		}
-		if (translation_len < req->iov[req->iovcnt].iov_len) {
-			SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n");
-			return -EINVAL;
+		/* This is a very rare case that can occur when using DPDK version < 19.05 */
+		if (spdk_unlikely(translation_len < req->iov[req->iovcnt].iov_len)) {
+			SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions. Removing it from circulation.\n");
+			if (nvmf_rdma_replace_buffer(rgroup, &req->buffers[req->iovcnt]) == -ENOMEM) {
+				return -ENOMEM;
+			}
+			continue;
 		}
 
 		length -= req->iov[req->iovcnt].iov_len;
@@ -2930,6 +2970,7 @@ spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport)
 	}
 
 	TAILQ_INIT(&rgroup->pollers);
+	STAILQ_INIT(&rgroup->retired_bufs);
 
 	pthread_mutex_lock(&rtransport->lock);
 	TAILQ_FOREACH(device, &rtransport->devices, link) {
@@ -3012,6 +3053,7 @@ spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
 	struct spdk_nvmf_rdma_poll_group	*rgroup;
 	struct spdk_nvmf_rdma_poller		*poller, *tmp;
 	struct spdk_nvmf_rdma_qpair		*qpair, *tmp_qpair;
+	struct spdk_nvmf_transport_pg_cache_buf	*buf, *tmp_buf;
 
 	rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
@@ -3019,6 +3061,12 @@ spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
 		return;
 	}
 
+	/* free all retired buffers back to the transport so we don't short the mempool. */
+	STAILQ_FOREACH_SAFE(buf, &rgroup->retired_bufs, link, tmp_buf) {
+		STAILQ_REMOVE(&rgroup->retired_bufs, buf, spdk_nvmf_transport_pg_cache_buf, link);
+		spdk_mempool_put(group->transport->data_buf_pool, buf);
+	}
+
 	TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) {
 		TAILQ_REMOVE(&rgroup->pollers, poller, link);
diff --git a/test/unit/lib/nvmf/rdma.c/rdma_ut.c b/test/unit/lib/nvmf/rdma.c/rdma_ut.c
index 968c8cf1b..9274b48f0 100644
--- a/test/unit/lib/nvmf/rdma.c/rdma_ut.c
+++ b/test/unit/lib/nvmf/rdma.c/rdma_ut.c
@@ -37,6 +37,7 @@
 #include "nvmf/rdma.c"
 
 uint64_t g_mr_size;
+uint64_t g_mr_next_size;
 struct ibv_mr g_rdma_mr;
 
 #define RDMA_UT_UNITS_IN_MAX_IO 16
@@ -135,6 +136,9 @@ spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t
 {
 	if (g_mr_size != 0) {
 		*(uint32_t *)size = g_mr_size;
+		if (g_mr_next_size != 0) {
+			g_mr_size = g_mr_next_size;
+		}
 	}
 
 	return (uint64_t)&g_rdma_mr;
@@ -177,12 +181,16 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
 	struct spdk_nvmf_transport_pg_cache_buf bufs[4];
 	struct spdk_nvme_sgl_descriptor sgl_desc[SPDK_NVMF_MAX_SGL_ENTRIES] = {{0}};
 	struct spdk_nvmf_rdma_request_data data;
+	struct spdk_nvmf_transport_pg_cache_buf buffer;
+	struct spdk_nvmf_transport_pg_cache_buf *buffer_ptr;
 	int rc, i;
 
 	data.wr.sg_list = data.sgl;
 	STAILQ_INIT(&group.group.buf_cache);
 	group.group.buf_cache_size = 0;
 	group.group.buf_cache_count = 0;
+	group.group.transport = &rtransport.transport;
+	STAILQ_INIT(&group.retired_bufs);
 	poller.group = &group;
 	rqpair.poller = &poller;
 	rqpair.max_send_sge = SPDK_NVMF_MAX_SGL_ENTRIES;
@@ -268,7 +276,6 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
 	CU_ASSERT(rdma_req.data.wr.sg_list[0].length == 0);
 	CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == 0);
 
-	rdma_req.recv->buf = (void *)0xDDDD;
 	/* Test 2: sgl type: keyed data block subtype: offset (in capsule data) */
 	sgl->generic.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
 	sgl->keyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
@@ -408,7 +415,6 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
 	}
 
 	/* part 1: use the four buffers from the pg cache */
-	group.group.buf_cache_size = 4;
 	group.group.buf_cache_count = 4;
 	MOCK_SET(spdk_mempool_get, (void *)0x2000);
@@ -432,8 +438,8 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
 		CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == (((uint64_t)&bufs[i] + NVMF_DATA_BUFFER_MASK) &
 			  ~NVMF_DATA_BUFFER_MASK));
 		CU_ASSERT(rdma_req.data.wr.sg_list[i].length == rtransport.transport.opts.io_unit_size);
 	}
-	/* part 2: now that we have used the buffers from the cache, try again. We should get mempool buffers. */
 
+	/* part 2: now that we have used the buffers from the cache, try again. We should get mempool buffers. */
 	reset_nvmf_rdma_request(&rdma_req);
 
 	rc = spdk_nvmf_rdma_request_parse_sgl(&rtransport, &device, &rdma_req);
@@ -482,6 +488,36 @@ test_spdk_nvmf_rdma_request_parse_sgl(void)
 		CU_ASSERT(rdma_req.data.wr.sg_list[i].addr == 0x2000);
 		CU_ASSERT(rdma_req.data.wr.sg_list[i].length == rtransport.transport.opts.io_unit_size);
 	}
+
+	reset_nvmf_rdma_request(&rdma_req);
+	/* Test 5 dealing with a buffer split over two Memory Regions */
+	MOCK_SET(spdk_mempool_get, (void *)&buffer);
+	sgl->generic.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
+	sgl->keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
+	sgl->keyed.length = rtransport.transport.opts.io_unit_size / 2;
+	g_mr_size = rtransport.transport.opts.io_unit_size / 4;
+	g_mr_next_size = rtransport.transport.opts.io_unit_size / 2;
+
+	rc = spdk_nvmf_rdma_request_parse_sgl(&rtransport, &device, &rdma_req);
+	SPDK_CU_ASSERT_FATAL(rc == 0);
+	CU_ASSERT(rdma_req.req.data_from_pool == true);
+	CU_ASSERT(rdma_req.req.length == rtransport.transport.opts.io_unit_size / 2);
+	CU_ASSERT((uint64_t)rdma_req.req.data == (((uint64_t)&buffer + NVMF_DATA_BUFFER_MASK) &
+		  ~NVMF_DATA_BUFFER_MASK));
+	CU_ASSERT(rdma_req.data.wr.num_sge == 1);
+	CU_ASSERT(rdma_req.data.wr.wr.rdma.rkey == 0xEEEE);
+	CU_ASSERT(rdma_req.data.wr.wr.rdma.remote_addr == 0xFFFF);
+	CU_ASSERT(rdma_req.req.buffers[0] == &buffer);
+	CU_ASSERT(rdma_req.data.wr.sg_list[0].addr == (((uint64_t)&buffer + NVMF_DATA_BUFFER_MASK) &
+		  ~NVMF_DATA_BUFFER_MASK));
+	CU_ASSERT(rdma_req.data.wr.sg_list[0].length == rtransport.transport.opts.io_unit_size / 2);
+	CU_ASSERT(rdma_req.data.wr.sg_list[0].lkey == g_rdma_mr.lkey);
+	buffer_ptr = STAILQ_FIRST(&group.retired_bufs);
+	CU_ASSERT(buffer_ptr == &buffer);
+	STAILQ_REMOVE(&group.retired_bufs, buffer_ptr, spdk_nvmf_transport_pg_cache_buf, link);
+	CU_ASSERT(STAILQ_EMPTY(&group.retired_bufs));
+
+	reset_nvmf_rdma_request(&rdma_req);
 }
 
 static struct spdk_nvmf_rdma_recv *
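
For context on the mechanism above: when the memory-map translation reports that a data buffer does not fit inside a single RDMA memory region, the buffer is parked on the poll group's retired_bufs list, a replacement is taken from the buf_cache or the transport mempool, and the retired buffers are only returned to the mempool when the poll group is destroyed. The standalone C sketch below models that retire-and-replace bookkeeping with sys/queue.h lists; it is an illustration only, and every name in it (pool_get, pool_put, replace_buffer, g_retired) is invented here rather than taken from SPDK.

/* retire_buf_sketch.c: illustrative only, not SPDK code. */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>

struct buf {
	STAILQ_ENTRY(buf) link;
};

STAILQ_HEAD(buf_list, buf);

/* Toy stand-ins for the transport data-buffer pool and the per-group retired list. */
static int g_pool_outstanding;
static struct buf_list g_retired = STAILQ_HEAD_INITIALIZER(g_retired);

static struct buf *
pool_get(void)
{
	struct buf *b = calloc(1, sizeof(*b));

	if (b != NULL) {
		g_pool_outstanding++;
	}
	return b;
}

static void
pool_put(struct buf *b)
{
	g_pool_outstanding--;
	free(b);
}

/* Analogous to nvmf_rdma_replace_buffer(): retire the unusable buffer, hand back a fresh one. */
static int
replace_buffer(struct buf **bufp)
{
	struct buf *new_buf = pool_get();

	if (new_buf == NULL) {
		return -1; /* caller keeps its original buffer */
	}
	STAILQ_INSERT_HEAD(&g_retired, *bufp, link); /* never hand this one out again */
	*bufp = new_buf;
	return 0;
}

/* Analogous to draining retired_bufs when the poll group is destroyed. */
static void
drain_retired(void)
{
	struct buf *b;

	while ((b = STAILQ_FIRST(&g_retired)) != NULL) {
		STAILQ_REMOVE_HEAD(&g_retired, link);
		pool_put(b);
	}
}

int
main(void)
{
	struct buf *b = pool_get();

	assert(b != NULL);
	assert(replace_buffer(&b) == 0); /* b now refers to a usable buffer */
	pool_put(b);                     /* normal completion path */
	drain_retired();                 /* teardown path */
	printf("buffers still outstanding: %d\n", g_pool_outstanding); /* prints 0 */
	return 0;
}

The point the sketch makes is the one the patch relies on: a retired buffer is never handed out again, but it remains owned by the poll group, so draining the retired list at teardown keeps the buffer pool balanced.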