From bd6de97a0edfb88ad9040415db67bd65d0dffb39 Mon Sep 17 00:00:00 2001 From: Xiaodong Liu Date: Wed, 20 Dec 2017 03:50:43 -0500 Subject: [PATCH] nbd: improve nbd to handle overlapped I/O Previous nbd implementation only processes one I/O at a time. This patch increases spdk_nbd_disk io queue-depth. Lists are added to accommodate and coordinate overlapped nbd_io. Change-Id: I3bda2c957a561b884ebfe9550b7cb6f3c991aef8 Signed-off-by: Xiaodong Liu Reviewed-on: https://review.gerrithub.io/392609 Tested-by: SPDK Automated Test System Reviewed-by: Changpeng Liu Reviewed-by: Jim Harris --- lib/nbd/nbd.c | 292 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 246 insertions(+), 46 deletions(-) diff --git a/lib/nbd/nbd.c b/lib/nbd/nbd.c index 4642be986..470ab55a9 100644 --- a/lib/nbd/nbd.c +++ b/lib/nbd/nbd.c @@ -46,14 +46,15 @@ #include "spdk/io_channel.h" #include "spdk_internal/log.h" +#include "spdk/queue.h" + +#define GET_IO_LOOP_COUNT 16 enum nbd_io_state_t { /* Receiving or ready to receive nbd request header */ NBD_IO_RECV_REQ = 0, /* Receiving write payload */ NBD_IO_RECV_PAYLOAD, - /* processing or ready to process of SPDK bdev module */ - NBD_IO_PROCESS, /* Transmitting or ready to transmit nbd response header */ NBD_IO_XMIT_RESP, /* Transmitting read payload */ @@ -64,7 +65,6 @@ struct nbd_io { struct spdk_nbd_disk *nbd; enum nbd_io_state_t state; - int ref; void *payload; /* NOTE: for TRIM, this represents number of bytes to trim. */ @@ -78,6 +78,16 @@ struct nbd_io { * response, or payload from the nbd socket. 
*/ uint32_t offset; + + TAILQ_ENTRY(nbd_io) tailq; +}; + +enum nbd_disk_state_t { + NBD_DISK_STATE_RUNNING = 0, + /* soft disconnection caused by receiving nbd_cmd_disc */ + NBD_DISK_STATE_SOFTDISC, + /* hard disconnection caused by mandatory conditions */ + NBD_DISK_STATE_HARDDISC, }; struct spdk_nbd_disk { @@ -88,10 +98,17 @@ struct spdk_nbd_disk { char *nbd_path; int kernel_sp_fd; int spdk_sp_fd; - struct nbd_io io; struct spdk_poller *nbd_poller; uint32_t buf_align; + struct nbd_io *io_in_recv; + TAILQ_HEAD(, nbd_io) received_io_list; + TAILQ_HEAD(, nbd_io) executed_io_list; + + enum nbd_disk_state_t state; + /* count of nbd_io in spdk_nbd_disk */ + int io_count; + TAILQ_ENTRY(spdk_nbd_disk) tailq; }; @@ -205,6 +222,99 @@ nbd_disconnect(struct spdk_nbd_disk *nbd) ioctl(nbd->dev_fd, NBD_DISCONNECT); } +static struct nbd_io * +spdk_get_nbd_io(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io; + + io = calloc(1, sizeof(*io)); + if (!io) { + SPDK_ERRLOG("Unable to get nbd_io\n"); + abort(); + } + + io->nbd = nbd; + to_be32(&io->resp.magic, NBD_REPLY_MAGIC); + + nbd->io_count++; + + return io; +} + +static void +spdk_put_nbd_io(struct spdk_nbd_disk *nbd, struct nbd_io *io) +{ + if (io->payload) { + spdk_dma_free(io->payload); + } + free(io); + + nbd->io_count--; +} + +/* + * Check whether received nbd_io are all transmitted. + * + * \return 1 there is still some nbd_io not transmitted. + * 0 all nbd_io received are transmitted. + */ +static int +spdk_nbd_io_xmit_check(struct spdk_nbd_disk *nbd) +{ + if (nbd->io_count == 0) { + return 0; + } else if (nbd->io_count == 1 && nbd->io_in_recv != NULL) { + return 0; + } + + return 1; +} + +/* + * Check whether received nbd_io are all executed, + * and put back executed nbd_io instead of transmitting them + * + * \return 1 there is still some nbd_io under executing + * 0 all nbd_io gotten are freed. 
+ */ +static int +spdk_nbd_cleanup_io(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io, *io_tmp; + + /* free io_in_recv */ + if (nbd->io_in_recv != NULL) { + spdk_put_nbd_io(nbd, nbd->io_in_recv); + nbd->io_in_recv = NULL; + } + + /* free io in received_io_list */ + if (!TAILQ_EMPTY(&nbd->received_io_list)) { + TAILQ_FOREACH_SAFE(io, &nbd->received_io_list, tailq, io_tmp) { + TAILQ_REMOVE(&nbd->received_io_list, io, tailq); + spdk_put_nbd_io(nbd, io); + } + } + + /* free io in executed_io_list */ + if (!TAILQ_EMPTY(&nbd->executed_io_list)) { + TAILQ_FOREACH_SAFE(io, &nbd->executed_io_list, tailq, io_tmp) { + TAILQ_REMOVE(&nbd->executed_io_list, io, tailq); + spdk_put_nbd_io(nbd, io); + } + } + + /* + * Some nbd_io may be under executing in bdev. + * Wait for their done operation. + */ + if (nbd->io_count != 0) { + return 1; + } + + return 0; +} + static void _nbd_stop(struct spdk_nbd_disk *nbd) { @@ -250,8 +360,12 @@ spdk_nbd_stop(struct spdk_nbd_disk *nbd) return; } - nbd->io.ref--; - if (nbd->io.ref == 0) { + nbd->state = NBD_DISK_STATE_HARDDISC; + + /* + * Stop action should be called only after all nbd_io are executed. 
+ */ + if (!spdk_nbd_cleanup_io(nbd)) { _nbd_stop(nbd); } } @@ -305,14 +419,13 @@ nbd_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) } memcpy(&io->resp.handle, &io->req.handle, sizeof(io->resp.handle)); - io->state = NBD_IO_XMIT_RESP; + TAILQ_INSERT_TAIL(&nbd->executed_io_list, io, tailq); if (bdev_io != NULL) { spdk_bdev_free_io(bdev_io); } - io->ref--; - if (io->ref == 0) { + if (nbd->state == NBD_DISK_STATE_HARDDISC && !spdk_nbd_cleanup_io(nbd)) { _nbd_stop(nbd); } } @@ -324,8 +437,6 @@ nbd_submit_bdev_io(struct spdk_nbd_disk *nbd, struct nbd_io *io) struct spdk_io_channel *ch = nbd->ch; int rc = 0; - io->ref++; - switch (from_be32(&io->req.type)) { case NBD_CMD_READ: rc = spdk_bdev_read(desc, ch, io->payload, from_be64(&io->req.from), @@ -349,8 +460,9 @@ nbd_submit_bdev_io(struct spdk_nbd_disk *nbd, struct nbd_io *io) break; #endif case NBD_CMD_DISC: - io->ref--; - return -ECONNRESET; + spdk_put_nbd_io(nbd, io); + nbd->state = NBD_DISK_STATE_SOFTDISC; + break; default: rc = -1; } @@ -363,28 +475,50 @@ nbd_submit_bdev_io(struct spdk_nbd_disk *nbd, struct nbd_io *io) } static int -spdk_nbd_io_exec(struct nbd_io *io) +spdk_nbd_io_exec(struct spdk_nbd_disk *nbd) { - struct spdk_nbd_disk *nbd = io->nbd; + struct nbd_io *io, *io_tmp; int ret = 0; - if (io->state == NBD_IO_PROCESS) { - ret = nbd_submit_bdev_io(nbd, io); + /* + * For soft disconnection, nbd server must handle all outstanding + * request before closing connection. 
+ */ + if (nbd->state == NBD_DISK_STATE_HARDDISC) { + return 0; + } + + if (!TAILQ_EMPTY(&nbd->received_io_list)) { + TAILQ_FOREACH_SAFE(io, &nbd->received_io_list, tailq, io_tmp) { + TAILQ_REMOVE(&nbd->received_io_list, io, tailq); + ret = nbd_submit_bdev_io(nbd, io); + if (ret < 0) { + break; + } + } } return ret; } static int -spdk_nbd_io_recv(struct nbd_io *io) +spdk_nbd_io_recv_internal(struct spdk_nbd_disk *nbd) { - struct spdk_nbd_disk *nbd = io->nbd; + struct nbd_io *io; int ret = 0; + if (nbd->io_in_recv == NULL) { + nbd->io_in_recv = spdk_get_nbd_io(nbd); + } + + io = nbd->io_in_recv; + if (io->state == NBD_IO_RECV_REQ) { ret = read_from_socket(nbd->spdk_sp_fd, (char *)&io->req + io->offset, sizeof(io->req) - io->offset); - if (ret <= 0) { + if (ret < 0) { + spdk_put_nbd_io(nbd, io); + nbd->io_in_recv = NULL; return ret; } @@ -397,6 +531,8 @@ spdk_nbd_io_recv(struct nbd_io *io) /* req magic check */ if (from_be32(&io->req.magic) != NBD_REQUEST_MAGIC) { SPDK_ERRLOG("invalid request magic\n"); + spdk_put_nbd_io(nbd, io); + nbd->io_in_recv = NULL; return -EINVAL; } @@ -413,6 +549,8 @@ spdk_nbd_io_recv(struct nbd_io *io) io->payload = spdk_dma_malloc(io->payload_size, nbd->buf_align, NULL); if (io->payload == NULL) { SPDK_ERRLOG("could not allocate io->payload of size %d\n", io->payload_size); + spdk_put_nbd_io(nbd, io); + nbd->io_in_recv = NULL; return -ENOMEM; } } else { @@ -423,15 +561,18 @@ spdk_nbd_io_recv(struct nbd_io *io) if (from_be32(&io->req.type) == NBD_CMD_WRITE) { io->state = NBD_IO_RECV_PAYLOAD; } else { - io->state = NBD_IO_PROCESS; - ret = spdk_nbd_io_exec(io); + io->state = NBD_IO_XMIT_RESP; + nbd->io_in_recv = NULL; + TAILQ_INSERT_TAIL(&nbd->received_io_list, io, tailq); } } } if (io->state == NBD_IO_RECV_PAYLOAD) { ret = read_from_socket(nbd->spdk_sp_fd, io->payload + io->offset, io->payload_size - io->offset); - if (ret <= 0) { + if (ret < 0) { + spdk_put_nbd_io(nbd, io); + nbd->io_in_recv = NULL; return ret; } @@ -440,21 +581,50 @@ 
spdk_nbd_io_recv(struct nbd_io *io) /* request payload is fully received */ if (io->offset == io->payload_size) { io->offset = 0; - io->state = NBD_IO_PROCESS; - ret = spdk_nbd_io_exec(io); + io->state = NBD_IO_XMIT_RESP; + nbd->io_in_recv = NULL; + TAILQ_INSERT_TAIL(&nbd->received_io_list, io, tailq); } } - return ret; + return 0; } static int -spdk_nbd_io_xmit(struct nbd_io *io) +spdk_nbd_io_recv(struct spdk_nbd_disk *nbd) { - struct spdk_nbd_disk *nbd = io->nbd; + int i, ret = 0; + + /* + * nbd server should not accept request in both soft and hard + * disconnect states. + */ + if (nbd->state != NBD_DISK_STATE_RUNNING) { + return 0; + } + + for (i = 0; i < GET_IO_LOOP_COUNT; i++) { + ret = spdk_nbd_io_recv_internal(nbd); + if (ret != 0) { + return ret; + } + } + + return 0; +} + +static int +spdk_nbd_io_xmit_internal(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io; int ret = 0; + io = TAILQ_FIRST(&nbd->executed_io_list); + if (io == NULL) { + return 0; + } + /* resp error and handler are already set in io_done */ if (io->state == NBD_IO_XMIT_RESP) { @@ -472,11 +642,9 @@ spdk_nbd_io_xmit(struct nbd_io *io) /* transmit payload only when NBD_CMD_READ with no resp error */ if (from_be32(&io->req.type) != NBD_CMD_READ || io->resp.error != 0) { - io->state = NBD_IO_RECV_REQ; - if (io->payload) { - spdk_dma_free(io->payload); - io->payload = NULL; - } + TAILQ_REMOVE(&nbd->executed_io_list, io, tailq); + spdk_put_nbd_io(nbd, io); + return 0; } else { io->state = NBD_IO_XMIT_PAYLOAD; } @@ -493,18 +661,45 @@ spdk_nbd_io_xmit(struct nbd_io *io) /* read payload is fully transmitted */ if (io->offset == io->payload_size) { - io->offset = 0; - io->state = NBD_IO_RECV_REQ; - if (io->payload) { - spdk_dma_free(io->payload); - io->payload = NULL; - } + TAILQ_REMOVE(&nbd->executed_io_list, io, tailq); + spdk_put_nbd_io(nbd, io); } } return 0; } +static int +spdk_nbd_io_xmit(struct spdk_nbd_disk *nbd) +{ + int ret = 0; + + /* + * For soft disconnection, nbd server must handle 
all outstanding + * request before closing connection. + */ + if (nbd->state == NBD_DISK_STATE_HARDDISC) { + return 0; + } + + while (!TAILQ_EMPTY(&nbd->executed_io_list)) { + ret = spdk_nbd_io_xmit_internal(nbd); + if (ret != 0) { + return ret; + } + } + + /* + * For soft disconnection, nbd server can close connection after all + * outstanding request are transmitted. + */ + if (nbd->state == NBD_DISK_STATE_SOFTDISC && !spdk_nbd_io_xmit_check(nbd)) { + return -1; + } + + return 0; +} + /** * Poll an NBD instance. * @@ -513,15 +708,20 @@ spdk_nbd_io_xmit(struct nbd_io *io) static int _spdk_nbd_poll(struct spdk_nbd_disk *nbd) { - struct nbd_io *io = &nbd->io; int rc; - rc = spdk_nbd_io_recv(io); + /* transmit executed io first */ + rc = spdk_nbd_io_xmit(nbd); if (rc < 0) { return rc; } - rc = spdk_nbd_io_xmit(io); + rc = spdk_nbd_io_recv(nbd); + if (rc < 0) { + return rc; + } + + rc = spdk_nbd_io_exec(nbd); return rc; } @@ -582,7 +782,6 @@ spdk_nbd_start(const char *bdev_name, const char *nbd_path) return NULL; } - nbd->io.nbd = nbd; nbd->dev_fd = -1; nbd->spdk_sp_fd = -1; nbd->kernel_sp_fd = -1; @@ -593,7 +792,6 @@ spdk_nbd_start(const char *bdev_name, const char *nbd_path) goto err; } - nbd->io.ref = 1; nbd->bdev = bdev; nbd->ch = spdk_bdev_get_io_channel(nbd->bdev_desc); @@ -612,6 +810,10 @@ spdk_nbd_start(const char *bdev_name, const char *nbd_path) SPDK_ERRLOG("strdup allocation failure\n"); goto err; } + + TAILQ_INIT(&nbd->received_io_list); + TAILQ_INIT(&nbd->executed_io_list); + /* Add nbd_disk to the end of disk list */ rc = spdk_nbd_disk_register(nbd); if (rc != 0) { @@ -678,8 +880,6 @@ spdk_nbd_start(const char *bdev_name, const char *nbd_path) goto err; } - to_be32(&nbd->io.resp.magic, NBD_REPLY_MAGIC); - nbd->nbd_poller = spdk_poller_register(spdk_nbd_poll, nbd, 0); return nbd;