diff --git a/lib/vhost/rte_vhost_user.c b/lib/vhost/rte_vhost_user.c index 88a3d0d5a..068beb5f7 100644 --- a/lib/vhost/rte_vhost_user.c +++ b/lib/vhost/rte_vhost_user.c @@ -354,14 +354,6 @@ int vhost_vq_used_signal(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue) { - /* The flag is true when DPDK "vhost-events" thread is holding all - * VQ's access lock, we will skip to post IRQs this round poll, and - * try to post IRQs in next poll or after starting the device again. - */ - if (spdk_unlikely(vsession->skip_used_signal)) { - return 0; - } - if (virtqueue->used_req_cnt == 0) { return 0; } @@ -905,6 +897,7 @@ _stop_session(struct spdk_vhost_session *vsession) } rte_vhost_set_vring_base(vsession->vid, i, q->last_avail_idx, q->last_used_idx); + q->vring.desc = NULL; } vsession->max_queues = 0; @@ -1001,6 +994,46 @@ vhost_user_session_start(struct spdk_vhost_dev *vdev, struct spdk_vhost_session return vhost_user_session_send_event(vsession, vhost_user_session_start_cb, 3, "start session"); } +static int +set_device_vq_callfd(struct spdk_vhost_session *vsession, uint16_t qid) +{ + struct spdk_vhost_virtqueue *q; + + if (qid >= SPDK_VHOST_MAX_VQUEUES) { + return -EINVAL; + } + + q = &vsession->virtqueue[qid]; + /* vq isn't enabled yet */ + if (q->vring_idx != qid) { + return 0; + } + + /* vring.desc and vring.desc_packed are in a union struct + * so q->vring.desc can replace q->vring.desc_packed. + */ + if (q->vring.desc == NULL || q->vring.size == 0) { + return 0; + } + + /* + * Not sure right now but this look like some kind of QEMU bug and guest IO + * might be frozed without kicking all queues after live-migration. This look like + * the previous vhost instance failed to effectively deliver all interrupts before + * the GET_VRING_BASE message. This shouldn't harm guest since spurious interrupts + * should be ignored by guest virtio driver. + * + * Tested on QEMU 2.10.91 and 2.11.50. + * + * Make sure a successful call of + * `rte_vhost_vring_call` will happen + * after starting the device. + */ + q->used_req_cnt += 1; + + return 0; +} + static int enable_device_vq(struct spdk_vhost_session *vsession, uint16_t qid) { @@ -1043,21 +1076,6 @@ enable_device_vq(struct spdk_vhost_session *vsession, uint16_t qid) return rc; } - /* - * Not sure right now but this look like some kind of QEMU bug and guest IO - * might be frozed without kicking all queues after live-migration. This look like - * the previous vhost instance failed to effectively deliver all interrupts before - * the GET_VRING_BASE message. This shouldn't harm guest since spurious interrupts - * should be ignored by guest virtio driver. - * - * Tested on QEMU 2.10.91 and 2.11.50. - * - * Make sure a successful call of - * `rte_vhost_vring_call` will happen - * after starting the device. - */ - q->used_req_cnt += 1; - if (packed_ring) { /* Use the inflight mem to restore the last_avail_idx and last_used_idx. * When the vring format is packed, there is no used_idx in the @@ -1103,7 +1121,6 @@ start_device(int vid) struct spdk_vhost_dev *vdev; struct spdk_vhost_session *vsession; int rc = -1; - uint16_t i; spdk_vhost_lock(); @@ -1125,13 +1142,6 @@ start_device(int vid) goto out; } - for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) { - rc = enable_device_vq(vsession, i); - if (rc != 0) { - goto out; - } - } - vhost_user_session_set_coalescing(vdev, vsession, NULL); vsession->initialized = true; rc = vhost_user_session_start(vdev, vsession); @@ -1473,61 +1483,8 @@ extern_vhost_pre_msg_handler(int vid, void *_msg) switch (msg->request) { case VHOST_USER_GET_VRING_BASE: - if (vsession->forced_polling && vsession->started) { - /* Our queue is stopped for whatever reason, but we may still - * need to poll it after it's initialized again. - */ - g_spdk_vhost_ops.destroy_device(vid); - } - break; - case VHOST_USER_SET_VRING_BASE: - case VHOST_USER_SET_VRING_ADDR: - case VHOST_USER_SET_VRING_NUM: - /* For vhost-user socket messages except VHOST_USER_GET_VRING_BASE, - * rte_vhost holds all VQ's access lock, then after DPDK 22.07 release, - * `rte_vhost_vring_call` also needs to hold VQ's access lock, so we - * can't call this function in DPDK "vhost-events" thread context, here - * SPDK vring poller will avoid executing this function when it's TRUE. - */ - vsession->skip_used_signal = true; - if (vsession->forced_polling && vsession->started) { - /* Additional queues are being initialized, so we either processed - * enough I/Os and are switching from SeaBIOS to the OS now, or - * we were never in SeaBIOS in the first place. Either way, we - * don't need our workaround anymore. - */ - g_spdk_vhost_ops.destroy_device(vid); - vsession->forced_polling = false; - } - break; - case VHOST_USER_SET_VRING_KICK: - /* rte_vhost(after 20.08) will call new_device after one active vring is - * configured, we will start the session before all vrings are available, - * so for each new vring, if the session is started, we need to restart it - * again. - */ - case VHOST_USER_SET_VRING_CALL: - /* rte_vhost will close the previous callfd and won't notify - * us about any change. This will effectively make SPDK fail - * to deliver any subsequent interrupts until a session is - * restarted. We stop the session here before closing the previous - * fd (so that all interrupts must have been delivered by the - * time the descriptor is closed) and start right after (which - * will make SPDK retrieve the latest, up-to-date callfd from - * rte_vhost. - */ - case VHOST_USER_SET_MEM_TABLE: - vsession->skip_used_signal = true; - /* rte_vhost will unmap previous memory that SPDK may still - * have pending DMA operations on. We can't let that happen, - * so stop the device before letting rte_vhost unmap anything. - * This will block until all pending I/Os are finished. - * We will start the device again from the post-processing - * message handler. - */ if (vsession->started) { g_spdk_vhost_ops.destroy_device(vid); - vsession->needs_restart = true; } break; case VHOST_USER_GET_CONFIG: { @@ -1562,7 +1519,6 @@ extern_vhost_pre_msg_handler(int vid, void *_msg) break; } - vsession->skip_used_signal = false; return RTE_VHOST_MSG_RESULT_NOT_HANDLED; } @@ -1571,6 +1527,7 @@ extern_vhost_post_msg_handler(int vid, void *_msg) { struct vhost_user_msg *msg = _msg; struct spdk_vhost_session *vsession; + uint16_t qid; int rc; vsession = vhost_session_find_by_vid(vid); @@ -1584,12 +1541,6 @@ extern_vhost_post_msg_handler(int vid, void *_msg) vhost_register_memtable_if_required(vsession, vid); } - if (vsession->needs_restart) { - g_spdk_vhost_ops.new_device(vid); - vsession->needs_restart = false; - return RTE_VHOST_MSG_RESULT_NOT_HANDLED; - } - switch (msg->request) { case VHOST_USER_SET_FEATURES: rc = vhost_get_negotiated_features(vid, &vsession->negotiated_features); @@ -1597,28 +1548,25 @@ extern_vhost_post_msg_handler(int vid, void *_msg) SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid); return RTE_VHOST_MSG_RESULT_ERR; } - - /* rte_vhost requires all queues to be fully initialized in order - * to start I/O processing. This behavior is not compliant with the - * vhost-user specification and doesn't work with QEMU 2.12+, which - * will only initialize 1 I/O queue for the SeaBIOS boot. - * Theoretically, we should start polling each virtqueue individually - * after receiving its SET_VRING_KICK message, but rte_vhost is not - * designed to poll individual queues. So here we use a workaround - * to detect when the vhost session could be potentially at that SeaBIOS - * stage and we mark it to start polling as soon as its first virtqueue - * gets initialized. This doesn't hurt any non-QEMU vhost slaves - * and allows QEMU 2.12+ to boot correctly. SET_FEATURES could be sent - * at any time, but QEMU will send it at least once on SeaBIOS - * initialization - whenever powered-up or rebooted. - */ - vsession->forced_polling = true; + break; + case VHOST_USER_SET_VRING_CALL: + qid = (uint16_t)msg->payload.u64; + rc = set_device_vq_callfd(vsession, qid); + if (rc) { + return RTE_VHOST_MSG_RESULT_ERR; + } break; case VHOST_USER_SET_VRING_KICK: + qid = (uint16_t)msg->payload.u64; + rc = enable_device_vq(vsession, qid); + if (rc) { + return RTE_VHOST_MSG_RESULT_ERR; + } + /* vhost-user spec tells us to start polling a queue after receiving * its SET_VRING_KICK message. Let's do it! */ - if (vsession->forced_polling && !vsession->started) { + if (!vsession->started) { g_spdk_vhost_ops.new_device(vid); } break; diff --git a/lib/vhost/vhost_blk.c b/lib/vhost/vhost_blk.c index dbe2086f9..d32b42265 100644 --- a/lib/vhost/vhost_blk.c +++ b/lib/vhost/vhost_blk.c @@ -1268,7 +1268,6 @@ alloc_vq_task_pool(struct spdk_vhost_session *vsession, uint16_t qid) /* sanity check */ SPDK_ERRLOG("%s: virtqueue %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n", vsession->name, qid, task_cnt, SPDK_VHOST_MAX_VQ_SIZE); - free_task_pool(bvsession); return -1; } vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_user_blk_task) * task_cnt, @@ -1277,7 +1276,6 @@ alloc_vq_task_pool(struct spdk_vhost_session *vsession, uint16_t qid) if (vq->tasks == NULL) { SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", vsession->name, task_cnt, qid); - free_task_pool(bvsession); return -1; } @@ -1299,9 +1297,11 @@ vhost_blk_start(struct spdk_vhost_dev *vdev, struct spdk_vhost_blk_dev *bvdev; int i, rc = 0; - bvdev = to_blk_dev(vdev); - assert(bvdev != NULL); - bvsession->bvdev = bvdev; + /* return if start is already in progress */ + if (bvsession->requestq_poller) { + SPDK_INFOLOG(vhost, "%s: start in progress\n", vsession->name); + return -EINPROGRESS; + } /* validate all I/O queues are in a contiguous index range */ for (i = 0; i < vsession->max_queues; i++) { @@ -1314,6 +1314,10 @@ vhost_blk_start(struct spdk_vhost_dev *vdev, } } + bvdev = to_blk_dev(vdev); + assert(bvdev != NULL); + bvsession->bvdev = bvdev; + if (bvdev->bdev) { bvsession->io_channel = vhost_blk_get_io_channel(vdev); if (!bvsession->io_channel) { @@ -1351,7 +1355,7 @@ vhost_blk_start(struct spdk_vhost_dev *vdev, spdk_poller_register_interrupt(bvsession->requestq_poller, vhost_blk_poller_set_interrupt_mode, bvsession); - return rc; + return 0; } static int @@ -1401,6 +1405,11 @@ vhost_blk_stop_cb(struct spdk_vhost_dev *vdev, { struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession); + /* return if stop is already in progress */ + if (bvsession->stop_poller) { + return -EINPROGRESS; + } + spdk_poller_unregister(&bvsession->requestq_poller); if (vsession->virtqueue[0].intr) { diff --git a/lib/vhost/vhost_internal.h b/lib/vhost/vhost_internal.h index 7cd9b319b..1f03afede 100644 --- a/lib/vhost/vhost_internal.h +++ b/lib/vhost/vhost_internal.h @@ -116,10 +116,7 @@ struct spdk_vhost_session { bool initialized; bool started; - bool needs_restart; - bool forced_polling; bool interrupt_mode; - bool skip_used_signal; struct rte_vhost_memory *mem; diff --git a/lib/vhost/vhost_scsi.c b/lib/vhost/vhost_scsi.c index 8ab71d557..b3082a672 100644 --- a/lib/vhost/vhost_scsi.c +++ b/lib/vhost/vhost_scsi.c @@ -1330,7 +1330,6 @@ alloc_vq_task_pool(struct spdk_vhost_session *vsession, uint16_t qid) /* sanity check */ SPDK_ERRLOG("%s: virtqueue %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n", vsession->name, qid, task_cnt, SPDK_VHOST_MAX_VQ_SIZE); - free_task_pool(svsession); return -1; } vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_scsi_task) * task_cnt, @@ -1339,7 +1338,6 @@ alloc_vq_task_pool(struct spdk_vhost_session *vsession, uint16_t qid) if (vq->tasks == NULL) { SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", vsession->name, task_cnt, qid); - free_task_pool(svsession); return -1; } @@ -1363,9 +1361,11 @@ vhost_scsi_start(struct spdk_vhost_dev *vdev, uint32_t i; int rc; - svdev = to_scsi_dev(vsession->vdev); - assert(svdev != NULL); - svsession->svdev = svdev; + /* return if start is already in progress */ + if (svsession->requestq_poller) { + SPDK_INFOLOG(vhost, "%s: start in progress\n", vsession->name); + return -EINPROGRESS; + } /* validate all I/O queues are in a contiguous index range */ if (vsession->max_queues < VIRTIO_SCSI_REQUESTQ + 1) { @@ -1379,6 +1379,10 @@ vhost_scsi_start(struct spdk_vhost_dev *vdev, } } + svdev = to_scsi_dev(vsession->vdev); + assert(svdev != NULL); + svsession->svdev = svdev; + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { state = &svdev->scsi_dev_state[i]; if (state->dev == NULL || state->status == VHOST_SCSI_DEV_REMOVING) { @@ -1407,7 +1411,7 @@ vhost_scsi_start(struct spdk_vhost_dev *vdev, svsession->requestq_poller = SPDK_POLLER_REGISTER(vdev_worker, svsession, 0); svsession->mgmt_poller = SPDK_POLLER_REGISTER(vdev_mgmt_worker, svsession, MGMT_POLL_PERIOD_US); - return rc; + return 0; } static int @@ -1477,6 +1481,11 @@ vhost_scsi_stop_cb(struct spdk_vhost_dev *vdev, { struct spdk_vhost_scsi_session *svsession = to_scsi_session(vsession); + /* return if stop is already in progress */ + if (svsession->stop_poller) { + return -EINPROGRESS; + } + /* Stop receiving new I/O requests */ spdk_poller_unregister(&svsession->requestq_poller);