lib/vhost: don't restart device multiple times
We will stop/start the device multiple times when a new vring is added, and also stop/start the device when set vring's callfd, actually we only need to start the device after a I/O queue is enabled, DPDK rte_vhost will not help us to start the device in some scenarios, so this is controlled in SPDK. Now we improve the workaround to make it consistent with vhost-user specification. For each SET_VRING_KICK message, we will setup the new added vring, and then we try to start the device. For each SET_VRING_CALL message, we will add one more interrupt count, previously this is done when enable the vring, which is not accurate. For each GET_VRING_BASE message, we will stop the device before the first message. With above changes, we will start/stop the device once, any new added vrings after starting the device will be polled in next `vdev_worker` poller. Change-Id: I5a87c73d34ce7c5f96db7502a68c5fa2cb2e4f74 Signed-off-by: Changpeng Liu <changpeng.liu@intel.com> Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/14928 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Community-CI: Mellanox Build Bot Reviewed-by: Ben Walker <benjamin.walker@intel.com> Reviewed-by: Jim Harris <james.r.harris@intel.com>
This commit is contained in:
parent
b7facb30f8
commit
23baa6761d
@ -354,14 +354,6 @@ int
|
||||
vhost_vq_used_signal(struct spdk_vhost_session *vsession,
|
||||
struct spdk_vhost_virtqueue *virtqueue)
|
||||
{
|
||||
/* The flag is true when DPDK "vhost-events" thread is holding all
|
||||
* VQ's access lock, we will skip to post IRQs this round poll, and
|
||||
* try to post IRQs in next poll or after starting the device again.
|
||||
*/
|
||||
if (spdk_unlikely(vsession->skip_used_signal)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (virtqueue->used_req_cnt == 0) {
|
||||
return 0;
|
||||
}
|
||||
@ -905,6 +897,7 @@ _stop_session(struct spdk_vhost_session *vsession)
|
||||
}
|
||||
|
||||
rte_vhost_set_vring_base(vsession->vid, i, q->last_avail_idx, q->last_used_idx);
|
||||
q->vring.desc = NULL;
|
||||
}
|
||||
vsession->max_queues = 0;
|
||||
|
||||
@ -1001,6 +994,46 @@ vhost_user_session_start(struct spdk_vhost_dev *vdev, struct spdk_vhost_session
|
||||
return vhost_user_session_send_event(vsession, vhost_user_session_start_cb, 3, "start session");
|
||||
}
|
||||
|
||||
static int
|
||||
set_device_vq_callfd(struct spdk_vhost_session *vsession, uint16_t qid)
|
||||
{
|
||||
struct spdk_vhost_virtqueue *q;
|
||||
|
||||
if (qid >= SPDK_VHOST_MAX_VQUEUES) {
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
q = &vsession->virtqueue[qid];
|
||||
/* vq isn't enabled yet */
|
||||
if (q->vring_idx != qid) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* vring.desc and vring.desc_packed are in a union struct
|
||||
* so q->vring.desc can replace q->vring.desc_packed.
|
||||
*/
|
||||
if (q->vring.desc == NULL || q->vring.size == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Not sure right now but this look like some kind of QEMU bug and guest IO
|
||||
* might be frozed without kicking all queues after live-migration. This look like
|
||||
* the previous vhost instance failed to effectively deliver all interrupts before
|
||||
* the GET_VRING_BASE message. This shouldn't harm guest since spurious interrupts
|
||||
* should be ignored by guest virtio driver.
|
||||
*
|
||||
* Tested on QEMU 2.10.91 and 2.11.50.
|
||||
*
|
||||
* Make sure a successful call of
|
||||
* `rte_vhost_vring_call` will happen
|
||||
* after starting the device.
|
||||
*/
|
||||
q->used_req_cnt += 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
enable_device_vq(struct spdk_vhost_session *vsession, uint16_t qid)
|
||||
{
|
||||
@ -1043,21 +1076,6 @@ enable_device_vq(struct spdk_vhost_session *vsession, uint16_t qid)
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* Not sure right now but this look like some kind of QEMU bug and guest IO
|
||||
* might be frozed without kicking all queues after live-migration. This look like
|
||||
* the previous vhost instance failed to effectively deliver all interrupts before
|
||||
* the GET_VRING_BASE message. This shouldn't harm guest since spurious interrupts
|
||||
* should be ignored by guest virtio driver.
|
||||
*
|
||||
* Tested on QEMU 2.10.91 and 2.11.50.
|
||||
*
|
||||
* Make sure a successful call of
|
||||
* `rte_vhost_vring_call` will happen
|
||||
* after starting the device.
|
||||
*/
|
||||
q->used_req_cnt += 1;
|
||||
|
||||
if (packed_ring) {
|
||||
/* Use the inflight mem to restore the last_avail_idx and last_used_idx.
|
||||
* When the vring format is packed, there is no used_idx in the
|
||||
@ -1103,7 +1121,6 @@ start_device(int vid)
|
||||
struct spdk_vhost_dev *vdev;
|
||||
struct spdk_vhost_session *vsession;
|
||||
int rc = -1;
|
||||
uint16_t i;
|
||||
|
||||
spdk_vhost_lock();
|
||||
|
||||
@ -1125,13 +1142,6 @@ start_device(int vid)
|
||||
goto out;
|
||||
}
|
||||
|
||||
for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) {
|
||||
rc = enable_device_vq(vsession, i);
|
||||
if (rc != 0) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
vhost_user_session_set_coalescing(vdev, vsession, NULL);
|
||||
vsession->initialized = true;
|
||||
rc = vhost_user_session_start(vdev, vsession);
|
||||
@ -1473,61 +1483,8 @@ extern_vhost_pre_msg_handler(int vid, void *_msg)
|
||||
|
||||
switch (msg->request) {
|
||||
case VHOST_USER_GET_VRING_BASE:
|
||||
if (vsession->forced_polling && vsession->started) {
|
||||
/* Our queue is stopped for whatever reason, but we may still
|
||||
* need to poll it after it's initialized again.
|
||||
*/
|
||||
g_spdk_vhost_ops.destroy_device(vid);
|
||||
}
|
||||
break;
|
||||
case VHOST_USER_SET_VRING_BASE:
|
||||
case VHOST_USER_SET_VRING_ADDR:
|
||||
case VHOST_USER_SET_VRING_NUM:
|
||||
/* For vhost-user socket messages except VHOST_USER_GET_VRING_BASE,
|
||||
* rte_vhost holds all VQ's access lock, then after DPDK 22.07 release,
|
||||
* `rte_vhost_vring_call` also needs to hold VQ's access lock, so we
|
||||
* can't call this function in DPDK "vhost-events" thread context, here
|
||||
* SPDK vring poller will avoid executing this function when it's TRUE.
|
||||
*/
|
||||
vsession->skip_used_signal = true;
|
||||
if (vsession->forced_polling && vsession->started) {
|
||||
/* Additional queues are being initialized, so we either processed
|
||||
* enough I/Os and are switching from SeaBIOS to the OS now, or
|
||||
* we were never in SeaBIOS in the first place. Either way, we
|
||||
* don't need our workaround anymore.
|
||||
*/
|
||||
g_spdk_vhost_ops.destroy_device(vid);
|
||||
vsession->forced_polling = false;
|
||||
}
|
||||
break;
|
||||
case VHOST_USER_SET_VRING_KICK:
|
||||
/* rte_vhost(after 20.08) will call new_device after one active vring is
|
||||
* configured, we will start the session before all vrings are available,
|
||||
* so for each new vring, if the session is started, we need to restart it
|
||||
* again.
|
||||
*/
|
||||
case VHOST_USER_SET_VRING_CALL:
|
||||
/* rte_vhost will close the previous callfd and won't notify
|
||||
* us about any change. This will effectively make SPDK fail
|
||||
* to deliver any subsequent interrupts until a session is
|
||||
* restarted. We stop the session here before closing the previous
|
||||
* fd (so that all interrupts must have been delivered by the
|
||||
* time the descriptor is closed) and start right after (which
|
||||
* will make SPDK retrieve the latest, up-to-date callfd from
|
||||
* rte_vhost.
|
||||
*/
|
||||
case VHOST_USER_SET_MEM_TABLE:
|
||||
vsession->skip_used_signal = true;
|
||||
/* rte_vhost will unmap previous memory that SPDK may still
|
||||
* have pending DMA operations on. We can't let that happen,
|
||||
* so stop the device before letting rte_vhost unmap anything.
|
||||
* This will block until all pending I/Os are finished.
|
||||
* We will start the device again from the post-processing
|
||||
* message handler.
|
||||
*/
|
||||
if (vsession->started) {
|
||||
g_spdk_vhost_ops.destroy_device(vid);
|
||||
vsession->needs_restart = true;
|
||||
}
|
||||
break;
|
||||
case VHOST_USER_GET_CONFIG: {
|
||||
@ -1562,7 +1519,6 @@ extern_vhost_pre_msg_handler(int vid, void *_msg)
|
||||
break;
|
||||
}
|
||||
|
||||
vsession->skip_used_signal = false;
|
||||
return RTE_VHOST_MSG_RESULT_NOT_HANDLED;
|
||||
}
|
||||
|
||||
@ -1571,6 +1527,7 @@ extern_vhost_post_msg_handler(int vid, void *_msg)
|
||||
{
|
||||
struct vhost_user_msg *msg = _msg;
|
||||
struct spdk_vhost_session *vsession;
|
||||
uint16_t qid;
|
||||
int rc;
|
||||
|
||||
vsession = vhost_session_find_by_vid(vid);
|
||||
@ -1584,12 +1541,6 @@ extern_vhost_post_msg_handler(int vid, void *_msg)
|
||||
vhost_register_memtable_if_required(vsession, vid);
|
||||
}
|
||||
|
||||
if (vsession->needs_restart) {
|
||||
g_spdk_vhost_ops.new_device(vid);
|
||||
vsession->needs_restart = false;
|
||||
return RTE_VHOST_MSG_RESULT_NOT_HANDLED;
|
||||
}
|
||||
|
||||
switch (msg->request) {
|
||||
case VHOST_USER_SET_FEATURES:
|
||||
rc = vhost_get_negotiated_features(vid, &vsession->negotiated_features);
|
||||
@ -1597,28 +1548,25 @@ extern_vhost_post_msg_handler(int vid, void *_msg)
|
||||
SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid);
|
||||
return RTE_VHOST_MSG_RESULT_ERR;
|
||||
}
|
||||
|
||||
/* rte_vhost requires all queues to be fully initialized in order
|
||||
* to start I/O processing. This behavior is not compliant with the
|
||||
* vhost-user specification and doesn't work with QEMU 2.12+, which
|
||||
* will only initialize 1 I/O queue for the SeaBIOS boot.
|
||||
* Theoretically, we should start polling each virtqueue individually
|
||||
* after receiving its SET_VRING_KICK message, but rte_vhost is not
|
||||
* designed to poll individual queues. So here we use a workaround
|
||||
* to detect when the vhost session could be potentially at that SeaBIOS
|
||||
* stage and we mark it to start polling as soon as its first virtqueue
|
||||
* gets initialized. This doesn't hurt any non-QEMU vhost slaves
|
||||
* and allows QEMU 2.12+ to boot correctly. SET_FEATURES could be sent
|
||||
* at any time, but QEMU will send it at least once on SeaBIOS
|
||||
* initialization - whenever powered-up or rebooted.
|
||||
*/
|
||||
vsession->forced_polling = true;
|
||||
break;
|
||||
case VHOST_USER_SET_VRING_CALL:
|
||||
qid = (uint16_t)msg->payload.u64;
|
||||
rc = set_device_vq_callfd(vsession, qid);
|
||||
if (rc) {
|
||||
return RTE_VHOST_MSG_RESULT_ERR;
|
||||
}
|
||||
break;
|
||||
case VHOST_USER_SET_VRING_KICK:
|
||||
qid = (uint16_t)msg->payload.u64;
|
||||
rc = enable_device_vq(vsession, qid);
|
||||
if (rc) {
|
||||
return RTE_VHOST_MSG_RESULT_ERR;
|
||||
}
|
||||
|
||||
/* vhost-user spec tells us to start polling a queue after receiving
|
||||
* its SET_VRING_KICK message. Let's do it!
|
||||
*/
|
||||
if (vsession->forced_polling && !vsession->started) {
|
||||
if (!vsession->started) {
|
||||
g_spdk_vhost_ops.new_device(vid);
|
||||
}
|
||||
break;
|
||||
|
@ -1268,7 +1268,6 @@ alloc_vq_task_pool(struct spdk_vhost_session *vsession, uint16_t qid)
|
||||
/* sanity check */
|
||||
SPDK_ERRLOG("%s: virtqueue %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
|
||||
vsession->name, qid, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
|
||||
free_task_pool(bvsession);
|
||||
return -1;
|
||||
}
|
||||
vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_user_blk_task) * task_cnt,
|
||||
@ -1277,7 +1276,6 @@ alloc_vq_task_pool(struct spdk_vhost_session *vsession, uint16_t qid)
|
||||
if (vq->tasks == NULL) {
|
||||
SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
|
||||
vsession->name, task_cnt, qid);
|
||||
free_task_pool(bvsession);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@ -1299,9 +1297,11 @@ vhost_blk_start(struct spdk_vhost_dev *vdev,
|
||||
struct spdk_vhost_blk_dev *bvdev;
|
||||
int i, rc = 0;
|
||||
|
||||
bvdev = to_blk_dev(vdev);
|
||||
assert(bvdev != NULL);
|
||||
bvsession->bvdev = bvdev;
|
||||
/* return if start is already in progress */
|
||||
if (bvsession->requestq_poller) {
|
||||
SPDK_INFOLOG(vhost, "%s: start in progress\n", vsession->name);
|
||||
return -EINPROGRESS;
|
||||
}
|
||||
|
||||
/* validate all I/O queues are in a contiguous index range */
|
||||
for (i = 0; i < vsession->max_queues; i++) {
|
||||
@ -1314,6 +1314,10 @@ vhost_blk_start(struct spdk_vhost_dev *vdev,
|
||||
}
|
||||
}
|
||||
|
||||
bvdev = to_blk_dev(vdev);
|
||||
assert(bvdev != NULL);
|
||||
bvsession->bvdev = bvdev;
|
||||
|
||||
if (bvdev->bdev) {
|
||||
bvsession->io_channel = vhost_blk_get_io_channel(vdev);
|
||||
if (!bvsession->io_channel) {
|
||||
@ -1351,7 +1355,7 @@ vhost_blk_start(struct spdk_vhost_dev *vdev,
|
||||
spdk_poller_register_interrupt(bvsession->requestq_poller, vhost_blk_poller_set_interrupt_mode,
|
||||
bvsession);
|
||||
|
||||
return rc;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
@ -1401,6 +1405,11 @@ vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
|
||||
{
|
||||
struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
|
||||
|
||||
/* return if stop is already in progress */
|
||||
if (bvsession->stop_poller) {
|
||||
return -EINPROGRESS;
|
||||
}
|
||||
|
||||
spdk_poller_unregister(&bvsession->requestq_poller);
|
||||
|
||||
if (vsession->virtqueue[0].intr) {
|
||||
|
@ -116,10 +116,7 @@ struct spdk_vhost_session {
|
||||
|
||||
bool initialized;
|
||||
bool started;
|
||||
bool needs_restart;
|
||||
bool forced_polling;
|
||||
bool interrupt_mode;
|
||||
bool skip_used_signal;
|
||||
|
||||
struct rte_vhost_memory *mem;
|
||||
|
||||
|
@ -1330,7 +1330,6 @@ alloc_vq_task_pool(struct spdk_vhost_session *vsession, uint16_t qid)
|
||||
/* sanity check */
|
||||
SPDK_ERRLOG("%s: virtqueue %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
|
||||
vsession->name, qid, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
|
||||
free_task_pool(svsession);
|
||||
return -1;
|
||||
}
|
||||
vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_scsi_task) * task_cnt,
|
||||
@ -1339,7 +1338,6 @@ alloc_vq_task_pool(struct spdk_vhost_session *vsession, uint16_t qid)
|
||||
if (vq->tasks == NULL) {
|
||||
SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
|
||||
vsession->name, task_cnt, qid);
|
||||
free_task_pool(svsession);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@ -1363,9 +1361,11 @@ vhost_scsi_start(struct spdk_vhost_dev *vdev,
|
||||
uint32_t i;
|
||||
int rc;
|
||||
|
||||
svdev = to_scsi_dev(vsession->vdev);
|
||||
assert(svdev != NULL);
|
||||
svsession->svdev = svdev;
|
||||
/* return if start is already in progress */
|
||||
if (svsession->requestq_poller) {
|
||||
SPDK_INFOLOG(vhost, "%s: start in progress\n", vsession->name);
|
||||
return -EINPROGRESS;
|
||||
}
|
||||
|
||||
/* validate all I/O queues are in a contiguous index range */
|
||||
if (vsession->max_queues < VIRTIO_SCSI_REQUESTQ + 1) {
|
||||
@ -1379,6 +1379,10 @@ vhost_scsi_start(struct spdk_vhost_dev *vdev,
|
||||
}
|
||||
}
|
||||
|
||||
svdev = to_scsi_dev(vsession->vdev);
|
||||
assert(svdev != NULL);
|
||||
svsession->svdev = svdev;
|
||||
|
||||
for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) {
|
||||
state = &svdev->scsi_dev_state[i];
|
||||
if (state->dev == NULL || state->status == VHOST_SCSI_DEV_REMOVING) {
|
||||
@ -1407,7 +1411,7 @@ vhost_scsi_start(struct spdk_vhost_dev *vdev,
|
||||
svsession->requestq_poller = SPDK_POLLER_REGISTER(vdev_worker, svsession, 0);
|
||||
svsession->mgmt_poller = SPDK_POLLER_REGISTER(vdev_mgmt_worker, svsession,
|
||||
MGMT_POLL_PERIOD_US);
|
||||
return rc;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
@ -1477,6 +1481,11 @@ vhost_scsi_stop_cb(struct spdk_vhost_dev *vdev,
|
||||
{
|
||||
struct spdk_vhost_scsi_session *svsession = to_scsi_session(vsession);
|
||||
|
||||
/* return if stop is already in progress */
|
||||
if (svsession->stop_poller) {
|
||||
return -EINPROGRESS;
|
||||
}
|
||||
|
||||
/* Stop receiving new I/O requests */
|
||||
spdk_poller_unregister(&svsession->requestq_poller);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user