Spdk/lib/vhost/vhost_blk.c
Changpeng Liu b45556e2b2 include/bdev_module.h: add SPDK_ prefix to macros
`BDEV_IO_NUM_CHILD_IOV` and `BDEV_RESET_IO_DRAIN_RECOMMENDED_VALUE`
are public macro definitions without `SPDK_` prefix, so we add the
`SPDK_` prefix to them.

Change-Id: I4be86459f0b6ba3a4636a2c8130b2f12757ea2da
Signed-off-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/15425
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Paul Luse <paul.e.luse@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com>
2022-11-22 10:03:57 +00:00

1799 lines
50 KiB
C

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (C) 2017 Intel Corporation. All rights reserved.
* All rights reserved.
*/
#include <linux/virtio_blk.h>
#include "spdk/env.h"
#include "spdk/bdev.h"
#include "spdk/bdev_module.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/vhost.h"
#include "vhost_internal.h"
#include <rte_version.h>
/*
 * Minimal set of features supported by every SPDK VHOST-BLK device.
 * Extends the generic SPDK_VHOST_FEATURES mask with virtio-blk specific bits.
 */
#define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \
(1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | \
(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \
(1ULL << VIRTIO_BLK_F_TOPOLOGY) | (1ULL << VIRTIO_BLK_F_BARRIER) | \
(1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
(1ULL << VIRTIO_BLK_F_MQ))
/*
 * Not supported features.
 * GEOMETRY, CONFIG_WCE, BARRIER and SCSI are listed both here and in
 * SPDK_VHOST_BLK_FEATURES_BASE.
 * NOTE(review): presumably this mask is subtracted from the base mask where
 * the device is constructed - confirm at the use site.
 */
#define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \
(1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
(1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI))
/* Vhost-user protocol features supported by vhost-blk: CONFIG and INFLIGHT_SHMFD. */
#define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \
(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
/* Name of the default virtio-blk transport (its users are outside this excerpt). */
#define VIRTIO_BLK_DEFAULT_TRANSPORT "vhost_user_blk"
/*
 * Per-request state of the vhost-user transport. It wraps the generic
 * virtio-blk task and records what is needed to return the request to the
 * guest through the virtqueue it came from.
 */
struct spdk_vhost_user_blk_task {
/* Generic task; completion callbacks recover this struct from it via SPDK_CONTAINEROF. */
struct spdk_vhost_blk_task blk_task;
/* Session owning the task; its task_cnt tracks tasks in flight. */
struct spdk_vhost_blk_session *bvsession;
/* Virtqueue the request is completed on. */
struct spdk_vhost_virtqueue *vq;
/* Index of the request's head descriptor; reported on the used ring for split queues. */
uint16_t req_idx;
/* Packed ring only: number of descriptors in the request. */
uint16_t num_descs;
/* Packed ring only: buffer id reported back on completion. */
uint16_t buffer_id;
/* Packed ring only: inflight tracking head handed to the packed-ring enqueue. */
uint16_t inflight_head;
/* If set, the task is currently used for I/O processing. */
bool used;
};
/* A vhost block device: the generic vhost device plus the bdev it exposes. */
struct spdk_vhost_blk_dev {
/* Embedded generic device; to_blk_dev() recovers this struct from it via SPDK_CONTAINEROF. */
struct spdk_vhost_dev vdev;
/* Backing bdev; set to NULL once the bdev has been hot-removed. */
struct spdk_bdev *bdev;
/* Open descriptor used for all bdev I/O; closed and set to NULL on hot-remove. */
struct spdk_bdev_desc *bdev_desc;
/* Transport operations table (not referenced in this part of the file). */
const struct spdk_virtio_blk_transport_ops *ops;
/* dummy_io_channel is used to hold a bdev reference */
struct spdk_io_channel *dummy_io_channel;
/* When true, write requests are rejected with an I/O error. */
bool readonly;
};
/* Per-connection state of a vhost block device. */
struct spdk_vhost_blk_session {
/* The parent session must be the very first field in this struct */
/* (to_blk_session() casts a spdk_vhost_session pointer directly to this type). */
struct spdk_vhost_session vsession;
/* Device this session belongs to. */
struct spdk_vhost_blk_dev *bvdev;
/* Poller servicing the session's virtqueues; swapped for a "no bdev" poller on hot-remove. */
struct spdk_poller *requestq_poller;
/* Channel used to submit bdev I/O; released by the no-bdev worker once no tasks remain. */
struct spdk_io_channel *io_channel;
/* NOTE(review): presumably used while stopping the session - not referenced in this excerpt. */
struct spdk_poller *stop_poller;
};
/*
 * Forward declarations: the backend descriptor and the vhost-user completion
 * callback are referenced before their definitions.
 */
static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
static void vhost_user_blk_request_finish(uint8_t status, struct spdk_vhost_blk_task *task,
void *cb_arg);
/*
 * Hand a vhost-user task to the generic virtio-blk request handler using the
 * owning session's device and I/O channel. Completion is reported through
 * vhost_user_blk_request_finish() with no callback argument.
 *
 * Returns the result of virtio_blk_process_request().
 */
static int
vhost_user_process_blk_request(struct spdk_vhost_user_blk_task *user_task)
{
	struct spdk_vhost_blk_session *session = user_task->bvsession;

	return virtio_blk_process_request(&session->bvdev->vdev, session->io_channel,
					  &user_task->blk_task,
					  vhost_user_blk_request_finish, NULL);
}
/*
 * Convert a generic vhost device into the block device that embeds it.
 *
 * Returns NULL when vdev is NULL or when its backend is not a block backend
 * (the latter case is also logged as an error).
 */
static struct spdk_vhost_blk_dev *
to_blk_dev(struct spdk_vhost_dev *vdev)
{
	if (vdev != NULL) {
		if (vdev->backend->type == VHOST_BACKEND_BLK) {
			return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
		}

		SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
	}

	return NULL;
}
/*
 * Return the bdev backing a vhost block device.
 * The device is expected to be a block device; this is only checked by an assert.
 */
struct spdk_bdev *
vhost_blk_get_bdev(struct spdk_vhost_dev *vdev)
{
	struct spdk_vhost_blk_dev *bvdev;

	bvdev = to_blk_dev(vdev);
	assert(bvdev != NULL);

	return bvdev->bdev;
}
/*
 * Downcast a generic session to a block session. This is valid because the
 * generic session is the first member of the block session; the backend type
 * is only verified by an assert.
 */
static struct spdk_vhost_blk_session *
to_blk_session(struct spdk_vhost_session *vsession)
{
	struct spdk_vhost_blk_session *blk_session;

	assert(vsession->vdev->backend->type == VHOST_BACKEND_BLK);
	blk_session = (struct spdk_vhost_blk_session *)vsession;

	return blk_session;
}
static void
blk_task_finish(struct spdk_vhost_user_blk_task *task)
{
assert(task->bvsession->vsession.task_cnt > 0);
task->bvsession->vsession.task_cnt--;
task->used = false;
}
/*
 * Prepare a task slot for a new request: mark it busy and reset the fields of
 * the embedded generic task. iovcnt is set to the capacity of the iovs array
 * so that the descriptor walkers know how many entries they may fill.
 */
static void
blk_task_init(struct spdk_vhost_user_blk_task *task)
{
	struct spdk_vhost_blk_task *inner = &task->blk_task;

	inner->payload_size = 0;
	inner->used_len = 0;
	inner->status = NULL;
	inner->iovcnt = SPDK_COUNTOF(inner->iovs);

	task->used = true;
}
static void
blk_task_enqueue(struct spdk_vhost_user_blk_task *task)
{
if (task->vq->packed.packed_ring) {
vhost_vq_packed_ring_enqueue(&task->bvsession->vsession, task->vq,
task->num_descs,
task->buffer_id, task->blk_task.used_len,
task->inflight_head);
} else {
vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq,
task->req_idx, task->blk_task.used_len);
}
}
/*
 * Completion callback of the vhost-user transport: post the request on the
 * used ring, log the outcome and release the task slot.
 *
 * status is only used for the debug log here; cb_arg is unused.
 */
static void
vhost_user_blk_request_finish(uint8_t status, struct spdk_vhost_blk_task *task, void *cb_arg)
{
	struct spdk_vhost_user_blk_task *owner =
		SPDK_CONTAINEROF(task, struct spdk_vhost_user_blk_task, blk_task);

	blk_task_enqueue(owner);

	SPDK_DEBUGLOG(vhost_blk, "Finished task (%p) req_idx=%d\n status: %" PRIu8"\n",
		      owner, owner->req_idx, status);

	blk_task_finish(owner);
}
/*
 * Complete a generic virtio-blk task: store the status byte in the guest's
 * status buffer, if one has been located, and invoke the task's completion
 * callback.
 */
static void
blk_request_finish(uint8_t status, struct spdk_vhost_blk_task *task)
{
	if (task->status != NULL) {
		*task->status = status;
	}

	task->cb(status, task, task->cb_arg);
}
/*
 * Process a split-ring descriptor chain and set up the data related fields.
 *
 * \param bvsession session the virtqueue belongs to.
 * \param vq virtqueue holding the request.
 * \param req_idx index of the head descriptor of the request.
 * \param iovs array receiving the translated buffers.
 * \param iovs_cnt in: capacity of iovs; out: number of entries filled.
 * \param length out: sum of the lengths of all descriptors in the chain.
 *
 * \return 0 on success. -1 if the chain is malformed (invalid descriptor,
 * too many buffers, a cycle, fewer than two buffers or no writable
 * descriptor); *iovs_cnt and *length are not written in that case.
 *
 * FIXME: Make this function return to rd_cnt and wr_cnt
 */
static int
blk_iovs_split_queue_setup(struct spdk_vhost_blk_session *bvsession,
			   struct spdk_vhost_virtqueue *vq,
			   uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
{
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	struct spdk_vhost_dev *vdev = vsession->vdev;
	struct vring_desc *desc, *desc_table;
	uint16_t out_cnt = 0, cnt = 0;
	uint32_t desc_table_size, len = 0;
	uint32_t desc_handled_cnt;
	int rc;

	rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
	if (rc != 0) {
		SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
		return -1;
	}

	desc_handled_cnt = 0;

	while (1) {
		/*
		 * Maximum cnt reached?
		 * Should not happen if request is well formatted, otherwise this is a BUG.
		 */
		if (spdk_unlikely(cnt == *iovs_cnt)) {
			SPDK_DEBUGLOG(vhost_blk, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
				      vsession->name, req_idx);
			return -1;
		}

		if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
			/* Arguments follow the labels: iov position first, request index second. */
			SPDK_DEBUGLOG(vhost_blk, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
				      vsession->name, cnt, req_idx);
			return -1;
		}

		len += desc->len;

		/* Count device-writable descriptors; at least one is required below. */
		out_cnt += vhost_vring_desc_is_wr(desc);

		rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
		if (rc != 0) {
			SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
				    vsession->name, req_idx);
			return -1;
		} else if (desc == NULL) {
			/* End of chain. */
			break;
		}

		desc_handled_cnt++;
		if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
			/* Break a cycle and report an error, if any. */
			SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %"PRIu32
				    ", desc_handled_cnt = %"PRIu32".\n",
				    vsession->name, desc_table_size, desc_handled_cnt);
			return -1;
		}
	}

	/*
	 * There must be at least two descriptors.
	 * The first contains the request, so it must be readable.
	 * The last contains the buffer for the response, so it must be writable.
	 */
	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
		return -1;
	}

	*length = len;
	*iovs_cnt = cnt;
	return 0;
}
/*
 * Translate a packed-ring request into iovecs.
 *
 * When desc_table is NULL the descriptors are read from the virtqueue's
 * packed ring starting at req_idx; otherwise they are read from desc_table
 * starting at index 0.
 *
 * \param iovs_cnt in: capacity of iovs; out: number of entries filled.
 * \param length out: sum of the lengths of all descriptors walked.
 *
 * \return 0 on success, -EINVAL if the request is malformed (invalid
 * descriptor, too many buffers, fewer than two buffers or no writable
 * descriptor); *iovs_cnt and *length are not written in that case.
 */
static int
blk_iovs_packed_desc_setup(struct spdk_vhost_session *vsession,
			   struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
			   struct vring_packed_desc *desc_table, uint16_t desc_table_size,
			   struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
{
	struct vring_packed_desc *desc;
	uint16_t cnt = 0, out_cnt = 0;
	uint32_t len = 0;

	if (desc_table == NULL) {
		desc = &vq->vring.desc_packed[req_idx];
	} else {
		req_idx = 0;
		desc = desc_table;
	}

	while (1) {
		/*
		 * Maximum cnt reached?
		 * Should not happen if request is well formatted, otherwise this is a BUG.
		 */
		if (spdk_unlikely(cnt == *iovs_cnt)) {
			SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
				    vsession->name, req_idx);
			return -EINVAL;
		}

		if (spdk_unlikely(vhost_vring_packed_desc_to_iov(vsession, iovs, &cnt, desc))) {
			/* Arguments follow the labels: iov position first, request index second. */
			SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
				    vsession->name, cnt, req_idx);
			return -EINVAL;
		}

		len += desc->len;

		/* Count device-writable descriptors; at least one is required below. */
		out_cnt += vhost_vring_packed_desc_is_wr(desc);

		/* desc is NULL means we reach the last desc of this request */
		vhost_vring_packed_desc_get_next(&desc, &req_idx, vq, desc_table, desc_table_size);
		if (desc == NULL) {
			break;
		}
	}

	/*
	 * There must be at least two descriptors.
	 * The first contains the request, so it must be readable.
	 * The last contains the buffer for the response, so it must be writable.
	 */
	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
		return -EINVAL;
	}

	*length = len;
	*iovs_cnt = cnt;

	return 0;
}
/*
 * Look up the packed-ring descriptor(s) of request req_idx and translate them
 * into iovecs via blk_iovs_packed_desc_setup().
 *
 * Returns 0 on success; otherwise the error from the descriptor lookup or
 * from the translation step.
 */
static int
blk_iovs_packed_queue_setup(struct spdk_vhost_blk_session *bvsession,
			    struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
			    struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
{
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	struct vring_packed_desc *head = NULL;
	struct vring_packed_desc *table;
	uint32_t table_size;
	int rc;

	rc = vhost_vq_get_desc_packed(vsession, vq, req_idx, &head, &table, &table_size);
	if (spdk_unlikely(rc != 0)) {
		SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n",
			    vsession->vdev->name, req_idx);
		return rc;
	}

	return blk_iovs_packed_desc_setup(vsession, vq, req_idx, table, table_size,
					  iovs, iovs_cnt, length);
}
/*
 * Translate a request recorded in the packed inflight region into iovecs.
 *
 * If the lookup yields a separate descriptor table, the work is delegated to
 * blk_iovs_packed_desc_setup(). Otherwise the inflight descriptors are
 * followed through their 'next' links until one without VRING_DESC_F_NEXT.
 *
 * \param iovs_cnt in: capacity of iovs; out: number of entries filled.
 * \param length out: sum of the lengths of all descriptors walked.
 *
 * \return 0 on success; the lookup error, or -EINVAL if the request is
 * malformed (invalid descriptor, too many buffers, fewer than two buffers or
 * no writable descriptor).
 */
static int
blk_iovs_inflight_queue_setup(struct spdk_vhost_blk_session *bvsession,
			      struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
			      struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
{
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	struct spdk_vhost_dev *vdev = vsession->vdev;
	spdk_vhost_inflight_desc *inflight_desc;
	struct vring_packed_desc *desc_table;
	uint16_t out_cnt = 0, cnt = 0;
	uint32_t desc_table_size, len = 0;
	int rc = 0;

	rc = vhost_inflight_queue_get_desc(vsession, vq->vring_inflight.inflight_packed->desc,
					   req_idx, &inflight_desc, &desc_table, &desc_table_size);
	if (spdk_unlikely(rc != 0)) {
		SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
		return rc;
	}

	if (desc_table != NULL) {
		return blk_iovs_packed_desc_setup(vsession, vq, req_idx, desc_table, desc_table_size,
						  iovs, iovs_cnt, length);
	}

	while (1) {
		/*
		 * Maximum cnt reached?
		 * Should not happen if request is well formatted, otherwise this is a BUG.
		 */
		if (spdk_unlikely(cnt == *iovs_cnt)) {
			SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
				    vsession->name, req_idx);
			return -EINVAL;
		}

		if (spdk_unlikely(vhost_vring_inflight_desc_to_iov(vsession, iovs, &cnt, inflight_desc))) {
			/* Arguments follow the labels: iov position first, request index second. */
			SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
				    vsession->name, cnt, req_idx);
			return -EINVAL;
		}

		len += inflight_desc->len;

		/* Count device-writable descriptors; at least one is required below. */
		out_cnt += vhost_vring_inflight_desc_is_wr(inflight_desc);

		/* Without F_NEXT means it's the last desc */
		if ((inflight_desc->flags & VRING_DESC_F_NEXT) == 0) {
			break;
		}

		inflight_desc = &vq->vring_inflight.inflight_packed->desc[inflight_desc->next];
	}

	/*
	 * There must be at least two descriptors.
	 * The first contains the request, so it must be readable.
	 * The last contains the buffer for the response, so it must be writable.
	 */
	if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
		return -EINVAL;
	}

	*length = len;
	*iovs_cnt = cnt;

	return 0;
}
/*
 * bdev I/O completion callback: release the bdev_io and finish the task with
 * VIRTIO_BLK_S_OK or VIRTIO_BLK_S_IOERR depending on the outcome.
 */
static void
blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_vhost_blk_task *task = cb_arg;
	uint8_t status;

	spdk_bdev_free_io(bdev_io);

	if (success) {
		status = VIRTIO_BLK_S_OK;
	} else {
		status = VIRTIO_BLK_S_IOERR;
	}

	blk_request_finish(status, task);
}
/*
 * I/O-wait callback: retry a request that previously could not be submitted,
 * reusing the device, channel and completion callback stored in the task.
 */
static void
blk_request_resubmit(void *arg)
{
	struct spdk_vhost_blk_task *task = arg;
	int rc;

	rc = virtio_blk_process_request(task->bdev_io_wait_vdev, task->bdev_io_wait_ch, task,
					task->cb, task->cb_arg);
	if (rc != 0) {
		SPDK_DEBUGLOG(vhost_blk, "====== Task %p failed ======\n", task);
		return;
	}

	SPDK_DEBUGLOG(vhost_blk, "====== Task %p resubmitted ======\n", task);
}
/*
 * Park a task until the bdev layer can accept I/O again. The task remembers
 * the device and channel so that blk_request_resubmit() can retry it. If the
 * wait entry cannot be queued, the task is completed with VIRTIO_BLK_S_IOERR.
 */
static inline void
blk_request_queue_io(struct spdk_vhost_dev *vdev, struct spdk_io_channel *ch,
		     struct spdk_vhost_blk_task *task)
{
	struct spdk_bdev *bdev = vhost_blk_get_bdev(vdev);

	/* Context needed by the resubmission path. */
	task->bdev_io_wait_vdev = vdev;
	task->bdev_io_wait_ch = ch;

	task->bdev_io_wait.bdev = bdev;
	task->bdev_io_wait.cb_fn = blk_request_resubmit;
	task->bdev_io_wait.cb_arg = task;

	if (spdk_bdev_queue_io_wait(bdev, ch, &task->bdev_io_wait) == 0) {
		return;
	}

	blk_request_finish(VIRTIO_BLK_S_IOERR, task);
}
/*
 * Parse the virtio-blk request described by task->iovs and submit it to the
 * bdev layer.
 *
 * The first iovec must hold exactly one struct virtio_blk_outhdr and the last
 * iovec exactly one status byte; the iovecs in between form the payload.
 *
 * \param vdev vhost block device the request targets.
 * \param ch I/O channel used for bdev submission.
 * \param task request to process; iovs, iovcnt and payload_size must be set.
 * \param cb completion callback stored in the task.
 * \param cb_arg argument passed to cb.
 *
 * \return 0 if the request was submitted, queued for a later retry (-ENOMEM
 * from the bdev layer) or completed inline (GET_ID). -1 if it was rejected;
 * in that case it has already been completed with an error status through
 * blk_request_finish().
 */
int
virtio_blk_process_request(struct spdk_vhost_dev *vdev, struct spdk_io_channel *ch,
			   struct spdk_vhost_blk_task *task, virtio_blk_request_cb cb, void *cb_arg)
{
	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
	struct virtio_blk_outhdr req;
	struct virtio_blk_discard_write_zeroes *desc;
	struct iovec *iov;
	uint32_t type;
	uint64_t flush_bytes;
	uint64_t offset, nbytes;
	uint32_t payload_len;
	uint16_t iovcnt;
	int rc;

	task->cb = cb;
	task->cb_arg = cb_arg;

	iov = &task->iovs[0];
	if (spdk_unlikely(iov->iov_len != sizeof(req))) {
		SPDK_DEBUGLOG(vhost_blk,
			      "First descriptor size is %zu but expected %zu (task = %p).\n",
			      iov->iov_len, sizeof(req), task);
		blk_request_finish(VIRTIO_BLK_S_UNSUPP, task);
		return -1;
	}

	/* Some SeaBIOS versions don't align the virtio_blk_outhdr on an 8-byte boundary, which
	 * triggers ubsan errors. So copy this small 16-byte structure to the stack to workaround
	 * this problem.
	 */
	memcpy(&req, iov->iov_base, sizeof(req));

	iov = &task->iovs[task->iovcnt - 1];
	if (spdk_unlikely(iov->iov_len != 1)) {
		SPDK_DEBUGLOG(vhost_blk,
			      "Last descriptor size is %zu but expected %d (task = %p).\n",
			      iov->iov_len, 1, task);
		blk_request_finish(VIRTIO_BLK_S_UNSUPP, task);
		return -1;
	}

	/* The payload is everything between the request header and the status byte. */
	payload_len = task->payload_size;
	task->status = iov->iov_base;
	payload_len -= sizeof(req) + sizeof(*task->status);
	iovcnt = task->iovcnt - 2;

	type = req.type;
#ifdef VIRTIO_BLK_T_BARRIER
	/* Don't care about barrier for now (as QEMU's virtio-blk do). */
	type &= ~VIRTIO_BLK_T_BARRIER;
#endif

	switch (type) {
	case VIRTIO_BLK_T_IN:
	case VIRTIO_BLK_T_OUT:
		if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
			SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (task = %p).\n",
				    type ? "WRITE" : "READ", task);
			blk_request_finish(VIRTIO_BLK_S_UNSUPP, task);
			return -1;
		}

		if (type == VIRTIO_BLK_T_IN) {
			/* The device writes the data and the status byte. */
			task->used_len = payload_len + sizeof(*task->status);
			rc = spdk_bdev_readv(bvdev->bdev_desc, ch,
					     &task->iovs[1], iovcnt, req.sector * 512,
					     payload_len, blk_request_complete_cb, task);
		} else if (!bvdev->readonly) {
			/* The device writes only the status byte. */
			task->used_len = sizeof(*task->status);
			rc = spdk_bdev_writev(bvdev->bdev_desc, ch,
					      &task->iovs[1], iovcnt, req.sector * 512,
					      payload_len, blk_request_complete_cb, task);
		} else {
			SPDK_DEBUGLOG(vhost_blk, "Device is in read-only mode!\n");
			rc = -1;
		}

		if (rc) {
			if (rc == -ENOMEM) {
				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
				blk_request_queue_io(vdev, ch, task);
			} else {
				blk_request_finish(VIRTIO_BLK_S_IOERR, task);
				return -1;
			}
		}
		break;
	case VIRTIO_BLK_T_DISCARD:
		desc = task->iovs[1].iov_base;
		if (payload_len != sizeof(*desc)) {
			SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
			blk_request_finish(VIRTIO_BLK_S_IOERR, task);
			return -1;
		}

		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
			SPDK_ERRLOG("UNMAP flag is only used for WRITE ZEROES command\n");
			blk_request_finish(VIRTIO_BLK_S_UNSUPP, task);
			return -1;
		}

		/* num_sectors is a 32-bit field in <linux/virtio_blk.h>: widen it before
		 * scaling so that ranges of 4 GiB and more do not wrap around.
		 */
		offset = (uint64_t)desc->sector * 512;
		nbytes = (uint64_t)desc->num_sectors * 512;

		rc = spdk_bdev_unmap(bvdev->bdev_desc, ch,
				     offset, nbytes,
				     blk_request_complete_cb, task);
		if (rc) {
			if (rc == -ENOMEM) {
				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
				blk_request_queue_io(vdev, ch, task);
			} else {
				blk_request_finish(VIRTIO_BLK_S_IOERR, task);
				return -1;
			}
		}
		break;
	case VIRTIO_BLK_T_WRITE_ZEROES:
		desc = task->iovs[1].iov_base;
		if (payload_len != sizeof(*desc)) {
			SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
			blk_request_finish(VIRTIO_BLK_S_IOERR, task);
			return -1;
		}

		/* Same widening as for DISCARD: avoid a 32-bit wrap of the byte count. */
		offset = (uint64_t)desc->sector * 512;
		nbytes = (uint64_t)desc->num_sectors * 512;

		/* Unmap this range, SPDK doesn't support it, kernel will enable this flag by default
		 * without checking unmap feature is negotiated or not, the flag isn't mandatory, so
		 * just print a warning.
		 */
		if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
			SPDK_WARNLOG("Ignore the unmap flag for WRITE ZEROES from %"PRIx64", len %"PRIx64"\n",
				     offset, nbytes);
		}

		rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, ch,
					    offset, nbytes,
					    blk_request_complete_cb, task);
		if (rc) {
			if (rc == -ENOMEM) {
				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
				blk_request_queue_io(vdev, ch, task);
			} else {
				blk_request_finish(VIRTIO_BLK_S_IOERR, task);
				return -1;
			}
		}
		break;
	case VIRTIO_BLK_T_FLUSH:
		/* Flush always covers the whole bdev. */
		flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
		if (req.sector != 0) {
			SPDK_NOTICELOG("sector must be zero for flush command\n");
			blk_request_finish(VIRTIO_BLK_S_IOERR, task);
			return -1;
		}
		rc = spdk_bdev_flush(bvdev->bdev_desc, ch,
				     0, flush_bytes,
				     blk_request_complete_cb, task);
		if (rc) {
			if (rc == -ENOMEM) {
				SPDK_DEBUGLOG(vhost_blk, "No memory, start to queue io.\n");
				blk_request_queue_io(vdev, ch, task);
			} else {
				blk_request_finish(VIRTIO_BLK_S_IOERR, task);
				return -1;
			}
		}
		break;
	case VIRTIO_BLK_T_GET_ID:
		if (!iovcnt || !payload_len) {
			blk_request_finish(VIRTIO_BLK_S_UNSUPP, task);
			return -1;
		}
		/* Report the bdev name, space padded, in the first payload buffer. */
		task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
		spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_name(bvdev->bdev),
				task->used_len, ' ');
		blk_request_finish(VIRTIO_BLK_S_OK, task);
		break;
	default:
		SPDK_DEBUGLOG(vhost_blk, "Not supported request type '%"PRIu32"'.\n", type);
		blk_request_finish(VIRTIO_BLK_S_UNSUPP, task);
		return -1;
	}

	return 0;
}
/*
 * Process one split-ring request: claim the task slot indexed by req_idx,
 * translate the descriptor chain into iovecs and submit the request.
 *
 * If the slot is already in use, the request is posted on the used ring with
 * a used length of 0 and is not processed.
 */
static void
process_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx)
{
	struct spdk_vhost_user_blk_task *user_task;
	struct spdk_vhost_blk_task *inner;

	assert(vq->packed.packed_ring == false);

	user_task = &((struct spdk_vhost_user_blk_task *)vq->tasks)[req_idx];
	inner = &user_task->blk_task;

	if (spdk_unlikely(user_task->used)) {
		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
			    user_task->bvsession->vsession.name, req_idx);
		inner->used_len = 0;
		blk_task_enqueue(user_task);
		return;
	}

	user_task->bvsession->vsession.task_cnt++;
	blk_task_init(user_task);

	if (blk_iovs_split_queue_setup(user_task->bvsession, vq, user_task->req_idx,
				       inner->iovs, &inner->iovcnt, &inner->payload_size) != 0) {
		SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", user_task->req_idx);
		/* Malformed chain: return the request to the guest without submitting it. */
		vhost_user_blk_request_finish(VIRTIO_BLK_S_UNSUPP, inner, NULL);
		return;
	}

	if (vhost_user_process_blk_request(user_task) != 0) {
		SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", user_task, req_idx);
		return;
	}

	SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", user_task,
		      req_idx);
}
static void
process_packed_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx)
{
struct spdk_vhost_user_blk_task *task;
struct spdk_vhost_blk_task *blk_task;
uint16_t task_idx = req_idx, num_descs;
int rc;
assert(vq->packed.packed_ring);
/* Packed ring used the buffer_id as the task_idx to get task struct.
* In kernel driver, it uses the vq->free_head to set the buffer_id so the value
* must be in the range of 0 ~ vring.size. The free_head value must be unique
* in the outstanding requests.
* We can't use the req_idx as the task_idx because the desc can be reused in
* the next phase even when it's not completed in the previous phase. For example,
* At phase 0, last_used_idx was 2 and desc0 was not completed.Then after moving
* phase 1, last_avail_idx is updated to 1. In this case, req_idx can not be used
* as task_idx because we will know task[0]->used is true at phase 1.
* The split queue is quite different, the desc would insert into the free list when
* device completes the request, the driver gets the desc from the free list which
* ensures the req_idx is unique in the outstanding requests.
*/
task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
task = &((struct spdk_vhost_user_blk_task *)vq->tasks)[task_idx];
blk_task = &task->blk_task;
if (spdk_unlikely(task->used)) {
SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
task->bvsession->vsession.name, task_idx);
blk_task->used_len = 0;
blk_task_enqueue(task);
return;
}
task->req_idx = req_idx;
task->num_descs = num_descs;
task->buffer_id = task_idx;
rte_vhost_set_inflight_desc_packed(task->bvsession->vsession.vid, vq->vring_idx,
req_idx, (req_idx + num_descs - 1) % vq->vring.size,
&task->inflight_head);
task->bvsession->vsession.task_cnt++;
blk_task_init(task);
rc = blk_iovs_packed_queue_setup(task->bvsession, vq, task->req_idx, blk_task->iovs,
&blk_task->iovcnt,
&blk_task->payload_size);
if (rc) {
SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
/* Only READ and WRITE are supported for now. */
vhost_user_blk_request_finish(VIRTIO_BLK_S_UNSUPP, blk_task, NULL);
return;
}
if (vhost_user_process_blk_request(task) == 0) {
SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", task,
task_idx);
} else {
SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx);
}
}
/*
 * Resubmit one packed-ring request recorded in the inflight region.
 *
 * The buffer id (used as the task index) is read from the last inflight
 * descriptor of the request, and the descriptor count from its head. The
 * available index is advanced past the request, toggling the avail phase on
 * wrap-around.
 */
static void
process_packed_inflight_blk_task(struct spdk_vhost_virtqueue *vq,
				 uint16_t req_idx)
{
	spdk_vhost_inflight_desc *inflight_descs = vq->vring_inflight.inflight_packed->desc;
	spdk_vhost_inflight_desc *head = &inflight_descs[req_idx];
	struct spdk_vhost_user_blk_task *user_task;
	struct spdk_vhost_blk_task *inner;
	uint16_t task_idx = inflight_descs[head->last].id;
	uint16_t num_descs = head->num;

	/* In packed ring reconnection, we use the last_used_idx as the
	 * initial value. So when we process the inflight descs we still
	 * need to update the available ring index.
	 */
	vq->last_avail_idx += num_descs;
	if (vq->last_avail_idx >= vq->vring.size) {
		vq->last_avail_idx -= vq->vring.size;
		vq->packed.avail_phase = !vq->packed.avail_phase;
	}

	user_task = &((struct spdk_vhost_user_blk_task *)vq->tasks)[task_idx];
	inner = &user_task->blk_task;

	if (spdk_unlikely(user_task->used)) {
		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
			    user_task->bvsession->vsession.name, task_idx);
		inner->used_len = 0;
		blk_task_enqueue(user_task);
		return;
	}

	user_task->req_idx = req_idx;
	user_task->num_descs = num_descs;
	user_task->buffer_id = task_idx;
	/* It's for cleaning inflight entries */
	user_task->inflight_head = req_idx;

	user_task->bvsession->vsession.task_cnt++;
	blk_task_init(user_task);

	if (blk_iovs_inflight_queue_setup(user_task->bvsession, vq, user_task->req_idx,
					  inner->iovs, &inner->iovcnt,
					  &inner->payload_size) != 0) {
		SPDK_DEBUGLOG(vhost_blk, "Invalid request (req_idx = %"PRIu16").\n", user_task->req_idx);
		/* Malformed request: return it to the guest without submitting it. */
		vhost_user_blk_request_finish(VIRTIO_BLK_S_UNSUPP, inner, NULL);
		return;
	}

	if (vhost_user_process_blk_request(user_task) != 0) {
		SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", user_task, task_idx);
		return;
	}

	SPDK_DEBUGLOG(vhost_blk, "====== Task %p req_idx %d submitted ======\n", user_task,
		      task_idx);
}
/*
 * Resubmit the requests listed in the virtqueue's inflight resubmit list.
 * The list is walked from its last entry to its first and is emptied
 * afterwards.
 *
 * Returns the number of entries that were in the list (0 if there was none).
 */
static int
submit_inflight_desc(struct spdk_vhost_blk_session *bvsession,
		     struct spdk_vhost_virtqueue *vq)
{
	struct spdk_vhost_session *vsession;
	spdk_vhost_resubmit_info *info = vq->vring_inflight.resubmit_inflight;
	spdk_vhost_resubmit_desc *entries;
	uint16_t req_idx;
	int i, resubmit_cnt;

	if (spdk_likely(info == NULL || info->resubmit_list == NULL ||
			info->resubmit_num == 0)) {
		return 0;
	}

	entries = info->resubmit_list;
	vsession = &bvsession->vsession;

	for (i = info->resubmit_num - 1; i >= 0; --i) {
		req_idx = entries[i].index;
		SPDK_DEBUGLOG(vhost_blk, "====== Start processing resubmit request idx %"PRIu16"======\n",
			      req_idx);

		if (spdk_likely(req_idx < vq->vring.size)) {
			if (vq->packed.packed_ring) {
				process_packed_inflight_blk_task(vq, req_idx);
			} else {
				process_blk_task(vq, req_idx);
			}
			continue;
		}

		/* Out-of-range index: report it as used with zero length. */
		SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
			    vsession->name, req_idx, vq->vring.size);
		vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
	}

	resubmit_cnt = info->resubmit_num;
	info->resubmit_num = 0;

	return resubmit_cnt;
}
/*
 * Service a split virtqueue: resubmit inflight requests first, then fetch up
 * to SPDK_VHOST_VQ_MAX_SUBMISSIONS new requests from the avail ring and
 * process each of them.
 *
 * Returns the number of new requests fetched, or the resubmit count when no
 * new request was available.
 */
static int
process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
{
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	uint16_t reqs[SPDK_VHOST_VQ_MAX_SUBMISSIONS];
	uint16_t fetched, n, idx;
	int resubmitted;

	resubmitted = submit_inflight_desc(bvsession, vq);

	fetched = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
	if (fetched == 0) {
		return resubmitted;
	}

	for (n = 0; n < fetched; n++) {
		idx = reqs[n];

		SPDK_DEBUGLOG(vhost_blk, "====== Starting processing request idx %"PRIu16"======\n",
			      idx);

		if (spdk_unlikely(idx >= vq->vring.size)) {
			SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
				    vsession->name, idx, vq->vring.size);
			vhost_vq_used_ring_enqueue(vsession, vq, idx, 0);
			continue;
		}

		/* Record the request as inflight before processing it. */
		rte_vhost_set_inflight_desc_split(vsession->vid, vq->vring_idx, idx);

		process_blk_task(vq, idx);
	}

	return fetched;
}
/*
 * Service a packed virtqueue: resubmit inflight requests first, then process
 * available requests, at most SPDK_VHOST_VQ_MAX_SUBMISSIONS per call.
 *
 * Returns the number of new requests processed, or the resubmit count when
 * no new request was available.
 */
static int
process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
{
	uint16_t budget;
	uint16_t processed = 0;
	int resubmitted;

	resubmitted = submit_inflight_desc(bvsession, vq);

	for (budget = 0;
	     budget < SPDK_VHOST_VQ_MAX_SUBMISSIONS && vhost_vq_packed_ring_is_avail(vq);
	     budget++) {
		SPDK_DEBUGLOG(vhost_blk, "====== Starting processing request idx %"PRIu16"======\n",
			      vq->last_avail_idx);
		processed++;
		process_packed_blk_task(vq, vq->last_avail_idx);
	}

	if (processed > 0) {
		return processed;
	}

	return resubmitted;
}
/*
 * Service one virtqueue with the routine matching its ring layout, then
 * signal the used ring.
 *
 * Returns the count reported by the ring-specific routine.
 */
static int
_vdev_vq_worker(struct spdk_vhost_virtqueue *vq)
{
	struct spdk_vhost_blk_session *bvsession = to_blk_session(vq->vsession);
	int handled;

	handled = vq->packed.packed_ring ? process_packed_vq(bvsession, vq) :
		  process_vq(bvsession, vq);

	vhost_session_vq_used_signal(vq);

	return handled;
}
/* Callback wrapper with a void * argument: services the virtqueue passed as arg. */
static int
vdev_vq_worker(void *arg)
{
	return _vdev_vq_worker((struct spdk_vhost_virtqueue *)arg);
}
/*
 * Poller servicing every virtqueue of a session.
 * Reports SPDK_POLLER_BUSY if any queue did work, SPDK_POLLER_IDLE otherwise.
 */
static int
vdev_worker(void *arg)
{
	struct spdk_vhost_blk_session *bvsession = arg;
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	int total = 0;
	uint16_t q;

	for (q = 0; q < vsession->max_queues; q++) {
		total += _vdev_vq_worker(&vsession->virtqueue[q]);
	}

	if (total > 0) {
		return SPDK_POLLER_BUSY;
	}

	return SPDK_POLLER_IDLE;
}
/*
 * Split-ring handler used once the bdev is gone: take at most one request
 * from the avail ring, set its status byte to VIRTIO_BLK_S_IOERR when the
 * descriptor chain is valid, and return it with a used length of 0.
 */
static void
no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
{
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	struct iovec iovs[SPDK_VHOST_IOVS_MAX];
	uint16_t iovcnt = SPDK_COUNTOF(iovs);
	uint16_t req_idx;
	uint32_t length;
	int rc;

	if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
		return;
	}

	rc = blk_iovs_split_queue_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length);
	if (rc == 0) {
		/* The last buffer of a valid request is the status byte. */
		*(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
		SPDK_DEBUGLOG(vhost_blk_data, "Aborting request %" PRIu16"\n", req_idx);
	}

	vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
}
/*
 * Packed-ring handler used once the bdev is gone: take at most one available
 * request, set its status byte to VIRTIO_BLK_S_IOERR when the descriptors are
 * valid, and return it to the guest.
 *
 * The status byte is written only when blk_iovs_packed_queue_setup()
 * succeeds (returns 0). On failure iovcnt still holds the capacity of the
 * iovs array and the last entry does not describe the request, so it must
 * not be dereferenced.
 *
 * NOTE(review): task->inflight_head is not set in this function, so the value
 * passed to the enqueue is whatever the slot held before - confirm that this
 * is intended.
 */
static void
no_bdev_process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
{
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	struct spdk_vhost_user_blk_task *task;
	struct spdk_vhost_blk_task *blk_task;
	uint32_t length;
	uint16_t req_idx = vq->last_avail_idx;
	uint16_t task_idx, num_descs;

	if (!vhost_vq_packed_ring_is_avail(vq)) {
		return;
	}

	task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
	task = &((struct spdk_vhost_user_blk_task *)vq->tasks)[task_idx];
	blk_task = &task->blk_task;
	if (spdk_unlikely(task->used)) {
		SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
			    vsession->name, req_idx);
		vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
					     task->buffer_id, blk_task->used_len,
					     task->inflight_head);
		return;
	}

	task->req_idx = req_idx;
	task->num_descs = num_descs;
	task->buffer_id = task_idx;
	blk_task_init(task);

	if (blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, blk_task->iovs, &blk_task->iovcnt,
					&length) == 0) {
		/* The last buffer of a valid request is the status byte. */
		*(volatile uint8_t *)(blk_task->iovs[blk_task->iovcnt - 1].iov_base) = VIRTIO_BLK_S_IOERR;
		SPDK_DEBUGLOG(vhost_blk_data, "Aborting request %" PRIu16"\n", req_idx);
	}

	/* The slot was only borrowed for parsing; release it before completing. */
	task->used = false;
	vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
				     task->buffer_id, blk_task->used_len,
				     task->inflight_head);
}
/*
 * Service one virtqueue after the bdev has been removed: fail one pending
 * request, signal the used ring and, once no task remains in flight, release
 * the session's I/O channel.
 *
 * Always returns SPDK_POLLER_BUSY.
 */
static int
_no_bdev_vdev_vq_worker(struct spdk_vhost_virtqueue *vq)
{
	struct spdk_vhost_session *vsession = vq->vsession;
	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);

	if (!vq->packed.packed_ring) {
		no_bdev_process_vq(bvsession, vq);
	} else {
		no_bdev_process_packed_vq(bvsession, vq);
	}

	vhost_session_vq_used_signal(vq);

	if (vsession->task_cnt == 0 && bvsession->io_channel != NULL) {
		vhost_blk_put_io_channel(bvsession->io_channel);
		bvsession->io_channel = NULL;
	}

	return SPDK_POLLER_BUSY;
}
/* Callback wrapper with a void * argument: runs the no-bdev handler on the virtqueue passed as arg. */
static int
no_bdev_vdev_vq_worker(void *arg)
{
	return _no_bdev_vdev_vq_worker((struct spdk_vhost_virtqueue *)arg);
}
/*
 * Poller used after the bdev has been removed: runs the no-bdev handler on
 * every virtqueue of the session. Always returns SPDK_POLLER_BUSY.
 */
static int
no_bdev_vdev_worker(void *arg)
{
	struct spdk_vhost_blk_session *bvsession = arg;
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	uint16_t q = 0;

	while (q < vsession->max_queues) {
		_no_bdev_vdev_vq_worker(&vsession->virtqueue[q]);
		q++;
	}

	return SPDK_POLLER_BUSY;
}
/*
 * Unregister the kick interrupts of a session's virtqueues. Stops at the
 * first virtqueue that has no interrupt registered.
 */
static void
vhost_blk_session_unregister_interrupts(struct spdk_vhost_blk_session *bvsession)
{
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	struct spdk_vhost_virtqueue *vq;
	int i;

	SPDK_DEBUGLOG(vhost_blk, "unregister virtqueues interrupt\n");

	for (i = 0; i < vsession->max_queues && vsession->virtqueue[i].intr != NULL; i++) {
		vq = &vsession->virtqueue[i];

		SPDK_DEBUGLOG(vhost_blk, "unregister vq[%d]'s kickfd is %d\n",
			      i, vq->vring.kickfd);
		spdk_interrupt_unregister(&vq->intr);
	}
}
/*
 * Register fn as the interrupt handler on the kick fd of every virtqueue of
 * the session, with the virtqueue itself as the handler argument.
 *
 * Returns 0 on success. On the first failure the interrupts registered so far
 * are unregistered again and -1 is returned.
 */
static int
vhost_blk_session_register_interrupts(struct spdk_vhost_blk_session *bvsession,
				      spdk_interrupt_fn fn, const char *name)
{
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	struct spdk_vhost_virtqueue *vq = NULL;
	int i;

	SPDK_DEBUGLOG(vhost_blk, "Register virtqueues interrupt\n");

	for (i = 0; i < vsession->max_queues; i++) {
		vq = &vsession->virtqueue[i];
		SPDK_DEBUGLOG(vhost_blk, "Register vq[%d]'s kickfd is %d\n",
			      i, vq->vring.kickfd);

		vq->intr = spdk_interrupt_register(vq->vring.kickfd, fn, vq, name);
		if (vq->intr != NULL) {
			continue;
		}

		SPDK_ERRLOG("Fail to register req notifier handler.\n");
		vhost_blk_session_unregister_interrupts(bvsession);
		return -1;
	}

	return 0;
}
/*
 * Poller interrupt-mode callback: forwards the requested mode to the
 * session passed as cb_arg. The poller argument is unused.
 */
static void
vhost_blk_poller_set_interrupt_mode(struct spdk_poller *poller, void *cb_arg, bool interrupt_mode)
{
	struct spdk_vhost_blk_session *blk_session = cb_arg;
	struct spdk_vhost_session *vsession = &blk_session->vsession;

	vhost_user_session_set_interrupt_mode(vsession, interrupt_mode);
}
/*
 * Completion callback for a bdev event. The event type is carried in ctx,
 * encoded as an integer. For a REMOVE event the device's dummy I/O channel is
 * released and the bdev descriptor is closed; other events need no further
 * action.
 */
static void
bdev_event_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
{
	enum spdk_bdev_event_type type = (enum spdk_bdev_event_type)(uintptr_t)ctx;
	struct spdk_vhost_blk_dev *bvdev;

	if (type != SPDK_BDEV_EVENT_REMOVE) {
		return;
	}

	/* All sessions have been notified, time to close the bdev */
	bvdev = to_blk_dev(vdev);
	assert(bvdev != NULL);

	spdk_put_io_channel(bvdev->dummy_io_channel);
	spdk_bdev_close(bvdev->bdev_desc);

	bvdev->bdev = NULL;
	bvdev->bdev_desc = NULL;
}
/*
 * Per-session callback for a bdev resize. With DPDK 20.02 or newer it sends
 * a config-change message to the session's vhost connection; with older DPDK
 * versions it only logs that resize is unsupported.
 *
 * vdev and ctx are unused. Always returns 0.
 */
static int
vhost_session_bdev_resize_cb(struct spdk_vhost_dev *vdev,
			     struct spdk_vhost_session *vsession,
			     void *ctx)
{
#if RTE_VERSION >= RTE_VERSION_NUM(20, 02, 0, 0)
	SPDK_NOTICELOG("bdev send slave msg to vid(%d)\n", vsession->vid);
	rte_vhost_slave_config_change(vsession->vid, false);
#else
	SPDK_NOTICELOG("bdev does not support resize until DPDK submodule version >= 20.02\n");
#endif

	return 0;
}
/*
 * Handle a bdev resize event for a vhost-user blk device.
 *
 * Passes vhost_session_bdev_resize_cb to vhost_user_dev_foreach_session()
 * together with the completion callback cb and its argument cb_arg. The
 * vhost lock is held for the duration of that call.
 */
static void
vhost_user_blk_resize_cb(struct spdk_vhost_dev *vdev, bdev_event_cb_complete cb, void *cb_arg)
{
spdk_vhost_lock();
vhost_user_dev_foreach_session(vdev, vhost_session_bdev_resize_cb,
cb, cb_arg);
spdk_vhost_unlock();
}
/*
 * Per-session handler for bdev hot-remove.
 *
 * If the session has a request queue poller, it is replaced by the "no bdev"
 * variant. If interrupts are registered on the virtqueues, they are
 * re-registered with the "no bdev" handler as well.
 *
 * Returns 0 on success or the error from interrupt registration.
 */
static int
vhost_user_session_bdev_remove_cb(struct spdk_vhost_dev *vdev,
				  struct spdk_vhost_session *vsession,
				  void *ctx)
{
	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
	int rc;

	if (bvsession->requestq_poller == NULL) {
		/* No poller to switch over. */
		return 0;
	}

	spdk_poller_unregister(&bvsession->requestq_poller);

	if (vsession->virtqueue[0].intr != NULL) {
		/* Swap the interrupt handlers for the no-bdev variant. */
		vhost_blk_session_unregister_interrupts(bvsession);
		rc = vhost_blk_session_register_interrupts(bvsession, no_bdev_vdev_vq_worker,
				"no_bdev_vdev_vq_worker");
		if (rc != 0) {
			SPDK_ERRLOG("%s: Interrupt register failed\n", vsession->name);
			return rc;
		}
	}

	bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0);
	spdk_poller_register_interrupt(bvsession->requestq_poller,
				       vhost_blk_poller_set_interrupt_mode, bvsession);

	return 0;
}
/*
 * Handle a bdev hot-remove event for a vhost-user blk device.
 *
 * Logs a warning, then passes vhost_user_session_bdev_remove_cb to
 * vhost_user_dev_foreach_session() together with the completion callback cb
 * and its argument cb_arg. The vhost lock is held for the duration of that
 * call.
 */
static void
vhost_user_bdev_remove_cb(struct spdk_vhost_dev *vdev, bdev_event_cb_complete cb, void *cb_arg)
{
SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n",
vdev->name);
spdk_vhost_lock();
vhost_user_dev_foreach_session(vdev, vhost_user_session_bdev_remove_cb,
cb, cb_arg);
spdk_vhost_unlock();
}
/*
 * bdev_event op of the vhost-user blk transport: dispatch a bdev event to the
 * matching handler. Only REMOVE and RESIZE are expected; any other type trips
 * an assertion.
 */
static void
vhost_user_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_vhost_dev *vdev,
			 bdev_event_cb_complete cb, void *cb_arg)
{
	if (type == SPDK_BDEV_EVENT_REMOVE) {
		vhost_user_bdev_remove_cb(vdev, cb, cb_arg);
	} else if (type == SPDK_BDEV_EVENT_RESIZE) {
		vhost_user_blk_resize_cb(vdev, cb, cb_arg);
	} else {
		assert(false);
	}
}
/*
 * Event callback given to spdk_bdev_open_ext(). event_ctx is the vhost device.
 *
 * REMOVE and RESIZE events are handed to the transport's bdev_event op, with
 * bdev_event_cpl_cb as completion callback and the event type packed into the
 * callback argument. Every other event type is only logged.
 */
static void
bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
	      void *event_ctx)
{
	struct spdk_vhost_dev *vdev = event_ctx;
	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);

	SPDK_DEBUGLOG(vhost_blk, "Bdev event: type %d, name %s\n",
		      type,
		      bdev->name);

	if (type != SPDK_BDEV_EVENT_REMOVE && type != SPDK_BDEV_EVENT_RESIZE) {
		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
		return;
	}

	bvdev->ops->bdev_event(type, vdev, bdev_event_cpl_cb, (void *)type);
}
/*
 * Release the task array of every virtqueue of the session that has one and
 * reset the pointer to NULL.
 */
static void
free_task_pool(struct spdk_vhost_blk_session *bvsession)
{
	struct spdk_vhost_session *vsession = &bvsession->vsession;
	struct spdk_vhost_virtqueue *vq;
	uint16_t qid;

	for (qid = 0; qid < vsession->max_queues; qid++) {
		vq = &vsession->virtqueue[qid];

		if (vq->tasks != NULL) {
			spdk_free(vq->tasks);
			vq->tasks = NULL;
		}
	}
}
/*
 * Allocate and initialize the task array for virtqueue qid of the session.
 *
 * One task is created per vring entry; each task records its owning blk
 * session, its index within the array and its virtqueue.
 *
 * Returns 0 on success, or when the queue has no descriptor ring and nothing
 * needs to be allocated; -EINVAL when qid is out of range; -1 when the ring
 * is larger than SPDK_VHOST_MAX_VQ_SIZE or the allocation fails.
 */
static int
alloc_vq_task_pool(struct spdk_vhost_session *vsession, uint16_t qid)
{
	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
	struct spdk_vhost_virtqueue *vq;
	struct spdk_vhost_user_blk_task *pool;
	uint32_t nb_tasks;
	uint32_t idx;

	if (qid >= SPDK_VHOST_MAX_VQUEUES) {
		return -EINVAL;
	}

	vq = &vsession->virtqueue[qid];
	if (vq->vring.desc == NULL) {
		return 0;
	}

	nb_tasks = vq->vring.size;
	if (nb_tasks > SPDK_VHOST_MAX_VQ_SIZE) {
		/* sanity check */
		SPDK_ERRLOG("%s: virtqueue %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
			    vsession->name, qid, nb_tasks, SPDK_VHOST_MAX_VQ_SIZE);
		return -1;
	}

	pool = spdk_zmalloc(sizeof(*pool) * nb_tasks,
			    SPDK_CACHE_LINE_SIZE, NULL,
			    SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
	vq->tasks = pool;
	if (pool == NULL) {
		SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
			    vsession->name, nb_tasks, qid);
		return -1;
	}

	for (idx = 0; idx < nb_tasks; idx++) {
		pool[idx].bvsession = bvsession;
		pool[idx].req_idx = idx;
		pool[idx].vq = vq;
	}

	return 0;
}
/*
 * start_session callback of the vhost-user blk backend.
 *
 * Verifies that every virtqueue up to max_queues has a descriptor ring, binds
 * the session to its blk device, takes an I/O channel when a bdev is attached,
 * registers virtqueue interrupts when interrupt mode is enabled, and finally
 * registers the request queue poller (the "no bdev" variant when no bdev is
 * attached).
 *
 * Returns 0 on success, -EINPROGRESS if a request queue poller already exists,
 * -1 on an empty queue or I/O channel failure, or the interrupt registration
 * error. The I/O channel taken here is released again on a later failure.
 */
static int
vhost_blk_start(struct spdk_vhost_dev *vdev,
		struct spdk_vhost_session *vsession, void *unused)
{
	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
	struct spdk_vhost_blk_dev *bvdev;
	int i, rc = 0;

	/* return if start is already in progress */
	if (bvsession->requestq_poller) {
		SPDK_INFOLOG(vhost, "%s: start in progress\n", vsession->name);
		return -EINPROGRESS;
	}

	/* validate all I/O queues are in a contiguous index range */
	for (i = 0; i < vsession->max_queues; i++) {
		/* vring.desc and vring.desc_packed are in a union struct
		 * so q->vring.desc can replace q->vring.desc_packed.
		 */
		if (vsession->virtqueue[i].vring.desc == NULL) {
			/* Cast so the argument type matches the PRIu32 conversion. */
			SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, (uint32_t)i);
			return -1;
		}
	}

	bvdev = to_blk_dev(vdev);
	assert(bvdev != NULL);
	bvsession->bvdev = bvdev;

	if (bvdev->bdev) {
		bvsession->io_channel = vhost_blk_get_io_channel(vdev);
		if (!bvsession->io_channel) {
			free_task_pool(bvsession);
			SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name);
			return -1;
		}
	}

	if (spdk_interrupt_mode_is_enabled()) {
		if (bvdev->bdev) {
			rc = vhost_blk_session_register_interrupts(bvsession,
					vdev_vq_worker,
					"vdev_vq_worker");
		} else {
			rc = vhost_blk_session_register_interrupts(bvsession,
					no_bdev_vdev_vq_worker,
					"no_bdev_vdev_vq_worker");
		}

		if (rc) {
			SPDK_ERRLOG("%s: Interrupt register failed\n", vsession->name);
			/* Drop the channel reference taken above; otherwise it is
			 * leaked, because the session never reaches the stop path
			 * that would release it.
			 */
			if (bvdev->bdev) {
				vhost_blk_put_io_channel(bvsession->io_channel);
				bvsession->io_channel = NULL;
			}
			return rc;
		}
	}

	if (bvdev->bdev) {
		bvsession->requestq_poller = SPDK_POLLER_REGISTER(vdev_worker, bvsession, 0);
	} else {
		bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0);
	}
	SPDK_INFOLOG(vhost, "%s: started poller on lcore %d\n",
		     vsession->name, spdk_env_get_current_core());

	spdk_poller_register_interrupt(bvsession->requestq_poller, vhost_blk_poller_set_interrupt_mode,
				       bvsession);

	return 0;
}
static int
destroy_session_poller_cb(void *arg)
{
struct spdk_vhost_blk_session *bvsession = arg;
struct spdk_vhost_session *vsession = &bvsession->vsession;
int i;
if (vsession->task_cnt > 0 || spdk_vhost_trylock() != 0) {
assert(vsession->stop_retry_count > 0);
vsession->stop_retry_count--;
if (vsession->stop_retry_count == 0) {
SPDK_ERRLOG("%s: Timedout when destroy session (task_cnt %d)\n", vsession->name,
vsession->task_cnt);
spdk_poller_unregister(&bvsession->stop_poller);
vhost_user_session_stop_done(vsession, -ETIMEDOUT);
}
return SPDK_POLLER_BUSY;
}
for (i = 0; i < vsession->max_queues; i++) {
vsession->virtqueue[i].next_event_time = 0;
vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
}
SPDK_INFOLOG(vhost, "%s: stopping poller on lcore %d\n",
vsession->name, spdk_env_get_current_core());
if (bvsession->io_channel) {
vhost_blk_put_io_channel(bvsession->io_channel);
bvsession->io_channel = NULL;
}
free_task_pool(bvsession);
spdk_poller_unregister(&bvsession->stop_poller);
vhost_user_session_stop_done(vsession, 0);
spdk_vhost_unlock();
return SPDK_POLLER_BUSY;
}
/*
 * Session event that begins stopping a blk session.
 *
 * Removes the request queue poller and any virtqueue interrupts, then
 * registers destroy_session_poller_cb to finish the teardown.
 *
 * Returns 0, or -EINPROGRESS when a stop poller is already registered.
 */
static int
vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
		  struct spdk_vhost_session *vsession, void *unused)
{
	struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);

	/* return if stop is already in progress */
	if (bvsession->stop_poller != NULL) {
		return -EINPROGRESS;
	}

	spdk_poller_unregister(&bvsession->requestq_poller);

	if (vsession->virtqueue[0].intr != NULL) {
		vhost_blk_session_unregister_interrupts(bvsession);
	}

	/* vhost_user_session_send_event timeout is 3 seconds, here set retry within 4 seconds */
	bvsession->vsession.stop_retry_count = 4000;
	bvsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb,
				 bvsession, 1000);

	return 0;
}
/*
 * stop_session callback of the vhost-user blk backend.
 *
 * Sends vhost_blk_stop_cb to the session as an event, with a timeout argument
 * of 3 and the description "stop session", and returns the result of
 * vhost_user_session_send_event().
 */
static int
vhost_blk_stop(struct spdk_vhost_session *vsession)
{
return vhost_user_session_send_event(vsession, vhost_blk_stop_cb,
3, "stop session");
}
/*
 * dump_info_json callback: emit a "block" object describing the device.
 *
 * The object holds "readonly", "bdev" (the bdev name, or null when no bdev is
 * attached) and "transport" (the transport ops name).
 */
static void
vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
{
	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);

	assert(bvdev != NULL);

	spdk_json_write_named_object_begin(w, "block");
	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);

	spdk_json_write_name(w, "bdev");
	if (bvdev->bdev == NULL) {
		spdk_json_write_null(w);
	} else {
		spdk_json_write_string(w, spdk_bdev_get_name(bvdev->bdev));
	}

	spdk_json_write_named_string(w, "transport", bvdev->ops->name);
	spdk_json_write_object_end(w);
}
/*
 * write_config_json callback: emit the "vhost_create_blk_controller" RPC call
 * that describes this device. Nothing is written when the device has no bdev
 * attached.
 */
static void
vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
{
	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
	const char *cpumask;

	assert(bvdev != NULL);

	if (bvdev->bdev == NULL) {
		return;
	}

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "vhost_create_blk_controller");

	/* "params" object */
	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "ctrlr", vdev->name);
	spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
	cpumask = spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread));
	spdk_json_write_named_string(w, "cpumask", cpumask);
	spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
	spdk_json_write_named_string(w, "transport", bvdev->ops->name);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
/* Forward declaration: vhost_blk_device_backend refers to this function before its definition. */
static int vhost_blk_destroy(struct spdk_vhost_dev *dev);
/*
 * vhost_get_config callback: build the virtio-blk config space for the device
 * and copy at most len bytes of it into config.
 *
 * Always returns 0, even when no bdev is attached (see the comment below).
 */
static int
vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
		     uint32_t len)
{
	struct virtio_blk_config blkcfg;
	struct spdk_bdev *bdev;
	/*
	 * Without a bdev, block size and block count stay 0. Failing with -1
	 * is not an option: the GET_CONFIG message may be triggered by a QEMU
	 * VM reboot, and an error could make QEMU terminate itself. A plain
	 * reboot must not break the system.
	 *
	 * A block device with block size 0 and block count 0 is harmless on
	 * the QEMU side. The virtio-pci device remains visible inside the VM,
	 * but no block device is created for it because the kernel drivers
	 * silently reject it.
	 */
	uint32_t blk_size = 0;
	uint64_t blkcnt = 0;

	memset(&blkcfg, 0, sizeof(blkcfg));

	bdev = vhost_blk_get_bdev(vdev);
	if (bdev != NULL) {
		blk_size = spdk_bdev_get_block_size(bdev);
		blkcnt = spdk_bdev_get_num_blocks(bdev);

		if (spdk_bdev_get_buf_align(bdev) > 1) {
			blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE;
			blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, SPDK_BDEV_IO_NUM_CHILD_IOV - 2 - 1);
		} else {
			blkcfg.size_max = 131072;
			/* -2 for REQ and RESP and -1 for region boundary splitting */
			blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
		}
	}

	blkcfg.blk_size = blk_size;
	/* minimum I/O size in blocks */
	blkcfg.min_io_size = 1;
	/* expressed in 512 Bytes sectors */
	blkcfg.capacity = (blkcnt * blk_size) / 512;
	/* QEMU can overwrite this value when started */
	blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;

	if (bdev != NULL && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
		/* 16MiB, expressed in 512 Bytes */
		blkcfg.max_discard_sectors = 32768;
		blkcfg.max_discard_seg = 1;
		blkcfg.discard_sector_alignment = blk_size / 512;
	}

	if (bdev != NULL && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
		blkcfg.max_write_zeroes_sectors = 32768;
		blkcfg.max_write_zeroes_seg = 1;
	}

	/* Never copy more than the caller's buffer or the config struct holds. */
	memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));

	return 0;
}
/*
 * vhost-user backend callbacks for blk devices.
 *
 * session_ctx_size is the number of bytes struct spdk_vhost_blk_session adds
 * on top of the generic struct spdk_vhost_session.
 */
static const struct spdk_vhost_user_dev_backend vhost_blk_user_device_backend = {
.session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
.start_session = vhost_blk_start,
.stop_session = vhost_blk_stop,
.alloc_vq_tasks = alloc_vq_task_pool,
};
/* Generic vhost device backend callbacks for blk devices (type VHOST_BACKEND_BLK). */
static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
.type = VHOST_BACKEND_BLK,
.vhost_get_config = vhost_blk_get_config,
.dump_info_json = vhost_blk_dump_info_json,
.write_config_json = vhost_blk_write_config_json,
.remove_device = vhost_blk_destroy,
};
/*
 * Create the controller for vdev through the create_ctrlr op of the transport
 * bound to the blk device, and return that op's result.
 *
 * Note the argument order differs from this function's: the op takes cpumask
 * before address. user_backend is handed over as the op's opaque last
 * argument, with its const qualifier cast away.
 */
int
virtio_blk_construct_ctrlr(struct spdk_vhost_dev *vdev, const char *address,
struct spdk_cpuset *cpumask, const struct spdk_json_val *params,
const struct spdk_vhost_user_dev_backend *user_backend)
{
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
return bvdev->ops->create_ctrlr(vdev, cpumask, address, params, (void *)user_backend);
}
/*
 * Create a vhost blk device named name on top of bdev dev_name.
 *
 * transport selects the virtio-blk transport by name; NULL selects
 * VIRTIO_BLK_DEFAULT_TRANSPORT. cpumask and params are passed on to
 * vhost_dev_register(). The whole operation runs under the vhost lock.
 *
 * Returns 0 on success. On failure returns -ENOMEM (allocation or I/O channel
 * failure), -EINVAL (unknown transport), or the error from
 * spdk_bdev_open_ext() / vhost_dev_register(); everything acquired up to that
 * point is released again.
 */
int
spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name,
			 const char *transport, const struct spdk_json_val *params)
{
	struct spdk_vhost_blk_dev *bvdev = NULL;
	struct spdk_vhost_dev *vdev;
	struct spdk_bdev *bdev;
	const char *transport_name = VIRTIO_BLK_DEFAULT_TRANSPORT;
	int ret = 0;

	spdk_vhost_lock();

	bvdev = calloc(1, sizeof(*bvdev));
	if (bvdev == NULL) {
		ret = -ENOMEM;
		goto out;
	}

	if (transport != NULL) {
		transport_name = transport;
	}

	bvdev->ops = virtio_blk_get_transport_ops(transport_name);
	if (!bvdev->ops) {
		ret = -EINVAL;
		SPDK_ERRLOG("Transport type '%s' unavailable.\n", transport_name);
		goto out;
	}

	ret = spdk_bdev_open_ext(dev_name, true, bdev_event_cb, bvdev, &bvdev->bdev_desc);
	if (ret != 0) {
		SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n",
			    name, dev_name, ret);
		goto out;
	}
	bdev = spdk_bdev_desc_get_bdev(bvdev->bdev_desc);

	vdev = &bvdev->vdev;
	vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE;
	vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES;
	vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES;

	/* Advertise optional virtio-blk features only when the bdev supports them. */
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD);
	}
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
	}
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
		vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH);
	}

	/*
	 * When starting qemu with multiqueue enabled, the vhost device will
	 * be started/stopped many times, related to the number of queues, as
	 * the exact number of queues used for this device is not known at the
	 * time. The target has to stop and start the device once it gets a
	 * valid IO queue.
	 * When stopping and starting the vhost device, the backend bdev io
	 * device would be deleted and created repeatedly.
	 * Hold a bdev io channel reference in struct spdk_vhost_blk_dev so
	 * that the io device will not be deleted.
	 */
	bvdev->dummy_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
	if (bvdev->dummy_io_channel == NULL) {
		/* Fail here instead of passing NULL to spdk_put_io_channel() later. */
		SPDK_ERRLOG("%s: could not get I/O channel for bdev '%s'\n",
			    name, dev_name);
		spdk_bdev_close(bvdev->bdev_desc);
		ret = -ENOMEM;
		goto out;
	}

	bvdev->bdev = bdev;
	bvdev->readonly = false;
	ret = vhost_dev_register(vdev, name, cpumask, params, &vhost_blk_device_backend,
				 &vhost_blk_user_device_backend);
	if (ret != 0) {
		spdk_put_io_channel(bvdev->dummy_io_channel);
		spdk_bdev_close(bvdev->bdev_desc);
		goto out;
	}

	SPDK_INFOLOG(vhost, "%s: using bdev '%s'\n", name, dev_name);
out:
	if (ret != 0) {
		/* free(NULL) is a no-op, so this also covers the failed calloc. */
		free(bvdev);
	}
	spdk_vhost_unlock();
	return ret;
}
/*
 * Destroy the controller of vdev through the destroy_ctrlr op of the
 * transport bound to the blk device, and return that op's result.
 */
int
virtio_blk_destroy_ctrlr(struct spdk_vhost_dev *vdev)
{
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
return bvdev->ops->destroy_ctrlr(vdev);
}
/*
 * remove_device callback: unregister the vhost device and free the blk device.
 *
 * Returns the error from vhost_dev_unregister() without freeing anything when
 * unregistering fails, 0 otherwise.
 */
static int
vhost_blk_destroy(struct spdk_vhost_dev *vdev)
{
	struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
	int rc;

	assert(bvdev != NULL);

	rc = vhost_dev_unregister(&bvdev->vdev);
	if (rc == 0) {
		/* if the bdev is removed, don't need call spdk_put_io_channel. */
		if (bvdev->bdev != NULL) {
			spdk_put_io_channel(bvdev->dummy_io_channel);
		}

		if (bvdev->bdev_desc != NULL) {
			spdk_bdev_close(bvdev->bdev_desc);
			bvdev->bdev_desc = NULL;
		}
		bvdev->bdev = NULL;

		free(bvdev);
	}

	return rc;
}
/*
 * Get an I/O channel for the bdev descriptor of the blk device behind vdev.
 * Returns whatever spdk_bdev_get_io_channel() returns; callers in this file
 * treat NULL as failure.
 */
struct spdk_io_channel *
vhost_blk_get_io_channel(struct spdk_vhost_dev *vdev)
{
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
return spdk_bdev_get_io_channel(bvdev->bdev_desc);
}
/* Release an I/O channel by passing it to spdk_put_io_channel(). */
void
vhost_blk_put_io_channel(struct spdk_io_channel *ch)
{
spdk_put_io_channel(ch);
}
/*
 * create op of the vhost-user blk transport.
 *
 * Allocates a zeroed transport object and initializes the vhost-user layer.
 * params is not used. Returns the transport, or NULL when the allocation or
 * vhost_user_init() fails.
 */
static struct spdk_virtio_blk_transport *
vhost_user_blk_create(const struct spdk_json_val *params)
{
	struct spdk_virtio_blk_transport *transport;

	transport = calloc(1, sizeof(*transport));
	if (transport == NULL) {
		return NULL;
	}

	if (vhost_user_init() != 0) {
		free(transport);
		transport = NULL;
	}

	return transport;
}
/*
 * destroy op of the vhost-user blk transport.
 *
 * Calls vhost_user_fini() with cb_fn, then frees the transport object.
 * Always returns 0.
 */
static int
vhost_user_blk_destroy(struct spdk_virtio_blk_transport *transport,
spdk_vhost_fini_cb cb_fn)
{
vhost_user_fini(cb_fn);
free(transport);
return 0;
}
/* Options decoded from the JSON params when creating a vhost-user blk controller. */
struct rpc_vhost_blk {
/* When set, VIRTIO_BLK_F_RO is offered and the device is marked read-only. */
bool readonly;
/* When set, VIRTIO_F_RING_PACKED is offered. */
bool packed_ring;
/* Copied to the device's packed_ring_recovery flag; only honored when packed_ring is set. */
bool packed_ring_recovery;
};
/*
 * JSON decoders filling struct rpc_vhost_blk. Each entry gives the key name,
 * the field offset and the decode function; the trailing `true` presumably
 * marks the key as optional (see struct spdk_json_object_decoder).
 */
static const struct spdk_json_object_decoder rpc_construct_vhost_blk[] = {
{"readonly", offsetof(struct rpc_vhost_blk, readonly), spdk_json_decode_bool, true},
{"packed_ring", offsetof(struct rpc_vhost_blk, packed_ring), spdk_json_decode_bool, true},
{"packed_ring_recovery", offsetof(struct rpc_vhost_blk, packed_ring_recovery), spdk_json_decode_bool, true},
};
static int
vhost_user_blk_create_ctrlr(struct spdk_vhost_dev *vdev, struct spdk_cpuset *cpumask,
const char *address, const struct spdk_json_val *params, void *custom_opts)
{
struct rpc_vhost_blk req = {0};
struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
if (spdk_json_decode_object_relaxed(params, rpc_construct_vhost_blk,
SPDK_COUNTOF(rpc_construct_vhost_blk),
&req)) {
SPDK_DEBUGLOG(vhost_blk, "spdk_json_decode_object failed\n");
return -EINVAL;
}
vdev->packed_ring_recovery = false;
if (req.packed_ring) {
vdev->virtio_features |= (uint64_t)req.packed_ring << VIRTIO_F_RING_PACKED;
vdev->packed_ring_recovery = req.packed_ring_recovery;
}
if (req.readonly) {
vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO);
bvdev->readonly = req.readonly;
}
return vhost_user_dev_register(vdev, address, cpumask, custom_opts);
}
/*
 * destroy_ctrlr op of the vhost-user blk transport: unregister the vhost-user
 * device and return the result of vhost_user_dev_unregister().
 */
static int
vhost_user_blk_destroy_ctrlr(struct spdk_vhost_dev *vdev)
{
return vhost_user_dev_unregister(vdev);
}
/*
 * Operations table of the "vhost_user_blk" transport. No dump_opts handler is
 * provided.
 */
static const struct spdk_virtio_blk_transport_ops vhost_user_blk = {
.name = "vhost_user_blk",
.dump_opts = NULL,
.create = vhost_user_blk_create,
.destroy = vhost_user_blk_destroy,
.create_ctrlr = vhost_user_blk_create_ctrlr,
.destroy_ctrlr = vhost_user_blk_destroy_ctrlr,
.bdev_event = vhost_user_bdev_event_cb,
};
/* Register the vhost-user blk transport ops defined above. */
SPDK_VIRTIO_BLK_TRANSPORT_REGISTER(vhost_user_blk, &vhost_user_blk);
/* Register the log components of this file; vhost_blk is the one passed to SPDK_DEBUGLOG above. */
SPDK_LOG_REGISTER_COMPONENT(vhost_blk)
SPDK_LOG_REGISTER_COMPONENT(vhost_blk_data)