diff --git a/module/bdev/nvme/bdev_ocssd.c b/module/bdev/nvme/bdev_ocssd.c index 4804f0c63..85b5fc104 100644 --- a/module/bdev/nvme/bdev_ocssd.c +++ b/module/bdev/nvme/bdev_ocssd.c @@ -39,15 +39,29 @@ #include "spdk/nvme_ocssd.h" #include "spdk/nvme_ocssd_spec.h" #include "spdk_internal/log.h" +#include "spdk/nvme.h" #include "common.h" #include "bdev_ocssd.h" +struct bdev_ocssd_lba_offsets { + uint32_t grp; + uint32_t pu; + uint32_t chk; + uint32_t lbk; +}; + +struct bdev_ocssd_io { + size_t iov_pos; + size_t iov_off; +}; + struct ocssd_bdev { struct nvme_bdev nvme_bdev; }; struct bdev_ocssd_ns { struct spdk_ocssd_geometry_data geometry; + struct bdev_ocssd_lba_offsets lba_offsets; }; static struct bdev_ocssd_ns * @@ -76,7 +90,7 @@ bdev_ocssd_config_json(struct spdk_json_write_ctx *w) static int bdev_ocssd_get_ctx_size(void) { - return 0; + return sizeof(struct bdev_ocssd_io); } static struct spdk_bdev_module ocssd_if = { @@ -112,16 +126,217 @@ bdev_ocssd_destruct(void *ctx) return 0; } +static uint64_t +bdev_ocssd_to_disk_lba(struct ocssd_bdev *ocssd_bdev, uint64_t lba) +{ + struct nvme_bdev_ns *nvme_ns = ocssd_bdev->nvme_bdev.nvme_ns; + struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_nvme(nvme_ns); + const struct spdk_ocssd_geometry_data *geo = &ocssd_ns->geometry; + const struct bdev_ocssd_lba_offsets *offsets = &ocssd_ns->lba_offsets; + uint64_t addr_shift, lbk, chk, pu, grp; + + /* To achieve best performance, we need to make sure that adjacent zones can be accessed + * in parallel. We accomplish this by having the following addressing scheme: + * + * [ zone id ][ zone offset ] User's LBA + * [ chunk ][ group ][ parallel unit ][ logical block ] Open Channel's LBA + * + * which means that neighbouring zones are placed in a different group and parallel unit. 
+ */ + lbk = lba % geo->clba; + addr_shift = geo->clba; + + pu = (lba / addr_shift) % geo->num_pu; + addr_shift *= geo->num_pu; + + grp = (lba / addr_shift) % geo->num_grp; + addr_shift *= geo->num_grp; + + chk = (lba / addr_shift) % geo->num_chk; + + return (lbk << offsets->lbk) | + (chk << offsets->chk) | + (pu << offsets->pu) | + (grp << offsets->grp); +} + +static void +bdev_ocssd_reset_sgl(void *cb_arg, uint32_t offset) +{ + struct spdk_bdev_io *bdev_io = cb_arg; + struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx; + struct iovec *iov; + + ocdev_io->iov_pos = 0; + ocdev_io->iov_off = 0; + + for (; ocdev_io->iov_pos < (size_t)bdev_io->u.bdev.iovcnt; ++ocdev_io->iov_pos) { + iov = &bdev_io->u.bdev.iovs[ocdev_io->iov_pos]; + if (offset < iov->iov_len) { + ocdev_io->iov_off = offset; + return; + } + + offset -= iov->iov_len; + } + + assert(false && "Invalid offset length"); +} + +static int +bdev_ocssd_next_sge(void *cb_arg, void **address, uint32_t *length) +{ + struct spdk_bdev_io *bdev_io = cb_arg; + struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx; + struct iovec *iov; + + assert(ocdev_io->iov_pos < (size_t)bdev_io->u.bdev.iovcnt); + iov = &bdev_io->u.bdev.iovs[ocdev_io->iov_pos]; + + *address = iov->iov_base; + *length = iov->iov_len; + + if (ocdev_io->iov_off != 0) { + assert(ocdev_io->iov_off < iov->iov_len); + *address = (char *)*address + ocdev_io->iov_off; + *length -= ocdev_io->iov_off; + } + + assert(ocdev_io->iov_off + *length == iov->iov_len); + ocdev_io->iov_off = 0; + ocdev_io->iov_pos++; + + return 0; +} + +static void +bdev_ocssd_read_cb(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_bdev_io *bdev_io = ctx; + + spdk_bdev_io_complete_nvme_status(bdev_io, 0, cpl->status.sct, cpl->status.sc); +} + +static int +bdev_ocssd_read(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io) +{ + struct ocssd_bdev *ocssd_bdev = bdev_io->bdev->ctxt; + struct nvme_bdev *nvme_bdev = 
&ocssd_bdev->nvme_bdev; + struct nvme_io_channel *nvme_ioch = spdk_io_channel_get_ctx(ioch); + struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx; + const size_t zone_size = nvme_bdev->disk.zone_size; + uint64_t lba; + + if ((bdev_io->u.bdev.offset_blocks % zone_size) + bdev_io->u.bdev.num_blocks > zone_size) { + SPDK_ERRLOG("Tried to cross zone boundary during read command\n"); + return -EINVAL; + } + + ocdev_io->iov_pos = 0; + ocdev_io->iov_off = 0; + + lba = bdev_ocssd_to_disk_lba(ocssd_bdev, bdev_io->u.bdev.offset_blocks); + + return spdk_nvme_ns_cmd_readv_with_md(nvme_bdev->nvme_ns->ns, nvme_ioch->qpair, lba, + bdev_io->u.bdev.num_blocks, bdev_ocssd_read_cb, + bdev_io, 0, bdev_ocssd_reset_sgl, + bdev_ocssd_next_sge, bdev_io->u.bdev.md_buf, 0, 0); +} + +static void +bdev_ocssd_write_cb(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_bdev_io *bdev_io = ctx; + + spdk_bdev_io_complete_nvme_status(bdev_io, 0, cpl->status.sct, cpl->status.sc); +} + +static int +bdev_ocssd_write(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io) +{ + struct ocssd_bdev *ocssd_bdev = bdev_io->bdev->ctxt; + struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev; + struct nvme_io_channel *nvme_ioch = spdk_io_channel_get_ctx(ioch); + struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx; + const size_t zone_size = nvme_bdev->disk.zone_size; + uint64_t lba; + + if ((bdev_io->u.bdev.offset_blocks % zone_size) + bdev_io->u.bdev.num_blocks > zone_size) { + SPDK_ERRLOG("Tried to cross zone boundary during write command\n"); + return -EINVAL; + } + + ocdev_io->iov_pos = 0; + ocdev_io->iov_off = 0; + + lba = bdev_ocssd_to_disk_lba(ocssd_bdev, bdev_io->u.bdev.offset_blocks); + + return spdk_nvme_ns_cmd_writev_with_md(nvme_bdev->nvme_ns->ns, nvme_ioch->qpair, lba, + bdev_io->u.bdev.num_blocks, bdev_ocssd_write_cb, + bdev_io, 0, bdev_ocssd_reset_sgl, + bdev_ocssd_next_sge, bdev_io->u.bdev.md_buf, 0, 0); +} + +static void 
+bdev_ocssd_io_get_buf_cb(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io, bool success) +{ + int rc; + + if (!success) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + return; + } + + rc = bdev_ocssd_read(ioch, bdev_io); + if (spdk_unlikely(rc != 0)) { + if (rc == -ENOMEM) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + } +} + static void bdev_ocssd_submit_request(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io) { - spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + int rc = 0; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_ocssd_io_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + + case SPDK_BDEV_IO_TYPE_WRITE: + rc = bdev_ocssd_write(ioch, bdev_io); + break; + + default: + rc = -EINVAL; + break; + } + + if (spdk_unlikely(rc != 0)) { + if (rc == -ENOMEM) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + } } static bool bdev_ocssd_io_type_supported(void *ctx, enum spdk_bdev_io_type type) { - return false; + switch (type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + return true; + + default: + return false; + } } static struct spdk_io_channel * @@ -310,6 +525,7 @@ bdev_ocssd_geometry_cb(void *_ctx, const struct spdk_nvme_cpl *cpl) { struct bdev_ocssd_populate_ns_ctx *ctx = _ctx; struct nvme_bdev_ns *nvme_ns = ctx->nvme_ns; + struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_nvme(nvme_ns); int rc = 0; if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) { @@ -317,6 +533,14 @@ bdev_ocssd_geometry_cb(void *_ctx, const struct spdk_nvme_cpl *cpl) free(nvme_ns->type_ctx); nvme_ns->type_ctx = NULL; rc = -EIO; + } else { + ocssd_ns->lba_offsets.lbk = 0; + ocssd_ns->lba_offsets.chk = ocssd_ns->lba_offsets.lbk + +
ocssd_ns->geometry.lbaf.lbk_len; + ocssd_ns->lba_offsets.pu = ocssd_ns->lba_offsets.chk + + ocssd_ns->geometry.lbaf.chk_len; + ocssd_ns->lba_offsets.grp = ocssd_ns->lba_offsets.pu + + ocssd_ns->geometry.lbaf.pu_len; } nvme_ctrlr_populate_namespace_done(ctx->nvme_ctx, nvme_ns, rc); @@ -330,8 +554,15 @@ bdev_ocssd_populate_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, { struct bdev_ocssd_ns *ocssd_ns; struct bdev_ocssd_populate_ns_ctx *ctx; + struct spdk_nvme_ns *ns; int rc; + ns = spdk_nvme_ctrlr_get_ns(nvme_bdev_ctrlr->ctrlr, nvme_ns->id); + if (ns == NULL) { + nvme_ctrlr_populate_namespace_done(nvme_ctx, nvme_ns, -EINVAL); + return; + } + ctx = calloc(1, sizeof(*ctx)); if (ctx == NULL) { nvme_ctrlr_populate_namespace_done(nvme_ctx, nvme_ns, -ENOMEM); @@ -346,6 +577,7 @@ bdev_ocssd_populate_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, } nvme_ns->type_ctx = ocssd_ns; + nvme_ns->ns = ns; ctx->nvme_ctx = nvme_ctx; ctx->nvme_ns = nvme_ns; diff --git a/test/unit/lib/bdev/bdev_ocssd.c/bdev_ocssd_ut.c b/test/unit/lib/bdev/bdev_ocssd.c/bdev_ocssd_ut.c index a8f7a57f8..b09e197b1 100644 --- a/test/unit/lib/bdev/bdev_ocssd.c/bdev_ocssd_ut.c +++ b/test/unit/lib/bdev/bdev_ocssd.c/bdev_ocssd_ut.c @@ -50,6 +50,8 @@ DEFINE_STUB(spdk_nvme_ns_get_extended_sector_size, uint32_t, (struct spdk_nvme_n DEFINE_STUB(spdk_nvme_ns_is_active, bool, (struct spdk_nvme_ns *ns), true); DEFINE_STUB_V(spdk_opal_close, (struct spdk_opal_dev *dev)); DEFINE_STUB(spdk_opal_revert_poll, int, (struct spdk_opal_dev *dev), 0); +DEFINE_STUB_V(spdk_bdev_io_complete_nvme_status, (struct spdk_bdev_io *bdev_io, uint32_t cdw0, + int sct, int sc)); struct nvme_request { spdk_nvme_cmd_cb cb_fn; @@ -381,6 +383,22 @@ spdk_nvme_ns_cmd_readv_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair * return 0; } +int +spdk_nvme_ns_cmd_writev_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + 
spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata, + uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + + req = alloc_request(cb_fn, cb_arg); + TAILQ_INSERT_TAIL(&qpair->requests, req, tailq); + + return 0; +} + static void create_bdev_cb(const char *bdev_name, int status, void *ctx) { @@ -528,6 +546,133 @@ test_device_geometry(void) free_controller(ctrlr); } +static uint64_t +generate_lba(const struct spdk_ocssd_geometry_data *geo, uint64_t lbk, + uint64_t chk, uint64_t pu, uint64_t grp) +{ + uint64_t lba, len; + + lba = lbk; + len = geo->lbaf.lbk_len; + CU_ASSERT(lbk < (1ull << geo->lbaf.lbk_len)); + + lba |= chk << len; + len += geo->lbaf.chk_len; + CU_ASSERT(chk < (1ull << geo->lbaf.chk_len)); + + lba |= pu << len; + len += geo->lbaf.pu_len; + CU_ASSERT(pu < (1ull << geo->lbaf.pu_len)); + + lba |= grp << len; + + return lba; +} + +static void +test_lba_translation(void) +{ + struct spdk_nvme_ctrlr *ctrlr; + struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; + struct spdk_nvme_transport_id trid = { .traddr = "00:00:00" }; + const char *controller_name = "nvme0"; + const char *bdev_name = "nvme0n1"; + struct spdk_ocssd_geometry_data geometry = {}; + struct ocssd_bdev *ocssd_bdev; + struct spdk_bdev *bdev; + uint64_t lba; + int rc; + + geometry = (struct spdk_ocssd_geometry_data) { + .clba = 512, + .num_chk = 64, + .num_pu = 8, + .num_grp = 4, + .lbaf = { + .lbk_len = 9, + .chk_len = 6, + .pu_len = 3, + .grp_len = 2, + } + }; + + ctrlr = create_controller(&trid, 1, &geometry); + nvme_bdev_ctrlr = create_nvme_bdev_controller(&trid, controller_name); + + rc = create_bdev(controller_name, bdev_name, 1); + CU_ASSERT_EQUAL(rc, 0); + + bdev = spdk_bdev_get_by_name(bdev_name); + SPDK_CU_ASSERT_FATAL(bdev != NULL); + ocssd_bdev = SPDK_CONTAINEROF(bdev, struct ocssd_bdev, nvme_bdev.disk); + + lba = bdev_ocssd_to_disk_lba(ocssd_bdev, 0); + CU_ASSERT_EQUAL(lba, generate_lba(&geometry, 0, 0, 0, 0)); + + lba = 
bdev_ocssd_to_disk_lba(ocssd_bdev, bdev->zone_size - 1); + CU_ASSERT_EQUAL(lba, generate_lba(&geometry, bdev->zone_size - 1, 0, 0, 0)); + + lba = bdev_ocssd_to_disk_lba(ocssd_bdev, bdev->zone_size); + CU_ASSERT_EQUAL(lba, generate_lba(&geometry, 0, 0, 1, 0)); + + lba = bdev_ocssd_to_disk_lba(ocssd_bdev, bdev->zone_size * geometry.num_pu); + CU_ASSERT_EQUAL(lba, generate_lba(&geometry, 0, 0, 0, 1)); + + lba = bdev_ocssd_to_disk_lba(ocssd_bdev, bdev->zone_size * geometry.num_pu + 68); + CU_ASSERT_EQUAL(lba, generate_lba(&geometry, 68, 0, 0, 1)); + + lba = bdev_ocssd_to_disk_lba(ocssd_bdev, bdev->zone_size + 68); + CU_ASSERT_EQUAL(lba, generate_lba(&geometry, 68, 0, 1, 0)); + + delete_nvme_bdev_controller(nvme_bdev_ctrlr); + free_controller(ctrlr); + + geometry = (struct spdk_ocssd_geometry_data) { + .clba = 5120, + .num_chk = 501, + .num_pu = 9, + .num_grp = 1, + .lbaf = { + .lbk_len = 13, + .chk_len = 9, + .pu_len = 4, + .grp_len = 1, + } + }; + + ctrlr = create_controller(&trid, 1, &geometry); + nvme_bdev_ctrlr = create_nvme_bdev_controller(&trid, controller_name); + + rc = create_bdev(controller_name, bdev_name, 1); + CU_ASSERT_EQUAL(rc, 0); + + bdev = spdk_bdev_get_by_name(bdev_name); + SPDK_CU_ASSERT_FATAL(bdev != NULL); + ocssd_bdev = SPDK_CONTAINEROF(bdev, struct ocssd_bdev, nvme_bdev.disk); + + lba = bdev_ocssd_to_disk_lba(ocssd_bdev, 0); + CU_ASSERT_EQUAL(lba, generate_lba(&geometry, 0, 0, 0, 0)); + + lba = bdev_ocssd_to_disk_lba(ocssd_bdev, bdev->zone_size - 1); + CU_ASSERT_EQUAL(lba, generate_lba(&geometry, bdev->zone_size - 1, 0, 0, 0)); + + lba = bdev_ocssd_to_disk_lba(ocssd_bdev, bdev->zone_size); + CU_ASSERT_EQUAL(lba, generate_lba(&geometry, 0, 0, 1, 0)); + + lba = bdev_ocssd_to_disk_lba(ocssd_bdev, bdev->zone_size * (geometry.num_pu - 1)); + CU_ASSERT_EQUAL(lba, generate_lba(&geometry, 0, 0, geometry.num_pu - 1, 0)); + + lba = bdev_ocssd_to_disk_lba(ocssd_bdev, bdev->zone_size * (geometry.num_pu)); + CU_ASSERT_EQUAL(lba, generate_lba(&geometry, 0, 1, 
0, 0)); + + lba = bdev_ocssd_to_disk_lba(ocssd_bdev, bdev->zone_size * (geometry.num_pu) + 68); + CU_ASSERT_EQUAL(lba, generate_lba(&geometry, 68, 1, 0, 0)); + + delete_nvme_bdev_controller(nvme_bdev_ctrlr); + + free_controller(ctrlr); +} + int main(int argc, const char **argv) { @@ -546,7 +691,8 @@ main(int argc, const char **argv) if ( CU_add_test(suite, "test_create_controller", test_create_controller) == NULL || - CU_add_test(suite, "test_device_geometry", test_device_geometry) == NULL + CU_add_test(suite, "test_device_geometry", test_device_geometry) == NULL || + CU_add_test(suite, "test_lba_translation", test_lba_translation) == NULL ) { CU_cleanup_registry(); return CU_get_error();