diff --git a/CONFIG b/CONFIG index 2d4e53eb9..801f22c1f 100644 --- a/CONFIG +++ b/CONFIG @@ -104,6 +104,10 @@ CONFIG_FC_PATH= # Requires librbd development libraries CONFIG_RBD=n +# Build DAOS support in bdev modules +# Requires daos development libraries +CONFIG_DAOS=n + # Build vhost library. CONFIG_VHOST=y diff --git a/configure b/configure index 541c0771c..a64571ace 100755 --- a/configure +++ b/configure @@ -78,6 +78,8 @@ function usage() { echo " --without-fc If an argument is provided, it is considered a directory containing" echo " libufc.a and fc_lld.h. Otherwise the regular system paths will" echo " be searched." + echo " --with-daos Build DAOS bdev module." + echo " --without-daos No path required." echo " --with-shared Build spdk shared libraries." echo " --without-shared No path required." echo " --with-iscsi-initiator Build with iscsi bdev module." @@ -483,6 +485,12 @@ for i in "$@"; do CONFIG[FC]=n CONFIG[FC_PATH]= ;; + --with-daos) + CONFIG[DAOS]=y + ;; + --without-daos) + CONFIG[DAOS]=n + ;; --with-shared) CONFIG[SHARED]=y ;; @@ -1036,6 +1044,16 @@ if [[ "${CONFIG[ISCSI_INITIATOR]}" = "y" ]]; then fi fi +if [[ "${CONFIG[DAOS]}" = "y" ]]; then + if ! echo -e '#include <daos.h>\n#include <daos_fs.h>\n' \ + 'int main(void) { return 0; }\n' \ + | "${BUILD_CMD[@]}" -lgurt -ldaos -ldaos_common -ldfs - 2> /dev/null; then + echo "--with-daos requires libdaos, libdaos_common, libdfs and libgurt" + echo "Please install then re-run this script." + exit 1 + fi +fi + if [[ "${CONFIG[ASAN]}" = "y" ]]; then if ! echo -e 'int main(void) { return 0; }\n' \ | "${BUILD_CMD[@]}" -fsanitize=address - 2> /dev/null; then diff --git a/doc/bdev.md b/doc/bdev.md index c4b570a88..49c1827a3 100644 --- a/doc/bdev.md +++ b/doc/bdev.md @@ -672,3 +672,24 @@ Virtio-SCSI devices can be removed with the following command `rpc.py bdev_virtio_detach_controller VirtioScsi0` Removing a Virtio-SCSI device will destroy all its bdevs.
+ +## DAOS bdev {#bdev_config_daos} + +DAOS bdev creates an SPDK block device on top of DAOS DFS; the name of the bdev defines the file name in the DFS namespace. +Note that the DAOS container has to be of POSIX type, e.g.: `daos cont create --pool=test-pool --label=test-cont --type=POSIX` + +To build SPDK with DAOS support, the daos-devel package has to be installed; please see the setup [guide](https://docs.daos.io/v2.0/). +To enable the module, configure SPDK using the `--with-daos` flag. + +Running the `daos_agent` service on the target machine is required for the SPDK DAOS bdev to communicate with a DAOS cluster. + +The implementation uses independent pool and container connections per device channel for the best IO throughput; therefore, +running a target application with multiple cores (`-m [0-7]`, for example) is highly advisable. + +Example command for creating a DAOS bdev: + +`rpc.py bdev_daos_create daosdev0 test-pool test-cont 64 4096` + +Example command for removing a DAOS bdev: + +`rpc.py bdev_daos_delete daosdev0` diff --git a/doc/jsonrpc.md b/doc/jsonrpc.md index 453a54610..34e63f6cc 100644 --- a/doc/jsonrpc.md +++ b/doc/jsonrpc.md @@ -481,7 +481,9 @@ Example response: "bdev_lvol_create", "bdev_lvol_delete_lvstore", "bdev_lvol_rename_lvstore", - "bdev_lvol_create_lvstore" + "bdev_lvol_create_lvstore", + "bdev_daos_delete", + "bdev_daos_create" ] } ~~~ @@ -10141,3 +10143,86 @@ Example response: } ~~~ + +### bdev_daos_create {#rpc_bdev_daos_create} + +Construct @ref bdev_config_daos + +#### Parameters + +Name | Optional | Type | Description +----------------------- | -------- | ----------- | ----------- +name | Required | string | Bdev name to use +pool | Required | string | DAOS pool label or its uuid +cont | Required | string | DAOS cont label or its uuid +block_size | Required | number | Block size in bytes - must be a multiple of 512 +num_blocks | Required | number | Number of blocks +uuid | Optional | string | UUID of new bdev + +#### Result + +Name of newly created bdev.
+ +#### Example + +Example request: + +~~~json +{ + "params": { + "block_size": 4096, + "num_blocks": 16384, + "name": "daosdev0", + "pool": "test-pool", + "cont": "test-cont" + }, + "jsonrpc": "2.0", + "method": "bdev_daos_create", + "id": 1 +} +~~~ + +Example response: + +~~~json +{ + "jsonrpc": "2.0", + "id": 1, + "result": "daosdev0" +} +~~~ + +### bdev_daos_delete {#rpc_bdev_daos_delete} + +Delete @ref bdev_config_daos + +#### Parameters + +Name | Optional | Type | Description +----------------------- | -------- | ----------- | ----------- +name | Required | string | Bdev name + +#### Example + +Example request: + +~~~json +{ + "params": { + "name": "daosdev0" + }, + "jsonrpc": "2.0", + "method": "bdev_daos_delete", + "id": 1 +} +~~~ + +Example response: + +~~~json +{ + "jsonrpc": "2.0", + "id": 1, + "result": true +} +~~~ diff --git a/mk/spdk.modules.mk b/mk/spdk.modules.mk index cd24e4331..92f1f0382 100644 --- a/mk/spdk.modules.mk +++ b/mk/spdk.modules.mk @@ -84,6 +84,11 @@ BLOCKDEV_MODULES_LIST += bdev_pmem BLOCKDEV_MODULES_PRIVATE_LIBS += -lpmemblk -lpmem endif +ifeq ($(CONFIG_DAOS),y) +BLOCKDEV_MODULES_LIST += bdev_daos +BLOCKDEV_MODULES_PRIVATE_LIBS += -ldaos -ldaos_common -ldfs -lgurt -luuid -ldl +endif + SOCK_MODULES_LIST = sock_posix ifeq ($(OS), Linux) diff --git a/module/bdev/Makefile b/module/bdev/Makefile index cff0ab0ac..338bb0314 100644 --- a/module/bdev/Makefile +++ b/module/bdev/Makefile @@ -27,6 +27,8 @@ endif DIRS-$(CONFIG_RBD) += rbd +DIRS-$(CONFIG_DAOS) += daos + .PHONY: all clean $(DIRS-y) all: $(DIRS-y) diff --git a/module/bdev/daos/Makefile b/module/bdev/daos/Makefile new file mode 100644 index 000000000..09478bc1a --- /dev/null +++ b/module/bdev/daos/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) Intel Corporation. +# All rights reserved. +# Copyright (c) croit GmbH. +# All rights reserved. + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +SO_VER := 1 +SO_MINOR := 0 + +C_SRCS = bdev_daos.c bdev_daos_rpc.c +LIBNAME = bdev_daos + +SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/module/bdev/daos/bdev_daos.c b/module/bdev/daos/bdev_daos.c new file mode 100644 index 000000000..d65225f93 --- /dev/null +++ b/module/bdev/daos/bdev_daos.c @@ -0,0 +1,786 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) Intel Corporation. + * All rights reserved. + * Copyright (c) croit GmbH. + * All rights reserved. + */ + +#include <sys/queue.h> + +#include "spdk/bdev.h" +#include "spdk/bdev_module.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/json.h" +#include "spdk/thread.h" +#include "spdk/queue.h" +#include "spdk/string.h" +#include "spdk/stdinc.h" +#include "spdk/log.h" + +#include <daos.h> +#include <daos_event.h> +#include <daos_fs.h> +#include <daos_types.h> +#include <daos_pool.h> +#include <daos_cont.h> +#include <daos_errno.h> + +#include "bdev_daos.h" + +#define BDEV_DAOS_IOVECS_MAX 32 + +struct bdev_daos_task { + daos_event_t ev; + struct spdk_thread *submit_td; + struct spdk_bdev_io *bdev_io; + + enum spdk_bdev_io_status status; + + uint64_t offset; + + /* DAOS version of iovec and scatter/gather */ + daos_size_t read_size; + d_iov_t diovs[BDEV_DAOS_IOVECS_MAX]; + d_sg_list_t sgl; +}; + +struct bdev_daos { + struct spdk_bdev disk; + + char pool_name[DAOS_PROP_MAX_LABEL_BUF_LEN]; + char cont_name[DAOS_PROP_MAX_LABEL_BUF_LEN]; + + struct bdev_daos_task *reset_task; + struct spdk_poller *reset_retry_timer; +}; + +struct bdev_daos_io_channel { + struct bdev_daos *disk; + struct spdk_poller *poller; + + daos_handle_t pool; + daos_handle_t cont; + + dfs_t *dfs; + dfs_obj_t *obj; + daos_handle_t queue; +}; + +static uint32_t g_bdev_daos_init_count = 0; +static pthread_mutex_t g_bdev_daos_init_mutex = PTHREAD_MUTEX_INITIALIZER; + +static int bdev_daos_initialize(void); + +static int bdev_get_daos_engine(void); +static int bdev_daos_put_engine(void); + +static int
+bdev_daos_get_ctx_size(void) +{ + return sizeof(struct bdev_daos_task); +} + +static struct spdk_bdev_module daos_if = { + .name = "daos", + .module_init = bdev_daos_initialize, + .get_ctx_size = bdev_daos_get_ctx_size, +}; + +SPDK_BDEV_MODULE_REGISTER(daos, &daos_if) + +static void +bdev_daos_free(struct bdev_daos *bdev_daos) +{ + if (!bdev_daos) { + return; + } + + free(bdev_daos->disk.name); + free(bdev_daos); +} + +static void +bdev_daos_destruct_cb(void *io_device) +{ + int rc; + struct bdev_daos *daos = io_device; + + assert(daos != NULL); + + bdev_daos_free(daos); + + rc = bdev_daos_put_engine(); + if (rc) { + SPDK_ERRLOG("could not de-initialize DAOS engine: " DF_RC "\n", DP_RC(rc)); + } +} + +static int +bdev_daos_destruct(void *ctx) +{ + struct bdev_daos *daos = ctx; + + SPDK_NOTICELOG("%s: destroying bdev_daos device\n", daos->disk.name); + + spdk_io_device_unregister(daos, bdev_daos_destruct_cb); + + return 0; +} + +static void +_bdev_daos_io_complete(void *bdev_daos_task) +{ + struct bdev_daos_task *task = bdev_daos_task; + + SPDK_DEBUGLOG(bdev_daos, "completed IO at %#lx with status %s\n", task->offset, + task->status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
"SUCCESS" : "FAILURE"); + + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status); +} + +static void +bdev_daos_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) +{ + struct bdev_daos_task *task = (struct bdev_daos_task *)bdev_io->driver_ctx; + struct spdk_thread *current_thread = spdk_get_thread(); + + assert(task->submit_td != NULL); + + task->status = status; + if (task->submit_td != current_thread) { + spdk_thread_send_msg(task->submit_td, _bdev_daos_io_complete, task); + } else { + _bdev_daos_io_complete(task); + } +} + +static int64_t +bdev_daos_writev(struct bdev_daos *daos, struct bdev_daos_io_channel *ch, + struct bdev_daos_task *task, + struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) +{ + int rc; + struct iovec *io = iov; + + SPDK_DEBUGLOG(bdev_daos, "write %d iovs size %lu to off: %#lx\n", + iovcnt, nbytes, offset); + + assert(ch != NULL); + assert(daos != NULL); + assert(task != NULL); + assert(iov != NULL); + + if (iovcnt > BDEV_DAOS_IOVECS_MAX) { + SPDK_ERRLOG("iovs number [%d] exceeds max allowed limit [%d]\n", iovcnt, + BDEV_DAOS_IOVECS_MAX); + return -E2BIG; + } + + if ((rc = daos_event_init(&task->ev, ch->queue, NULL))) { + SPDK_ERRLOG("%s: could not initialize async event: " DF_RC "\n", + daos->disk.name, DP_RC(rc)); + return -EINVAL; + } + + for (int i = 0; i < iovcnt; i++, io++) { /* advance io, the cursor dereferenced below */ + d_iov_set(&(task->diovs[i]), io->iov_base, io->iov_len); + } + + task->sgl.sg_nr = iovcnt; + task->sgl.sg_nr_out = 0; + task->sgl.sg_iovs = task->diovs; + task->offset = offset; + + if ((rc = dfs_write(ch->dfs, ch->obj, &task->sgl, offset, &task->ev))) { + SPDK_ERRLOG("%s: could not start async write: " DF_RC "\n", + daos->disk.name, DP_RC(rc)); + daos_event_fini(&task->ev); + return -EINVAL; + } + + return nbytes; +} + +static int64_t +bdev_daos_readv(struct bdev_daos *daos, struct bdev_daos_io_channel *ch, + struct bdev_daos_task *task, + struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) +{ + int
rc; + struct iovec *io = iov; + + SPDK_DEBUGLOG(bdev_daos, "read %d iovs size %lu to off: %#lx\n", + iovcnt, nbytes, offset); + + assert(ch != NULL); + assert(daos != NULL); + assert(task != NULL); + assert(iov != NULL); + + if (iovcnt > BDEV_DAOS_IOVECS_MAX) { + SPDK_ERRLOG("iovs number [%d] exceeds max allowed limit [%d]\n", iovcnt, + BDEV_DAOS_IOVECS_MAX); + return -E2BIG; + } + + if ((rc = daos_event_init(&task->ev, ch->queue, NULL))) { + SPDK_ERRLOG("%s: could not initialize async event: " DF_RC "\n", + daos->disk.name, DP_RC(rc)); + return -EINVAL; + } + + for (int i = 0; i < iovcnt; i++, io++) { + d_iov_set(&(task->diovs[i]), io->iov_base, io->iov_len); + } + + task->sgl.sg_nr = iovcnt; + task->sgl.sg_nr_out = 0; + task->sgl.sg_iovs = task->diovs; + task->offset = offset; + + if ((rc = dfs_read(ch->dfs, ch->obj, &task->sgl, offset, &task->read_size, &task->ev))) { + SPDK_ERRLOG("%s: could not start async read: " DF_RC "\n", + daos->disk.name, DP_RC(rc)); + daos_event_fini(&task->ev); + return -EINVAL; + } + + return nbytes; +} + +static void +bdev_daos_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, + bool success) +{ + int64_t rc; + struct bdev_daos_io_channel *dch = spdk_io_channel_get_ctx(ch); + + if (!success) { + bdev_daos_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + rc = bdev_daos_readv((struct bdev_daos *)bdev_io->bdev->ctxt, + dch, + (struct bdev_daos_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); + + if (rc < 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } +} + +static void +_bdev_daos_get_io_inflight(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct bdev_daos_io_channel *dch = spdk_io_channel_get_ctx(ch); + int io_inflight = daos_eq_query(dch->queue, DAOS_EQR_WAITING, 
0, NULL); + + if (io_inflight > 0) { + spdk_for_each_channel_continue(i, -1); + return; + } + + spdk_for_each_channel_continue(i, 0); +} + +static int bdev_daos_reset_retry_timer(void *arg); + +static void +_bdev_daos_get_io_inflight_done(struct spdk_io_channel_iter *i, int status) +{ + struct bdev_daos *daos = spdk_io_channel_iter_get_ctx(i); + + if (status == -1) { + daos->reset_retry_timer = SPDK_POLLER_REGISTER(bdev_daos_reset_retry_timer, daos, 1000); + return; + } + + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(daos->reset_task), SPDK_BDEV_IO_STATUS_SUCCESS); +} + +static int +bdev_daos_reset_retry_timer(void *arg) +{ + struct bdev_daos *daos = arg; + + if (daos->reset_retry_timer) { + spdk_poller_unregister(&daos->reset_retry_timer); + } + + spdk_for_each_channel(daos, + _bdev_daos_get_io_inflight, + daos, + _bdev_daos_get_io_inflight_done); + + return SPDK_POLLER_BUSY; +} + +static void +bdev_daos_reset(struct bdev_daos *daos, struct bdev_daos_task *task) +{ + assert(daos != NULL); + assert(task != NULL); + + daos->reset_task = task; + bdev_daos_reset_retry_timer(daos); +} + + +static int64_t +bdev_daos_unmap(struct bdev_daos_io_channel *ch, uint64_t nbytes, + uint64_t offset) +{ + SPDK_DEBUGLOG(bdev_daos, "unmap at %#lx with size %#lx\n", offset, nbytes); + return dfs_punch(ch->dfs, ch->obj, offset, nbytes); +} + +static void +_bdev_daos_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct bdev_daos_io_channel *dch = spdk_io_channel_get_ctx(ch); + + int64_t rc; + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_daos_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + + case SPDK_BDEV_IO_TYPE_WRITE: + rc = bdev_daos_writev((struct bdev_daos *)bdev_io->bdev->ctxt, + dch, + (struct bdev_daos_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks * 
bdev_io->bdev->blocklen); + if (rc < 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + break; + + case SPDK_BDEV_IO_TYPE_RESET: + /* Can't cancel in-flight requests, but can wait for their completions */ + bdev_daos_reset((struct bdev_daos *)bdev_io->bdev->ctxt, + (struct bdev_daos_task *)bdev_io->driver_ctx); + break; + + case SPDK_BDEV_IO_TYPE_FLUSH: + /* NOOP because DAOS requests land on PMEM and writes are persistent upon completion */ + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + break; + + case SPDK_BDEV_IO_TYPE_UNMAP: + rc = bdev_daos_unmap(dch, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); + if (!rc) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } else { + SPDK_DEBUGLOG(bdev_daos, "%s: could not unmap: " DF_RC "\n", + dch->disk->disk.name, DP_RC((int)rc)); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + + break; + + default: + SPDK_ERRLOG("Wrong io type\n"); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + break; + } +} + +static void +bdev_daos_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct bdev_daos_task *task = (struct bdev_daos_task *)bdev_io->driver_ctx; + struct spdk_thread *submit_td = spdk_io_channel_get_thread(ch); + + assert(task != NULL); + + task->submit_td = submit_td; + task->bdev_io = bdev_io; + + _bdev_daos_submit_request(ch, bdev_io); +} + +#define POLLING_EVENTS_NUM 64 + +static int +bdev_daos_channel_poll(void *arg) +{ + daos_event_t *evp[POLLING_EVENTS_NUM]; + struct bdev_daos_io_channel *ch = arg; + + assert(ch != NULL); + assert(ch->disk != NULL); + + int rc = daos_eq_poll(ch->queue, 0, DAOS_EQ_NOWAIT, + POLLING_EVENTS_NUM, evp); + + if (rc < 0) { + SPDK_DEBUGLOG(bdev_daos, "%s: could not poll daos event queue: " DF_RC "\n", + ch->disk->disk.name, DP_RC(rc)); + /* + * TODO: There are cases when this is self healing, e.g. 
+ * brief network issues, DAOS agent restarting etc. + * However, if the issue persists over some time better would be + * to remove a bdev or the whole controller + */ + return SPDK_POLLER_BUSY; + } + + for (int i = 0; i < rc; ++i) { + struct bdev_daos_task *task = container_of(evp[i], struct bdev_daos_task, ev); + enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; + + assert(task != NULL); + + if (task->ev.ev_error != DER_SUCCESS) { + status = SPDK_BDEV_IO_STATUS_FAILED; + } + + daos_event_fini(&task->ev); + bdev_daos_io_complete(task->bdev_io, status); + } + + return rc > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; +} + +static bool +bdev_daos_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_UNMAP: + return true; + + default: + return false; + } +} + +static struct spdk_io_channel * +bdev_daos_get_io_channel(void *ctx) +{ + return spdk_get_io_channel(ctx); +} + +static void +bdev_daos_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + char uuid_str[SPDK_UUID_STRING_LEN]; + struct bdev_daos *daos = bdev->ctxt; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "bdev_daos_create"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bdev->name); + spdk_json_write_named_string(w, "pool", daos->pool_name); + spdk_json_write_named_string(w, "cont", daos->cont_name); + spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt); + spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); + spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid); + spdk_json_write_named_string(w, "uuid", uuid_str); + + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static const struct spdk_bdev_fn_table daos_fn_table = { + .destruct = bdev_daos_destruct, + 
.submit_request = bdev_daos_submit_request, + .io_type_supported = bdev_daos_io_type_supported, + .get_io_channel = bdev_daos_get_io_channel, + .write_config_json = bdev_daos_write_json_config, +}; + +static void * +_bdev_daos_io_channel_create_cb(void *ctx) +{ + int rc = 0 ; + struct bdev_daos_io_channel *ch = ctx; + struct bdev_daos *daos = ch->disk; + + daos_pool_info_t pinfo; + daos_cont_info_t cinfo; + daos_oclass_id_t obj_class = OC_SX; + + int fd_oflag = O_CREAT | O_RDWR; + mode_t mode = S_IFREG | S_IRWXU | S_IRWXG | S_IRWXO; + + rc = bdev_get_daos_engine(); + if (rc) { + SPDK_ERRLOG("could not initialize DAOS engine: " DF_RC "\n", DP_RC(rc)); + return NULL; + } + + SPDK_DEBUGLOG(bdev_daos, "connecting to daos pool '%s'\n", daos->pool_name); + if ((rc = daos_pool_connect(daos->pool_name, NULL, DAOS_PC_RW, &ch->pool, &pinfo, NULL))) { + SPDK_ERRLOG("%s: could not connect to daos pool: " DF_RC "\n", + daos->disk.name, DP_RC(rc)); + return NULL; + } + SPDK_DEBUGLOG(bdev_daos, "connecting to daos container '%s'\n", daos->cont_name); + if ((rc = daos_cont_open(ch->pool, daos->cont_name, DAOS_COO_RW, &ch->cont, &cinfo, NULL))) { + SPDK_ERRLOG("%s: could not open daos container: " DF_RC "\n", + daos->disk.name, DP_RC(rc)); + goto cleanup_pool; + } + SPDK_DEBUGLOG(bdev_daos, "mounting daos dfs\n"); + if ((rc = dfs_mount(ch->pool, ch->cont, O_RDWR, &ch->dfs))) { + SPDK_ERRLOG("%s: could not mount daos dfs: " DF_RC "\n", + daos->disk.name, DP_RC(rc)); + goto cleanup_cont; + } + SPDK_DEBUGLOG(bdev_daos, "opening dfs object\n"); + if ((rc = dfs_open(ch->dfs, NULL, daos->disk.name, mode, fd_oflag, obj_class, + 0, NULL, &ch->obj))) { + SPDK_ERRLOG("%s: could not open dfs object: " DF_RC "\n", + daos->disk.name, DP_RC(rc)); + goto cleanup_mount; + } + if ((rc = daos_eq_create(&ch->queue))) { + SPDK_ERRLOG("%s: could not create daos event queue: " DF_RC "\n", + daos->disk.name, DP_RC(rc)); + goto cleanup_obj; + } + + return ctx; + +cleanup_obj: + dfs_release(ch->obj); 
+cleanup_mount: + dfs_umount(ch->dfs); +cleanup_cont: + daos_cont_close(ch->cont, NULL); +cleanup_pool: + daos_pool_disconnect(ch->pool, NULL); + + return NULL; +} + +static int +bdev_daos_io_channel_create_cb(void *io_device, void *ctx_buf) +{ + struct bdev_daos_io_channel *ch = ctx_buf; + + ch->disk = io_device; + + if (spdk_call_unaffinitized(_bdev_daos_io_channel_create_cb, ch) == NULL) { + return -EINVAL; /* negative errno, consistent with the rest of this file */ + } + + SPDK_DEBUGLOG(bdev_daos, "%s: starting daos event queue poller\n", + ch->disk->disk.name); + + ch->poller = SPDK_POLLER_REGISTER(bdev_daos_channel_poll, ch, 0); + + return 0; +} + +static void +bdev_daos_io_channel_destroy_cb(void *io_device, void *ctx_buf) +{ + int rc; + struct bdev_daos_io_channel *ch = ctx_buf; + + SPDK_DEBUGLOG(bdev_daos, "stopping daos event queue poller\n"); + + spdk_poller_unregister(&ch->poller); + + if ((rc = daos_eq_destroy(ch->queue, DAOS_EQ_DESTROY_FORCE))) { + SPDK_ERRLOG("could not destroy daos event queue: " DF_RC "\n", DP_RC(rc)); + } + if ((rc = dfs_release(ch->obj))) { + SPDK_ERRLOG("could not release dfs object: " DF_RC "\n", DP_RC(rc)); + } + if ((rc = dfs_umount(ch->dfs))) { + SPDK_ERRLOG("could not unmount dfs: " DF_RC "\n", DP_RC(rc)); + } + if ((rc = daos_cont_close(ch->cont, NULL))) { + SPDK_ERRLOG("could not close container: " DF_RC "\n", DP_RC(rc)); + } + if ((rc = daos_pool_disconnect(ch->pool, NULL))) { + SPDK_ERRLOG("could not disconnect from pool: " DF_RC "\n", DP_RC(rc)); + } + rc = bdev_daos_put_engine(); + if (rc) { + SPDK_ERRLOG("could not de-initialize DAOS engine: " DF_RC "\n", DP_RC(rc)); + } +} + +int +create_bdev_daos(struct spdk_bdev **bdev, + const char *name, const struct spdk_uuid *uuid, + const char *pool, const char *cont, + uint64_t num_blocks, uint32_t block_size) +{ + int rc; + size_t len; + struct bdev_daos *daos; + + SPDK_NOTICELOG("%s: creating bdev_daos disk on '%s:%s'\n", name, pool, cont); + + if (num_blocks == 0) { + SPDK_ERRLOG("Disk num_blocks must be greater than 0\n"); + return
-EINVAL; + } + + if (block_size == 0 || block_size % 512) { /* 0 % 512 == 0, so zero must be rejected explicitly */ + SPDK_ERRLOG("block size must be non-zero and 512 bytes aligned\n"); + return -EINVAL; + } + + if (!name) { + SPDK_ERRLOG("device name cannot be empty\n"); + return -EINVAL; + } + + if (!pool) { + SPDK_ERRLOG("daos pool cannot be empty\n"); + return -EINVAL; + } + if (!cont) { + SPDK_ERRLOG("daos cont cannot be empty\n"); + return -EINVAL; + } + + daos = calloc(1, sizeof(*daos)); + if (!daos) { + SPDK_ERRLOG("calloc() failed\n"); + return -ENOMEM; + } + + len = strlen(pool); + if (len > DAOS_PROP_LABEL_MAX_LEN) { + SPDK_ERRLOG("daos pool name is too long\n"); + free(daos); + return -EINVAL; + } + memcpy(daos->pool_name, pool, len); + + len = strlen(cont); + if (len > DAOS_PROP_LABEL_MAX_LEN) { + SPDK_ERRLOG("daos cont name is too long\n"); + free(daos); + return -EINVAL; + } + memcpy(daos->cont_name, cont, len); + + daos->disk.name = strdup(name); + daos->disk.product_name = "DAOS bdev"; + + daos->disk.write_cache = 0; + daos->disk.blocklen = block_size; + daos->disk.blockcnt = num_blocks; + + if (uuid) { + daos->disk.uuid = *uuid; + } else { + spdk_uuid_generate(&daos->disk.uuid); + } + + daos->disk.ctxt = daos; + daos->disk.fn_table = &daos_fn_table; + daos->disk.module = &daos_if; + + rc = bdev_get_daos_engine(); + if (rc) { + SPDK_ERRLOG("could not initialize DAOS engine: " DF_RC "\n", DP_RC(rc)); + bdev_daos_free(daos); + return rc; + } + + spdk_io_device_register(daos, bdev_daos_io_channel_create_cb, + bdev_daos_io_channel_destroy_cb, + sizeof(struct bdev_daos_io_channel), + daos->disk.name); + + + rc = spdk_bdev_register(&daos->disk); + if (rc) { + spdk_io_device_unregister(daos, NULL); + bdev_daos_free(daos); + return rc; + } + + *bdev = &(daos->disk); + + return rc; +} + +void +delete_bdev_daos(struct spdk_bdev *bdev, spdk_delete_daos_complete cb_fn, void *cb_arg) +{ + if (!bdev || bdev->module != &daos_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +static int
+bdev_get_daos_engine(void) +{ + int rc = 0; + + pthread_mutex_lock(&g_bdev_daos_init_mutex); + if (g_bdev_daos_init_count++ > 0) { + pthread_mutex_unlock(&g_bdev_daos_init_mutex); + return 0; + } + SPDK_DEBUGLOG(bdev_daos, "initializing DAOS engine\n"); + + rc = daos_init(); + pthread_mutex_unlock(&g_bdev_daos_init_mutex); + + if (rc != -DER_ALREADY && rc) { + return rc; + } + return 0; +} + +static int +bdev_daos_put_engine(void) +{ + int rc = 0; + + pthread_mutex_lock(&g_bdev_daos_init_mutex); + if (--g_bdev_daos_init_count > 0) { + pthread_mutex_unlock(&g_bdev_daos_init_mutex); + return 0; + } + SPDK_DEBUGLOG(bdev_daos, "de-initializing DAOS engine\n"); + + rc = daos_fini(); + pthread_mutex_unlock(&g_bdev_daos_init_mutex); + + return rc; +} + +static int +bdev_daos_initialize(void) +{ + /* DAOS engine and client initialization happens + during the first bdev creation */ + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT(bdev_daos) diff --git a/module/bdev/daos/bdev_daos.h b/module/bdev/daos/bdev_daos.h new file mode 100644 index 000000000..a04e22322 --- /dev/null +++ b/module/bdev/daos/bdev_daos.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) Intel Corporation. + * All rights reserved. + * Copyright (c) croit GmbH. + * All rights reserved. 
+ */ + +#ifndef SPDK_BDEV_DAOS_H +#define SPDK_BDEV_DAOS_H + +#include "spdk/stdinc.h" +#include "spdk/bdev.h" + +typedef void (*spdk_delete_daos_complete)(void *cb_arg, int bdeverrno); + +int create_bdev_daos(struct spdk_bdev **bdev, const char *name, const struct spdk_uuid *uuid, + const char *pool, const char *cont, + uint64_t num_blocks, uint32_t block_size); + +void delete_bdev_daos(struct spdk_bdev *bdev, spdk_delete_daos_complete cb_fn, void *cb_arg); + +#endif /* SPDK_BDEV_DAOS_H */ diff --git a/module/bdev/daos/bdev_daos_rpc.c b/module/bdev/daos/bdev_daos_rpc.c new file mode 100644 index 000000000..a50d6523d --- /dev/null +++ b/module/bdev/daos/bdev_daos_rpc.c @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) Intel Corporation. + * All rights reserved. + * Copyright (c) croit GmbH. + * All rights reserved. + */ + +#include "bdev_daos.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/uuid.h" +#include "spdk/string.h" +#include "spdk/log.h" + +struct rpc_construct_daos { + char *name; + char *uuid; + char *pool; + char *cont; + uint64_t num_blocks; + uint32_t block_size; +}; + +static void +free_rpc_construct_daos(struct rpc_construct_daos *r) +{ + free(r->name); + free(r->uuid); + free(r->pool); + free(r->cont); +} + +static const struct spdk_json_object_decoder rpc_construct_daos_decoders[] = { + {"name", offsetof(struct rpc_construct_daos, name), spdk_json_decode_string}, + {"uuid", offsetof(struct rpc_construct_daos, uuid), spdk_json_decode_string, true}, + {"pool", offsetof(struct rpc_construct_daos, pool), spdk_json_decode_string}, + {"cont", offsetof(struct rpc_construct_daos, cont), spdk_json_decode_string}, + {"num_blocks", offsetof(struct rpc_construct_daos, num_blocks), spdk_json_decode_uint64}, + {"block_size", offsetof(struct rpc_construct_daos, block_size), spdk_json_decode_uint32}, +}; + +static void +rpc_bdev_daos_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ 
+ struct rpc_construct_daos req = {NULL}; + struct spdk_json_write_ctx *w; + struct spdk_uuid *uuid = NULL; + struct spdk_uuid decoded_uuid; + struct spdk_bdev *bdev; + int rc = 0; + + if (spdk_json_decode_object(params, rpc_construct_daos_decoders, + SPDK_COUNTOF(rpc_construct_daos_decoders), + &req)) { + SPDK_DEBUGLOG(bdev_daos, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_PARSE_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + if (req.uuid) { + if (spdk_uuid_parse(&decoded_uuid, req.uuid)) { + spdk_jsonrpc_send_error_response(request, -EINVAL, + "Failed to parse bdev UUID"); + goto cleanup; + } + uuid = &decoded_uuid; + } + + rc = create_bdev_daos(&bdev, req.name, uuid, req.pool, req.cont, + req.num_blocks, req.block_size); + if (rc) { + spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); + goto cleanup; + } + + free_rpc_construct_daos(&req); + + w = spdk_jsonrpc_begin_result(request); + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + spdk_jsonrpc_end_result(request, w); + return; + +cleanup: + free_rpc_construct_daos(&req); +} +SPDK_RPC_REGISTER("bdev_daos_create", rpc_bdev_daos_create, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_daos_create, construct_daos_bdev) + +struct rpc_delete_daos { + char *name; +}; + +static void +free_rpc_delete_daos(struct rpc_delete_daos *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_delete_daos_decoders[] = { + {"name", offsetof(struct rpc_delete_daos, name), spdk_json_decode_string}, +}; + +static void +rpc_bdev_daos_delete_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + + spdk_jsonrpc_send_bool_response(request, bdeverrno == 0); +} + +static void +rpc_bdev_daos_delete(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_daos req = {NULL}; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, 
rpc_delete_daos_decoders, + SPDK_COUNTOF(rpc_delete_daos_decoders), + &req)) { + SPDK_DEBUGLOG(bdev_daos, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_PARSE_ERROR, + "spdk_json_decode_object failed"); + goto cleanup; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_INFOLOG(bdev_daos, "bdev '%s' does not exist\n", req.name); + spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); + goto cleanup; + } + + delete_bdev_daos(bdev, rpc_bdev_daos_delete_cb, request); + +cleanup: + free_rpc_delete_daos(&req); +} + +SPDK_RPC_REGISTER("bdev_daos_delete", rpc_bdev_daos_delete, SPDK_RPC_RUNTIME) +SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_daos_delete, delete_daos_bdev) diff --git a/python/spdk/rpc/bdev.py b/python/spdk/rpc/bdev.py index 2b5e81c65..213990d7d 100644 --- a/python/spdk/rpc/bdev.py +++ b/python/spdk/rpc/bdev.py @@ -1534,3 +1534,33 @@ def bdev_nvme_get_controller_health_info(client, name): params = {} params['name'] = name return client.call('bdev_nvme_get_controller_health_info', params) + + +def bdev_daos_create(client, num_blocks, block_size, pool, cont, name, uuid=None): + """Construct DAOS block device. + + Args: + num_blocks: size of block device in blocks + block_size: block size of device; must be a power of 2 and at least 512 + name: name of block device (also the name of the backend file on DAOS DFS) + pool: UUID of DAOS pool + cont: UUID of DAOS container + uuid: UUID of block device (optional) + + Returns: + Name of created block device. + """ + params = {'num_blocks': num_blocks, 'block_size': block_size, 'pool': pool, 'cont': cont, 'name': name} + if uuid: + params['uuid'] = uuid + return client.call('bdev_daos_create', params) + + +def bdev_daos_delete(client, name): + """Delete DAOS block device. 
+ + Args: + name: name of DAOS bdev to delete + """ + params = {'name': name} + return client.call('bdev_daos_delete', params) diff --git a/scripts/rpc.py b/scripts/rpc.py index 859ab2a04..2b6511ea1 100755 --- a/scripts/rpc.py +++ b/scripts/rpc.py @@ -2948,6 +2948,35 @@ Format: 'user:u1 secret:s1 muser:mu1 msecret:ms1,user:u2 secret:s2 muser:mu2 mse p.add_argument('-o', '--opc', help="""Opcode of the nvme cmd.""", required=True, type=int) p.set_defaults(func=bdev_nvme_remove_error_injection) + def bdev_daos_create(args): + num_blocks = (args.total_size * 1024 * 1024) // args.block_size + print_json(rpc.bdev.bdev_daos_create(args.client, + num_blocks=int(num_blocks), + block_size=args.block_size, + name=args.name, + uuid=args.uuid, + pool=args.pool, + cont=args.cont)) + p = subparsers.add_parser('bdev_daos_create', + help='Create a bdev with DAOS backend') + p.add_argument('name', help="Name of the bdev") + p.add_argument('pool', help="Label or UUID of the DAOS pool") + p.add_argument('cont', help="Label or UUID of the DAOS container") + p.add_argument( + 'total_size', help='Size of DAOS bdev in MB (float > 0)', type=float) + p.add_argument('block_size', help='Block size for this bdev', type=int) + p.add_argument('-u', '--uuid', help="UUID of the bdev") + p.set_defaults(func=bdev_daos_create) + + def bdev_daos_delete(args): + rpc.bdev.bdev_daos_delete(args.client, + name=args.name) + + p = subparsers.add_parser('bdev_daos_delete', + help='Delete a DAOS disk') + p.add_argument('name', help='DAOS bdev name') + p.set_defaults(func=bdev_daos_delete) + def check_called_name(name): if name in deprecated_aliases: print("{} is deprecated, use {} instead.".format(name, deprecated_aliases[name]), file=sys.stderr) diff --git a/scripts/vagrant/autorun-spdk.conf b/scripts/vagrant/autorun-spdk.conf index bb2a910a8..71078aea8 100644 --- a/scripts/vagrant/autorun-spdk.conf +++ b/scripts/vagrant/autorun-spdk.conf @@ -19,6 +19,7 @@ SPDK_TEST_VHOST_INIT=0 SPDK_TEST_BLOCKDEV=1 SPDK_TEST_URING=0
SPDK_TEST_USDT=0 +SPDK_TEST_DAOS=0 # doesn't work on vm SPDK_TEST_IOAT=0 SPDK_TEST_BLOBFS=0 diff --git a/test/bdev/blockdev.sh b/test/bdev/blockdev.sh index 93491c087..42a5b771c 100755 --- a/test/bdev/blockdev.sh +++ b/test/bdev/blockdev.sh @@ -19,6 +19,10 @@ function cleanup() { rbd_cleanup fi + if [[ $test_type == daos ]]; then + daos_cleanup + fi + if [[ "$test_type" = "gpt" ]]; then "$rootdir/scripts/setup.sh" reset if [[ -b $gpt_nvme ]]; then @@ -169,6 +173,17 @@ function setup_rbd_conf() { "$rpc_py" bdev_rbd_create -b Ceph0 rbd foo 512 } +function setup_daos_conf() { + local pool=testpool + local cont=testcont + + timing_enter daos_setup + daos_setup $pool $cont + timing_exit daos_setup + + "$rpc_py" bdev_daos_create Daos0 $pool $cont 16 4096 +} + function bdev_bounds() { $testdir/bdevio/bdevio -w -s $PRE_RESERVED_MEM --json "$conf_file" "$env_ctx" & bdevio_pid=$! @@ -502,6 +517,9 @@ case "$test_type" in rbd) setup_rbd_conf ;; + daos) + setup_daos_conf + ;; *) echo "invalid test name" exit 1 diff --git a/test/common/autotest_common.sh b/test/common/autotest_common.sh index c35c598f6..a2d5b6ba9 100755 --- a/test/common/autotest_common.sh +++ b/test/common/autotest_common.sh @@ -163,6 +163,8 @@ export SPDK_TEST_SCANBUILD export SPDK_TEST_NVMF_NICS : ${SPDK_TEST_SMA=0} export SPDK_TEST_SMA +: ${SPDK_TEST_DAOS=0} +export SPDK_TEST_DAOS # always test with SPDK shared objects. 
export SPDK_LIB_DIR="$rootdir/build/lib" @@ -488,6 +490,10 @@ function get_config_params() { config_params+=' --with-sma' fi + if [ -f /usr/include/daos.h ] && [ "$SPDK_TEST_DAOS" -eq 1 ]; then + config_params+=' --with-daos' + fi + echo "$config_params" xtrace_restore } @@ -918,6 +924,30 @@ function rbd_cleanup() { fi } +function daos_setup() { + # $1 = pool name + # $2 = cont name + if [ -z "$1" ]; then + echo "No pool name provided" + exit 1 + fi + if [ -z "$2" ]; then + echo "No cont name provided" + exit 1 + fi + + dmg pool create --size=10G "$1" || true + daos container create --type=POSIX --label="$2" "$1" || true +} + +function daos_cleanup() { + local pool=${1:-testpool} + local cont=${2:-testcont} + + daos container destroy -f "$pool" "$cont" || true + sudo dmg pool destroy -f "$pool" || true +} + function _start_stub() { # Disable ASLR for multi-process testing. SPDK does support using DPDK multi-process, # but ASLR can still be unreliable in some cases. diff --git a/test/common/config/vm_setup.sh b/test/common/config/vm_setup.sh index 9eb5b9fe4..10b286443 100755 --- a/test/common/config/vm_setup.sh +++ b/test/common/config/vm_setup.sh @@ -199,6 +199,7 @@ SPDK_TEST_IOAT=0 # requires some extra configuration. see TEST_ENV_SETUP_README SPDK_TEST_VHOST=0 SPDK_TEST_VHOST_INIT=0 +SPDK_TEST_DAOS=0 EOF fi