/*- * BSD LICENSE * * Copyright (c) Intel Corporation. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * Neither the name of Intel Corporation nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "spdk/stdinc.h" #include "bdev_rbd.h" #include #include #include "spdk/env.h" #include "spdk/bdev.h" #include "spdk/thread.h" #include "spdk/json.h" #include "spdk/string.h" #include "spdk/util.h" #include "spdk/likely.h" #include "spdk/bdev_module.h" #include "spdk/log.h" static int bdev_rbd_count = 0; struct bdev_rbd { struct spdk_bdev disk; char *rbd_name; char *user_id; char *pool_name; char **config; rados_t cluster; rados_t *cluster_p; char *cluster_name; rados_ioctx_t io_ctx; rbd_image_t image; rbd_image_info_t info; pthread_mutex_t mutex; struct spdk_thread *main_td; struct spdk_thread *destruct_td; uint32_t ch_count; struct spdk_io_channel *group_ch; TAILQ_ENTRY(bdev_rbd) tailq; struct spdk_poller *reset_timer; struct spdk_bdev_io *reset_bdev_io; }; struct bdev_rbd_io_channel { struct bdev_rbd *disk; }; struct bdev_rbd_io { struct spdk_thread *submit_td; enum spdk_bdev_io_status status; rbd_completion_t comp; size_t total_len; }; struct bdev_rbd_cluster { char *name; char *user_id; char **config_param; char *config_file; char *key_file; rados_t cluster; uint32_t ref; STAILQ_ENTRY(bdev_rbd_cluster) link; }; static STAILQ_HEAD(, bdev_rbd_cluster) g_map_bdev_rbd_cluster = STAILQ_HEAD_INITIALIZER( g_map_bdev_rbd_cluster); static pthread_mutex_t g_map_bdev_rbd_cluster_mutex = PTHREAD_MUTEX_INITIALIZER; static void bdev_rbd_cluster_free(struct bdev_rbd_cluster *entry) { assert(entry != NULL); bdev_rbd_free_config(entry->config_param); free(entry->config_file); free(entry->key_file); free(entry->user_id); free(entry->name); free(entry); } static void bdev_rbd_put_cluster(rados_t **cluster) { struct bdev_rbd_cluster *entry; assert(cluster != NULL); /* No need go through the map if *cluster equals to NULL */ if (*cluster == NULL) { return; } pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { if (*cluster != &entry->cluster) { continue; } assert(entry->ref > 0); entry->ref--; *cluster = NULL; pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); return; } pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); SPDK_ERRLOG("Cannot find the entry for cluster=%p\n", cluster); } static void bdev_rbd_free(struct bdev_rbd *rbd) { if (!rbd) { return; } free(rbd->disk.name); free(rbd->rbd_name); free(rbd->user_id); free(rbd->pool_name); bdev_rbd_free_config(rbd->config); if (rbd->io_ctx) { rados_ioctx_destroy(rbd->io_ctx); } if (rbd->cluster_name) { bdev_rbd_put_cluster(&rbd->cluster_p); free(rbd->cluster_name); } else if (rbd->cluster) { rados_shutdown(rbd->cluster); } pthread_mutex_destroy(&rbd->mutex); free(rbd); } void bdev_rbd_free_config(char **config) { char **entry; if (config) { for (entry = config; *entry; entry++) { free(*entry); } free(config); } } char ** bdev_rbd_dup_config(const char *const *config) { size_t count; char **copy; if (!config) { return NULL; } for (count = 0; config[count]; count++) {} copy = calloc(count + 1, sizeof(*copy)); if (!copy) { return NULL; } for (count = 0; config[count]; count++) { if (!(copy[count] = strdup(config[count]))) { bdev_rbd_free_config(copy); return NULL; } } return copy; } static int bdev_rados_cluster_init(const char *user_id, const char *const *config, rados_t *cluster) { int ret; ret = rados_create(cluster, user_id); if (ret < 0) { SPDK_ERRLOG("Failed to create rados_t struct\n"); return -1; } if (config) { const char *const *entry = config; while (*entry) { ret = rados_conf_set(*cluster, entry[0], entry[1]); if (ret < 0) { SPDK_ERRLOG("Failed to set %s = %s\n", entry[0], entry[1]); rados_shutdown(*cluster); return -1; } entry += 2; } } else { ret = rados_conf_read_file(*cluster, NULL); if (ret < 0) { SPDK_ERRLOG("Failed to read conf file\n"); rados_shutdown(*cluster); return -1; } } ret = rados_connect(*cluster); if (ret < 0) { SPDK_ERRLOG("Failed to connect to rbd_pool\n"); rados_shutdown(*cluster); return -1; } return 0; } static int bdev_rbd_get_cluster(const char *cluster_name, rados_t **cluster) { struct bdev_rbd_cluster *entry; if (cluster == NULL) { SPDK_ERRLOG("cluster should not be NULL\n"); return -1; } pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { if (strcmp(cluster_name, entry->name) == 0) { entry->ref++; *cluster = &entry->cluster; pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); return 0; } } pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); return -1; } static int bdev_rbd_shared_cluster_init(const char *cluster_name, rados_t **cluster) { int ret; ret = bdev_rbd_get_cluster(cluster_name, cluster); if (ret < 0) { SPDK_ERRLOG("Failed to create rados_t struct\n"); return -1; } return ret; } static void * bdev_rbd_cluster_handle(void *arg) { void *ret = arg; struct bdev_rbd *rbd = arg; int rc; rc = bdev_rados_cluster_init(rbd->user_id, (const char *const *)rbd->config, &rbd->cluster); if (rc < 0) { SPDK_ERRLOG("Failed to create rados cluster for user_id=%s and rbd_pool=%s\n", rbd->user_id ? rbd->user_id : "admin (the default)", rbd->pool_name); ret = NULL; } return ret; } static void * bdev_rbd_init_context(void *arg) { struct bdev_rbd *rbd = arg; int rc; if (rados_ioctx_create(*(rbd->cluster_p), rbd->pool_name, &rbd->io_ctx) < 0) { SPDK_ERRLOG("Failed to create ioctx on rbd=%p\n", rbd); return NULL; } rc = rbd_open(rbd->io_ctx, rbd->rbd_name, &rbd->image, NULL); if (rc < 0) { SPDK_ERRLOG("Failed to open specified rbd device\n"); return NULL; } rc = rbd_stat(rbd->image, &rbd->info, sizeof(rbd->info)); rbd_close(rbd->image); if (rc < 0) { SPDK_ERRLOG("Failed to stat specified rbd device\n"); return NULL; } return arg; } static int bdev_rbd_init(struct bdev_rbd *rbd) { int ret = 0; if (!rbd->cluster_name) { rbd->cluster_p = &rbd->cluster; /* Cluster should be created in non-SPDK thread to avoid conflict between * Rados and SPDK thread */ if (spdk_call_unaffinitized(bdev_rbd_cluster_handle, rbd) == NULL) { SPDK_ERRLOG("Cannot create the rados object on rbd=%p\n", rbd); return -1; } } else { ret = bdev_rbd_shared_cluster_init(rbd->cluster_name, &rbd->cluster_p); if (ret < 0) { SPDK_ERRLOG("Failed to create rados object for rbd =%p on cluster_name=%s\n", rbd, rbd->cluster_name); return -1; } } if (spdk_call_unaffinitized(bdev_rbd_init_context, rbd) == NULL) { SPDK_ERRLOG("Cannot init rbd context for rbd=%p\n", rbd); } return ret; } static void bdev_rbd_exit(rbd_image_t image) { rbd_flush(image); rbd_close(image); } static void _bdev_rbd_io_complete(void *_rbd_io) { struct bdev_rbd_io *rbd_io = _rbd_io; spdk_bdev_io_complete(spdk_bdev_io_from_ctx(rbd_io), rbd_io->status); } static void bdev_rbd_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) { struct bdev_rbd_io *rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; struct spdk_thread *current_thread = spdk_get_thread(); rbd_io->status = status; assert(rbd_io->submit_td != NULL); if (rbd_io->submit_td != current_thread) { spdk_thread_send_msg(rbd_io->submit_td, _bdev_rbd_io_complete, rbd_io); } else { _bdev_rbd_io_complete(rbd_io); } } static void bdev_rbd_finish_aiocb(rbd_completion_t cb, void *arg) { int io_status; struct spdk_bdev_io *bdev_io; struct bdev_rbd_io *rbd_io; enum spdk_bdev_io_status bio_status; bdev_io = rbd_aio_get_arg(cb); rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; io_status = rbd_aio_get_return_value(cb); bio_status = SPDK_BDEV_IO_STATUS_SUCCESS; if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { if ((int)rbd_io->total_len != io_status) { bio_status = SPDK_BDEV_IO_STATUS_FAILED; } } else { /* For others, 0 means success */ if (io_status != 0) { bio_status = SPDK_BDEV_IO_STATUS_FAILED; } } rbd_aio_release(cb); bdev_rbd_io_complete(bdev_io, bio_status); } static void bdev_rbd_start_aio(struct bdev_rbd *disk, struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, uint64_t offset, size_t len) { int ret; struct bdev_rbd_io *rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; rbd_image_t image = disk->image; ret = rbd_aio_create_completion(bdev_io, bdev_rbd_finish_aiocb, &rbd_io->comp); if (ret < 0) { goto err; } if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { rbd_io->total_len = len; if (spdk_likely(iovcnt == 1)) { ret = rbd_aio_read(image, offset, iov[0].iov_len, iov[0].iov_base, rbd_io->comp); } else { ret = rbd_aio_readv(image, iov, iovcnt, offset, rbd_io->comp); } } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { if (spdk_likely(iovcnt == 1)) { ret = rbd_aio_write(image, offset, iov[0].iov_len, iov[0].iov_base, rbd_io->comp); } else { ret = rbd_aio_writev(image, iov, iovcnt, offset, rbd_io->comp); } } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_FLUSH) { ret = rbd_aio_flush(image, rbd_io->comp); } if (ret < 0) { rbd_aio_release(rbd_io->comp); goto err; } return; err: bdev_rbd_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); } static int bdev_rbd_library_init(void); static void bdev_rbd_library_fini(void); static int bdev_rbd_get_ctx_size(void) { return sizeof(struct bdev_rbd_io); } static struct spdk_bdev_module rbd_if = { .name = "rbd", .module_init = bdev_rbd_library_init, .module_fini = bdev_rbd_library_fini, .get_ctx_size = bdev_rbd_get_ctx_size, }; SPDK_BDEV_MODULE_REGISTER(rbd, &rbd_if) static int bdev_rbd_reset_timer(void *arg) { struct bdev_rbd *disk = arg; /* * TODO: This should check if any I/O is still in flight before completing the reset. * For now, just complete after the timer expires. */ bdev_rbd_io_complete(disk->reset_bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); spdk_poller_unregister(&disk->reset_timer); disk->reset_bdev_io = NULL; return SPDK_POLLER_BUSY; } static void bdev_rbd_reset(struct bdev_rbd *disk, struct spdk_bdev_io *bdev_io) { /* * HACK: Since librbd doesn't provide any way to cancel outstanding aio, just kick off a * timer to wait for in-flight I/O to complete. */ assert(disk->reset_bdev_io == NULL); disk->reset_bdev_io = bdev_io; disk->reset_timer = SPDK_POLLER_REGISTER(bdev_rbd_reset_timer, disk, 1 * 1000 * 1000); } static void _bdev_rbd_destruct_done(void *io_device) { struct bdev_rbd *rbd = io_device; assert(rbd != NULL); assert(rbd->ch_count == 0); spdk_bdev_destruct_done(&rbd->disk, 0); bdev_rbd_free(rbd); } static void bdev_rbd_free_cb(void *io_device) { struct bdev_rbd *rbd = io_device; /* The io device has been unregistered. Send a message back to the * original thread that started the destruct operation, so that the * bdev unregister callback is invoked on the same thread that started * this whole process. */ spdk_thread_send_msg(rbd->destruct_td, _bdev_rbd_destruct_done, rbd); } static void _bdev_rbd_destruct(void *ctx) { struct bdev_rbd *rbd = ctx; spdk_io_device_unregister(rbd, bdev_rbd_free_cb); } static int bdev_rbd_destruct(void *ctx) { struct bdev_rbd *rbd = ctx; struct spdk_thread *td; if (rbd->main_td == NULL) { td = spdk_get_thread(); } else { td = rbd->main_td; } /* Start the destruct operation on the rbd bdev's * main thread. This guarantees it will only start * executing after any messages related to channel * deletions have finished completing. *Always* * send a message, even if this function gets called * from the main thread, in case there are pending * channel delete messages in flight to this thread. */ assert(rbd->destruct_td == NULL); rbd->destruct_td = td; spdk_thread_send_msg(td, _bdev_rbd_destruct, rbd); /* Return 1 to indicate the destruct path is asynchronous. */ return 1; } static void bdev_rbd_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) { struct bdev_rbd *disk = (struct bdev_rbd *)bdev_io->bdev->ctxt; if (!success) { bdev_rbd_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); return; } bdev_rbd_start_aio(disk, bdev_io, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen, bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); } static void _bdev_rbd_submit_request(void *ctx) { struct spdk_bdev_io *bdev_io = ctx; struct bdev_rbd *disk = (struct bdev_rbd *)bdev_io->bdev->ctxt; switch (bdev_io->type) { case SPDK_BDEV_IO_TYPE_READ: spdk_bdev_io_get_buf(bdev_io, bdev_rbd_get_buf_cb, bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); break; case SPDK_BDEV_IO_TYPE_WRITE: case SPDK_BDEV_IO_TYPE_FLUSH: bdev_rbd_start_aio(disk, bdev_io, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen, bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); break; case SPDK_BDEV_IO_TYPE_RESET: bdev_rbd_reset((struct bdev_rbd *)bdev_io->bdev->ctxt, bdev_io); break; default: SPDK_ERRLOG("Unsupported IO type =%d\n", bdev_io->type); bdev_rbd_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); break; } } static void bdev_rbd_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) { struct spdk_thread *submit_td = spdk_io_channel_get_thread(ch); struct bdev_rbd_io *rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; struct bdev_rbd *disk = (struct bdev_rbd *)bdev_io->bdev->ctxt; rbd_io->submit_td = submit_td; if (disk->main_td != submit_td) { spdk_thread_send_msg(disk->main_td, _bdev_rbd_submit_request, bdev_io); } else { _bdev_rbd_submit_request(bdev_io); } } static bool bdev_rbd_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) { switch (io_type) { case SPDK_BDEV_IO_TYPE_READ: case SPDK_BDEV_IO_TYPE_WRITE: case SPDK_BDEV_IO_TYPE_FLUSH: case SPDK_BDEV_IO_TYPE_RESET: return true; default: return false; } } static void bdev_rbd_free_channel_resources(struct bdev_rbd *disk) { assert(disk != NULL); assert(disk->main_td == spdk_get_thread()); assert(disk->ch_count == 0); spdk_put_io_channel(disk->group_ch); if (disk->image) { bdev_rbd_exit(disk->image); } disk->main_td = NULL; disk->group_ch = NULL; } static void * bdev_rbd_handle(void *arg) { struct bdev_rbd *disk = arg; void *ret = arg; if (rbd_open(disk->io_ctx, disk->rbd_name, &disk->image, NULL) < 0) { SPDK_ERRLOG("Failed to open specified rbd device\n"); ret = NULL; } return ret; } static int _bdev_rbd_create_cb(struct bdev_rbd *disk) { disk->group_ch = spdk_get_io_channel(&rbd_if); assert(disk->group_ch != NULL); if (spdk_call_unaffinitized(bdev_rbd_handle, disk) == NULL) { bdev_rbd_free_channel_resources(disk); return -1; } return 0; } static int bdev_rbd_create_cb(void *io_device, void *ctx_buf) { struct bdev_rbd_io_channel *ch = ctx_buf; struct bdev_rbd *disk = io_device; int rc; ch->disk = disk; pthread_mutex_lock(&disk->mutex); if (disk->ch_count == 0) { assert(disk->main_td == NULL); rc = _bdev_rbd_create_cb(disk); if (rc) { SPDK_ERRLOG("Cannot create channel for disk=%p\n", disk); pthread_mutex_unlock(&disk->mutex); return rc; } disk->main_td = spdk_get_thread(); } disk->ch_count++; pthread_mutex_unlock(&disk->mutex); return 0; } static void _bdev_rbd_destroy_cb(void *ctx) { struct bdev_rbd *disk = ctx; pthread_mutex_lock(&disk->mutex); assert(disk->ch_count > 0); disk->ch_count--; if (disk->ch_count > 0) { /* A new channel was created between when message was sent and this function executed */ pthread_mutex_unlock(&disk->mutex); return; } bdev_rbd_free_channel_resources(disk); pthread_mutex_unlock(&disk->mutex); } static void bdev_rbd_destroy_cb(void *io_device, void *ctx_buf) { struct bdev_rbd *disk = io_device; struct spdk_thread *thread; pthread_mutex_lock(&disk->mutex); assert(disk->ch_count > 0); disk->ch_count--; if (disk->ch_count == 0) { assert(disk->main_td != NULL); if (disk->main_td != spdk_get_thread()) { /* The final channel was destroyed on a different thread * than where the first channel was created. Pass a message * to the main thread to unregister the poller. */ disk->ch_count++; thread = disk->main_td; pthread_mutex_unlock(&disk->mutex); spdk_thread_send_msg(thread, _bdev_rbd_destroy_cb, disk); return; } bdev_rbd_free_channel_resources(disk); } pthread_mutex_unlock(&disk->mutex); } static struct spdk_io_channel * bdev_rbd_get_io_channel(void *ctx) { struct bdev_rbd *rbd_bdev = ctx; return spdk_get_io_channel(rbd_bdev); } static void bdev_rbd_cluster_dump_entry(const char *cluster_name, struct spdk_json_write_ctx *w) { struct bdev_rbd_cluster *entry; pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { if (strcmp(cluster_name, entry->name)) { continue; } if (entry->user_id) { spdk_json_write_named_string(w, "user_id", entry->user_id); } if (entry->config_param) { char **config_entry = entry->config_param; spdk_json_write_named_object_begin(w, "config_param"); while (*config_entry) { spdk_json_write_named_string(w, config_entry[0], config_entry[1]); config_entry += 2; } spdk_json_write_object_end(w); } if (entry->config_file) { spdk_json_write_named_string(w, "config_file", entry->config_file); } if (entry->key_file) { spdk_json_write_named_string(w, "key_file", entry->key_file); } pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); return; } pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); } static int bdev_rbd_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) { struct bdev_rbd *rbd_bdev = ctx; spdk_json_write_named_object_begin(w, "rbd"); spdk_json_write_named_string(w, "pool_name", rbd_bdev->pool_name); spdk_json_write_named_string(w, "rbd_name", rbd_bdev->rbd_name); if (rbd_bdev->cluster_name) { bdev_rbd_cluster_dump_entry(rbd_bdev->cluster_name, w); goto end; } if (rbd_bdev->user_id) { spdk_json_write_named_string(w, "user_id", rbd_bdev->user_id); } if (rbd_bdev->config) { char **entry = rbd_bdev->config; spdk_json_write_named_object_begin(w, "config"); while (*entry) { spdk_json_write_named_string(w, entry[0], entry[1]); entry += 2; } spdk_json_write_object_end(w); } end: spdk_json_write_object_end(w); return 0; } static void bdev_rbd_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) { struct bdev_rbd *rbd = bdev->ctxt; char uuid_str[SPDK_UUID_STRING_LEN]; spdk_json_write_object_begin(w); spdk_json_write_named_string(w, "method", "bdev_rbd_create"); spdk_json_write_named_object_begin(w, "params"); spdk_json_write_named_string(w, "name", bdev->name); spdk_json_write_named_string(w, "pool_name", rbd->pool_name); spdk_json_write_named_string(w, "rbd_name", rbd->rbd_name); spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); if (rbd->user_id) { spdk_json_write_named_string(w, "user_id", rbd->user_id); } if (rbd->config) { char **entry = rbd->config; spdk_json_write_named_object_begin(w, "config"); while (*entry) { spdk_json_write_named_string(w, entry[0], entry[1]); entry += 2; } spdk_json_write_object_end(w); } spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid); spdk_json_write_named_string(w, "uuid", uuid_str); spdk_json_write_object_end(w); spdk_json_write_object_end(w); } static void dump_single_cluster_entry(struct bdev_rbd_cluster *entry, struct spdk_json_write_ctx *w) { assert(entry != NULL); spdk_json_write_object_begin(w); spdk_json_write_named_string(w, "cluster_name", entry->name); if (entry->user_id) { spdk_json_write_named_string(w, "user_id", entry->user_id); } if (entry->config_param) { char **config_entry = entry->config_param; spdk_json_write_named_object_begin(w, "config_param"); while (*config_entry) { spdk_json_write_named_string(w, config_entry[0], config_entry[1]); config_entry += 2; } spdk_json_write_object_end(w); } if (entry->config_file) { spdk_json_write_named_string(w, "config_file", entry->config_file); } if (entry->key_file) { spdk_json_write_named_string(w, "key_file", entry->key_file); } spdk_json_write_object_end(w); } int bdev_rbd_get_clusters_info(struct spdk_jsonrpc_request *request, const char *name) { struct bdev_rbd_cluster *entry; struct spdk_json_write_ctx *w; pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); if (STAILQ_EMPTY(&g_map_bdev_rbd_cluster)) { pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); return -ENOENT; } /* If cluster name is provided */ if (name) { STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { if (strcmp(name, entry->name) == 0) { w = spdk_jsonrpc_begin_result(request); dump_single_cluster_entry(entry, w); spdk_jsonrpc_end_result(request, w); pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); return 0; } } pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); return -ENOENT; } w = spdk_jsonrpc_begin_result(request); spdk_json_write_array_begin(w); STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { dump_single_cluster_entry(entry, w); } spdk_json_write_array_end(w); spdk_jsonrpc_end_result(request, w); pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); return 0; } static const struct spdk_bdev_fn_table rbd_fn_table = { .destruct = bdev_rbd_destruct, .submit_request = bdev_rbd_submit_request, .io_type_supported = bdev_rbd_io_type_supported, .get_io_channel = bdev_rbd_get_io_channel, .dump_info_json = bdev_rbd_dump_info_json, .write_config_json = bdev_rbd_write_config_json, }; static int rbd_register_cluster(const char *name, const char *user_id, const char *const *config_param, const char *config_file, const char *key_file) { struct bdev_rbd_cluster *entry; int rc; pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { if (strcmp(name, entry->name) == 0) { SPDK_ERRLOG("Cluster name=%s already exists\n", name); pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); return -1; } } entry = calloc(1, sizeof(*entry)); if (!entry) { SPDK_ERRLOG("Cannot allocate an entry for name=%s\n", name); pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); return -1; } entry->name = strdup(name); if (entry->name == NULL) { SPDK_ERRLOG("Failed to save the name =%s on entry =%p\n", name, entry); goto err_handle; } if (user_id) { entry->user_id = strdup(user_id); if (entry->user_id == NULL) { SPDK_ERRLOG("Failed to save the str =%s on entry =%p\n", user_id, entry); goto err_handle; } } /* Support specify config_param or config_file separately, or both of them. */ if (config_param) { entry->config_param = bdev_rbd_dup_config(config_param); if (entry->config_param == NULL) { SPDK_ERRLOG("Failed to save the config_param=%p on entry = %p\n", config_param, entry); goto err_handle; } } if (config_file) { entry->config_file = strdup(config_file); if (entry->config_file == NULL) { SPDK_ERRLOG("Failed to save the config_file=%s on entry = %p\n", config_file, entry); goto err_handle; } } if (key_file) { entry->key_file = strdup(key_file); if (entry->key_file == NULL) { SPDK_ERRLOG("Failed to save the key_file=%s on entry = %p\n", key_file, entry); goto err_handle; } } rc = rados_create(&entry->cluster, user_id); if (rc < 0) { SPDK_ERRLOG("Failed to create rados_t struct\n"); goto err_handle; } /* Try default location when entry->config_file is NULL, but ignore failure when it is NULL */ rc = rados_conf_read_file(entry->cluster, entry->config_file); if (entry->config_file && rc < 0) { SPDK_ERRLOG("Failed to read conf file %s\n", entry->config_file); rados_shutdown(entry->cluster); goto err_handle; } if (config_param) { const char *const *config_entry = config_param; while (*config_entry) { rc = rados_conf_set(entry->cluster, config_entry[0], config_entry[1]); if (rc < 0) { SPDK_ERRLOG("Failed to set %s = %s\n", config_entry[0], config_entry[1]); rados_shutdown(entry->cluster); goto err_handle; } config_entry += 2; } } if (key_file) { rc = rados_conf_set(entry->cluster, "keyring", key_file); if (rc < 0) { SPDK_ERRLOG("Failed to set keyring = %s\n", key_file); rados_shutdown(entry->cluster); goto err_handle; } } rc = rados_connect(entry->cluster); if (rc < 0) { SPDK_ERRLOG("Failed to connect to rbd_pool on cluster=%p\n", entry->cluster); rados_shutdown(entry->cluster); goto err_handle; } STAILQ_INSERT_TAIL(&g_map_bdev_rbd_cluster, entry, link); pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); return 0; err_handle: bdev_rbd_cluster_free(entry); pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); return -1; } int bdev_rbd_unregister_cluster(const char *name) { struct bdev_rbd_cluster *entry; int rc = 0; if (name == NULL) { return -1; } pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { if (strcmp(name, entry->name) == 0) { if (entry->ref == 0) { STAILQ_REMOVE(&g_map_bdev_rbd_cluster, entry, bdev_rbd_cluster, link); rados_shutdown(entry->cluster); bdev_rbd_cluster_free(entry); } else { SPDK_ERRLOG("Cluster with name=%p is still used and we cannot delete it\n", entry->name); rc = -1; } pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); return rc; } } pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); SPDK_ERRLOG("Could not find the cluster name =%p\n", name); return -1; } static void * _bdev_rbd_register_cluster(void *arg) { struct cluster_register_info *info = arg; void *ret = arg; int rc; rc = rbd_register_cluster((const char *)info->name, (const char *)info->user_id, (const char *const *)info->config_param, (const char *)info->config_file, (const char *)info->key_file); if (rc) { ret = NULL; } return ret; } int bdev_rbd_register_cluster(struct cluster_register_info *info) { assert(info != NULL); /* Rados cluster info need to be created in non SPDK-thread to avoid CPU * resource contention */ if (spdk_call_unaffinitized(_bdev_rbd_register_cluster, info) == NULL) { return -1; } return 0; } int bdev_rbd_create(struct spdk_bdev **bdev, const char *name, const char *user_id, const char *pool_name, const char *const *config, const char *rbd_name, uint32_t block_size, const char *cluster_name, const struct spdk_uuid *uuid) { struct bdev_rbd *rbd; int ret; if ((pool_name == NULL) || (rbd_name == NULL)) { return -EINVAL; } rbd = calloc(1, sizeof(struct bdev_rbd)); if (rbd == NULL) { SPDK_ERRLOG("Failed to allocate bdev_rbd struct\n"); return -ENOMEM; } ret = pthread_mutex_init(&rbd->mutex, NULL); if (ret) { SPDK_ERRLOG("Cannot init mutex on rbd=%p\n", rbd->disk.name); free(rbd); return ret; } rbd->rbd_name = strdup(rbd_name); if (!rbd->rbd_name) { bdev_rbd_free(rbd); return -ENOMEM; } if (user_id) { rbd->user_id = strdup(user_id); if (!rbd->user_id) { bdev_rbd_free(rbd); return -ENOMEM; } } if (cluster_name) { rbd->cluster_name = strdup(cluster_name); if (!rbd->cluster_name) { bdev_rbd_free(rbd); return -ENOMEM; } } rbd->pool_name = strdup(pool_name); if (!rbd->pool_name) { bdev_rbd_free(rbd); return -ENOMEM; } if (config && !(rbd->config = bdev_rbd_dup_config(config))) { bdev_rbd_free(rbd); return -ENOMEM; } ret = bdev_rbd_init(rbd); if (ret < 0) { bdev_rbd_free(rbd); SPDK_ERRLOG("Failed to init rbd device\n"); return ret; } if (uuid) { rbd->disk.uuid = *uuid; } else { spdk_uuid_generate(&rbd->disk.uuid); } if (name) { rbd->disk.name = strdup(name); } else { rbd->disk.name = spdk_sprintf_alloc("Ceph%d", bdev_rbd_count); } if (!rbd->disk.name) { bdev_rbd_free(rbd); return -ENOMEM; } rbd->disk.product_name = "Ceph Rbd Disk"; bdev_rbd_count++; rbd->disk.write_cache = 0; rbd->disk.blocklen = block_size; rbd->disk.blockcnt = rbd->info.size / rbd->disk.blocklen; rbd->disk.ctxt = rbd; rbd->disk.fn_table = &rbd_fn_table; rbd->disk.module = &rbd_if; SPDK_NOTICELOG("Add %s rbd disk to lun\n", rbd->disk.name); spdk_io_device_register(rbd, bdev_rbd_create_cb, bdev_rbd_destroy_cb, sizeof(struct bdev_rbd_io_channel), rbd_name); ret = spdk_bdev_register(&rbd->disk); if (ret) { spdk_io_device_unregister(rbd, NULL); bdev_rbd_free(rbd); return ret; } *bdev = &(rbd->disk); return ret; } void bdev_rbd_delete(struct spdk_bdev *bdev, spdk_delete_rbd_complete cb_fn, void *cb_arg) { if (!bdev || bdev->module != &rbd_if) { cb_fn(cb_arg, -ENODEV); return; } spdk_bdev_unregister(bdev, cb_fn, cb_arg); } int bdev_rbd_resize(struct spdk_bdev *bdev, const uint64_t new_size_in_mb) { struct spdk_io_channel *ch; struct bdev_rbd_io_channel *rbd_io_ch; int rc; uint64_t new_size_in_byte; uint64_t current_size_in_mb; if (bdev->module != &rbd_if) { return -EINVAL; } current_size_in_mb = bdev->blocklen * bdev->blockcnt / (1024 * 1024); if (current_size_in_mb > new_size_in_mb) { SPDK_ERRLOG("The new bdev size must be lager than current bdev size.\n"); return -EINVAL; } ch = bdev_rbd_get_io_channel(bdev); rbd_io_ch = spdk_io_channel_get_ctx(ch); new_size_in_byte = new_size_in_mb * 1024 * 1024; rc = rbd_resize(rbd_io_ch->disk->image, new_size_in_byte); spdk_put_io_channel(ch); if (rc != 0) { SPDK_ERRLOG("failed to resize the ceph bdev.\n"); return rc; } rc = spdk_bdev_notify_blockcnt_change(bdev, new_size_in_byte / bdev->blocklen); if (rc != 0) { SPDK_ERRLOG("failed to notify block cnt change.\n"); return rc; } return rc; } static int bdev_rbd_group_create_cb(void *io_device, void *ctx_buf) { return 0; } static void bdev_rbd_group_destroy_cb(void *io_device, void *ctx_buf) { } static int bdev_rbd_library_init(void) { spdk_io_device_register(&rbd_if, bdev_rbd_group_create_cb, bdev_rbd_group_destroy_cb, 0, "bdev_rbd_poll_groups"); return 0; } static void bdev_rbd_library_fini(void) { spdk_io_device_unregister(&rbd_if, NULL); } SPDK_LOG_REGISTER_COMPONENT(bdev_rbd)