diff --git a/CHANGELOG.md b/CHANGELOG.md index 43c166dde..5551c4106 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,10 @@ A new flag `ACCEL_FLAG_PERSISTENT` was added to indicate the target memory is PM Added `bdev_nvme_add_error_injection` and `bdev_nvme_remove_error_injection` RPCs to add and remove NVMe error injections. +New parameters, `ctrlr_loss_timeout_sec`, `reconnect_delay_sec`, and `fast_io_fail_timeout_sec`, are +added to the RPC `bdev_nvme_set_options`. They can be overridden if they are given by the RPC +`bdev_nvme_attach_controller`. + ### event Added `msg_mempool_size` parameter to `spdk_reactors_init` and `spdk_thread_lib_init_ext`. diff --git a/doc/jsonrpc.md b/doc/jsonrpc.md index ac38c5d68..00f5873da 100644 --- a/doc/jsonrpc.md +++ b/doc/jsonrpc.md @@ -2888,6 +2888,9 @@ Example response: Set global parameters for all bdev NVMe. This RPC may only be called before SPDK subsystems have been initialized or any bdev NVMe has been created. +Parameters, ctrlr_loss_timeout_sec, reconnect_delay_sec, and fast_io_fail_timeout_sec, are for I/O error resiliency. +They can be overridden if they are given by the RPC bdev_nvme_attach_controller. + #### Parameters Name | Optional | Type | Description @@ -2908,6 +2911,9 @@ delay_cmd_submit | Optional | boolean | Enable delaying NVMe comma transport_retry_count | Optional | number | The number of attempts per I/O in the transport layer before an I/O fails. bdev_retry_count | Optional | number | The number of attempts per I/O in the bdev layer before an I/O fails. -1 means infinite retries. transport_ack_timeout | Optional | number | Time to wait ack until packet retransmission. RDMA specific. Range 0-31 where 0 is driver-specific default value. +ctrlr_loss_timeout_sec | Optional | number | Time to wait until ctrlr is reconnected before deleting ctrlr. -1 means infinite reconnects. 0 means no reconnect. +reconnect_delay_sec | Optional | number | Time to delay a reconnect trial. 0 means no reconnect. 
+fast_io_fail_timeout_sec | Optional | number | Time to wait until ctrlr is reconnected before failing I/O to ctrlr. 0 means no such timeout. #### Example diff --git a/module/bdev/nvme/bdev_nvme.c b/module/bdev/nvme/bdev_nvme.c index d44859e4b..13c19448f 100644 --- a/module/bdev/nvme/bdev_nvme.c +++ b/module/bdev/nvme/bdev_nvme.c @@ -139,6 +139,9 @@ static struct spdk_bdev_nvme_opts g_opts = { .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, .bdev_retry_count = 3, .transport_ack_timeout = 0, + .ctrlr_loss_timeout_sec = 0, + .reconnect_delay_sec = 0, + .fast_io_fail_timeout_sec = 0, }; #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL @@ -3474,6 +3477,15 @@ err: return rc; } +void +bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) +{ + opts->prchk_flags = 0; + opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; + opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; + opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; +} + static void attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) @@ -3587,6 +3599,10 @@ bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) *opts = g_opts; } +static bool bdev_nvme_check_multipath_params(int32_t ctrlr_loss_timeout_sec, + uint32_t reconnect_delay_sec, + uint32_t fast_io_fail_timeout_sec); + static int bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) { @@ -3601,6 +3617,12 @@ bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) return -EINVAL; } + if (!bdev_nvme_check_multipath_params(opts->ctrlr_loss_timeout_sec, + opts->reconnect_delay_sec, + opts->fast_io_fail_timeout_sec)) { + return -EINVAL; + } + return 0; } @@ -3979,6 +4001,8 @@ bdev_nvme_create(struct spdk_nvme_transport_id *trid, if (bdev_opts) { memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); + } else { + bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); } if (trid->trtype == 
SPDK_NVME_TRANSPORT_PCIE) { @@ -5678,6 +5702,9 @@ bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); + spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); + spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); + spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); spdk_json_write_object_end(w); spdk_json_write_object_end(w); diff --git a/module/bdev/nvme/bdev_nvme.h b/module/bdev/nvme/bdev_nvme.h index 6f6f71d90..408703bb8 100644 --- a/module/bdev/nvme/bdev_nvme.h +++ b/module/bdev/nvme/bdev_nvme.h @@ -251,6 +251,9 @@ struct spdk_bdev_nvme_opts { /* The number of attempts per I/O in the bdev layer before an I/O fails. */ int32_t bdev_retry_count; uint8_t transport_ack_timeout; + int32_t ctrlr_loss_timeout_sec; + uint32_t reconnect_delay_sec; + uint32_t fast_io_fail_timeout_sec; }; struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch); @@ -258,6 +261,8 @@ void bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts); int bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts); int bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx); +void bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts); + int bdev_nvme_create(struct spdk_nvme_transport_id *trid, const char *base_name, const char **names, diff --git a/module/bdev/nvme/bdev_nvme_rpc.c b/module/bdev/nvme/bdev_nvme_rpc.c index 9f4fcaac3..3ef35177f 100644 --- a/module/bdev/nvme/bdev_nvme_rpc.c +++ b/module/bdev/nvme/bdev_nvme_rpc.c @@ -92,6 +92,9 @@ static const struct spdk_json_object_decoder rpc_bdev_nvme_options_decoders[] = {"transport_retry_count", offsetof(struct spdk_bdev_nvme_opts, 
transport_retry_count), spdk_json_decode_uint32, true}, {"bdev_retry_count", offsetof(struct spdk_bdev_nvme_opts, bdev_retry_count), spdk_json_decode_int32, true}, {"transport_ack_timeout", offsetof(struct spdk_bdev_nvme_opts, transport_ack_timeout), spdk_json_decode_uint8, true}, + {"ctrlr_loss_timeout_sec", offsetof(struct spdk_bdev_nvme_opts, ctrlr_loss_timeout_sec), spdk_json_decode_int32, true}, + {"reconnect_delay_sec", offsetof(struct spdk_bdev_nvme_opts, reconnect_delay_sec), spdk_json_decode_uint32, true}, + {"fast_io_fail_timeout_sec", offsetof(struct spdk_bdev_nvme_opts, fast_io_fail_timeout_sec), spdk_json_decode_uint32, true}, }; static void @@ -324,6 +327,7 @@ rpc_bdev_nvme_attach_controller(struct spdk_jsonrpc_request *request, } spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->req.drv_opts, sizeof(ctx->req.drv_opts)); + bdev_nvme_get_default_ctrlr_opts(&ctx->req.bdev_opts); if (spdk_json_decode_object(params, rpc_bdev_nvme_attach_controller_decoders, SPDK_COUNTOF(rpc_bdev_nvme_attach_controller_decoders), diff --git a/scripts/rpc.py b/scripts/rpc.py index a34cb2d59..616d4a7f2 100755 --- a/scripts/rpc.py +++ b/scripts/rpc.py @@ -480,7 +480,10 @@ if __name__ == "__main__": delay_cmd_submit=args.delay_cmd_submit, transport_retry_count=args.transport_retry_count, bdev_retry_count=args.bdev_retry_count, - transport_ack_timeout=args.transport_ack_timeout) + transport_ack_timeout=args.transport_ack_timeout, + ctrlr_loss_timeout_sec=args.ctrlr_loss_timeout_sec, + reconnect_delay_sec=args.reconnect_delay_sec, + fast_io_fail_timeout_sec=args.fast_io_fail_timeout_sec) p = subparsers.add_parser('bdev_nvme_set_options', aliases=['set_bdev_nvme_options'], help='Set options for the bdev nvme type. This is startup command.') @@ -518,6 +521,29 @@ if __name__ == "__main__": p.add_argument('-e', '--transport-ack-timeout', help="""Time to wait ack until packet retransmission. RDMA specific. 
Range 0-31 where 0 is driver-specific default value.""", type=int) + p.add_argument('-l', '--ctrlr-loss-timeout-sec', + help="""Time to wait until ctrlr is reconnected before deleting ctrlr. + -1 means infinite reconnect retries. 0 means no reconnect retry. + If reconnect_delay_sec is zero, ctrlr_loss_timeout_sec has to be zero. + If reconnect_delay_sec is non-zero, ctrlr_loss_timeout_sec has to be -1 or not less than + reconnect_delay_sec. + This can be overridden by bdev_nvme_attach_controller.""", + type=int) + p.add_argument('-o', '--reconnect-delay-sec', + help="""Time to delay a reconnect retry. + If ctrlr_loss_timeout_sec is zero, reconnect_delay_sec has to be zero. + If ctrlr_loss_timeout_sec is -1, reconnect_delay_sec has to be non-zero. + If ctrlr_loss_timeout_sec is not -1 or zero, reconnect_delay_sec has to be non-zero and + less than ctrlr_loss_timeout_sec. + This can be overridden by bdev_nvme_attach_controller.""", + type=int) + p.add_argument('-u', '--fast-io-fail-timeout-sec', + help="""Time to wait until ctrlr is reconnected before failing I/O to ctrlr. + 0 means no such timeout. + If fast_io_fail_timeout_sec is not zero, it has to be not less than reconnect_delay_sec and + less than ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1. 
+ This can be overridden by bdev_nvme_attach_controller.""", + type=int) p.set_defaults(func=bdev_nvme_set_options) diff --git a/scripts/rpc/bdev.py b/scripts/rpc/bdev.py index ac81f2dc5..3717f17c7 100644 --- a/scripts/rpc/bdev.py +++ b/scripts/rpc/bdev.py @@ -443,7 +443,8 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo low_priority_weight=None, medium_priority_weight=None, high_priority_weight=None, nvme_adminq_poll_period_us=None, nvme_ioq_poll_period_us=None, io_queue_requests=None, delay_cmd_submit=None, transport_retry_count=None, bdev_retry_count=None, - transport_ack_timeout=None): + transport_ack_timeout=None, ctrlr_loss_timeout_sec=None, reconnect_delay_sec=None, + fast_io_fail_timeout_sec=None): """Set options for the bdev nvme. This is startup command. Args: @@ -464,6 +465,22 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo bdev_retry_count: The number of attempts per I/O in the bdev layer when an I/O fails. -1 means infinite retries. (optional) transport_ack_timeout: Time to wait ack until packet retransmission. RDMA specific. Range 0-31 where 0 is driver-specific default value (optional) + ctrlr_loss_timeout_sec: Time to wait until ctrlr is reconnected before deleting ctrlr. + -1 means infinite reconnect retries. 0 means no reconnect retry. + If reconnect_delay_sec is zero, ctrlr_loss_timeout_sec has to be zero. + If reconnect_delay_sec is non-zero, ctrlr_loss_timeout_sec has to be -1 or not less than reconnect_delay_sec. + This can be overridden by bdev_nvme_attach_controller. (optional) + reconnect_delay_sec: Time to delay a reconnect retry. + If ctrlr_loss_timeout_sec is zero, reconnect_delay_sec has to be zero. + If ctrlr_loss_timeout_sec is -1, reconnect_delay_sec has to be non-zero. + If ctrlr_loss_timeout_sec is not -1 or zero, reconnect_delay_sec has to be non-zero and less than ctrlr_loss_timeout_sec. + This can be overridden by bdev_nvme_attach_controller. 
(optional) + fast_io_fail_timeout_sec: Time to wait until ctrlr is reconnected before failing I/O to ctrlr. + 0 means no such timeout. + If fast_io_fail_timeout_sec is not zero, it has to be not less than reconnect_delay_sec and less than + ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1. + This can be overridden by bdev_nvme_attach_controller. (optional) + """ params = {} @@ -516,6 +533,15 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo if transport_ack_timeout is not None: params['transport_ack_timeout'] = transport_ack_timeout + if ctrlr_loss_timeout_sec is not None: + params['ctrlr_loss_timeout_sec'] = ctrlr_loss_timeout_sec + + if reconnect_delay_sec is not None: + params['reconnect_delay_sec'] = reconnect_delay_sec + + if fast_io_fail_timeout_sec is not None: + params['fast_io_fail_timeout_sec'] = fast_io_fail_timeout_sec + return client.call('bdev_nvme_set_options', params)