bdev/nvme: I/O error resiliency can be configured by global options

Add three options for I/O error resiliency to spdk_bdev_nvme_opts.
The RPC bdev_nvme_set_options can then configure them.
These global values can be overridden if they are given by the RPC bdev_nvme_attach_controller.

Change-Id: If3ee23aeef8b7585fe0fb5ec4695df5866fc1e74
Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/11830
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
This commit is contained in:
Shuhei Matsumoto 2022-03-09 21:04:14 +09:00 committed by Tomasz Zawadzki
parent 6fb6716d45
commit 0fba8dc8cb
7 changed files with 100 additions and 2 deletions

View File

@ -31,6 +31,10 @@ A new flag `ACCEL_FLAG_PERSISTENT` was added to indicate the target memory is PM
Added `bdev_nvme_add_error_injection` and `bdev_nvme_remove_error_injection` RPCs to add and Added `bdev_nvme_add_error_injection` and `bdev_nvme_remove_error_injection` RPCs to add and
remove NVMe error injections. remove NVMe error injections.
New parameters, `ctrlr_loss_timeout_sec`, `reconnect_delay_sec`, and `fast_io_fail_timeout_sec`, are
added to the RPC `bdev_nvme_set_options`. They can be overridden if they are given by the RPC
`bdev_nvme_attach_controller`.
### event ### event
Added `msg_mempool_size` parameter to `spdk_reactors_init` and `spdk_thread_lib_init_ext`. Added `msg_mempool_size` parameter to `spdk_reactors_init` and `spdk_thread_lib_init_ext`.

View File

@ -2888,6 +2888,9 @@ Example response:
Set global parameters for all bdev NVMe. This RPC may only be called before SPDK subsystems have been initialized Set global parameters for all bdev NVMe. This RPC may only be called before SPDK subsystems have been initialized
or any bdev NVMe has been created. or any bdev NVMe has been created.
The parameters `ctrlr_loss_timeout_sec`, `reconnect_delay_sec`, and `fast_io_fail_timeout_sec` are for I/O error resiliency.
They can be overridden if they are given by the RPC `bdev_nvme_attach_controller`.
#### Parameters #### Parameters
Name | Optional | Type | Description Name | Optional | Type | Description
@ -2908,6 +2911,9 @@ delay_cmd_submit | Optional | boolean | Enable delaying NVMe comma
transport_retry_count | Optional | number | The number of attempts per I/O in the transport layer before an I/O fails. transport_retry_count | Optional | number | The number of attempts per I/O in the transport layer before an I/O fails.
bdev_retry_count | Optional | number | The number of attempts per I/O in the bdev layer before an I/O fails. -1 means infinite retries. bdev_retry_count | Optional | number | The number of attempts per I/O in the bdev layer before an I/O fails. -1 means infinite retries.
transport_ack_timeout | Optional | number | Time to wait ack until packet retransmission. RDMA specific. Range 0-31 where 0 is driver-specific default value. transport_ack_timeout | Optional | number | Time to wait ack until packet retransmission. RDMA specific. Range 0-31 where 0 is driver-specific default value.
ctrlr_loss_timeout_sec | Optional | number | Time to wait until ctrlr is reconnected before deleting ctrlr. -1 means infinite reconnects. 0 means no reconnect.
reconnect_delay_sec | Optional | number | Time to delay a reconnect retry. 0 means no reconnect.
fast_io_fail_timeout_sec | Optional | number | Time to wait until ctrlr is reconnected before failing I/O to ctrlr. 0 means no such timeout.
#### Example #### Example

View File

@ -139,6 +139,9 @@ static struct spdk_bdev_nvme_opts g_opts = {
.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
.bdev_retry_count = 3, .bdev_retry_count = 3,
.transport_ack_timeout = 0, .transport_ack_timeout = 0,
.ctrlr_loss_timeout_sec = 0,
.reconnect_delay_sec = 0,
.fast_io_fail_timeout_sec = 0,
}; };
#define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL
@ -3474,6 +3477,15 @@ err:
return rc; return rc;
} }
/*
 * Populate the default per-controller bdev options.
 *
 * The three I/O error resiliency settings (ctrlr_loss_timeout_sec,
 * reconnect_delay_sec and fast_io_fail_timeout_sec) are copied from the
 * global options in g_opts, and prchk_flags is reset to 0.
 *
 * Only these four fields of *opts are written; any other field is left
 * as the caller provided it.
 */
void
bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts)
{
	/* Inherit the resiliency timeouts currently set in the global options. */
	opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec;
	opts->reconnect_delay_sec = g_opts.reconnect_delay_sec;
	opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec;

	/* No global counterpart is read for prchk_flags; it defaults to zero. */
	opts->prchk_flags = 0;
}
static void static void
attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts)
@ -3587,6 +3599,10 @@ bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
*opts = g_opts; *opts = g_opts;
} }
static bool bdev_nvme_check_multipath_params(int32_t ctrlr_loss_timeout_sec,
uint32_t reconnect_delay_sec,
uint32_t fast_io_fail_timeout_sec);
static int static int
bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts)
{ {
@ -3601,6 +3617,12 @@ bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts)
return -EINVAL; return -EINVAL;
} }
if (!bdev_nvme_check_multipath_params(opts->ctrlr_loss_timeout_sec,
opts->reconnect_delay_sec,
opts->fast_io_fail_timeout_sec)) {
return -EINVAL;
}
return 0; return 0;
} }
@ -3979,6 +4001,8 @@ bdev_nvme_create(struct spdk_nvme_transport_id *trid,
if (bdev_opts) { if (bdev_opts) {
memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts));
} else {
bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts);
} }
if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
@ -5678,6 +5702,9 @@ bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec);
spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec);
spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
spdk_json_write_object_end(w); spdk_json_write_object_end(w);
spdk_json_write_object_end(w); spdk_json_write_object_end(w);

View File

@ -251,6 +251,9 @@ struct spdk_bdev_nvme_opts {
/* The number of attempts per I/O in the bdev layer before an I/O fails. */ /* The number of attempts per I/O in the bdev layer before an I/O fails. */
int32_t bdev_retry_count; int32_t bdev_retry_count;
uint8_t transport_ack_timeout; uint8_t transport_ack_timeout;
int32_t ctrlr_loss_timeout_sec;
uint32_t reconnect_delay_sec;
uint32_t fast_io_fail_timeout_sec;
}; };
struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch); struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch);
@ -258,6 +261,8 @@ void bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts);
int bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts); int bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts);
int bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx); int bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx);
void bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts);
int bdev_nvme_create(struct spdk_nvme_transport_id *trid, int bdev_nvme_create(struct spdk_nvme_transport_id *trid,
const char *base_name, const char *base_name,
const char **names, const char **names,

View File

@ -92,6 +92,9 @@ static const struct spdk_json_object_decoder rpc_bdev_nvme_options_decoders[] =
{"transport_retry_count", offsetof(struct spdk_bdev_nvme_opts, transport_retry_count), spdk_json_decode_uint32, true}, {"transport_retry_count", offsetof(struct spdk_bdev_nvme_opts, transport_retry_count), spdk_json_decode_uint32, true},
{"bdev_retry_count", offsetof(struct spdk_bdev_nvme_opts, bdev_retry_count), spdk_json_decode_int32, true}, {"bdev_retry_count", offsetof(struct spdk_bdev_nvme_opts, bdev_retry_count), spdk_json_decode_int32, true},
{"transport_ack_timeout", offsetof(struct spdk_bdev_nvme_opts, transport_ack_timeout), spdk_json_decode_uint8, true}, {"transport_ack_timeout", offsetof(struct spdk_bdev_nvme_opts, transport_ack_timeout), spdk_json_decode_uint8, true},
{"ctrlr_loss_timeout_sec", offsetof(struct spdk_bdev_nvme_opts, ctrlr_loss_timeout_sec), spdk_json_decode_int32, true},
{"reconnect_delay_sec", offsetof(struct spdk_bdev_nvme_opts, reconnect_delay_sec), spdk_json_decode_uint32, true},
{"fast_io_fail_timeout_sec", offsetof(struct spdk_bdev_nvme_opts, fast_io_fail_timeout_sec), spdk_json_decode_uint32, true},
}; };
static void static void
@ -324,6 +327,7 @@ rpc_bdev_nvme_attach_controller(struct spdk_jsonrpc_request *request,
} }
spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->req.drv_opts, sizeof(ctx->req.drv_opts)); spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->req.drv_opts, sizeof(ctx->req.drv_opts));
bdev_nvme_get_default_ctrlr_opts(&ctx->req.bdev_opts);
if (spdk_json_decode_object(params, rpc_bdev_nvme_attach_controller_decoders, if (spdk_json_decode_object(params, rpc_bdev_nvme_attach_controller_decoders,
SPDK_COUNTOF(rpc_bdev_nvme_attach_controller_decoders), SPDK_COUNTOF(rpc_bdev_nvme_attach_controller_decoders),

View File

@ -480,7 +480,10 @@ if __name__ == "__main__":
delay_cmd_submit=args.delay_cmd_submit, delay_cmd_submit=args.delay_cmd_submit,
transport_retry_count=args.transport_retry_count, transport_retry_count=args.transport_retry_count,
bdev_retry_count=args.bdev_retry_count, bdev_retry_count=args.bdev_retry_count,
transport_ack_timeout=args.transport_ack_timeout) transport_ack_timeout=args.transport_ack_timeout,
ctrlr_loss_timeout_sec=args.ctrlr_loss_timeout_sec,
reconnect_delay_sec=args.reconnect_delay_sec,
fast_io_fail_timeout_sec=args.fast_io_fail_timeout_sec)
p = subparsers.add_parser('bdev_nvme_set_options', aliases=['set_bdev_nvme_options'], p = subparsers.add_parser('bdev_nvme_set_options', aliases=['set_bdev_nvme_options'],
help='Set options for the bdev nvme type. This is startup command.') help='Set options for the bdev nvme type. This is startup command.')
@ -518,6 +521,29 @@ if __name__ == "__main__":
p.add_argument('-e', '--transport-ack-timeout', p.add_argument('-e', '--transport-ack-timeout',
help="""Time to wait ack until packet retransmission. RDMA specific. help="""Time to wait ack until packet retransmission. RDMA specific.
Range 0-31 where 0 is driver-specific default value.""", type=int) Range 0-31 where 0 is driver-specific default value.""", type=int)
p.add_argument('-l', '--ctrlr-loss-timeout-sec',
help="""Time to wait until ctrlr is reconnected before deleting ctrlr.
-1 means infinite reconnect retries. 0 means no reconnect retry.
If reconnect_delay_sec is zero, ctrlr_loss_timeout_sec has to be zero.
If reconnect_delay_sec is non-zero, ctrlr_loss_timeout_sec has to be -1 or not less than
reconnect_delay_sec.
This can be overridden by bdev_nvme_attach_controller.""",
type=int)
p.add_argument('-o', '--reconnect-delay-sec',
help="""Time to delay a reconnect retry.
If ctrlr_loss_timeout_sec is zero, reconnect_delay_sec has to be zero.
If ctrlr_loss_timeout_sec is -1, reconnect_delay_sec has to be non-zero.
If ctrlr_loss_timeout_sec is not -1 or zero, reconnect_delay_sec has to be non-zero and
less than ctrlr_loss_timeout_sec.
This can be overridden by bdev_nvme_attach_controller.""",
type=int)
p.add_argument('-u', '--fast-io-fail-timeout-sec',
help="""Time to wait until ctrlr is reconnected before failing I/O to ctrlr.
0 means no such timeout.
If fast_io_fail_timeout_sec is not zero, it has to be not less than reconnect_delay_sec and
less than ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1.
This can be overridden by bdev_nvme_attach_controller.""",
type=int)
p.set_defaults(func=bdev_nvme_set_options) p.set_defaults(func=bdev_nvme_set_options)

View File

@ -443,7 +443,8 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
low_priority_weight=None, medium_priority_weight=None, high_priority_weight=None, low_priority_weight=None, medium_priority_weight=None, high_priority_weight=None,
nvme_adminq_poll_period_us=None, nvme_ioq_poll_period_us=None, io_queue_requests=None, nvme_adminq_poll_period_us=None, nvme_ioq_poll_period_us=None, io_queue_requests=None,
delay_cmd_submit=None, transport_retry_count=None, bdev_retry_count=None, delay_cmd_submit=None, transport_retry_count=None, bdev_retry_count=None,
transport_ack_timeout=None): transport_ack_timeout=None, ctrlr_loss_timeout_sec=None, reconnect_delay_sec=None,
fast_io_fail_timeout_sec=None):
"""Set options for the bdev nvme. This is startup command. """Set options for the bdev nvme. This is startup command.
Args: Args:
@ -464,6 +465,22 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
bdev_retry_count: The number of attempts per I/O in the bdev layer when an I/O fails. -1 means infinite retries. (optional) bdev_retry_count: The number of attempts per I/O in the bdev layer when an I/O fails. -1 means infinite retries. (optional)
transport_ack_timeout: Time to wait ack until packet retransmission. RDMA specific. transport_ack_timeout: Time to wait ack until packet retransmission. RDMA specific.
Range 0-31 where 0 is driver-specific default value (optional) Range 0-31 where 0 is driver-specific default value (optional)
ctrlr_loss_timeout_sec: Time to wait until ctrlr is reconnected before deleting ctrlr.
-1 means infinite reconnect retries. 0 means no reconnect retry.
If reconnect_delay_sec is zero, ctrlr_loss_timeout_sec has to be zero.
If reconnect_delay_sec is non-zero, ctrlr_loss_timeout_sec has to be -1 or not less than reconnect_delay_sec.
This can be overridden by bdev_nvme_attach_controller. (optional)
reconnect_delay_sec: Time to delay a reconnect retry.
If ctrlr_loss_timeout_sec is zero, reconnect_delay_sec has to be zero.
If ctrlr_loss_timeout_sec is -1, reconnect_delay_sec has to be non-zero.
If ctrlr_loss_timeout_sec is not -1 or zero, reconnect_delay_sec has to be non-zero and less than ctrlr_loss_timeout_sec.
This can be overridden by bdev_nvme_attach_controller. (optional)
fast_io_fail_timeout_sec: Time to wait until ctrlr is reconnected before failing I/O to ctrlr.
0 means no such timeout.
If fast_io_fail_timeout_sec is not zero, it has to be not less than reconnect_delay_sec and less than
ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1.
This can be overridden by bdev_nvme_attach_controller. (optional)
""" """
params = {} params = {}
@ -516,6 +533,15 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
if transport_ack_timeout is not None: if transport_ack_timeout is not None:
params['transport_ack_timeout'] = transport_ack_timeout params['transport_ack_timeout'] = transport_ack_timeout
if ctrlr_loss_timeout_sec is not None:
params['ctrlr_loss_timeout_sec'] = ctrlr_loss_timeout_sec
if reconnect_delay_sec is not None:
params['reconnect_delay_sec'] = reconnect_delay_sec
if fast_io_fail_timeout_sec is not None:
params['fast_io_fail_timeout_sec'] = fast_io_fail_timeout_sec
return client.call('bdev_nvme_set_options', params) return client.call('bdev_nvme_set_options', params)