bdev/nvme: I/O error resiliency can be configured by global options

Add three options for I/O error resiliency to spdk_bdev_nvme_opts.
The RPC bdev_nvme_set_options can then configure them.
They can be overridden if they are given by the RPC bdev_nvme_attach_controller.

Change-Id: If3ee23aeef8b7585fe0fb5ec4695df5866fc1e74
Signed-off-by: Shuhei Matsumoto <smatsumoto@nvidia.com>
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/11830
Community-CI: Broadcom CI <spdk-ci.pdl@broadcom.com>
Community-CI: Mellanox Build Bot
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Ben Walker <benjamin.walker@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
This commit is contained in:
Shuhei Matsumoto 2022-03-09 21:04:14 +09:00 committed by Tomasz Zawadzki
parent 6fb6716d45
commit 0fba8dc8cb
7 changed files with 100 additions and 2 deletions

View File

@ -31,6 +31,10 @@ A new flag `ACCEL_FLAG_PERSISTENT` was added to indicate the target memory is PM
Added `bdev_nvme_add_error_injection` and `bdev_nvme_remove_error_injection` RPCs to add and
remove NVMe error injections.
New parameters, `ctrlr_loss_timeout_sec`, `reconnect_delay_sec`, and `fast_io_fail_timeout_sec`, are
added to the RPC `bdev_nvme_set_options`. They can be overridden if they are given by the RPC
`bdev_nvme_attach_controller`.
### event
Added `msg_mempool_size` parameter to `spdk_reactors_init` and `spdk_thread_lib_init_ext`.

View File

@ -2888,6 +2888,9 @@ Example response:
Set global parameters for all bdev NVMe. This RPC may only be called before SPDK subsystems have been initialized
or any bdev NVMe has been created.
The parameters `ctrlr_loss_timeout_sec`, `reconnect_delay_sec`, and `fast_io_fail_timeout_sec` are for I/O error resiliency.
They can be overridden if they are given by the RPC `bdev_nvme_attach_controller`.
#### Parameters
Name | Optional | Type | Description
@ -2908,6 +2911,9 @@ delay_cmd_submit | Optional | boolean | Enable delaying NVMe comma
transport_retry_count | Optional | number | The number of attempts per I/O in the transport layer before an I/O fails.
bdev_retry_count | Optional | number | The number of attempts per I/O in the bdev layer before an I/O fails. -1 means infinite retries.
transport_ack_timeout | Optional | number | Time to wait ack until packet retransmission. RDMA specific. Range 0-31 where 0 is driver-specific default value.
ctrlr_loss_timeout_sec | Optional | number | Time to wait until ctrlr is reconnected before deleting ctrlr. -1 means infinite reconnects. 0 means no reconnect.
reconnect_delay_sec | Optional | number | Time to delay a reconnect trial. 0 means no reconnect.
fast_io_fail_timeout_sec | Optional | number | Time to wait until ctrlr is reconnected before failing I/O to ctrlr. 0 means no such timeout.
#### Example

View File

@ -139,6 +139,9 @@ static struct spdk_bdev_nvme_opts g_opts = {
.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
.bdev_retry_count = 3,
.transport_ack_timeout = 0,
.ctrlr_loss_timeout_sec = 0,
.reconnect_delay_sec = 0,
.fast_io_fail_timeout_sec = 0,
};
#define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL
@ -3474,6 +3477,15 @@ err:
return rc;
}
/*
 * Populate *opts with the default per-controller bdev options.
 *
 * The three I/O error resiliency timeouts are copied from the module-wide
 * g_opts, and prchk_flags is reset to 0. Every field written here is
 * overwritten unconditionally; opts must not be NULL.
 */
void
bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts)
{
	/* Resiliency defaults come from the global options. */
	opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec;
	opts->reconnect_delay_sec = g_opts.reconnect_delay_sec;
	opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec;

	/* This field has no global counterpart in g_opts here; default is 0. */
	opts->prchk_flags = 0;
}
static void
attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts)
@ -3587,6 +3599,10 @@ bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
*opts = g_opts;
}
/*
 * Forward declaration: bdev_nvme_validate_opts() below calls this checker on
 * the three I/O error resiliency options and rejects the options with -EINVAL
 * when it returns false. The definition is not visible in this hunk.
 */
static bool bdev_nvme_check_multipath_params(int32_t ctrlr_loss_timeout_sec,
uint32_t reconnect_delay_sec,
uint32_t fast_io_fail_timeout_sec);
static int
bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts)
{
@ -3601,6 +3617,12 @@ bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts)
return -EINVAL;
}
if (!bdev_nvme_check_multipath_params(opts->ctrlr_loss_timeout_sec,
opts->reconnect_delay_sec,
opts->fast_io_fail_timeout_sec)) {
return -EINVAL;
}
return 0;
}
@ -3979,6 +4001,8 @@ bdev_nvme_create(struct spdk_nvme_transport_id *trid,
if (bdev_opts) {
memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts));
} else {
bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts);
}
if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
@ -5678,6 +5702,9 @@ bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec);
spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec);
spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
spdk_json_write_object_end(w);
spdk_json_write_object_end(w);

View File

@ -251,6 +251,9 @@ struct spdk_bdev_nvme_opts {
/* The number of attempts per I/O in the bdev layer before an I/O fails. */
int32_t bdev_retry_count;
uint8_t transport_ack_timeout;
int32_t ctrlr_loss_timeout_sec;
uint32_t reconnect_delay_sec;
uint32_t fast_io_fail_timeout_sec;
};
struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch);
@ -258,6 +261,8 @@ void bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts);
int bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts);
int bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx);
void bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts);
int bdev_nvme_create(struct spdk_nvme_transport_id *trid,
const char *base_name,
const char **names,

View File

@ -92,6 +92,9 @@ static const struct spdk_json_object_decoder rpc_bdev_nvme_options_decoders[] =
{"transport_retry_count", offsetof(struct spdk_bdev_nvme_opts, transport_retry_count), spdk_json_decode_uint32, true},
{"bdev_retry_count", offsetof(struct spdk_bdev_nvme_opts, bdev_retry_count), spdk_json_decode_int32, true},
{"transport_ack_timeout", offsetof(struct spdk_bdev_nvme_opts, transport_ack_timeout), spdk_json_decode_uint8, true},
{"ctrlr_loss_timeout_sec", offsetof(struct spdk_bdev_nvme_opts, ctrlr_loss_timeout_sec), spdk_json_decode_int32, true},
{"reconnect_delay_sec", offsetof(struct spdk_bdev_nvme_opts, reconnect_delay_sec), spdk_json_decode_uint32, true},
{"fast_io_fail_timeout_sec", offsetof(struct spdk_bdev_nvme_opts, fast_io_fail_timeout_sec), spdk_json_decode_uint32, true},
};
static void
@ -324,6 +327,7 @@ rpc_bdev_nvme_attach_controller(struct spdk_jsonrpc_request *request,
}
spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->req.drv_opts, sizeof(ctx->req.drv_opts));
bdev_nvme_get_default_ctrlr_opts(&ctx->req.bdev_opts);
if (spdk_json_decode_object(params, rpc_bdev_nvme_attach_controller_decoders,
SPDK_COUNTOF(rpc_bdev_nvme_attach_controller_decoders),

View File

@ -480,7 +480,10 @@ if __name__ == "__main__":
delay_cmd_submit=args.delay_cmd_submit,
transport_retry_count=args.transport_retry_count,
bdev_retry_count=args.bdev_retry_count,
transport_ack_timeout=args.transport_ack_timeout)
transport_ack_timeout=args.transport_ack_timeout,
ctrlr_loss_timeout_sec=args.ctrlr_loss_timeout_sec,
reconnect_delay_sec=args.reconnect_delay_sec,
fast_io_fail_timeout_sec=args.fast_io_fail_timeout_sec)
p = subparsers.add_parser('bdev_nvme_set_options', aliases=['set_bdev_nvme_options'],
help='Set options for the bdev nvme type. This is startup command.')
@ -518,6 +521,29 @@ if __name__ == "__main__":
p.add_argument('-e', '--transport-ack-timeout',
help="""Time to wait ack until packet retransmission. RDMA specific.
Range 0-31 where 0 is driver-specific default value.""", type=int)
p.add_argument('-l', '--ctrlr-loss-timeout-sec',
help="""Time to wait until ctrlr is reconnected before deleting ctrlr.
-1 means infinite reconnect retries. 0 means no reconnect retry.
If reconnect_delay_sec is zero, ctrlr_loss_timeout_sec has to be zero.
If reconnect_delay_sec is non-zero, ctrlr_loss_timeout_sec has to be -1 or not less than
reconnect_delay_sec.
This can be overridden by bdev_nvme_attach_controller.""",
type=int)
p.add_argument('-o', '--reconnect-delay-sec',
help="""Time to delay a reconnect retry.
If ctrlr_loss_timeout_sec is zero, reconnect_delay_sec has to be zero.
If ctrlr_loss_timeout_sec is -1, reconnect_delay_sec has to be non-zero.
If ctrlr_loss_timeout_sec is not -1 or zero, reconnect_delay_sec has to be non-zero and
less than ctrlr_loss_timeout_sec.
This can be overridden by bdev_nvme_attach_controller.""",
type=int)
p.add_argument('-u', '--fast-io-fail-timeout-sec',
help="""Time to wait until ctrlr is reconnected before failing I/O to ctrlr.
0 means no such timeout.
If fast_io_fail_timeout_sec is not zero, it has to be not less than reconnect_delay_sec and
less than ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1.
This can be overridden by bdev_nvme_attach_controller.""",
type=int)
p.set_defaults(func=bdev_nvme_set_options)

View File

@ -443,7 +443,8 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
low_priority_weight=None, medium_priority_weight=None, high_priority_weight=None,
nvme_adminq_poll_period_us=None, nvme_ioq_poll_period_us=None, io_queue_requests=None,
delay_cmd_submit=None, transport_retry_count=None, bdev_retry_count=None,
transport_ack_timeout=None):
transport_ack_timeout=None, ctrlr_loss_timeout_sec=None, reconnect_delay_sec=None,
fast_io_fail_timeout_sec=None):
"""Set options for the bdev nvme. This is startup command.
Args:
@ -464,6 +465,22 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
bdev_retry_count: The number of attempts per I/O in the bdev layer when an I/O fails. -1 means infinite retries. (optional)
transport_ack_timeout: Time to wait ack until packet retransmission. RDMA specific.
Range 0-31 where 0 is driver-specific default value (optional)
ctrlr_loss_timeout_sec: Time to wait until ctrlr is reconnected before deleting ctrlr.
-1 means infinite reconnect retries. 0 means no reconnect retry.
If reconnect_delay_sec is zero, ctrlr_loss_timeout_sec has to be zero.
If reconnect_delay_sec is non-zero, ctrlr_loss_timeout_sec has to be -1 or not less than reconnect_delay_sec.
This can be overridden by bdev_nvme_attach_controller. (optional)
reconnect_delay_sec: Time to delay a reconnect retry.
If ctrlr_loss_timeout_sec is zero, reconnect_delay_sec has to be zero.
If ctrlr_loss_timeout_sec is -1, reconnect_delay_sec has to be non-zero.
If ctrlr_loss_timeout_sec is not -1 or zero, reconnect_delay_sec has to be non-zero and less than ctrlr_loss_timeout_sec.
This can be overridden by bdev_nvme_attach_controller. (optional)
fast_io_fail_timeout_sec: Time to wait until ctrlr is reconnected before failing I/O to ctrlr.
0 means no such timeout.
If fast_io_fail_timeout_sec is not zero, it has to be not less than reconnect_delay_sec and less than
ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1.
This can be overridden by bdev_nvme_attach_controller. (optional)
"""
params = {}
@ -516,6 +533,15 @@ def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeo
if transport_ack_timeout is not None:
params['transport_ack_timeout'] = transport_ack_timeout
if ctrlr_loss_timeout_sec is not None:
params['ctrlr_loss_timeout_sec'] = ctrlr_loss_timeout_sec
if reconnect_delay_sec is not None:
params['reconnect_delay_sec'] = reconnect_delay_sec
if fast_io_fail_timeout_sec is not None:
params['fast_io_fail_timeout_sec'] = fast_io_fail_timeout_sec
return client.call('bdev_nvme_set_options', params)