From 49ffdc18afc639a8f59e87e2e0945a379ced497b Mon Sep 17 00:00:00 2001 From: WindYu <787738581@qq.com> Date: Fri, 16 Jul 2021 17:42:20 -0400 Subject: [PATCH] bdev/nvme: Add support to get the health log for NVMe device Add a new RPC method to get the health log of a certain NVMe device. Below is the example: ./scripts/rpc.py bdev_nvme_get_controller_health_info -c Nvme0 { "model_number": "INTEL SSDPE2KX020T8", "serial_number": "BTLJ72430ARH2P0BGN", "firmware_revision": "VDV10110", "traddr": "0000:08:00.0", "temperature_celsius": 33, "available_spare_percentage": 99, "available_spare_threshold_percentage": 10, "percentage_used": 2, "data_units_read": 1013408619, "data_units_written": 346792685, "host_read_commands": 30457773264, "host_write_commands": 18949677715, "controller_busy_time": 4979, "power_cycles": 49, "power_on_hours": 31114, "unsafe_shutdowns": 18, "media_errors": 17, "num_err_log_entries": 19, "warning_temperature_time_minutes": 0, "critical_composite_temperature_time_minutes": 0 } Change-Id: I53125d2ec16cb36011571473430aece99167b803 Signed-off-by: GangCao Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/8806 Reviewed-by: Shuhei Matsumoto Reviewed-by: Tomasz Zawadzki Tested-by: SPDK CI Jenkins Community-CI: Mellanox Build Bot --- doc/jsonrpc.md | 57 ++++++++ module/bdev/nvme/bdev_nvme_rpc.c | 217 +++++++++++++++++++++++++++++++ scripts/rpc.py | 9 ++ scripts/rpc/bdev.py | 14 ++ 4 files changed, 297 insertions(+) diff --git a/doc/jsonrpc.md b/doc/jsonrpc.md index dc66e54c0..764f080f4 100644 --- a/doc/jsonrpc.md +++ b/doc/jsonrpc.md @@ -457,6 +457,7 @@ Example response: "bdev_passthru_delete" "bdev_nvme_apply_firmware", "bdev_nvme_get_transport_statistics", + "bdev_nvme_get_controller_health_info", "bdev_nvme_detach_controller", "bdev_nvme_attach_controller", "bdev_null_create", @@ -3341,6 +3342,62 @@ Example response: } ~~~ +## bdev_nvme_get_controller_health_info {#rpc_bdev_nvme_get_controller_health_info} + +Display health log of the 
requested NVMe bdev controller. + +### Parameters + +Name | Optional | Type | Description +----------------------- | -------- | ----------- | ----------- +name | Required | string | Name of the NVMe bdev controller + +### Response + +The response is an object containing the health log information of the NVMe controller. + +### Example + +Example request: + +~~~ +{ + "jsonrpc": "2.0", + "method": "bdev_nvme_get_controller_health_info", + "id": 1, + "params": { + "name": "Nvme0" + } +} +~~~ + +Example response: + +~~~ +{ + "model_number": "INTEL SSDPE2KX020T8", + "serial_number": "BTLJ72430ARH2P0BGN", + "firmware_revision": "VDV10110", + "traddr": "0000:08:00.0", + "temperature_celsius": 32, + "available_spare_percentage": 99, + "available_spare_threshold_percentage": 10, + "percentage_used": 2, + "data_units_read": 1013408619, + "data_units_written": 346792685, + "host_read_commands": 30457773282, + "host_write_commands": 18949677715, + "controller_busy_time": 4979, + "power_cycles": 49, + "power_on_hours": 31118, + "unsafe_shutdowns": 18, + "media_errors": 17, + "num_err_log_entries": 19, + "warning_temperature_time_minutes": 0, + "critical_composite_temperature_time_minutes": 0 +} +~~~ + ## bdev_rbd_register_cluster {#rpc_bdev_rbd_register_cluster} This method is available only if SPDK was build with Ceph RBD support. 
diff --git a/module/bdev/nvme/bdev_nvme_rpc.c b/module/bdev/nvme/bdev_nvme_rpc.c
index d333e5bbd..dea749e1d 100644
--- a/module/bdev/nvme/bdev_nvme_rpc.c
+++ b/module/bdev/nvme/bdev_nvme_rpc.c
@@ -41,6 +41,9 @@
 #include "spdk/string.h"
 #include "spdk/rpc.h"
 #include "spdk/util.h"
+#include "spdk/env.h"
+#include "spdk/nvme.h"
+#include "spdk/nvme_spec.h"
 #include "spdk/log.h"
 
 #include "spdk/bdev_module.h"
@@ -1145,3 +1148,238 @@ cleanup:
 	free_rpc_bdev_nvme_reset_controller_req(&req);
 }
 SPDK_RPC_REGISTER("bdev_nvme_reset_controller", rpc_bdev_nvme_reset_controller, SPDK_RPC_RUNTIME)
+
+/* Decoded parameters of the bdev_nvme_get_controller_health_info RPC. */
+struct rpc_get_controller_health_info {
+	char *name;
+};
+
+/* State carried through the asynchronous admin commands of one RPC request. */
+struct spdk_nvme_health_info_context {
+	struct spdk_jsonrpc_request *request;
+	struct spdk_nvme_ctrlr *ctrlr;
+	struct spdk_nvme_health_information_page health_page;
+};
+
+static void
+free_rpc_get_controller_health_info(struct rpc_get_controller_health_info *r)
+{
+	free(r->name);
+}
+
+/* "name" is mandatory: the decoder's trailing "optional" field defaults to false. */
+static const struct spdk_json_object_decoder rpc_get_controller_health_info_decoders[] = {
+	{"name", offsetof(struct rpc_get_controller_health_info, name), spdk_json_decode_string},
+};
+
+/*
+ * Free the request context. When response is true, the pending JSON-RPC
+ * request is first answered with a generic internal error.
+ */
+static void
+nvme_health_info_cleanup(struct spdk_nvme_health_info_context *context, bool response)
+{
+	if (response) {
+		spdk_jsonrpc_send_error_response(context->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "Internal error.");
+	}
+
+	free(context);
+}
+
+/*
+ * Completion callback of the health log page read: on success, write the
+ * controller identity and the health counters as the JSON-RPC result.
+ */
+static void
+get_health_log_page_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl)
+{
+	int i;
+	char buf[128];
+	struct spdk_nvme_health_info_context *context = cb_arg;
+	struct spdk_jsonrpc_request *request = context->request;
+	struct spdk_json_write_ctx *w;
+	struct spdk_nvme_ctrlr *ctrlr = context->ctrlr;
+	const struct spdk_nvme_transport_id *trid = NULL;
+	const struct spdk_nvme_ctrlr_data *cdata = NULL;
+	struct spdk_nvme_health_information_page *health_page = NULL;
+
+	if (spdk_nvme_cpl_is_error(cpl)) {
+		SPDK_ERRLOG("get log page failed\n");
+		nvme_health_info_cleanup(context, true);
+		return;
+	}
+
+	if (ctrlr == NULL) {
+		SPDK_ERRLOG("ctrlr is NULL\n");
+		nvme_health_info_cleanup(context, true);
+		return;
+	}
+
+	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
+	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
+	health_page = &(context->health_page);
+
+	w = spdk_jsonrpc_begin_result(request);
+
+	spdk_json_write_object_begin(w);
+	/*
+	 * mn, sn and fr are fixed-width fields that need not be NUL-terminated,
+	 * so bound each read with an explicit precision.
+	 */
+	snprintf(buf, sizeof(buf), "%.*s", (int)sizeof(cdata->mn), cdata->mn);
+	spdk_str_trim(buf);
+	spdk_json_write_named_string(w, "model_number", buf);
+	snprintf(buf, sizeof(buf), "%.*s", (int)sizeof(cdata->sn), cdata->sn);
+	spdk_str_trim(buf);
+	spdk_json_write_named_string(w, "serial_number", buf);
+	snprintf(buf, sizeof(buf), "%.*s", (int)sizeof(cdata->fr), cdata->fr);
+	spdk_str_trim(buf);
+	spdk_json_write_named_string(w, "firmware_revision", buf);
+	spdk_json_write_named_string(w, "traddr", trid->traddr);
+	/* Temperatures are converted from Kelvin by subtracting 273. */
+	spdk_json_write_named_uint64(w, "temperature_celsius", health_page->temperature - 273);
+	spdk_json_write_named_uint64(w, "available_spare_percentage", health_page->available_spare);
+	spdk_json_write_named_uint64(w, "available_spare_threshold_percentage",
+				     health_page->available_spare_threshold);
+	spdk_json_write_named_uint64(w, "percentage_used", health_page->percentage_used);
+	spdk_json_write_named_uint128(w, "data_units_read",
+				      health_page->data_units_read[0], health_page->data_units_read[1]);
+	spdk_json_write_named_uint128(w, "data_units_written",
+				      health_page->data_units_written[0], health_page->data_units_written[1]);
+	spdk_json_write_named_uint128(w, "host_read_commands",
+				      health_page->host_read_commands[0], health_page->host_read_commands[1]);
+	spdk_json_write_named_uint128(w, "host_write_commands",
+				      health_page->host_write_commands[0], health_page->host_write_commands[1]);
+	spdk_json_write_named_uint128(w, "controller_busy_time",
+				      health_page->controller_busy_time[0], health_page->controller_busy_time[1]);
+	spdk_json_write_named_uint128(w, "power_cycles",
+				      health_page->power_cycles[0], health_page->power_cycles[1]);
+	spdk_json_write_named_uint128(w, "power_on_hours",
+				      health_page->power_on_hours[0], health_page->power_on_hours[1]);
+	spdk_json_write_named_uint128(w, "unsafe_shutdowns",
+				      health_page->unsafe_shutdowns[0], health_page->unsafe_shutdowns[1]);
+	spdk_json_write_named_uint128(w, "media_errors",
+				      health_page->media_errors[0], health_page->media_errors[1]);
+	spdk_json_write_named_uint128(w, "num_err_log_entries",
+				      health_page->num_error_info_log_entries[0], health_page->num_error_info_log_entries[1]);
+	spdk_json_write_named_uint64(w, "warning_temperature_time_minutes", health_page->warning_temp_time);
+	spdk_json_write_named_uint64(w, "critical_composite_temperature_time_minutes",
+				     health_page->critical_temp_time);
+	/* Distinct member names avoid duplicate JSON keys; zero readings are skipped. */
+	for (i = 0; i < 8; i++) {
+		if (health_page->temp_sensor[i] != 0) {
+			snprintf(buf, sizeof(buf), "temperature_sensor_%d_celsius", i + 1);
+			spdk_json_write_named_uint64(w, buf, health_page->temp_sensor[i] - 273);
+		}
+	}
+	spdk_json_write_object_end(w);
+
+	spdk_jsonrpc_end_result(request, w);
+	nvme_health_info_cleanup(context, false);
+}
+
+/* Submit the Get Log Page command for the health information page. */
+static void
+get_health_log_page(struct spdk_nvme_health_info_context *context)
+{
+	struct spdk_nvme_ctrlr *ctrlr = context->ctrlr;
+
+	if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_HEALTH_INFORMATION,
+					     SPDK_NVME_GLOBAL_NS_TAG,
+					     &(context->health_page), sizeof(context->health_page), 0,
+					     get_health_log_page_completion, context)) {
+		SPDK_ERRLOG("spdk_nvme_ctrlr_cmd_get_log_page() failed\n");
+		nvme_health_info_cleanup(context, true);
+	}
+}
+
+/* Get Features completion: on success, continue with the health log page read. */
+static void
+get_temperature_threshold_feature_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl)
+{
+	struct spdk_nvme_health_info_context *context = cb_arg;
+
+	if (spdk_nvme_cpl_is_error(cpl)) {
+		SPDK_ERRLOG("feature SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD failed in completion\n");
+		nvme_health_info_cleanup(context, true);
+	} else {
+		get_health_log_page(context);
+	}
+}
+
+/* Submit Get Features (Temperature Threshold); non-zero means the submission failed. */
+static int
+get_temperature_threshold_feature(struct spdk_nvme_health_info_context *context)
+{
+	struct spdk_nvme_cmd cmd = {};
+
+	cmd.opc = SPDK_NVME_OPC_GET_FEATURES;
+	cmd.cdw10 = SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD;
+
+	return spdk_nvme_ctrlr_cmd_admin_raw(context->ctrlr, &cmd, NULL, 0,
+					     get_temperature_threshold_feature_completion, context);
+}
+
+/* Allocate the per-request context and start the asynchronous command chain. */
+static void
+get_controller_health_info(struct spdk_jsonrpc_request *request, struct spdk_nvme_ctrlr *ctrlr)
+{
+	struct spdk_nvme_health_info_context *context;
+
+	context = calloc(1, sizeof(*context));
+	if (!context) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "Memory allocation error.");
+		return;
+	}
+
+	context->request = request;
+	context->ctrlr = ctrlr;
+
+	if (get_temperature_threshold_feature(context)) {
+		SPDK_ERRLOG("feature SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD failed to submit\n");
+		nvme_health_info_cleanup(context, true);
+	}
+}
+
+/*
+ * RPC handler: look up the named NVMe controller and report its health
+ * information. The response is sent from the command completion path.
+ */
+static void
+rpc_bdev_nvme_get_controller_health_info(struct spdk_jsonrpc_request *request,
+		const struct spdk_json_val *params)
+{
+	struct rpc_get_controller_health_info req = {};
+	struct nvme_ctrlr *nvme_ctrlr = NULL;
+
+	if (!params) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+						 "Missing device name");
+		return;
+	}
+
+	if (spdk_json_decode_object(params, rpc_get_controller_health_info_decoders,
+				    SPDK_COUNTOF(rpc_get_controller_health_info_decoders), &req)) {
+		SPDK_ERRLOG("spdk_json_decode_object failed\n");
+		free_rpc_get_controller_health_info(&req);
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+						 "Invalid parameters");
+		return;
+	}
+
+	nvme_ctrlr = nvme_ctrlr_get_by_name(req.name);
+	if (!nvme_ctrlr) {
+		SPDK_ERRLOG("nvme ctrlr name '%s' does not exist\n", req.name);
+		free_rpc_get_controller_health_info(&req);
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "Device not found");
+		return;
+	}
+
+	get_controller_health_info(request, nvme_ctrlr->ctrlr);
+	free_rpc_get_controller_health_info(&req);
+}
+SPDK_RPC_REGISTER("bdev_nvme_get_controller_health_info",
+		  rpc_bdev_nvme_get_controller_health_info, SPDK_RPC_RUNTIME)
diff --git a/scripts/rpc.py b/scripts/rpc.py
index 8b4940b75..d1f4e526c 100755
--- a/scripts/rpc.py
+++ b/scripts/rpc.py
@@ -964,6 +964,15 @@ if __name__ == "__main__":
                               help='Get bdev_nvme poll group transport statistics')
     p.set_defaults(func=bdev_nvme_get_transport_statistics)
 
+    def bdev_nvme_get_controller_health_info(args):
+        print_dict(rpc.bdev.bdev_nvme_get_controller_health_info(args.client,
+                                                                  name=args.name))
+
+    p = subparsers.add_parser('bdev_nvme_get_controller_health_info',
+                              help='Display health log of the required NVMe bdev controller.')
+    p.add_argument('-c', '--name', help="Name of the NVMe bdev controller. Example: Nvme0", required=True)
+    p.set_defaults(func=bdev_nvme_get_controller_health_info)
+
     # iSCSI
     def iscsi_set_options(args):
         rpc.iscsi.iscsi_set_options(
diff --git a/scripts/rpc/bdev.py b/scripts/rpc/bdev.py
index cf40b951c..d05087dee 100644
--- a/scripts/rpc/bdev.py
+++ b/scripts/rpc/bdev.py
@@ -1276,3 +1276,17 @@ def bdev_nvme_apply_firmware(client, bdev_name, filename):
 def bdev_nvme_get_transport_statistics(client):
     """Get bdev_nvme poll group transport statistics"""
     return client.call('bdev_nvme_get_transport_statistics')
+
+
+def bdev_nvme_get_controller_health_info(client, name):
+    """Display health log of the required NVMe bdev controller.
+
+    Args:
+        name: name of the required NVMe bdev controller
+
+    Returns:
+        Health log for the requested NVMe bdev controller.
+    """
+    params = {}
+    params['name'] = name
+    return client.call('bdev_nvme_get_controller_health_info', params)