Spdk/test/nvme/aer/aer.c

759 lines
19 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (C) 2015 Intel Corporation.
* All rights reserved.
*/
#include "spdk/stdinc.h"
#include "spdk/log.h"
#include "spdk/nvme.h"
#include "spdk/env.h"
#include "spdk/string.h"
#define MAX_DEVS 64
/*
 * Per-controller test state. One entry of g_devs is filled in by attach_cb()
 * for every controller the probe attaches to.
 */
struct dev {
/* Controller handle received in attach_cb(). */
struct spdk_nvme_ctrlr *ctrlr;
/* Expected changed NS ID state before AER */
bool ns_test_active;
/* Buffer the health log page is read into; allocated in attach_cb(), freed in cleanup(). */
struct spdk_nvme_health_information_page *health_page;
/* cdw0 of the Get Features (temperature threshold) completion, used to restore the threshold. */
uint32_t orig_temp_threshold;
/* Set by aer_cb() once the threshold restore was requested; further AENs are then ignored. */
bool reset_temp_active;
/* Copy of the transport address, used as the device label in log output. */
char name[SPDK_NVMF_TRADDR_MAX_LEN + 1];
};
/* Forward declaration: get_feature_test_cb() resubmits through get_feature_test(). */
static void get_feature_test(struct dev *dev);
/* Table of attached controllers and the number of valid entries in it. */
static struct dev g_devs[MAX_DEVS];
static int g_num_devs = 0;
/* Iterate 'iter' (a struct dev *) over the valid entries of g_devs. */
#define foreach_dev(iter) \
for (iter = g_devs; iter - g_devs < g_num_devs; iter++)
/*
 * printf/fprintf wrappers that prefix the output with "[Child] " when running
 * as the child process. 'format' must be a string literal because it is
 * concatenated with the "%s" prefix specifier.
 */
#define AER_PRINTF(format, ...) printf("%s" format, g_parent_process ? "" : "[Child] ", \
##__VA_ARGS__)
#define AER_FPRINTF(f, format, ...) fprintf(f, "%s" format, g_parent_process ? \
"" : "[Child] ", ##__VA_ARGS__)
/* Admin commands submitted by this test whose completion callback has not run yet. */
static int g_outstanding_commands = 0;
/* Incremented per handled AER (health log page read, or changed-namespace notice). */
static int g_aer_done = 0;
/* Incremented per successful temperature threshold get/set completion. */
static int g_temperature_done = 0;
/* Non-zero once any step of the test failed; also used as the exit status of main(). */
static int g_failed = 0;
/* Transport ID handed to spdk_nvme_probe(); PCIe by default, overridden by -r. */
static struct spdk_nvme_transport_id g_trid;
/* File created once the AER callbacks are registered (-t), NULL if not requested. */
static char *g_touch_file;
/* Enable AER temperature test */
static int g_enable_temp_test = 0;
/* Expected changed NS ID */
static uint32_t g_expected_ns_test = 0;
/* For multi-process test */
static int g_multi_process_test = 0;
static bool g_parent_process = true;
/* Names and handles of the two named semaphores shared by parent and child. */
static const char *g_sem_init_name = "/init";
static const char *g_sem_child_name = "/child";
static sem_t *g_sem_init_id;
static sem_t *g_sem_child_id;
/*
 * Completion callback for the Set Features (temperature threshold) command.
 *
 * cb_arg is the struct dev the command was submitted for. Accounts for the
 * finished command and either records the failure or bumps
 * g_temperature_done.
 */
static void
set_temp_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl)
{
	struct dev *device = cb_arg;

	g_outstanding_commands--;

	if (!spdk_nvme_cpl_is_error(cpl)) {
		/* Admin command completions are synchronized by the NVMe driver,
		 * so we don't need to do any special locking here. */
		g_temperature_done++;
		return;
	}

	AER_PRINTF("%s: set feature (temp threshold) failed\n", device->name);
	g_failed = 1;
}
static int
set_temp_threshold(struct dev *dev, uint32_t temp)
{
struct spdk_nvme_cmd cmd = {};
int rc;
cmd.opc = SPDK_NVME_OPC_SET_FEATURES;
cmd.cdw10_bits.set_features.fid = SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD;
cmd.cdw11_bits.feat_temp_threshold.bits.tmpth = temp;
rc = spdk_nvme_ctrlr_cmd_admin_raw(dev->ctrlr, &cmd, NULL, 0, set_temp_completion, dev);
if (rc == 0) {
g_outstanding_commands++;
} else {
AER_FPRINTF(stderr, "Submitting Admin cmd failed with rc: %d\n", rc);
}
return rc;
}
/*
 * Completion callback for the Get Features (temperature threshold) command.
 *
 * cb_arg is the struct dev the command was submitted for. On success the
 * returned cdw0 is remembered as the original threshold so that it can be
 * restored later, and g_temperature_done is incremented. On error the test
 * is marked failed.
 */
static void
get_temp_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl)
{
	struct dev *dev = cb_arg;

	g_outstanding_commands--;

	if (spdk_nvme_cpl_is_error(cpl)) {
		AER_PRINTF("%s: get feature (temp threshold) failed\n", dev->name);
		g_failed = 1;
		return;
	}

	dev->orig_temp_threshold = cpl->cdw0;
	/* Convert to int before subtracting so the argument really is the signed
	 * value that %d expects (thresholds below 273 K are negative in Celsius). */
	AER_PRINTF("%s: original temperature threshold: %u Kelvin (%d Celsius)\n",
		   dev->name, dev->orig_temp_threshold, (int)dev->orig_temp_threshold - 273);
	g_temperature_done++;
}
static int
get_temp_threshold(struct dev *dev)
{
struct spdk_nvme_cmd cmd = {};
int rc;
cmd.opc = SPDK_NVME_OPC_GET_FEATURES;
cmd.cdw10_bits.get_features.fid = SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD;
rc = spdk_nvme_ctrlr_cmd_admin_raw(dev->ctrlr, &cmd, NULL, 0, get_temp_completion, dev);
if (rc == 0) {
g_outstanding_commands++;
}
return rc;
}
/*
 * Print the temperature field of health page 'hip', labelled with the name
 * of 'dev', in Kelvin and in Celsius.
 */
static void
print_health_page(struct dev *dev, struct spdk_nvme_health_information_page *hip)
{
	const char *label = dev->name;

	AER_PRINTF("%s: Current Temperature: %u Kelvin (%d Celsius)\n",
		   label, hip->temperature, hip->temperature - 273);
}
/*
 * Completion callback for the health information Get Log Page command.
 *
 * cb_arg is the struct dev the command was submitted for. On success the
 * page is printed and the AER is counted as handled (g_aer_done); on error
 * the test is marked failed.
 */
static void
get_health_log_page_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl)
{
	struct dev *device = cb_arg;

	g_outstanding_commands--;

	if (!spdk_nvme_cpl_is_error(cpl)) {
		print_health_page(device, device->health_page);
		g_aer_done++;
		return;
	}

	AER_PRINTF("%s: get log page failed\n", device->name);
	g_failed = 1;
}
/*
 * Request the health information log page of 'dev' into dev->health_page.
 * Completion is reported through get_health_log_page_completion().
 *
 * Returns 0 when the command was queued, otherwise the submission error code.
 */
static int
get_health_log_page(struct dev *dev)
{
	int status = spdk_nvme_ctrlr_cmd_get_log_page(dev->ctrlr,
			SPDK_NVME_LOG_HEALTH_INFORMATION,
			SPDK_NVME_GLOBAL_NS_TAG, dev->health_page,
			sizeof(*dev->health_page), 0,
			get_health_log_page_completion, dev);

	if (status != 0) {
		return status;
	}

	g_outstanding_commands++;
	return status;
}
/*
 * Verify that namespace 'nsid' of 'dev' changed its active state compared to
 * the state recorded in dev->ns_test_active. An unchanged state marks the
 * test as failed.
 */
static void
get_ns_state_test(struct dev *dev, uint32_t nsid)
{
	bool active_now = spdk_nvme_ctrlr_is_active_ns(dev->ctrlr, nsid);

	if (active_now != dev->ns_test_active) {
		return;
	}

	g_failed = 1;
}
static void
cleanup(void)
{
struct dev *dev;
foreach_dev(dev) {
if (dev->health_page) {
spdk_free(dev->health_page);
}
}
}
/*
 * Asynchronous event callback registered for every controller.
 *
 * arg is the struct dev of the controller that raised the event and cpl the
 * AER completion. A temperature (or spare) threshold SMART event makes the
 * parent process restore the original temperature threshold and makes every
 * process read the health log page; the AER is counted once that read
 * completes. A changed-namespace-list event is checked against the expected
 * namespace and counted immediately. Any submission failure marks the test
 * failed so that the wait loops in the tests terminate.
 */
static void
aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
{
	struct dev *dev = arg;
	uint32_t log_page_id;
	uint32_t aen_event_info;
	uint32_t aen_event_type;
	union spdk_nvme_async_event_completion aen_cpl;

	if (spdk_nvme_cpl_is_error(cpl)) {
		AER_FPRINTF(stderr, "%s: AER failed\n", dev->name);
		g_failed = 1;
		return;
	}

	/* If we are already resetting the temp, no need to print more AENs */
	if (dev->reset_temp_active) {
		return;
	}

	aen_cpl.raw = cpl->cdw0;
	aen_event_info = aen_cpl.bits.async_event_info;
	aen_event_type = aen_cpl.bits.async_event_type;
	log_page_id = aen_cpl.bits.log_page_identifier;

	AER_PRINTF("%s: aer_cb for log page %u, aen_event_type: 0x%02x, aen_event_info: 0x%02x\n",
		   dev->name, log_page_id, aen_event_type, aen_event_info);

	/* Temp Test: Verify proper Event Type, Event Info and Log Page.
	 * NOTE: QEMU NVMe controllers return Spare Below Threshold Status event info
	 * instead of Temperature Threshold event info which is why it's used in the
	 * check below.
	 */
	if ((log_page_id == SPDK_NVME_LOG_HEALTH_INFORMATION) &&
	    (aen_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_SMART) &&
	    ((aen_event_info == SPDK_NVME_ASYNC_EVENT_TEMPERATURE_THRESHOLD) ||
	     (aen_event_info == SPDK_NVME_ASYNC_EVENT_SPARE_BELOW_THRESHOLD))) {
		/* Set the temperature threshold back to the original value to stop triggering */
		if (g_parent_process) {
			AER_PRINTF("aer_cb - Resetting Temp Threshold for device: %s\n", dev->name);
			if (set_temp_threshold(dev, dev->orig_temp_threshold)) {
				g_failed = 1;
			}
			dev->reset_temp_active = true;
		}

		/* g_aer_done is only incremented from the log page completion, so a
		 * failed submission must fail the test instead of leaving it waiting. */
		if (get_health_log_page(dev) != 0) {
			AER_FPRINTF(stderr, "%s: failed to submit get log page (health)\n", dev->name);
			g_failed = 1;
		}
	} else if (log_page_id == SPDK_NVME_LOG_CHANGED_NS_LIST) {
		AER_PRINTF("aer_cb - Changed Namespace\n");
		get_ns_state_test(dev, g_expected_ns_test);
		g_aer_done++;
	} else {
		AER_PRINTF("aer_cb - Unknown Log Page\n");
	}
}
/*
 * Print the command line help for this test to stdout.
 *
 * program_name is shown as the invocation name (normally argv[0]).
 */
static void
usage(const char *program_name)
{
	AER_PRINTF("%s [options]", program_name);
	AER_PRINTF("\n");
	AER_PRINTF("options:\n");
	/* The original help text carried a stray closing bracket on this line. */
	AER_PRINTF(" -g use single file descriptor for DPDK memory segments\n");
	AER_PRINTF(" -T enable temperature tests\n");
	AER_PRINTF(" -n expected Namespace attribute notice ID\n");
	AER_PRINTF(" -t <file> touch specified file when ready to receive AER\n");
	AER_PRINTF(" -r trid remote NVMe over Fabrics target address\n");
	AER_PRINTF(" Format: 'key:value [key:value] ...'\n");
	AER_PRINTF(" Keys:\n");
	AER_PRINTF(" trtype Transport type (e.g. RDMA)\n");
	AER_PRINTF(" adrfam Address family (e.g. IPv4, IPv6)\n");
	AER_PRINTF(" traddr Transport address (e.g. 192.168.100.8)\n");
	AER_PRINTF(" trsvcid Transport service identifier (e.g. 4420)\n");
	AER_PRINTF(" subnqn Subsystem NQN (default: %s)\n", SPDK_NVMF_DISCOVERY_NQN);
	AER_PRINTF(" Example: -r 'trtype:RDMA adrfam:IPv4 traddr:192.168.100.8 trsvcid:4420'\n");
	spdk_log_usage(stdout, "-L");
	AER_PRINTF(" -i <id> shared memory group ID\n");
	AER_PRINTF(" -m Multi-Process AER Test (only with Temp Test)\n");
	AER_PRINTF(" -H show this usage\n");
}
/*
 * Parse the command line into the test globals and 'env_opts'.
 *
 * The transport ID defaults to PCIe with the discovery NQN and may be
 * overridden with -r. Numeric arguments (-n, -i) are range checked so that
 * values which do not fit the destination type are rejected instead of being
 * silently truncated.
 *
 * Returns 0 on success and a non-zero value on any invalid argument. -H
 * prints the usage and exits successfully; an unknown -L flag prints the
 * usage and exits with failure.
 */
static int
parse_args(int argc, char **argv, struct spdk_env_opts *env_opts)
{
	int op, rc;
	long int val;

	spdk_nvme_trid_populate_transport(&g_trid, SPDK_NVME_TRANSPORT_PCIE);
	snprintf(g_trid.subnqn, sizeof(g_trid.subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);

	while ((op = getopt(argc, argv, "gi:mn:r:t:HL:T")) != -1) {
		switch (op) {
		case 'n':
			val = spdk_strtol(optarg, 10);
			if (val < 0) {
				AER_FPRINTF(stderr, "Invalid NS attribute notice ID\n");
				return val;
			}
			/* Namespace IDs are 32 bit; reject anything that would be truncated. */
			if ((unsigned long)val > UINT32_MAX) {
				AER_FPRINTF(stderr, "Invalid NS attribute notice ID\n");
				return 1;
			}
			g_expected_ns_test = (uint32_t)val;
			break;
		case 'g':
			env_opts->hugepage_single_segments = true;
			break;
		case 'r':
			if (spdk_nvme_transport_id_parse(&g_trid, optarg) != 0) {
				AER_FPRINTF(stderr, "Error parsing transport address\n");
				return 1;
			}
			break;
		case 't':
			g_touch_file = optarg;
			break;
		case 'L':
			rc = spdk_log_set_flag(optarg);
			if (rc < 0) {
				AER_FPRINTF(stderr, "unknown flag\n");
				usage(argv[0]);
				exit(EXIT_FAILURE);
			}
#ifdef DEBUG
			spdk_log_set_print_level(SPDK_LOG_DEBUG);
#endif
			break;
		case 'T':
			g_enable_temp_test = 1;
			break;
		case 'H':
			usage(argv[0]);
			exit(EXIT_SUCCESS);
		case 'i':
			val = spdk_strtol(optarg, 10);
			if (val < 0) {
				AER_FPRINTF(stderr, "Invalid shared memory ID\n");
				return val;
			}
			/* Reject IDs that would be truncated by the assignment below. */
			if (val > INT_MAX) {
				AER_FPRINTF(stderr, "Invalid shared memory ID\n");
				return 1;
			}
			env_opts->shm_id = (int)val;
			break;
		case 'm':
			g_multi_process_test = 1;
			break;
		default:
			usage(argv[0]);
			return 1;
		}
	}

	return 0;
}
/*
 * Probe callback: announce the controller found at 'trid' and accept it.
 * Always returns true, so every probed controller is attached.
 */
static bool
probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
	 struct spdk_nvme_ctrlr_opts *opts)
{
	const char *address = trid->traddr;

	AER_PRINTF("Attaching to %s\n", address);

	return true;
}
static void
attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
{
struct dev *dev;
/* add to dev list */
dev = &g_devs[g_num_devs++];
dev->ctrlr = ctrlr;
snprintf(dev->name, sizeof(dev->name), "%s",
trid->traddr);
AER_PRINTF("Attached to %s\n", dev->name);
dev->health_page = spdk_zmalloc(sizeof(*dev->health_page), 4096, NULL, SPDK_ENV_LCORE_ID_ANY,
SPDK_MALLOC_DMA);
if (dev->health_page == NULL) {
AER_PRINTF("Allocation error (health page)\n");
g_failed = 1;
}
}
/*
 * Completion callback for the Get Features (number of queues) filler command.
 *
 * cb_arg is the struct dev the command was submitted for. A failed command
 * marks the test failed; otherwise the command is submitted again for as
 * long as not every device has reported its AER.
 */
static void
get_feature_test_cb(void *cb_arg, const struct spdk_nvme_cpl *cpl)
{
	struct dev *device = cb_arg;

	g_outstanding_commands--;

	if (spdk_nvme_cpl_is_error(cpl)) {
		AER_PRINTF("%s: get number of queues failed\n", device->name);
		g_failed = 1;
		return;
	}

	if (g_aer_done >= g_num_devs) {
		return;
	}

	/*
	 * Resubmit Get Features command to continue filling admin queue
	 * while the test is running.
	 */
	get_feature_test(device);
}
static void
get_feature_test(struct dev *dev)
{
struct spdk_nvme_cmd cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opc = SPDK_NVME_OPC_GET_FEATURES;
cmd.cdw10_bits.get_features.fid = SPDK_NVME_FEAT_NUMBER_OF_QUEUES;
if (spdk_nvme_ctrlr_cmd_admin_raw(dev->ctrlr, &cmd, NULL, 0,
get_feature_test_cb, dev) != 0) {
AER_PRINTF("Failed to send Get Features command for dev=%p\n", dev);
g_failed = 1;
return;
}
g_outstanding_commands++;
}
/*
 * Run the temperature threshold AER test on all attached controllers.
 *
 * Steps: read and remember every controller's temperature threshold, start
 * the Get Features filler traffic, let the parent process lower the
 * thresholds to trigger the event, then poll until every controller has
 * reported an AER (counted in g_aer_done).
 *
 * Every command submission and every completion poll is checked. Any failure
 * sets g_failed, which ends the wait loops instead of spinning on a counter
 * that can no longer advance.
 *
 * Returns 0 on success, non-zero (g_failed) on failure.
 */
static int
spdk_aer_temperature_test(void)
{
	struct dev *dev;

	AER_PRINTF("Getting orig temperature thresholds of all controllers\n");
	foreach_dev(dev) {
		dev->reset_temp_active = false;
		/* Get the original temperature threshold */
		if (get_temp_threshold(dev) != 0) {
			AER_FPRINTF(stderr, "%s: failed to submit get feature (temp threshold)\n",
				    dev->name);
			g_failed = 1;
		}
	}

	while (!g_failed && g_temperature_done < g_num_devs) {
		foreach_dev(dev) {
			if (spdk_nvme_ctrlr_process_admin_completions(dev->ctrlr) < 0) {
				g_failed = 1;
			}
		}
	}

	if (g_failed) {
		return g_failed;
	}

	g_temperature_done = 0;
	g_aer_done = 0;

	/* Send admin commands to test admin queue wraparound while waiting for the AER */
	foreach_dev(dev) {
		get_feature_test(dev);
	}

	if (g_failed) {
		return g_failed;
	}

	/* Only single process needs to set and verify lower threshold */
	if (g_parent_process) {
		/* Wait until child has init'd and ready for test to continue */
		if (g_multi_process_test) {
			sem_wait(g_sem_child_id);
		}

		AER_PRINTF("Setting all controllers temperature threshold low to trigger AER\n");
		foreach_dev(dev) {
			/* Set the temperature threshold to a low value.
			 * set_temp_threshold() logs the submission error itself. */
			if (set_temp_threshold(dev, 200) != 0) {
				g_failed = 1;
			}
		}

		AER_PRINTF("Waiting for all controllers temperature threshold to be set lower\n");
		while (!g_failed && (g_temperature_done < g_num_devs)) {
			foreach_dev(dev) {
				if (spdk_nvme_ctrlr_process_admin_completions(dev->ctrlr) < 0) {
					g_failed = 1;
				}
			}
		}

		if (g_failed) {
			return g_failed;
		}
	}

	AER_PRINTF("Waiting for all controllers to trigger AER and reset threshold\n");
	/* Let parent know init is done and it's okay to continue */
	if (!g_parent_process) {
		sem_post(g_sem_child_id);
	}

	/* Wait for the AENs to occur here. Each device will increment g_aer_done on an AEN */
	while (!g_failed && (g_aer_done < g_num_devs)) {
		foreach_dev(dev) {
			if (spdk_nvme_ctrlr_process_admin_completions(dev->ctrlr) < 0) {
				g_failed = 1;
			}
		}
	}

	if (g_failed) {
		return g_failed;
	}

	return 0;
}
/*
 * Run the changed-namespace-list AER test on all attached controllers.
 *
 * Records the current active state of namespace g_expected_ns_test for every
 * controller, starts the Get Features filler traffic and then polls until
 * every controller has delivered a namespace attribute notice (counted in
 * g_aer_done by aer_cb()).
 *
 * Returns 0 on success, non-zero (g_failed) on failure.
 */
static int
spdk_aer_changed_ns_test(void)
{
	struct dev *dev;

	g_aer_done = 0;

	AER_PRINTF("Starting namespace attribute notice tests for all controllers...\n");
	foreach_dev(dev) {
		/* aer_cb() drops every event while this flag is set, and the
		 * temperature test leaves it set. Clear it so that the namespace
		 * notices are processed when both tests run in one invocation. */
		dev->reset_temp_active = false;
		get_feature_test(dev);
		dev->ns_test_active = spdk_nvme_ctrlr_is_active_ns(dev->ctrlr, g_expected_ns_test);
	}

	if (g_failed) {
		return g_failed;
	}

	while (!g_failed && (g_aer_done < g_num_devs)) {
		foreach_dev(dev) {
			/* A polling error means no further AER can arrive; fail
			 * instead of waiting forever. */
			if (spdk_nvme_ctrlr_process_admin_completions(dev->ctrlr) < 0) {
				g_failed = 1;
			}
		}
	}

	if (g_failed) {
		return g_failed;
	}

	return 0;
}
/*
 * Prepare the two-process variant of the test.
 *
 * Removes stale named semaphores, forks, records in g_parent_process which
 * side of the fork this process is, and opens both named semaphores in each
 * process.
 *
 * Returns 0 on success (in both processes) and a negative value on failure.
 */
static int
setup_multi_process(void)
{
	pid_t child_pid;
	int status = 0;

	/* If AEN test was killed, remove named semaphore to start again */
	status = sem_unlink(g_sem_init_name);
	if (status < 0 && errno != ENOENT) {
		AER_FPRINTF(stderr, "Init semaphore removal failure: %s", spdk_strerror(errno));
		return status;
	}

	status = sem_unlink(g_sem_child_name);
	if (status < 0 && errno != ENOENT) {
		AER_FPRINTF(stderr, "Child semaphore removal failure: %s", spdk_strerror(errno));
		return status;
	}

	child_pid = fork();
	if (child_pid == -1) {
		perror("Failed to fork\n");
		return -1;
	}

	if (child_pid == 0) {
		/* Printed before the role flag flips, so without the child prefix. */
		AER_PRINTF("Child process pid: %d\n", getpid());
	}
	g_parent_process = (child_pid != 0);

	/* Both processes open the same pair of named semaphores. */
	g_sem_init_id = sem_open(g_sem_init_name, O_CREAT, 0600, 0);
	g_sem_child_id = sem_open(g_sem_child_name, O_CREAT, 0600, 0);
	if ((g_sem_init_id != SEM_FAILED) && (g_sem_child_id != SEM_FAILED)) {
		return 0;
	}

	if (g_parent_process) {
		AER_FPRINTF(stderr, "Sem Open failed for parent: %s\n",
			    spdk_strerror(errno));
	} else {
		AER_FPRINTF(stderr, "Sem Open failed for child: %s\n",
			    spdk_strerror(errno));
	}

	return -1;
}
/*
 * Entry point of the AER test.
 *
 * Parses the arguments, optionally forks into a parent and a child process,
 * initializes the SPDK environment, attaches to the controllers selected by
 * the transport ID, registers the AER callback and runs the enabled tests
 * (temperature threshold and/or changed namespace list). Afterwards the
 * outstanding commands are drained and the controllers are detached; in the
 * multi-process case the child cleans up first and the parent then collects
 * its exit status.
 *
 * Returns 0 when every enabled test passed, non-zero otherwise.
 */
int
main(int argc, char **argv)
{
	struct dev *dev;
	struct spdk_env_opts opts;
	int rc;
	struct spdk_nvme_detach_ctx *detach_ctx = NULL;

	spdk_env_opts_init(&opts);
	rc = parse_args(argc, argv, &opts);
	if (rc != 0) {
		return rc;
	}

	if (g_multi_process_test) {
		/* Multi-Process test only available with Temp Test */
		if (!g_enable_temp_test) {
			AER_FPRINTF(stderr, "Multi Process only available with Temp Test (-T)\n");
			return 1;
		}
		if (opts.shm_id < 0) {
			AER_FPRINTF(stderr, "Multi Process requires shared memory id (-i <id>)\n");
			return 1;
		}
		rc = setup_multi_process();
		if (rc != 0) {
			AER_FPRINTF(stderr, "Multi Process test failed to setup\n");
			return rc;
		}
	} else {
		/* Only one process in test, set it to the parent process */
		g_parent_process = true;
	}

	opts.name = "aer";
	if (g_parent_process) {
		opts.core_mask = "0x1";
	} else {
		opts.core_mask = "0x2";
	}

	/*
	 * For multi-process test, parent (primary) and child (secondary) processes
	 * will execute all following code but DPDK setup is serialized
	 */
	if (!g_parent_process) {
		if (sem_wait(g_sem_init_id) < 0) {
			AER_FPRINTF(stderr, "sem_wait failed for child process\n");
			return (-1);
		}
	}

	if (spdk_env_init(&opts) < 0) {
		AER_FPRINTF(stderr, "Unable to initialize SPDK env\n");
		return 1;
	}

	AER_PRINTF("Asynchronous Event Request test\n");

	if (spdk_nvme_probe(&g_trid, NULL, probe_cb, attach_cb, NULL) != 0) {
		AER_FPRINTF(stderr, "spdk_nvme_probe() failed\n");
		return 1;
	}

	if (g_num_devs == 0) {
		AER_FPRINTF(stderr, "No controllers found - exiting\n");
		g_failed = 1;
	}

	if (g_failed) {
		goto done;
	}

	if (g_parent_process && g_enable_temp_test) {
		AER_PRINTF("Reset controller to setup AER completions for this process\n");
		foreach_dev(dev) {
			if (spdk_nvme_ctrlr_reset(dev->ctrlr) != 0) {
				AER_FPRINTF(stderr, "nvme reset failed.\n");
				return -1;
			}
		}
	}

	if (g_parent_process && g_multi_process_test) {
		/* Primary can release child/secondary for init now */
		sem_post(g_sem_init_id);
	}

	AER_PRINTF("Registering asynchronous event callbacks...\n");
	foreach_dev(dev) {
		spdk_nvme_ctrlr_register_aer_callback(dev->ctrlr, aer_cb, dev);
	}

	if (g_touch_file) {
		int fd;

		/* The third argument of open() is the permission mode. The file
		 * type bit S_IFREG used previously created the file with no
		 * permissions at all. */
		fd = open(g_touch_file, O_CREAT | O_EXCL | O_RDWR, 0600);
		if (fd == -1) {
			AER_FPRINTF(stderr, "Could not touch %s (%s).\n", g_touch_file,
				    strerror(errno));
			g_failed = 1;
			goto done;
		}
		close(fd);
	}

	/* AER temperature test */
	if (g_enable_temp_test) {
		if (spdk_aer_temperature_test()) {
			goto done;
		}
	}

	/* AER changed namespace list test */
	if (g_expected_ns_test) {
		if (spdk_aer_changed_ns_test()) {
			goto done;
		}
	}

	AER_PRINTF("Cleaning up...\n");

	while (g_outstanding_commands) {
		foreach_dev(dev) {
			spdk_nvme_ctrlr_process_admin_completions(dev->ctrlr);
		}
	}

	/* Only one process cleans up at a time - let child go first */
	if (g_multi_process_test && g_parent_process) {
		/* Parent waits for child to clean up before executing clean up process */
		sem_wait(g_sem_child_id);
	}

	/* unregister AER callback so we don't fail on aborted AERs when we close out qpairs. */
	foreach_dev(dev) {
		spdk_nvme_ctrlr_register_aer_callback(dev->ctrlr, NULL, NULL);
	}

	foreach_dev(dev) {
		spdk_nvme_ctrlr_process_admin_completions(dev->ctrlr);
	}

	foreach_dev(dev) {
		spdk_nvme_detach_async(dev->ctrlr, &detach_ctx);
	}

	if (detach_ctx) {
		spdk_nvme_detach_poll(detach_ctx);
	}

	/* Release semaphore to allow parent to cleanup */
	if (!g_parent_process) {
		sem_post(g_sem_child_id);
		sem_wait(g_sem_init_id);
	}

done:
	cleanup();

	/* Wait for child process to finish and verify it finished correctly before detaching
	 * resources */
	if (g_multi_process_test && g_parent_process) {
		int status;

		sem_post(g_sem_init_id);
		if (wait(&status) == -1) {
			/* status is not valid when wait() itself fails. */
			perror("wait Failed for child");
			g_failed = 1;
		} else if (WIFEXITED(status)) {
			/* Child ended normally */
			if (WEXITSTATUS(status) != 0) {
				AER_FPRINTF(stderr, "Child Failed with status: %d.\n",
					    (int8_t)(WEXITSTATUS(status)));
				g_failed = 1;
			}
		} else {
			/* Child was terminated without exiting (e.g. by a signal). */
			AER_FPRINTF(stderr, "Child terminated abnormally.\n");
			g_failed = 1;
		}
		if (sem_close(g_sem_init_id) != 0) {
			perror("sem_close Failed for init\n");
			g_failed = 1;
		}
		if (sem_close(g_sem_child_id) != 0) {
			perror("sem_close Failed for child\n");
			g_failed = 1;
		}
		if (sem_unlink(g_sem_init_name) != 0) {
			perror("sem_unlink Failed for init\n");
			g_failed = 1;
		}
		if (sem_unlink(g_sem_child_name) != 0) {
			perror("sem_unlink Failed for child\n");
			g_failed = 1;
		}
	}

	return g_failed;
}