diff --git a/examples/nvme/perf/perf.c b/examples/nvme/perf/perf.c
index f0bc8e828..89cccec6a 100644
--- a/examples/nvme/perf/perf.c
+++ b/examples/nvme/perf/perf.c
@@ -43,6 +43,7 @@
 #include "spdk/queue.h"
 #include "spdk/string.h"
 #include "spdk/nvme_intel.h"
+#include "spdk/histogram_data.h"
 
 #if HAVE_LIBAIO
 #include <libaio.h>
@@ -81,41 +82,6 @@ struct ns_entry {
 	char		name[1024];
 };
 
-/*
- * Latency tracking is done with ranges of bucket arrays. The bucket
- * for any given I/O is determined solely by the TSC delta - any
- * translation to microseconds is only done after the test is finished
- * and statistics are printed.
- *
- * Each range has a number of buckets determined by NUM_BUCKETS_PER_RANGE
- * which is 128. The buckets in ranges 0 and 1 each map to one specific
- * TSC delta. The buckets in subsequent ranges each map to twice as many
- * TSC deltas as buckets in the range before it:
- *
- * Range 0:  1 TSC each - 128 buckets cover 0 to 127 (2^7-1)
- * Range 1:  1 TSC each - 128 buckets cover 128 to 255 (2^8-1)
- * Range 2:  2 TSC each - 128 buckets cover 256 to 511 (2^9-1)
- * Range 3:  4 TSC each - 128 buckets cover 512 to 1023 (2^10-1)
- * Range 4:  8 TSC each - 128 buckets cover 1024 to 2047 (2^11-1)
- * Range 5: 16 TSC each - 128 buckets cover 2048 to 4095 (2^12-1)
- * ...
- * Range 55: 2^54 TSC each - 128 buckets cover 2^61 to 2^62-1
- * Range 56: 2^55 TSC each - 128 buckets cover 2^62 to 2^63-1
- * Range 57: 2^56 TSC each - 128 buckets cover 2^63 to 2^64-1
- *
- * On a 2.3GHz processor, this strategy results in 50ns buckets in the
- * 7-14us range (sweet spot for Intel Optane SSD latency testing).
- *
- * Buckets can be made more granular by increasing BUCKET_SHIFT. This
- * comes at the cost of additional storage per namespace context to
- * store the bucket data.
- */
-#define BUCKET_SHIFT 7
-#define BUCKET_LSB (64 - BUCKET_SHIFT)
-#define NUM_BUCKETS_PER_RANGE (1ULL << BUCKET_SHIFT)
-#define BUCKET_MASK (NUM_BUCKETS_PER_RANGE - 1)
-#define NUM_BUCKET_RANGES (BUCKET_LSB + 1)
-
 static const double g_latency_cutoffs[] = {
 	0.01,
 	0.10,
@@ -157,7 +123,7 @@ struct ns_worker_ctx {
 
 	struct ns_worker_ctx	*next;
 
-	uint64_t		bucket[NUM_BUCKET_RANGES][NUM_BUCKETS_PER_RANGE];
+	struct spdk_histogram_data	histogram;
 };
 
 struct perf_task {
@@ -215,63 +181,6 @@ static int g_aio_optind; /* Index of first AIO filename in argv */
 
 static void task_complete(struct perf_task *task);
 
-static uint32_t
-get_bucket_range(uint64_t tsc)
-{
-	uint32_t clz, range;
-
-	assert(tsc != 0);
-
-	clz = __builtin_clzll(tsc);
-
-	if (clz <= BUCKET_LSB) {
-		range = BUCKET_LSB - clz;
-	} else {
-		range = 0;
-	}
-
-	return range;
-}
-
-static uint32_t
-get_bucket_index(uint64_t tsc, uint32_t range)
-{
-	uint32_t shift;
-
-	if (range == 0) {
-		shift = 0;
-	} else {
-		shift = range - 1;
-	}
-
-	return (tsc >> shift) & BUCKET_MASK;
-}
-
-static double
-get_us_from_bucket(uint32_t range, uint32_t index)
-{
-	uint64_t tsc;
-
-	index += 1;
-	if (range > 0) {
-		tsc = 1ULL << (range + BUCKET_SHIFT - 1);
-		tsc += (uint64_t)index << (range - 1);
-	} else {
-		tsc = index;
-	}
-
-	return (double)tsc * 1000 * 1000 / g_tsc_rate;
-}
-
-static void
-track_latency(struct ns_worker_ctx *ns_ctx, uint64_t tsc)
-{
-	uint32_t range = get_bucket_range(tsc);
-	uint32_t index = get_bucket_index(tsc, range);
-
-	ns_ctx->bucket[range][index]++;
-}
-
 static void
 register_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns)
 {
@@ -607,7 +516,7 @@ task_complete(struct perf_task *task)
 		ns_ctx->max_tsc = tsc_diff;
 	}
 	if (g_latency_sw_tracking_level > 0) {
-		track_latency(ns_ctx, tsc_diff);
+		spdk_histogram_data_tally(&ns_ctx->histogram, tsc_diff);
 	}
 
 	rte_mempool_put(task_pool, task);
@@ -792,6 +701,41 @@ static void usage(char *program_name)
 	printf("\t[-i shared memory group ID]\n");
 }
 
+static void
+check_cutoff(void *ctx, uint64_t start, uint64_t end, uint64_t count,
+	     uint64_t total, uint64_t so_far)
+{
+	double so_far_pct;
+	double **cutoff = ctx;
+
+	if (count == 0) {
+		return;
+	}
+
+	so_far_pct = (double)so_far / total;
+	while (so_far_pct >= **cutoff && **cutoff > 0) {
+		printf("%8.4f%% : %9.3fus\n", **cutoff * 100, (double)end * 1000 * 1000 / g_tsc_rate);
+		(*cutoff)++;
+	}
+}
+
+static void
+print_bucket(void *ctx, uint64_t start, uint64_t end, uint64_t count,
+	     uint64_t total, uint64_t so_far)
+{
+	double so_far_pct;
+
+	if (count == 0) {
+		return;
+	}
+
+	so_far_pct = (double)so_far * 100 / total;
+	printf("%9.3f - %9.3f: %9.4f%% (%9ju)\n",
+	       (double)start * 1000 * 1000 / g_tsc_rate,
+	       (double)end * 1000 * 1000 / g_tsc_rate,
+	       so_far_pct, count);
+}
+
 static void
 print_performance(void)
 {
@@ -858,27 +802,13 @@ print_performance(void)
 	while (worker) {
 		ns_ctx = worker->ns_ctx;
 		while (ns_ctx) {
-			uint64_t i, j, so_far = 0;
-			double so_far_pct = 0, bucket = 0;
 			const double *cutoff = g_latency_cutoffs;
 
 			printf("Summary latency data for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore);
 			printf("=================================================================================\n");
 
-			for (i = 0; i < NUM_BUCKET_RANGES; i++) {
-				for (j = 0; j < NUM_BUCKETS_PER_RANGE; j++) {
-					so_far += ns_ctx->bucket[i][j];
-					so_far_pct = (double)so_far / total_io_completed;
-					bucket = get_us_from_bucket(i, j);
-					if (ns_ctx->bucket[i][j] == 0) {
-						continue;
-					}
-					while (so_far_pct >= *cutoff && *cutoff > 0) {
printf("%8.4f%% : %9.3fus\n", *cutoff * 100, bucket); - cutoff++; - } - } - } + spdk_histogram_data_iterate(&ns_ctx->histogram, check_cutoff, &cutoff); + printf("\n"); ns_ctx = ns_ctx->next; } @@ -893,27 +823,11 @@ print_performance(void) while (worker) { ns_ctx = worker->ns_ctx; while (ns_ctx) { - uint64_t i, j, so_far = 0; - float so_far_pct = 0; - double last_bucket, bucket = 0; - printf("Latency histogram for %-43.43s from core %u:\n", ns_ctx->entry->name, worker->lcore); printf("==============================================================================\n"); printf(" Range in us Cumulative IO count\n"); - for (i = 0; i < NUM_BUCKET_RANGES; i++) { - for (j = 0; j < NUM_BUCKETS_PER_RANGE; j++) { - so_far += ns_ctx->bucket[i][j]; - so_far_pct = (float)so_far * 100 / total_io_completed; - last_bucket = bucket; - bucket = get_us_from_bucket(i, j); - if (ns_ctx->bucket[i][j] == 0) { - continue; - } - printf("%9.3f - %9.3f: %9.4f%% (%9ju)\n", - last_bucket, bucket, so_far_pct, ns_ctx->bucket[i][j]); - } - } + spdk_histogram_data_iterate(&ns_ctx->histogram, print_bucket, NULL); printf("\n"); ns_ctx = ns_ctx->next; } @@ -1376,6 +1290,7 @@ associate_workers_with_ns(void) ns_ctx->min_tsc = UINT64_MAX; ns_ctx->entry = entry; ns_ctx->next = worker->ns_ctx; + spdk_histogram_data_reset(&ns_ctx->histogram); worker->ns_ctx = ns_ctx; worker = worker->next; diff --git a/include/spdk/histogram_data.h b/include/spdk/histogram_data.h new file mode 100644 index 000000000..e302deb73 --- /dev/null +++ b/include/spdk/histogram_data.h @@ -0,0 +1,189 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+/**
+ * \file
+ * Generic histogram library
+ */
+
+#ifndef _SPDK_HISTOGRAM_DATA_H_
+#define _SPDK_HISTOGRAM_DATA_H_
+
+#include "spdk/stdinc.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SPDK_HISTOGRAM_BUCKET_SHIFT 7
+#define SPDK_HISTOGRAM_BUCKET_LSB (64 - SPDK_HISTOGRAM_BUCKET_SHIFT)
+#define SPDK_HISTOGRAM_NUM_BUCKETS_PER_RANGE (1ULL << SPDK_HISTOGRAM_BUCKET_SHIFT)
+#define SPDK_HISTOGRAM_BUCKET_MASK (SPDK_HISTOGRAM_NUM_BUCKETS_PER_RANGE - 1)
+#define SPDK_HISTOGRAM_NUM_BUCKET_RANGES (SPDK_HISTOGRAM_BUCKET_LSB + 1)
+
+/*
+ * SPDK histograms are implemented using ranges of bucket arrays. The most common usage
+ * model is using TSC datapoints to capture an I/O latency histogram. For this usage model,
+ * the histogram tracks only TSC deltas - any translation to microseconds is done by the
+ * histogram user calling spdk_histogram_data_iterate() to iterate over the buckets to perform
+ * the translations.
+ *
+ * Each range has a number of buckets determined by SPDK_HISTOGRAM_NUM_BUCKETS_PER_RANGE
+ * which is 128. The buckets in ranges 0 and 1 each map to one specific datapoint value.
+ * The buckets in subsequent ranges each map to twice as many datapoint values as buckets
+ * in the range before it:
+ *
+ * Range 0:  1 value each  - 128 buckets cover 0 to 127 (2^7-1)
+ * Range 1:  1 value each  - 128 buckets cover 128 to 255 (2^8-1)
+ * Range 2:  2 values each - 128 buckets cover 256 to 511 (2^9-1)
+ * Range 3:  4 values each - 128 buckets cover 512 to 1023 (2^10-1)
+ * Range 4:  8 values each - 128 buckets cover 1024 to 2047 (2^11-1)
+ * Range 5: 16 values each - 128 buckets cover 2048 to 4095 (2^12-1)
+ * ...
+ * Range 55: 2^54 values each - 128 buckets cover 2^61 to 2^62-1
+ * Range 56: 2^55 values each - 128 buckets cover 2^62 to 2^63-1
+ * Range 57: 2^56 values each - 128 buckets cover 2^63 to 2^64-1
+ *
+ * On a 2.3GHz processor, this strategy results in 50ns buckets in the 7-14us range (sweet
+ * spot for Intel Optane SSD latency testing).
+ *
+ * Buckets can be made more granular by increasing SPDK_HISTOGRAM_BUCKET_SHIFT. This
+ * comes at the cost of additional storage per histogram structure to store the bucket data.
+ */
+
+struct spdk_histogram_data {
+
+	uint64_t	bucket[SPDK_HISTOGRAM_NUM_BUCKET_RANGES][SPDK_HISTOGRAM_NUM_BUCKETS_PER_RANGE];
+
+};
+
+static inline void
+spdk_histogram_data_reset(struct spdk_histogram_data *histogram)
+{
+	memset(histogram, 0, sizeof(*histogram));
+}
+
+static inline uint32_t
+__spdk_histogram_data_get_bucket_range(uint64_t datapoint)
+{
+	uint32_t clz, range;
+
+	assert(datapoint != 0);
+
+	clz = __builtin_clzll(datapoint);
+
+	if (clz <= SPDK_HISTOGRAM_BUCKET_LSB) {
+		range = SPDK_HISTOGRAM_BUCKET_LSB - clz;
+	} else {
+		range = 0;
+	}
+
+	return range;
+}
+
+static inline uint32_t
+__spdk_histogram_data_get_bucket_index(uint64_t datapoint, uint32_t range)
+{
+	uint32_t shift;
+
+	if (range == 0) {
+		shift = 0;
+	} else {
+		shift = range - 1;
+	}
+
+	return (datapoint >> shift) & SPDK_HISTOGRAM_BUCKET_MASK;
+}
+
+static inline void
+spdk_histogram_data_tally(struct spdk_histogram_data *histogram, uint64_t datapoint)
+{
+	uint32_t range = __spdk_histogram_data_get_bucket_range(datapoint);
+	uint32_t index = __spdk_histogram_data_get_bucket_index(datapoint, range);
+
+	histogram->bucket[range][index]++;
+}
+
+static inline uint64_t
+__spdk_histogram_data_get_bucket_start(uint32_t range, uint32_t index)
+{
+	uint64_t bucket;
+
+	index += 1;
+	if (range > 0) {
+		bucket = 1ULL << (range + SPDK_HISTOGRAM_BUCKET_SHIFT - 1);
+		bucket += (uint64_t)index << (range - 1);
+	} else {
+		bucket = index;
+	}
+
+	return bucket;
+}
+
+typedef void (*spdk_histogram_data_fn)(void *ctx, uint64_t start, uint64_t end, uint64_t count,
+				       uint64_t total, uint64_t so_far);
+
+static inline void
+spdk_histogram_data_iterate(const struct spdk_histogram_data *histogram,
+			    spdk_histogram_data_fn fn, void *ctx)
+{
+	uint64_t i, j, count, so_far, total;
+	uint64_t bucket, last_bucket;
+
+	total = 0;
+
+	for (i = 0; i < SPDK_HISTOGRAM_NUM_BUCKET_RANGES; i++) {
+		for (j = 0; j < SPDK_HISTOGRAM_NUM_BUCKETS_PER_RANGE; j++) {
+			total += histogram->bucket[i][j];
+		}
+	}
+
+	so_far = 0;
+	bucket = 0;
+
+	for (i = 0; i < SPDK_HISTOGRAM_NUM_BUCKET_RANGES; i++) {
+		for (j = 0; j < SPDK_HISTOGRAM_NUM_BUCKETS_PER_RANGE; j++) {
+			count = histogram->bucket[i][j];
+			so_far += count;
+			last_bucket = bucket;
+			bucket = __spdk_histogram_data_get_bucket_start(i, j);
+			fn(ctx, last_bucket, bucket, count, total, so_far);
+		}
+	}
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
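
Below is a minimal usage sketch (not part of the patch) showing how a consumer other than perf.c might drive the new header. The datapoint values, the hard-coded 2.3 GHz tick rate, and the print_us() callback name are illustrative only; a real application would typically tally TSC deltas and query the environment (e.g. spdk_get_ticks_hz()) for the conversion, much as perf.c uses g_tsc_rate above.

#include <stdio.h>
#include <inttypes.h>
#include "spdk/histogram_data.h"

/* Illustrative tick rate (2.3 GHz) used only for this sketch. */
static const uint64_t g_ticks_hz = 2300000000ULL;

/* Called once per bucket by spdk_histogram_data_iterate(): start/end are the
 * bucket bounds in ticks, count is this bucket's tally, so_far is the
 * cumulative count including this bucket, total is the sum over all buckets. */
static void
print_us(void *ctx, uint64_t start, uint64_t end, uint64_t count,
	 uint64_t total, uint64_t so_far)
{
	(void)ctx;

	if (count == 0) {
		return;
	}

	printf("%9.3f - %9.3fus: %" PRIu64 " (%.2f%% cumulative)\n",
	       (double)start * 1000 * 1000 / g_ticks_hz,
	       (double)end * 1000 * 1000 / g_ticks_hz,
	       count, (double)so_far * 100 / total);
}

int
main(void)
{
	/* Hypothetical TSC deltas; in perf.c these come from completed I/Os. */
	uint64_t deltas[] = { 15000, 16100, 23000, 150000 };
	struct spdk_histogram_data h;
	size_t i;

	spdk_histogram_data_reset(&h);
	for (i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++) {
		spdk_histogram_data_tally(&h, deltas[i]);
	}
	spdk_histogram_data_iterate(&h, print_us, NULL);

	return 0;
}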