From b4f302e5257cfe2a6ffc3173dbb83b60e9b82edc Mon Sep 17 00:00:00 2001 From: paul luse Date: Sat, 5 Jun 2021 10:18:11 -0400 Subject: [PATCH] lib/idxd: rotate portal offset with each submission Allows for better performance by not hitting the same portal address with every submission. Signed-off-by: paul luse Signed-off-by: Ziye Yang Change-Id: I1ec8eae6f3acec9e98161029cd5406ec08603aa6 Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/8190 Tested-by: SPDK CI Jenkins Reviewed-by: Ben Walker Reviewed-by: Jim Harris Community-CI: Broadcom CI Community-CI: Mellanox Build Bot --- lib/idxd/idxd.c | 35 +++++++++++++++++++++++------------ lib/idxd/idxd.h | 2 ++ lib/idxd/idxd_kernel.c | 4 ++++ lib/idxd/idxd_spec.h | 5 ++++- lib/idxd/idxd_user.c | 6 +++++- 5 files changed, 38 insertions(+), 14 deletions(-) diff --git a/lib/idxd/idxd.c b/lib/idxd/idxd.c index 1589a9194..6957c578c 100644 --- a/lib/idxd/idxd.c +++ b/lib/idxd/idxd.c @@ -47,7 +47,6 @@ #define ALIGN_4K 0x1000 #define USERSPACE_DRIVER_NAME "user" #define KERNEL_DRIVER_NAME "kernel" -#define CHAN_PER_DEVICE(total_wq_size) ((total_wq_size >= 128) ? 8 : 4) /* * Need to limit how many completions we reap in one poller to avoid starving * other threads as callers can submit new operations on the polling thread. @@ -82,6 +81,15 @@ struct device_config g_dev_cfg1 = { .total_engines = 4, }; +static inline void +_submit_to_hw(struct spdk_idxd_io_channel *chan, struct idxd_hw_desc *desc) +{ + movdir64b(chan->portal + chan->portal_offset, desc); + chan->portal_offset = (chan->portal_offset + chan->idxd->chan_per_device * PORTAL_STRIDE) & + PORTAL_MASK; + +} + struct spdk_idxd_io_channel * spdk_idxd_get_channel(struct spdk_idxd_device *idxd) { @@ -103,13 +111,17 @@ spdk_idxd_get_channel(struct spdk_idxd_device *idxd) } pthread_mutex_lock(&idxd->num_channels_lock); - if (idxd->num_channels == CHAN_PER_DEVICE(idxd->total_wq_size)) { + if (idxd->num_channels == idxd->chan_per_device) { /* too many channels sharing this device */ pthread_mutex_unlock(&idxd->num_channels_lock); free(chan->batch_base); free(chan); return NULL; } + + /* Have each channel start at a different offset. */ + chan->portal_offset = (idxd->num_channels * PORTAL_STRIDE) & PORTAL_MASK; + idxd->num_channels++; pthread_mutex_unlock(&idxd->num_channels_lock); @@ -153,7 +165,7 @@ spdk_idxd_put_channel(struct spdk_idxd_io_channel *chan) int spdk_idxd_chan_get_max_operations(struct spdk_idxd_io_channel *chan) { - return chan->idxd->total_wq_size / CHAN_PER_DEVICE(chan->idxd->total_wq_size); + return chan->idxd->total_wq_size / chan->idxd->chan_per_device; } int @@ -168,8 +180,7 @@ spdk_idxd_configure_chan(struct spdk_idxd_io_channel *chan) chan->idxd->wq_id = 0; } - num_ring_slots = chan->idxd->queues[chan->idxd->wq_id].wqcfg.wq_size / CHAN_PER_DEVICE( - chan->idxd->total_wq_size); + num_ring_slots = chan->idxd->queues[chan->idxd->wq_id].wqcfg.wq_size / chan->idxd->chan_per_device; chan->ring_slots = spdk_bit_array_create(num_ring_slots); if (chan->ring_slots == NULL) { @@ -419,7 +430,7 @@ spdk_idxd_submit_copy(struct spdk_idxd_io_channel *chan, void *dst, const void * desc->flags |= IDXD_FLAG_CACHE_CONTROL; /* direct IO to CPU cache instead of mem */ /* Submit operation. */ - movdir64b(chan->portal, desc); + _submit_to_hw(chan, desc); return 0; } @@ -469,7 +480,7 @@ spdk_idxd_submit_dualcast(struct spdk_idxd_io_channel *chan, void *dst1, void *d desc->flags |= IDXD_FLAG_CACHE_CONTROL; /* direct IO to CPU cache instead of mem */ /* Submit operation. */ - movdir64b(chan->portal, desc); + _submit_to_hw(chan, desc); return 0; } @@ -506,7 +517,7 @@ spdk_idxd_submit_compare(struct spdk_idxd_io_channel *chan, void *src1, const vo desc->xfer_size = nbytes; /* Submit operation. */ - movdir64b(chan->portal, desc); + _submit_to_hw(chan, desc); return 0; } @@ -539,7 +550,7 @@ spdk_idxd_submit_fill(struct spdk_idxd_io_channel *chan, void *dst, uint64_t fil desc->flags |= IDXD_FLAG_CACHE_CONTROL; /* direct IO to CPU cache instead of mem */ /* Submit operation. */ - movdir64b(chan->portal, desc); + _submit_to_hw(chan, desc); return 0; } @@ -575,7 +586,7 @@ spdk_idxd_submit_crc32c(struct spdk_idxd_io_channel *chan, uint32_t *crc_dst, vo comp->crc_dst = crc_dst; /* Submit operation. */ - movdir64b(chan->portal, desc); + _submit_to_hw(chan, desc); return 0; } @@ -616,7 +627,7 @@ spdk_idxd_submit_copy_crc32c(struct spdk_idxd_io_channel *chan, void *dst, void comp->crc_dst = crc_dst; /* Submit operation. */ - movdir64b(chan->portal, desc); + _submit_to_hw(chan, desc); return 0; } @@ -742,7 +753,7 @@ spdk_idxd_batch_submit(struct spdk_idxd_io_channel *chan, struct idxd_batch *bat batch->remaining++; /* Submit operation. */ - movdir64b(chan->portal, desc); + _submit_to_hw(chan, desc); SPDK_DEBUGLOG(idxd, "Submitted batch %p\n", batch); return 0; diff --git a/lib/idxd/idxd.h b/lib/idxd/idxd.h index 3bf18c5b2..276018277 100644 --- a/lib/idxd/idxd.h +++ b/lib/idxd/idxd.h @@ -102,6 +102,7 @@ struct spdk_idxd_io_channel { struct spdk_idxd_device *idxd; /* The portal is the address that we write descriptors to for submission. */ void *portal; + uint32_t portal_offset; uint16_t ring_size; /* @@ -191,6 +192,7 @@ struct spdk_idxd_device { int wq_id; uint32_t num_channels; uint32_t total_wq_size; + uint32_t chan_per_device; pthread_mutex_t num_channels_lock; struct idxd_group *groups; diff --git a/lib/idxd/idxd_kernel.c b/lib/idxd/idxd_kernel.c index 9809907cf..f91e01e0f 100644 --- a/lib/idxd/idxd_kernel.c +++ b/lib/idxd/idxd_kernel.c @@ -210,6 +210,10 @@ config_wqs(struct spdk_kernel_idxd_device *kernel_idxd, return -1; } + /* Spread the channels we allow per device based on the total number of WQE to try + * and achieve optimal performance for common cases. + */ + kernel_idxd->idxd.chan_per_device = (kernel_idxd->idxd.total_wq_size >= 128) ? 8 : 4; return 0; } diff --git a/lib/idxd/idxd_spec.h b/lib/idxd/idxd_spec.h index 5b6980123..36b41a894 100644 --- a/lib/idxd/idxd_spec.h +++ b/lib/idxd/idxd_spec.h @@ -48,7 +48,10 @@ extern "C" { #define IDXD_MMIO_BAR 0 #define IDXD_WQ_BAR 2 -#define PORTAL_SIZE (4096 * 4) +#define PORTAL_SIZE 0x1000 +#define WQ_TOTAL_PORTAL_SIZE (PORTAL_SIZE * 4) +#define PORTAL_STRIDE 0x40 +#define PORTAL_MASK (PORTAL_SIZE - 1) #define CFG_ENGINE_OFFSET 0x20 #define CFG_FLAG_OFFSET 0x28 diff --git a/lib/idxd/idxd_user.c b/lib/idxd/idxd_user.c index 29f7a6f6c..4b624bbae 100644 --- a/lib/idxd/idxd_user.c +++ b/lib/idxd/idxd_user.c @@ -274,6 +274,10 @@ idxd_wq_config(struct spdk_user_idxd_device *user_idxd) assert(LOG2_WQ_MAX_XFER <= user_idxd->registers.gencap.max_xfer_shift); idxd->total_wq_size = user_idxd->registers.wqcap.total_wq_size; + /* Spread the channels we allow per device based on the total number of WQE to try + * and achieve optimal performance for common cases. + */ + idxd->chan_per_device = (idxd->total_wq_size >= 128) ? 8 : 4; idxd->queues = calloc(1, user_idxd->registers.wqcap.num_wqs * sizeof(struct idxd_wq)); if (idxd->queues == NULL) { SPDK_ERRLOG("Failed to allocate queue memory\n"); @@ -519,7 +523,7 @@ user_idxd_dump_sw_err(struct spdk_idxd_device *idxd, void *portal) static char * user_idxd_portal_get_addr(struct spdk_idxd_device *idxd) { - return (char *)idxd->portals + idxd->wq_id * PORTAL_SIZE; + return (char *)idxd->portals + idxd->wq_id * WQ_TOTAL_PORTAL_SIZE; } static bool