/*-
* BSD LICENSE
*
* Copyright (c) Intel Corporation. All rights reserved.
* Copyright (c) 2021 Mellanox Technologies LTD. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __NVME_PCIE_INTERNAL_H__
#define __NVME_PCIE_INTERNAL_H__
/*
* Number of completion queue entries to process before ringing the
* completion queue doorbell.
*/
#define NVME_MIN_COMPLETIONS (1)
#define NVME_MAX_COMPLETIONS (128)
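
/*
 * A sketch of how max_completions_cap might be clamped to these bounds when a
 * queue pair is constructed (illustrative, not the verbatim driver code;
 * num_entries is assumed to be the queue depth):
 *
 *	pqpair->max_completions_cap = pqpair->num_entries / 4;
 *	pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap,
 *					       NVME_MIN_COMPLETIONS);
 *	pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap,
 *					       NVME_MAX_COMPLETIONS);
 */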
/*
* NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL
* segment.
*/
#define NVME_MAX_SGL_DESCRIPTORS (250)
#define NVME_MAX_PRP_LIST_ENTRIES (503)
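
/*
 * Rough capacity math for the PRP limit: with 4 KiB memory pages, a full PRP
 * list maps NVME_MAX_PRP_LIST_ENTRIES * 4096 = 503 * 4096 = 2,060,288 bytes,
 * so a single request tops out just under 2 MiB. Illustratively (page_size is
 * an assumed variable, not a field defined in this header):
 *
 *	uint32_t max_xfer_size = NVME_MAX_PRP_LIST_ENTRIES * page_size;
 */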
/* PCIe transport extensions for spdk_nvme_ctrlr */
struct nvme_pcie_ctrlr {
	struct spdk_nvme_ctrlr ctrlr;

	/** NVMe MMIO register space */
	volatile struct spdk_nvme_registers *regs;

	/** NVMe MMIO register size */
	uint64_t regs_size;

	struct {
		/* BAR mapping address which contains controller memory buffer */
		void *bar_va;

		/* BAR physical address which contains controller memory buffer */
		uint64_t bar_pa;

		/* Controller memory buffer size in Bytes */
		uint64_t size;

		/* Current offset of controller memory buffer, relative to start of BAR virt addr */
		uint64_t current_offset;

		void *mem_register_addr;
		size_t mem_register_size;
	} cmb;

	struct {
		/* BAR mapping address which contains persistent memory region */
		void *bar_va;

		/* BAR physical address which contains persistent memory region */
		uint64_t bar_pa;

		/* Persistent memory region size in Bytes */
		uint64_t size;

		void *mem_register_addr;
		size_t mem_register_size;
	} pmr;

	/** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */
	uint32_t doorbell_stride_u32;

	/* Opaque handle to associated PCI device. */
	struct spdk_pci_device *devhandle;

	/* Flag to indicate the MMIO register has been remapped */
	bool is_remapped;

	volatile uint32_t *doorbell_base;
};
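
/*
 * How doorbell_stride_u32 is typically used (a sketch based on the NVMe spec,
 * not necessarily the exact driver code): CAP.DSTRD encodes a stride of
 * (4 << DSTRD) bytes between doorbell registers, so doorbell_stride_u32 is
 * 1 << DSTRD in uint32_t units, and the doorbells for queue pair `qid` sit at:
 *
 *	volatile uint32_t *sq_tdbl = pctrlr->doorbell_base +
 *				     (2 * qid + 0) * pctrlr->doorbell_stride_u32;
 *	volatile uint32_t *cq_hdbl = pctrlr->doorbell_base +
 *				     (2 * qid + 1) * pctrlr->doorbell_stride_u32;
 */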

/*
 * Set around each MMIO access so that a fault handler can tell which
 * controller's register mapping triggered a bus error (e.g. after a hot
 * remove) and remap it.
 */
extern __thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr;

struct nvme_tracker {
	TAILQ_ENTRY(nvme_tracker) tq_list;

	struct nvme_request *req;
	uint16_t cid;

	/*
	 * Set when a vtophys translation fails in the submission path. The
	 * submit call still returns 0; the request is then failed from the
	 * next process_completions pass, so the caller sees either an error
	 * return or a completion callback, never both.
	 */
	uint16_t bad_vtophys : 1;
	uint16_t rsvd0 : 15;
	uint32_t rsvd1;

	spdk_nvme_cmd_cb cb_fn;
	void *cb_arg;

	uint64_t prp_sgl_bus_addr;

	/* Don't move, metadata SGL is always contiguous with Data Block SGL */
	struct spdk_nvme_sgl_descriptor meta_sgl;

	union {
		uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES];
		struct spdk_nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS];
	} u;
};

/*
* struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary
* and so that there is no padding required to meet alignment requirements.
*/
SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K");
SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned");
SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, meta_sgl) & 7) == 0, "SGL must be Qword aligned");
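
/*
 * Worked layout check (a sketch assuming a typical LP64 ABI): tq_list (16) +
 * req (8) + cid plus bitfields (4) + rsvd1 (4) + cb_fn (8) + cb_arg (8) +
 * prp_sgl_bus_addr (8) + meta_sgl (16) = 72 bytes, leaving 4096 - 72 = 4024
 * bytes for the union. prp[] fills that exactly (503 * 8 = 4024) and sgl[]
 * fits inside it (250 * 16 = 4000), which is why the two constants are such
 * odd-looking numbers. For example:
 *
 *	SPDK_STATIC_ASSERT(offsetof(struct nvme_tracker, u) == 72, "layout");
 */
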
struct nvme_pcie_poll_group {
	struct spdk_nvme_transport_poll_group group;
	struct spdk_nvme_pcie_stat stats;
};

enum nvme_pcie_qpair_state {
	NVME_PCIE_QPAIR_WAIT_FOR_CQ = 1,
	NVME_PCIE_QPAIR_WAIT_FOR_SQ,
	NVME_PCIE_QPAIR_READY,
	NVME_PCIE_QPAIR_FAILED,
};

/* PCIe transport extensions for spdk_nvme_qpair */
struct nvme_pcie_qpair {
	/* Submission queue tail doorbell */
	volatile uint32_t *sq_tdbl;

	/* Completion queue head doorbell */
	volatile uint32_t *cq_hdbl;

	/* Submission queue */
	struct spdk_nvme_cmd *cmd;

	/* Completion queue */
	struct spdk_nvme_cpl *cpl;

	TAILQ_HEAD(, nvme_tracker) free_tr;
	TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr;

	/* Array of trackers indexed by command ID. */
	struct nvme_tracker *tr;

	struct spdk_nvme_pcie_stat *stat;

	uint16_t num_entries;
	uint8_t pcie_state;
	uint8_t retry_count;
	uint16_t max_completions_cap;

	uint16_t last_sq_tail;
	uint16_t sq_tail;
	uint16_t cq_head;
	uint16_t sq_head;

	struct {
		uint8_t phase : 1;
		uint8_t delay_cmd_submit : 1;
		uint8_t has_shadow_doorbell : 1;
		/*
		 * Set when any outstanding tracker hit a vtophys failure in the
		 * submission path. The flag lives on this hot cache line so the
		 * check is cheap; outstanding_tr is only walked for failed
		 * trackers while it is set.
		 */
		uint8_t has_pending_vtophys_failures : 1;
		uint8_t defer_destruction : 1;
	} flags;

	/*
	 * Base qpair structure.
	 * This is located after the hot data in this structure so that the important parts of
	 * nvme_pcie_qpair are in the same cache line.
	 */
	struct spdk_nvme_qpair qpair;

	struct {
		/* Submission queue shadow tail doorbell */
		volatile uint32_t *sq_tdbl;

		/* Completion queue shadow head doorbell */
		volatile uint32_t *cq_hdbl;

		/* Submission queue event index */
		volatile uint32_t *sq_eventidx;

		/* Completion queue event index */
		volatile uint32_t *cq_eventidx;
	} shadow_doorbell;

	/*
	 * Fields below this point should not be touched on the normal I/O path.
	 */
	bool sq_in_cmb;
	bool shared_stats;

	uint64_t cmd_bus_addr;
	uint64_t cpl_bus_addr;

	struct spdk_nvme_cmd *sq_vaddr;
	struct spdk_nvme_cpl *cq_vaddr;
};

static inline struct nvme_pcie_qpair *
nvme_pcie_qpair(struct spdk_nvme_qpair *qpair)
{
	return SPDK_CONTAINEROF(qpair, struct nvme_pcie_qpair, qpair);
}

static inline struct nvme_pcie_ctrlr *
nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
{
	return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr);
}
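
/*
 * SPDK_CONTAINEROF recovers the transport wrapper by subtracting the member
 * offset from the member pointer; nvme_pcie_qpair() above is equivalent to
 * (illustrative expansion, not the macro's literal definition):
 *
 *	(struct nvme_pcie_qpair *)((uintptr_t)qpair -
 *				   offsetof(struct nvme_pcie_qpair, qpair));
 */
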
static inline int
nvme_pcie_qpair_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
{
	return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old);
}
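
/*
 * The subtractions are mod 2^16, so the "did new_idx step past event_idx"
 * test also works across wraparound. Illustrative checks:
 *
 *	assert(nvme_pcie_qpair_need_event(3, 5, 2));	// (5-3)=2 <= (5-2)=3
 *	assert(!nvme_pcie_qpair_need_event(7, 5, 2));	// (5-7)=65534 > 3
 *	assert(nvme_pcie_qpair_need_event(65535, 1, 65534));	// wraps: 2 <= 3
 */
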
static inline bool
nvme_pcie_qpair_update_mmio_required(struct spdk_nvme_qpair *qpair, uint16_t value,
				     volatile uint32_t *shadow_db,
				     volatile uint32_t *eventidx)
{
	uint16_t old;

	if (!shadow_db) {
		return true;
	}

	spdk_wmb();
	old = *shadow_db;
	*shadow_db = value;

	/*
	 * Ensure that the doorbell is updated before reading the EventIdx from
	 * memory.
	 */
	spdk_mb();

	if (!nvme_pcie_qpair_need_event(*eventidx, value, old)) {
		return false;
	}

	return true;
}

static inline void
nvme_pcie_qpair_ring_sq_doorbell(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
	bool need_mmio = true;

	if (qpair->first_fused_submitted) {
		/* This is first cmd of two fused commands - don't ring doorbell */
		qpair->first_fused_submitted = 0;
		return;
	}

	if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) {
		need_mmio = nvme_pcie_qpair_update_mmio_required(qpair,
				pqpair->sq_tail,
				pqpair->shadow_doorbell.sq_tdbl,
				pqpair->shadow_doorbell.sq_eventidx);
	}

	if (spdk_likely(need_mmio)) {
		spdk_wmb();
		pqpair->stat->sq_doorbell_updates++;
		g_thread_mmio_ctrlr = pctrlr;
		spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail);
		g_thread_mmio_ctrlr = NULL;
	}
}

static inline void
nvme_pcie_qpair_ring_cq_doorbell(struct spdk_nvme_qpair *qpair)
{
	struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
	struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
	bool need_mmio = true;

	if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) {
		need_mmio = nvme_pcie_qpair_update_mmio_required(qpair,
				pqpair->cq_head,
				pqpair->shadow_doorbell.cq_hdbl,
				pqpair->shadow_doorbell.cq_eventidx);
	}

	if (spdk_likely(need_mmio)) {
		pqpair->stat->cq_doorbell_updates++;
		g_thread_mmio_ctrlr = pctrlr;
		spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head);
		g_thread_mmio_ctrlr = NULL;
	}
}

int nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair);
int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair,
			      const struct spdk_nvme_io_qpair_opts *opts);
int nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t num_entries);
void nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair,
		struct nvme_request *req, struct spdk_nvme_cpl *cpl);
void nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair);
int nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr,
				     struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn,
				     void *cb_arg);
int nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr,
				     struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg);
int nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
				     spdk_nvme_cmd_cb cb_fn, void *cb_arg);
int nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
				     spdk_nvme_cmd_cb cb_fn, void *cb_arg);
int nvme_pcie_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair);
void nvme_pcie_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair);
void nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr);
void nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair,
		struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
		bool print_on_error);
void nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
				      struct spdk_nvme_cpl *cpl, bool print_on_error);
void nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr);
void nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair);
void nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair);
void nvme_pcie_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);
int32_t nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair,
		uint32_t max_completions);
int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair);
struct spdk_nvme_qpair *nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
		const struct spdk_nvme_io_qpair_opts *opts);
int nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair);
int nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req);

struct spdk_nvme_transport_poll_group *nvme_pcie_poll_group_create(void);
int nvme_pcie_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair);
int nvme_pcie_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair);
int nvme_pcie_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup,
			     struct spdk_nvme_qpair *qpair);
int nvme_pcie_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup,
				struct spdk_nvme_qpair *qpair);
int64_t nvme_pcie_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup,
		uint32_t completions_per_qpair,
		spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb);
int nvme_pcie_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup);

#endif /* __NVME_PCIE_INTERNAL_H__ */