Compare commits

71 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 3f5e32adca | |
| | 089585c8d7 | |
| | fe3a2c4dcd | |
| | 13cfc610d0 | |
| | 6d8f66269d | |
| | d1a00ccd13 | |
| | 9de25dc80d | |
| | 4d21fba0c5 | |
| | e1c4f011e1 | |
| | c58f0e9117 | |
| | 05b978da0c | |
| | e5c6a69ed5 | |
| | 010e9a7338 | |
| | b335ab4765 | |
| | d145d67c6b | |
| | e11c4afaad | |
| | dc3f8f8c58 | |
| | 80c98d80b6 | |
| | 40b4273a14 | |
| | cf0d953044 | |
| | 85d6682dd4 | |
| | 83bbb8bb4b | |
| | c2ed724e3b | |
| | a6f10a33cb | |
| | c828d09d3a | |
| | 90c60fc372 | |
| | a5879f56f4 | |
| | 9f6a6b1942 | |
| | 40e461cbb7 | |
| | 6a365c0811 | |
| | cad2095077 | |
| | bcbf6e8483 | |
| | 8d31df3061 | |
| | a629b17d51 | |
| | d2e533c642 | |
| | 10cb21522a | |
| | 4d4c3fe813 | |
| | e0c1093936 | |
| | b0cacd460d | |
| | ecad1d2cbc | |
| | 17660fa741 | |
| | e13c1ffbc3 | |
| | 0395d29bf4 | |
| | 792b36e898 | |
| | 6a2de254d6 | |
| | a9533f4083 | |
| | f882a577d4 | |
| | e22df3fbcf | |
| | d07cc7d35d | |
| | 6481d80514 | |
| | 8befeab1b4 | |
| | d45c6e54ae | |
| | bf881b09a7 | |
| | 1e0e636351 | |
| | 0640d3fca5 | |
| | 46dd96c2f0 | |
| | 8529ceadfa | |
| | 6dcace0744 | |
| | 37ad7fd3b8 | |
| | a8dd54792c | |
| | 14d4c7f06d | |
| | 5c50e8e1b5 | |
| | 18b8ef97ac | |
| | 1bf4f98311 | |
| | 29ae45877a | |
| | 0168d9bc9d | |
| | 7eda85292a | |
| | b1be663bfb | |
| | d3dbb9c7cf | |
| | 898bad7d0c | |
| | 2f87aada01 | |
@@ -1,7 +1,4 @@
#!/bin/sh
# SPDX-License-Identifier: BSD-3-Clause
# All rights reserved.

#
# Verify what is about to be committed.
# Called by "git commit" with no arguments. The hook should
@@ -1,7 +1,4 @@
#!/bin/sh
# SPDX-License-Identifier: BSD-3-Clause
# All rights reserved.

# Verify what is about to be pushed. Called by "git
# push" after it has checked the remote status, but before anything has been
# pushed. If this script exits with a non-zero status nothing will be pushed.
@@ -23,17 +20,16 @@ SYSTEM=`uname -s`
exec 1>&2

if [ "$SYSTEM" = "FreeBSD" ]; then
	MAKE="gmake MAKE=gmake -j $(sysctl -a | grep -E -i 'hw.ncpu' | awk '{print $2}')"
	MAKE="gmake MAKE=gmake -j ${nproc}"
	COMP="clang"
else
	MAKE="make -j $(nproc)"
	MAKE="make -j ${nproc}"
	COMP="gcc"
fi

echo "Running make with $COMP ..."
echo "${MAKE} clean " > make.log
$MAKE clean >> make.log 2>&1

echo "${MAKE} CONFIG_DEBUG=n CONFIG_WERROR=y " >> make.log
$MAKE CONFIG_DEBUG=n CONFIG_WERROR=y >> make.log 2>&1
rc=$?
@@ -79,6 +75,64 @@ fi
echo "$MAKE clean " >> make.log
$MAKE clean >> make.log 2>&1

if [ "$SYSTEM" = "FreeBSD" ]; then
	echo
	echo "Pushing to $1 $2"
	exit $rc
fi

if ! hash clang 2>/dev/null; then
	echo "clang not found; skipping the clang tests"
	echo
	echo "Pushing to $1 $2"
	exit $rc
fi

echo "Running make with clang ..."
echo "make CONFIG_DEBUG=n CONFIG_WERROR=y CC=clang CXX=clang++ " >> make.log
$MAKE CONFIG_DEBUG=n CONFIG_WERROR=y CC=clang CXX=clang++ >> make.log 2>&1
rc=$?
if [ $rc -ne 0 ]; then
	tail -20 make.log
	echo ""
	echo "ERROR make CC=clang CXX=clang++ returned errors!"
	echo "ERROR Fix the problem and use 'git commit' to update your changes."
	echo "ERROR See `pwd`/make.log for more information."
	echo ""
	exit $rc
fi

echo "make clean CC=clang CXX=clang++ SKIP_DPDK_BUILD=1 " >> make.log
$MAKE clean CC=clang CXX=clang++ SKIP_DPDK_BUILD=1 >> make.log 2>&1
echo "make CONFIG_DEBUG=y CONFIG_WERROR=y CC=clang CXX=clang++ SKIP_DPDK_BUILD=1 " >> make.log
$MAKE CONFIG_DEBUG=y CONFIG_WERROR=y CC=clang CXX=clang++ SKIP_DPDK_BUILD=1 >> make.log 2>&1
rc=$?
if [ $rc -ne 0 ]; then
	tail -20 make.log
	echo ""
	echo "ERROR make CC=clang CXX=clang++ returned errors!"
	echo "ERROR Fix the problem and use 'git commit' to update your changes."
	echo "ERROR See `pwd`/make.log for more information."
	echo ""
	exit $rc
fi

echo "Running unittest.sh ..."
echo "./test/unit/unittest.sh" >> make.log
"./test/unit/unittest.sh" >> make.log 2>&1
rc=$?
if [ $rc -ne 0 ]; then
	tail -20 make.log
	echo ""
	echo "ERROR unittest returned errors!"
	echo "ERROR Fix the problem and use 'git commit' to update your changes."
	echo "ERROR See `pwd`/make.log for more information."
	echo ""
	exit $rc
fi

${MAKE} clean CC=clang CXX=clang++ 2> /dev/null

echo "Pushing to $1 $2"

exit $rc
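The two hunks above patch client-side git hooks: a pre-commit hook ("Verify what is about to be committed") and a pre-push hook that rebuilds the tree with gcc and, when clang is available, with clang, runs `test/unit/unittest.sh`, and blocks the push on any non-zero exit. A minimal sketch of wiring such hooks up locally, assuming they live in a `.githooks/` directory at the repository root (the actual file paths are not shown in this excerpt):

```sh
# Tell git to read hooks from the in-tree directory (git >= 2.9).
# The directory name here is an assumption; adjust to where the repo keeps them.
git config core.hooksPath .githooks

# Or, on older git, symlink the scripts into the default hook location:
ln -sf ../../.githooks/pre-push .git/hooks/pre-push
```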
8  .github/ISSUE_TEMPLATE/config.yml  vendored
@@ -1,8 +0,0 @@
blank_issues_enabled: false
contact_links:
  - name: SPDK Community
    url: https://spdk.io/community/
    about: Please ask and answer questions here.
  - name: SPDK Common Vulnerabilities and Exposures (CVE) Process
    url: https://spdk.io/cve_threat/
    about: Please follow CVE process to responsibly disclose security vulnerabilities.
25  .github/ISSUE_TEMPLATE/intermittent_failure.md  vendored
@@ -1,25 +0,0 @@
---
name: CI Intermittent Failure
about: Create a report with CI failure unrelated to the patch tested.
title: '[test_name] Failure description'
labels: 'Intermittent Failure'
assignees: ''

---

# CI Intermittent Failure

<!--- Provide a [test_name] where the issue occurred and brief description in the Title above. -->
<!--- Name of the test can be found by last occurrence of: -->
<!--- ************************************ -->
<!--- START TEST [test_name] -->
<!--- ************************************ -->

## Link to the failed CI build

<!--- Please provide a link to the failed CI build -->

## Execution failed at

<!--- Please provide the first failure in the test. Pointed to by the first occurrence of: -->
<!--- ========== Backtrace start: ========== -->
11  .github/dependabot.yml  vendored
@@ -1,11 +0,0 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates

version: 2
updates:
  - package-ecosystem: "" # See documentation for possible values
    directory: "/" # Location of package manifests
    schedule:
      interval: "weekly"
10  .github/mistaken-pull-closer.yml  vendored
@@ -1,10 +0,0 @@
filters:
  - true

commentBody: |
  Thanks for your contribution! Unfortunately, we don't use GitHub pull
  requests to manage code contributions to this repository. Instead, please
  see https://spdk.io/development which provides instructions on how to
  submit patches to the SPDK Gerrit instance.

addLabel: false
12  .gitignore  vendored
@@ -2,23 +2,17 @@
*.a
*.cmd
*.d
*.dll
*.exe
*.gcda
*.gcno
*.kdev4
*.ko
*.lib
*.log
*.o
*.obj
*.pdb
*.pyc
*.so
*.so.*
*.swp
*.DS_Store
build/
ut_coverage/
tags
cscope.out
@@ -31,12 +25,6 @@ CONFIG.local
.project
.cproject
.settings
.gitreview
mk/cc.mk
mk/config.mk
mk/cc.flags.mk
PYTHON_COMMAND
test_completions.txt
timing.txt
test/common/build_config.sh
.coredump_path
12  .gitmodules  vendored
@@ -7,15 +7,3 @@
[submodule "isa-l"]
	path = isa-l
	url = https://github.com/spdk/isa-l.git
[submodule "ocf"]
	path = ocf
	url = https://github.com/Open-CAS/ocf.git
[submodule "libvfio-user"]
	path = libvfio-user
	url = https://github.com/nutanix/libvfio-user.git
[submodule "xnvme"]
	path = xnvme
	url = https://github.com/OpenMPDK/xNVMe.git
[submodule "isa-l-crypto"]
	path = isa-l-crypto
	url = https://github.com/intel/isa-l_crypto
37  .travis.yml  Normal file
@@ -0,0 +1,37 @@
language: c

compiler:
  - gcc
  - clang

dist: trusty
sudo: false

addons:
  apt:
    packages:
      - libcunit1-dev
      - libaio-dev
      - libssl-dev
      - uuid-dev
      - libnuma-dev

before_script:
  - git submodule update --init
  - export MAKEFLAGS="-j$(nproc)"

script:
  - ./scripts/check_format.sh
  - ./configure --enable-werror
  - make
  - ./test/unit/unittest.sh

notifications:
  irc:
    channels:
      - "chat.freenode.net#spdk"
    template:
      - "(%{repository_name}/%{branch}) %{commit_subject} (%{author})"
      - "Diff URL: %{compare_url}"
    on_success: always
    on_failure: always
3109  CHANGELOG.md
File diff suppressed because it is too large
@@ -1,130 +0,0 @@
# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
  community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or advances of
  any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
  without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

SPDK core [maintainers](https://spdk.io/development/) are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

SPDK core maintainers have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported privately to any of the SPDK core maintainers. All complaints will be
reviewed and investigated promptly and fairly.

All SPDK core maintainers are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

SPDK core maintainers will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from SPDK core maintainers, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series of
actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within the
community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].

Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].

For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
[https://www.contributor-covenant.org/translations][translations].

[homepage]: https://www.contributor-covenant.org
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
188  CONFIG
@@ -1,39 +1,51 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2015 Intel Corporation.
# All rights reserved.
# Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2022 Dell Inc, or its subsidiaries.
#

# configure options: __CONFIGURE_OPTIONS__
# BSD LICENSE
#
# Copyright (c) Intel Corporation.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#   * Neither the name of Intel Corporation nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

# Installation prefix
CONFIG_PREFIX="/usr/local"

# Target architecture
CONFIG_ARCH=native

# Destination directory for the libraries
CONFIG_LIBDIR=

# Prefix for cross compilation
CONFIG_CROSS_PREFIX=

# Build with debug logging. Turn off for performance testing and normal usage
CONFIG_DEBUG=n

# Build with support of backtrace printing in log messages. Requires libunwind.
CONFIG_LOG_BACKTRACE=n

# Treat warnings as errors (fail the build on any warning).
CONFIG_WERROR=n

# Build with link-time optimization.
CONFIG_LTO=n

# Generate profile guided optimization data.
CONFIG_PGO_CAPTURE=n

# Use profile guided optimization data.
CONFIG_PGO_USE=n

# Build with code coverage instrumentation.
CONFIG_COVERAGE=n

@@ -43,28 +55,12 @@ CONFIG_ASAN=n
# Build with Undefined Behavior Sanitizer enabled
CONFIG_UBSAN=n

# Build with LLVM fuzzing enabled
CONFIG_FUZZER=n
CONFIG_FUZZER_LIB=

# Build with Thread Sanitizer enabled
CONFIG_TSAN=n

# Build functional tests
# Build tests
CONFIG_TESTS=y

# Build unit tests
CONFIG_UNIT_TESTS=y

# Build examples
CONFIG_EXAMPLES=y

# Build apps
CONFIG_APPS=y

# Build with Control-flow Enforcement Technology (CET)
CONFIG_CET=n

# Directory that contains the desired SPDK environment library.
# By default, this is implemented using DPDK.
CONFIG_ENV=
@@ -72,13 +68,6 @@ CONFIG_ENV=
# This directory should contain 'include' and 'lib' directories for your DPDK
# installation.
CONFIG_DPDK_DIR=
# Automatically set via pkg-config when bare --with-dpdk is set
CONFIG_DPDK_LIB_DIR=
CONFIG_DPDK_INC_DIR=
CONFIG_DPDK_PKG_CONFIG=n

# This directory should contain 'include' and 'lib' directories for WPDK.
CONFIG_WPDK_DIR=

# Build SPDK FIO plugin. Requires CONFIG_FIO_SOURCE_DIR set to a valid
# fio source code directory.
@@ -92,54 +81,27 @@ CONFIG_FIO_SOURCE_DIR=/usr/src/fio
# Requires ibverbs development libraries.
CONFIG_RDMA=n
CONFIG_RDMA_SEND_WITH_INVAL=n
CONFIG_RDMA_SET_ACK_TIMEOUT=n
CONFIG_RDMA_SET_TOS=n
CONFIG_RDMA_PROV=verbs

# Enable NVMe Character Devices.
CONFIG_NVME_CUSE=n

# Enable FC support for the NVMf target.
# Requires FC low level driver (from FC vendor)
CONFIG_FC=n
CONFIG_FC_PATH=

# Build Ceph RBD support in bdev modules
# Requires librbd development libraries
CONFIG_RBD=n

# Build DAOS support in bdev modules
# Requires daos development libraries
CONFIG_DAOS=n
CONFIG_DAOS_DIR=

# Build UBLK support
CONFIG_UBLK=n

# Build vhost library.
CONFIG_VHOST=y

# Build vhost initiator (Virtio) driver.
CONFIG_VIRTIO=y

# Build custom vfio-user transport for NVMf target and NVMe initiator.
CONFIG_VFIO_USER=n
CONFIG_VFIO_USER_DIR=
# Build with PMDK backends
CONFIG_PMDK=n
CONFIG_PMDK_DIR=

# Build with xNVMe
CONFIG_XNVME=n
# Build with "reduce" (SPDK block compression)
CONFIG_REDUCE=n

# Enable the dependencies for building the DPDK accel compress module
CONFIG_DPDK_COMPRESSDEV=n

# Enable the dependencies for building the compress vbdev, includes the reduce library
CONFIG_VBDEV_COMPRESS=n

# Enable mlx5_pci dpdk compress PMD, enabled automatically if CONFIG_VBDEV_COMPRESS=y and libmlx5 exists
CONFIG_VBDEV_COMPRESS_MLX5=n

# Enable mlx5_pci dpdk crypto PMD, enabled automatically if CONFIG_CRYPTO=y and libmlx5 exists
CONFIG_CRYPTO_MLX5=n
# Build with VPP
CONFIG_VPP=n
CONFIG_VPP_DIR=

# Requires libiscsi development libraries.
CONFIG_ISCSI_INITIATOR=n
@@ -150,10 +112,16 @@ CONFIG_CRYPTO=n
# Build spdk shared libraries in addition to the static ones.
CONFIG_SHARED=n

# Build with VTune support.
# Build with VTune suport.
CONFIG_VTUNE=n
CONFIG_VTUNE_DIR=

# Build the dpdk igb_uio driver
CONFIG_IGB_UIO_DRIVER=n

# Build FTL library
CONFIG_FTL=n

# Build Intel IPSEC_MB library
CONFIG_IPSEC_MB=n

@@ -164,59 +132,3 @@ CONFIG_CUSTOMOCF=n

# Build ISA-L library
CONFIG_ISAL=y

# Build ISA-L-crypto library
CONFIG_ISAL_CRYPTO=y

# Build with IO_URING support
CONFIG_URING=n

# Build IO_URING bdev with ZNS support
CONFIG_URING_ZNS=n

# Path to custom built IO_URING library
CONFIG_URING_PATH=

# Path to custom built OPENSSL library
CONFIG_OPENSSL_PATH=

# Build with FUSE support
CONFIG_FUSE=n

# Build with RAID5f support
CONFIG_RAID5F=n

# Build with IDXD support
# In this mode, SPDK fully controls the DSA device.
CONFIG_IDXD=n

# Build with USDT support
CONFIG_USDT=n

# Build with IDXD kernel support.
# In this mode, SPDK shares the DSA device with the kernel.
CONFIG_IDXD_KERNEL=n

# arc4random is available in stdlib.h
CONFIG_HAVE_ARC4RANDOM=n

# uuid_generate_sha1 is available in uuid/uuid.h
CONFIG_HAVE_UUID_GENERATE_SHA1=n

# Is DPDK using libbsd?
CONFIG_HAVE_LIBBSD=n

# Is DPDK using libarchive?
CONFIG_HAVE_LIBARCHIVE=n

# Path to IPSEC_MB used by DPDK
CONFIG_IPSEC_MB_DIR=

# Generate Storage Management Agent's protobuf interface
CONFIG_SMA=n

# Build with Avahi support
CONFIG_AVAHI=n

# Setup DPDK's RTE_MAX_LCORES
CONFIG_MAX_LCORES=
@@ -1,37 +1,25 @@
---
name: Sighting report
about: Create a report to help us improve. Please use the issue tracker only for reporting suspected issues.
title: ''
labels: 'Sighting'
assignees: ''
Please use the issue tracker only for reporting suspected issues.

---

# Sighting report
See [The SPDK Community Page](http://www.spdk.io/community/) for other SPDK communications channels.

<!--- Provide a general summary of the issue in the Title above -->

## Expected Behavior

<!--- Tell us what should happen -->

## Current Behavior

<!--- Tell us what happens instead of the expected behavior -->

## Possible Solution

<!--- Not obligatory, but suggest a fix/reason for the potential issue, -->
<!--- Not obligatory, but suggest a fix/reason for the bug, -->

## Steps to Reproduce

<!--- Provide a link to a live example, or an unambiguous set of steps to -->
<!--- reproduce this sighting. Include code to reproduce, if relevant -->
<!--- reproduce this bug. Include code to reproduce, if relevant -->
1.
2.
3.
4.

## Context (Environment including OS version, SPDK version, etc.)

<!--- Providing context helps us come up with a solution that is most useful in the real world -->
52  LICENSE
@@ -1,30 +1,30 @@
The SPDK repo contains multiple git submodules each with its own
license info.
BSD LICENSE

Submodule license info:
dpdk: see dpdk/license
intel-ipsec-mb: see intel-ipsec-mb/LICENSE
isa-l: see isa-l/LICENSE
libvfio-user: see libvfio-user/LICENSE
ocf: see ocf/LICENSE
Copyright (c) Intel Corporation.
All rights reserved.

The rest of the SPDK repository uses the Open Source BSD-3-Clause
license. SPDK also uses SPDX Unique License Identifiers to eliminate
the need to copy the license text into each individual file.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

Any new file contributions to SPDK shall adhere to the BSD-3-Clause
license and use SPDX identifiers. Exceptions are subject to usual
review and must be listed in this file.
* Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in
  the documentation and/or other materials provided with the
  distribution.
* Neither the name of Intel Corporation nor the names of its
  contributors may be used to endorse or promote products derived
  from this software without specific prior written permission.

Exceptions:

* include/linux/* header files are BSD-3-Clause but do not use SPDX
  identifier to keep them identical to the same header files in the
  Linux kernel source tree.

* include/spdk/tree.h and include/spdk/queue_extras are BSD-2-Clause,
  since there were primarily imported from FreeBSD. tree.h uses an SPDX
  identifier but also the license text to reduce differences from the
  FreeBSD source tree.

* lib/util/base64_neon.c is BSD-2-Clause.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
118  Makefile
@@ -1,9 +1,35 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2015 Intel Corporation.
# Copyright (c) 2020, Mellanox Corporation.
# Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES
#
# BSD LICENSE
#
# Copyright (c) Intel Corporation.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#   * Neither the name of Intel Corporation nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

S :=

@@ -11,117 +37,67 @@ SPDK_ROOT_DIR := $(CURDIR)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk

DIRS-y += lib
DIRS-y += module
DIRS-$(CONFIG_SHARED) += shared_lib
DIRS-y += include
DIRS-$(CONFIG_EXAMPLES) += examples
DIRS-$(CONFIG_APPS) += app
DIRS-y += test
DIRS-y += examples app include
DIRS-$(CONFIG_TESTS) += test
DIRS-$(CONFIG_IPSEC_MB) += ipsecbuild
DIRS-$(CONFIG_ISAL) += isalbuild
DIRS-$(CONFIG_ISAL_CRYPTO) += isalcryptobuild
DIRS-$(CONFIG_VFIO_USER) += vfiouserbuild
DIRS-$(CONFIG_SMA) += proto
DIRS-$(CONFIG_XNVME) += xnvmebuild

.PHONY: all clean $(DIRS-y) include/spdk/config.h mk/config.mk \
	cc_version cxx_version .libs_only_other .ldflags ldflags install \
	uninstall

# Workaround for ninja. See dpdkbuild/Makefile
export MAKE_PID := $(shell echo $$PPID)
.PHONY: all clean $(DIRS-y) include/spdk/config.h mk/config.mk mk/cc.mk \
	cc_version cxx_version .libs_only_other .ldflags ldflags

ifeq ($(SPDK_ROOT_DIR)/lib/env_dpdk,$(CONFIG_ENV))
ifeq ($(CURDIR)/dpdk/build,$(CONFIG_DPDK_DIR))
ifneq ($(SKIP_DPDK_BUILD),1)
ifneq ($(CONFIG_DPDK_PKG_CONFIG),y)
DPDKBUILD = dpdkbuild
DIRS-y += dpdkbuild
endif
endif
endif
endif

ifeq ($(OS),Windows)
ifeq ($(CURDIR)/wpdk/build,$(CONFIG_WPDK_DIR))
WPDK = wpdk
DIRS-y += wpdk
endif
endif

ifeq ($(CONFIG_SHARED),y)
LIB = shared_lib
else
LIB = module
LIB = lib
endif

ifeq ($(CONFIG_IPSEC_MB),y)
LIB += ipsecbuild
DPDK_DEPS += ipsecbuild
endif

ifeq ($(CONFIG_ISAL),y)
ISALBUILD = isalbuild
LIB += isalbuild
DPDK_DEPS += isalbuild
ifeq ($(CONFIG_ISAL_CRYPTO),y)
ISALCRYPTOBUILD = isalcryptobuild
LIB += isalcryptobuild
endif
endif

ifeq ($(CONFIG_VFIO_USER),y)
VFIOUSERBUILD = vfiouserbuild
LIB += vfiouserbuild
endif

ifeq ($(CONFIG_XNVME),y)
XNVMEBUILD = xnvmebuild
LIB += xnvmebuild
endif

all: mk/cc.mk $(DIRS-y)
all: $(DIRS-y)
clean: $(DIRS-y)
	$(Q)rm -f mk/cc.mk
	$(Q)rm -f include/spdk/config.h
	$(Q)rm -rf build

install: all
	$(Q)echo "Installed to $(DESTDIR)$(CONFIG_PREFIX)"

uninstall: $(DIRS-y)
	$(Q)echo "Uninstalled spdk"

ifneq ($(SKIP_DPDK_BUILD),1)
dpdkdeps $(DPDK_DEPS): $(WPDK)
dpdkbuild: $(WPDK) $(DPDK_DEPS)
endif

lib: $(WPDK) $(DPDKBUILD) $(VFIOUSERBUILD) $(XNVMEBUILD) $(ISALBUILD) $(ISALCRYPTOBUILD)
module: lib
shared_lib: module
shared_lib: lib
lib: $(DPDKBUILD)
app: $(LIB)
test: $(LIB)
examples: $(LIB)
pkgdep:
	sh ./scripts/pkgdep.sh

$(DIRS-y): mk/cc.mk build_dir include/spdk/config.h
$(DIRS-y): mk/cc.mk include/spdk/config.h

mk/cc.mk:
	$(Q)echo "Please run configure prior to make"
	false

build_dir: mk/cc.mk
	$(Q)mkdir -p build/lib/pkgconfig/tmp
	$(Q)mkdir -p build/bin
	$(Q)mkdir -p build/fio
	$(Q)mkdir -p build/examples
	$(Q)mkdir -p build/include/spdk
	$(Q)scripts/detect_cc.sh --cc=$(CC) --cxx=$(CXX) --lto=$(CONFIG_LTO) --ld=$(LD) > $@.tmp; \
	cmp -s $@.tmp $@ || mv $@.tmp $@ ; \
	rm -f $@.tmp

include/spdk/config.h: mk/config.mk scripts/genconfig.py
	$(Q)echo "#ifndef SPDK_CONFIG_H" > $@.tmp; \
	$(Q)PYCMD=$$(cat PYTHON_COMMAND 2>/dev/null) ; \
	test -z "$$PYCMD" && PYCMD=python ; \
	echo "#ifndef SPDK_CONFIG_H" > $@.tmp; \
	echo "#define SPDK_CONFIG_H" >> $@.tmp; \
	scripts/genconfig.py $(MAKEFLAGS) >> $@.tmp; \
	$$PYCMD scripts/genconfig.py $(MAKEFLAGS) >> $@.tmp; \
	echo "#endif /* SPDK_CONFIG_H */" >> $@.tmp; \
	cmp -s $@.tmp $@ || mv $@.tmp $@ ; \
	rm -f $@.tmp
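One detail worth noting in the generated-file recipes above is the `cmp -s $@.tmp $@ || mv $@.tmp $@` idiom: the freshly generated output only replaces the existing file when the contents actually differ, so an unchanged `include/spdk/config.h` keeps its old timestamp and make does not rebuild everything that depends on it. A standalone sketch of the same pattern (the `gen_header` function is a hypothetical stand-in for `scripts/genconfig.py`):

```sh
# Regenerate a header into a temp file, then install it only on change,
# preserving the old timestamp when the content is identical.
gen_header() { echo "#define SPDK_CONFIG_EXAMPLE 1"; }  # stand-in generator

gen_header > config.h.tmp
cmp -s config.h.tmp config.h || mv config.h.tmp config.h
rm -f config.h.tmp
```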
44  README.md
@@ -2,11 +2,6 @@

[![Build Status](https://travis-ci.org/spdk/spdk.svg?branch=master)](https://travis-ci.org/spdk/spdk)

NOTE: The SPDK mailing list has moved to a new location. Please visit
[this URL](https://lists.linuxfoundation.org/mailman/listinfo/spdk) to subscribe
at the new location. Subscribers from the old location will not be automatically
migrated to the new location.

The Storage Performance Development Kit ([SPDK](http://www.spdk.io)) provides a set of tools
and libraries for writing high performance, scalable, user-mode storage
applications. It achieves high performance by moving all of the necessary
@@ -15,7 +10,6 @@ interrupts, which avoids kernel context switches and eliminates interrupt
handling overhead.

The development kit currently includes:

* [NVMe driver](http://www.spdk.io/doc/nvme.html)
* [I/OAT (DMA engine) driver](http://www.spdk.io/doc/ioat.html)
* [NVMe over Fabrics target](http://www.spdk.io/doc/nvmf.html)
@@ -23,7 +17,7 @@ The development kit currently includes:
* [vhost target](http://www.spdk.io/doc/vhost.html)
* [Virtio-SCSI driver](http://www.spdk.io/doc/virtio.html)

## In this readme
# In this readme:

* [Documentation](#documentation)
* [Prerequisites](#prerequisites)
@@ -31,7 +25,6 @@ The development kit currently includes:
* [Build](#libraries)
* [Unit Tests](#tests)
* [Vagrant](#vagrant)
* [AWS](#aws)
* [Advanced Build Options](#advanced)
* [Shared libraries](#shared)
* [Hugepages and Device Binding](#huge)
@@ -58,9 +51,6 @@ git submodule update --init
## Prerequisites

The dependencies can be installed automatically by `scripts/pkgdep.sh`.
The `scripts/pkgdep.sh` script will automatically install the bare minimum
dependencies required to build SPDK.
Use `--help` to see information on installing dependencies for optional components

~~~{.sh}
./scripts/pkgdep.sh
@@ -102,23 +92,14 @@ success or failure.

A [Vagrant](https://www.vagrantup.com/downloads.html) setup is also provided
to create a Linux VM with a virtual NVMe controller to get up and running
quickly. Currently this has been tested on MacOS, Ubuntu 16.04.2 LTS and
Ubuntu 18.04.3 LTS with the VirtualBox and Libvirt provider.
The [VirtualBox Extension Pack](https://www.virtualbox.org/wiki/Downloads)
or [Vagrant Libvirt](https://github.com/vagrant-libvirt/vagrant-libvirt) must
quickly. Currently this has only been tested on MacOS and Ubuntu 16.04.2 LTS
with the [VirtualBox](https://www.virtualbox.org/wiki/Downloads) provider. The
[VirtualBox Extension Pack](https://www.virtualbox.org/wiki/Downloads) must
also be installed in order to get the required NVMe support.

Details on the Vagrant setup can be found in the
[SPDK Vagrant documentation](http://spdk.io/doc/vagrant.html).

<a id="aws"></a>
## AWS

The following setup is known to work on AWS:
Image: Ubuntu 18.04
Before running `setup.sh`, run `modprobe vfio-pci`
then: `DRIVER_OVERRIDE=vfio-pci ./setup.sh`

<a id="advanced"></a>
## Advanced Build Options

@@ -134,9 +115,7 @@ Boolean (on/off) options are configured with a 'y' (yes) or 'n' (no). For
example, this line of `CONFIG` controls whether the optional RDMA (libibverbs)
support is enabled:

~~~{.sh}
CONFIG_RDMA?=n
~~~
CONFIG_RDMA?=n

To enable RDMA, this line may be added to `mk/config.mk` with a 'y' instead of
'n'. For the majority of options this can be done using the `configure` script.
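To make the passage above concrete, both routes end with `CONFIG_RDMA` set to 'y' in `mk/config.mk`. This is a hedged sketch; the `--with-rdma` flag name is an assumption not confirmed by this excerpt:

```sh
# Route 1: let the configure script record the option (flag name assumed).
./configure --with-rdma

# Route 2: append the override to mk/config.mk by hand, as described above.
echo "CONFIG_RDMA?=y" >> mk/config.mk
```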
@@ -193,20 +172,16 @@ of the SPDK static ones.

In order to start a SPDK app linked with SPDK shared libraries, make sure
to do the following steps:

- run ldconfig specifying the directory containing SPDK shared libraries
- provide proper `LD_LIBRARY_PATH`

If DPDK shared libraries are used, you may also need to add DPDK shared
libraries to `LD_LIBRARY_PATH`

Linux:

~~~{.sh}
./configure --with-shared
make
ldconfig -v -n ./build/lib
LD_LIBRARY_PATH=./build/lib/:./dpdk/build/lib/ ./build/bin/spdk_tgt
LD_LIBRARY_PATH=./build/lib/ ./app/spdk_tgt/spdk_tgt
~~~

<a id="huge"></a>
@@ -228,13 +203,6 @@ configuring 8192MB memory.
sudo HUGEMEM=8192 scripts/setup.sh
~~~

There are a lot of other environment variables that can be set to configure
setup.sh for advanced users. To see the full list, run:

~~~{.sh}
scripts/setup.sh --help
~~~

<a id="examples"></a>
## Example Code
@@ -1,4 +0,0 @@
# Security Policy

The SPDK community has a documented CVE process [here](https://spdk.io/cve_threat/) that describes
both how to report a potential security issue as well as who to contact for more information.
39  app/Makefile
@@ -1,7 +1,35 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2015 Intel Corporation.
#
# BSD LICENSE
#
# Copyright (c) Intel Corporation.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#   * Neither the name of Intel Corporation nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

SPDK_ROOT_DIR := $(abspath $(CURDIR)/..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
@@ -9,16 +37,11 @@ include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
DIRS-y += trace
DIRS-y += trace_record
DIRS-y += nvmf_tgt
DIRS-y += iscsi_top
DIRS-y += iscsi_tgt
DIRS-y += spdk_tgt
DIRS-y += spdk_lspci
ifneq ($(OS),Windows)
# TODO - currently disabled on Windows due to lack of support for curses
DIRS-y += spdk_top
endif
ifeq ($(OS),Linux)
DIRS-$(CONFIG_VHOST) += vhost
DIRS-y += spdk_dd
endif

.PHONY: all clean $(DIRS-y)
@@ -1,7 +1,35 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2016 Intel Corporation.
#
# BSD LICENSE
#
# Copyright (c) Intel Corporation.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#   * Neither the name of Intel Corporation nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
@@ -15,20 +43,13 @@ CFLAGS += -I$(SPDK_ROOT_DIR)/lib

C_SRCS := iscsi_tgt.c

SPDK_LIB_LIST = $(ALL_MODULES_LIST) event event_iscsi

ifeq ($(SPDK_ROOT_DIR)/lib/env_dpdk,$(CONFIG_ENV))
SPDK_LIB_LIST += env_dpdk_rpc
endif
SPDK_LIB_LIST = $(ALL_MODULES_LIST)
SPDK_LIB_LIST += event_bdev event_copy event_iscsi event_net event_scsi event
SPDK_LIB_LIST += jsonrpc json rpc bdev_rpc bdev iscsi scsi copy trace conf
SPDK_LIB_LIST += thread util log log_rpc trace_rpc app_rpc net sock

ifeq ($(OS),Linux)
SPDK_LIB_LIST += event_nbd
SPDK_LIB_LIST += event_nbd nbd
endif

include $(SPDK_ROOT_DIR)/mk/spdk.app.mk

install: $(APP)
	$(INSTALL_APP)

uninstall:
	$(UNINSTALL_APP)
@@ -1,6 +1,34 @@
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2016 Intel Corporation.
/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"
@@ -9,9 +37,25 @@
#include "spdk/event.h"
#include "iscsi/iscsi.h"
#include "spdk/log.h"
#include "spdk/net.h"

static int g_daemon_mode = 0;

static void
spdk_sigusr1(int signo __attribute__((__unused__)))
{
	char *config_str = NULL;
	if (spdk_app_get_running_config(&config_str, "iscsi.conf") < 0) {
		fprintf(stderr, "Error getting config\n");
	} else {
		fprintf(stdout, "============================\n");
		fprintf(stdout, " iSCSI target running config\n");
		fprintf(stdout, "=============================\n");
		fprintf(stdout, "%s", config_str);
	}
	free(config_str);
}

static void
iscsi_usage(void)
{
@@ -19,7 +63,7 @@ iscsi_usage(void)
}

static void
spdk_startup(void *arg1)
spdk_startup(void *arg1, void *arg2)
{
	if (getenv("MEMZONE_DUMP") != NULL) {
		spdk_memzone_dump(stdout);
@@ -46,7 +90,7 @@ main(int argc, char **argv)
	int rc;
	struct spdk_app_opts opts = {};

	spdk_app_opts_init(&opts, sizeof(opts));
	spdk_app_opts_init(&opts);
	opts.name = "iscsi";
	if ((rc = spdk_app_parse_args(argc, argv, &opts, "b", NULL,
				      iscsi_parse_arg, iscsi_usage)) !=
@@ -62,9 +106,10 @@ main(int argc, char **argv)
	}

	opts.shutdown_cb = NULL;
	opts.usr1_handler = spdk_sigusr1;

	/* Blocks until the application is exiting */
	rc = spdk_app_start(&opts, spdk_startup, NULL);
	rc = spdk_app_start(&opts, spdk_startup, NULL, NULL);
	if (rc) {
		SPDK_ERRLOG("Start iscsi target daemon: spdk_app_start() retn non-zero\n");
	}
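A behavioral difference visible in the last hunk: the older `iscsi_tgt` registers `spdk_sigusr1` as `opts.usr1_handler`, so the running configuration can be dumped to stdout without stopping the target. A small usage sketch (the `pidof` lookup is illustrative, not from the source):

```sh
# Ask a running (old-style) iscsi_tgt to print its running configuration.
kill -USR1 "$(pidof iscsi_tgt)"
```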
1  app/iscsi_top/.gitignore  vendored  Normal file
@@ -0,0 +1 @@
iscsi_top
53  app/iscsi_top/Makefile  Normal file
@@ -0,0 +1,53 @@
#
# BSD LICENSE
#
# Copyright (c) Intel Corporation.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#   * Neither the name of Intel Corporation nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
include $(SPDK_ROOT_DIR)/mk/spdk.app_cxx.mk

CXXFLAGS += $(ENV_CXXFLAGS)
CXXFLAGS += -I$(SPDK_ROOT_DIR)/lib
CXX_SRCS = iscsi_top.cpp

APP = iscsi_top

all: $(APP)
	@:

$(APP) : $(OBJS)
	$(LINK_CXX)

clean:
	$(CLEAN_C) $(APP)

include $(SPDK_ROOT_DIR)/mk/spdk.deps.mk
251  app/iscsi_top/iscsi_top.cpp  Normal file
@ -0,0 +1,251 @@
|
||||
/*-
|
||||
* BSD LICENSE
|
||||
*
|
||||
* Copyright (c) Intel Corporation.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* * Neither the name of Intel Corporation nor the names of its
|
||||
* contributors may be used to endorse or promote products derived
|
||||
* from this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include <algorithm>
#include <map>
#include <vector>

extern "C" {
#include "spdk/trace.h"
#include "iscsi/conn.h"
}

static char *exe_name;
static int g_shm_id = 0;

static void usage(void)
{
	fprintf(stderr, "usage:\n");
	fprintf(stderr, "   %s <option>\n", exe_name);
	fprintf(stderr, "        option = '-i' to specify the shared memory ID,"
		" (required)\n");
}

static bool
conns_compare(struct spdk_iscsi_conn *first, struct spdk_iscsi_conn *second)
{
	if (first->lcore < second->lcore) {
		return true;
	}

	if (first->lcore > second->lcore) {
		return false;
	}

	if (first->id < second->id) {
		return true;
	}

	return false;
}

static void
print_connections(void)
{
	std::vector<struct spdk_iscsi_conn *> v;
	std::vector<struct spdk_iscsi_conn *>::iterator iter;
	size_t conns_size;
	struct spdk_iscsi_conn *conns, *conn;
	void *conns_ptr;
	int fd, i;
	char shm_name[64];

	snprintf(shm_name, sizeof(shm_name), "/spdk_iscsi_conns.%d", g_shm_id);
	fd = shm_open(shm_name, O_RDONLY, 0600);
	if (fd < 0) {
		fprintf(stderr, "Cannot open shared memory: %s\n", shm_name);
		usage();
		exit(1);
	}

	conns_size = sizeof(*conns) * MAX_ISCSI_CONNECTIONS;

	conns_ptr = mmap(NULL, conns_size, PROT_READ, MAP_SHARED, fd, 0);
	if (conns_ptr == MAP_FAILED) {
		fprintf(stderr, "Cannot mmap shared memory (%d)\n", errno);
		exit(1);
	}

	conns = (struct spdk_iscsi_conn *)conns_ptr;

	for (i = 0; i < MAX_ISCSI_CONNECTIONS; i++) {
		if (!conns[i].is_valid) {
			continue;
		}
		v.push_back(&conns[i]);
	}

	stable_sort(v.begin(), v.end(), conns_compare);
	for (iter = v.begin(); iter != v.end(); iter++) {
		conn = *iter;
		printf("lcore %2d conn %3d T:%-8s I:%s (%s)\n",
		       conn->lcore, conn->id,
		       conn->target_short_name, conn->initiator_name,
		       conn->initiator_addr);
	}

	printf("\n");
	munmap(conns, conns_size);
	close(fd);
}

int main(int argc, char **argv)
{
	void *history_ptr;
	struct spdk_trace_histories *histories;
	struct spdk_trace_history *history;

	uint64_t tasks_done, last_tasks_done[SPDK_TRACE_MAX_LCORE];
	int delay, old_delay, history_fd, i, quit, rc;
	int tasks_done_delta, tasks_done_per_sec;
	int total_tasks_done_per_sec;
	struct timeval timeout;
	fd_set fds;
	char ch;
	struct termios oldt, newt;
	char spdk_trace_shm_name[64];
	int op;

	exe_name = argv[0];
	while ((op = getopt(argc, argv, "i:")) != -1) {
		switch (op) {
		case 'i':
			g_shm_id = atoi(optarg);
			break;
		default:
			usage();
			exit(1);
		}
	}

	snprintf(spdk_trace_shm_name, sizeof(spdk_trace_shm_name), "/iscsi_trace.%d", g_shm_id);
	history_fd = shm_open(spdk_trace_shm_name, O_RDONLY, 0600);
	if (history_fd < 0) {
		fprintf(stderr, "Unable to open history shm %s\n", spdk_trace_shm_name);
		usage();
		exit(1);
	}

	history_ptr = mmap(NULL, sizeof(*histories), PROT_READ, MAP_SHARED, history_fd, 0);
	if (history_ptr == MAP_FAILED) {
		fprintf(stderr, "Unable to mmap history shm (%d).\n", errno);
		exit(1);
	}

	histories = (struct spdk_trace_histories *)history_ptr;

	memset(last_tasks_done, 0, sizeof(last_tasks_done));

	for (i = 0; i < SPDK_TRACE_MAX_LCORE; i++) {
		history = spdk_get_per_lcore_history(histories, i);
		last_tasks_done[i] = history->tpoint_count[TRACE_ISCSI_TASK_DONE];
	}

	delay = 1;
	quit = 0;

	tcgetattr(0, &oldt);
	newt = oldt;
	newt.c_lflag &= ~(ICANON);
	tcsetattr(0, TCSANOW, &newt);

	while (1) {

		FD_ZERO(&fds);
		FD_SET(0, &fds);
		timeout.tv_sec = delay;
		timeout.tv_usec = 0;
		rc = select(2, &fds, NULL, NULL, &timeout);

		if (rc > 0) {
			if (read(0, &ch, 1) != 1) {
				fprintf(stderr, "Read error on stdin\n");
				goto cleanup;
			}

			printf("\b");
			switch (ch) {
			case 'd':
				printf("Enter num seconds to delay (1-10): ");
				old_delay = delay;
				rc = scanf("%d", &delay);
				if (rc != 1) {
					fprintf(stderr, "Illegal delay value\n");
					delay = old_delay;
				} else if (delay < 1 || delay > 10) {
					delay = 1;
				}
				break;
			case 'q':
				quit = 1;
				break;
			default:
				fprintf(stderr, "'%c' not recognized\n", ch);
				break;
			}

			if (quit == 1) {
				break;
			}
		}

		printf("\e[1;1H\e[2J");
		print_connections();
		printf("lcore tasks\n");
		printf("=============\n");
		total_tasks_done_per_sec = 0;
		for (i = 0; i < SPDK_TRACE_MAX_LCORE; i++) {
			history = spdk_get_per_lcore_history(histories, i);
			tasks_done = history->tpoint_count[TRACE_ISCSI_TASK_DONE];
			tasks_done_delta = tasks_done - last_tasks_done[i];
			if (tasks_done_delta == 0) {
				continue;
			}
			last_tasks_done[i] = tasks_done;
			tasks_done_per_sec = tasks_done_delta / delay;
			printf("%5d %7d\n", history->lcore, tasks_done_per_sec);
			total_tasks_done_per_sec += tasks_done_per_sec;
		}
		printf("Total %7d\n", total_tasks_done_per_sec);
	}

cleanup:
	tcsetattr(0, TCSANOW, &oldt);

	munmap(history_ptr, sizeof(*histories));
	close(history_fd);

	return (0);
}
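The iscsi_top source above attaches read-only to two POSIX shared-memory regions (the iSCSI connection array and the per-lcore trace histories) with the same shm_open()/mmap() idiom. A minimal, self-contained sketch of just that attach step, with the region name and size as illustrative parameters rather than anything taken from the diff:

/* Sketch of the read-only shared-memory attach pattern used above.
 * "name" and "size" are supplied by the caller; nothing here is an
 * SPDK API, only plain POSIX calls. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int
attach_shm_readonly(const char *name, size_t size, void **out)
{
	int fd = shm_open(name, O_RDONLY, 0600);

	if (fd < 0) {
		fprintf(stderr, "Cannot open shared memory: %s\n", name);
		return -1;
	}

	*out = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
	close(fd); /* the mapping stays valid after the fd is closed */
	return (*out == MAP_FAILED) ? -1 : 0;
}

Closing the descriptor immediately after mmap() is safe; the mapping keeps the region alive until munmap(), which is why the tool above can keep reading the histories for as long as it runs.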
@ -1,7 +1,35 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2016 Intel Corporation.
#
# BSD LICENSE
#
# Copyright (c) Intel Corporation.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#   * Neither the name of Intel Corporation nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
@ -11,20 +39,13 @@ APP = nvmf_tgt

C_SRCS := nvmf_main.c

SPDK_LIB_LIST = $(ALL_MODULES_LIST) event event_nvmf

ifeq ($(SPDK_ROOT_DIR)/lib/env_dpdk,$(CONFIG_ENV))
SPDK_LIB_LIST += env_dpdk_rpc
endif
SPDK_LIB_LIST = $(ALL_MODULES_LIST)
SPDK_LIB_LIST += event_bdev event_copy event_nvmf event_net
SPDK_LIB_LIST += nvmf event log trace conf thread util bdev copy rpc jsonrpc json net sock
SPDK_LIB_LIST += app_rpc log_rpc trace_rpc bdev_rpc

ifeq ($(OS),Linux)
SPDK_LIB_LIST += event_nbd
SPDK_LIB_LIST += event_nbd nbd
endif

include $(SPDK_ROOT_DIR)/mk/spdk.app.mk

install: $(APP)
	$(INSTALL_APP)

uninstall:
	$(UNINSTALL_APP)
@ -1,6 +1,34 @@
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2017 Intel Corporation.
/*-
 * BSD LICENSE
 *
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"
@ -20,7 +48,7 @@ nvmf_parse_arg(int ch, char *arg)
}

static void
nvmf_tgt_started(void *arg1)
nvmf_tgt_started(void *arg1, void *arg2)
{
	if (getenv("MEMZONE_DUMP") != NULL) {
		spdk_memzone_dump(stdout);
@ -35,8 +63,9 @@ main(int argc, char **argv)
	struct spdk_app_opts opts = {};

	/* default value in opts */
	spdk_app_opts_init(&opts, sizeof(opts));
	spdk_app_opts_init(&opts);
	opts.name = "nvmf";
	opts.max_delay_us = 0;
	if ((rc = spdk_app_parse_args(argc, argv, &opts, "", NULL,
				      nvmf_parse_arg, nvmf_usage)) !=
	    SPDK_APP_PARSE_ARGS_SUCCESS) {
@ -44,7 +73,7 @@ main(int argc, char **argv)
	}

	/* Blocks until the application is exiting */
	rc = spdk_app_start(&opts, nvmf_tgt_started, NULL);
	rc = spdk_app_start(&opts, nvmf_tgt_started, NULL, NULL);
	spdk_app_fini();
	return rc;
}
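The hunks above track an SPDK event-framework API change: spdk_app_opts_init() gained a size argument and the start callback dropped its second void * parameter. A minimal sketch of an application written against the newer signatures that appear in the hunks (the app name "demo" is illustrative, not from the diff):

#include "spdk/stdinc.h"
#include "spdk/event.h"

static void
app_started(void *arg1)
{
	/* Runs on the app thread once the framework is up.
	 * A real target would set up its subsystems here; this
	 * sketch just shuts down again. */
	spdk_app_stop(0);
}

int
main(int argc, char **argv)
{
	struct spdk_app_opts opts = {};
	int rc;

	spdk_app_opts_init(&opts, sizeof(opts)); /* size-checked variant */
	opts.name = "demo";
	rc = spdk_app_start(&opts, app_started, NULL); /* blocks until spdk_app_stop() */
	spdk_app_fini();
	return rc;
}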
app/spdk_dd/.gitignore (vendored)
@ -1 +0,0 @@
spdk_dd
@ -1,22 +0,0 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2017 Intel Corporation.
# All rights reserved.
#

SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk

APP = spdk_dd

C_SRCS := spdk_dd.c

SPDK_LIB_LIST = $(ALL_MODULES_LIST) event event_bdev

include $(SPDK_ROOT_DIR)/mk/spdk.app.mk

install: $(APP)
	$(INSTALL_APP)

uninstall:
	$(UNINSTALL_APP)
File diff suppressed because it is too large
app/spdk_lspci/.gitignore (vendored)
@ -1 +0,0 @@
spdk_lspci
@ -1,22 +0,0 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2015 Intel Corporation.
# All rights reserved.
#

SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk

APP = spdk_lspci

C_SRCS := spdk_lspci.c

SPDK_LIB_LIST = $(SOCK_MODULES_LIST) nvme vmd

include $(SPDK_ROOT_DIR)/mk/spdk.app.mk

install: $(APP)
	$(INSTALL_APP)

uninstall:
	$(UNINSTALL_APP)
@ -1,89 +0,0 @@
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2019 Intel Corporation.
 * All rights reserved.
 */

#include "spdk/stdinc.h"
#include "spdk/env.h"
#include "spdk/vmd.h"

static void
usage(void)
{
	printf("Usage: spdk_lspci\n");
	printf("Print available SPDK PCI devices supported by NVMe driver.\n");
}

static int
pci_enum_cb(void *ctx, struct spdk_pci_device *dev)
{
	return 0;
}

static void
print_pci_dev(void *ctx, struct spdk_pci_device *dev)
{
	struct spdk_pci_addr pci_addr = spdk_pci_device_get_addr(dev);
	char addr[32] = { 0 };

	spdk_pci_addr_fmt(addr, sizeof(addr), &pci_addr);

	printf("%s (%x %x)", addr,
	       spdk_pci_device_get_vendor_id(dev),
	       spdk_pci_device_get_device_id(dev));

	if (strcmp(spdk_pci_device_get_type(dev), "vmd") == 0) {
		printf(" (NVMe disk behind VMD) ");
	}

	if (dev->internal.driver == spdk_pci_vmd_get_driver()) {
		printf(" (VMD) ");
	}

	printf("\n");
}

int
main(int argc, char **argv)
{
	int op, rc = 0;
	struct spdk_env_opts opts;

	while ((op = getopt(argc, argv, "h")) != -1) {
		switch (op) {
		case 'h':
			usage();
			return 0;
		default:
			usage();
			return 1;
		}
	}

	spdk_env_opts_init(&opts);
	opts.name = "spdk_lspci";

	if (spdk_env_init(&opts) < 0) {
		printf("Unable to initialize SPDK env\n");
		return 1;
	}

	if (spdk_vmd_init()) {
		printf("Failed to initialize VMD. Some NVMe devices can be unavailable.\n");
	}

	if (spdk_pci_enumerate(spdk_pci_nvme_get_driver(), pci_enum_cb, NULL)) {
		printf("Unable to enumerate PCI nvme driver\n");
		rc = 1;
		goto exit;
	}

	printf("\nList of available PCI devices:\n");
	spdk_pci_for_each_device(NULL, print_pci_dev);

exit:
	spdk_vmd_fini();
	spdk_env_fini();

	return rc;
}
@ -1,7 +1,35 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2018 Intel Corporation.
#
# BSD LICENSE
#
# Copyright (c) Intel Corporation.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#   * Neither the name of Intel Corporation nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
@ -13,29 +41,21 @@ C_SRCS := spdk_tgt.c

SPDK_LIB_LIST = $(ALL_MODULES_LIST)

SPDK_LIB_LIST += event event_iscsi event_nvmf

ifeq ($(SPDK_ROOT_DIR)/lib/env_dpdk,$(CONFIG_ENV))
SPDK_LIB_LIST += env_dpdk_rpc
ifeq ($(OS),Linux)
ifeq ($(CONFIG_VHOST),y)
SPDK_LIB_LIST += vhost rte_vhost event_vhost
endif
endif

SPDK_LIB_LIST += event_bdev event_copy event_iscsi event_net event_scsi event_nvmf event
SPDK_LIB_LIST += nvmf trace log conf thread util bdev iscsi scsi copy rpc jsonrpc json
SPDK_LIB_LIST += app_rpc log_rpc trace_rpc bdev_rpc net sock

ifeq ($(OS),Linux)
SPDK_LIB_LIST += event_nbd
ifeq ($(CONFIG_UBLK),y)
SPDK_LIB_LIST += event_ublk
endif
ifeq ($(CONFIG_VHOST),y)
SPDK_LIB_LIST += event_vhost_blk event_vhost_scsi
endif
ifeq ($(CONFIG_VFIO_USER),y)
SPDK_LIB_LIST += event_vfu_tgt
endif
SPDK_LIB_LIST += event_nbd nbd
endif

include $(SPDK_ROOT_DIR)/mk/spdk.app.mk

install: $(APP)
	$(INSTALL_APP)

uninstall:
	$(UNINSTALL_APP)
@ -1,6 +1,34 @@
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2018 Intel Corporation.
/*-
 * BSD LICENSE
 *
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"
@ -10,6 +38,11 @@
#include "spdk/event.h"
#include "spdk/vhost.h"

/* TODO: this should be handled by configure */
#if defined(SPDK_CONFIG_VHOST) && !defined(__linux__)
#undef SPDK_CONFIG_VHOST
#endif

#ifdef SPDK_CONFIG_VHOST
#define SPDK_VHOST_OPTS "S:"
#else
@ -63,7 +96,7 @@ spdk_tgt_parse_arg(int ch, char *arg)
}

static void
spdk_tgt_started(void *arg1)
spdk_tgt_started(void *arg1, void *arg2)
{
	if (g_pid_path) {
		spdk_tgt_save_pid(g_pid_path);
@ -81,7 +114,7 @@ main(int argc, char **argv)
	struct spdk_app_opts opts = {};
	int rc;

	spdk_app_opts_init(&opts, sizeof(opts));
	spdk_app_opts_init(&opts);
	opts.name = "spdk_tgt";
	if ((rc = spdk_app_parse_args(argc, argv, &opts, g_spdk_tgt_get_opts_string,
				      NULL, spdk_tgt_parse_arg, spdk_tgt_usage)) !=
@ -89,7 +122,7 @@ main(int argc, char **argv)
		return rc;
	}

	rc = spdk_app_start(&opts, spdk_tgt_started, NULL);
	rc = spdk_app_start(&opts, spdk_tgt_started, NULL, NULL);
	spdk_app_fini();

	return rc;
app/spdk_top/.gitignore (vendored)
@ -1 +0,0 @@
spdk_top
@ -1,22 +0,0 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2015 Intel Corporation.
# All rights reserved.
#

SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk

APP = spdk_top

C_SRCS := spdk_top.c

SPDK_LIB_LIST = rpc
LIBS=-lpanel -lmenu -lncurses

include $(SPDK_ROOT_DIR)/mk/spdk.app.mk

install: $(APP)
	$(INSTALL_APP)

uninstall:
	$(UNINSTALL_APP)
@ -1,74 +0,0 @@
Contents
========

- Overview
- Installation
- Usage


Overview
========

This application provides SPDK live statistics regarding usage of cores,
threads, pollers, execution times, and relations between those. All data
is gathered from SPDK by calling the appropriate RPC calls. The application
consists of three selectable tabs providing statistics related to three
main topics:

- Threads
- Pollers
- Cores


Installation
============

spdk_top requires the Ncurses library (which can be installed by running
spdk/scripts/pkgdep.sh) and is compiled by default when SPDK compiles.


Usage
=====

To run spdk_top:

sudo spdk_top [options]

options:
 -r <path>  RPC listen address (optional, default: /var/tmp/spdk.sock)
 -h         show help message

The application consists of:
- Tabs list (on top)
- Statistics window (the main window, in the middle)
- Options window (below the statistics window)
- Page indicator / Error status

The tabs list shows the available tabs and highlights the currently selected
tab. The statistics window displays current statistics. Available statistics
depend on which tab is currently selected. All time and run counter
related statistics are relative - they show the elapsed time / number of runs
since the previous data refresh. The options window provides a list of hotkeys
for changing application settings. Available options are:

- [q] Quit - quit the application
- [1-3] TAB selection - select tab to be displayed
- [PgUp] Previous page - go to previous page
- [PgDown] Next page - go to next page
- [c] Columns - select which columns should be visible / hidden:
  Use the arrow up / down and space / enter keys to select which columns
  should be visible. Select 'CLOSE' to confirm changes and close
  the window.
- [s] Sorting - change data sorting:
  Use arrow up / down to select the column the data should be sorted by.
  Use the enter key to confirm or the esc key to exit without
  changing the current sorting scheme.
- [r] Refresh rate - change data refresh rate:
  Enter a new data refresh rate value. The refresh rate accepts values
  between 0 and 255 seconds. Use the enter key to apply or the escape key
  to cancel.

The page indicator shows the current data page. An error status can be
displayed in the bottom right corner of the screen when the application
encounters an error.
File diff suppressed because it is too large
@ -1,23 +1,51 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2015 Intel Corporation.
#
# BSD LICENSE
#
# Copyright (c) Intel Corporation.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#   * Neither the name of Intel Corporation nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk

APP = spdk_trace
SPDK_NO_LINK_ENV = 1

SPDK_LIB_LIST += json trace_parser
include $(SPDK_ROOT_DIR)/mk/spdk.app_cxx.mk

CXX_SRCS := trace.cpp

include $(SPDK_ROOT_DIR)/mk/spdk.app_cxx.mk
APP = spdk_trace

install: $(APP)
	$(INSTALL_APP)
all: $(APP)
	@:

uninstall:
	$(UNINSTALL_APP)
$(APP): $(OBJS) $(SPDK_LIBS)
	$(LINK_CXX)

clean:
	$(CLEAN_C) $(APP)

include $(SPDK_ROOT_DIR)/mk/spdk.deps.mk
@ -1,56 +1,90 @@
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation.
/*-
 * BSD LICENSE
 *
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"
#include "spdk/env.h"
#include "spdk/json.h"
#include "spdk/likely.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include <map>

extern "C" {
#include "spdk/trace_parser.h"
#include "spdk/trace.h"
#include "spdk/util.h"
}

static struct spdk_trace_parser *g_parser;
static const struct spdk_trace_flags *g_flags;
static struct spdk_json_write_ctx *g_json;
static bool g_print_tsc = false;

/* This is a bit ugly, but we don't want to include env_dpdk in the app, while spdk_util, which we
 * do need, uses some of the functions implemented there. We're not actually using the functions
 * that depend on those, so just define them as no-ops to allow the app to link.
 */
extern "C" {
void *
spdk_realloc(void *buf, size_t size, size_t align)
{
	assert(false);

	return NULL;
}

void
spdk_free(void *buf)
{
	assert(false);
}

uint64_t
spdk_get_ticks(void)
{
	return 0;
}
} /* extern "C" */
static struct spdk_trace_histories *g_histories;

static void usage(void);

struct entry_key {
	entry_key(uint16_t _lcore, uint64_t _tsc) : lcore(_lcore), tsc(_tsc) {}
	uint16_t lcore;
	uint64_t tsc;
};

class compare_entry_key
{
public:
	bool operator()(const entry_key &first, const entry_key &second) const
	{
		if (first.tsc == second.tsc) {
			return first.lcore < second.lcore;
		} else {
			return first.tsc < second.tsc;
		}
	}
};

typedef std::map<entry_key, spdk_trace_entry *, compare_entry_key> entry_map;

entry_map g_entry_map;

struct object_stats {

	std::map<uint64_t, uint64_t> start;
	std::map<uint64_t, uint64_t> index;
	std::map<uint64_t, uint64_t> size;
	std::map<uint64_t, uint64_t> tpoint_id;
	uint64_t counter;

	object_stats() : start(), index(), size(), tpoint_id(), counter(0) {}
};

struct object_stats g_stats[SPDK_TRACE_MAX_OBJECT];

static char *g_exe_name;
static int g_verbose = 1;

static uint64_t g_tsc_rate;
static uint64_t g_first_tsc = 0x0;

static float
get_us_from_tsc(uint64_t tsc, uint64_t tsc_rate)
@ -58,19 +92,10 @@ get_us_from_tsc(uint64_t tsc, uint64_t tsc_rate)
	return ((float)tsc) * 1000 * 1000 / tsc_rate;
}

static const char *
format_argname(const char *name)
{
	static char namebuf[16];

	snprintf(namebuf, sizeof(namebuf), "%s: ", name);
	return namebuf;
}

static void
print_ptr(const char *arg_string, uint64_t arg)
{
	printf("%-7.7s0x%-14jx ", format_argname(arg_string), arg);
	printf("%-7.7s0x%-14jx ", arg_string, arg);
}

static void
@ -81,13 +106,7 @@ print_uint64(const char *arg_string, uint64_t arg)
	 * for FLUSH WRITEBUF when writev() returns -1 due to full
	 * socket buffer.
	 */
	printf("%-7.7s%-16jd ", format_argname(arg_string), arg);
}

static void
print_string(const char *arg_string, const char *arg)
{
	printf("%-7.7s%-16.16s ", format_argname(arg_string), arg);
	printf("%-7.7s%-16jd ", arg_string, arg);
}

static void
@ -101,46 +120,55 @@ print_size(uint32_t size)
}

static void
print_object_id(const struct spdk_trace_tpoint *d, struct spdk_trace_parser_entry *entry)
print_object_id(uint8_t type, uint64_t id)
{
	/* Set size to 128 and 256 bytes to make sure we can fit all the characters we need */
	char related_id[128] = {'\0'};
	char ids[256] = {'\0'};

	if (entry->related_type != OBJECT_NONE) {
		snprintf(related_id, sizeof(related_id), " (%c%jd)",
			 g_flags->object[entry->related_type].id_prefix,
			 entry->related_index);
	}

	snprintf(ids, sizeof(ids), "%c%jd%s", g_flags->object[d->object_type].id_prefix,
		 entry->object_index, related_id);
	printf("id: %-17s", ids);
	printf("id: %c%-15jd ", g_histories->flags.object[type].id_prefix, id);
}

static void
print_float(const char *arg_string, float arg)
{
	printf("%-7s%-16.3f ", format_argname(arg_string), arg);
	printf("%-7s%-16.3f ", arg_string, arg);
}

static void
print_event(struct spdk_trace_parser_entry *entry, uint64_t tsc_rate, uint64_t tsc_offset)
print_arg(bool arg_is_ptr, const char *arg_string, uint64_t arg)
{
	struct spdk_trace_entry *e = entry->entry;
	const struct spdk_trace_tpoint *d;
	float us;
	size_t i;
	if (arg_string[0] == 0) {
		printf("%24s", "");
		return;
	}

	if (arg_is_ptr) {
		print_ptr(arg_string, arg);
	} else {
		print_uint64(arg_string, arg);
	}
}

static void
print_event(struct spdk_trace_entry *e, uint64_t tsc_rate,
	    uint64_t tsc_offset, uint16_t lcore)
{
	struct spdk_trace_tpoint *d;
	struct object_stats *stats;
	float us;

	d = &g_histories->flags.tpoint[e->tpoint_id];
	stats = &g_stats[d->object_type];

	if (d->new_object) {
		stats->index[e->object_id] = stats->counter++;
		stats->tpoint_id[e->object_id] = e->tpoint_id;
		stats->start[e->object_id] = e->tsc;
		stats->size[e->object_id] = e->size;
	}

	d = &g_flags->tpoint[e->tpoint_id];
	us = get_us_from_tsc(e->tsc - tsc_offset, tsc_rate);

	printf("%2d: %10.3f ", entry->lcore, us);
	if (g_print_tsc) {
		printf("(%9ju) ", e->tsc - tsc_offset);
	}
	if (g_flags->owner[d->owner_type].id_prefix) {
		printf("%c%02d ", g_flags->owner[d->owner_type].id_prefix, e->poller_id);
	printf("%2d: %10.3f (%9ju) ", lcore, us, e->tsc - tsc_offset);
	if (g_histories->flags.owner[d->owner_type].id_prefix) {
		printf("%c%02d ", g_histories->flags.owner[d->owner_type].id_prefix, e->poller_id);
	} else {
		printf("%4s", " ");
	}
@ -148,183 +176,101 @@ print_event(struct spdk_trace_parser_entry *entry, uint64_t tsc_rate, uint64_t t
	printf("%-*s ", (int)sizeof(d->name), d->name);
	print_size(e->size);

	print_arg(d->arg1_is_ptr, d->arg1_name, e->arg1);
	if (d->new_object) {
		print_object_id(d, entry);
		print_object_id(d->object_type, stats->index[e->object_id]);
	} else if (d->object_type != OBJECT_NONE) {
		if (entry->object_index != UINT64_MAX) {
			us = get_us_from_tsc(e->tsc - entry->object_start, tsc_rate);
			print_object_id(d, entry);
			print_float("time", us);
		if (stats->start.find(e->object_id) != stats->start.end()) {
			struct spdk_trace_tpoint *start_description;

			us = get_us_from_tsc(e->tsc - stats->start[e->object_id],
					     tsc_rate);
			print_object_id(d->object_type, stats->index[e->object_id]);
			print_float("time:", us);
			start_description = &g_histories->flags.tpoint[stats->tpoint_id[e->object_id]];
			if (start_description->short_name[0] != 0) {
				printf(" (%.4s)", start_description->short_name);
			}
		} else {
			printf("id: N/A");
		}
	} else if (e->object_id != 0) {
		print_ptr("object", e->object_id);
	}

	for (i = 0; i < d->num_args; ++i) {
		switch (d->args[i].type) {
		case SPDK_TRACE_ARG_TYPE_PTR:
			print_ptr(d->args[i].name, (uint64_t)entry->args[i].pointer);
			break;
		case SPDK_TRACE_ARG_TYPE_INT:
			print_uint64(d->args[i].name, entry->args[i].integer);
			break;
		case SPDK_TRACE_ARG_TYPE_STR:
			print_string(d->args[i].name, entry->args[i].string);
			break;
		}
		print_arg(true, "object: ", e->object_id);
	}
	printf("\n");
}

static void
print_event_json(struct spdk_trace_parser_entry *entry, uint64_t tsc_rate, uint64_t tsc_offset)
process_event(struct spdk_trace_entry *e, uint64_t tsc_rate,
	      uint64_t tsc_offset, uint16_t lcore)
{
	struct spdk_trace_entry *e = entry->entry;
	const struct spdk_trace_tpoint *d;
	size_t i;

	d = &g_flags->tpoint[e->tpoint_id];

	spdk_json_write_object_begin(g_json);
	spdk_json_write_named_uint64(g_json, "lcore", entry->lcore);
	spdk_json_write_named_uint64(g_json, "tpoint", e->tpoint_id);
	spdk_json_write_named_uint64(g_json, "tsc", e->tsc);

	if (g_flags->owner[d->owner_type].id_prefix) {
		spdk_json_write_named_string_fmt(g_json, "poller", "%c%02d",
						 g_flags->owner[d->owner_type].id_prefix,
						 e->poller_id);
	if (g_verbose) {
		print_event(e, tsc_rate, tsc_offset, lcore);
	}
	if (e->size != 0) {
		spdk_json_write_named_uint32(g_json, "size", e->size);
	}
	if (d->new_object || d->object_type != OBJECT_NONE || e->object_id != 0) {
		char object_type;

		spdk_json_write_named_object_begin(g_json, "object");
		if (d->new_object) {
			object_type = g_flags->object[d->object_type].id_prefix;
			spdk_json_write_named_string_fmt(g_json, "id", "%c%" PRIu64, object_type,
							 entry->object_index);
		} else if (d->object_type != OBJECT_NONE) {
			object_type = g_flags->object[d->object_type].id_prefix;
			if (entry->object_index != UINT64_MAX) {
				spdk_json_write_named_string_fmt(g_json, "id", "%c%" PRIu64,
								 object_type,
								 entry->object_index);
				spdk_json_write_named_uint64(g_json, "time",
							     e->tsc - entry->object_start);
			}
		}
		spdk_json_write_named_uint64(g_json, "value", e->object_id);
		spdk_json_write_object_end(g_json);
	}

	/* Print related objects array */
	if (entry->related_index != UINT64_MAX) {
		spdk_json_write_named_string_fmt(g_json, "related", "%c%" PRIu64,
						 g_flags->object[entry->related_type].id_prefix,
						 entry->related_index);
	}

	if (d->num_args > 0) {
		spdk_json_write_named_array_begin(g_json, "args");
		for (i = 0; i < d->num_args; ++i) {
			switch (d->args[i].type) {
			case SPDK_TRACE_ARG_TYPE_PTR:
				spdk_json_write_uint64(g_json, (uint64_t)entry->args[i].pointer);
				break;
			case SPDK_TRACE_ARG_TYPE_INT:
				spdk_json_write_uint64(g_json, entry->args[i].integer);
				break;
			case SPDK_TRACE_ARG_TYPE_STR:
				spdk_json_write_string(g_json, entry->args[i].string);
				break;
			}
		}
		spdk_json_write_array_end(g_json);
	}

	spdk_json_write_object_end(g_json);
}

static void
process_event(struct spdk_trace_parser_entry *e, uint64_t tsc_rate, uint64_t tsc_offset)
{
	if (g_json == NULL) {
		print_event(e, tsc_rate, tsc_offset);
	} else {
		print_event_json(e, tsc_rate, tsc_offset);
	}
}

static void
print_tpoint_definitions(void)
{
	const struct spdk_trace_tpoint *tpoint;
	size_t i, j;

	/* We only care about these when printing JSON */
	if (!g_json) {
		return;
	}

	spdk_json_write_named_uint64(g_json, "tsc_rate", g_flags->tsc_rate);
	spdk_json_write_named_array_begin(g_json, "tpoints");

	for (i = 0; i < SPDK_COUNTOF(g_flags->tpoint); ++i) {
		tpoint = &g_flags->tpoint[i];
		if (tpoint->tpoint_id == 0) {
			continue;
		}

		spdk_json_write_object_begin(g_json);
		spdk_json_write_named_string(g_json, "name", tpoint->name);
		spdk_json_write_named_uint32(g_json, "id", tpoint->tpoint_id);
		spdk_json_write_named_bool(g_json, "new_object", tpoint->new_object);

		spdk_json_write_named_array_begin(g_json, "args");
		for (j = 0; j < tpoint->num_args; ++j) {
			spdk_json_write_object_begin(g_json);
			spdk_json_write_named_string(g_json, "name", tpoint->args[j].name);
			spdk_json_write_named_uint32(g_json, "type", tpoint->args[j].type);
			spdk_json_write_named_uint32(g_json, "size", tpoint->args[j].size);
			spdk_json_write_object_end(g_json);
		}
		spdk_json_write_array_end(g_json);
		spdk_json_write_object_end(g_json);
	}

	spdk_json_write_array_end(g_json);
}

static int
print_json(void *cb_ctx, const void *data, size_t size)
populate_events(struct spdk_trace_history *history, int num_entries)
{
	ssize_t rc;
	int i, num_entries_filled;
	struct spdk_trace_entry *e;
	int first, last, lcore;

	while (size > 0) {
		rc = write(STDOUT_FILENO, data, size);
		if (rc < 0) {
			fprintf(stderr, "%s: %s\n", g_exe_name, spdk_strerror(errno));
			abort();
		}
	lcore = history->lcore;

		size -= rc;
	e = history->entries;

	num_entries_filled = num_entries;
	while (e[num_entries_filled - 1].tsc == 0) {
		num_entries_filled--;
	}

	return 0;
	if (num_entries == num_entries_filled) {
		first = last = 0;
		for (i = 1; i < num_entries; i++) {
			if (e[i].tsc < e[first].tsc) {
				first = i;
			}
			if (e[i].tsc > e[last].tsc) {
				last = i;
			}
		}
	} else {
		first = 0;
		last = num_entries_filled - 1;
	}

	/*
	 * We keep track of the highest first TSC out of all reactors.
	 * We will ignore any events that occurred before this TSC on any
	 * other reactors. This will ensure we only print data for the
	 * subset of time where we have data across all reactors.
	 */
	if (e[first].tsc > g_first_tsc) {
		g_first_tsc = e[first].tsc;
	}

	i = first;
	while (1) {
		g_entry_map[entry_key(lcore, e[i].tsc)] = &e[i];
		if (i == last) {
			break;
		}
		i++;
		if (i == num_entries_filled) {
			i = 0;
		}
	}

	return (0);
}

static void
usage(void)
static void usage(void)
{
	fprintf(stderr, "usage:\n");
	fprintf(stderr, "   %s <option> <lcore#>\n", g_exe_name);
	fprintf(stderr, "        option = '-q' to disable verbose mode\n");
	fprintf(stderr, "                 '-c' to display single lcore history\n");
	fprintf(stderr, "                 '-t' to display TSC offset for each event\n");
	fprintf(stderr, "                 '-s' to specify spdk_trace shm name for a\n");
	fprintf(stderr, "                      currently running process\n");
	fprintf(stderr, "                 '-i' to specify the shared memory ID\n");
@ -333,25 +279,26 @@ usage(void)
	fprintf(stderr, "                      -i or -p must be specified)\n");
	fprintf(stderr, "                 '-f' to specify a tracepoint file name\n");
	fprintf(stderr, "                      (-s and -f are mutually exclusive)\n");
	fprintf(stderr, "                 '-j' to use JSON to format the output\n");
}

int
main(int argc, char **argv)
int main(int argc, char **argv)
{
	struct spdk_trace_parser_opts opts;
	struct spdk_trace_parser_entry entry;
	int lcore = SPDK_TRACE_MAX_LCORE;
	uint64_t tsc_offset, entry_count;
	const char *app_name = NULL;
	const char *file_name = NULL;
	int op, i;
	char shm_name[64];
	int shm_id = -1, shm_pid = -1;
	bool json = false;
	void *history_ptr;
	struct spdk_trace_history *history;
	struct spdk_trace_histories *histories;
	int fd, i, rc;
	int lcore = SPDK_TRACE_MAX_LCORE;
	uint64_t tsc_offset;
	const char *app_name = NULL;
	const char *file_name = NULL;
	int op;
	char shm_name[64];
	int shm_id = -1, shm_pid = -1;
	uint64_t trace_histories_size;
	struct stat _stat;

	g_exe_name = argv[0];
	while ((op = getopt(argc, argv, "c:f:i:jp:s:t")) != -1) {
	while ((op = getopt(argc, argv, "c:f:i:p:qs:")) != -1) {
		switch (op) {
		case 'c':
			lcore = atoi(optarg);
@ -368,18 +315,15 @@ main(int argc, char **argv)
		case 'p':
			shm_pid = atoi(optarg);
			break;
		case 'q':
			g_verbose = 0;
			break;
		case 's':
			app_name = optarg;
			break;
		case 'f':
			file_name = optarg;
			break;
		case 't':
			g_print_tsc = true;
			break;
		case 'j':
			json = true;
			break;
		default:
			usage();
			exit(1);
@ -398,65 +342,117 @@ main(int argc, char **argv)
		exit(1);
	}

	if (json) {
		g_json = spdk_json_write_begin(print_json, NULL, 0);
		if (g_json == NULL) {
			fprintf(stderr, "Failed to allocate JSON write context\n");
			exit(1);
		}
	}

	if (!file_name) {
	if (file_name) {
		fd = open(file_name, O_RDONLY);
	} else {
		if (shm_id >= 0) {
			snprintf(shm_name, sizeof(shm_name), "/%s_trace.%d", app_name, shm_id);
		} else {
			snprintf(shm_name, sizeof(shm_name), "/%s_trace.pid%d", app_name, shm_pid);
		}
		fd = shm_open(shm_name, O_RDONLY, 0600);
		file_name = shm_name;
	}

	opts.filename = file_name;
	opts.lcore = lcore;
	opts.mode = app_name == NULL ? SPDK_TRACE_PARSER_MODE_FILE : SPDK_TRACE_PARSER_MODE_SHM;
	g_parser = spdk_trace_parser_init(&opts);
	if (g_parser == NULL) {
		fprintf(stderr, "Failed to initialize trace parser\n");
		exit(1);
	if (fd < 0) {
		fprintf(stderr, "Could not open %s.\n", file_name);
		usage();
		exit(-1);
	}

	g_flags = spdk_trace_parser_get_flags(g_parser);
	if (!g_json) {
		printf("TSC Rate: %ju\n", g_flags->tsc_rate);
	} else {
		spdk_json_write_object_begin(g_json);
		print_tpoint_definitions();
		spdk_json_write_named_array_begin(g_json, "entries");
	rc = fstat(fd, &_stat);
	if (rc < 0) {
		fprintf(stderr, "Could not get size of %s.\n", file_name);
		usage();
		exit(-1);
	}
	if ((size_t)_stat.st_size < sizeof(*g_histories)) {
		fprintf(stderr, "%s is not a valid trace file\n", file_name);
		usage();
		exit(-1);
	}

	for (i = 0; i < SPDK_TRACE_MAX_LCORE; ++i) {
		if (lcore == SPDK_TRACE_MAX_LCORE || i == lcore) {
			entry_count = spdk_trace_parser_get_entry_count(g_parser, i);
			if (entry_count > 0) {
				printf("Trace Size of lcore (%d): %ju\n", i, entry_count);
	/* Map the header of trace file */
	history_ptr = mmap(NULL, sizeof(*g_histories), PROT_READ, MAP_SHARED, fd, 0);
	if (history_ptr == MAP_FAILED) {
		fprintf(stderr, "Could not mmap %s.\n", file_name);
		usage();
		exit(-1);
	}

	g_histories = (struct spdk_trace_histories *)history_ptr;

	g_tsc_rate = g_histories->flags.tsc_rate;
	if (g_tsc_rate == 0) {
		fprintf(stderr, "Invalid tsc_rate %ju\n", g_tsc_rate);
		usage();
		exit(-1);
	}

	if (g_verbose) {
		printf("TSC Rate: %ju\n", g_tsc_rate);
	}

	/* Remap the entire trace file */
	trace_histories_size = spdk_get_trace_histories_size(g_histories);
	munmap(history_ptr, sizeof(*g_histories));
	if ((size_t)_stat.st_size < trace_histories_size) {
		fprintf(stderr, "%s is not a valid trace file\n", file_name);
		usage();
		exit(-1);
	}
	history_ptr = mmap(NULL, trace_histories_size, PROT_READ, MAP_SHARED, fd, 0);
	if (history_ptr == MAP_FAILED) {
		fprintf(stderr, "Could not mmap %s.\n", file_name);
		usage();
		exit(-1);
	}

	g_histories = (struct spdk_trace_histories *)history_ptr;

	histories = (struct spdk_trace_histories *)malloc(trace_histories_size);
	if (histories == NULL) {
		goto cleanup;
	}

	memcpy(histories, g_histories, trace_histories_size);

	if (lcore == SPDK_TRACE_MAX_LCORE) {
		for (i = 0; i < SPDK_TRACE_MAX_LCORE; i++) {
			history = spdk_get_per_lcore_history(histories, i);
			if (history->entries[0].tsc == 0) {
				continue;
			}

			if (g_verbose && history->num_entries) {
				printf("Trace Size of lcore (%d): %ju\n", i, history->num_entries);
			}

			populate_events(history, history->num_entries);
		}
	} else {
		history = spdk_get_per_lcore_history(histories, lcore);
		if (history->entries[0].tsc != 0) {
			if (g_verbose && history->num_entries) {
				printf("Trace Size of lcore (%d): %ju\n", lcore, history->num_entries);
			}

			populate_events(history, history->num_entries);
		}
	}

	tsc_offset = spdk_trace_parser_get_tsc_offset(g_parser);
	while (spdk_trace_parser_next_entry(g_parser, &entry)) {
		if (entry.entry->tsc < tsc_offset) {
	tsc_offset = g_first_tsc;
	for (entry_map::iterator it = g_entry_map.begin(); it != g_entry_map.end(); it++) {
		if (it->first.tsc < g_first_tsc) {
			continue;
		}
		process_event(&entry, g_flags->tsc_rate, tsc_offset);
		process_event(it->second, g_tsc_rate, tsc_offset, it->first.lcore);
	}

	if (g_json != NULL) {
		spdk_json_write_array_end(g_json);
		spdk_json_write_object_end(g_json);
		spdk_json_write_end(g_json);
	}
	free(histories);

	spdk_trace_parser_cleanup(g_parser);
cleanup:
	munmap(history_ptr, trace_histories_size);
	close(fd);

	return (0);
}
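populate_events() above unwraps each lcore's circular trace buffer: it first trims never-written slots (tsc == 0), then locates the oldest and newest entries by TSC before feeding everything into the TSC-ordered entry map. A standalone sketch of just that unwrap step, with a simplified entry type standing in for spdk_trace_entry (both names here are placeholders, not SPDK API):

#include <stdint.h>

/* Stand-in for spdk_trace_entry; only the timestamp matters here. */
struct entry {
	uint64_t tsc;
};

/* Returns the index of the oldest valid entry in a circular buffer;
 * *last_out receives the newest. A tsc of 0 marks a slot that was
 * never written, so trailing zeros mean the buffer never wrapped. */
static int
find_oldest(const struct entry *e, int num, int *last_out)
{
	int i, first = 0, last = 0;

	while (num > 0 && e[num - 1].tsc == 0) {
		num--; /* drop never-written trailing slots */
	}
	for (i = 1; i < num; i++) {
		if (e[i].tsc < e[first].tsc) {
			first = i; /* wrap point: oldest entry */
		}
		if (e[i].tsc > e[last].tsc) {
			last = i; /* most recently written entry */
		}
	}
	*last_out = last;
	return first;
}

Iterating from first to last with wrap-around, as the loop in populate_events() does, then yields the entries in the order they were recorded.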
@ -1,7 +1,35 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2015 Intel Corporation.
#
# BSD LICENSE
#
# Copyright (c) Intel Corporation.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   * Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in
#     the documentation and/or other materials provided with the
#     distribution.
#   * Neither the name of Intel Corporation nor the names of its
#     contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
@ -13,9 +41,3 @@ APP = spdk_trace_record
C_SRCS := trace_record.c

include $(SPDK_ROOT_DIR)/mk/spdk.app.mk

install: $(APP)
	$(INSTALL_APP)

uninstall:
	$(UNINSTALL_APP)
@ -1,6 +1,34 @@
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2018 Intel Corporation.
/*-
 * BSD LICENSE
 *
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"
@ -24,7 +52,6 @@ static uint64_t g_histories_size;
struct lcore_trace_record_ctx {
	char lcore_file[TRACE_PATH_MAX];
	int fd;
	bool valid;
	struct spdk_trace_history *in_history;
	struct spdk_trace_history *out_history;

@ -95,15 +122,11 @@ input_trace_file_mmap(struct aggr_trace_record_ctx *ctx, const char *shm_name)

	ctx->trace_histories = (struct spdk_trace_histories *)history_ptr;
	for (i = 0; i < SPDK_TRACE_MAX_LCORE; i++) {
		struct spdk_trace_history *history;
		ctx->lcore_ports[i].in_history = spdk_get_per_lcore_history(ctx->trace_histories, i);

		history = spdk_get_per_lcore_history(ctx->trace_histories, i);
		ctx->lcore_ports[i].in_history = history;
		ctx->lcore_ports[i].valid = (history != NULL);

		if (g_verbose && history) {
		if (g_verbose) {
			printf("Number of trace entries for lcore (%d): %ju\n", i,
			       history->num_entries);
			       ctx->lcore_ports[i].in_history->num_entries);
		}
	}

@ -154,10 +177,6 @@ output_trace_files_prepare(struct aggr_trace_record_ctx *ctx, const char *aggr_p
	for (i = 0; i < SPDK_TRACE_MAX_LCORE; i++) {
		port_ctx = &ctx->lcore_ports[i];

		if (!port_ctx->valid) {
			continue;
		}

		port_ctx->fd = open(port_ctx->lcore_file, flags, 0600);
		if (port_ctx->fd < 0) {
			fprintf(stderr, "Could not open lcore file %s.\n", port_ctx->lcore_file);
@ -441,7 +460,6 @@ trace_files_aggregate(struct aggr_trace_record_ctx *ctx)
	uint64_t lcore_offsets[SPDK_TRACE_MAX_LCORE + 1];
	int rc, i;
	ssize_t len = 0;
	uint64_t current_offset;
	uint64_t len_sum;

	ctx->out_fd = open(ctx->out_file, flags, 0600);
@ -463,17 +481,11 @@ trace_files_aggregate(struct aggr_trace_record_ctx *ctx)
	}

	/* Update and append lcore offsets converged trace file */
	current_offset = sizeof(struct spdk_trace_flags);
	for (i = 0; i < SPDK_TRACE_MAX_LCORE; i++) {
		lcore_port = &ctx->lcore_ports[i];
		if (lcore_port->valid) {
			lcore_offsets[i] = current_offset;
			current_offset += spdk_get_trace_history_size(lcore_port->num_entries);
		} else {
			lcore_offsets[i] = 0;
		}
	lcore_offsets[0] = sizeof(struct spdk_trace_flags);
	for (i = 1; i < (int)SPDK_COUNTOF(lcore_offsets); i++) {
		lcore_offsets[i] = spdk_get_trace_history_size(ctx->lcore_ports[i - 1].num_entries) +
				   lcore_offsets[i - 1];
	}
	lcore_offsets[SPDK_TRACE_MAX_LCORE] = current_offset;

	rc = cont_write(ctx->out_fd, lcore_offsets, sizeof(lcore_offsets));
	if (rc < 0) {
@ -485,10 +497,6 @@ trace_files_aggregate(struct aggr_trace_record_ctx *ctx)
	for (i = 0; i < SPDK_TRACE_MAX_LCORE; i++) {
		lcore_port = &ctx->lcore_ports[i];

		if (!lcore_port->valid) {
			continue;
		}

		lcore_port->out_history->num_entries = lcore_port->num_entries;
		rc = cont_write(ctx->out_fd, lcore_port->out_history, sizeof(struct spdk_trace_history));
		if (rc < 0) {
@ -513,9 +521,6 @@ trace_files_aggregate(struct aggr_trace_record_ctx *ctx)
		}
	}

	/* Clear rc so that the last cont_write() doesn't get interpreted as a failure. */
	rc = 0;

	if (len_sum != lcore_port->num_entries * sizeof(struct spdk_trace_entry)) {
		fprintf(stderr, "Len of lcore trace file doesn't match number of entries for lcore\n");
	}
@ -561,8 +566,7 @@ setup_exit_signal_handler(void)
	return rc;
}

static void
usage(void)
static void usage(void)
{
	printf("\n%s is used to record all SPDK generated trace entries\n", g_exe_name);
	printf("from SPDK trace shared-memory to specified file.\n\n");
@ -578,8 +582,7 @@ usage(void)
	printf("   '-h' to print usage information\n");
}

int
main(int argc, char **argv)
int main(int argc, char **argv)
{
	const char *app_name = NULL;
	const char *file_name = NULL;
@ -610,8 +613,6 @@ main(int argc, char **argv)
			file_name = optarg;
			break;
		case 'h':
			usage();
			exit(EXIT_SUCCESS);
		default:
			usage();
			exit(1);
@ -662,9 +663,6 @@ main(int argc, char **argv)
	for (i = 0; i < SPDK_TRACE_MAX_LCORE; i++) {
		lcore_port = &ctx.lcore_ports[i];

		if (!lcore_port->valid) {
			continue;
		}
		rc = lcore_trace_record(lcore_port);
		if (rc) {
			break;
@@ -1,7 +1,35 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2017 Intel Corporation.
#
# BSD LICENSE
#
# Copyright (c) Intel Corporation.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Intel Corporation nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
@@ -11,16 +39,11 @@ APP = vhost

C_SRCS := vhost.c

SPDK_LIB_LIST = $(ALL_MODULES_LIST) event event_vhost_blk event_vhost_scsi event_nbd

ifeq ($(SPDK_ROOT_DIR)/lib/env_dpdk,$(CONFIG_ENV))
SPDK_LIB_LIST += env_dpdk_rpc
endif
SPDK_LIB_LIST = $(ALL_MODULES_LIST)
SPDK_LIB_LIST += vhost rte_vhost event_vhost
SPDK_LIB_LIST += event_bdev event_copy event_net event_scsi event
SPDK_LIB_LIST += jsonrpc json rpc bdev_rpc bdev scsi copy trace conf
SPDK_LIB_LIST += thread util log log_rpc trace_rpc app_rpc
SPDK_LIB_LIST += event_nbd nbd net sock

include $(SPDK_ROOT_DIR)/mk/spdk.app.mk

install: $(APP)
$(INSTALL_APP)

uninstall:
$(UNINSTALL_APP)

@@ -1,10 +1,39 @@
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (C) 2017 Intel Corporation.
/*-
* BSD LICENSE
*
* Copyright (c) Intel Corporation.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "spdk/stdinc.h"

#include "spdk/conf.h"
#include "spdk/event.h"

#include "spdk/vhost.h"
@@ -50,7 +79,7 @@ vhost_parse_arg(int ch, char *arg)
}

static void
vhost_started(void *arg1)
vhost_started(void *arg1, void *arg2)
{
}

@@ -60,7 +89,7 @@ main(int argc, char *argv[])
struct spdk_app_opts opts = {};
int rc;

spdk_app_opts_init(&opts, sizeof(opts));
spdk_app_opts_init(&opts);
opts.name = "vhost";

if ((rc = spdk_app_parse_args(argc, argv, &opts, "f:S:", NULL,
@@ -74,7 +103,7 @@ main(int argc, char *argv[])
}

/* Blocks until the application is exiting */
rc = spdk_app_start(&opts, vhost_started, NULL);
rc = spdk_app_start(&opts, vhost_started, NULL, NULL);

spdk_app_fini();

autobuild.sh (180 changes)

@@ -1,70 +1,152 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2015 Intel Corporation
# All rights reserved.
#

set -e

rootdir=$(readlink -f $(dirname $0))
source "$rootdir/test/common/autotest_common.sh"

source "$rootdir/test/common/autobuild_common.sh"
out=$PWD

SPDK_TEST_AUTOBUILD=${SPDK_TEST_AUTOBUILD:-}
umask 022

cd $rootdir

# Print some test system info out for the log
date -u
git describe --tags

if [ "$SPDK_TEST_OCF" -eq 1 ]; then
# We compile OCF sources ourselves
# They don't need to be checked with scanbuild and code coverage is not applicable
# So we precompile OCF now for further use as standalone static library
./configure $(echo $config_params | sed 's/--enable-coverage//g')
$MAKE $MAKEFLAGS include/spdk/config.h
CC=gcc CCAR=ar $MAKE $MAKEFLAGS -C lib/bdev/ocf/env exportlib O=$rootdir/build/ocf.a
# Set config to use precompiled library
config_params="$config_params --with-ocf=/$rootdir/build/ocf.a"
fi

./configure $config_params

# Print some test system info out for the log
echo "** START ** Info for Hostname: $HOSTNAME"
uname -a
$MAKE cc_version
$MAKE cxx_version
echo "** END ** Info for Hostname: $HOSTNAME"

timing_enter autobuild

timing_enter check_format
if [ $SPDK_RUN_CHECK_FORMAT -eq 1 ]; then
./scripts/check_format.sh
fi
timing_exit check_format

scanbuild=''
make_timing_label='make'
if [ $SPDK_RUN_SCANBUILD -eq 1 ] && hash scan-build; then
scanbuild="scan-build -o $out/scan-build-tmp --status-bugs"
make_timing_label='scanbuild_make'
report_test_completion "scanbuild"

fi

if [ $SPDK_RUN_VALGRIND -eq 1 ]; then
report_test_completion "valgrind"
fi

if [ $SPDK_RUN_ASAN -eq 1 ]; then
run_test "asan" echo "using asan"
report_test_completion "asan"
fi

if [ $SPDK_RUN_UBSAN -eq 1 ]; then
run_test "ubsan" echo "using ubsan"
report_test_completion "ubsan"
fi

if [ -n "$SPDK_TEST_NATIVE_DPDK" ]; then
build_native_dpdk
echo $scanbuild

timing_enter "$make_timing_label"

$MAKE $MAKEFLAGS clean
if [ $SPDK_BUILD_SHARED_OBJECT -eq 1 ]; then
./configure $config_params --with-shared
$MAKE $MAKEFLAGS
$MAKE $MAKEFLAGS clean
report_test_completion "shared_object_build"
fi

case "$SPDK_TEST_AUTOBUILD" in
full)
$rootdir/configure $config_params
echo "** START ** Info for Hostname: $HOSTNAME"
uname -a
$MAKE cc_version
$MAKE cxx_version
echo "** END ** Info for Hostname: $HOSTNAME"
;;
ext | tiny | "") ;;
*)
echo "ERROR: supported values for SPDK_TEST_AUTOBUILD are 'full', 'tiny' and 'ext'"
exit 1
;;
esac

if [[ $SPDK_TEST_OCF -eq 1 ]]; then
ocf_precompile
fi

if [[ $SPDK_TEST_FUZZER -eq 1 ]]; then
llvm_precompile
fi

if [[ -n $SPDK_TEST_AUTOBUILD ]]; then
autobuild_test_suite
elif [[ $SPDK_TEST_UNITTEST -eq 1 ]]; then
unittest_build
elif [[ $SPDK_TEST_SCANBUILD -eq 1 ]]; then
scanbuild_make
else
if [[ $SPDK_TEST_FUZZER -eq 1 ]]; then
# if we are testing nvmf fuzz with the llvm lib, --with-shared will cause the lib link to fail
$rootdir/configure $config_params
else
# if we aren't testing the unittests, build with shared objects.
$rootdir/configure $config_params --with-shared
fail=0
./configure $config_params
time $scanbuild $MAKE $MAKEFLAGS || fail=1
if [ $fail -eq 1 ]; then
if [ -d $out/scan-build-tmp ]; then
scanoutput=$(ls -1 $out/scan-build-tmp/)
mv $out/scan-build-tmp/$scanoutput $out/scan-build
rm -rf $out/scan-build-tmp
chmod -R a+rX $out/scan-build
fi
run_test "make" $MAKE $MAKEFLAGS
exit 1
else
rm -rf $out/scan-build-tmp
fi
timing_exit "$make_timing_label"

# Check for generated files that are not listed in .gitignore
timing_enter generated_files_check
if [ `git status --porcelain --ignore-submodules | wc -l` -ne 0 ]; then
echo "Generated files missing from .gitignore:"
git status --porcelain --ignore-submodules
exit 1
fi
timing_exit generated_files_check

# Check that header file dependencies are working correctly by
# capturing a binary's stat data before and after touching a
# header file and re-making.
timing_enter dependency_check
STAT1=`stat examples/nvme/identify/identify`
sleep 1
touch lib/nvme/nvme_internal.h
$MAKE $MAKEFLAGS
STAT2=`stat examples/nvme/identify/identify`

if [ "$STAT1" == "$STAT2" ]; then
echo "Header dependency check failed"
exit 1
fi
timing_exit dependency_check

# Test 'make install'
timing_enter make_install
rm -rf /tmp/spdk
mkdir /tmp/spdk
$MAKE $MAKEFLAGS install DESTDIR=/tmp/spdk prefix=/usr
ls -lR /tmp/spdk
rm -rf /tmp/spdk
timing_exit make_install

timing_enter doxygen
if [ $SPDK_BUILD_DOC -eq 1 ] && hash doxygen; then
$MAKE -C "$rootdir"/doc --no-print-directory $MAKEFLAGS &> "$out"/doxygen.log
if [ -s "$out"/doxygen.log ]; then
cat "$out"/doxygen.log
echo "Doxygen errors found!"
exit 1
fi
if hash pdflatex 2>/dev/null; then
$MAKE -C "$rootdir"/doc/output/latex --no-print-directory $MAKEFLAGS &>> "$out"/doxygen.log
fi
mkdir -p "$out"/doc
mv "$rootdir"/doc/output/html "$out"/doc
if [ -f "$rootdir"/doc/output/latex/refman.pdf ]; then
mv "$rootdir"/doc/output/latex/refman.pdf "$out"/doc/spdk.pdf
fi
$MAKE -C "$rootdir"/doc --no-print-directory $MAKEFLAGS clean &>> "$out"/doxygen.log
if [ -s "$out"/doxygen.log ]; then
rm "$out"/doxygen.log
fi
rm -rf "$rootdir"/doc/output
fi
timing_exit doxygen

timing_exit autobuild
@@ -1,54 +1,72 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2015 Intel Corporation
# All rights reserved.
#

set -xe

rootdir=$(readlink -f $(dirname $0))
source "$rootdir/test/common/autobuild_common.sh"
source "$rootdir/test/common/autotest_common.sh"

out=$PWD

MAKEFLAGS=${MAKEFLAGS:--j16}
cd $rootdir

timing_enter porcelain_check
if [[ -e $rootdir/mk/config.mk ]]; then
$MAKE clean
fi
timing_enter autopackage

if [ $(git status --porcelain --ignore-submodules | wc -l) -ne 0 ]; then
$MAKE clean

if [ `git status --porcelain --ignore-submodules | wc -l` -ne 0 ]; then
echo make clean left the following files:
git status --porcelain --ignore-submodules
exit 1
fi
timing_exit porcelain_check

if [[ $SPDK_TEST_RELEASE_BUILD -eq 1 ]]; then
build_packaging
$MAKE clean
spdk_pv=spdk-$(date +%Y_%m_%d)
spdk_tarball=${spdk_pv}.tar
dpdk_pv=dpdk-$(date +%Y_%m_%d)
dpdk_tarball=${dpdk_pv}.tar
ipsec_pv=ipsec-$(date +%Y_%m_%d)
ipsec_tarball=${ipsec_pv}.tar
isal_pv=isal-$(date +%Y_%m_%d)
isal_tarball=${isal_pv}.tar

find . -iname "spdk-*.tar* dpdk-*.tar* ipsec-*.tar* isal-*.tar*" -delete
git archive HEAD^{tree} --prefix=${spdk_pv}/ -o ${spdk_tarball}

# Build from packaged source
tmpdir=$(mktemp -d)
echo "tmpdir=$tmpdir"
tar -C "$tmpdir" -xf $spdk_tarball

if [ -z "$WITH_DPDK_DIR" ]; then
cd dpdk
git archive HEAD^{tree} --prefix=dpdk/ -o ../${dpdk_tarball}
cd ..
tar -C "$tmpdir/${spdk_pv}" -xf $dpdk_tarball
fi

if [[ $RUN_NIGHTLY -eq 0 || $SPDK_TEST_UNITTEST -eq 0 ]]; then
timing_finish
exit 0
if [ -d "intel-ipsec-mb" ]; then
cd intel-ipsec-mb
git archive HEAD^{tree} --prefix=intel-ipsec-mb/ -o ../${ipsec_tarball}
cd ..
tar -C "$tmpdir/${spdk_pv}" -xf $ipsec_tarball
fi

timing_enter build_release

config_params="$(get_config_params | sed 's/--enable-debug//g')"
if [ $(uname -s) = Linux ]; then
# LTO needs a special compiler to work under clang. See detect_cc.sh for details.
if [[ $CC == *clang* ]]; then
LD=$(type -P ld.gold)
export LD
fi
$rootdir/configure $config_params --enable-lto
else
# LTO needs a special compiler to work on BSD.
$rootdir/configure $config_params
if [ -d "isa-l" ]; then
cd isa-l
git archive HEAD^{tree} --prefix=isa-l/ -o ../${isal_tarball}
cd ..
tar -C "$tmpdir/${spdk_pv}" -xf $isal_tarball
fi
$MAKE ${MAKEFLAGS}
$MAKE ${MAKEFLAGS} clean

timing_exit build_release
(
cd "$tmpdir"/spdk-*
# use $config_params to get the right dependency options, but disable coverage and ubsan
# explicitly since they are not needed for this build
./configure $config_params --disable-debug --enable-werror --disable-coverage --disable-ubsan
time $MAKE ${MAKEFLAGS}
)
rm -rf "$tmpdir"

timing_exit autopackage

timing_finish

autorun.sh (26 changes)

@@ -1,32 +1,12 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2016 Intel Corporation
# All rights reserved.
#

set -e

rootdir=$(readlink -f $(dirname $0))

default_conf=~/autorun-spdk.conf
conf=${1:-${default_conf}}

# If the configuration of tests is not provided, no tests will be carried out.
if [[ ! -f $conf ]]; then
echo "ERROR: $conf doesn't exist"
exit 1
fi
source "$conf"

echo "Test configuration:"
cat "$conf"
conf=~/autorun-spdk.conf

# Runs agent scripts
$rootdir/autobuild.sh "$conf"
if ((SPDK_TEST_UNITTEST == 1 || SPDK_RUN_FUNCTIONAL_TEST == 1)); then
sudo -E $rootdir/autotest.sh "$conf"
fi

if [[ $SPDK_TEST_AUTOBUILD != 'tiny' ]]; then
$rootdir/autopackage.sh "$conf"
fi
sudo WITH_DPDK_DIR="$WITH_DPDK_DIR" $rootdir/autotest.sh "$conf"
$rootdir/autopackage.sh "$conf"

autorun_post.py (280 changes)

@@ -1,105 +1,80 @@
#!/usr/bin/python3
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2017 Intel Corporation.
# All rights reserved.


import shutil
import subprocess
import argparse
import itertools
import os
import sys
import glob
import re
import pandas as pd


def generateTestCompletionTableByTest(output_dir, data_table):
columns_to_group = ['Domain', 'Test', 'Agent']

total_tests_number = len(data_table.groupby('Test'))

has_agent = data_table['Agent'] != 'None'
data_table_with_agent = data_table[has_agent]
executed_tests = len(data_table_with_agent.groupby('Test'))
tests_executions = len(data_table_with_agent.groupby(columns_to_group))

pivot_by_test = pd.pivot_table(data_table, index=columns_to_group)

output_file = os.path.join(output_dir, 'post_process', 'completions_table_by_test.html')
with open(output_file, 'w') as f:
table_row = '<tr><td>{}</td><td>{}</td>\n'
f.write('<table>\n')
f.write(table_row.format('Total number of tests', total_tests_number))
f.write(table_row.format('Tests executed', executed_tests))
f.write(table_row.format('Number of test executions', tests_executions))
f.write('</table>\n')
f.write(pivot_by_test.to_html(None))
def highest_value(inp):
ret_value = False
for x in inp:
if x:
return True
else:
return False


def generateTestCompletionTables(output_dir, completion_table):
data_table = pd.DataFrame(completion_table, columns=["Agent", "Domain", "Test", "With Asan", "With UBsan"])
data_table = pd.DataFrame(completion_table, columns=["Agent", "Test", "With Asan", "With UBsan"])
data_table.to_html(os.path.join(output_dir, 'completions_table.html'))
os.makedirs(os.path.join(output_dir, "post_process"), exist_ok=True)

pivot_by_agent = pd.pivot_table(data_table, index=["Agent", "Domain", "Test"])
pivot_by_agent = pd.pivot_table(data_table, index=["Agent", "Test"])
pivot_by_agent.to_html(os.path.join(output_dir, "post_process", 'completions_table_by_agent.html'))

generateTestCompletionTableByTest(output_dir, data_table)

pivot_by_asan = pd.pivot_table(data_table, index=["Domain", "Test"], values=["With Asan"], aggfunc=any)
pivot_by_test = pd.pivot_table(data_table, index=["Test", "Agent"])
pivot_by_test.to_html(os.path.join(output_dir, "post_process", 'completions_table_by_test.html'))
pivot_by_asan = pd.pivot_table(data_table, index=["Test"], values=["With Asan"], aggfunc=highest_value)
pivot_by_asan.to_html(os.path.join(output_dir, "post_process", 'completions_table_by_asan.html'))
pivot_by_ubsan = pd.pivot_table(data_table, index=["Domain", "Test"], values=["With UBsan"], aggfunc=any)
pivot_by_ubsan = pd.pivot_table(data_table, index=["Test"], values=["With UBsan"], aggfunc=highest_value)
pivot_by_ubsan.to_html(os.path.join(output_dir, "post_process", 'completions_table_by_ubsan.html'))


def generateCoverageReport(output_dir, repo_dir):
coveragePath = os.path.join(output_dir, '**', 'cov_total.info')
covfiles = [os.path.abspath(p) for p in glob.glob(coveragePath, recursive=True)]
for f in covfiles:
print(f)
if len(covfiles) == 0:
return
lcov_opts = [
'--rc', 'lcov_branch_coverage=1',
'--rc', 'lcov_function_coverage=1',
'--rc', 'genhtml_branch_coverage=1',
'--rc', 'genhtml_function_coverage=1',
'--rc', 'genhtml_legend=1',
'--rc', 'geninfo_all_blocks=1',
]

# HACK: This is a workaround for some odd CI assumptions
details = '--show-details'

cov_total = os.path.abspath(os.path.join(output_dir, 'cov_total.info'))
coverage = os.path.join(output_dir, 'coverage')
lcov = ['lcov', *lcov_opts, '-q', *itertools.chain(*[('-a', f) for f in covfiles]), '-o', cov_total]
genhtml = ['genhtml', *lcov_opts, '-q', cov_total, '--legend', '-t', 'Combined', *details.split(), '-o', coverage]
try:
subprocess.check_call(lcov)
except subprocess.CalledProcessError as e:
print("lcov failed")
print(e)
return

with open(cov_total, 'r') as cov_total_file:
with open(os.path.join(output_dir, 'coverage.log'), 'w+') as log_file:
coveragePath = os.path.join(output_dir, '**', 'cov_total.info')
covfiles = [os.path.abspath(p) for p in glob.glob(coveragePath, recursive=True)]
for f in covfiles:
print(f, file=log_file)
if len(covfiles) == 0:
return
lcov_opts = [
'--rc lcov_branch_coverage=1',
'--rc lcov_function_coverage=1',
'--rc genhtml_branch_coverage=1',
'--rc genhtml_function_coverage=1',
'--rc genhtml_legend=1',
'--rc geninfo_all_blocks=1',
]
cov_total = os.path.abspath(os.path.join(output_dir, 'cov_total.info'))
coverage = os.path.join(output_dir, 'coverage')
lcov = 'lcov' + ' ' + ' '.join(lcov_opts) + ' -q -a ' + ' -a '.join(covfiles) + ' -o ' + cov_total
genhtml = 'genhtml' + ' ' + ' '.join(lcov_opts) + ' -q ' + cov_total + ' --legend' + ' -t "Combined" --show-details -o ' + coverage
try:
subprocess.check_call([lcov], shell=True, stdout=log_file, stderr=log_file)
except subprocess.CalledProcessError as e:
print("lcov failed", file=log_file)
print(e, file=log_file)
return
cov_total_file = open(cov_total, 'r')
replacement = "SF:" + repo_dir
file_contents = cov_total_file.readlines()

replacement = "SF:" + repo_dir
os.remove(cov_total)
with open(cov_total, 'w+') as file:
for Line in file_contents:
Line = re.sub("^SF:.*/repo", replacement, Line)
file.write(Line + '\n')
try:
subprocess.check_call(genhtml)
except subprocess.CalledProcessError as e:
print("genhtml failed")
print(e)
for f in covfiles:
os.remove(f)
cov_total_file.close()
os.remove(cov_total)
with open(cov_total, 'w+') as file:
for Line in file_contents:
Line = re.sub("^SF:.*/repo", replacement, Line)
file.write(Line + '\n')
try:
subprocess.check_call([genhtml], shell=True, stdout=log_file, stderr=log_file)
except subprocess.CalledProcessError as e:
print("genhtml failed", file=log_file)
print(e, file=log_file)
for f in covfiles:
os.remove(f)


def collectOne(output_dir, dir_name):
@@ -117,98 +92,91 @@ def collectOne(output_dir, dir_name):
shutil.rmtree(d)


def getCompletions(completionFile, test_list, test_completion_table):
agent_name = os.path.basename(os.path.dirname(completionFile))
with open(completionFile, 'r') as completionList:
completions = completionList.read()

asan_enabled = "asan" in completions
ubsan_enabled = "ubsan" in completions

for line in completions.splitlines():
try:
domain, test_name = line.strip().split()
test_list[test_name] = (True, asan_enabled | test_list[test_name][1], ubsan_enabled | test_list[test_name][2])
test_completion_table.append([agent_name, domain, test_name, asan_enabled, ubsan_enabled])
try:
test_completion_table.remove(["None", "None", test_name, False, False])
except ValueError:
continue
except KeyError:
continue


def printList(header, test_list, index, condition):
print("\n\n-----%s------" % header)
executed_tests = [x for x in sorted(test_list) if test_list[x][index] is condition]
print(*executed_tests, sep="\n")


def printListInformation(table_type, test_list):
printList("%s Executed in Build" % table_type, test_list, 0, True)
printList("%s Missing From Build" % table_type, test_list, 0, False)
printList("%s Missing ASAN" % table_type, test_list, 1, False)
printList("%s Missing UBSAN" % table_type, test_list, 2, False)


def getSkippedTests(repo_dir):
skipped_test_file = os.path.join(repo_dir, "test", "common", "skipped_tests.txt")
if not os.path.exists(skipped_test_file):
return []

with open(skipped_test_file, "r") as skipped_test_data:
return [x.strip() for x in skipped_test_data.readlines() if "#" not in x and x.strip() != '']


def confirmPerPatchTests(test_list, skiplist):
missing_tests = [x for x in sorted(test_list) if test_list[x][0] is False
and x not in skiplist]
if len(missing_tests) > 0:
print("Not all tests were run. Failing the build.")
print(missing_tests)
sys.exit(1)


def aggregateCompletedTests(output_dir, repo_dir, skip_confirm=False):
def aggregateCompletedTests(output_dir, repo_dir):
test_list = {}
test_with_asan = {}
test_with_ubsan = {}
test_completion_table = []

testFiles = glob.glob(os.path.join(output_dir, '**', 'all_tests.txt'), recursive=True)
completionFiles = glob.glob(os.path.join(output_dir, '**', 'test_completions.txt'), recursive=True)
asan_enabled = False
ubsan_enabled = False
test_unit_with_valgrind = False
testFilePath = os.path.join(output_dir, '**', 'all_tests.txt')
completionFilePath = os.path.join(output_dir, '**', 'test_completions.txt')
testFiles = glob.glob(testFilePath, recursive=True)
completionFiles = glob.glob(completionFilePath, recursive=True)
testSummary = os.path.join(output_dir, "test_execution.log")

if len(testFiles) == 0:
print("Unable to perform test completion aggregator. No input files.")
return 0

with open(testFiles[0], 'r') as raw_test_list:
item = testFiles[0]
with open(item, 'r') as raw_test_list:
for line in raw_test_list:
try:
test_name = line.strip()
except Exception:
print("Failed to parse a test type.")
return 1
test_list[line.strip()] = (False, False, False)
test_completion_table.append(["None", line.strip(), False, False])
for item in completionFiles:
agent_name = os.path.split(os.path.split(item)[0])[1]
with open(item, 'r') as completion_list:
completions = completion_list.read()

test_list[test_name] = (False, False, False)
test_completion_table.append(["None", "None", test_name, False, False])
if "asan" not in completions:
asan_enabled = False
else:
asan_enabled = True

for completionFile in completionFiles:
getCompletions(completionFile, test_list, test_completion_table)
if "ubsan" not in completions:
ubsan_enabled = False
else:
ubsan_enabled = True

if "valgrind" in completions and "unittest" in completions:
test_unit_with_valgrind = True
test_completion_table.append([agent_name, "valgrind", asan_enabled, ubsan_enabled])
for line in completions.split('\n'):
try:
test_list[line.strip()] = (True, asan_enabled | test_list[line.strip()][1], ubsan_enabled | test_list[line.strip()][1])
test_completion_table.append([agent_name, line.strip(), asan_enabled, ubsan_enabled])
try:
test_completion_table.remove(["None", line.strip(), False, False])
except ValueError:
continue
except KeyError:
continue

with open(testSummary, 'w') as fh:
fh.write("\n\n-----Tests Executed in Build------\n")
for item in sorted(test_list):
if test_list[item][0]:
fh.write(item + "\n")

fh.write("\n\n-----Tests Missing From Build------\n")
if not test_unit_with_valgrind:
fh.write("UNITTEST_WITH_VALGRIND\n")
for item in sorted(test_list):
if test_list[item][0] is False:
fh.write(item + "\n")

fh.write("\n\n-----Tests Missing ASAN------\n")
for item in sorted(test_list):
if test_list[item][1] is False:
fh.write(item + "\n")

fh.write("\n\n-----Tests Missing UBSAN------\n")
for item in sorted(test_list):
if test_list[item][2] is False:
fh.write(item + "\n")

with open(testSummary, 'r') as fh:
print(fh.read())

printListInformation("Tests", test_list)
generateTestCompletionTables(output_dir, test_completion_table)
skipped_tests = getSkippedTests(repo_dir)
if not skip_confirm:
confirmPerPatchTests(test_list, skipped_tests)

return 0


def main(output_dir, repo_dir, skip_confirm=False):
print("-----Begin Post Process Script------")
def main(output_dir, repo_dir):
generateCoverageReport(output_dir, repo_dir)
collectOne(output_dir, 'doc')
collectOne(output_dir, 'ut_coverage')
aggregateCompletedTests(output_dir, repo_dir, skip_confirm)
aggregateCompletedTests(output_dir, repo_dir)


if __name__ == "__main__":
@@ -217,7 +185,5 @@ if __name__ == "__main__":
help="The location of your build's output directory")
parser.add_argument("-r", "--repo_directory", type=str, required=True,
help="The location of your spdk repository")
parser.add_argument("-s", "--skip_confirm", required=False, action="store_true",
help="Do not check if all autotest.sh tests were executed.")
args = parser.parse_args()
main(args.directory_location, args.repo_directory, args.skip_confirm)
main(args.directory_location, args.repo_directory)

autotest.sh (381 changes)

@@ -1,72 +1,42 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2015 Intel Corporation
# All rights reserved.
#

rootdir=$(readlink -f $(dirname $0))

# In autotest_common.sh all tests are disabled by default.
# If the configuration of tests is not provided, no tests will be carried out.
if [[ ! -f $1 ]]; then
echo "ERROR: SPDK test configuration not specified"
exit 1
fi

# Autotest.sh, as part of autorun.sh, runs in a different
# shell process than autobuild.sh. Use helper file to pass
# over env variable containing libraries paths.
if [[ -e /tmp/spdk-ld-path ]]; then
source /tmp/spdk-ld-path
fi

source "$1"
source "$rootdir/test/common/autotest_common.sh"
source "$rootdir/test/nvmf/common.sh"

set -xe

if [ $EUID -ne 0 ]; then
echo "$0 must be run as root"
exit 1
fi

if [ $(uname -s) = Linux ]; then
old_core_pattern=$(< /proc/sys/kernel/core_pattern)
mkdir -p "$output_dir/coredumps"
# Set core_pattern to a known value to avoid ABRT, systemd-coredump, etc.
# Dump the $output_dir path to a file so collector can pick it up while executing.
# We don't set it in the core_pattern command line because of the string length limitation
# of 128 bytes. See 'man core 5' for details.
echo "|$rootdir/scripts/core-collector.sh %P %s %t" > /proc/sys/kernel/core_pattern
echo "$output_dir/coredumps" > "$rootdir/.coredump_path"
# set core_pattern to a known value to avoid ABRT, systemd-coredump, etc.
echo "core" > /proc/sys/kernel/core_pattern

# make sure nbd (network block device) driver is loaded if it is available
# this ensures that when tests need to use nbd, it will be fully initialized
modprobe nbd || true

if udevadm=$(type -P udevadm); then
"$udevadm" monitor --property &> "$output_dir/udev.log" &
udevadm_pid=$!
fi
fi

trap "autotest_cleanup || :; exit 1" SIGINT SIGTERM EXIT
trap "process_core; autotest_cleanup; exit 1" SIGINT SIGTERM EXIT

timing_enter autotest

create_test_list

src=$(readlink -f $(dirname $0))
out=$output_dir
out=$PWD
cd $src

freebsd_update_contigmem_mod
freebsd_set_maxsock_buf
./scripts/setup.sh status

# lcov takes considerable time to process clang coverage.
# Disabling lcov allows us to avoid this.
# More information: https://github.com/spdk/spdk/issues/1693
CC_TYPE=$(grep CC_TYPE mk/cc.mk)
if hash lcov && ! [[ "$CC_TYPE" == *"clang"* ]]; then
freebsd_update_contigmem_mod

if hash lcov; then
# setup output dir for unittest.sh
export UT_COVERAGE=$out/ut_coverage
export LCOV_OPTS="
--rc lcov_branch_coverage=1
--rc lcov_function_coverage=1
@@ -79,307 +49,216 @@ if hash lcov && ! [[ "$CC_TYPE" == *"clang"* ]]; then
# Print lcov version to log
$LCOV -v
# zero out coverage data
$LCOV -q -c -i -t "Baseline" -d $src -o $out/cov_base.info
$LCOV -q -c -i -t "Baseline" -d $src -o cov_base.info
fi

# Make sure the disks are clean (no leftover partition tables)
timing_enter pre_cleanup
timing_enter cleanup
# Remove old domain socket pathname just in case
rm -f /var/tmp/spdk*.sock

# Load the kernel driver
$rootdir/scripts/setup.sh reset
./scripts/setup.sh reset

get_zoned_devs
# Let the kernel discover any filesystems or partitions
sleep 10

if ((${#zoned_devs[@]} > 0)); then
# FIXME: For now make sure zoned devices are tested on-demand by
# designated tests instead of falling into any other. The main
# concern here are fio workloads where specific configuration
# must be in place for it to work with the zoned device.
export PCI_BLOCKED="${zoned_devs[*]}"
export PCI_ZONED="${zoned_devs[*]}"
if [ $(uname -s) = Linux ]; then
# OCSSD device drivers don't support IO issued by the kernel, so
# detect OCSSD devices and blacklist them (unbind from any driver).
# If test scripts want to use this device they need to do so explicitly.
#
# If some OCSSD device is bound to a driver other than nvme we won't be able to
# discover if it is OCSSD or not so load the kernel driver first.


for dev in $(find /dev -maxdepth 1 -regex '/dev/nvme[0-9]+'); do
# Send Open Channel 2.0 Geometry opcode "0xe2" - not supported by NVMe device.
if nvme admin-passthru $dev --namespace-id=1 --data-len=4096 --opcode=0xe2 --read >/dev/null; then
bdf="$(basename $(readlink -e /sys/class/nvme/${dev#/dev/}/device))"
echo "INFO: blacklisting OCSSD device: $dev ($bdf)"
PCI_BLACKLIST+=" $bdf"
OCSSD_PCI_DEVICES+=" $bdf"
fi
done

export OCSSD_PCI_DEVICES

# Now, bind blacklisted devices to pci-stub module. This will prevent
# automatically grabbing these devices when we add a device/vendor ID to
# proper driver.
if [[ -n "$PCI_BLACKLIST" ]]; then
PCI_WHITELIST="$PCI_BLACKLIST" \
PCI_BLACKLIST="" \
DRIVER_OVERRIDE="pci-stub" \
./scripts/setup.sh

# Export our blacklist so it will take effect during next setup.sh
export PCI_BLACKLIST
fi
fi

# Delete all leftover lvols and gpt partitions
# Matches both /dev/nvmeXnY on Linux and /dev/nvmeXnsY on BSD
# Filter out nvme with partitions - the "p*" suffix
for dev in $(ls /dev/nvme*n* | grep -v p || true); do
# Skip zoned devices as non-sequential IO will always fail
[[ -z ${zoned_devs["${dev##*/}"]} ]] || continue
if ! block_in_use "$dev"; then
dd if=/dev/zero of="$dev" bs=1M count=1
fi
dd if=/dev/zero of="$dev" bs=1M count=1
done

sync

if ! xtrace_disable_per_cmd reap_spdk_processes; then
echo "WARNING: Lingering SPDK processes were detected. Testing environment may be unstable" >&2
fi

if [ $(uname -s) = Linux ]; then
run_test "setup.sh" "$rootdir/test/setup/test-setup.sh"
# Load RAM disk driver if available
modprobe brd || true
fi

$rootdir/scripts/setup.sh status

if [[ $(uname -s) == Linux ]]; then
# Revert NVMe namespaces to default state
nvme_namespace_revert
fi

timing_exit pre_cleanup
timing_exit cleanup

# set up huge pages
timing_enter afterboot
$rootdir/scripts/setup.sh
./scripts/setup.sh
timing_exit afterboot

# Revert existing OPAL to factory settings that may have been left from earlier failed tests.
# This ensures we won't hit any unexpected failures due to NVMe SSDs being locked.
opal_revert_cleanup
timing_enter nvmf_setup
rdma_device_init
timing_exit nvmf_setup

if [ $SPDK_TEST_CRYPTO -eq 1 ]; then
if grep -q '#define SPDK_CONFIG_IGB_UIO_DRIVER 1' $rootdir/include/spdk/config.h; then
./scripts/qat_setup.sh igb_uio
else
./scripts/qat_setup.sh
fi
fi

#####################
# Unit Tests
#####################

if [ $SPDK_TEST_UNITTEST -eq 1 ]; then
run_test "unittest" $rootdir/test/unit/unittest.sh
timing_enter unittest
run_test suite ./test/unit/unittest.sh
report_test_completion "unittest"
timing_exit unittest
fi


if [ $SPDK_RUN_FUNCTIONAL_TEST -eq 1 ]; then
if [[ $SPDK_TEST_CRYPTO -eq 1 || $SPDK_TEST_VBDEV_COMPRESS -eq 1 ]]; then
if [[ $SPDK_TEST_USE_IGB_UIO -eq 1 ]]; then
$rootdir/scripts/qat_setup.sh igb_uio
else
$rootdir/scripts/qat_setup.sh
fi
fi
timing_enter lib

run_test "env" $rootdir/test/env/env.sh
run_test "rpc" $rootdir/test/rpc/rpc.sh
run_test "rpc_client" $rootdir/test/rpc_client/rpc_client.sh
run_test "json_config" $rootdir/test/json_config/json_config.sh
run_test "json_config_extra_key" $rootdir/test/json_config/json_config_extra_key.sh
run_test "alias_rpc" $rootdir/test/json_config/alias_rpc/alias_rpc.sh
run_test "spdkcli_tcp" $rootdir/test/spdkcli/tcp.sh
run_test "dpdk_mem_utility" $rootdir/test/dpdk_memory_utility/test_dpdk_mem_info.sh
run_test "event" $rootdir/test/event/event.sh
run_test "thread" $rootdir/test/thread/thread.sh
run_test "accel" $rootdir/test/accel/accel.sh
run_test "app_cmdline" $rootdir/test/app/cmdline.sh
run_test suite test/env/env.sh
run_test suite test/rpc_client/rpc_client.sh
run_test suite ./test/json_config/json_config.sh

if [ $SPDK_TEST_BLOCKDEV -eq 1 ]; then
run_test "blockdev_general" $rootdir/test/bdev/blockdev.sh
run_test "bdev_raid" $rootdir/test/bdev/bdev_raid.sh
run_test "bdevperf_config" $rootdir/test/bdev/bdevperf/test_config.sh
if [[ $(uname -s) == Linux ]]; then
run_test "reactor_set_interrupt" $rootdir/test/interrupt/reactor_set_interrupt.sh
run_test "reap_unregistered_poller" $rootdir/test/interrupt/reap_unregistered_poller.sh
fi
run_test suite test/bdev/blockdev.sh
fi

if [[ $(uname -s) == Linux ]]; then
if [[ $SPDK_TEST_BLOCKDEV -eq 1 || $SPDK_TEST_URING -eq 1 ]]; then
# The crypto job also includes the SPDK_TEST_BLOCKDEV in its configuration hence the
# dd tests are executed there as well. However, these tests can take a significant
# amount of time to complete (up to 4min) on a physical system leading to a potential
# job timeout. Avoid that by skipping these tests - this should not affect the coverage
# since dd tests are still run as part of the vg jobs.
if [[ $SPDK_TEST_CRYPTO -eq 0 ]]; then
run_test "spdk_dd" $rootdir/test/dd/dd.sh
fi
fi
if [ $SPDK_TEST_JSON -eq 1 ]; then
run_test suite test/config_converter/test_converter.sh
fi

if [ $SPDK_TEST_EVENT -eq 1 ]; then
run_test suite test/event/event.sh
fi

if [ $SPDK_TEST_NVME -eq 1 ]; then
run_test "blockdev_nvme" $rootdir/test/bdev/blockdev.sh "nvme"
if [[ $(uname -s) == Linux ]]; then
run_test "blockdev_nvme_gpt" $rootdir/test/bdev/blockdev.sh "gpt"
run_test suite test/nvme/nvme.sh
if [ $SPDK_TEST_NVME_CLI -eq 1 ]; then
run_test suite test/nvme/spdk_nvme_cli.sh
fi
run_test "nvme" $rootdir/test/nvme/nvme.sh
if [[ $SPDK_TEST_NVME_PMR -eq 1 ]]; then
run_test "nvme_pmr" $rootdir/test/nvme/nvme_pmr.sh
fi
if [[ $SPDK_TEST_NVME_SCC -eq 1 ]]; then
run_test "nvme_scc" $rootdir/test/nvme/nvme_scc.sh
fi
if [[ $SPDK_TEST_NVME_BP -eq 1 ]]; then
run_test "nvme_bp" $rootdir/test/nvme/nvme_bp.sh
fi
if [[ $SPDK_TEST_NVME_CUSE -eq 1 ]]; then
run_test "nvme_cuse" $rootdir/test/nvme/cuse/nvme_cuse.sh
fi
if [[ $SPDK_TEST_NVME_CMB -eq 1 ]]; then
run_test "nvme_cmb" $rootdir/test/nvme/cmb/cmb.sh
fi
if [[ $SPDK_TEST_NVME_FDP -eq 1 ]]; then
run_test "nvme_fdp" test/nvme/nvme_fdp.sh
fi

if [[ $SPDK_TEST_NVME_ZNS -eq 1 ]]; then
run_test "nvme_zns" $rootdir/test/nvme/zns/zns.sh
fi

run_test "nvme_rpc" $rootdir/test/nvme/nvme_rpc.sh
run_test "nvme_rpc_timeouts" $rootdir/test/nvme/nvme_rpc_timeouts.sh
# Only test hotplug without ASAN enabled; if it is
# enabled, it catches SEGV earlier than our handler, which
# breaks the hotplug logic.
if [ $SPDK_RUN_ASAN -eq 0 ] && [ $(uname -s) = Linux ]; then
run_test "sw_hotplug" $rootdir/test/nvme/sw_hotplug.sh
fi

if [[ $SPDK_TEST_XNVME -eq 1 ]]; then
run_test "nvme_xnvme" $rootdir/test/nvme/xnvme/xnvme.sh
run_test "blockdev_xnvme" $rootdir/test/bdev/blockdev.sh "xnvme"
# Run ublk with xnvme since they have similar kernel dependencies
run_test "ublk" $rootdir/test/ublk/ublk.sh
fi
# Temporary workaround for issue #542, annotated for no VM image.
#if [ $SPDK_RUN_ASAN -eq 0 ]; then
# run_test suite test/nvme/hotplug.sh intel
#fi
fi

if [ $SPDK_TEST_IOAT -eq 1 ]; then
run_test "ioat" $rootdir/test/ioat/ioat.sh
run_test suite test/ioat/ioat.sh
fi

timing_exit lib

if [ $SPDK_TEST_ISCSI -eq 1 ]; then
run_test "iscsi_tgt" $rootdir/test/iscsi_tgt/iscsi_tgt.sh
run_test "spdkcli_iscsi" $rootdir/test/spdkcli/iscsi.sh

# Run raid spdkcli test under iSCSI since blockdev tests run on systems that can't run spdkcli yet
run_test "spdkcli_raid" $rootdir/test/spdkcli/raid.sh
run_test suite ./test/iscsi_tgt/iscsi_tgt.sh posix
run_test suite ./test/spdkcli/iscsi.sh
fi

if [ $SPDK_TEST_BLOBFS -eq 1 ]; then
run_test "rocksdb" $rootdir/test/blobfs/rocksdb/rocksdb.sh
run_test "blobstore" $rootdir/test/blobstore/blobstore.sh
run_test "blobstore_grow" $rootdir/test/blobstore/blobstore_grow/blobstore_grow.sh
run_test "blobfs" $rootdir/test/blobfs/blobfs.sh
run_test "hello_blob" $SPDK_EXAMPLE_DIR/hello_blob \
examples/blob/hello_world/hello_blob.json
run_test suite ./test/blobfs/rocksdb/rocksdb.sh
run_test suite ./test/blobstore/blobstore.sh
fi

if [ $SPDK_TEST_NVMF -eq 1 ]; then
export NET_TYPE
# The NVMe-oF run test cases are split out like this so that the parser that compiles the
# list of all tests can properly differentiate them. Please do not merge them into one line.
if [ "$SPDK_TEST_NVMF_TRANSPORT" = "rdma" ]; then
run_test "nvmf_rdma" $rootdir/test/nvmf/nvmf.sh --transport=$SPDK_TEST_NVMF_TRANSPORT
run_test "spdkcli_nvmf_rdma" $rootdir/test/spdkcli/nvmf.sh --transport=$SPDK_TEST_NVMF_TRANSPORT
elif [ "$SPDK_TEST_NVMF_TRANSPORT" = "tcp" ]; then
run_test "nvmf_tcp" $rootdir/test/nvmf/nvmf.sh --transport=$SPDK_TEST_NVMF_TRANSPORT
if [[ $SPDK_TEST_URING -eq 0 ]]; then
run_test "spdkcli_nvmf_tcp" $rootdir/test/spdkcli/nvmf.sh --transport=$SPDK_TEST_NVMF_TRANSPORT
run_test "nvmf_identify_passthru" $rootdir/test/nvmf/target/identify_passthru.sh --transport=$SPDK_TEST_NVMF_TRANSPORT
fi
run_test "nvmf_dif" $rootdir/test/nvmf/target/dif.sh
run_test "nvmf_abort_qd_sizes" $rootdir/test/nvmf/target/abort_qd_sizes.sh
elif [ "$SPDK_TEST_NVMF_TRANSPORT" = "fc" ]; then
run_test "nvmf_fc" $rootdir/test/nvmf/nvmf.sh --transport=$SPDK_TEST_NVMF_TRANSPORT
run_test "spdkcli_nvmf_fc" $rootdir/test/spdkcli/nvmf.sh
else
echo "unknown NVMe transport, please specify rdma, tcp, or fc."
exit 1
fi
run_test suite ./test/nvmf/nvmf.sh
run_test suite ./test/spdkcli/nvmf.sh
fi

if [ $SPDK_TEST_VHOST -eq 1 ]; then
run_test "vhost" $rootdir/test/vhost/vhost.sh
fi

if [ $SPDK_TEST_VFIOUSER_QEMU -eq 1 ]; then
run_test "vfio_user_qemu" $rootdir/test/vfio_user/vfio_user.sh
run_test suite ./test/vhost/vhost.sh
report_test_completion "vhost"
fi

if [ $SPDK_TEST_LVOL -eq 1 ]; then
run_test "lvol" $rootdir/test/lvol/lvol.sh
run_test "blob_io_wait" $rootdir/test/blobstore/blob_io_wait/blob_io_wait.sh
timing_enter lvol
test_cases="1,50,51,52,53,100,101,102,150,200,201,250,251,252,253,254,255,"
test_cases+="300,301,450,451,452,550,551,552,553,"
test_cases+="600,601,602,650,651,652,654,655,"
test_cases+="700,701,702,750,751,752,753,754,755,756,757,758,759,760,"
test_cases+="800,801,802,803,804,10000"
run_test suite ./test/lvol/lvol.sh --test-cases=$test_cases
run_test suite ./test/blobstore/blob_io_wait/blob_io_wait.sh
report_test_completion "lvol"
timing_exit lvol
fi

if [ $SPDK_TEST_VHOST_INIT -eq 1 ]; then
timing_enter vhost_initiator
run_test "vhost_blockdev" $rootdir/test/vhost/initiator/blockdev.sh
run_test "spdkcli_virtio" $rootdir/test/spdkcli/virtio.sh
run_test "vhost_shared" $rootdir/test/vhost/shared/shared.sh
run_test "vhost_fuzz" $rootdir/test/vhost/fuzz/fuzz.sh
run_test suite ./test/vhost/initiator/blockdev.sh
run_test suite ./test/spdkcli/virtio.sh
run_test suite ./test/vhost/shared/shared.sh
report_test_completion "vhost_initiator"
timing_exit vhost_initiator
fi

if [ $SPDK_TEST_PMDK -eq 1 ]; then
run_test suite ./test/pmem/pmem.sh -x
run_test suite ./test/spdkcli/pmem.sh
fi

if [ $SPDK_TEST_RBD -eq 1 ]; then
run_test "blockdev_rbd" $rootdir/test/bdev/blockdev.sh "rbd"
run_test "spdkcli_rbd" $rootdir/test/spdkcli/rbd.sh
run_test suite ./test/spdkcli/rbd.sh
fi

if [ $SPDK_TEST_OCF -eq 1 ]; then
run_test "ocf" $rootdir/test/ocf/ocf.sh
run_test suite ./test/ocf/ocf.sh
fi

if [ $SPDK_TEST_FTL -eq 1 ]; then
run_test "ftl" $rootdir/test/ftl/ftl.sh
fi

if [ $SPDK_TEST_VMD -eq 1 ]; then
run_test "vmd" $rootdir/test/vmd/vmd.sh
fi

if [ $SPDK_TEST_VBDEV_COMPRESS -eq 1 ]; then
run_test "compress_compdev" $rootdir/test/compress/compress.sh "compdev"
run_test "compress_isal" $rootdir/test/compress/compress.sh "isal"
fi

if [ $SPDK_TEST_OPAL -eq 1 ]; then
run_test "nvme_opal" $rootdir/test/nvme/nvme_opal.sh
fi

if [ $SPDK_TEST_CRYPTO -eq 1 ]; then
run_test "blockdev_crypto_aesni" $rootdir/test/bdev/blockdev.sh "crypto_aesni"
run_test "blockdev_crypto_sw" $rootdir/test/bdev/blockdev.sh "crypto_sw"
run_test "blockdev_crypto_qat" $rootdir/test/bdev/blockdev.sh "crypto_qat"
run_test "chaining" $rootdir/test/bdev/chaining.sh
fi

if [[ $SPDK_TEST_SCHEDULER -eq 1 ]]; then
run_test "scheduler" $rootdir/test/scheduler/scheduler.sh
fi

if [[ $SPDK_TEST_SMA -eq 1 ]]; then
run_test "sma" $rootdir/test/sma/sma.sh
fi

if [[ $SPDK_TEST_FUZZER -eq 1 ]]; then
run_test "llvm_fuzz" $rootdir/test/fuzz/llvm.sh
fi

if [[ $SPDK_TEST_RAID5 -eq 1 ]]; then
run_test "blockdev_raid5f" $rootdir/test/bdev/blockdev.sh "raid5f"
if [ $SPDK_TEST_BDEV_FTL -eq 1 ]; then
run_test suite ./test/ftl/ftl.sh
fi
fi

trap - SIGINT SIGTERM EXIT

timing_enter post_cleanup
timing_enter cleanup
autotest_cleanup
timing_exit post_cleanup
timing_exit cleanup

timing_exit autotest
chmod a+r $output_dir/timing.txt

[[ -f "$output_dir/udev.log" ]] && rm -f "$output_dir/udev.log"
trap - SIGINT SIGTERM EXIT

if hash lcov && ! [[ "$CC_TYPE" == *"clang"* ]]; then
# catch any stray core files
process_core

if hash lcov; then
# generate coverage data and combine with baseline
$LCOV -q -c -d $src -t "$(hostname)" -o $out/cov_test.info
$LCOV -q -a $out/cov_base.info -a $out/cov_test.info -o $out/cov_total.info
$LCOV -q -c -d $src -t "$(hostname)" -o cov_test.info
$LCOV -q -a cov_base.info -a cov_test.info -o $out/cov_total.info
$LCOV -q -r $out/cov_total.info '*/dpdk/*' -o $out/cov_total.info
$LCOV -q -r $out/cov_total.info '/usr/*' -o $out/cov_total.info
$LCOV -q -r $out/cov_total.info '*/examples/vmd/*' -o $out/cov_total.info
$LCOV -q -r $out/cov_total.info '*/app/spdk_lspci/*' -o $out/cov_total.info
$LCOV -q -r $out/cov_total.info '*/app/spdk_top/*' -o $out/cov_total.info
owner=$(stat -c "%U" .)
sudo -u $owner git clean -f "*.gcda"
git clean -f "*.gcda"
rm -f cov_base.info cov_test.info OLD_STDOUT OLD_STDERR
fi

build/lib/.gitignore (new file, 1 line)

@@ -0,0 +1 @@
# Placeholder

@@ -1,62 +0,0 @@
# Deprecation

## ABI and API Deprecation

This document details the policy for maintaining stability of SPDK ABI and API.

The major ABI version can change at most once for each quarterly SPDK release.
ABI versions are managed separately for each library and follow [Semantic Versioning](https://semver.org/).

API and ABI deprecation notices shall be posted in the next section.
Each entry must describe what will be removed and can suggest a future use or alternative.
The specific future SPDK release for the removal must be provided.
ABI cannot be removed without providing a deprecation notice for at least a single SPDK release.

Deprecated code paths must be registered with `SPDK_DEPRECATION_REGISTER()` and logged with
`SPDK_LOG_DEPRECATED()`. The tag used with these macros will appear in the SPDK
log at the warn level when `SPDK_LOG_DEPRECATED()` is called, subject to rate limits.
The tags can be matched with the level 4 headers below.

## Deprecation Notices

### PMDK

PMDK is no longer supported and integrations with it in SPDK are now deprecated, and will be removed in SPDK 23.05.
Please see: [UPDATE ON PMDK AND OUR LONG TERM SUPPORT STRATEGY](https://pmem.io/blog/2022/11/update-on-pmdk-and-our-long-term-support-strategy/).

### VTune

#### `vtune_support`

VTune integration is now deprecated and will be removed in SPDK 23.05.

### nvmf

#### `spdk_nvmf_qpair_disconnect`

Parameters `cb_fn` and `ctx` of the `spdk_nvmf_qpair_disconnect` API are deprecated. These parameters
will be removed in the 23.09 release.

### gpt

#### `old_gpt_guid`

Deprecated the SPDK partition type GUID `7c5222bd-8f5d-4087-9c00-bf9843c7b58c`. Partitions of this
type have bdevs created that are one block less than the actual size of the partition. Existing
partitions using the deprecated GUID can continue to use that GUID; support for the deprecated GUID
will remain in SPDK indefinitely, and will continue to exhibit the off-by-one bug so that on-disk
metadata layouts based on the incorrect size are not affected.

See GitHub issue [2801](https://github.com/spdk/spdk/issues/2801) for additional details on the bug.

New SPDK partition types should use GUID `6527994e-2c5a-4eec-9613-8f5944074e8b` which will create
a bdev of the correct size.

### lvol

#### `vbdev_lvol_rpc_req_size`

Param `size` in the rpc commands `rpc_bdev_lvol_create` and `rpc_bdev_lvol_resize` is deprecated and
replaced by `size_in_mib`.

See GitHub issue [2346](https://github.com/spdk/spdk/issues/2346) for additional details.

doc/.gitignore (3 changes)

@@ -1,4 +1,3 @@
# changelog.md and deprecation.md are generated by Makefile
# changelog.md is generated by Makefile
changelog.md
deprecation.md
output/

doc/Doxyfile (120 changes)

@@ -234,7 +234,7 @@ ALIASES =
# A mapping has the form "name=value". For example adding "class=itcl::class"
# will allow you to use the command class in the itcl::class meaning.

# TCL_SUBST =
TCL_SUBST =

# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
# only. Doxygen will then generate output that is more tailored for C. For
@@ -746,7 +746,7 @@ WARN_IF_DOC_ERROR = YES
# parameter documentation, but not about the absence of documentation.
# The default value is: NO.

WARN_NO_PARAMDOC = YES
WARN_NO_PARAMDOC = NO

# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
# a warning is encountered.
@@ -795,66 +795,46 @@ INPUT += \
misc.md \
driver_modules.md \
tools.md \
ci_tools.md \
experimental_tools.md \
performance_reports.md \

# All remaining pages are listed here in alphabetical order by filename.
INPUT += \
about.md \
accel_fw.md \
applications.md \
bdev.md \
bdevperf.md \
bdev_module.md \
bdev_pg.md \
blob.md \
blobfs.md \
changelog.md \
compression.md \
concurrency.md \
containers.md \
deprecation.md \
distributions.md \
directory_structure.md \
event.md \
ftl.md \
gdb_macros.md \
getting_started.md \
idxd.md \
ioat.md \
iscsi.md \
jsonrpc.md \
jsonrpc_proxy.md \
libraries.md \
lvol.md \
memory.md \
notify.md \
nvme.md \
nvme_multipath.md \
nvme_spec.md \
nvme-cli.md \
nvmf.md \
nvmf_tgt_pg.md \
nvmf_tracing.md \
nvmf_multipath_howto.md \
overview.md \
peer_2_peer.md \
pkgconfig.md \
porting.md \
rpm.md \
scheduler.md \
shfmt.md \
sma.md \
spdkcli.md \
spdk_top.md \
ssd_internals.md \
system_configuration.md \
ublk.md \
usdt.md \
userspace.md \
vagrant.md \
vhost.md \
vhost_processing.md \
virtio.md \
vmd.md
vpp_integration.md

# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -1111,7 +1091,7 @@ ALPHABETICAL_INDEX = YES
# Minimum value: 1, maximum value: 20, default value: 5.
# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.

# COLS_IN_ALPHA_INDEX = 5
COLS_IN_ALPHA_INDEX = 5

# In case all classes in a project start with a common prefix, all classes will
# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
@@ -1247,7 +1227,7 @@ HTML_COLORSTYLE_GAMMA = 80
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_TIMESTAMP = NO
HTML_TIMESTAMP = YES

# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
# documentation will contain sections that can be hidden and shown after the
@@ -1519,6 +1499,17 @@ EXT_LINKS_IN_WINDOW = NO

FORMULA_FONTSIZE = 10

# Use the FORMULA_TRANPARENT tag to determine whether or not the images
# generated for formulas are transparent PNGs. Transparent PNGs are not
# supported properly for IE 6.0, but are supported on all modern browsers.
#
# Note that when changing this option you need to delete any form_*.png files in
# the HTML output directory before the changes have effect.
# The default value is: YES.
# This tag requires that the tag GENERATE_HTML is set to YES.

FORMULA_TRANSPARENT = YES

# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
# http://www.mathjax.org) which uses client side Javascript for the rendering
# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
@@ -1661,7 +1652,7 @@ EXTRA_SEARCH_MAPPINGS =
# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
# The default value is: YES.

GENERATE_LATEX = NO
GENERATE_LATEX = YES

# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
@@ -1797,6 +1788,16 @@ LATEX_BATCHMODE = YES

LATEX_HIDE_INDICES = NO

# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
# code with syntax highlighting in the LaTeX output.
#
# Note that which sources are shown also depends on other settings such as
# SOURCE_BROWSER.
# The default value is: NO.
# This tag requires that the tag GENERATE_LATEX is set to YES.

LATEX_SOURCE_CODE = NO

# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
# bibliography, e.g. plainnat, or ieeetr. See
# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
@@ -1869,6 +1870,16 @@ RTF_STYLESHEET_FILE =

RTF_EXTENSIONS_FILE =

# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
# with syntax highlighting in the RTF output.
#
# Note that which sources are shown also depends on other settings such as
# SOURCE_BROWSER.
# The default value is: NO.
# This tag requires that the tag GENERATE_RTF is set to YES.

RTF_SOURCE_CODE = NO

#---------------------------------------------------------------------------
# Configuration options related to the man page output
#---------------------------------------------------------------------------
@@ -1958,6 +1969,15 @@ GENERATE_DOCBOOK = NO

DOCBOOK_OUTPUT = docbook

# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
# program listings (including syntax highlighting and cross-referencing
# information) to the DOCBOOK output. Note that enabling this will significantly
# increase the size of the DOCBOOK output.
# The default value is: NO.
# This tag requires that the tag GENERATE_DOCBOOK is set to YES.

DOCBOOK_PROGRAMLISTING = NO

#---------------------------------------------------------------------------
# Configuration options for the AutoGen Definitions output
#---------------------------------------------------------------------------
@@ -2136,12 +2156,21 @@ EXTERNAL_PAGES = YES
# interpreter (i.e. the result of 'which perl').
# The default file (with absolute path) is: /usr/bin/perl.

# PERL_PATH = /usr/bin/perl
PERL_PATH = /usr/bin/perl

#---------------------------------------------------------------------------
# Configuration options related to the dot tool
#---------------------------------------------------------------------------

# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
# NO turns the diagrams off. Note that this option also works with HAVE_DOT
# disabled, but it is recommended to install and use dot, since it yields more
# powerful graphs.
# The default value is: YES.

CLASS_DIAGRAMS = YES

# You can define message sequence charts within doxygen comments using the \msc
# command. Doxygen will then run the mscgen tool (see:
# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
@@ -2149,7 +2178,7 @@ EXTERNAL_PAGES = YES
# the mscgen tool resides. If left empty the tool is assumed to be found in the
# default search path.

# MSCGEN_PATH =
MSCGEN_PATH =

# You can include diagrams made with dia in doxygen documentation. Doxygen will
# then run dia to produce the diagram and insert it in the documentation. The
@@ -2183,6 +2212,23 @@ HAVE_DOT = YES

DOT_NUM_THREADS = 0

# When you want a differently looking font in the dot files that doxygen
# generates you can specify the font name using DOT_FONTNAME. You need to make
# sure dot is able to find the font, which can be done by putting it in a
# standard location or by setting the DOTFONTPATH environment variable or by
# setting DOT_FONTPATH to the directory containing the font.
# The default value is: Helvetica.
# This tag requires that the tag HAVE_DOT is set to YES.

DOT_FONTNAME = Helvetica

# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
# dot graphs.
# Minimum value: 4, maximum value: 24, default value: 10.
# This tag requires that the tag HAVE_DOT is set to YES.

DOT_FONTSIZE = 10

# By default doxygen will tell dot to use the default font as specified with
# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
# the path where dot can find it using this tag.
@@ -2395,6 +2441,18 @@ DOT_GRAPH_MAX_NODES = 50

MAX_DOT_GRAPH_DEPTH = 2

# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
# background. This is disabled by default, because dot on Windows does not seem
# to support this out of the box.
#
# Warning: Depending on the platform used, enabling this option may lead to
# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
# read).
# The default value is: NO.
# This tag requires that the tag HAVE_DOT is set to YES.

DOT_TRANSPARENT = NO

# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
# files in one run (i.e. multiple -o and -T options on the command line). This
# makes dot run faster, but since only newer versions of dot (>1.8.10) support
13 doc/Makefile
@@ -1,8 +1,3 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2015 Intel Corporation
# All rights reserved.
#

SPDK_ROOT_DIR := $(abspath $(CURDIR)/..)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk

@@ -13,10 +8,6 @@ all: doc

doc: output

deprecation.md: ../deprecation.md
	$(Q)sed -e 's/^# Deprecation/# Deprecation {#deprecation}/' \
	< $< > $@

changelog.md: ../CHANGELOG.md
	$(Q)sed -e 's/^# Changelog/# Changelog {#changelog}/' \
	-e 's/^##/#/' \
@@ -24,9 +15,9 @@ changelog.md: ../CHANGELOG.md
	-e '/# v..\...:/s/\./-/2' \
	< $< > $@

output: Doxyfile changelog.md deprecation.md $(wildcard *.md) $(wildcard ../include/spdk/*.h)
output: Doxyfile changelog.md $(wildcard *.md) $(wildcard ../include/spdk/*.h)
	$(Q)rm -rf $@
	$(Q)doxygen Doxyfile

clean:
	$(Q)rm -rf output changelog.md deprecation.md
	$(Q)rm -rf output changelog.md
doc/README.md
@@ -1,9 +1,11 @@
# SPDK Documentation
SPDK Documentation
==================

The current version of the SPDK documentation can be found online at
http://www.spdk.io/doc/

## Building the Documentation
Building the Documentation
==========================

To convert the documentation into HTML run `make` in the `doc`
directory. The output will be located in `doc/output/html`. Before
doc/about.md
@@ -1,4 +1,4 @@
# What is SPDK {#about}
# What is SPDK? {#about}

The Storage Performance Development Kit (SPDK) provides a set of tools and
libraries for writing high performance, scalable, user-mode storage
190 doc/accel_fw.md
@@ -1,190 +0,0 @@
# Acceleration Framework {#accel_fw}

SPDK provides a framework for abstracting general acceleration capabilities
that can be implemented through plug-in modules and low-level libraries. These
plug-in modules include support for hardware acceleration engines such as
the Intel(R) I/O Acceleration Technology (IOAT) engine and the Intel(R) Data
Streaming Accelerator (DSA) engine. Additionally, a software plug-in module
exists to enable use of the framework in environments without hardware
acceleration capabilities. ISA-L is used for optimized CRC32C calculation within
the software module.

## Acceleration Framework Functions {#accel_functions}

Functions implemented via the framework can be found in the Doxygen documentation of the
framework's public header file: [accel.h](https://spdk.io/doc/accel_8h.html)

## Acceleration Framework Design Considerations {#accel_dc}

The general interface is defined by `/include/spdk/accel.h` and implemented
in `/lib/accel`. These functions may be called by an SPDK application and in
most cases, except where otherwise documented, are asynchronous and follow the
standard SPDK model for callbacks with a callback argument.

If the acceleration framework is started without initializing a hardware module,
optimized software implementations of the operations will back the public API. All
operations supported by the framework have a backing software implementation in
the event that no hardware accelerators have been enabled for that operation.

When multiple hardware modules are enabled the framework will assign each operation to
a module based on the order in which it was initialized. So, for example, if two modules are
enabled, IOAT and software, the software module will be used for every operation except those
supported by IOAT.

## Acceleration Low Level Libraries {#accel_libs}

Low level libraries provide only the most basic functions that are specific to
the hardware. Low level libraries are located in the `/lib` directory, with the
exception of the software implementation, which is implemented as part of the
framework itself. The software low level library does not expose a public API.
Applications may choose to interact directly with a low level library if there are
specific needs/considerations not met via accessing the library through the
framework/module. Note that when using the low level libraries directly, the
framework abstracted interface is bypassed as the application will call the public
functions exposed by the individual low level libraries. Thus, code written this
way needs to be certain that the underlying hardware exists everywhere that it runs.

The low level library for IOAT is located in `/lib/ioat`. The low level library
for DSA and IAA is in `/lib/idxd` (IDXD stands for Intel(R) Data Acceleration Driver and
supports both DSA and IAA hardware accelerators). In the `/lib/idxd` folder, SPDK supports the ability
to use either user space or kernel space drivers. The following describes each usage scenario:

Leveraging the user space idxd driver: The DSA devices are managed by the SPDK user space
driver in a dedicated SPDK process, so a device cannot be shared by another
process. The benefit of this usage is that there is no kernel dependency.

Leveraging the kernel space driver: The DSA devices are managed by kernel
space drivers, and the work queues inside a DSA device can be shared among
different processes. Naturally, it can be used in cloud native scenarios. The drawback of
this usage is the kernel dependency, i.e., the idxd kernel driver must be supported and loaded
in the kernel.

## Acceleration Plug-In Modules {#accel_modules}

Plug-in modules depend on low level libraries to interact with the hardware and
add additional functionality such as queueing during busy conditions or flow
control in some cases. The framework in turn depends on the modules to provide
the complete implementation of the acceleration component. A module must be
selected via startup RPC when the application is started. Otherwise, if no startup
RPC is provided, the framework is available and will use the software plug-in module.

### IOAT Module {#accel_ioat}

To use the IOAT module, use the RPC [`ioat_scan_accel_module`](https://spdk.io/doc/jsonrpc.html) before starting the application.
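For context, a minimal sketch of that startup flow (the `spdk_tgt` binary path is an assumption;
any SPDK application started with `--wait-for-rpc` works the same way):

```bash
# Start paused in the STARTUP state, select the IOAT module, then continue.
./build/bin/spdk_tgt --wait-for-rpc &
./scripts/rpc.py ioat_scan_accel_module
./scripts/rpc.py framework_start_init
```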
### DSA Module {#accel_dsa}

The DSA module supports the DSA hardware and relies on the low level IDXD library.

To use the DSA module, use the RPC
[`dsa_scan_accel_module`](https://spdk.io/doc/jsonrpc.html). By default, this
will attempt to load the SPDK user-space idxd driver. To use the built-in
kernel driver on Linux, add the `-k` parameter. See the next section for
details on using the kernel driver.

The DSA hardware supports a limited queue depth and number of channels. This means that
only a limited number of `spdk_thread`s will be able to acquire a channel.
Design software to deal with the inability to get a channel.
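The two driver choices described above look like this in practice (a sketch; paths assumed):

```bash
# Default: SPDK user-space idxd driver
./scripts/rpc.py dsa_scan_accel_module
# Or, with the Linux kernel idxd driver:
./scripts/rpc.py dsa_scan_accel_module -k
```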
#### How to use kernel idxd driver {#accel_idxd_kernel}

There are several dependencies to leverage the Linux idxd driver for driving DSA devices.

1. Linux kernel support: You need to have a Linux kernel with the `idxd` driver
   loaded. Further, add the following command line options to the kernel boot
   commands:

   ```bash
   intel_iommu=on,sm_on
   ```

2. User library dependency: Users need to install the developer version of the
   `accel-config` library. This is often packaged, but the source is available on
   [GitHub](https://github.com/intel/idxd-config). After the library is installed,
   users can use the `accel-config` command to configure the work queues (WQs) of
   the idxd devices managed by the kernel with the following steps:

Note: this library must be installed before you run `configure`.

```bash
accel-config disable-wq dsa0/wq0.1
accel-config disable-device dsa0
accel-config config-wq --group-id=0 --mode=dedicated --wq-size=128 --type=user --name="MyApp1" \
    --priority=10 --block-on-fault=1 dsa0/wq0.1
accel-config config-engine dsa0/engine0.0 --group-id=0
accel-config config-engine dsa0/engine0.1 --group-id=0
accel-config config-engine dsa0/engine0.2 --group-id=0
accel-config config-engine dsa0/engine0.3 --group-id=0
accel-config enable-device dsa0
accel-config enable-wq dsa0/wq0.1
```

DSA can be configured in many ways, but the above configuration is needed for use with SPDK.
Before you can run using the kernel driver you need to make sure that the hardware is bound
to the kernel driver and not VFIO. By default when you run `setup.sh` DSA devices will be
bound to VFIO. To exclude DSA devices, pass a whitespace-separated list of DSA device BDFs
using the PCI_BLOCKED parameter as shown below.

```bash
sudo PCI_BLOCKED="0000:04:00.0 0000:05:00.0" ./setup.sh
```

Note: you might need to run `sudo ./setup.sh reset` to unbind all drivers before performing
the step above.

### Software Module {#accel_sw}

The software module is enabled by default. If no hardware module is explicitly
enabled via startup RPC as discussed earlier, the software module will use ISA-L,
if available, for functions such as CRC32C. Otherwise, standard glibc calls are
used to back the framework API.

### dpdk_cryptodev {#accel_dpdk_cryptodev}

The dpdk_cryptodev module uses the DPDK CryptoDev API to implement crypto operations.
The following ciphers and PMDs are supported:

- AESN-NI Multi Buffer Crypto Poll Mode Driver: RTE_CRYPTO_CIPHER_AES128_CBC
- Intel(R) QuickAssist (QAT) Crypto Poll Mode Driver: RTE_CRYPTO_CIPHER_AES128_CBC,
  RTE_CRYPTO_CIPHER_AES128_XTS
  (Note: QAT is functional, however it is marked as experimental until the hardware has
  been fully integrated with the SPDK CI system.)
- MLX5 Crypto Poll Mode Driver: RTE_CRYPTO_CIPHER_AES256_XTS, RTE_CRYPTO_CIPHER_AES512_XTS

To enable this module, use [`dpdk_cryptodev_scan_accel_module`](https://spdk.io/doc/jsonrpc.html).
This RPC is available in the STARTUP state, so the SPDK application needs to be run with the `--wait-for-rpc`
CLI parameter. To select a specific PMD, use [`dpdk_cryptodev_set_driver`](https://spdk.io/doc/jsonrpc.html).
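Putting that together, enabling the module during startup might look like the following sketch
(the driver name `crypto_aesni_mb` is one example; the flow mirrors the crypto example in the
bdev guide later in this changeset):

```bash
# Application started with --wait-for-rpc, then:
./scripts/rpc.py dpdk_cryptodev_scan_accel_module
./scripts/rpc.py dpdk_cryptodev_set_driver crypto_aesni_mb
./scripts/rpc.py framework_start_init
```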
### Module to Operation Code Assignment {#accel_assignments}

When multiple modules are initialized, the accel framework will assign op codes to
modules by first assigning all op codes to the Software Module and then overriding
op code assignments to Hardware Modules in the order in which they were initialized.
The RPC `accel_get_opc_assignments` can be used at any time to see the current
assignment map, including the names of valid operations. The RPC `accel_assign_opc`
can be used after initializing the desired Hardware Modules but before starting the
framework in the event that a specific override is desired. Note that to start an
application and send startup RPCs, use the `--wait-for-rpc` parameter and then use the
`framework_start_init` RPC to continue. For example, assume the DSA Module is initialized
but for some reason the desire is to have the Software Module handle copies instead.
The following RPCs would accomplish the copy override:

```bash
./scripts/rpc.py dsa_scan_accel_module
./scripts/rpc.py accel_assign_opc -o copy -m software
./scripts/rpc.py framework_start_init
./scripts/rpc.py accel_get_opc_assignments
{
  "copy": "software",
  "fill": "dsa",
  "dualcast": "dsa",
  "compare": "dsa",
  "crc32c": "dsa",
  "copy_crc32c": "dsa",
  "compress": "software",
  "decompress": "software"
}
```

To determine the names of available modules and their supported operations, use the
RPC `accel_get_module_info`.
@@ -29,28 +29,32 @@ Param | Long Param | Type | Default | Descript
-------- | ---------------------- | -------- | ---------------------- | -----------
-c       | --config               | string   |                        | @ref cmd_arg_config_file
-d       | --limit-coredump       | flag     | false                  | @ref cmd_arg_limit_coredump
-e       | --tpoint-group         | integer  |                        | @ref cmd_arg_limit_tpoint_group_mask
-e       | --tpoint-group-mask    | integer  | 0x0                    | @ref cmd_arg_limit_tpoint_group_mask
-g       | --single-file-segments | flag     |                        | @ref cmd_arg_single_file_segments
-h       | --help                 | flag     |                        | show all available parameters and exit
-i       | --shm-id               | integer  |                        | @ref cmd_arg_multi_process
-m       | --cpumask              | CPU mask | 0x1                    | application @ref cpu_mask
-n       | --mem-channels         | integer  | all channels           | number of memory channels used for DPDK
-p       | --main-core            | integer  | first core in CPU mask | main (primary) core for DPDK
-p       | --master-core          | integer  | first core in CPU mask | master (primary) core for DPDK
-r       | --rpc-socket           | string   | /var/tmp/spdk.sock     | RPC listen address
-s       | --mem-size             | integer  | all hugepage memory    | @ref cmd_arg_memory_size
         | --silence-noticelog    | flag     |                        | disable notice level logging to `stderr`
-u       | --no-pci               | flag     |                        | @ref cmd_arg_disable_pci_access.
         | --wait-for-rpc         | flag     |                        | @ref cmd_arg_deferred_initialization
-B       | --pci-blocked          | B:D:F    |                        | @ref cmd_arg_pci_blocked_allowed.
-A       | --pci-allowed          | B:D:F    |                        | @ref cmd_arg_pci_blocked_allowed.
-B       | --pci-blacklist        | B:D:F    |                        | @ref cmd_arg_pci_blacklist_whitelist.
-W       | --pci-whitelist        | B:D:F    |                        | @ref cmd_arg_pci_blacklist_whitelist.
-R       | --huge-unlink          | flag     |                        | @ref cmd_arg_huge_unlink
         | --huge-dir             | string   | the first discovered   | allocate hugepages from a specific mount
-L       | --logflag              | string   |                        | @ref cmd_arg_log_flags
-L       | --logflag              | string   |                        | @ref cmd_arg_debug_log_flags

### Configuration file {#cmd_arg_config_file}

SPDK applications are configured using a JSON RPC configuration file.
See @ref jsonrpc for details.
Historically, the SPDK applications were configured using a configuration file.
This is still supported, but is considered deprecated in favor of JSON RPC
configuration. See @ref jsonrpc for details.

Note that `--config` and `--wait-for-rpc` cannot be used at the same time.

### Limit coredump {#cmd_arg_limit_coredump}

@@ -61,7 +65,7 @@ to RLIM_INFINITY. Specifying `--limit-coredump` will not set the resource limit

SPDK has an experimental low overhead tracing framework. Tracepoints in this
framework are organized into tracepoint groups. By default, all tracepoint
groups are disabled. `--tpoint-group` can be used to enable a specific
groups are disabled. `--tpoint-group-mask` can be used to enable a specific
subset of tracepoint groups in the application.

Note: Additional documentation on the tracepoint framework is in progress.

@@ -72,17 +76,17 @@ SPDK applications progress through a set of states beginning with `STARTUP` and
ending with `RUNTIME`.

If the `--wait-for-rpc` parameter is provided SPDK will pause just before starting
framework initialization. This state is called `STARTUP`. The JSON RPC server is
ready but only a small subset of commands are available to set up initialization
subsystem initialization. This state is called `STARTUP`. The JSON RPC server is
ready but only a small subsystem of commands are available to set up initialization
parameters. Those parameters can't be changed after the SPDK application enters
`RUNTIME` state. When the client finishes configuring the SPDK subsystems it
needs to issue the @ref rpc_framework_start_init RPC command to begin the
initialization process. After `rpc_framework_start_init` returns `true` SPDK
needs to issue the @ref rpc_start_subsystem_init RPC command to begin the
initialization process. After `rpc_start_subsystem_init` returns `true` SPDK
will enter the `RUNTIME` state and the list of available commands becomes much
larger.

To see which RPC methods are available in the current state, issue
`rpc_get_methods` with the parameter `current` set to `true`.
`get_rpc_methods` with the parameter `current` set to `true`.

For more details see @ref jsonrpc documentation.
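A compact sketch of that flow (the application binary is assumed, and the RPC names follow the
newer variants above):

~~~bash
./build/bin/spdk_tgt --wait-for-rpc &
./scripts/rpc.py rpc_get_methods --current   # only STARTUP-state methods listed
# ...send configuration RPCs here...
./scripts/rpc.py framework_start_init        # application proceeds to RUNTIME
~~~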
@@ -121,12 +125,12 @@ If SPDK is run with PCI access disabled it won't detect any PCI devices. This
includes primarily NVMe and IOAT devices. Also, the VFIO and UIO kernel modules
are not required in this mode.

### PCI address blocked and allowed lists {#cmd_arg_pci_blocked_allowed}
### PCI address blacklist and whitelist {#cmd_arg_pci_blacklist_whitelist}

If a blocked list is used, then all devices with the provided PCI address will be
ignored. If an allowed list is used, only allowed devices will be probed.
`-B` or `-A` can be used more than once, but cannot be mixed together. That is,
`-B` and `-A` cannot be used at the same time.
If a blacklist is used, then all devices with the provided PCI address will be
ignored. If a whitelist is used, only whitelisted devices will be probed.
`-B` or `-W` can be used more than once, but cannot be mixed together. That is,
`-B` and `-W` cannot be used at the same time.
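For example (the PCI addresses are hypothetical; flags follow the newer `-B`/`-A` variants):

~~~bash
# Ignore two specific devices:
./build/bin/spdk_tgt -B 0000:04:00.0 -B 0000:05:00.0
# Or probe a single allowed device only:
./build/bin/spdk_tgt -A 0000:01:00.0
~~~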
### Unlink hugepage files after initialization {#cmd_arg_huge_unlink}

@@ -134,11 +138,11 @@ By default, each DPDK-based application tries to remove any orphaned hugetlbfs
files during its initialization. This option removes hugetlbfs files of the current
process as soon as they're created, but is not compatible with `--shm-id`.

### Log flag {#cmd_arg_log_flags}
### Debug log {#cmd_arg_debug_log_flags}

Enable a specific log type. This option can be used more than once. A list of
Enable a specific debug log type. This option can be used more than once. A list of
all available types is provided in the `--help` output, with `--logflag all`
enabling all of them. Additionally enables debug print level in debug builds of SPDK.
enabling all of them. Debug logs are only available in debug builds of SPDK.

## CPU mask {#cpu_mask}

@@ -151,7 +155,7 @@ Whenever the `CPU mask` is mentioned it is a string in one of the following form

The following CPU masks are equal and correspond to CPUs 0, 1, 2, 8, 9, 10, 11 and 12:

~~~bash
~~~
0x1f07
0x1F07
1f07
582 doc/bdev.md
@@ -1,11 +1,6 @@
# Block Device User Guide {#bdev}

## Target Audience {#bdev_ug_targetaudience}

This user guide is intended for software developers who have knowledge of block storage, storage drivers, issuing JSON-RPC
commands and storage services such as RAID, compression, crypto, and others.

## Introduction {#bdev_ug_introduction}
# Introduction {#bdev_ug_introduction}

The SPDK block device layer, often simply called *bdev*, is a C library
intended to be equivalent to the operating system block storage layer that

@@ -27,7 +22,7 @@ device underneath (please refer to @ref bdev_module for details). SPDK
also provides vbdev modules which create block devices on existing bdevs. For
example @ref bdev_ug_logical_volumes or @ref bdev_ug_gpt.

## Prerequisites {#bdev_ug_prerequisites}
# Prerequisites {#bdev_ug_prerequisites}

This guide assumes that you can already build the standard SPDK distribution
on your platform. The block device layer is a C library with a single public

@@ -36,273 +31,168 @@ chapters is done by using JSON-RPC commands. SPDK provides a python-based
command line tool for sending RPC commands located at `scripts/rpc.py`. The user
can list available commands by running this script with the `-h` or `--help` flag.
Additionally the user can retrieve the currently supported set of RPC commands
directly from the SPDK application by running `scripts/rpc.py rpc_get_methods`.
directly from the SPDK application by running `scripts/rpc.py get_rpc_methods`.
Detailed help for each command can be displayed by adding the `-h` flag as a
command parameter.

## Configuring Block Device Modules {#bdev_ug_general_rpcs}
# General Purpose RPCs {#bdev_ug_general_rpcs}

Block devices can be configured using JSON RPCs. A complete list of available RPC commands
with detailed information can be found on the @ref jsonrpc_components_bdev page.
## get_bdevs {#bdev_ug_get_bdevs}

## Common Block Device Configuration Examples
A list of currently available block devices, including detailed information about
them, can be obtained by using the `get_bdevs` RPC command. The user can add the optional
parameter `name` to get details about the bdev specified by that name.

## Ceph RBD {#bdev_config_rbd}
Example response

~~~
{
  "num_blocks": 32768,
  "assigned_rate_limits": {
    "rw_ios_per_sec": 10000,
    "rw_mbytes_per_sec": 20
  },
  "supported_io_types": {
    "reset": true,
    "nvme_admin": false,
    "unmap": true,
    "read": true,
    "write_zeroes": true,
    "write": true,
    "flush": true,
    "nvme_io": false
  },
  "driver_specific": {},
  "claimed": false,
  "block_size": 4096,
  "product_name": "Malloc disk",
  "name": "Malloc0"
}
~~~

## set_bdev_qos_limit {#set_bdev_qos_limit}

Users can use the `set_bdev_qos_limit` RPC command to enable, adjust, and disable
rate limits on an existing bdev. Two types of rate limits are supported:
IOPS and bandwidth. The rate limits can be enabled, adjusted, and disabled at any
time for the specified bdev. The bdev name is a required parameter for this
RPC command and at least one of `rw_ios_per_sec` and `rw_mbytes_per_sec` must be
specified. When both rate limits are enabled, the first met limit will
take effect. The value 0 may be specified to disable the corresponding rate
limit. Users can run this command with `-h` or `--help` for more information.
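For instance (a sketch; the flag names follow the `rw_ios_per_sec`/`rw_mbytes_per_sec`
parameters described above):

~~~bash
# Enable both limits on Malloc0, then disable the bandwidth limit again:
rpc.py set_bdev_qos_limit Malloc0 --rw_ios_per_sec 20000 --rw_mbytes_per_sec 100
rpc.py set_bdev_qos_limit Malloc0 --rw_mbytes_per_sec 0
~~~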
## Histograms {#rpc_bdev_histogram}

The `enable_bdev_histogram` RPC command allows the user to enable or disable gathering
latency data for a specified bdev. The histogram can be downloaded by the user by
calling `get_bdev_histogram` and parsed using the scripts/histogram.py script.

Example command

`rpc.py enable_bdev_histogram Nvme0n1 --enable`

The command will enable gathering data for histogram on the Nvme0n1 device.

`rpc.py get_bdev_histogram Nvme0n1 | histogram.py`

The command will download gathered histogram data. The script will parse
the data and show a table containing IO count for latency ranges.

`rpc.py enable_bdev_histogram Nvme0n1 --disable`

The command will disable histogram on the Nvme0n1 device.

# Ceph RBD {#bdev_config_rbd}

The SPDK RBD bdev driver provides SPDK block layer access to Ceph RADOS block
devices (RBD). Ceph RBD devices are accessed via librbd and librados libraries
to access the RADOS block device exported by Ceph. To create Ceph bdev RPC
command `bdev_rbd_register_cluster` and `bdev_rbd_create` should be used.

SPDK provides two ways of creating an RBD bdev. One is to create a new Rados cluster object
for each RBD bdev. Another is to share the same Rados cluster object for multiple RBD bdevs.
Each Rados cluster object creates a small number of io_context_pool and messenger threads.
The Ceph commands `ceph config help librados_thread_count` and `ceph config help ms_async_op_threads`
can help to check these threads' information. Besides, you can specify the number of threads by
updating the ceph.conf file or using Ceph config commands. For more information, please refer to
[Ceph configuration](https://docs.ceph.com/en/latest/rados/configuration/ceph-conf/).
One set of threads may not be enough to maximize performance with a large number of RBD bdevs,
but one set of threads per RBD bdev may add too much context switching. Therefore, performance
tuning on the number of RBD bdevs per cluster object and thread may be required.
command `construct_rbd_bdev` should be used.

Example command

`rpc.py bdev_rbd_register_cluster rbd_cluster`

This command will register a cluster named rbd_cluster. Optional `--config-file` and
`--key-file` params are specified for the cluster.

To remove a registered cluster use the bdev_rbd_unregister_cluster command.

`rpc.py bdev_rbd_unregister_cluster rbd_cluster`

To create an RBD bdev with a registered cluster.

`rpc.py bdev_rbd_create rbd foo 512 -c rbd_cluster`
`rpc.py construct_rbd_bdev rbd foo 512`

This command will create a bdev that represents the 'foo' image from a pool called 'rbd'.
When specifying -c for `bdev_rbd_create`, RBD bdevs will share the same rados cluster with
one connection of Ceph in the librbd module. Without specifying -c, a new rados cluster with one
cluster connection is created for every bdev.

To remove a block device representation use the bdev_rbd_delete command.
To remove a block device representation use the delete_rbd_bdev command.

`rpc.py bdev_rbd_delete Rbd0`
`rpc.py delete_rbd_bdev Rbd0`

To resize a bdev use the bdev_rbd_resize command.

`rpc.py bdev_rbd_resize Rbd0 4096`

This command will resize the Rbd0 bdev to 4096 MiB.
## Compression Virtual Bdev Module {#bdev_config_compress}

The compression bdev module can be configured to provide compression/decompression
services for an underlying thinly provisioned logical volume. Although the underlying
module can be anything (i.e. NVMe bdev), the overall compression benefits will not be realized
unless the data stored on disk is placed appropriately. The compression vbdev module
relies on an internal SPDK library called `reduce` to accomplish this; see @ref reduce
for detailed information.

The compression bdev module leverages the [Acceleration Framework](https://spdk.io/doc/accel_fw.html) to
carry out the actual compression and decompression. The acceleration framework can be configured to use
ISA-L software optimized compression or the DPDK Compressdev module for hardware acceleration. To configure
the Compressdev module please see the `compressdev_scan_accel_module` documentation [here](https://spdk.io/doc/jsonrpc.html).

Persistent memory is used to store metadata associated with the layout of the data on the
backing device. SPDK relies on [PMDK](http://pmem.io/pmdk/) to interface persistent memory, so any hardware
supported by PMDK should work. If the directory for PMEM supplied upon vbdev creation does
not point to persistent memory (i.e. a regular filesystem), performance will be severely
impacted. The vbdev module and reduce libraries were designed to use persistent memory for
any production use.

Example command

`rpc.py bdev_compress_create -p /pmem_files -b myLvol`

In this example, a compression vbdev is created using persistent memory that is mapped to
the directory `pmem_files` on top of the existing thinly provisioned logical volume `myLvol`.
The resulting compression bdev will be named `COMP_LVS/myLvol` where LVS is the name of the
logical volume store that `myLvol` resides on.

The logical volume is referred to as the backing device and once the compression vbdev is
created it cannot be separated from the persistent memory file that will be created in
the specified directory. If the persistent memory file is not available, the compression
vbdev will also not be available.

To remove a compression vbdev, use the following command, which will also delete the PMEM
file. If the logical volume is deleted the PMEM file will not be removed and the
compression vbdev will not be available.

`rpc.py bdev_compress_delete COMP_LVS/myLvol`

To list compression volumes that are only available for deletion because their PMEM file
was missing, use the following. The name parameter is optional; if not included it will list
all volumes, and if used it will return the name or an error that the device does not exist.

`rpc.py bdev_compress_get_orphans --name COMP_Nvme0n1`
## Crypto Virtual Bdev Module {#bdev_config_crypto}
# Crypto Virtual Bdev Module {#bdev_config_crypto}

The crypto virtual bdev module can be configured to provide at rest data encryption
for any underlying bdev. The module relies on the SPDK Accel Framework to provide
all cryptographic functionality.
One of the accel modules, dpdk_cryptodev, is implemented with the DPDK CryptoDev API;
it provides support for many different software-only cryptographic modules as well as
hardware-assisted support for the Intel QAT board and NVIDIA crypto enabled NICs.

For reads, the buffer provided to the crypto block device will be used as the destination buffer
for unencrypted data. For writes, however, a temporary scratch buffer is used as the
destination buffer for encryption, which is then passed on to the underlying bdev as the
write buffer. This is done to avoid encrypting the data in the original source buffer, which
may cause problems in some use cases.

Below is information about accel modules which support crypto operations:

### dpdk_cryptodev accel module

Supports the following ciphers:
for any underlying bdev. The module relies on the DPDK CryptoDev Framework to provide
all cryptographic functionality. The framework provides support for many different software-only
cryptographic modules as well as hardware-assisted support for the Intel QAT board. The
framework also provides support for cipher, hash, authentication and AEAD functions. At this
time the SPDK virtual bdev module supports cipher only as follows:

- AESN-NI Multi Buffer Crypto Poll Mode Driver: RTE_CRYPTO_CIPHER_AES128_CBC
- Intel(R) QuickAssist (QAT) Crypto Poll Mode Driver: RTE_CRYPTO_CIPHER_AES128_CBC,
  RTE_CRYPTO_CIPHER_AES128_XTS
  (Note: QAT is functional, however it is marked as experimental until the hardware has
  been fully integrated with the SPDK CI system.)
- MLX5 Crypto Poll Mode Driver: RTE_CRYPTO_CIPHER_AES256_XTS, RTE_CRYPTO_CIPHER_AES512_XTS
- Intel(R) QuickAssist (QAT) Crypto Poll Mode Driver: RTE_CRYPTO_CIPHER_AES128_CBC
  (Note: QAT is functional, however it is marked as experimental until the hardware has
  been fully integrated with the SPDK CI system.)

In order to support using the bdev block offset (LBA) as the initialization vector (IV),
the crypto module breaks up all I/O into crypto operations of a size equal to the block
size of the underlying bdev. For example, a 4K I/O to a bdev with a 512B block size
would result in 8 cryptographic operations.

### SW accel module
For reads, the buffer provided to the crypto module will be used as the destination buffer
for unencrypted data. For writes, however, a temporary scratch buffer is used as the
destination buffer for encryption, which is then passed on to the underlying bdev as the
write buffer. This is done to avoid encrypting the data in the original source buffer, which
may cause problems in some use cases.

Supports the following ciphers:
Example command

- AES_XTS cipher with 128 or 256 bit keys implemented with ISA-L_crypto
`rpc.py construct_crypto_bdev -b NVMe1n1 -c CryNvmeA -d crypto_aesni_mb -k 0123456789123456`

### General workflow
This command will create a crypto vbdev called 'CryNvmeA' on top of the NVMe bdev
'NVMe1n1' and will use the DPDK software driver 'crypto_aesni_mb' and the key
'0123456789123456'.

- Set the desired accel module to perform crypto operations; that can be done with the `accel_assign_opc` RPC command
- Create a named crypto key using the `accel_crypto_key_create` RPC command. The key will use the assigned accel
  module. The set of parameters and supported ciphers may be different in each accel module.
- Create a virtual crypto block device providing the base block device name and the crypto key name
  using the `bdev_crypto_create` RPC command
To remove the vbdev use the delete_crypto_bdev command.

#### Example
`rpc.py delete_crypto_bdev CryNvmeA`

Example command which uses the dpdk_cryptodev accel module

```
# start SPDK application with `--wait-for-rpc` parameter
rpc.py dpdk_cryptodev_scan_accel_module
rpc.py dpdk_cryptodev_set_driver crypto_aesni_mb
rpc.py accel_assign_opc -o encrypt -m dpdk_cryptodev
rpc.py accel_assign_opc -o decrypt -m dpdk_cryptodev
rpc.py framework_start_init
rpc.py accel_crypto_key_create -c AES_CBC -k 01234567891234560123456789123456 -n key_aesni_cbc_1
rpc.py bdev_crypto_create NVMe1n1 CryNvmeA -n key_aesni_cbc_1
```

These commands will create a crypto vbdev called 'CryNvmeA' on top of the NVMe bdev
'NVMe1n1' and will use a key named `key_aesni_cbc_1`. The key will work with the accel module which
has been assigned for encrypt operations; in this example it will be the dpdk_cryptodev.

### Crypto key format

Please make sure the keys are provided in hexlified format. This means the string passed to
rpc.py must be twice as long as the key length in binary form.

#### Example command

`rpc.py accel_crypto_key_create -c AES_XTS -k2 7859243a027411e581e0c40a35c8228f -k d16a2f3a9e9f5b32daefacd7f5984f4578add84425be4a0baa489b9de8884b09 -n sample_key`

This command will create a key called `sample_key`, the AES key
'd16a2f3a9e9f5b32daefacd7f5984f4578add84425be4a0baa489b9de8884b09' and the XTS key
'7859243a027411e581e0c40a35c8228f'. In other words, the compound AES_XTS key to be used is
'd16a2f3a9e9f5b32daefacd7f5984f4578add84425be4a0baa489b9de8884b097859243a027411e581e0c40a35c8228f'

### Delete the virtual crypto block device

To remove the vbdev use the bdev_crypto_delete command.

`rpc.py bdev_crypto_delete CryNvmeA`

### dpdk_cryptodev mlx5_pci driver configuration

The mlx5_pci driver works with crypto enabled Nvidia NICs and requires special configuration of
the DPDK environment to enable the crypto function. It can be done via the SPDK event library by configuring
the `env_context` member of the `spdk_app_opts` structure or by passing corresponding CLI arguments in
the following form: `--allow=BDF,class=crypto,wcs_file=/full/path/to/wrapped/credentials`, e.g.
`--allow=0000:01:00.0,class=crypto,wcs_file=/path/credentials.txt`.
## Delay Bdev Module {#bdev_config_delay}

The delay vbdev module is intended to apply a predetermined additional latency on top of a lower
level bdev. This enables the simulation of the latency characteristics of a device during the functional
or scalability testing of an SPDK application. For example, to simulate the effect of drive latency when
processing I/Os, one could configure a NULL bdev with a delay bdev on top of it.

The delay bdev module is not intended to provide a high fidelity replication of a specific NVMe drive's latency;
instead its main purpose is to provide a "big picture" understanding of how a generic latency affects a given
application.

A delay bdev is created using the `bdev_delay_create` RPC. This RPC takes 6 arguments, one for the name
of the delay bdev and one for the name of the base bdev. The remaining four arguments represent the following
latency values: average read latency, average write latency, p99 read latency, and p99 write latency.
Within the context of the delay bdev, p99 latency means that one percent of the I/O will be delayed by at
least the value of the p99 latency before being completed to the upper level protocol. All of the latency values
are measured in microseconds.

Example command:

`rpc.py bdev_delay_create -b Null0 -d delay0 -r 10 --nine-nine-read-latency 50 -w 30 --nine-nine-write-latency 90`

This command will create a delay bdev with average read and write latencies of 10 and 30 microseconds and p99 read
and write latencies of 50 and 90 microseconds respectively.

A delay bdev can be deleted using the `bdev_delay_delete` RPC.

Example command:

`rpc.py bdev_delay_delete delay0`
## GPT (GUID Partition Table) {#bdev_config_gpt}
# GPT (GUID Partition Table) {#bdev_config_gpt}

The GPT virtual bdev driver is enabled by default and does not require any configuration.
It will automatically detect @ref bdev_ug_gpt on any attached bdev and will create
possibly multiple virtual bdevs.

### SPDK GPT partition table {#bdev_ug_gpt}
## SPDK GPT partition table {#bdev_ug_gpt}

The SPDK partition type GUID is `6527994e-2c5a-4eec-9613-8f5944074e8b`. Existing SPDK bdevs
can be exposed as Linux block devices via NBD and then can be partitioned with
The SPDK partition type GUID is `7c5222bd-8f5d-4087-9c00-bf9843c7b58c`. Existing SPDK bdevs
can be exposed as Linux block devices via NBD and then ca be partitioned with
standard partitioning tools. After partitioning, the bdevs will need to be deleted and
attached again for the GPT bdev module to see any changes. The NBD kernel module must be
loaded first. To create an NBD bdev the user should use the `nbd_start_disk` RPC command.
loaded first. To create an NBD bdev the user should use the `start_nbd_disk` RPC command.

Example command

`rpc.py nbd_start_disk Malloc0 /dev/nbd0`
`rpc.py start_nbd_disk Malloc0 /dev/nbd0`

This will expose an SPDK bdev `Malloc0` under the `/dev/nbd0` block device.

To remove an NBD device the user should use the `nbd_stop_disk` RPC command.
To remove an NBD device the user should use the `stop_nbd_disk` RPC command.

Example command

`rpc.py nbd_stop_disk /dev/nbd0`
`rpc.py stop_nbd_disk /dev/nbd0`

To display the full or specified nbd device list the user should use the `nbd_get_disks` RPC command.
To display the full or specified nbd device list the user should use the `get_nbd_disks` RPC command.

Example command

`rpc.py nbd_stop_disk -n /dev/nbd0`
`rpc.py stop_nbd_disk -n /dev/nbd0`

### Creating a GPT partition table using NBD {#bdev_ug_gpt_create_part}
## Creating a GPT partition table using NBD {#bdev_ug_gpt_create_part}

~~~bash
~~~
# Expose bdev Nvme0n1 as kernel block device /dev/nbd0 by JSON-RPC
rpc.py nbd_start_disk Nvme0n1 /dev/nbd0
rpc.py start_nbd_disk Nvme0n1 /dev/nbd0

# Create GPT partition table.
parted -s /dev/nbd0 mklabel gpt

@@ -312,17 +202,17 @@ parted -s /dev/nbd0 mkpart MyPartition '0%' '50%'

# Change the partition type to the SPDK GUID.
# sgdisk is part of the gdisk package.
sgdisk -t 1:6527994e-2c5a-4eec-9613-8f5944074e8b /dev/nbd0
sgdisk -t 1:7c5222bd-8f5d-4087-9c00-bf9843c7b58c /dev/nbd0

# Stop the NBD device (stop exporting /dev/nbd0).
rpc.py nbd_stop_disk /dev/nbd0
rpc.py stop_nbd_disk /dev/nbd0

# Now Nvme0n1 is configured with a GPT partition table, and
# the first partition will be automatically exposed as
# Nvme0n1p1 in SPDK applications.
~~~
## iSCSI bdev {#bdev_config_iscsi}
# iSCSI bdev {#bdev_config_iscsi}

The SPDK iSCSI bdev driver depends on libiscsi and hence is not enabled by default.
In order to use it, build SPDK with an extra `--with-iscsi-initiator` configure option.

@@ -330,44 +220,44 @@ In order to use it, build SPDK with an extra `--with-iscsi-initiator` configure

The following command creates an `iSCSI0` bdev from a single LUN exposed at the given iSCSI URL
with `iqn.2016-06.io.spdk:init` as the reported initiator IQN.

`rpc.py bdev_iscsi_create -b iSCSI0 -i iqn.2016-06.io.spdk:init --url iscsi://127.0.0.1/iqn.2016-06.io.spdk:disk1/0`
`rpc.py construct_iscsi_bdev -b iSCSI0 -i iqn.2016-06.io.spdk:init --url iscsi://127.0.0.1/iqn.2016-06.io.spdk:disk1/0`

The URL is in the following format:
`iscsi://[<username>[%<password>]@]<host>[:<port>]/<target-iqn>/<lun>`
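As an illustration of the optional fields in that URL format (host, port, and credentials below
are hypothetical):

~~~bash
# LUN 0 on a remote target, authenticating as user 'user' with password 'secret':
rpc.py bdev_iscsi_create -b iSCSI1 -i iqn.2016-06.io.spdk:init \
    --url 'iscsi://user%secret@192.168.1.10:3260/iqn.2016-06.io.spdk:disk1/0'
~~~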
## Linux AIO bdev {#bdev_config_aio}
# Linux AIO bdev {#bdev_config_aio}

The SPDK AIO bdev driver provides SPDK block layer access to Linux kernel block
devices or a file on a Linux filesystem via Linux AIO. Note that O_DIRECT is
used and thus bypasses the Linux page cache. This mode is probably as close to
a typical kernel based target as a user space target can get without using a
user-space driver. To create an AIO bdev the RPC command `bdev_aio_create` should be
user-space driver. To create an AIO bdev the RPC command `construct_aio_bdev` should be
used.

Example commands

`rpc.py bdev_aio_create /dev/sda aio0`
`rpc.py construct_aio_bdev /dev/sda aio0`

This command will create an `aio0` device from /dev/sda.

`rpc.py bdev_aio_create /tmp/file file 4096`
`rpc.py construct_aio_bdev /tmp/file file 8192`

This command will create a `file` device with block size 4096 from /tmp/file.
This command will create a `file` device with block size 8192 from /tmp/file.

To delete an aio bdev use the bdev_aio_delete command.
To delete an aio bdev use the delete_aio_bdev command.

`rpc.py bdev_aio_delete aio0`
`rpc.py delete_aio_bdev aio0`
## OCF Virtual bdev {#bdev_config_cas}
# OCF Virtual bdev {#bdev_config_cas}

The OCF virtual bdev module is based on the [Open CAS Framework](https://github.com/Open-CAS/ocf), a
high performance block storage caching meta-library.
To enable the module, configure SPDK using the `--with-ocf` flag.
To enable the module, configure SPDK with `--with-ocf=/path/to/ocf/library`.
An OCF bdev can be used to enable caching for any underlying bdev.

Below is an example command for creating an OCF bdev:

`rpc.py bdev_ocf_create Cache1 wt Malloc0 Nvme0n1`
`rpc.py construct_ocf_bdev Cache1 wt Malloc0 Nvme0n1`

This command will create a new OCF bdev `Cache1` having bdev `Malloc0` as the caching device
and `Nvme0n1` as the core device, with initial cache mode `Write-Through`.

@@ -378,118 +268,86 @@ and non-volatile metadata will be disabled.

To remove `Cache1`:

`rpc.py bdev_ocf_delete Cache1`
`rpc.py delete_ocf_bdev Cache1`

During removal the OCF cache will be stopped and all cached data will be written to the core device.

Note that OCF has a per-device RAM requirement. More details can be found in the
[OCF documentation](https://open-cas.github.io/guide_system_requirements.html).
Note that OCF has a per-device RAM requirement
of about 56000 + _cache device size_ * 58 / _cache line size_ (in bytes).
To get more information on OCF
please visit the [OCF documentation](https://open-cas.github.io/).
## Malloc bdev {#bdev_config_malloc}
# Malloc bdev {#bdev_config_malloc}

Malloc bdevs are ramdisks. Because of their nature they are volatile. They are created from hugepage memory given to the SPDK
application.

Example command for creating a malloc bdev:

`rpc.py bdev_malloc_create -b Malloc0 64 512`

Example command for removing a malloc bdev:

`rpc.py bdev_malloc_delete Malloc0`

## Null {#bdev_config_null}
# Null {#bdev_config_null}

The SPDK null bdev driver is a dummy block I/O target that discards all writes and returns undefined
data for reads. It is useful for benchmarking the rest of the bdev I/O stack with minimal block
device overhead and for testing configurations that can't easily be created with the Malloc bdev.
To create a Null bdev the RPC command `bdev_null_create` should be used.
To create a Null bdev the RPC command `construct_null_bdev` should be used.

Example command

`rpc.py bdev_null_create Null0 8589934592 4096`
`rpc.py construct_null_bdev Null0 8589934592 4096`

This command will create an 8 petabyte `Null0` device with block size 4096.

To delete a null bdev use the bdev_null_delete command.
To delete a null bdev use the delete_null_bdev command.

`rpc.py bdev_null_delete Null0`
`rpc.py delete_null_bdev Null0`
## NVMe bdev {#bdev_config_nvme}
# NVMe bdev {#bdev_config_nvme}

There are two ways to create a block device based on an NVMe device in SPDK. The first
way is to connect a local PCIe drive, the second one is to connect an NVMe-oF device.
In both cases the user should use the `bdev_nvme_attach_controller` RPC command to achieve that.
In both cases the user should use the `construct_nvme_bdev` RPC command to achieve that.

Example commands

`rpc.py bdev_nvme_attach_controller -b NVMe1 -t PCIe -a 0000:01:00.0`
`rpc.py construct_nvme_bdev -b NVMe1 -t PCIe -a 0000:01:00.0`

This command will create an NVMe bdev for a physical device in the system.

`rpc.py bdev_nvme_attach_controller -b Nvme0 -t RDMA -a 192.168.100.1 -f IPv4 -s 4420 -n nqn.2016-06.io.spdk:cnode1`
`rpc.py construct_nvme_bdev -b Nvme0 -t RDMA -a 192.168.100.1 -f IPv4 -s 4420 -n nqn.2016-06.io.spdk:cnode1`

This command will create an NVMe bdev for an NVMe-oF resource.
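Here `-t` selects the transport, `-a` and `-s` give the target address and service port,
`-f` the address family, and `-n` the NQN of the NVMe-oF subsystem to connect to.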

To remove an NVMe controller use the bdev_nvme_detach_controller command.
To remove a NVMe controller use the delete_nvme_controller command.

`rpc.py bdev_nvme_detach_controller Nvme0`
`rpc.py delete_nvme_controller Nvme0`

This command will remove the NVMe bdev named Nvme0.
This command will remove the NVMe controller named Nvme0.

The SPDK NVMe bdev driver provides the multipath feature. Please refer to
@ref nvme_multipath for details.

### NVMe bdev character device {#bdev_config_nvme_cuse}

This feature is considered experimental. You must configure SPDK with the --with-nvme-cuse
option to enable this RPC.

Example commands

`rpc.py bdev_nvme_cuse_register -n Nvme3`

This command will register a character device under /dev/spdk associated with the Nvme3
controller. If there are namespaces created on the Nvme3 controller, a namespace
character device is also created for each namespace.

For example, the first controller registered will have a character device path of
/dev/spdk/nvmeX, where X is replaced with a unique integer to differentiate it from
other controllers. Note that this 'nvmeX' name here has no correlation to the name
associated with the controller in SPDK. Namespace character devices will have a path
of /dev/spdk/nvmeXnY, where Y is the namespace ID.

CUSE devices are removed from the system when the NVMe controller is detached, or
unregistered with the command:

`rpc.py bdev_nvme_cuse_unregister -n Nvme0`

## Logical volumes {#bdev_ug_logical_volumes}
# Logical volumes {#bdev_ug_logical_volumes}

The Logical Volumes library is a flexible storage space management system. It allows
creating and managing virtual block devices with variable size on top of other bdevs.
The SPDK Logical Volume library is built on top of @ref blob. For a detailed description
please refer to @ref lvol.

### Logical volume store {#bdev_ug_lvol_store}
## Logical volume store {#bdev_ug_lvol_store}

Before creating any logical volumes (lvols), an lvol store has to be created first on
the selected block device. The lvol store is the lvols' container, responsible for managing
underlying bdev space assignment to lvol bdevs and storing metadata. To create an lvol store
the user should use the `bdev_lvol_create_lvstore` RPC command.
the user should use the `construct_lvol_store` RPC command.

Example command

`rpc.py bdev_lvol_create_lvstore Malloc2 lvs -c 4096`
`rpc.py construct_lvol_store Malloc2 lvs -c 4096`

This will create an lvol store named `lvs` with cluster size 4096, built on top of
the `Malloc2` bdev. In response the user will be provided with a uuid, the unique lvol store
identifier.

The user can get the list of available lvol stores using the `bdev_lvol_get_lvstores` RPC command (no
The user can get the list of available lvol stores using the `get_lvol_stores` RPC command (no
parameters available).

Example response

~~~
{
  "uuid": "330a6ab2-f468-11e7-983e-001e67edf35d",
@ -502,26 +360,26 @@ Example response
}
~~~

To delete an lvol store the user should use the `bdev_lvol_delete_lvstore` RPC command.
To delete an lvol store the user should use the `destroy_lvol_store` RPC command.

Example commands

`rpc.py bdev_lvol_delete_lvstore -u 330a6ab2-f468-11e7-983e-001e67edf35d`
`rpc.py destroy_lvol_store -u 330a6ab2-f468-11e7-983e-001e67edf35d`

`rpc.py bdev_lvol_delete_lvstore -l lvs`
`rpc.py destroy_lvol_store -l lvs`

### Lvols {#bdev_ug_lvols}
## Lvols {#bdev_ug_lvols}

To create lvols on an existing lvol store the user should use the `bdev_lvol_create` RPC command.
To create lvols on an existing lvol store the user should use the `construct_lvol_bdev` RPC command.
Each created lvol will be represented by a new bdev.

Example commands

`rpc.py bdev_lvol_create lvol1 25 -l lvs`
`rpc.py construct_lvol_bdev lvol1 25 -l lvs`

`rpc.py bdev_lvol_create lvol2 25 -u 330a6ab2-f468-11e7-983e-001e67edf35d`
`rpc.py construct_lvol_bdev lvol2 25 -u 330a6ab2-f468-11e7-983e-001e67edf35d`
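
In both examples `25` is the requested lvol size in MiB.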

## Passthru {#bdev_config_passthru}
# Passthru {#bdev_config_passthru}

The SPDK Passthru virtual block device module serves as an example of how to write a
virtual block device module. It implements the required functionality of a vbdev module
@ -529,95 +387,48 @@ and demonstrates some other basic features such as the use of per I/O context.

Example commands

`rpc.py bdev_passthru_create -b aio -p pt`
`rpc.py construct_passthru_bdev -b aio -p pt`

`rpc.py bdev_passthru_delete pt`
`rpc.py delete_passthru_bdev pt`

## RAID {#bdev_ug_raid}
# Pmem {#bdev_config_pmem}

RAID virtual bdev module provides functionality to combine any SPDK bdevs into
one RAID bdev. Currently SPDK supports only RAID 0. RAID metadata may be stored
on member disks if enabled when creating the RAID bdev, so the user does not have to
recreate the RAID volume when restarting the application. It is not enabled by
default for backward compatibility. The user may specify member disks to create a
RAID volume even if they do not exist yet - as the member disks are registered at
a later time, the RAID module will claim them and will surface the RAID volume
after all of the member disks are available. It is allowed to use disks of
different sizes - the smallest disk size will be the amount of space used on
each member disk.
The SPDK pmem bdev driver uses a pmemblk pool as the target for block I/O operations. For
details on Pmem memory please refer to the PMDK documentation on the http://pmem.io website.
First, the user needs to configure SPDK to include PMDK support:

Example commands
`configure --with-pmdk`

`rpc.py bdev_raid_create -n Raid0 -z 64 -r 0 -b "lvol0 lvol1 lvol2 lvol3"`
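Here `-n` names the RAID bdev, `-z 64` sets a 64 KiB strip size, `-r 0` selects RAID level 0,
and `-b` lists the member bdevs to combine.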
To create a pmemblk pool for use with SPDK the user should use the `create_pmem_pool` RPC command.

`rpc.py bdev_raid_get_bdevs`
Example command

`rpc.py bdev_raid_delete Raid0`
`rpc.py create_pmem_pool /path/to/pmem_pool 25 4096`

## Split {#bdev_ug_split}
To get information on the created pmem pool file the user can use the `pmem_pool_info` RPC command.

The split block device module takes an underlying block device and splits it into
several smaller equal-sized virtual block devices. This serves as an example to create
more vbdevs on a given base bdev for user testing.
Example command

Example commands
`rpc.py pmem_pool_info /path/to/pmem_pool`

To create four split bdevs with base bdev_b0 use the `bdev_split_create` command.
Each split bdev will be one fourth the size of the base bdev.
To remove a pmem pool file the user can use the `delete_pmem_pool` RPC command.

`rpc.py bdev_split_create bdev_b0 4`
Example command

The `split_size_mb` (`-s`) parameter restricts the size of each split bdev.
The total size of all split bdevs must not exceed the base bdev size.
`rpc.py delete_pmem_pool /path/to/pmem_pool`

`rpc.py bdev_split_create bdev_b0 4 -s 128`
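This creates four split bdevs, each restricted to 128 MB, on top of `bdev_b0`.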
To create a bdev based on a pmemblk pool file the user should use the `construct_pmem_bdev` RPC
command.

To remove the split bdevs, use the `bdev_split_delete` command with the base bdev name.
Example command

`rpc.py bdev_split_delete bdev_b0`
`rpc.py construct_pmem_bdev /path/to/pmem_pool -n pmem`

## Uring {#bdev_ug_uring}
To remove a block device representation use the delete_pmem_bdev command.

The uring bdev module issues I/O to kernel block devices using the io_uring Linux kernel API. This module requires liburing.
For more information on io_uring refer to the kernel [io_uring](https://kernel.dk/io_uring.pdf) document.
`rpc.py delete_pmem_bdev pmem`

The user needs to configure SPDK to include io_uring support:

`configure --with-uring`

Support for zoned devices is enabled by default in the uring bdev. It can be explicitly disabled as follows:

`configure --with-uring --without-uring-zns`

To create a uring bdev with the given filename, bdev name and block size use the `bdev_uring_create` RPC.

`rpc.py bdev_uring_create /path/to/device bdev_u0 512`

To remove a uring bdev use the `bdev_uring_delete` RPC.

`rpc.py bdev_uring_delete bdev_u0`

## xnvme {#bdev_ug_xnvme}

The xnvme bdev module issues I/O to the underlying NVMe devices through various I/O mechanisms
such as libaio, io_uring, asynchronous IOCTL using io_uring passthrough, POSIX aio, emulated aio etc.

This module requires the xNVMe library.
For more information on xNVMe refer to [xNVMe](https://xnvme.io/docs/latest)

The user needs to configure SPDK to include xNVMe support:

`configure --with-xnvme`

To create an xnvme bdev with the given filename, bdev name and I/O mechanism use the `bdev_xnvme_create` RPC.

`rpc.py bdev_xnvme_create /dev/ng0n1 bdev_ng0n1 io_uring_cmd`

To remove an xnvme bdev use the `bdev_xnvme_delete` RPC.

`rpc.py bdev_xnvme_delete bdev_ng0n1`

## Virtio Block {#bdev_config_virtio_blk}
# Virtio Block {#bdev_config_virtio_blk}

The Virtio-Block driver allows creating SPDK bdevs from Virtio-Block devices.

@ -625,61 +436,34 @@ The following command creates a Virtio-Block device named `VirtioBlk0` from a vh
socket `/tmp/vhost.0` exposed directly by SPDK @ref vhost. Optional `vq-count` and
`vq-size` params specify number of request queues and queue depth to be used.

`rpc.py bdev_virtio_attach_controller --dev-type blk --trtype user --traddr /tmp/vhost.0 --vq-count 2 --vq-size 512 VirtioBlk0`
`rpc.py construct_virtio_dev --dev-type blk --trtype user --traddr /tmp/vhost.0 --vq-count 2 --vq-size 512 VirtioBlk0`

The driver can be also used inside QEMU-based VMs. The following command creates a Virtio
Block device named `VirtioBlk1` from a Virtio PCI device at address `0000:01:00.0`.
The entire configuration will be read automatically from PCI Configuration Space. It will
reflect all parameters passed to QEMU's vhost-user-blk-pci device.

`rpc.py bdev_virtio_attach_controller --dev-type blk --trtype pci --traddr 0000:01:00.0 VirtioBlk1`
`rpc.py construct_virtio_dev --dev-type blk --trtype pci --traddr 0000:01:00.0 VirtioBlk1`

Virtio-Block devices can be removed with the following command

`rpc.py bdev_virtio_detach_controller VirtioBlk0`
`rpc.py remove_virtio_bdev VirtioBlk0`

## Virtio SCSI {#bdev_config_virtio_scsi}
# Virtio SCSI {#bdev_config_virtio_scsi}

The Virtio-SCSI driver allows creating SPDK block devices from Virtio-SCSI LUNs.

Virtio-SCSI bdevs are created the same way as Virtio-Block ones.
Virtio-SCSI bdevs are constructed the same way as Virtio-Block ones.

`rpc.py bdev_virtio_attach_controller --dev-type scsi --trtype user --traddr /tmp/vhost.0 --vq-count 2 --vq-size 512 VirtioScsi0`
`rpc.py construct_virtio_dev --dev-type scsi --trtype user --traddr /tmp/vhost.0 --vq-count 2 --vq-size 512 VirtioScsi0`

`rpc.py bdev_virtio_attach_controller --dev-type scsi --trtype pci --traddr 0000:01:00.0 VirtioScsi0`
`rpc.py construct_virtio_dev --dev-type scsi --trtype pci --traddr 0000:01:00.0 VirtioScsi0`

Each Virtio-SCSI device may export up to 64 block devices named VirtioScsi0t0 ~ VirtioScsi0t63,
one LUN (LUN0) per SCSI device. The above 2 commands will output names of all exposed bdevs.

Virtio-SCSI devices can be removed with the following command

`rpc.py bdev_virtio_detach_controller VirtioScsi0`
`rpc.py remove_virtio_bdev VirtioScsi0`

Removing a Virtio-SCSI device will destroy all its bdevs.

## DAOS bdev {#bdev_config_daos}

DAOS bdev creates an SPDK block device on top of DAOS DFS; the name of the bdev defines the file name in the DFS namespace.
Note that the DAOS container has to be of POSIX type, e.g.: `daos cont create --pool=test-pool --label=test-cont --type=POSIX`

To build SPDK with daos support, the daos-devel package has to be installed; please see the setup [guide](https://docs.daos.io/v2.0/).
To enable the module, configure SPDK using the `--with-daos` flag.

Running the `daos_agent` service on the target machine is required for the SPDK DAOS bdev communication with a DAOS cluster.

The implementation uses independent pool and container connections per device's channel for the best IO throughput, therefore
running a target application with multiple cores (`-m [0-7]`, for example) is highly advisable.

Example command for creating daos bdev:

`rpc.py bdev_daos_create daosdev0 test-pool test-cont 64 4096`
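In this example `64` is the bdev size in MiB and `4096` is the block size in bytes.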

Example command for removing daos bdev:

`rpc.py bdev_daos_delete daosdev0`

To resize a bdev use the bdev_daos_resize command.

`rpc.py bdev_daos_resize daosdev0 8192`

This command will resize the daosdev0 bdev to 8192 MiB.

@ -18,14 +18,14 @@ how to write a module.

## Creating A New Module

Block device modules are located in subdirectories under module/bdev today. It is not
Block device modules are located in subdirectories under lib/bdev today. It is not
currently possible to place the code for a bdev module elsewhere, but updates
to the build system could be made to enable this in the future. To create a
module, add a new directory with a single C file and a Makefile. A great
starting point is to copy the existing 'null' bdev module.

The primary interface that bdev modules will interact with is in
include/spdk/bdev_module.h. In that header a macro is defined that registers
include/spdk_internal/bdev.h. In that header a macro is defined that registers
a new bdev module - SPDK_BDEV_MODULE_REGISTER. This macro takes as an argument a
pointer to an spdk_bdev_module structure that is used to register the new bdev module.

@ -34,8 +34,8 @@ initialization (`module_init`) and teardown (`module_fini`) functions,
the function that returns context size (`get_ctx_size`) - scratch space that
will be allocated in each I/O request for use by this module, and a callback
that will be called each time a new bdev is registered by another module
(`examine_config` and `examine_disk`). Please check the documentation of
struct spdk_bdev_module for more details.
(`examine`). Please check the documentation of struct spdk_bdev_module for
more details.
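
As a rough sketch of how these pieces fit together, a minimal module might register
itself as below. The function names and the `my_bdev_io` context structure are
illustrative only, and the exact `SPDK_BDEV_MODULE_REGISTER` signature has varied
across SPDK versions, so treat this as an outline rather than copy-paste code:

~~~{.c}
#include "spdk/bdev_module.h"

/* Per-I/O scratch space; its size is reported through get_ctx_size(). */
struct my_bdev_io {
	int state;
};

static int
my_bdev_module_init(void)
{
	return 0; /* one-time, module-wide setup; 0 means success */
}

static void
my_bdev_module_fini(void)
{
	/* module-wide teardown during shutdown */
}

static int
my_bdev_get_ctx_size(void)
{
	return sizeof(struct my_bdev_io);
}

static struct spdk_bdev_module my_bdev_module = {
	.name = "my_bdev",
	.module_init = my_bdev_module_init,
	.module_fini = my_bdev_module_fini,
	.get_ctx_size = my_bdev_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(my_bdev, &my_bdev_module)
~~~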

## Creating Bdevs

@ -137,60 +137,18 @@ block device. Once the I/O request is completed, the module must call
spdk_bdev_io_complete(). The I/O does not have to finish within the calling
context of `submit_request`.

Integrating a new bdev module into the build system requires updates to various
files in the /mk directory.

## Creating Bdevs in an External Repository

A user can build their own bdev module and application on top of existing SPDK libraries. The example in
test/external_code serves as a template for creating, building and linking an external
bdev module. Refer to test/external_code/README.md and @ref so_linking for further information.

## Creating Virtual Bdevs

Block devices are considered virtual if they handle I/O requests by routing
the I/O to other block devices. The canonical example would be a bdev module
that implements RAID. Virtual bdevs are created in the same way as regular
bdevs, but take the one additional step of claiming the bdev.

The module can open the underlying bdevs it wishes to route I/O to using
spdk_bdev_open_ext(), where the string name is provided by the user via an RPC.
To ensure that other consumers do not modify the underlying bdev in an unexpected
way, the virtual bdev should take a claim on the underlying bdev before
reading from or writing to the underlying bdev.

There are two slightly different APIs for taking and releasing claims. The
preferred interface uses `spdk_bdev_module_claim_bdev_desc()`. This method allows
claims that ensure there is a single writer with
`SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE`, cooperating shared writers with
`SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED`, and shared readers that prevent any
writers with `SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE`. In all cases,
`spdk_bdev_open_ext()` may be used to open the underlying bdev read-only. If a
read-only bdev descriptor successfully claims a bdev with
`SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE` or `SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED`
the bdev descriptor is promoted to read-write.
Any claim that is obtained with `spdk_bdev_module_claim_bdev_desc()` is
automatically released upon closing the bdev descriptor used to obtain the
claim. Shared claims continue to block new incompatible claims and new writers
until the last claim is released.
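
As a sketch of the preferred flow (assuming the interfaces declared in
include/spdk/bdev_module.h; `my_vbdev_event_cb` and `my_bdev_module` are
placeholders for the module's own event callback and registered module object):

~~~{.c}
#include "spdk/bdev_module.h"

extern struct spdk_bdev_module my_bdev_module; /* registered elsewhere */

static void
my_vbdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
{
	/* react to SPDK_BDEV_EVENT_REMOVE and other asynchronous events here */
}

/* Open the named base bdev read-only, then take a single-writer claim on it.
 * On success, the claim promotes the descriptor to read-write. */
static int
my_vbdev_claim_base(const char *base_name, struct spdk_bdev_desc **desc)
{
	int rc;

	rc = spdk_bdev_open_ext(base_name, false, my_vbdev_event_cb, NULL, desc);
	if (rc != 0) {
		return rc;
	}

	rc = spdk_bdev_module_claim_bdev_desc(*desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE,
					      NULL, &my_bdev_module);
	if (rc != 0) {
		spdk_bdev_close(*desc);
		*desc = NULL;
	}
	return rc;
}
~~~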

The non-preferred interface for obtaining a claim allows the caller to obtain
an exclusive writer claim with `spdk_bdev_module_claim_bdev()`. It may
be released with `spdk_bdev_module_release_bdev()`. If a read-only bdev
descriptor is passed, it is promoted to read-write. NULL may be passed instead
of a bdev descriptor to avoid promotion and to block new writers. New code
should use `spdk_bdev_module_claim_bdev_desc()` with the claim type that is
tailored to the virtual bdev's needs.

The descriptor obtained from the successful spdk_bdev_open_ext() may be used
with spdk_bdev_get_io_channel() to obtain I/O channels for the bdev. This is
likely done in response to the virtual bdev's `get_io_channel` callback.
Channels may be obtained before and/or after claiming the underlying bdev, but
beware there may be other unknown writers until the underlying bdev has been
claimed.

When a virtual bdev module claims an underlying bdev from its `examine_config`
callback, it causes the `examine_disk` callback to only be called for this
module and any others that establish a shared claim. If no claims are taken by
`examine_config` callbacks, all virtual bdevs' `examine_disk` callbacks are
called.
bdevs, but take one additional step. The module can look up the underlying
bdevs it wishes to route I/O to using spdk_bdev_get_by_name(), where the string
name is provided by the user in a configuration file or via an RPC. The module
then may proceed as normal by opening the bdev to obtain a descriptor, and
creating I/O channels for the bdev (probably in response to the
`get_io_channel` callback). The final step is to have the module use its open
descriptor to call spdk_bdev_module_claim_bdev(), indicating that it is
consuming the underlying bdev. This prevents other users from opening
descriptors with write permissions. This effectively 'promotes' the descriptor
to write-exclusive and is an operation only available to bdev modules.

@ -51,7 +51,7 @@ The bdev layer depends on the generic message passing infrastructure
abstracted by the header file include/spdk/thread.h. See @ref concurrency for a
full description. Most importantly, calls into the bdev library may only be
made from threads that have been allocated with SPDK by calling
spdk_thread_create().
spdk_allocate_thread().

From an allocated thread, the bdev library may be initialized by calling
spdk_bdev_initialize(), which is an asynchronous operation. Until the completion
@ -63,7 +63,7 @@ to tear down the bdev library, call spdk_bdev_finish().
All block devices have a simple string name. At any time, a pointer to the
device object can be obtained by calling spdk_bdev_get_by_name(), or the entire
set of bdevs may be iterated using spdk_bdev_first() and spdk_bdev_next() and
their variants or spdk_for_each_bdev() and its variant.
their variants.

Some block devices may also be given aliases, which are also string names.
Aliases behave like symlinks - they can be used interchangeably with the real
@ -72,7 +72,7 @@ name to look up the block device.
## Preparing To Use A Block Device

In order to send I/O requests to a block device, it must first be opened by
calling spdk_bdev_open_ext(). This will return a descriptor. Multiple users may have
calling spdk_bdev_open(). This will return a descriptor. Multiple users may have
a bdev open at the same time, and coordination of reads and writes between
users must be handled by some higher level mechanism outside of the bdev
layer. Opening a bdev with write permission may fail if a virtual bdev module
@ -81,14 +81,13 @@ logical volume management and forward their I/O to lower level bdevs, so they
mark these lower level bdevs as claimed to prevent outside users from issuing
writes.

When a block device is opened, a callback and context must be provided that
will be called with the appropriate spdk_bdev_event_type enum as an argument when
the bdev triggers an asynchronous event such as bdev removal. For example,
the callback will be called on each open descriptor for a bdev backed by
a physical NVMe SSD when the NVMe SSD is hot-unplugged. In this case
the callback can be thought of as a request to close the open descriptor so
other memory may be freed. A bdev cannot be torn down while open descriptors
exist, so it is required that a callback is provided.
When a block device is opened, an optional callback and context can be
provided that will be called if the underlying storage servicing the block
device is removed. For example, the remove callback will be called on each
open descriptor for a bdev backed by a physical NVMe SSD when the NVMe SSD is
hot-unplugged. The callback can be thought of as a request to close the open
descriptor so other memory may be freed. A bdev cannot be torn down while open
descriptors exist, so it is highly recommended that a callback is provided.

When a user is done with a descriptor, they may release it by calling
spdk_bdev_close().
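
A minimal sketch of the open/event/close flow (assuming the `spdk_bdev_open_ext()`
interface; the bdev name "Malloc0" and the callback are illustrative):

~~~{.c}
#include "spdk/bdev.h"

static struct spdk_bdev_desc *g_desc;

/* Called for asynchronous bdev events, e.g. hot-removal of the backing device. */
static void
my_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
{
	if (type == SPDK_BDEV_EVENT_REMOVE && g_desc != NULL) {
		/* Close the descriptor so the bdev can finish tearing down. */
		spdk_bdev_close(g_desc);
		g_desc = NULL;
	}
}

static int
open_bdev_example(void)
{
	/* Open "Malloc0" read-write; returns 0 on success. */
	return spdk_bdev_open_ext("Malloc0", true, my_bdev_event_cb, NULL, &g_desc);
}
~~~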

@ -1,87 +0,0 @@
# bdevperf {#bdevperf}

## Introduction

bdevperf is an SPDK application used for performance testing
of block devices (bdevs) exposed by the SPDK bdev layer. It is an
alternative to the SPDK bdev fio plugin for benchmarking SPDK bdevs.
In some cases, bdevperf can provide lower overhead than the fio
plugin, resulting in better performance and efficiency for tests
using a limited number of CPU cores.

bdevperf exposes a command line interface that allows specifying
SPDK framework options as well as testing options.
bdevperf also supports a configuration file format similar
to FIO. It allows the user to create jobs parameterized by
filename, cpumask, blocksize, queuesize, etc.

## Config file

bdevperf's config file format is similar to FIO.

Below is an example config file that uses all available parameters:

~~~{.ini}
[global]
filename=Malloc0:Malloc1
bs=1024
iosize=256
rw=randrw
rwmixread=90

[A]
cpumask=0xff

[B]
cpumask=[0-128]
filename=Malloc1

[global]
filename=Malloc0
rw=write

[C]
bs=4096
iosize=128
offset=1000000
length=1000000
~~~

Jobs `[A]`, `[B]` or `[C]` inherit default values from the `[global]`
section residing above them. So in the example, job `[A]` inherits the
`filename` value and uses both `Malloc0` and `Malloc1` bdevs as targets,
job `[B]` overrides its `filename` value and uses `Malloc1` and
job `[C]` inherits the value `Malloc0` for its `filename`.

Interaction with CLI arguments is not the same as in FIO however.
If bdevperf receives a CLI argument, it overrides the values
of the corresponding parameter for all `[global]` sections of the config file.
So if the example config is used, specifying the `-q` argument
will make jobs `[A]` and `[B]` use its value.
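
For example, invoking bdevperf with `-q 64` would run jobs `[A]` and `[B]` with a
queue depth of 64, while job `[C]` keeps its explicitly set `iosize=128`.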

Below is a full list of supported parameters with descriptions.

Param     | Default           | Description
--------- | ----------------- | -----------
filename  |                   | Bdevs to use, separated by ":"
cpumask   | Maximum available | CPU mask. Format is defined at @ref cpu_mask
bs        |                   | Block size (io size)
iodepth   |                   | Queue depth
rwmixread | `50`              | Percentage of a mixed workload that should be reads
offset    | `0`               | Start I/O at the provided offset on the bdev
length    | 100% of bdev size | End I/O at `offset`+`length` on the bdev
rw        |                   | Type of I/O pattern

Available rw types:

- read
- randread
- write
- randwrite
- verify
- reset
- unmap
- write_zeroes
- flush
- rw
- randrw

doc/blob.md (375 lines changed)

@ -1,6 +1,6 @@
# Blobstore Programmer's Guide {#blob}

## In this document {#blob_pg_toc}
# In this document {#blob_pg_toc}

* @ref blob_pg_audience
* @ref blob_pg_intro
@ -35,39 +35,79 @@ NAND too.

## Theory of Operation {#blob_pg_theory}

### Abstractions
### Abstractions:

The Blobstore defines a hierarchy of storage abstractions as follows.

* **Logical Block**: Logical blocks are exposed by the disk itself, which are numbered from 0 to N, where N is the
  number of blocks in the disk. A logical block is typically either 512B or 4KiB.
* **Page**: A page is defined to be a fixed number of logical blocks defined at Blobstore creation time. The logical
  blocks that compose a page are always contiguous. Pages are also numbered from the beginning of the disk such
  that the first page worth of blocks is page 0, the second page is page 1, etc. A page is typically 4KiB in size,
  so this is either 8 or 1 logical blocks in practice. The SSD must be able to perform atomic reads and writes of
  at least the page size.
* **Cluster**: A cluster is a fixed number of pages defined at Blobstore creation time. The pages that compose a cluster
  are always contiguous. Clusters are also numbered from the beginning of the disk, where cluster 0 is the first cluster
  worth of pages, cluster 1 is the second grouping of pages, etc. A cluster is typically 1MiB in size, or 256 pages.
* **Blob**: A blob is an ordered list of clusters. Blobs are manipulated (created, sized, deleted, etc.) by the application
  and persist across power failures and reboots. Applications use a Blobstore provided identifier to access a particular blob.
  Blobs are read and written in units of pages by specifying an offset from the start of the blob. Applications can also
  store metadata in the form of key/value pairs with each blob which we'll refer to as xattrs (extended attributes).
* **Blobstore**: An SSD which has been initialized by a Blobstore-based application is referred to as "a Blobstore." A
  Blobstore owns the entire underlying device which is made up of a private Blobstore metadata region and the collection of
  blobs as managed by the application.

```text
+-----------------------------------------------------------------+
| Blob                                                            |
| +-----------------------------+ +-----------------------------+ |
| | Cluster                     | | Cluster                     | |
| | +----+ +----+ +----+ +----+ | | +----+ +----+ +----+ +----+ | |
| | |Page| |Page| |Page| |Page| | | |Page| |Page| |Page| |Page| | |
| | +----+ +----+ +----+ +----+ | | +----+ +----+ +----+ +----+ | |
| +-----------------------------+ +-----------------------------+ |
+-----------------------------------------------------------------+
```

@htmlonly

<div id="blob_hierarchy"></div>

<script>
let elem = document.getElementById('blob_hierarchy');

let canvasWidth = 800;
let canvasHeight = 200;
var two = new Two({ width: 800, height: 200 }).appendTo(elem);

var blobRect = two.makeRectangle(canvasWidth / 2, canvasHeight / 2, canvasWidth, canvasWidth);
blobRect.fill = '#7ED3F7';

var blobText = two.makeText('Blob', canvasWidth / 2, 10, { alignment: 'center'});

for (var i = 0; i < 2; i++) {
    let clusterWidth = 400;
    let clusterHeight = canvasHeight;
    var clusterRect = two.makeRectangle((clusterWidth / 2) + (i * clusterWidth),
                                        clusterHeight / 2,
                                        clusterWidth - 10,
                                        clusterHeight - 50);
    clusterRect.fill = '#00AEEF';

    var clusterText = two.makeText('Cluster',
                                   (clusterWidth / 2) + (i * clusterWidth),
                                   35,
                                   { alignment: 'center', fill: 'white' });

    for (var j = 0; j < 4; j++) {
        let pageWidth = 100;
        let pageHeight = canvasHeight;
        var pageRect = two.makeRectangle((pageWidth / 2) + (j * pageWidth) + (i * clusterWidth),
                                         pageHeight / 2,
                                         pageWidth - 20,
                                         pageHeight - 100);
        pageRect.fill = '#003C71';

        var pageText = two.makeText('Page',
                                    (pageWidth / 2) + (j * pageWidth) + (i * clusterWidth),
                                    pageHeight / 2,
                                    { alignment: 'center', fill: 'white' });
    }
}

two.update();
</script>

@endhtmlonly

### Atomicity

@ -75,19 +115,19 @@ For all Blobstore operations regarding atomicity, there is a dependency on the u
operations of at least one page size. Atomicity here can refer to multiple operations:

* **Data Writes**: For the case of data writes, the unit of atomicity is one page. Therefore if a write operation of
  greater than one page is underway and the system suffers a power failure, the data on media will be consistent at a page
  size granularity (if a single page were in the middle of being updated when power was lost, the data at that page location
  will be as it was prior to the start of the write operation following power restoration.)
* **Blob Metadata Updates**: Each blob has its own set of metadata (xattrs, size, etc). For performance reasons, a copy of
  this metadata is kept in RAM and only synchronized with the on-disk version when the application makes an explicit call to
  do so, or when the Blobstore is unloaded. Therefore, setting of an xattr, for example is not consistent until the call to
  synchronize it (covered later) which is, however, performed atomically.
* **Blobstore Metadata Updates**: Blobstore itself has its own metadata which, like per blob metadata, has a copy in both
  RAM and on-disk. Unlike the per blob metadata, however, the Blobstore metadata region is not made consistent via a blob
  synchronization call, it is only synchronized when the Blobstore is properly unloaded via API. Therefore, if the Blobstore
  metadata is updated (blob creation, deletion, resize, etc.) and not unloaded properly, it will need to perform some extra
  steps the next time it is loaded which will take a bit more time than it would have if shutdown cleanly, but there will be
  no inconsistencies.

### Callbacks

@ -129,11 +169,6 @@ Channels are an SPDK-wide abstraction and with Blobstore the best way to think a
required in order to do IO. The application will perform IO to the channel and channels are best thought of as being
associated 1:1 with a thread.

With external snapshots (see @ref blob_pg_esnap_and_esnap_clone), a read from a blob may lead to
reading from the device containing the blobstore or an external snapshot device. To support this,
each blobstore IO channel maintains a tree of channels to be used when reading from external
snapshot devices.

### Blob Identifiers

When an application creates a blob, it does not provide a name as is the case with many other similar
@ -148,25 +183,22 @@ When the Blobstore is initialized, there are multiple configuration options to c
options and their defaults are:

* **Cluster Size**: By default, this value is 1MB. The cluster size is required to be a multiple of page size and should be
  selected based on the application’s usage model in terms of allocation. Recall that blobs are made up of clusters so when
  a blob is allocated/deallocated or changes in size, disk LBAs will be manipulated in groups of cluster size. If the
  application is expecting to deal with mainly very large (always multiple GB) blobs then it may make sense to change the
  cluster size to 1GB for example.
* **Number of Metadata Pages**: By default, Blobstore will assume there can be as many clusters as there are metadata pages
  which is the worst case scenario in terms of metadata usage and can be overridden here however the space efficiency is
  not significant.
* **Maximum Simultaneous Metadata Operations**: Determines how many internally pre-allocated memory structures are set
  aside for performing metadata operations. It is unlikely that changes to this value (default 32) would be desirable.
* **Maximum Simultaneous Operations Per Channel**: Determines how many internally pre-allocated memory structures are set
  aside for channel operations. Changes to this value would be application dependent and best determined by both a knowledge
  of the typical usage model, an understanding of the types of SSDs being used and empirical data. The default is 512.
* **Blobstore Type**: This field is a character array to be used by applications that need to identify whether the
  Blobstore found here is appropriate to claim or not. The default is NULL and unless the application is being deployed in
  an environment where multiple applications using the same disks are at risk of inadvertently using the wrong Blobstore, there
  is no need to set this value. It can, however, be set to any valid set of characters.
* **External Snapshot Device Creation Callback**: If the blobstore supports external snapshots this function will be called
  as a blob that clones an external snapshot (an "esnap clone") is opened so that the blobstore consumer can load the external
  snapshot and register a blobstore device that will satisfy read requests. See @ref blob_pg_esnap_and_esnap_clone.

### Sub-page Sized Operations

@ -178,11 +210,10 @@ requires finer granularity it will have to accommodate that itself.
As mentioned earlier, Blobstore can share a single thread with an application or the application
can define any number of threads, within resource constraints, that makes sense. The basic considerations that must be
followed are:

* Metadata operations (API with MD in the name) should be isolated from each other as there is no internal locking on the
  memory structures affected by these API.
* Metadata operations should be isolated from conflicting IO operations (an example of a conflicting IO would be one that is
  reading/writing to an area of a blob that a metadata operation is deallocating).
* Asynchronous callbacks will always take place on the calling thread.
* No assumptions about IO ordering can be made regardless of how many or which threads were involved in the issuing.

@ -194,7 +225,7 @@ with SPDK API.

### Error Handling

Asynchronous Blobstore callbacks all include an error number that should be checked; non-zero values
indicate an error. Synchronous calls will typically return an error value if applicable.
indicate and error. Synchronous calls will typically return an error value if applicable.

### Asynchronous API

@ -236,18 +267,21 @@ relevant in understanding any kind of structure for what is on the Blobstore.

There are multiple examples of Blobstore usage in the [repo](https://github.com/spdk/spdk):

* **Hello World**: Actually named `hello_blob.c` this is a very basic example of a single threaded application that
  does nothing more than demonstrate the very basic API. Although Blobstore is optimized for NVMe, this example uses
  a RAM disk (malloc) back-end so that it can be executed easily in any development environment. The malloc back-end
  is a `bdev` module thus this example uses not only the SPDK Framework but the `bdev` layer as well.
  is a `bdev` module thus this example uses not on the SPDK Framework but the `bdev` layer as well.

* **Hello NVME Blob**: `hello_nvme_blob.c` is the non-bdev version of `hello_blob.c` and simply shows how an
  application can directly integrate Blobstore with the SPDK NVMe driver without using the `bdev` layer at all.

* **CLI**: The `blobcli.c` example is a command line utility intended to not only serve as example code but as a test
  and development tool for Blobstore itself. It is also a simple single threaded application that relies on both the
  SPDK Framework and the `bdev` layer but offers multiple modes of operation to accomplish some real-world tasks. In
  command mode, it accepts single-shot commands which can be a little time consuming if there are many commands to
  get through as each one will take a few seconds waiting for DPDK initialization. It therefore has a shell mode that
  allows the developer to get to a `blob>` prompt and then very quickly interact with Blobstore with simple commands
  that include the ability to import/export blobs from/to regular files. Lastly there is a scripting mode to automate
  a series of tasks, again, handy for development and/or test type activities.

## Configuration {#blob_pg_config}

@ -264,23 +298,19 @@ contribute to the Blobstore effort itself.
The Blobstore owns the entire storage device. The device is divided into clusters starting from the beginning, such
that cluster 0 begins at the first logical block.

```text
LBA 0                                   LBA N
+-----------+-----------+-----+-----------+
| Cluster 0 | Cluster 1 | ... | Cluster N |
+-----------+-----------+-----+-----------+
```

Cluster 0 is special and has the following format, where page 0 is the first page of the cluster:

```text
+--------+-------------------+
| Page 0 | Page 1 ... Page N |
+--------+-------------------+
| Super  | Metadata Region   |
| Block  |                   |
+--------+-------------------+
```

The super block is a single page located at the beginning of the partition. It contains basic information about
the Blobstore. The metadata region is the remainder of cluster 0 and may extend to additional clusters. Refer
@ -291,171 +321,6 @@ form a linked list. The first page in the list will be written in place on updat
be written to fresh locations. This requires the backing device to support an atomic write size greater than
or equal to the page size to guarantee that the operation is atomic. See the section on atomicity for details.

### Blob cluster layout {#blob_pg_cluster_layout}

Each blob is an ordered list of clusters, where the starting LBA of a cluster is called an extent. A blob can be
thin provisioned, resulting in no extent for some of the clusters. When the first write operation occurs
to an unallocated cluster, a new extent is chosen. This information is stored in RAM and on-disk.

There are two extent representations on-disk, dependent on the `use_extent_table` (default: true) option used
when creating a blob.

* **use_extent_table=true**: The EXTENT_PAGE descriptor is not part of the linked list of pages. It contains extents
  that are not run-length encoded. Each extent page is referenced by an EXTENT_TABLE descriptor, which is serialized
  as part of the linked list of pages. The extent table run-length encodes all unallocated extent pages.
  Every new cluster allocation updates a single extent page, in case the extent page was previously allocated.
  Otherwise it additionally incurs serializing the whole linked list of pages for the blob.

* **use_extent_table=false**: The EXTENT_RLE descriptor is serialized as part of the linked list of pages.
  Extents pointing to contiguous LBAs are run-length encoded, including unallocated extents represented by 0.
  Every new cluster allocation incurs serializing the whole linked list of pages for the blob.

### Thin Blobs, Snapshots, and Clones

Each in-use cluster is allocated to blobstore metadata or to a particular blob. Once a cluster is
allocated to a blob it is considered owned by that blob and that particular blob's metadata
maintains a reference to the cluster as a record of ownership. Cluster ownership is transferred
during snapshot operations described later in @ref blob_pg_snapshots.

Through the use of thin provisioning, snapshots, and/or clones, a blob may be backed by clusters it
owns, clusters owned by another blob, or by a zeroes device. The behavior of reads and writes depends
on whether the operation targets blocks that are backed by a cluster owned by the blob or not.

* **read from blocks on an owned cluster**: The read is serviced by reading directly from the
  appropriate cluster.
* **read from other blocks**: The read is passed on to the blob's *back device* and the back
  device services the read. The back device may be another blob or it may be a zeroes device.
* **write to blocks on an owned cluster**: The write is serviced by writing directly to the
  appropriate cluster.
* **write to thin provisioned cluster**: If the back device is the zeroes device and no cluster
  is allocated to the blob the process described in @ref blob_pg_thin_provisioning is followed.
* **write to other blocks**: A copy-on-write operation is triggered. See @ref blob_pg_copy_on_write
  for details.

External snapshots allow some external data source to act as a snapshot. This allows clones to be
created of data that resides outside of the blobstore containing the clone.

#### Thin Provisioning {#blob_pg_thin_provisioning}

As mentioned in @ref blob_pg_cluster_layout, a blob may be thin provisioned. A thin provisioned blob
starts out with no allocated clusters. Clusters are allocated as writes occur. A thin provisioned
blob's back device is a *zeroes device*. A read from a zeroes device fills the read buffer with
zeroes.

When a thin provisioned volume writes to a block that does not have an allocated cluster, the
following steps are performed:

1. Allocate a cluster.
2. Update blob metadata.
3. Perform the write.

#### Snapshots and Clones {#blob_pg_snapshots}

A snapshot is a read-only blob that may have clones. A snapshot may itself be a clone of one other
blob. While the interface gives the illusion of being able to create many snapshots of a blob, under
the covers this results in a chain of snapshots that are clones of the previous snapshot.

When blob1 is snapshotted, a new read-only blob is created and blob1 becomes a clone of this new
blob. That is:

| Step | Action                         | State                                             |
| ---- | ------------------------------ | ------------------------------------------------- |
| 1    | Create blob1                   | `blob1 (rw)`                                      |
| 2    | Create snapshot blob2 of blob1 | `blob1 (rw) --> blob2 (ro)`                       |
| 2a   | Write to blob1                 | `blob1 (rw) --> blob2 (ro)`                       |
| 3    | Create snapshot blob3 of blob1 | `blob1 (rw) --> blob3 (ro) ---> blob2 (ro)`       |

Supposing blob1 was not thin provisioned, step 1 would have allocated clusters needed to perform a
full write of blob1. As blob2 is created in step 2, the ownership of all of blob1's clusters is
transferred to blob2 and blob2 becomes blob1's back device. During step 2a, the writes to blob1 cause
one or more clusters to be allocated to blob1. When blob3 is created in step 3, the clusters
allocated in step 2a are given to blob3, blob3's back device becomes blob2, and blob1's back device
becomes blob3.

It is important to understand the chain above when considering strategies to use a golden image from
which many clones are made. The IO path is more efficient if one snapshot is cloned many times than
it is to create a new snapshot for every clone. The following illustrates the difference.

Using a single snapshot means the data originally referenced by the golden image is always one hop
away.

```text
create golden                   golden --> golden-snap
snapshot golden as golden-snap            ^ ^ ^
clone golden-snap as clone1     clone1 ---+ | |
clone golden-snap as clone2     clone2 -----+ |
clone golden-snap as clone3     clone3 -------+
```

Using a snapshot per clone means that the chain of back devices grows with every new snapshot and
clone pair. Reading a block from clone3 may result in a read from clone3's back device (snap3), from
clone2's back device (snap2), then finally clone1's back device (snap1, the current owner of the
blocks originally allocated to golden).

```text
create golden
snapshot golden as snap1    golden --> snap3 -----> snap2 ----> snap1
clone snap1 as clone1       clone3 ----/   clone2 --/  clone1 --/
snapshot golden as snap2
clone snap2 as clone2
snapshot golden as snap3
clone snap3 as clone3
```

A snapshot with no more than one clone can be deleted. When a snapshot with one clone is deleted,
the clone becomes a regular blob. The clusters owned by the snapshot are transferred to the clone or
freed, depending on whether the clone already owns a cluster for a particular block range.

Removal of the last clone leaves the snapshot in place. This snapshot continues to be read-only and
can serve as the snapshot for future clones.

#### Inflating and Decoupling Clones

A clone can remove its dependence on a snapshot with the following operations:

1. Inflate the clone. Clusters backed by any snapshot or a zeroes device are copied into newly
   allocated clusters. The blob becomes a thick provisioned blob.
2. Decouple the clone. Clusters backed by the first back device snapshot are copied into newly
   allocated clusters. If the clone's back device snapshot was itself a clone of another
   snapshot, the clone remains a clone but is now a clone of a different snapshot.
3. Remove the snapshot. This is only possible if the snapshot has one clone. The end result is
   usually the same as decoupling but ownership of clusters is transferred from the snapshot rather
   than being copied. If the snapshot that was deleted was itself a clone of another snapshot, the
   clone remains a clone, but is now a clone of a different snapshot.

#### External Snapshots and Esnap Clones {#blob_pg_esnap_and_esnap_clone}

A blobstore that is loaded with the `esnap_bs_dev_create` callback defined will support external
snapshots (esnaps). An external snapshot is not useful on its own: it needs to be cloned by a blob.
A clone of an external snapshot is referred to as an *esnap clone*. An esnap clone supports IO and
other operations just like any other clone.

An esnap clone can be recognized in various ways:

* **On disk**: the blob metadata has the `SPDK_BLOB_EXTERNAL_SNAPSHOT` (0x8) bit set in
  `invalid_flags` and an internal XATTR with name `BLOB_EXTERNAL_SNAPSHOT_ID` ("EXTSNAP") exists.
* **In memory**: The `spdk_blob` structure contains the metadata read from disk, `blob->parent_id`
  is set to `SPDK_BLOBID_EXTERNAL_SNAPSHOT`, and `blob->back_bs_dev` references a blobstore device
  which is not a blob in the same blobstore nor a zeroes device.

#### Copy-on-write {#blob_pg_copy_on_write}

A copy-on-write operation is somewhat expensive, with the cost being proportional to the cluster
size. A typical copy-on-write involves the following steps:

1. Allocate a cluster.
2. Allocate a cluster-sized buffer into which data can be read.
3. Trigger a full-cluster read from the back device into the cluster-sized buffer.
4. Write from the cluster-sized buffer into the newly allocated cluster.
5. Update the blob's on-disk metadata to record ownership of the newly allocated cluster. This
   involves at least one page-sized write.
6. Write the new data to the just allocated and copied cluster.

If the source cluster is backed by a zeroes device, steps 2 through 4 are skipped. Alternatively, if
the blobstore resides on a device that can perform the copy on its own, steps 2 through 4 are
offloaded to the device. Neither of these optimizations is available when the back device is an
external snapshot.

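To make the sequence concrete, here is a self-contained toy model of the same flow, using heap
buffers in place of disk clusters. All names are hypothetical; this is a sketch of the idea above,
not blobstore's internal code:

~~~{.c}
#include <stdlib.h>
#include <string.h>

#define CLUSTER_SZ 4096
#define NUM_CLUSTERS 16

/* Toy blob: each cluster is either owned (non-NULL) or still backed by a
 * read-only snapshot cluster; a NULL snap pointer models a zeroes device. */
struct toy_blob {
	unsigned char *owned[NUM_CLUSTERS];
	const unsigned char *snap[NUM_CLUSTERS];
};

/* Write into one cluster, copying from the back device first if needed. */
static int
toy_cow_write(struct toy_blob *b, int idx, const void *data, size_t len, size_t off)
{
	if (off + len > CLUSTER_SZ) {
		return -1;
	}
	if (b->owned[idx] == NULL) {
		unsigned char *c = malloc(CLUSTER_SZ);        /* steps 1-2 */
		if (c == NULL) {
			return -1;
		}
		if (b->snap[idx] != NULL) {
			memcpy(c, b->snap[idx], CLUSTER_SZ);  /* steps 3-4 */
		} else {
			memset(c, 0, CLUSTER_SZ);  /* zeroes device: copy skipped */
		}
		b->owned[idx] = c;        /* step 5: record ownership */
	}
	memcpy(b->owned[idx] + off, data, len);               /* step 6 */
	return 0;
}
~~~
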
### Sequences and Batches

Internally Blobstore uses the concepts of sequences and batches to submit IO to the underlying device in either
a serial fashion or in parallel, respectively. Both are defined using the following structure:

~~~{.sh}
struct spdk_bs_request_set;
~~~

These request sets are basically bookkeeping mechanisms to help Blobstore efficiently deal with related groups
of IO. They are an internal construct only and are pre-allocated on a per channel basis (channels were discussed
earlier). They are removed from a channel's associated linked list when the set (sequence or batch) is started and
then returned to the list when completed.

Each request set maintains a reference to a `channel` and a `back_channel`. The `channel` is used
for performing IO on the blobstore device. The `back_channel` is used for performing IO on the
blob's back device, `blob->back_bs_dev`. For blobs that are not esnap clones, `channel` and
`back_channel` reference an IO channel used with the device that contains the blobstore. For blobs
that are esnap clones, `channel` is the same as with any other blob and `back_channel` is an IO
channel for the external snapshot device.

### Key Internal Structures

`blobstore.h` contains many of the key structures for the internal workings of Blobstore. Only a few notable ones
are reviewed here. Note that `blobstore.h` is an internal header; the header that defines
the public API is `blob.h`.

~~~{.sh}
struct spdk_blob
~~~

This is an in-memory data structure that contains key elements like the blob identifier, its current state and two
copies of the mutable metadata for the blob; one copy is the current metadata and the other is the last copy written
to disk.

And for the most part the following conventions are followed throughout:

* functions beginning with an underscore are called internally only
* functions or variables with the letters `cpl` are related to set or callback completions

# BlobFS (Blobstore Filesystem) {#blobfs}

## BlobFS Getting Started Guide {#blobfs_getting_started}

## RocksDB Integration {#blobfs_rocksdb}

Clone and build the SPDK repository as per https://github.com/spdk/spdk

~~~{.sh}
git clone https://github.com/spdk/spdk.git
cd spdk
./configure
make
~~~

Clone the RocksDB repository from the SPDK GitHub fork into a separate directory.
Make sure you check out the `6.15.fb` branch.

~~~{.sh}
cd ..
git clone -b 6.15.fb https://github.com/spdk/rocksdb.git
~~~

Build RocksDB. Only the `db_bench` benchmarking tool is integrated with BlobFS.

~~~{.sh}
cd rocksdb
make db_bench SPDK_DIR=relative_path/to/spdk
~~~

Or you can also add `DEBUG_LEVEL=0` for a release build (you will need to turn on `USE_RTTI`).

~~~{.sh}
export USE_RTTI=1 && make db_bench DEBUG_LEVEL=0 SPDK_DIR=relative_path/to/spdk
~~~

Create an NVMe section in the configuration file using SPDK's `gen_nvme.sh` script.

~~~{.sh}
scripts/gen_nvme.sh --json-with-subsystems > /usr/local/etc/spdk/rocksdb.json
~~~

Verify the configuration file has specified the correct NVMe SSD.

Make sure you have at least 5GB of memory allocated for huge pages:

~~~{.sh}
HUGEMEM=5120 scripts/setup.sh
~~~

Create an empty SPDK blobfs for testing.

~~~{.sh}
test/blobfs/mkfs/mkfs /usr/local/etc/spdk/rocksdb.json Nvme0n1
~~~

At this point, RocksDB is ready for testing with SPDK. Three `db_bench` parameters are used to configure SPDK:

1. `--spdk` - path to the SPDK configuration file
2. `--spdk_bdev` - name of the block device to use
3. `--spdk_cache_size` - size of the BlobFS cache in MB.
   Default is 4096 (4GB). (Optional)

SPDK has a set of scripts which will run `db_bench` against a variety of workloads and capture performance and profiling
data. The primary script is `test/blobfs/rocksdb/rocksdb.sh`.

## FUSE

BlobFS provides a FUSE plug-in to mount an SPDK BlobFS as a kernel filesystem for inspection or debug purposes.
The FUSE plug-in requires fuse3 and will be built automatically when fuse3 is detected on the system.

~~~{.sh}
test/blobfs/fuse/fuse /usr/local/etc/spdk/rocksdb.json Nvme0n1 /mnt/fuse
~~~

Note that the FUSE plug-in has some limitations - see the list below.

## Limitations

* BlobFS has primarily been tested with RocksDB so far, so any use cases different from how RocksDB uses a filesystem
  may run into issues. BlobFS will be tested in a broader range of use cases after this initial release.

# CI Tools {#ci_tools}

This section describes the tools used by CI to verify the integrity of submitted
patches ([status](https://ci.spdk.io)).

- @subpage shfmt
- @subpage distributions

  the 20 4KB IO units in the backing storage.
* A "chunk map" will be 32 bytes in size. This corresponds to 4 backing IO units per chunk
  (16KB / 4KB), and 8B (64b) per backing IO unit index.
* 5 chunk maps will be allocated in 160B of persistent memory. This corresponds to 4 chunk maps
  for the 4 chunks in the compressed block device (64KB / 16KB), plus an extra chunk map for use
  when overwriting an existing chunk.
* "Free chunk map list" will consist of indices 0 through 4 (inclusive). These represent the
  5 chunk maps that are not currently in use.

In these examples, the value "X" will represent the special value (2^64-1) described above.

### Initial Creation

```text
                 +--------------------+
  Backing Device |                    |
                 +--------------------+

                 +---+---+---+---+
  Logical Map    | X | X | X | X |
                 +---+---+---+---+
```

### Write 16KB at Offset 32KB

* Compress the 16KB of data and write it to free backing IO units 0 and 1. Write (0, 1, X, X) to
  a free chunk map (0), since after compression only two backing IO units are needed to
  store the 16KB of data.
* Write the chunk map index to entry 2 in the logical map.

```text
                 +--------------------+
  Backing Device |01                  |
                 +--------------------+

                 +---+---+---+---+
  Logical Map    | X | X | 0 | X |
                 +---+---+---+---+
```

### Write 4KB at Offset 8KB

* Allocate a free chunk map (1) and a free backing IO unit (2), and write the 4KB of data to
  backing IO unit 2.
* Write (2, X, X, X) to the chunk map.
* Write the chunk map index to entry 0 in the logical map.

```text
                 +--------------------+
  Backing Device |012                 |
                 +--------------------+

                 +---+---+---+---+
  Logical Map    | 1 | X | 0 | X |
                 +---+---+---+---+
```

### Read 16KB at Offset 16KB

* Free chunk map 1 back to the free chunk map list.
* Free backing IO unit 2 back to the free backing IO unit list.

```text
                 +--------------------+
  Backing Device |01 34               |
                 +--------------------+

                 +---+---+---+---+
  Logical Map    | 2 | X | 0 | X |
                 +---+---+---+---+
```

### Operations that span across multiple chunks

- @subpage memory
- @subpage concurrency
- @subpage ssd_internals
- @subpage nvme_spec
- @subpage vhost_processing
- @subpage overview
- @subpage porting

# Message Passing and Concurrency {#concurrency}

## Theory

One of the primary aims of SPDK is to scale linearly with the addition of
hardware. This can mean many things in practice. For instance, moving from one
SSD to two should double the number of I/O's per second. Or doubling the number
of CPU cores should double the amount of computation possible. Or even doubling
the number of NICs should double the network throughput. To achieve this, the
software's threads of execution must be independent from one another as much as
possible. In practice, that means avoiding software locks and even atomic
instructions.

Traditionally, software achieves concurrency by placing some shared data onto
the heap, protecting it with a lock, and then having all threads of execution
acquire the lock only when accessing the data. This model has many great
properties:

* It's easy to convert single-threaded programs to multi-threaded programs
  because you don't have to change the data model from the single-threaded
  version. You add a lock around the data.
* You can write your program as a synchronous, imperative list of statements
  that you read from top to bottom.
* The scheduler can interrupt threads, allowing for efficient time-sharing
  of CPU resources.

Unfortunately, as the number of threads scales up, contention on the lock around
the shared data does too. More granular locking helps, but then also increases
the complexity of the program. Even then, beyond a certain number of contended
locks, threads will spend most of their time attempting to acquire the locks and
the program will not benefit from more CPU cores.

SPDK takes a different approach altogether. Instead of placing shared data in a
global location that all threads access after acquiring a lock, SPDK will often
assign that data to a single thread. When other threads want to access the data,
they pass a message to the owning thread to perform the operation on their
behalf. This strategy, of course, is not at all new. For instance, it is one of
the core design principles of
[Erlang](http://erlang.org/download/armstrong_thesis_2003.pdf) and is the main
concurrency mechanism in [Go](https://tour.golang.org/concurrency/2). A message
in SPDK consists of a function pointer and a pointer to some context. Messages
are passed between threads using a
[lockless ring](http://dpdk.org/doc/guides/prog_guide/ring_lib.html). Message
passing is often much faster than most software developers' intuition leads them
to believe, due to caching effects. If a single core is accessing the same data
(on behalf of all of the other cores), then that data is far more likely to be
in a cache closer to that core. It's often most efficient to have each core work
on a small set of data sitting in its local cache and then hand off a small
message to the next core when done.

In more extreme cases where even message passing may be too costly, each thread
may make a local copy of the data. The thread will then only reference its local
copy. To mutate the data, threads will send a message to each other thread
telling them to perform the update on their local copy. This is great when the
data isn't mutated very often, but is read very frequently, and is often
employed in the I/O path. This of course trades memory size for computational
efficiency, so it is used in only the most critical code paths.

## Message Passing Infrastructure

SPDK provides several layers of message passing infrastructure. The most
fundamental libraries in SPDK, for instance, don't do any message passing on
their own and instead enumerate rules about when functions may be called in
their documentation (e.g. @ref nvme). Most libraries, however, depend on SPDK's
thread abstraction, located in `libspdk_thread.a`. The thread abstraction
provides a basic message passing framework and defines a few key primitives.

First, `spdk_thread` is an abstraction for a lightweight, stackless thread of
execution. A lower level framework can execute an `spdk_thread` for a single
timeslice by calling `spdk_thread_poll()`. A lower level framework is allowed to
move an `spdk_thread` between system threads at any time, as long as there is
only a single system thread executing `spdk_thread_poll()` on that
`spdk_thread` at any given time. New lightweight threads may be created at any
time by calling `spdk_thread_create()` and destroyed by calling
`spdk_thread_destroy()`. The lightweight thread is the foundational abstraction for
threading in SPDK.

There are then a few additional abstractions layered on top of the
`spdk_thread`. One is the `spdk_poller`, which is an abstraction for a
function that should be repeatedly called on the given thread. Another is an
`spdk_msg_fn`, which is a function pointer and a context pointer that can
be sent to a thread for execution via `spdk_thread_send_msg()`.

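For example, sending a message to another lightweight thread looks roughly like the sketch below
(assuming a `struct spdk_thread *target` obtained elsewhere):

```c
#include <stdio.h>
#include "spdk/thread.h"

/* Executed later, on the target spdk_thread. */
static void
say_hello(void *ctx)
{
	printf("hello from %s\n", (const char *)ctx);
}

static void
send_hello(struct spdk_thread *target)
{
	/* Enqueues the (function, context) pair on the target thread's
	 * lockless ring; it runs during a later spdk_thread_poll(). */
	if (spdk_thread_send_msg(target, say_hello, (void *)"another thread") != 0) {
		/* the message could not be queued */
	}
}
```
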
The library also defines two additional abstractions: `spdk_io_device` and
`spdk_io_channel`. In the course of implementing SPDK we noticed the same
pattern emerging in a number of different libraries. In order to implement a
message passing strategy, the code would describe some object with global state
and also some per-thread context associated with that object that was accessed
in the I/O path to avoid locking on the global state. The pattern was clearest
in the lowest layers where I/O was being submitted to block devices. These
devices often expose multiple queues that can be assigned to threads and then
accessed without a lock to submit I/O. To abstract that, we generalized the
device to `spdk_io_device` and the thread-specific queue to `spdk_io_channel`.
Over time, however, the pattern has appeared in a huge number of places that
don't fit quite so nicely with the names we originally chose. In today's code
`spdk_io_device` is any pointer, whose uniqueness is predicated only on its
memory address, and `spdk_io_channel` is the per-thread context associated with
a particular `spdk_io_device`.

The threading abstraction provides functions to send a message to any other
thread, to send a message to all threads one by one, and to send a message to
all threads for which there is an io_channel for a given io_device.

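The typical registration pattern looks like the sketch below. The `spdk_io_device_register()` and
`spdk_get_io_channel()` calls are the real API; `struct my_dev` and its per-thread context are
hypothetical:

```c
#include "spdk/thread.h"

struct my_dev {                 /* hypothetical global state */
	int id;
};

struct my_dev_channel {         /* hypothetical per-thread context */
	int outstanding_io;
};

static int
my_dev_channel_create(void *io_device, void *ctx_buf)
{
	struct my_dev_channel *ch = ctx_buf;

	ch->outstanding_io = 0; /* set up this thread's queue */
	return 0;
}

static void
my_dev_channel_destroy(void *io_device, void *ctx_buf)
{
	/* tear down this thread's queue */
}

static void
my_dev_init(struct my_dev *dev)
{
	/* The pointer itself is the io_device's identity. */
	spdk_io_device_register(dev, my_dev_channel_create,
				my_dev_channel_destroy,
				sizeof(struct my_dev_channel), "my_dev");
}

/* Each thread that submits I/O calls this; the per-thread context is
 * created on first use and looked up on subsequent calls. */
static struct spdk_io_channel *
my_dev_get_channel(struct my_dev *dev)
{
	return spdk_get_io_channel(dev);
}
```
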
Most critically, the thread abstraction does not actually spawn any system level
threads of its own. Instead, it relies on the existence of some lower level
framework that spawns system threads and sets up event loops. Inside those event
loops, the threading abstraction simply requires the lower level framework to
repeatedly call `spdk_thread_poll()` on each `spdk_thread` that exists. This
makes SPDK very portable to a wide variety of asynchronous, event-based
frameworks such as [Seastar](https://www.seastar.io) or [libuv](https://libuv.org/).

## SPDK Spinlocks

There are some cases where locks are used. These should be limited in favor of
the message passing interface described above. When locks are needed,
SPDK spinlocks should be used instead of POSIX locks.

POSIX locks like `pthread_mutex_t` and `pthread_spinlock_t` do not properly
handle locking between SPDK's lightweight threads. SPDK's `spdk_spinlock`
is safe to use in SPDK libraries and applications. This safety comes from
imposing restrictions on when locks can be held. See
[spdk_spinlock](structspdk__spinlock.html) for details.

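A minimal usage sketch; the `spdk_spin_*` calls are the real API, while the structure around them
is illustrative:

```c
#include "spdk/thread.h"

struct shared_counter {
	struct spdk_spinlock lock;
	uint64_t value;
};

static void
counter_init(struct shared_counter *c)
{
	spdk_spin_init(&c->lock);
	c->value = 0;
}

static void
counter_bump(struct shared_counter *c)
{
	spdk_spin_lock(&c->lock);
	/* keep critical sections short; never block while holding the lock */
	c->value++;
	spdk_spin_unlock(&c->lock);
}
```
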
## The event Framework

The SPDK project didn't want to officially pick an asynchronous, event-based
framework for all of the example applications it shipped with, in the interest
of supporting the widest variety of frameworks possible. But the applications do
of course require something that implements an asynchronous event loop in order
to run, so enter the `event` framework located in `lib/event`. This framework
includes things like polling and scheduling the lightweight threads, installing
signal handlers to cleanly shutdown, and basic command line option parsing.
Only established applications should consider directly integrating the lower
level libraries.

## Limitations of the C Language

Message passing is efficient, but it results in asynchronous code.
Unfortunately, asynchronous code is a challenge in C. It's often implemented by
passing function pointers that are called when an operation completes. If
function `foo` performs some asynchronous operation and when that completes
function `bar` is called, then function `bar` performs some operation that
calls function `baz` on completion, a good way to write it is as such:

```c
void baz(void *ctx) {
        ...
}

void bar(void *ctx) {
        async_op(baz, ctx);
}

void foo(void *ctx) {
        async_op(bar, ctx);
}
```

Don't split these functions up - keep them as a nice unit that can be read from bottom to top.

For more complex callback chains, especially ones that have logical branches or
loops, it's best to write out a state machine. It turns out that higher level
languages that support futures and promises are just generating state machines
at compile time, so even though we don't have the ability to generate
them in C we can still write them out by hand. As an example, here's a
callback chain that performs `foo` 5 times and then calls `bar` - effectively
an asynchronous for loop.

```c
enum states {
        FOO_START = 0,
        FOO_END,
        BAR_START,
        BAR_END
};

/* ... */

        run_state_machine(sm);
}
```

This is complex, of course, but the `run_state_machine` function can be read
from top to bottom to get a clear overview of what's happening in the code.

# SPDK and Containers {#containers}

This is a living document as there are many ways to use containers with
SPDK. As new usages are identified and tested, they will be documented
here.

## In this document {#containers_toc}

* @ref spdk_in_docker
* @ref spdk_docker_suite
* @ref kata_containers_with_spdk_vhost

## Containerizing an SPDK Application for Docker {#spdk_in_docker}

There are no SPDK specific changes needed to run an SPDK based application in
a docker container; however, this quick start guide should help you as you
containerize your SPDK based application.

1. Make sure you have all of your app dependencies identified and included in your Dockerfile
2. Make sure you have compiled your application for the target arch
3. Make sure your host has hugepages enabled
4. Make sure your host has bound your nvme device to your userspace driver
5. Write your Dockerfile. The following is a simple Dockerfile to containerize the nvme `hello_world`
   example:

~~~{.sh}
# start with the latest Fedora
FROM fedora

# if you are behind a proxy, set that up now
ADD dnf.conf /etc/dnf/dnf.conf

# these are the min dependencies for the hello_world app
RUN dnf install libaio-devel -y
RUN dnf install numactl-devel -y

# set our working dir
WORKDIR /app

# add the hello_world binary
ADD hello_world hello_world

# run the app
CMD ./hello_world
~~~

6. Create your image

   `sudo docker image build -t hello:1.0 .`

7. Your docker command line will need to include at least the following:
   - the `--privileged` flag to enable sharing of hugepages
   - use of the `-v` switch to map hugepages

   `sudo docker run --privileged -v /dev/hugepages:/dev/hugepages hello:1.0`

   Depending on the needs of your app, you may need one or more of the following parameters:

   - If you are using the SPDK app framework: `-v /dev/shm:/dev/shm`
   - If you need to use RPCs from outside of the container: `-v /var/tmp:/var/tmp`
   - If you need to use the host network (e.g. the NVMF target application): `--network host`

Your output should look something like this:

~~~{.sh}
$ sudo docker run --privileged -v /dev/hugepages:/dev/hugepages hello:1.0
Starting SPDK v20.01-pre git sha1 80da95481 / DPDK 19.11.0 initialization...
[ DPDK EAL parameters: hello_world -c 0x1 --log-level=lib.eal:6 --log-level=lib.cryptodev:5 --log-level=user1:6 --iova-mode=pa
--base-virtaddr=0x200000000000 --match-allocations --file-prefix=spdk0 --proc-type=auto ]
EAL: No available hugepages reported in hugepages-1048576kB
Initializing NVMe Controllers
Attaching to 0000:06:00.0
Attached to 0000:06:00.0
Using controller INTEL SSDPEDMD400G4 (CVFT7203005M400LGN ) with 1 namespaces.
Namespace ID: 1 size: 400GB
Initialization complete.
INFO: using host memory buffer for IO
Hello world!
~~~

## SPDK Docker suite {#spdk_docker_suite}

For a formal way to build SPDK container images, deploy SPDK containers
correctly, interact with running SPDK container instances, and orchestrate SPDK
container instances, see the docker-compose example in the
[SPDK Docker suite](https://github.com/spdk/spdk/blob/master/docker/README.md).

## Using SPDK vhost target to provide volume service to Kata Containers and Docker {#kata_containers_with_spdk_vhost}

[Kata Containers](https://katacontainers.io) can build a secure container
runtime with lightweight virtual machines that feel and perform like
containers, but provide stronger workload isolation using hardware
virtualization technology as a second layer of defense.

As of Kata Containers [1.11.0](https://github.com/kata-containers/runtime/releases/tag/1.11.0),
vhost-user-blk support is enabled in `kata-containers/runtime`, which means the
SPDK vhost target can provide volume service to Kata Containers directly.
In addition, a container manager like Docker can easily be configured to launch
a Kata container with an SPDK vhost-user block device. For operating details, visit
the Kata Containers use case [Setup to run SPDK vhost-user devices with Kata Containers and Docker](https://github.com/kata-containers/documentation/blob/master/use-cases/using-SPDK-vhostuser-and-kata.md#host-setup-for-vhost-user-devices).

# SPDK Directory Structure {#directory_structure}

## Overview {#dir_overview}

SPDK is primarily a collection of C libraries intended to be consumed directly by
applications, but the repository also contains many examples and full-fledged applications.
This page provides a general overview of what is where in the repository.

## Applications {#dir_app}

The `app` top-level directory contains five applications:

- `app/iscsi_tgt`: An iSCSI target
- `app/nvmf_tgt`: An NVMe-oF target
- `app/iscsi_top`: Informational tool (like `top`) that tracks activity in the
  iSCSI target.
- `app/trace`: A tool for processing trace points output from the iSCSI and
  NVMe-oF targets.
- `app/vhost`: A vhost application that presents virtio controllers to
  QEMU-based VMs and processes I/O submitted to those controllers.

The application binaries will be in their respective directories after compiling, and all
can be run with no arguments to print out their command line arguments. The iSCSI
and NVMe-oF targets both need a configuration file (-c option). Fully commented
examples of the configuration files live in the `etc/spdk` directory.

## Build Collateral {#dir_build}

The `build` directory contains all of the static libraries constructed during
the build process. The `lib` directory combined with the `include/spdk`
directory are the official outputs of an SPDK release, if it were to be packaged.

## Documentation {#dir_doc}

The `doc` top-level directory contains all of SPDK's documentation. API documentation
is created using Doxygen directly from the code, but more general articles and longer
explanations reside in this directory, as well as the Doxygen config file.

To build the documentation, just type `make` within the doc directory.

## Examples {#dir_examples}

The `examples` top-level directory contains a set of examples intended to be used
for reference. These are different than the applications, which are doing a "real"
task that could reasonably be deployed. The examples are instead either heavily
contrived to demonstrate some facet of SPDK, or aren't considered complete enough
to warrant tagging them as a full blown SPDK application.

This is a great place to learn about how SPDK works. In particular, check out
`examples/nvme/hello_world`.

## Include {#dir_include}

The `include` directory is where all of the header files are located. The public API
is all placed in the `spdk` subdirectory of `include`, and we highly
recommend that applications set their include path to the top level `include`
directory and include the headers by prefixing `spdk/` like this:

~~~{.c}
#include "spdk/nvme.h"
~~~

Most of the headers here correspond with a library in the `lib` directory and will be
covered in that section. There are a few headers that stand alone, however. They are:

- `assert.h`
- `barrier.h`
- `endian.h`
- `fd.h`
- `mmio.h`
- `queue.h` and `queue_extras.h`
- `string.h`

There is also an `spdk_internal` directory that contains header files widely included
by libraries within SPDK, but that are not part of the public API and would not be
installed on a user's system.

## Libraries {#dir_lib}

The `lib` directory contains the real heart of SPDK. Each component is a C library with
its own directory under `lib`.

### Block Device Abstraction Layer {#dir_bdev}

The `bdev` directory contains a block device abstraction layer that is currently used
within the iSCSI and NVMe-oF targets. The public interface is `include/spdk/bdev.h`.
This library lacks clearly defined responsibilities as of this writing and instead does a
number of things:

- Translates from a common `block` protocol to specific protocols like NVMe or to system
  calls like libaio. There are currently four block device backend modules that can be
  plugged in - libaio, SPDK NVMe, CephRBD, and a RAM-based backend called malloc.
- Provides a mechanism for composing virtual block devices from physical devices (to do
  RAID and the like).
- Handles some memory allocation for data buffers.

This layer also could be made to do I/O queueing or splitting in a general way. We're open
to design ideas and discussion here.

### Configuration File Parser {#dir_conf}

The `conf` directory contains the configuration file parser. The public header
is `include/spdk/conf.h`. The configuration file format is kind of like INI,
except that the directives are "Name Value" instead of "Name = Value". This is
the configuration format for both the iSCSI and NVMe-oF targets.

... Lots more libraries that need to be described ...

## Makefile Fragments {#dir_mk}

The `mk` directory contains a number of shared Makefile fragments used in the build system.

## Scripts {#dir_scripts}

The `scripts` directory contains convenient scripts for a number of operations. The two most
important are `check_format.sh`, which will use astyle and pep8 to check C, C++, and Python
coding style against our defined conventions, and `setup.sh`, which binds and unbinds devices
from kernel drivers.

## Tests {#dir_tests}

The `test` directory contains all of the tests for SPDK's components, and the subdirectories mirror
the structure of the entire repository. The tests are a mixture of unit tests and functional tests.

# distributions {#distributions}

## In this document {#distros_toc}

* @ref distros_overview
* @ref linux_list
* @ref freebsd_list

## Overview {#distros_overview}

The CI pool uses different flavors of `Linux` and `FreeBSD` distributions which are
used as a base for all the tests run against submitted patches. Below is a
listing which covers all currently supported versions and the related CI
jobs (see [status](https://ci.spdk.io) as a reference).

## Linux distributions {#linux_list}

* Fedora: Trying to follow the new release as per the release cycle whenever possible.

```list
- autobuild-vg-autotest
- clang-vg-autotest
- iscsi*-vg-autotest
- nvme-vg-autotest
- nvmf*-vg-autotest
- scanbuild-vg-autotest
- unittest-vg-autotest
- vhost-initiator-vg-autotest
```

Jobs listed below are run on bare-metal systems where the version of
Fedora may vary. In the future these will be aligned with the
`vg` jobs.

```list
- BlobFS-autotest
- crypto-autotest
- nvme-phy-autotest
- nvmf*-phy-autotest
- vhost-autotest
```

* Ubuntu: Last two LTS releases. Currently `18.04` and `20.04`.

```list
- ubuntu18-vg-autotest
- ubuntu20-vg-autotest
```

* CentOS: Maintained releases. Currently `7.9`. CentOS `8.3` is only used for testing on the 22.01.x branch.

```list
- centos7-vg-autotest
- centos8-vg-autotest
```

* Rocky Linux: Latest release. Currently `8.6`. A CentOS 8 replacement.

```list
- rocky8-vg-autotest
```

## FreeBSD distributions {#freebsd_list}

* FreeBSD: Production release. Currently `12.2`.

```list
- freebsd-vg-autotest
```

- @subpage nvme
- @subpage ioat
- @subpage idxd
- @subpage virtio
- @subpage vmd

The event framework public interface is defined in event.h.

## Event Framework Design Considerations {#event_design}

Simple server applications can be written in a single-threaded fashion. This
allows for straightforward code that can maintain state without any locking or
other synchronization. Unfortunately, in many real-world cases, the connections are
not entirely independent and cross-thread shared state is necessary. SPDK
provides an event framework to help solve this problem.

## SPDK Event Framework Components {#event_components}

### Events {#event_component_events}

To accomplish cross-thread communication while minimizing synchronization
overhead, the framework provides message passing in the form of events. The
event-driven model requires the use of explicitly
asynchronous operations to achieve concurrency. Asynchronous I/O may be issued
with a non-blocking function call, and completion is typically signaled using
a callback function.

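As a sketch, allocating and sending an event to a specific core looks like this (the
`spdk_event_allocate()`/`spdk_event_call()` pair matches this era of the API; the handler itself
is hypothetical):

```c
#include "spdk/event.h"

/* Event functions receive two opaque arguments. */
static void
hello_event(void *arg1, void *arg2)
{
	/* runs on the reactor of the core chosen below */
}

static void
send_event_to_core(uint32_t lcore)
{
	struct spdk_event *event = spdk_event_allocate(lcore, hello_event,
						       NULL, NULL);
	spdk_event_call(event);
}
```
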
### Reactors {#event_component_reactors}

Each reactor has a lock-free queue for incoming events to that core, and
threads from any core may insert events into the queue of any other core. The
reactor loop running on each core checks for incoming events and executes them
in first-in, first-out order as they are received. Event functions should
never block and should preferably execute very quickly, since they are called
directly from the event loop on the destination core.

### Pollers {#event_component_pollers}

The framework also defines another type of function called a poller. Pollers
may be registered with the spdk_poller_register() function. Pollers, like
events, are functions with arguments that can be bundled and executed. However,
unlike events, pollers are executed repeatedly until unregistered, and are
intended to poll hardware as a replacement for interrupts. Normally, pollers
are executed on every iteration of the main event loop. Pollers may also be
scheduled to execute periodically on a timer if low latency is not required.

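A registration sketch follows. The `SPDK_POLLER_BUSY`/`SPDK_POLLER_IDLE` return values come from
newer releases of the API, and the device-polling body is hypothetical:

```c
#include "spdk/thread.h"

static struct spdk_poller *g_poller;

/* Hypothetical poller: check hardware for completed I/O. */
static int
poll_completions(void *ctx)
{
	int completions = 0;
	/* ... check the device's completion queue here ... */
	return completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static void
start_polling(void *device)
{
	/* period 0: run on every iteration of the event loop */
	g_poller = spdk_poller_register(poll_completions, device, 0);
}
```
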
### Application Framework {#event_component_app}

The framework itself is bundled into a higher level abstraction called an "app". Once
spdk_app_start() is called, it will block the current thread until the application
terminates by calling spdk_app_stop(), or until an error condition occurs during the
initialization code within spdk_app_start() itself, before invoking the caller's
supplied function.

### Custom shutdown callback {#event_component_shutdown}

When creating an SPDK based application, the user may add a custom shutdown callback, which
will be called before the application framework starts the shutdown process. To do that,
set the shutdown_cb function callback in the spdk_app_opts structure passed to
spdk_app_start(). The custom shutdown callback should call spdk_app_stop() before
returning in order to continue the application shutdown process.

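Putting the pieces together, a skeleton application with a custom shutdown callback looks roughly
like this (note that `spdk_app_opts_init()` takes a size argument only in newer releases):

```c
#include "spdk/event.h"

static void
app_main(void *arg)
{
	/* application startup logic goes here */
}

static void
on_shutdown(void)
{
	/* release resources, then continue the shutdown process */
	spdk_app_stop(0);
}

int
main(int argc, char **argv)
{
	struct spdk_app_opts opts = {};
	int rc;

	spdk_app_opts_init(&opts, sizeof(opts));
	opts.name = "hello_app";
	opts.shutdown_cb = on_shutdown;

	rc = spdk_app_start(&opts, app_main, NULL);
	spdk_app_fini();
	return rc;
}
```
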
# Experimental Tools {#experimental_tools}

- @subpage spdkcli

# Flash Translation Layer {#ftl}

The Flash Translation Layer library provides efficient 4K block device access on top of devices
with >4K write unit size (eg. raid5f bdev) or devices with large indirection units (some
capacity-focused NAND drives), which don't handle 4K writes well. It handles the logical to
physical address mapping and manages the garbage collection process.

## Terminology {#ftl_terminology}

### Logical to physical address map {#ftl_l2p}

- Shorthand: `L2P`

Contains the mapping of the logical addresses (LBA) to their on-disk physical location. The LBAs
are contiguous and in range from 0 to the number of surfaced blocks (the number of spare blocks
are calculated during device formation and are subtracted from the available address space). The
spare blocks account for zones going offline throughout the lifespan of the device as well as
provide necessary buffer for data [garbage collection](#ftl_reloc).

Since the L2P would occupy a significant amount of DRAM (4B/LBA for drives smaller than 16TiB,
8B/LBA for bigger drives; for example, a 16TiB device with 4KiB blocks has 2^32 LBAs, so a fully
resident map would already need 16GiB), FTL will, by default, store only the 2GiB of most recently
used L2P addresses in memory (the amount is configurable), and page them in and out of the cache
device as necessary.

### Band {#ftl_band}

A band describes a collection of zones, each belonging to a different parallel unit. All writes to
a band follow the same pattern - a batch of logical blocks is written to one zone, another batch
to the next one and so on. This ensures the parallelism of the write operations, as they can be
executed independently on different zones. Each band keeps track of the LBAs it consists of, as
well as their validity, as some of the data will be invalidated by subsequent writes to the same
logical address. The L2P mapping can be restored from the SSD by reading this information in order
from the oldest band to the youngest.

```text
            +--------------+        +--------------+                        +--------------+
   band 1   |    zone 1    +--------+    zone 1    +---- --- --- --- --- ---+    zone 1    |
            +--------------+        +--------------+                        +--------------+
   band 2   |    zone 2    +--------+    zone 2    +---- --- --- --- --- ---+    zone 2    |
            +--------------+        +--------------+                        +--------------+
   band 3   |    zone 3    +--------+    zone 3    +---- --- --- --- --- ---+    zone 3    |
            +--------------+        +--------------+                        +--------------+
            |     ...      |        |     ...      |                        |     ...      |
            +--------------+        +--------------+                        +--------------+
   band m   |    zone m    +--------+    zone m    +---- --- --- --- --- ---+    zone m    |
            +--------------+        +--------------+                        +--------------+
            |     ...      |        |     ...      |                        |     ...      |
            +--------------+        +--------------+                        +--------------+

           parallel unit 1               pu 2                                    pu n
```

The address map (`P2L`) is saved as a part of the band's metadata, at the end of each band:

```text
       band's data                     tail metadata
    +-------------------+-------------------------------+------------------------+
    |zone 1 |...|zone n |...|...|zone 1 |...|           | ... |zone m-1 |zone m  |
    |block 1|   |block 1|   |   |block x|   |           |     |block y  |block y |
    +-------------------+-------------+-----------------+------------------------+
```

Bands are written sequentially (in a way that was described earlier). Before a band can be written
to, all of its zones need to be erased. During that time, the band is considered to be in a `PREP`
state. Then the band moves to the `OPEN` state and actual user data can be written to the
band. Once the whole available space is filled, tail metadata is written and the band transitions to
`CLOSING` state. When that finishes the band becomes `CLOSED`.

### Non volatile cache {#ftl_nvcache}

- Shorthand: `nvcache`

Nvcache is a bdev that is used for buffering user writes and storing various metadata.
Nvcache data space is divided into chunks. Chunks are written in a sequential manner.
When the number of free chunks is below the assigned threshold, data from fully written chunks
is moved to the base_bdev. This process is called chunk compaction.

```text
                     nvcache
    +-----------------------------------------+
    |chunk 1                                  |
    |   +----------------------------------+  |
    |   |blk 1 + md| blk 2 + md| blk n + md|  |
    |   +----------------------------------+  |
    +-----------------------------------------+
    | ...                                     |
    +-----------------------------------------+
    +-----------------------------------------+
    |chunk N                                  |
    |   +----------------------------------+  |
    |   |blk 1 + md| blk 2 + md| blk n + md|  |
    |   +----------------------------------+  |
    +-----------------------------------------+
```

### Garbage collection and relocation {#ftl_reloc}

- Shorthand: gc, reloc

Since a write to the same LBA invalidates its previous physical location, some of the blocks on a
band might contain old data that basically wastes space. As there is no way to overwrite an already
written block for a ZNS drive, this data will stay there until the whole zone is reset. This might
create a situation in which all of the bands contain some valid data and no band can be erased, so
no writes can be executed anymore. Therefore a mechanism is needed to move valid data and
invalidate whole bands, so that they can be reused.

```text
                    band                                             band
    +-----------------------------------+      +-----------------------------------+
    | ** *    * ***      *    *** * *   |      |                                   |
    |**  *       *    *    * *     *   *| +--->|                                   |
    |*     ***  *      *            *   |      |                                   |
    +-----------------------------------+      +-----------------------------------+
```

Valid blocks are marked with an asterisk '\*'.

The module responsible for data relocation is called `reloc`. When a band is chosen for garbage
collection, the appropriate blocks are marked as required to be moved. The `reloc` module takes a
band that has some of such blocks marked, checks their validity and, if they're still valid,
copies them.

Another reason for data relocation might be an event from the SSD telling us that the data might
become corrupt if it's not relocated. This might happen due to its old age (if it was written a
long time ago) or due to read disturb (a media characteristic that causes corruption of
neighbouring blocks during a read operation).

Choosing a band for garbage collection depends on its validity ratio (the proportion of valid
blocks to all user blocks). The lower the ratio, the higher the chance the band will be chosen
for gc.

## Metadata {#ftl_metadata}

In addition to the [L2P](#ftl_l2p), FTL will store additional metadata both on the cache, as
well as on the base devices. The following types of metadata are persisted:

- Superblock - stores the global state of FTL; stored on cache, mirrored to the base device
- L2P - see the [L2P](#ftl_l2p) section for details
- Band - stores the state of bands - write pointers, their OPEN/FREE/CLOSE state; stored on cache,
  mirrored to a different section of the cache device
- Valid map - bitmask of all the valid physical addresses, used for improving
  [relocation](#ftl_reloc)
- Chunk - stores the state of chunks - write pointers, their OPEN/FREE/CLOSE state; stored on
  cache, mirrored to a different section of the cache device
- P2L - stores the address mapping (P2L, see [band](#ftl_band)) of currently open bands. This
  allows for the recovery of open bands after dirty shutdown without needing VSS DIX metadata on
  the base device; stored on the cache device
- Trim - stores information about unmapped (trimmed) LBAs; stored on cache, mirrored to a
  different section of the cache device

## Dirty shutdown recovery {#ftl_dirty_shutdown}

After power failure, FTL needs to rebuild the whole L2P using the address maps (`P2L`) stored within each band/chunk.
This needs to be done because, while individual L2P pages may have been paged out and persisted to the cache device,
there's no way to tell which, if any, pages were dirty before the power failure occurred. The P2L consists of not only
the mapping itself, but also a sequence id (`seq_id`), which describes the relative age of a given logical block
(multiple writes to the same logical block would produce the same amount of P2L entries, only the last one having the current data).

FTL will therefore rebuild the whole L2P by reading the P2L of all closed bands and chunks. For open bands, the P2L is stored on
the cache device, in a separate metadata region (see [the P2L section](#ftl_metadata)). Open chunks can be restored thanks to storing
the mapping in the VSS DIX metadata, which the cache device must be formatted with.

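A minimal sketch of the `seq_id` arbitration during the rebuild (field names here are hypothetical;
the actual recovery is implemented in C inside the FTL module):

```python
def rebuild_l2p(p2l_entries):
    # Newer writes carry a higher seq_id, so an entry only wins if it is newer
    # than what has been recorded for that LBA so far.
    l2p = {}      # lba -> physical address
    newest = {}   # lba -> seq_id of the entry currently in l2p
    for entry in p2l_entries:  # entries read from all closed bands/chunks
        if entry.lba not in newest or entry.seq_id > newest[entry.lba]:
            l2p[entry.lba] = entry.physical_addr
            newest[entry.lba] = entry.seq_id
    return l2p
```
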
### Shared memory recovery {#ftl_shm_recovery}

In order to shorten the recovery after a crash of the target application, FTL also stores its metadata in shared memory (`shm`) - this
allows it to keep track of the dirtiness state of individual pages and shortens the recovery time dramatically, as FTL will only
need to mark any potential L2P pages which were being paged out at the time of the crash as dirty and reissue the writes. There's no need
to read the whole P2L in this case.

### Trim {#ftl_trim}

Due to metadata size constraints and the difficulty of maintaining consistent data returned before and after dirty shutdown, FTL
currently only allows for trims (unmaps) aligned to 4MiB (the alignment concerns both the offset and length of the trim command).

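For illustration, a minimal sketch of the alignment rule (the constant and the check are the only
assumptions here; the actual validation happens inside the FTL module):

```python
TRIM_ALIGNMENT = 4 * 1024 * 1024  # 4MiB, in bytes

def trim_is_valid(offset_bytes, length_bytes):
    # Both the starting offset and the length must be 4MiB-aligned.
    return offset_bytes % TRIM_ALIGNMENT == 0 and length_bytes % TRIM_ALIGNMENT == 0
```
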
## Usage {#ftl_usage}

### Prerequisites {#ftl_prereq}

In order to use the FTL module, a cache device formatted with VSS DIX metadata is required.

### FTL bdev creation {#ftl_create}

Similar to other bdevs, the FTL bdevs can be created either based on JSON config files or via RPC.
Both interfaces require the same arguments, which are described by the `--help` option of the
`bdev_ftl_create` RPC call:

- bdev's name
- base bdev's name
- cache bdev's name (the cache bdev must support VSS DIX mode - this can be emulated by providing the SPDK_FTL_VSS_EMU=1 flag to make;
  emulating VSS should be done for testing purposes only, as it is not power-fail safe)
- UUID of the FTL device (if the FTL is to be restored from the SSD)

## FTL bdev stack {#ftl_bdev_stack}

In order to create FTL on top of a regular bdev:

1) Create a regular bdev, e.g. `bdev_nvme`, `bdev_null`, `bdev_malloc`
2) Create a second regular bdev for the nvcache
3) Create the FTL bdev on top of the bdevs created in steps 1 and 2

Example:

```
$ scripts/rpc.py bdev_nvme_attach_controller -b nvme0 -a 00:05.0 -t pcie
nvme0n1

$ scripts/rpc.py bdev_nvme_attach_controller -b nvme1 -a 00:06.0 -t pcie
nvme1n1

$ scripts/rpc.py bdev_ftl_create -b ftl0 -d nvme0n1 -c nvme1n1
{
    "name": "ftl0",
    "uuid": "3b469565-1fa5-4bfb-8341-747ec9f3a9b9"
}
```

## Configuring QEMU {#ftl_qemu_config}

FTL can also be used with an emulated Open Channel SSD. The QEMU with the patches providing Open
Channel support can be found on the SPDK's QEMU fork on the
[spdk-3.0.0](https://github.com/spdk/qemu/tree/spdk-3.0.0) branch.

To emulate an Open Channel device, QEMU expects parameters describing the characteristics and
geometry of the SSD:

- `serial` - serial number,
- `lver` - version of the OCSSD standard (0 - disabled, 1 - "1.2", 2 - "2.0"); libftl only supports 2.0,
- `lba_index` - default LBA format; possible values (libftl only supports lba_index >= 3):

  |lba_index| data| metadata|
  |---------|-----|---------|
  | 0       | 512B| 0B      |
  | 1       | 512B| 8B      |
  | 2       | 512B| 16B     |
  | 3       |4096B| 0B      |
  | 4       |4096B| 64B     |
  | 5       |4096B| 128B    |
  | 6       |4096B| 16B     |

- `lnum_ch` - number of groups,
- `lnum_lun` - number of parallel units,
- `lnum_pln` - number of planes (logical blocks from all planes constitute a chunk),
- `lpgs_per_blk` - number of pages (smallest programmable unit) per chunk,
- `lsecs_per_pg` - number of sectors in a page,
- `lblks_per_pln` - number of chunks in a parallel unit,
- `laer_thread_sleep` - timeout in ms between asynchronous events requesting the host to relocate
  the data based on media feedback,
- `lmetadata` - metadata file.

For a more detailed description of the available options, consult the `hw/block/nvme.c` file in
the QEMU repository.

Example:

```
$ /path/to/qemu [OTHER PARAMETERS] -drive format=raw,file=/path/to/data/file,if=none,id=myocssd0
       -device nvme,drive=myocssd0,serial=deadbeef,lver=2,lba_index=3,lnum_ch=1,lnum_lun=8,lnum_pln=4,
       lpgs_per_blk=1536,lsecs_per_pg=4,lblks_per_pln=512,lmetadata=/path/to/md/file
```

In the above example, a device is created with 1 channel, 8 parallel units, 512 chunks per parallel
unit and 24576 (`lnum_pln` * `lpgs_per_blk` * `lsecs_per_pg`) logical blocks in each chunk, with
each logical block being 4096B. Therefore the data file needs to be at least 384G
(8 * 512 * 24576 * 4096B) in size and can be created with the following command:

```
$ fallocate -l 384G /path/to/data/file
```

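The required file size follows directly from the advertised geometry; a quick sketch of the
arithmetic (plain illustration, no SPDK or QEMU code involved):

```python
lnum_lun, lblks_per_pln = 8, 512               # parallel units, chunks per unit
lnum_pln, lpgs_per_blk, lsecs_per_pg = 4, 1536, 4
block_size = 4096                              # bytes, from lba_index=3

blocks_per_chunk = lnum_pln * lpgs_per_blk * lsecs_per_pg   # 24576
total_bytes = lnum_lun * lblks_per_pln * blocks_per_chunk * block_size
print(total_bytes // 2**30, "GiB")             # -> 384 GiB
```
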
## Configuring SPDK {#ftl_spdk_config}

To verify that the drive is emulated correctly, one can check the output of the NVMe identify app
(assuming that `scripts/setup.sh` was called before and the driver has been changed for that
device):

```
$ examples/nvme/identify/identify
=====================================================
NVMe Controller at 0000:00:0a.0 [1d1d:1f1f]
=====================================================
Controller Capabilities/Features
================================
Vendor ID:           1d1d
Subsystem Vendor ID: 1af4
Serial Number:       deadbeef
Model Number:        QEMU NVMe Ctrl

... other info ...

Namespace OCSSD Geometry
=======================
OC version: maj:2 min:0

... other info ...

Groups (channels): 1
PUs (LUNs) per group: 8
Chunks per LUN: 512
Logical blks per chunk: 24576

... other info ...

```

Similarly to other bdevs, the FTL bdevs can be created either based on config files or via RPC. Both
interfaces require the same arguments, which are described by the `--help` option of the
`construct_ftl_bdev` RPC call:

- bdev's name
- transport type of the device (e.g. PCIe)
- transport address of the device (e.g. `00:0a.0`)
- parallel unit range
- UUID of the FTL device (if the FTL is to be restored from the SSD)

Example config:

```
[Ftl]
  TransportID "trtype:PCIe traddr:00:0a.0" nvme0 "0-3"
  TransportID "trtype:PCIe traddr:00:0a.0" nvme1 "4-5" e9825835-b03c-49d7-bc3e-5827cbde8a88
```

The above will result in creation of two devices:

- `nvme0` on `00:0a.0` using parallel units 0-3, created from scratch
- `nvme1` on the same device using parallel units 4-5, restored from the SSD using the UUID
  provided

The same can be achieved with the following two RPC calls:

```
$ scripts/rpc.py construct_ftl_bdev -b nvme0 -l 0-3 -a 00:0a.0
{
        "name": "nvme0",
        "uuid": "b4624a89-3174-476a-b9e5-5fd27d73e870"
}
$ scripts/rpc.py construct_ftl_bdev -b nvme1 -l 4-5 -a 00:0a.0 -u e9825835-b03c-49d7-bc3e-5827cbde8a88
{
        "name": "nvme1",
        "uuid": "e9825835-b03c-49d7-bc3e-5827cbde8a88"
}
```

@ -1,269 +0,0 @@
# GDB Macros User Guide {#gdb_macros}

## Introduction

When debugging an SPDK application using gdb, we may need to view data structures
in lists, e.g. information about bdevs or threads.

If, for example, I have several bdevs and wish to get information on the bdev with
the name 'test_vols3', I will need to manually iterate over the list as follows:

~~~{.sh}
(gdb) p g_bdev_mgr->bdevs->tqh_first->name
$5 = 0x7f7dcc0b21b0 "test_vols1"
(gdb) p g_bdev_mgr->bdevs->tqh_first->internal->link->tqe_next->name
$6 = 0x7f7dcc0b1a70 "test_vols2"
(gdb) p
g_bdev_mgr->bdevs->tqh_first->internal->link->tqe_next->internal->link->tqe_next->name
$7 = 0x7f7dcc215a00 "test_vols3"
(gdb) p
g_bdev_mgr->bdevs->tqh_first->internal->link->tqe_next->internal->link->tqe_next
$8 = (struct spdk_bdev *) 0x7f7dcc2c7c08
~~~

At this stage, we can start looking at the relevant fields of our bdev, which we
now know is at address 0x7f7dcc2c7c08.

This can be somewhat troublesome if there are 100 bdevs and the one we need is
56th in the list...

Instead, we can use a gdb macro in order to get information about all the
devices.

Examples:

Printing bdevs:

~~~{.sh}
(gdb) spdk_print_bdevs

SPDK object of type struct spdk_bdev at 0x7f7dcc1642a8
((struct spdk_bdev*) 0x7f7dcc1642a8)
name 0x7f7dcc0b21b0 "test_vols1"

---------------

SPDK object of type struct spdk_bdev at 0x7f7dcc216008
((struct spdk_bdev*) 0x7f7dcc216008)
name 0x7f7dcc0b1a70 "test_vols2"

---------------

SPDK object of type struct spdk_bdev at 0x7f7dcc2c7c08
((struct spdk_bdev*) 0x7f7dcc2c7c08)
name 0x7f7dcc215a00 "test_vols3"

---------------
~~~

Finding a bdev by name:

~~~{.sh}
(gdb) spdk_find_bdev test_vols1
test_vols1

SPDK object of type struct spdk_bdev at 0x7f7dcc1642a8
((struct spdk_bdev*) 0x7f7dcc1642a8)
name 0x7f7dcc0b21b0 "test_vols1"
~~~

Printing spdk threads:

~~~{.sh}
(gdb) spdk_print_threads

SPDK object of type struct spdk_thread at 0x7fffd0008b50
((struct spdk_thread*) 0x7fffd0008b50)
name 0x7fffd00008e0 "reactor_1"
IO Channels:
SPDK object of type struct spdk_io_channel at 0x7fffd0052610
((struct spdk_io_channel*) 0x7fffd0052610)
name
ref 1
device 0x7fffd0008c80 (0x7fffd0008ce0 "nvmf_tgt")
---------------

SPDK object of type struct spdk_io_channel at 0x7fffd0056cd0
((struct spdk_io_channel*) 0x7fffd0056cd0)
name
ref 2
device 0x7fffd0056bf0 (0x7fffd0008e70 "test_vol1")
---------------

SPDK object of type struct spdk_io_channel at 0x7fffd00582e0
((struct spdk_io_channel*) 0x7fffd00582e0)
name
ref 1
device 0x7fffd0056c50 (0x7fffd0056cb0 "bdev_test_vol1")
---------------

SPDK object of type struct spdk_io_channel at 0x7fffd00583b0
((struct spdk_io_channel*) 0x7fffd00583b0)
name
ref 1
device 0x7fffd0005630 (0x7fffd0005690 "bdev_mgr")
---------------
~~~

Printing nvmf subsystems:

~~~{.sh}
(gdb) spdk_print_nvmf_subsystems

SPDK object of type struct spdk_nvmf_subsystem at 0x7fffd0008d00
((struct spdk_nvmf_subsystem*) 0x7fffd0008d00)
name "nqn.2014-08.org.nvmexpress.discovery", '\000' <repeats 187 times>
nqn "nqn.2014-08.org.nvmexpress.discovery", '\000' <repeats 187 times>
ID 0

---------------

SPDK object of type struct spdk_nvmf_subsystem at 0x7fffd0055760
((struct spdk_nvmf_subsystem*) 0x7fffd0055760)
name "nqn.2016-06.io.spdk.umgmt:cnode1", '\000' <repeats 191 times>
nqn "nqn.2016-06.io.spdk.umgmt:cnode1", '\000' <repeats 191 times>
ID 1
~~~

Printing SPDK spinlocks:

In this example, the spinlock has been initialized and locked but has never been unlocked.
After it is unlocked the first time, the last unlocked stack will be present and the
`Locked by spdk_thread` line will say `not locked`.

~~~{.sh}
Breakpoint 2, spdk_spin_unlock (sspin=0x655110 <g_bdev_mgr+80>) at thread.c:2915
2915            struct spdk_thread *thread = spdk_get_thread();
(gdb) print *sspin
$2 = struct spdk_spinlock:
Locked by spdk_thread: 0x658080
Initialized at:
0x43e677 <spdk_spin_init+213> thread.c:2878
0x404feb <_bdev_init+16> /build/spdk/spdk-review-public/lib/bdev/bdev.c:116
0x44483d <__libc_csu_init+77>
0x7ffff62c9d18 <__libc_start_main+120>
0x40268e <_start+46>
Last locked at:
0x43e936 <spdk_spin_lock+436> thread.c:2909
0x40ca9c <bdev_name_add+129> /build/spdk/spdk-review-public/lib/bdev/bdev.c:3855
0x411a3c <bdev_register+641> /build/spdk/spdk-review-public/lib/bdev/bdev.c:6660
0x412e1e <spdk_bdev_register+24> /build/spdk/spdk-review-public/lib/bdev/bdev.c:7171
0x417895 <num_blocks_test+119> bdev_ut.c:878
0x7ffff7bc38cb <run_single_test.constprop+379>
0x7ffff7bc3b61 <run_single_suite.constprop+433>
0x7ffff7bc3f76 <CU_run_all_tests+118>
0x43351f <main+1439> bdev_ut.c:6295
0x7ffff62c9d85 <__libc_start_main+229>
0x40268e <_start+46>
Last unlocked at:
~~~

Print a single spinlock stack:

~~~{.sh}
(gdb) print sspin->internal.lock_stack
$1 = struct sspin_stack:
0x40c6a1 <spdk_spin_lock+436> /build/spdk/spdk-review-public/lib/thread/thread.c:2909
0x413f48 <spdk_spin+552> thread_ut.c:1831
0x7ffff7bc38cb <run_single_test.constprop+379>
0x7ffff7bc3b61 <run_single_suite.constprop+433>
0x7ffff7bc3f76 <CU_run_all_tests+118>
0x4148fa <main+547> thread_ut.c:1948
0x7ffff62c9d85 <__libc_start_main+229>
0x40248e <_start+46>
~~~

## Loading The gdb Macros

Copy the gdb macros to the host where you are about to debug.
It is best to copy the file either to somewhere within the PYTHONPATH, or to add
the destination directory to the PYTHONPATH. This is not mandatory, and can be
worked around, but can save a few steps when loading the module to gdb.

From gdb, with the application core open, invoke python and load the modules.

In the example below, I copied the macros to the /tmp directory, which is not in
the PYTHONPATH, so I had to manually add the directory to the path.

~~~{.sh}
(gdb) python
>import sys
>sys.path.append('/tmp')
>import gdb_macros
>end
(gdb) spdk_load_macros
~~~

## Using the gdb Data Directory

On most systems, the data directory is /usr/share/gdb. The python script should
be copied into the python/gdb/function (or python/gdb/command) directory under
the data directory, e.g. /usr/share/gdb/python/gdb/function.

If the python script is in there, then the only thing you need to do when
starting gdb is type "spdk_load_macros".

## Using .gdbinit To Load The Macros

.gdbinit can also be used in order to automatically run the manual steps
above prior to starting gdb.

Example .gdbinit:

~~~{.sh}
source /opt/km/install/tools/gdb_macros/gdb_macros.py
~~~

When starting gdb you still have to call spdk_load_macros.

## Why Do We Need to Explicitly Call spdk_load_macros

The reason is that the macros need to use globals provided by spdk in order to
iterate the spdk lists and build iterable representations of the list objects.
This will result in errors if these are not available, which is very possible if
gdb is used for reasons other than debugging spdk core dumps.

In the example below, I attempted to load the macros when the globals were not
available, causing gdb to fail loading the gdb_macros:

~~~{.sh}
(gdb) spdk_load_macros
Traceback (most recent call last):
  File "/opt/km/install/tools/gdb_macros/gdb_macros.py", line 257, in invoke
    spdk_print_threads()
  File "/opt/km/install/tools/gdb_macros/gdb_macros.py", line 241, in __init__
    threads = SpdkThreads()
  File "/opt/km/install/tools/gdb_macros/gdb_macros.py", line 234, in __init__
    super(SpdkThreads, self).__init__('g_threads', SpdkThread)
  File "/opt/km/install/tools/gdb_macros/gdb_macros.py", line 25, in __init__
    ['tailq'])
  File "/opt/km/install/tools/gdb_macros/gdb_macros.py", line 10, in __init__
    self.list = gdb.parse_and_eval(self.list_pointer)
RuntimeError: No symbol table is loaded. Use the "file" command.
Error occurred in Python command: No symbol table is loaded. Use the "file"
command.
~~~

## Macros available

- spdk_load_macros: load the macros (use --reload in order to reload them)
- spdk_print_bdevs: information about bdevs
- spdk_find_bdev: find a bdev (substring search)
- spdk_print_io_devices: information about io devices
- spdk_print_nvmf_subsystems: information about nvmf subsystems
- spdk_print_threads: information about threads

## Adding New Macros

The list iteration macros are usually built from 3 layers (a sketch combining
them follows the lists below):

- SpdkPrintCommand: inherits from gdb.Command and invokes the list iteration
- SpdkTailqList: performs the iteration of a tailq list according to the tailq
  member implementation
- SpdkObject: provides the __str__ function so that the list iteration can print
  the object

Other useful objects:

- SpdkNormalTailqList: represents a list which has 'tailq' as the tailq object
- SpdkArr: iteration over an array (instead of a linked list)
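As a sketch, a new list-printing macro might combine these layers as below. The import and the
constructor signatures are assumptions inferred from the traceback shown earlier, not the exact
upstream API:

~~~{.py}
# Assumed import; the doc loads these helpers via `import gdb_macros`.
from gdb_macros import SpdkObject, SpdkTailqList, SpdkPrintCommand

class SpdkIoDevice(SpdkObject):
    # SpdkObject provides __str__; the wrapped C type name is an assumption.
    type_name = 'struct io_device'

class SpdkIoDevices(SpdkTailqList):
    def __init__(self):
        # Signature inferred from the traceback above:
        # (global tailq symbol, per-element wrapper class).
        super(SpdkIoDevices, self).__init__('g_io_devices', SpdkIoDevice)

class spdk_print_io_devices(SpdkPrintCommand):
    def __init__(self):
        # Assumed: SpdkPrintCommand registers the gdb command name and
        # iterates the given list when the command is invoked.
        super(spdk_print_io_devices, self).__init__('spdk_print_io_devices',
                                                    SpdkIoDevices())

spdk_print_io_devices()  # register the command with gdb
~~~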
@ -1,6 +1,6 @@
|
||||
# General Information {#general}
|
||||
|
||||
- @subpage directory_structure
|
||||
- @subpage event
|
||||
- @subpage scheduler
|
||||
- @subpage logical_volumes
|
||||
- @subpage accel_fw
|
||||
- @subpage vpp_integration
|
||||
|
@ -1,28 +1,23 @@
# Getting Started {#getting_started}

## Getting the Source Code {#getting_started_source}

~~~{.sh}
git clone https://github.com/spdk/spdk --recursive
~~~

or, equivalently:

~~~{.sh}
git clone https://github.com/spdk/spdk
cd spdk
git submodule update --init
~~~

## Installing Prerequisites {#getting_started_prerequisites}

The `scripts/pkgdep.sh` script will automatically install the bare minimum
dependencies required to build SPDK.
Use `--help` to see information on installing dependencies for optional components.

~~~{.sh}
sudo scripts/pkgdep.sh
~~~

The `--all` option will install all dependencies needed by SPDK features.

~~~{.sh}
sudo scripts/pkgdep.sh --all
~~~

## Building {#getting_started_building}

Linux:

@ -55,7 +50,7 @@ can enable it by doing the following:
make
~~~

## Running the Unit Tests {#getting_started_unittests}

It's always a good idea to confirm your build worked by running the
unit tests.
@ -68,7 +63,7 @@ You will see several error messages when running the unit tests, but they are
part of the test suite. The final message at the end of the script indicates
success or failure.

## Running the Example Applications {#getting_started_examples}

Before running an SPDK application, some hugepages must be allocated and
any NVMe and I/OAT devices must be unbound from the native kernel drivers.
@ -108,7 +103,7 @@ with no arguments to see the help output. If your system has its IOMMU
enabled you can run the examples as your regular user. If it doesn't, you'll
need to run as a privileged user (root).

A good example to start with is `build/examples/identify`, which prints
out information about all of the NVMe devices on your system.

Larger, more fully functional applications are available in the `app`
doc/idxd.md
@ -1,23 +0,0 @@
# IDXD Driver {#idxd}

## Public Interface {#idxd_interface}

- spdk/idxd.h

## Key Functions {#idxd_key_functions}

Function                                | Description
--------------------------------------- | -----------
spdk_idxd_probe()                       | @copybrief spdk_idxd_probe()
spdk_idxd_submit_copy()                 | @copybrief spdk_idxd_submit_copy()
spdk_idxd_submit_compare()              | @copybrief spdk_idxd_submit_compare()
spdk_idxd_submit_crc32c()               | @copybrief spdk_idxd_submit_crc32c()
spdk_idxd_submit_dualcast()             | @copybrief spdk_idxd_submit_dualcast()
spdk_idxd_submit_fill()                 | @copybrief spdk_idxd_submit_fill()

## Kernel vs User {#idxd_configs}

The low level library is initialized directly via `spdk_idxd_set_config`.
Passing in a value of `true` indicates that the IDXD kernel driver is loaded and
that SPDK will use work queue(s) surfaced by the driver. Passing in `false` means
that the SPDK user space driver will be used to initialize the hardware.
@ -1,673 +0,0 @@
[Deleted image: lvol_esnap_clone.svg ("Thin Provisioning", 19 KiB) - diagram of an esnap clone
volume: reads of unallocated clusters are served from a read-only external snapshot bdev, while
writes allocate and copy the cluster so that subsequent reads hit the allocated cluster.]
@ -1,124 +0,0 @@
[Deleted image: nvme_cuse.svg ("NVMe CUSE", 12 KiB) - diagram of the CUSE ctrlr (/dev/spdk/nvme0)
and CUSE ns (/dev/spdk/nvme0n1) ioctl pthreads submitting requests via nvme_io_msg_send() to an
io_msg queue, which an SPDK-thread io poller drains with spdk_nvme_io_msg_process(), executing
fn(arg).]
@ -1,41 +0,0 @@
[Deleted image: ublk.svg (5.6 KiB) - diagram of ublk workloads (Applications A-D and a filesystem)
accessing /dev/ublkb1-3; the kernel-space ublk driver forwards their I/O to a userspace ublk
server.]
34 doc/index.md
@ -1,41 +1,31 @@
# Storage Performance Development Kit {#mainpage}

## Introduction

# Storage Performance Development Kit {#index}

# Introduction

@copydoc intro

## Concepts

# Concepts

@copydoc concepts

## User Guides

# User Guides

@copydoc user_guides

## Programmer Guides

# Programmer Guides

@copydoc prog_guides

## General Information

# General Information

@copydoc general

## Miscellaneous

# Miscellaneous

@copydoc misc

## Driver Modules

# Driver Modules

@copydoc driver_modules

## Tools

# Tools

@copydoc tools

## CI Tools

@copydoc ci_tools

## Performance Reports

# Experimental Tools

@copydoc experimental_tools

# Performance Reports

@copydoc performance_reports
@ -4,5 +4,4 @@
- @subpage getting_started
- @subpage vagrant
- @subpage changelog
- @subpage deprecation
- [Source Code (GitHub)](https://github.com/spdk/spdk)
@ -1,10 +1,10 @@
# I/OAT Driver {#ioat}

## Public Interface {#ioat_interface}

# Public Interface {#ioat_interface}

- spdk/ioat.h

## Key Functions {#ioat_key_functions}

# Key Functions {#ioat_key_functions}

Function                                | Description
--------------------------------------- | -----------
191 doc/iscsi.md
@ -1,6 +1,6 @@
# iSCSI Target {#iscsi}

## iSCSI Target Getting Started Guide {#iscsi_getting_started}

# iSCSI Target Getting Started Guide {#iscsi_getting_started}

The Storage Performance Development Kit iSCSI target application is named `iscsi_tgt`.
The following section describes how to run iscsi from your cloned package.
@ -10,7 +10,7 @@ The following section describes how to run iscsi from your cloned package.
This guide starts by assuming that you can already build the standard SPDK distribution on your
platform.

Once built, the binary will be in `build/bin`.
Once built, the binary will be in `app/iscsi_tgt`.

If you want to kill the application using a signal, make sure to use SIGTERM; the application
will then release all its shared memory resources before exiting. SIGKILL will leave the shared memory
@ -23,6 +23,24 @@ document.

[image omitted in extraction]

## Configuring iSCSI Target via config file {#iscsi_config}

An `iscsi_tgt` specific configuration file is used to configure the iSCSI target. A fully documented
example configuration file is located at `etc/spdk/iscsi.conf.in`.

The configuration file is used to configure the SPDK iSCSI target. This file defines the following:
TCP ports to use as iSCSI portals; general iSCSI parameters; initiator names and addresses to allow
access to iSCSI target nodes; number and types of storage backends to export over iSCSI LUNs; iSCSI
target node mappings between portal groups, initiator groups, and LUNs.

You should make a copy of the example configuration file, modify it to suit your environment, and
then run the iscsi_tgt application and pass it the configuration file using the -c option. Right now,
the target requires elevated privileges (root) to run.

~~~
app/iscsi_tgt/iscsi_tgt -c /path/to/iscsi.conf
~~~

### Assigning CPU Cores to the iSCSI Target {#iscsi_config_lcore}

SPDK uses the [DPDK Environment Abstraction Layer](http://dpdk.org/doc/guides/prog_guide/env_abstraction_layer.html)
@ -32,49 +50,66 @@ To ensure the SPDK iSCSI target has the best performance, place the NICs and the
same NUMA node and configure the target to run on CPU cores associated with that node. The following
command line option is used to configure the SPDK iSCSI target:

~~~bash
~~~
-m 0xF000000
~~~

This is a hexadecimal bit mask of the CPU cores where the iSCSI target will start polling threads.
In this example, CPU cores 24, 25, 26 and 27 would be used.
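As a quick sanity check (plain shell arithmetic, not an SPDK tool), you can decode which cores a given mask selects:

~~~bash
# Decode a CPU core mask: print every core whose bit is set in 0xF000000.
for i in $(seq 0 31); do
    (( (0xF000000 >> i) & 1 )) && echo "core $i"
done
# Output: core 24, core 25, core 26, core 27 (one per line)
~~~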
### Configuring a LUN in the iSCSI Target {#iscsi_lun}

Each LUN in an iSCSI target node is associated with an SPDK block device. See @ref bdev
for details on configuring SPDK block devices. The block device to LUN mappings are specified in the
configuration file as:

~~~~
[TargetNodeX]
  LUN0 Malloc0
  LUN1 Nvme0n1
~~~~

This exports a malloc'd target. The disk is a RAM disk that is a chunk of memory allocated by iscsi in
user space. It will use the offload engine to do the copy job instead of memcpy if the system has enough DMA
channels.

## Configuring iSCSI Target via RPC method {#iscsi_rpc}

The iSCSI target is configured via JSON-RPC calls. See @ref jsonrpc for details.
In addition to the configuration file, the iSCSI target may also be configured via JSON-RPC calls. See
@ref jsonrpc for details.

### Portal groups

- iscsi_create_portal_group -- Add a portal group.
- iscsi_delete_portal_group -- Delete an existing portal group.
- iscsi_target_node_add_pg_ig_maps -- Add initiator group to portal group mappings to an existing iSCSI target node.
- iscsi_target_node_remove_pg_ig_maps -- Delete initiator group to portal group mappings from an existing iSCSI target node.
- iscsi_get_portal_groups -- Show information about all available portal groups.
- add_portal_group -- Add a portal group.
- delete_portal_group -- Delete an existing portal group.
- add_pg_ig_maps -- Add initiator group to portal group mappings to an existing iSCSI target node.
- delete_pg_ig_maps -- Delete initiator group to portal group mappings from an existing iSCSI target node.
- get_portal_groups -- Show information about all available portal groups.

~~~bash
/path/to/spdk/scripts/rpc.py iscsi_create_portal_group 1 10.0.0.1:3260
~~~
python /path/to/spdk/scripts/rpc.py add_portal_group 1 10.0.0.1:3260
~~~

### Initiator groups

- iscsi_create_initiator_group -- Add an initiator group.
- iscsi_delete_initiator_group -- Delete an existing initiator group.
- iscsi_initiator_group_add_initiators -- Add initiators to an existing initiator group.
- iscsi_get_initiator_groups -- Show information about all available initiator groups.
- add_initiator_group -- Add an initiator group.
- delete_initiator_group -- Delete an existing initiator group.
- add_initiators_to_initiator_group -- Add initiators to an existing initiator group.
- get_initiator_groups -- Show information about all available initiator groups.

~~~bash
/path/to/spdk/scripts/rpc.py iscsi_create_initiator_group 2 ANY 10.0.0.2/32
~~~
python /path/to/spdk/scripts/rpc.py add_initiator_group 2 ANY 10.0.0.2/32
~~~

### Target nodes

- iscsi_create_target_node -- Add an iSCSI target node.
- iscsi_delete_target_node -- Delete an iSCSI target node.
- iscsi_target_node_add_lun -- Add a LUN to an existing iSCSI target node.
- iscsi_get_target_nodes -- Show information about all available iSCSI target nodes.
- construct_target_node -- Add a iSCSI target node.
- delete_target_node -- Delete a iSCSI target node.
- target_node_add_lun -- Add an LUN to an existing iSCSI target node.
- get_target_nodes -- Show information about all available iSCSI target nodes.

~~~bash
/path/to/spdk/scripts/rpc.py iscsi_create_target_node Target3 Target3_alias MyBdev:0 1:2 64 -d
~~~
python /path/to/spdk/scripts/rpc.py construct_target_node Target3 Target3_alias MyBdev:0 1:2 64 -d
~~~

## Configuring iSCSI Initiator {#iscsi_initiator}

@ -83,30 +118,30 @@ The Linux initiator is open-iscsi.

Installing open-iscsi package
Fedora:
~~~bash
~~~
yum install -y iscsi-initiator-utils
~~~

Ubuntu:
~~~bash
~~~
apt-get install -y open-iscsi
~~~

### Setup

Edit /etc/iscsi/iscsid.conf
~~~bash
~~~
node.session.cmds_max = 4096
node.session.queue_depth = 128
~~~

iscsid must be restarted or receive SIGHUP for changes to take effect. To send SIGHUP, run:
~~~bash
~~~
killall -HUP iscsid
~~~

Recommended changes to /etc/sysctl.conf
~~~bash
~~~
net.ipv4.tcp_timestamps = 1
net.ipv4.tcp_sack = 0

@ -124,14 +159,13 @@ net.core.netdev_max_backlog = 300000
### Discovery

Assume target is at 10.0.0.1

~~~bash
~~~
iscsiadm -m discovery -t sendtargets -p 10.0.0.1
~~~

### Connect to target

~~~bash
~~~
iscsiadm -m node --login
~~~

@ -140,13 +174,13 @@ they came up as.

### Disconnect from target

~~~bash
~~~
iscsiadm -m node --logout
~~~

### Deleting target node cache

~~~bash
~~~
iscsiadm -m node -o delete
~~~

@ -154,7 +188,7 @@ This will cause the initiator to forget all previously discovered iSCSI target n

### Finding /dev/sdX nodes for iSCSI LUNs

~~~bash
~~~
iscsiadm -m session -P 3 | grep "Attached scsi disk" | awk '{print $4}'
~~~

@ -166,25 +200,25 @@ After the targets are connected, they can be tuned. For example if /dev/sdc is
an iSCSI disk then the following can be done:
Set the noop scheduler

~~~bash
~~~
echo noop > /sys/block/sdc/queue/scheduler
~~~

Disable merging/coalescing (can be useful for precise workload measurements)

~~~bash
~~~
echo "2" > /sys/block/sdc/queue/nomerges
~~~

Increase requests for block queue

~~~bash
~~~
echo "1024" > /sys/block/sdc/queue/nr_requests
~~~

### Example: Configure simple iSCSI Target with one portal and two LUNs

Assuming we have one iSCSI Target server with portal at 10.0.0.1:3200, two LUNs (Malloc0 and Malloc1),
Assuming we have one iSCSI Target server with portal at 10.0.0.1:3200, two LUNs (Malloc0 and Malloc),
and accepting initiators on 10.0.0.2/32, like on diagram below:

[image omitted in extraction]
@ -192,57 +226,56 @@ Assuming we have one iSCSI Target server with portal at 10.0.0.1:3200, two LUNs
#### Configure iSCSI Target

Start iscsi_tgt application:

```bash
./build/bin/iscsi_tgt
```
$ ./app/iscsi_tgt/iscsi_tgt
```

Construct two 64MB Malloc block devices with 512B sector size "Malloc0" and "Malloc1":

```bash
./scripts/rpc.py bdev_malloc_create -b Malloc0 64 512
./scripts/rpc.py bdev_malloc_create -b Malloc1 64 512
```
$ python ./scripts/rpc.py construct_malloc_bdev -b Malloc0 64 512
$ python ./scripts/rpc.py construct_malloc_bdev -b Malloc1 64 512
```

Create new portal group with id 1, and address 10.0.0.1:3260:

```bash
./scripts/rpc.py iscsi_create_portal_group 1 10.0.0.1:3260
```
$ python ./scripts/rpc.py add_portal_group 1 10.0.0.1:3260
```

Create one initiator group with id 2 to accept any connection from 10.0.0.2/32:

```bash
./scripts/rpc.py iscsi_create_initiator_group 2 ANY 10.0.0.2/32
```
$ python ./scripts/rpc.py add_initiator_group 2 ANY 10.0.0.2/32
```

Finally construct one target using previously created bdevs as LUN0 (Malloc0) and LUN1 (Malloc1)
Finaly construct one target using previously created bdevs as LUN0 (Malloc0) and LUN1 (Malloc1)
with a name "disk1" and alias "Data Disk1" using portal group 1 and initiator group 2.

```bash
./scripts/rpc.py iscsi_create_target_node disk1 "Data Disk1" "Malloc0:0 Malloc1:1" 1:2 64 -d
```
$ python ./scripts/rpc.py construct_target_node disk1 "Data Disk1" "Malloc0:0 Malloc1:1" 1:2 64 -d
```
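To verify the configuration, the target node just created can be listed back using the `iscsi_get_target_nodes` RPC described earlier:

~~~bash
./scripts/rpc.py iscsi_get_target_nodes
~~~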
#### Configure initiator

Discover target

~~~bash
~~~
$ iscsiadm -m discovery -t sendtargets -p 10.0.0.1
10.0.0.1:3260,1 iqn.2016-06.io.spdk:disk1
~~~

Connect to the target

~~~bash
iscsiadm -m node --login
~~~
$ iscsiadm -m node --login
~~~

At this point the iSCSI target should show up as SCSI disks.

Check dmesg to see what they came up as. In this example it can look like below:

~~~bash
~~~
...
[630111.860078] scsi host68: iSCSI Initiator over TCP/IP
[630112.124743] scsi 68:0:0:0: Direct-Access     INTEL    Malloc disk      0001 PQ: 0 ANSI: 5
@ -265,18 +298,17 @@ Check dmesg to see what they came up as. In this example it can look like below:
You may also use a simple bash command to find /dev/sdX nodes for each iSCSI LUN
in all logged iSCSI sessions:

~~~bash
~~~
$ iscsiadm -m session -P 3 | grep "Attached scsi disk" | awk '{print $4}'
sdd
sde
~~~

## iSCSI Hotplug {#iscsi_hotplug}

# iSCSI Hotplug {#iscsi_hotplug}

At the iSCSI level, we provide the following support for Hotplug:

1. bdev/nvme:

At the bdev/nvme level, we start one hotplug monitor which will call
spdk_nvme_probe() periodically to get the hotplug events. We provide the
private attach_cb and remove_cb for spdk_nvme_probe(). For the attach_cb,
@ -286,46 +318,17 @@ upper level stack (for iSCSI target, the upper level stack is scsi/lun) to
handle the hot-remove event.

2. scsi/lun:

When the LUN receives the hot-remove notification from the block device layer,
the LUN will be marked as removed, and all the IOs after this point will
return with check condition status. Then the LUN starts one poller which will
wait for all the commands which have already been submitted to the block device to
return; after all the commands return, the LUN will be deleted.

## Known bugs and limitations {#iscsi_hotplug_bugs}

For write commands, if you want to test hotplug with a write command which will
cause r2t, for example a 1M size IO, it will crash the iscsi tgt.
For read commands, if you want to test hotplug with a large read IO, for example a 1M
size IO, it will probably crash the iscsi tgt.

@sa spdk_nvme_probe

## iSCSI Login Redirection {#iscsi_login_redirection}

The SPDK iSCSI target application supports the iSCSI login redirection feature.

A portal refers to an IP address and TCP port number pair, and a portal group
contains a set of portals. Users of the SPDK iSCSI target application configure
portals through portal groups.

To support the login redirection feature, we utilize two types of portal groups,
the public portal group and the private portal group.

The SPDK iSCSI target application usually has a discovery portal. The discovery
portal is connected by an initiator to get a list of targets, as well as the list
of portals on which these targets may be accessed, by a discovery session.

Public portal groups have their portals returned by a discovery session. Private
portal groups do not have their portals returned by a discovery session. A public
portal group may optionally have a redirect portal for non-discovery logins for
each associated target. This redirect portal must be from a private portal group.

Initiators configure portals in public portal groups as target portals. When an
initiator logs in to a target through a portal in an associated public portal group,
the target sends a temporary redirection response with a redirect portal. Then the
initiator logs in to the target again through the redirect portal.

Users set a portal group to public or private at creation using the
`iscsi_create_portal_group` RPC, associate portal groups with a target using the
`iscsi_create_target_node` RPC or the `iscsi_target_node_add_pg_ig_maps` RPC,
specify an up-to-date redirect portal in a public portal group for a target using
the `iscsi_target_node_set_redirect` RPC, and terminate the corresponding connections
by asynchronous logout request using the `iscsi_target_node_request_logout` RPC.

Typically users will use the login redirection feature in a scale-out iSCSI target
system, which runs multiple SPDK iSCSI target applications.
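As a sketch of how these RPCs fit together in a login redirection setup (the flag names below are assumptions for illustration only; check `scripts/rpc.py <method> --help` for the exact syntax):

~~~bash
# Public portal group 1 receives initial logins; private group 2 is the redirect destination.
./scripts/rpc.py iscsi_create_portal_group 1 10.0.0.1:3260
./scripts/rpc.py iscsi_create_portal_group -p 2 10.0.0.1:3261    # '-p' (private) is an assumed flag
# Point non-discovery logins for a target at the private portal, then log out existing connections.
./scripts/rpc.py iscsi_target_node_set_redirect iqn.2016-06.io.spdk:disk1 1 -a 10.0.0.1 -p 3261
./scripts/rpc.py iscsi_target_node_request_logout iqn.2016-06.io.spdk:disk1 -t 1
~~~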
9367 doc/jsonrpc.md
File diff suppressed because it is too large
@ -1,8 +1,6 @@
# JSON-RPC Remote access {#jsonrpc_proxy}

SPDK provides a sample python script `rpc_http_proxy.py`, that provides http server which listens for JSON
objects from users. It uses HTTP POST method to receive JSON objects including methods and parameters
described in this chapter.
SPDK provides a sample python script `rpc_http_proxy.py`, that provides http server which listens for JSON objects from users. It uses HTTP POST method to receive JSON objects including methods and parameters described in this chapter.

## Parameters

@ -28,15 +26,14 @@ Status 200 with resultant JSON object included on success.

## Client side

Below is a sample python script acting as a client side. It sends `bdev_get_bdevs` method with optional `name`
parameter and prints JSON object returned from remote_rpc script.
Below is a sample python script acting as a client side. It sends `get_bdevs` method with optional `name` parameter and prints JSON object returned from remote_rpc script.

~~~python
~~~
import json
import requests

if __name__ == '__main__':
    payload = {'id':1, 'method': 'bdev_get_bdevs', 'params': {'name': 'Malloc0'}}
    payload = {'id':1, 'method': 'get_bdevs', 'params': {'name': 'Malloc0'}}
    url = 'http://192.168.0.2:8000/'
    req = requests.post(url,
                        data=json.dumps(payload),
@ -48,10 +45,7 @@ if __name__ == '__main__':

Output:

~~~python
python client.py
[{u'num_blocks': 2621440, u'name': u'Malloc0', u'uuid': u'fb57e59c-599d-42f1-8b89-3e46dbe12641', u'claimed': True,
u'driver_specific': {}, u'supported_io_types': {u'reset': True, u'nvme_admin': False, u'unmap': True, u'read': True,
u'nvme_io': False, u'write': True, u'flush': True, u'write_zeroes': True}, u'qos_ios_per_sec': 0, u'block_size': 4096,
u'product_name': u'Malloc disk', u'aliases': []}]
~~~
python client.py
[{u'num_blocks': 2621440, u'name': u'Malloc0', u'uuid': u'fb57e59c-599d-42f1-8b89-3e46dbe12641', u'claimed': True, u'driver_specific': {}, u'supported_io_types': {u'reset': True, u'nvme_admin': False, u'unmap': True, u'read': True, u'nvme_io': False, u'write': True, u'flush': True, u'write_zeroes': True}, u'qos_ios_per_sec': 0, u'block_size': 4096, u'product_name': u'Malloc disk', u'aliases': []}]
~~~
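The same request can also be issued without python, e.g. with `curl` (the address, port, and credentials below are placeholders matching the sample above; the proxy is started with a user name and password, so basic-auth credentials are assumed here):

~~~bash
curl -u user:password \
     -d '{"id": 1, "method": "bdev_get_bdevs", "params": {"name": "Malloc0"}}' \
     http://192.168.0.2:8000/
~~~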
214 doc/libraries.md
@ -1,214 +0,0 @@
# SPDK Libraries {#libraries}

The SPDK repository is, first and foremost, a collection of high-performance
storage-centric software libraries. With this in mind, much care has been taken
to ensure that these libraries have consistent and robust naming and versioning
conventions. The libraries themselves are also divided across two directories
(`lib` and `module`) inside of the SPDK repository in a deliberate way to prevent
mixing of SPDK event framework dependent code and lower level libraries. This document
is aimed at explaining the structure, naming conventions, versioning scheme, and use cases
of the libraries contained in these two directories.

## Directory Structure {#structure}

The SPDK libraries are divided into two directories. The `lib` directory contains the base libraries that
compose SPDK. Some of these base libraries define plug-in systems. Instances of those plug-ins are called
modules and are located in the `module` directory. For example, the `spdk_sock` library is contained in the
`lib` directory while the implementations of socket abstractions, `sock_posix` and `sock_uring`
are contained in the `module` directory.

### lib {#lib}

The libraries in the `lib` directory can be readily divided into four categories:

- Utility Libraries: These libraries contain basic, commonly used functions that make more complex
  libraries easier to implement. For example, `spdk_log` contains macro definitions that provide a
  consistent logging paradigm and `spdk_json` is a general purpose JSON parsing library.
- Protocol Libraries: These libraries contain the building blocks for a specific service. For example,
  `spdk_nvmf` and `spdk_vhost` each define the storage protocols after which they are named.
- Storage Service Libraries: These libraries provide a specific abstraction that can be mapped to somewhere
  between the physical drive and the filesystem level of your typical storage stack. For example `spdk_bdev`
  provides a general block device abstraction layer, `spdk_lvol` provides a logical volume abstraction,
  `spdk_blobfs` provides a filesystem abstraction, and `spdk_ftl` provides a flash translation layer
  abstraction.
- System Libraries: These libraries provide system level services such as a JSON based RPC service
  (see `spdk_jsonrpc`) and thread abstractions (see `spdk_thread`). The most notable library in this category
  is the `spdk_env_dpdk` library which provides a shim for the underlying Data Plane Development Kit (DPDK)
  environment and provides services like memory management.

The one library in the `lib` directory that doesn't fit into the above classification is the `spdk_event` library.
This library defines a framework used by the applications contained in the `app` and `example` directories. Much
care has been taken to keep the SPDK libraries independent from this framework. The libraries in `lib` are engineered
to allow plugging directly into independent application frameworks such as Seastar or libuv with minimal effort.

Currently there are two exceptions in the `lib` directory which still rely on `spdk_event`, `spdk_vhost` and `spdk_iscsi`.
There are efforts underway to remove all remaining dependencies these libraries have on the `spdk_event` library.

Much like the `spdk_event` library, the `spdk_env_dpdk` library has been architected in such a way that it
can be readily replaced by an alternate environment shim. More information on replacing the `spdk_env_dpdk`
module and the underlying `dpdk` environment can be found in the [environment](#env_replacement) section.

### module {#module}

The component libraries in the `module` directory represent specific implementations of the base libraries in
the `lib` directory. As with the `lib` directory, much care has been taken to avoid dependencies on the
`spdk_event` framework except for those libraries which directly implement the `spdk_event` module plugin system.

There are seven sub-directories in the `module` directory which each hold a different class of libraries. These
sub-directories can be divided into two types.

- plug-in libraries: These libraries are explicitly tied to one of the libraries in the `lib` directory and
  are registered with that library at runtime by way of a specific constructor function. The parent library in
  the `lib` directory then manages the module directly. These types of libraries each implement a function table
  defined by their parent library. The following table shows these directories and their corresponding parent
  libraries:

<center>
| module directory | parent library | dependent on event library |
|------------------|----------------|----------------------------|
| module/accel     | spdk_accel     | no                         |
| module/bdev      | spdk_bdev      | no                         |
| module/event     | spdk_event     | yes                        |
| module/sock      | spdk_sock      | no                         |
</center>

- Free libraries: These libraries are highly dependent upon a library in the `lib` directory but are not
  explicitly registered to that library via a constructor. The libraries in the `blob`, `blobfs`, and `env_dpdk`
  directories fall into this category. None of the libraries in this category depend explicitly on the
  `spdk_event` library.

## Library Conventions {#conventions}

The SPDK libraries follow strict conventions for naming functions, logging, versioning, and header files.

### Headers {#headers}

All public SPDK header files exist in the `include` directory of the SPDK repository. These headers
are divided into two sub-directories.

`include/spdk` contains headers intended to be used by consumers of the SPDK libraries. All of the
functions, variables, and types in these functions are intended for public consumption. Multiple headers
in this directory may depend upon the same underlying library and work together to expose different facets
of the library. The `spdk_bdev` library, for example, is exposed in three different headers. `bdev_module.h`
defines the interfaces a bdev module library would need to implement, `bdev.h` contains general block device
functions that would be used by an application consuming block devices exposed by SPDK, and `bdev_zone.h`
exposes zoned bdev specific functions. Many of the other libraries exhibit a similar behavior of splitting
headers between consumers of the library and those wishing to register a module with that library.

`include/spdk_internal`, as its name suggests contains header files intended to be consumed only by other
libraries inside of the SPDK repository. These headers are typically used for sharing lower level functions
between two libraries that both require similar functions. For example `spdk_internal/nvme_tcp.h` contains
low level tcp functions used by both the `spdk_nvme` and `spdk_nvmf` libraries. These headers are *NOT*
intended for general consumption.

Other header files contained directly in the `lib` and `module` directories are intended to be consumed *only*
by source files of their corresponding library. Any symbols intended to be used across libraries need to be
included in a header in the `include/spdk_internal` directory.

### Naming Conventions {#naming}

All public types and functions in SPDK libraries begin with the prefix `spdk_`. They are also typically
further namespaced using the spdk library name. The rest of the function or type name describes its purpose.

There are no internal library functions that begin with the `spdk_` prefix. This naming convention is
enforced by the SPDK continuous Integration testing. Functions not intended for use outside of their home
library should be namespaced with the name of the library only.

### Map Files {#map}

SPDK libraries can be built as both static and shared object files. To facilitate building libraries as shared
objects, each one has a corresponding map file (e.g. `spdk_nvmf` relies on `spdk_nvmf.map`). SPDK libraries
not exporting any symbols rely on a blank map file located at `mk/spdk_blank.map`.
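One way to see a map file's effect is to list the dynamic symbols that a built shared library actually exports (the build path below is an assumption; adjust it to your tree):

~~~bash
# Print the first few exported text (code) symbols of the shared library.
nm -D --defined-only build/lib/libspdk_nvmf.so | awk '$2 == "T" {print $3}' | head
~~~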
## SPDK Shared Objects {#shared_objects}

### Shared Object Versioning {#versioning}

SPDK shared objects follow a semantic versioning pattern with a major and minor version. Any changes which
break backwards compatibility (symbol removal or change) will cause a shared object major increment and
backwards compatible changes will cause a minor version increment; i.e. an application that relies on
`libspdk_nvmf.so.3.0` will be compatible with `libspdk_nvmf.so.3.1` but not with `libspdk_nvmf.so.4.0`.

Shared object versions are incremented only once between each release cycle. This means that at most, the
major version of each SPDK shared library will increment only once between each SPDK release.

There are currently no guarantees in SPDK of ABI compatibility between two major SPDK releases.

The point releases of an LTS release will be ABI compatible with the corresponding LTS major release.

Shared objects are versioned independently of one another. This means that `libspdk_nvme.so.3.0` and
`libspdk_bdev.so.3.0` do not necessarily belong to the same release. This also means that shared objects
with the same suffix are not necessarily compatible with each other. It is important to source all of your
SPDK libraries from the same repository and version to ensure inter-library compatibility.
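On any ELF platform the recorded versions can be inspected directly, for example (library and application paths are assumptions):

~~~bash
readelf -d build/lib/libspdk_nvmf.so | grep SONAME   # version the library advertises
readelf -d my_app | grep NEEDED                      # versions the application was linked against
~~~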
### Linking to Shared Objects {#so_linking}

Shared objects in SPDK are created on a per-library basis. There is a top level `libspdk.so` object
which is a linker script. It simply contains references to all of the other spdk shared objects.

There are essentially two ways of linking to SPDK libraries.

1. An application can link to the top level shared object library as follows:
   ~~~{.sh}
   gcc -o my_app ./my_app.c -lspdk -lspdk_env_dpdk -ldpdk
   ~~~

2. An application can link to only a subset of libraries by linking directly to the ones it relies on:
   ~~~{.sh}
   gcc -o my_app ./my_app.c -lpassthru_external -lspdk_event_bdev -lspdk_bdev -lspdk_bdev_malloc
   -lspdk_log -lspdk_thread -lspdk_util -lspdk_event -lspdk_env_dpdk -ldpdk
   ~~~

In the second instance, please note that applications need only link to the libraries upon which they
directly depend. All SPDK libraries have their dependencies specified at object compile time. This means
that when linking to `spdk_net`, one does not also have to specify `spdk_log`, `spdk_util`, `spdk_json`,
`spdk_jsonrpc`, and `spdk_rpc`. However, this dependency inclusion does not extend to the application
itself; i.e. if an application directly uses symbols from both `spdk_bdev` and `spdk_log`, both libraries
will need to be supplied to the linker when linking the application even though `spdk_log` is a dependency
of `spdk_bdev`.

Please also note that when linking to SPDK libraries, both the spdk_env shim library and the env library
itself need to be supplied to the linker. In the examples above, these are `spdk_env_dpdk` and `dpdk`
respectively. This was intentional and allows one to easily swap out both the environment and the
environment shim.

### Replacing the env abstraction {#env_replacement}

SPDK depends on an environment abstraction that provides crucial pinned memory management and PCIe
bus management operations. The interface for this environment abstraction is defined in the
`include/env.h` header file. The default implementation of this environment is located in `spdk_env_dpdk`.
This abstraction in turn relies upon the DPDK libraries. This two part implementation was deliberate
and allows for easily swapping out the dpdk version upon which the spdk libraries rely without making
modifications to the spdk source directly.

Any environment can replace the `spdk_env_dpdk` environment by implementing the `include/env.h` header
file. The environment can either be implemented wholesale in a single library or as a two-part
shim/implementation library system.

~~~{.sh}
# single library
gcc -o my_app ./my_app.c -lspdk -lcustom_env_implementation

# two libraries
gcc -o my_app ./my_app.c -lspdk -lcustom_env_shim -lcustom_env_implementation
~~~

## SPDK Static Objects {#static_objects}

SPDK static objects are compiled by default even when no parameters are supplied to the build system.
Unlike SPDK shared objects, the filename does not contain any versioning semantics. Linking against
static objects is similar to shared objects but will always require the use of `-Wl,--whole-archive`
as argument. This is due to the use of constructor functions in SPDK such as those to register
NVMe transports.

Due to the lack of versioning semantics, it is not recommended to install static libraries system wide.
Instead the path to these static libraries should be added as argument at compile time using
`-L/path/to/static/libs`. The use of static objects instead of shared objects can also be forced
through `-Wl,-Bstatic`, otherwise some compilers might prefer to use the shared objects if both
are available.

~~~{.sh}
gcc -o my_app ./my_app.c -L/path/to/static/libs -Wl,--whole-archive -Wl,-Bstatic -lpassthru_external
-lspdk_event_bdev -lspdk_bdev -lspdk_bdev_malloc -lspdk_log -lspdk_thread -lspdk_util -lspdk_event
-lspdk_env_dpdk -Wl,--no-whole-archive -Wl,-Bdynamic -pthread -ldpdk
~~~
142 doc/lvol.md
@ -1,48 +1,38 @@
# Logical Volumes {#logical_volumes}

The Logical Volumes library is a flexible storage space management system. It provides creating and managing virtual
block devices with variable size. The SPDK Logical Volume library is built on top of @ref blob.
The Logical Volumes library is a flexible storage space management system. It provides creating and managing virtual block devices with variable size. The SPDK Logical Volume library is built on top of @ref blob.

## Terminology {#lvol_terminology}

# Terminology {#lvol_terminology}

### Logical volume store {#lvs}

## Logical volume store {#lvs}

* Shorthand: lvolstore, lvs
* Type name: struct spdk_lvol_store

A logical volume store uses the super blob feature of blobstore to hold uuid (and in future other metadata).
Blobstore types are implemented in blobstore itself, and saved on disk. An lvolstore will generate a UUID on
creation, so that it can be uniquely identified from other lvolstores.
By default when creating lvol store data region is unmapped. Optional --clear-method parameter can be passed
on creation to change that behavior to writing zeroes or performing no operation.
A logical volume store uses the super blob feature of blobstore to hold uuid (and in future other metadata). Blobstore types are implemented in blobstore itself, and saved on disk. An lvolstore will generate a UUID on creation, so that it can be uniquely identified from other lvolstores.
By default when creating lvol store data region is unmapped. Optional --clear-method parameter can be passed on creation to change that behavior to writing zeroes or performing no operation.

### Logical volume {#lvol}

## Logical volume {#lvol}

* Shorthand: lvol
* Type name: struct spdk_lvol

A logical volume is implemented as an SPDK blob created from an lvolstore. An lvol is uniquely identified by
its UUID. Additionally, an lvol can have an alias name.
A logical volume is implemented as an SPDK blob created from an lvolstore. An lvol is uniquely identified by its UUID. Additionally, an lvol can have an alias name.

### Logical volume block device {#lvol_bdev}

## Logical volume block device {#lvol_bdev}

* Shorthand: lvol_bdev
* Type name: struct spdk_lvol_bdev

Representation of an SPDK block device (spdk_bdev) with an lvol implementation.
A logical volume block device translates generic SPDK block device I/O (spdk_bdev_io) operations into the
equivalent SPDK blob operations. Combination of lvol name and lvolstore name gives lvol_bdev alias name in
a form "lvs_name/lvol_name". block_size of the created bdev is always 4096, due to blobstore page size.
Cluster_size is configurable by parameter. Size of the new bdev will be rounded up to nearest multiple of
cluster_size. By default lvol bdevs claim part of lvol store equal to their set size. When thin provision
option is enabled, no space is taken from lvol store until data is written to lvol bdev.
By default when deleting lvol bdev or resizing down, allocated clusters are unmapped. Optional --clear-method
parameter can be passed on creation to change that behavior to writing zeroes or performing no operation.
A logical volume block device translates generic SPDK block device I/O (spdk_bdev_io) operations into the equivalent SPDK blob operations. Combination of lvol name and lvolstore name gives lvol_bdev alias name in a form "lvs_name/lvol_name". block_size of the created bdev is always 4096, due to blobstore page size. Cluster_size is configurable by parameter.
Size of the new bdev will be rounded up to nearest multiple of cluster_size.
By default lvol bdevs claim part of lvol store equal to their set size. When thin provision option is enabled, no space is taken from lvol store until data is written to lvol bdev.
By default when deleting lvol bdev or resizing down, allocated clusters are unmapped. Optional --clear-method parameter can be passed on creation to change that behavior to writing zeroes or performing no operation.

### Thin provisioning {#lvol_thin_provisioning}

## Thin provisioning {#lvol_thin_provisioning}

Thin provisioned lvols rely on dynamic cluster allocation (e.g. when the first write operation on a cluster is performed), only space
required to store data is used and unallocated clusters are obtained from underlying device (e.g. zeroes_dev).
Thin provisioned lvols rely on dynamic cluster allocation (e.g. when the first write operation on a cluster is performed), only space required to store data is used and unallocated clusters are obtained from underlying device (e.g. zeroes_dev).

Sample write operations of thin provisioned blob are shown on the diagram below:

@ -52,13 +42,11 @@ Sample read operations and the structure of thin provisioned blob are shown on t

[image omitted in extraction]

### Snapshots and clone {#lvol_snapshots}

## Snapshots and clone {#lvol_snapshots}

Logical volumes support snapshots and clones functionality. User may at any given time create snapshot of existing
logical volume to save a backup of current volume state. When creating snapshot original volume becomes thin provisioned
and saves only incremental differences from its underlying snapshot. This means that every read from unallocated cluster
is actually a read from the snapshot and every write to unallocated cluster triggers new cluster allocation and data copy
from corresponding cluster in snapshot to the new cluster in logical volume before the actual write occurs.
Logical volumes support snapshots and clones functionality. User may at any given time create snapshot of existing logical volume to save a backup of current volume state.
When creating snapshot original volume becomes thin provisioned and saves only incremental differences from its underlying snapshot. This means that every read from unallocated cluster is actually a read from the snapshot and
every write to unallocated cluster triggers new cluster allocation and data copy from corresponding cluster in snapshot to the new cluster in logical volume before the actual write occurs.

The read operation is performed as shown in the diagram below:
[image omitted in extraction]
@ -66,51 +54,29 @@ The read operation is performed as shown in the diagram below:
The write operation is performed as shown in the diagram below:
[image omitted in extraction]

User may also create clone of existing snapshot that will be thin provisioned and it will behave in the same way as logical volume
from which snapshot is created. There is no limit of clones and snapshots that may be created as long as there is enough space on
logical volume store. Snapshots are read only. Clones may be created only from snapshots or read only logical volumes.
User may also create clone of existing snapshot that will be thin provisioned and it will behave in the same way as logical volume from which snapshot is created.
There is no limit of clones and snapshots that may be created as long as there is enough space on logical volume store. Snapshots are read only. Clones may be created only from snapshots or read only logical volumes.

A snapshot can be removed only if there is a single clone on top of it. The relation chain will be updated accordingly.
The cluster map of clone and snapshot will be merged and entries for unallocated clusters in the clone will be updated with
addresses from the snapshot cluster map. The entire operation modifies metadata only - no data is copied during this process.
## Inflation {#lvol_inflation}

### External Snapshots

With the external snapshots feature, clones can be made of any bdev. These clones are commonly called *esnap clones*.
Esnap clones work very similarly to thin provisioning. Rather than the back device being a zeroes device, the external snapshot
bdev is used as the back device.

[image omitted in extraction]

A bdev that is used as an external snapshot cannot be opened for writing by anything else so long as an esnap clone exists.

A bdev may have multiple esnap clones and esnap clones can themselves be snapshotted and cloned.

### Inflation {#lvol_inflation}

Blobs can be inflated to copy data from backing devices (e.g. snapshots) and allocate all remaining clusters. As a result of this
operation all dependencies for the blob are removed.
Blobs can be inflated to copy data from backing devices (e.g. snapshots) and allocate all remaining clusters. As a result of this operation all dependencies for the blob are removed.

[image omitted in extraction]

### Decoupling {#lvol_decoupling}

## Decoupling {#lvol_decoupling}

Blobs can be decoupled from their parent blob by copying data from backing devices (e.g. snapshots) for all allocated clusters.
Remaining unallocated clusters are kept thin provisioned.
Note: When decouple is performed, only single dependency is removed. To remove all dependencies in a chain of blobs depending
on each other, multiple calls need to be issued.
Blobs can be decoupled from all dependencies by copying data from backing devices (e.g. snapshots) for all allocated clusters. Remaining unallocated clusters are kept thin provisioned.

## Configuring Logical Volumes

# Configuring Logical Volumes

There is no static configuration available for logical volumes. All configuration is done through RPC. Information about
logical volumes is kept on block devices.
There is no static configuration available for logical volumes. All configuration is done through RPC. Information about logical volumes is kept on block devices.

## RPC overview {#lvol_rpc}

# RPC overview {#lvol_rpc}

RPC regarding lvolstore:

```bash
bdev_lvol_create_lvstore [-h] [-c CLUSTER_SZ] bdev_name lvs_name
```
construct_lvol_store [-h] [-c CLUSTER_SZ] bdev_name lvs_name
    Constructs lvolstore on specified bdev with specified name. During
    construction bdev is unmapped at initialization and all data is
    erased. Then original bdev is claimed by
@ -120,20 +86,20 @@ bdev_lvol_create_lvstore [-h] [-c CLUSTER_SZ] bdev_name lvs_name
    -h  show help
    -c CLUSTER_SZ  Specifies the size of cluster. By default its 4MiB.
    --clear-method  specify data region clear method "none", "unmap" (default), "write_zeroes"
bdev_lvol_delete_lvstore [-h] [-u UUID] [-l LVS_NAME]
destroy_lvol_store [-h] [-u UUID] [-l LVS_NAME]
    Destroy lvolstore on specified bdev. Removes lvolstore along with lvols on
    it. User can identify lvol store by UUID or its name. Note that destroying
    lvolstore requires using this call, while deleting single lvol requires
    using bdev_lvol_delete rpc call.
    using destroy_lvol_bdev rpc call.
    optional arguments:
    -h, --help  show help
bdev_lvol_get_lvstores [-h] [-u UUID] [-l LVS_NAME]
get_lvol_stores [-h] [-u UUID] [-l LVS_NAME]
    Display current logical volume store list
    optional arguments:
    -h, --help  show help
    -u UUID, --uuid UUID  show details of specified lvol store
    -l LVS_NAME, --lvs_name LVS_NAME  show details of specified lvol store
bdev_lvol_rename_lvstore [-h] old_name new_name
rename_lvol_store [-h] old_name new_name
    Change logical volume store name
    optional arguments:
    -h, --help  show this help message and exit
@ -141,8 +107,8 @@ bdev_lvol_rename_lvstore [-h] old_name new_name

RPC regarding lvol and spdk bdev:

```bash
bdev_lvol_create [-h] [-u UUID] [-l LVS_NAME] [-t] [-c CLEAR_METHOD] lvol_name size
```
construct_lvol_bdev [-h] [-u UUID] [-l LVS_NAME] [-t] [-c CLEAR_METHOD] lvol_name size
    Creates lvol with specified size and name on lvolstore specified by its uuid
    or name. Then constructs spdk bdev on top of that lvol and presents it as spdk bdev.
    User may use -t switch to create thin provisioned lvol.
@ -150,59 +116,41 @@ bdev_lvol_create [-h] [-u UUID] [-l LVS_NAME] [-t] [-c CLEAR_METHOD] lvol_name s
    optional arguments:
    -h, --help  show help
    -c, --clear-method  specify data clusters clear method "none", "unmap" (default), "write_zeroes"
bdev_lvol_get_lvols [-h] [-u LVS_UUID] [-l LVS_NAME]
    Display logical volume list, including those that do not have associated bdevs.
    optional arguments:
    -h, --help  show help
    -u LVS_UUID, --lvs_uuid UUID  show volumes only in the specified lvol store
    -l LVS_NAME, --lvs_name LVS_NAME  show volumes only in the specified lvol store
bdev_get_bdevs [-h] [-b NAME]
get_bdevs [-h] [-b NAME]
    User can view created bdevs using this call including those created on top of lvols.
    optional arguments:
    -h, --help  show help
    -b NAME, --name NAME  Name of the block device. Example: Nvme0n1
bdev_lvol_delete [-h] bdev_name
    Deletes a logical volume previously created by bdev_lvol_create.
destroy_lvol_bdev [-h] bdev_name
    Deletes a logical volume previously created by construct_lvol_bdev.
    optional arguments:
    -h, --help  show help
bdev_lvol_snapshot [-h] lvol_name snapshot_name
snapshot_lvol_bdev [-h] lvol_name snapshot_name
    Create a snapshot with snapshot_name of a given lvol bdev.
    optional arguments:
    -h, --help  show help
bdev_lvol_clone [-h] snapshot_name clone_name
clone_lvol_bdev [-h] snapshot_name clone_name
    Create a clone with clone_name of a given lvol snapshot.
    optional arguments:
    -h, --help  show help
bdev_lvol_clone_bdev [-h] bdev_name_or_uuid lvs_name clone_name
    Create a clone with clone_name of a bdev. The bdev must not be an lvol in the lvs_name lvstore.
    optional arguments:
    -h, --help  show help
bdev_lvol_rename [-h] old_name new_name
rename_lvol_bdev [-h] old_name new_name
    Change lvol bdev name
    optional arguments:
    -h, --help  show help
bdev_lvol_resize [-h] name size
resize_lvol_bdev [-h] name size
    Resize existing lvol bdev
    optional arguments:
    -h, --help  show help
bdev_lvol_set_read_only [-h] name
set_read_only_lvol_bdev [-h] name
    Mark lvol bdev as read only
    optional arguments:
    -h, --help  show help
bdev_lvol_inflate [-h] name
inflate_lvol_bdev [-h] name
    Inflate lvol bdev
    optional arguments:
    -h, --help  show help
bdev_lvol_decouple_parent [-h] name
decouple_parent_lvol_bdev [-h] name
    Decouple parent of a logical volume
    optional arguments:
    -h, --help  show help
bdev_lvol_set_xattr [-h] name xattr_name xattr_value
    Set xattr for lvol bdev
    optional arguments:
    -h, --help  show help
bdev_lvol_get_xattr [-h] name xattr_name
    Get xattr for lvol bdev
    optional arguments:
    -h, --help  show help
```
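Putting the RPCs above together, a minimal end-to-end sequence might look like the following sketch (the bdev name and sizes are arbitrary examples; confirm size units against your rpc.py's help output):

~~~bash
./scripts/rpc.py bdev_malloc_create -b Malloc0 256 512     # backing bdev: 256 MB, 512-byte blocks
./scripts/rpc.py bdev_lvol_create_lvstore Malloc0 lvs0     # lvolstore on top of it
./scripts/rpc.py bdev_lvol_create -l lvs0 -t lvol0 64      # 64 MB thin provisioned lvol
./scripts/rpc.py bdev_lvol_snapshot lvs0/lvol0 snap0       # snapshot of the lvol
./scripts/rpc.py bdev_lvol_clone lvs0/snap0 clone0         # clone of the snapshot
./scripts/rpc.py bdev_lvol_get_lvstores                    # inspect the resulting lvolstore
~~~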
@ -1,4 +1,4 @@
# Direct Memory Access (DMA) From User Space {#memory}
# Memory Management for User Space Drivers {#memory}

The following is an attempt to explain why all data buffers passed to SPDK must
be allocated using spdk_dma_malloc() or its siblings, and why SPDK relies on
@ -92,7 +92,7 @@ SPDK must be allocated using spdk_dma_malloc() or its siblings. The buffers
must be allocated specifically so that they are pinned and so that physical
addresses are known.

## IOMMU Support

# IOMMU Support

Many platforms contain an extra piece of hardware called an I/O Memory
Management Unit (IOMMU). An IOMMU is much like a regular MMU, except it
@ -1,5 +1,3 @@
# Miscellaneous {#misc}

- @subpage peer_2_peer
- @subpage containers
- @subpage rpms
@ -1,40 +0,0 @@
|
||||
# Notify library {#notify}
|
||||
|
||||
The notify library implements an event bus, allowing users to register, generate,
|
||||
and listen for events. For example, the bdev library may register a new event type
|
||||
for bdev creation. Any time a bdev is created, it "sends" the event. Consumers of
|
||||
that event may periodically poll for new events to retrieve them.
|
||||
The event bus is implemented as a circular ring of fixed size. If event consumers
|
||||
do not poll frequently enough, events may be lost. All events are identified by a
|
||||
monotonically increasing integer, so missing events may be detected, although
|
||||
not recovered.
|
||||
|
||||
## Register event types {#notify_register}
|
||||
|
||||
During initialization the sender library should register its own event types using
|
||||
`spdk_notify_type_register(const char *type)`. Parameter 'type' is the name of
|
||||
notification type.
|
||||
|
||||
## Get info about events {#notify_get_info}
|
||||
|
||||
A consumer can get information about the available event types during runtime using
|
||||
`spdk_notify_foreach_type`, which iterates over registered notification types and
|
||||
calls a callback on each of them, so that user can produce detailed information
|
||||
about notification.
|
||||
|
||||
## Get new events {#notify_listen}
|
||||
|
||||
A consumer can get events by calling function `spdk_notify_foreach_event`.
|
||||
The caller should specify last received event and the maximum number of invocations.
|
||||
There might be multiple consumers of each event. The event bus is implemented as a
|
||||
circular buffer, so older events may be overwritten by newer ones.
|
||||
|
||||
## Send events {#notify_send}
|
||||
|
||||
When an event occurs, a library can invoke `spdk_notify_send` with two strings.
|
||||
One containing the type of the event, like "spdk_bdev_register", second with context,
|
||||
for example "Nvme0n1"
|
||||
|
||||
## RPC Calls {#rpc_calls}
|
||||
|
||||
See [JSON-RPC documentation](jsonrpc.md/#rpc_notify_get_types)

doc/nvme-cli.md (new file, 87 lines)
@ -0,0 +1,87 @@
# nvme-cli {#nvme-cli}

# nvme-cli with SPDK Getting Started Guide

nvme-cli now supports both the kernel driver and the SPDK user-mode driver for most of its available commands and
the Intel-specific commands.

1. Clone the nvme-cli repository from the SPDK GitHub fork. Make sure you check out the spdk-1.6 branch.
~~~{.sh}
git clone -b spdk-1.6 https://github.com/spdk/nvme-cli.git
~~~

2. Clone the SPDK repository from https://github.com/spdk/spdk under the nvme-cli folder.

3. Refer to the "README.md" under the SPDK folder to properly build SPDK.

4. Refer to the "README.md" under the nvme-cli folder to properly build nvme-cli.

5. Execute "<spdk_folder>/scripts/setup.sh" with the "root" account.

6. Update the "spdk.conf" file under the nvme-cli folder to properly configure SPDK. Notes are as follows:
~~~{.sh}
spdk=1
Indicates whether or not to use SPDK. Can be 0 (off) or 1 (on).
Defaults to 1, which assumes that you have run "<spdk_folder>/scripts/setup.sh", unbinding your drives from the kernel.

core_mask=0x1
A bitmask representing which core(s) to use for nvme-cli operations.
Defaults to core 0.

mem_size=512
The amount of reserved hugepage memory to use for nvme-cli (in MB).
Defaults to 512 MB.

shm_id=0
Indicates the shared memory ID for the SPDK application with which your NVMe drives are associated;
it should be adjusted accordingly.
Defaults to 0.
~~~
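Putting the documented defaults together, a minimal spdk.conf sketch (all four values are the defaults listed above; adjust to your system):

~~~{.sh}
spdk=1
core_mask=0x1
mem_size=512
shm_id=0
~~~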

7. Run the "./nvme list" command to get the domain:bus:device.function for each found NVMe SSD.

8. Run the other nvme commands with domain:bus:device.function instead of "/dev/nvmeX" for the specified device.
~~~{.sh}
Example: ./nvme smart-log 0000:01:00.0
~~~

9. Run the "./nvme intel" commands for Intel-specific commands against Intel NVMe SSDs.
~~~{.sh}
Example: ./nvme intel internal-log 0000:08:00.0
~~~

10. Execute "<spdk_folder>/scripts/setup.sh reset" with the "root" account and update "spdk=0" in spdk.conf to
use the kernel driver, if desired.

## Use scenarios

### Run as the only SPDK application on the system

1. Set "spdk=1" in spdk.conf. If the system has fewer cores or less memory, update spdk.conf accordingly.

### Run together with other running SPDK applications on shared NVMe SSDs

1. Start the other running SPDK application with a parameter like "-i 1" so that both use the same "shm_id".

2. Use the spdk.conf setting "shm_id=1" to start nvme-cli.

3. If other SPDK applications run with a different shm_id parameter, update "spdk.conf" accordingly.

### Run with other running SPDK applications on non-shared NVMe SSDs

1. Properly configure the other running SPDK applications, so that they:
~~~{.sh}
a. Only access the NVMe SSDs they want.
b. Allocate a fixed amount of memory instead of all available memory.
~~~

2. Properly configure the spdk.conf settings for nvme-cli, so that it:
~~~{.sh}
a. Does not access the NVMe SSDs used by other SPDK applications.
b. Changes mem_size to a proper size.
~~~

## Note

1. To run the newly built nvme-cli, either run it explicitly as "./nvme" or add it to $PATH to avoid
invoking another already-installed version.

2. To run the newly built nvme-cli with SPDK support in an arbitrary directory, copy "spdk.conf" to that
directory from the nvme-cli folder and update the configuration as suggested.
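As a concrete sketch of note 2 (the paths here are illustrative):

~~~{.sh}
cp <nvme-cli_folder>/spdk.conf /home/user/tools/
cd /home/user/tools
<nvme-cli_folder>/nvme list
~~~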

doc/nvme.md (228 lines changed)
@ -1,18 +1,16 @@
# NVMe Driver {#nvme}

## In this document {#nvme_toc}
# In this document {#nvme_toc}

- @ref nvme_intro
- @ref nvme_examples
- @ref nvme_interface
- @ref nvme_design
- @ref nvme_fabrics_host
- @ref nvme_multi_process
- @ref nvme_hotplug
- @ref nvme_cuse
- @ref nvme_led
* @ref nvme_intro
* @ref nvme_examples
* @ref nvme_interface
* @ref nvme_design
* @ref nvme_fabrics_host
* @ref nvme_multi_process
* @ref nvme_hotplug

## Introduction {#nvme_intro}
# Introduction {#nvme_intro}
The NVMe driver is a C library that may be linked directly into an application
that provides direct, zero-copy data transfer to and from
@ -30,23 +28,23 @@ devices via NVMe over Fabrics. Users may now call spdk_nvme_probe() on both
local PCI busses and on remote NVMe over Fabrics discovery services. The API is
otherwise unchanged.

## Examples {#nvme_examples}
# Examples {#nvme_examples}

### Getting Started with Hello World {#nvme_helloworld}
## Getting Started with Hello World {#nvme_helloworld}

There are a number of examples provided that demonstrate how to use the NVMe
library. They are all in the [examples/nvme](https://github.com/spdk/spdk/tree/master/examples/nvme)
directory in the repository. The best place to start is
[hello_world](https://github.com/spdk/spdk/blob/master/examples/nvme/hello_world/hello_world.c).

### Running Benchmarks with Fio Plugin {#nvme_fioplugin}
## Running Benchmarks with Fio Plugin {#nvme_fioplugin}

SPDK provides a plugin to the very popular [fio](https://github.com/axboe/fio)
tool for running some basic benchmarks. See the fio start up
[guide](https://github.com/spdk/spdk/blob/master/examples/nvme/fio_plugin/)
for more details.

### Running Benchmarks with Perf Tool {#nvme_perf}
## Running Benchmarks with Perf Tool {#nvme_perf}

The NVMe perf utility in [examples/nvme/perf](https://github.com/spdk/spdk/tree/master/examples/nvme/perf)
is one of the examples that can also be used for performance tests. The fio
@ -80,7 +78,7 @@ perf -q 1 -o 4096 -w write -r 'trtype:PCIe traddr:0000:04:00.0' -t 300 -e 'PRACT
perf -q 1 -o 4096 -w read -r 'trtype:PCIe traddr:0000:04:00.0' -t 200 -e 'PRACT=0,PRCKH=GUARD'
~~~
## Public Interface {#nvme_interface}
# Public Interface {#nvme_interface}

- spdk/nvme.h

@ -104,9 +102,9 @@ spdk_nvme_ctrlr_process_admin_completions() | @copybrief spdk_nvme_ctrlr_process
spdk_nvme_ctrlr_cmd_io_raw() | @copybrief spdk_nvme_ctrlr_cmd_io_raw()
spdk_nvme_ctrlr_cmd_io_raw_with_md() | @copybrief spdk_nvme_ctrlr_cmd_io_raw_with_md()

## NVMe Driver Design {#nvme_design}
# NVMe Driver Design {#nvme_design}

### NVMe I/O Submission {#nvme_io_submission}
## NVMe I/O Submission {#nvme_io_submission}

I/O is submitted to an NVMe namespace using nvme_ns_cmd_xxx functions. The NVMe
driver submits the I/O request as an NVMe submission queue entry on the queue
@ -118,39 +116,7 @@ spdk_nvme_qpair_process_completions().
@sa spdk_nvme_ns_cmd_read, spdk_nvme_ns_cmd_write, spdk_nvme_ns_cmd_dataset_management,
spdk_nvme_ns_cmd_flush, spdk_nvme_qpair_process_completions

#### Fused operations {#nvme_fuses}

To "fuse" two commands, the first command should have the SPDK_NVME_IO_FLAGS_FUSE_FIRST
io flag set, and the next one should have the SPDK_NVME_IO_FLAGS_FUSE_SECOND.

In addition, the following rules must be met to execute two commands as an atomic unit:

- The commands shall be inserted next to each other in the same submission queue.
- The LBA range should be the same for the two commands.

For example, to send a fused compare-and-write operation, the user must call spdk_nvme_ns_cmd_compare
followed by spdk_nvme_ns_cmd_write and make sure no other operations are submitted
in between on the same queue, as in the example below:

~~~c
rc = spdk_nvme_ns_cmd_compare(ns, qpair, cmp_buf, 0, 1, nvme_fused_first_cpl_cb,
			      NULL, SPDK_NVME_CMD_FUSE_FIRST);
if (rc != 0) {
	...
}

rc = spdk_nvme_ns_cmd_write(ns, qpair, write_buf, 0, 1, nvme_fused_second_cpl_cb,
			    NULL, SPDK_NVME_CMD_FUSE_SECOND);
if (rc != 0) {
	...
}
~~~

The NVMe specification currently defines compare-and-write as a fused operation.
Support for compare-and-write is reported by the controller flag
SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED.
#### Scaling Performance {#nvme_scaling}
### Scaling Performance {#nvme_scaling}

NVMe queue pairs (struct spdk_nvme_qpair) provide parallel submission paths for
I/O. I/O may be submitted on multiple queue pairs simultaneously from different
@ -183,7 +149,7 @@ require that data should be done by sending a request to the owning thread.
This results in a message passing architecture, as opposed to a locking
architecture, and will result in superior scaling across CPU cores.

### NVMe Driver Internal Memory Usage {#nvme_memory_usage}
## NVMe Driver Internal Memory Usage {#nvme_memory_usage}

The SPDK NVMe driver provides a zero-copy data transfer path, which means that
there are no data buffers for I/O commands. However, some Admin commands have
@ -203,12 +169,12 @@ Each submission queue entry (SQE) and completion queue entry (CQE) consumes 64 b
and 16 bytes respectively. Therefore, the maximum memory used for each I/O queue
pair is (MQES + 1) * (64 + 16) bytes.

## NVMe over Fabrics Host Support {#nvme_fabrics_host}
# NVMe over Fabrics Host Support {#nvme_fabrics_host}

The NVMe driver supports connecting to remote NVMe-oF targets and
interacting with them in the same manner as local NVMe SSDs.

### Specifying Remote NVMe over Fabrics Targets {#nvme_fabrics_trid}
## Specifying Remote NVMe over Fabrics Targets {#nvme_fabrics_trid}

The method for connecting to a remote NVMe-oF target is very similar
to the normal enumeration process for local PCIe-attached NVMe devices.
@ -229,11 +195,11 @@ single NVM subsystem directly, the NVMe library will call `probe_cb`
for just that subsystem; this allows the user to skip the discovery step
and connect directly to a subsystem with a known address.

### RDMA Limitations
## RDMA Limitations

Please refer to the NVMe-oF target's @ref nvmf_rdma_limitations

## NVMe Multi Process {#nvme_multi_process}
# NVMe Multi Process {#nvme_multi_process}

This capability enables the SPDK NVMe driver to support multiple processes accessing the
same NVMe device. The NVMe driver allocates critical structures from shared memory, so
@ -244,16 +210,15 @@ The primary motivation for this feature is to support management tools that can
to long running applications, perform some maintenance work or gather information, and
then detach.

### Configuration {#nvme_multi_process_configuration}
## Configuration {#nvme_multi_process_configuration}
DPDK EAL allows different types of processes to be spawned, each with different permissions
on the hugepage memory used by the applications.

There are two types of processes:

1. a primary process which initializes the shared memory and has full privileges, and
2. a secondary process which can attach to the primary process by mapping its shared memory
   regions and perform NVMe operations including creating queue pairs.

This feature is enabled by default and is controlled by selecting a value for the shared
memory group ID. This ID is a positive integer and two applications with the same shared
@ -270,149 +235,34 @@ Example: identical shm_id and non-overlapping core masks
./perf -q 8 -o 131072 -w write -c 0x10 -t 60 -i 1
~~~
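For completeness, a sketch of the two cooperating perf processes implied by the hunk above (identical shm_id via `-i 1`, non-overlapping core masks via `-c`; the first invocation is reconstructed and illustrative):

~~~{.sh}
./perf -q 8 -o 131072 -w write -c 0x1 -t 60 -i 1
./perf -q 8 -o 131072 -w write -c 0x10 -t 60 -i 1
~~~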
### Limitations {#nvme_multi_process_limitations}
## Limitations {#nvme_multi_process_limitations}

1. Two processes sharing memory may not share any cores in their core mask.
2. If a primary process exits while secondary processes are still running, those processes
   will continue to run. However, a new primary process cannot be created.
3. Applications are responsible for coordinating access to logical blocks.
4. If a process exits unexpectedly, the allocated memory will be released when the last
   process exits.

@sa spdk_nvme_probe, spdk_nvme_ctrlr_process_admin_completions
## NVMe Hotplug {#nvme_hotplug}
# NVMe Hotplug {#nvme_hotplug}

At the NVMe driver level, we provide the following support for Hotplug:

1. Hotplug events detection:
   The user of the NVMe library can call spdk_nvme_probe() periodically to detect
   hotplug events. The probe_cb, followed by the attach_cb, will be called for each
   new device detected. The user may optionally also provide a remove_cb that will be
   called if a previously attached NVMe device is no longer present on the system.
   All subsequent I/O to the removed device will return an error.

2. Hot remove NVMe with IO loads:
   When a device is hot removed while I/O is occurring, all access to the PCI BAR will
   result in a SIGBUS error. The NVMe driver automatically handles this case by installing
   a SIGBUS handler and remapping the PCI BAR to a new, placeholder memory location.
   This means I/O in flight during a hot remove will complete with an appropriate error
   code and will not crash the application.

@sa spdk_nvme_probe
## NVMe Character Devices {#nvme_cuse}

### Design

For each controller, as well as each namespace, character devices are created at the
locations:
~~~{.sh}
/dev/spdk/nvmeX
/dev/spdk/nvmeXnY
...
~~~
where X is the unique SPDK NVMe controller index and Y is the namespace ID.

Requests from CUSE are handled by pthreads created when the controller and namespaces are created.
They pass the I/O or admin commands via a ring to a thread that processes them using
nvme_io_msg_process().

Ioctls that request information obtained when attaching the NVMe controller receive an
immediate response, without passing through the ring.

This interface reserves one additional qpair for sending down the I/O for each controller.
### Usage

#### Enabling CUSE support for NVMe

CUSE support is disabled by default. To enable support for NVMe-CUSE devices, first
install the required dependencies:
~~~{.sh}
sudo scripts/pkgdep.sh --fuse
~~~
Then compile SPDK with "./configure --with-nvme-cuse".

#### Creating NVMe-CUSE device

First make sure to prepare the environment (see @ref getting_started).
This includes loading the CUSE kernel module.
Any NVMe controller attached to a running SPDK application can be
exposed via the NVMe-CUSE interface. When the SPDK application is closed,
the NVMe-CUSE devices are unregistered.

~~~{.sh}
$ sudo scripts/setup.sh
$ sudo modprobe cuse
$ sudo build/bin/spdk_tgt
# Continue in another session
$ sudo scripts/rpc.py bdev_nvme_attach_controller -b Nvme0 -t PCIe -a 0000:82:00.0
Nvme0n1
$ sudo scripts/rpc.py bdev_nvme_get_controllers
[
  {
    "name": "Nvme0",
    "trid": {
      "trtype": "PCIe",
      "traddr": "0000:82:00.0"
    }
  }
]
$ sudo scripts/rpc.py bdev_nvme_cuse_register -n Nvme0
$ ls /dev/spdk/
nvme0 nvme0n1
~~~
#### Example of using nvme-cli

Most nvme-cli commands can point to a specific controller or namespace by providing a path to it.
This can be leveraged to issue commands to the SPDK NVMe-CUSE devices.

~~~{.sh}
sudo nvme id-ctrl /dev/spdk/nvme0
sudo nvme smart-log /dev/spdk/nvme0
sudo nvme id-ns /dev/spdk/nvme0n1
~~~

Note: the `nvme list` command does not display SPDK NVMe-CUSE devices;
see nvme-cli [PR #773](https://github.com/linux-nvme/nvme-cli/pull/773).

#### Examples of using smartctl

The smartctl tool recognizes the device type based on the device path. If none of the expected
patterns match, the SCSI translation layer is used to identify the device.

To use smartctl, the '-d nvme' parameter must be used in addition to the full path to
the NVMe device.

~~~{.sh}
smartctl -d nvme -i /dev/spdk/nvme0
smartctl -d nvme -H /dev/spdk/nvme1
...
~~~

### Limitations

NVMe namespaces are created as character devices, and their use may be limited for
tools expecting block devices.

Sysfs is not updated by SPDK.

SPDK NVMe CUSE creates nodes in the "/dev/spdk/" directory to explicitly differentiate
them from other devices. Tools that only search in the "/dev" directory might not work
with SPDK NVMe CUSE.

The SCSI to NVMe translation layer is not implemented. Tools that use this layer to
identify, manage or operate the device might not work properly, or their use may be limited.
## NVMe LED management {#nvme_led}

It is possible to use the ledctl(8) utility to control the state of LEDs in systems supporting
NPEM (Native PCIe Enclosure Management), even when the NVMe devices are controlled by SPDK.
However, in this case it is necessary to determine the slot device number because the block device
is unavailable. The [ledctl.sh](https://github.com/spdk/spdk/tree/master/scripts/ledctl.sh) script
can be used to help with this. It takes the name of the nvme bdev and invokes ledctl with
appropriate options.

@ -1,166 +0,0 @@
# NVMe Multipath {#nvme_multipath}

## Introduction

The NVMe bdev module supports two modes: failover and multipath. In failover mode, only one
active connection is maintained and alternate paths are connected only during the switch-over.
This can lead to delays and failed I/O reported to upper layers, but it does reduce the number
of active connections at any given time. In multipath mode, active connections are maintained for
every path and used based on a policy of either active-passive or active-active. The multipath
mode also supports Asymmetric Namespace Access (ANA) and uses that to make policy decisions.

## Design

### Multipath Mode

A user may establish connections on multiple independent paths to the same NVMe-oF subsystem
for NVMe bdevs by calling the `bdev_nvme_attach_controller` RPC multiple times with the same NVMe
bdev controller name. Additionally, the `multipath` parameter for this RPC must be set to
"multipath" when connecting the second or later paths.

For each path created by the `bdev_nvme_attach_controller` RPC, an NVMe-oF controller is created.
Then the set of namespaces presented by that controller are discovered. For each namespace found,
the NVMe bdev module attempts to match it with an existing NVMe bdev. If it finds a match, it adds
the given namespace as an alternate path. If it does not find a match, it creates a new NVMe bdev.

I/O and admin qpairs are necessary to access an NVMe-oF controller. A single admin qpair is created
and is shared by all SPDK threads. To submit I/O without taking locks, for each SPDK thread, an I/O
qpair is created as a dynamic context of an I/O channel for an NVMe-oF controller.

For each SPDK thread, the NVMe bdev module creates an I/O channel for an NVMe bdev and provides it to
the upper layer. The I/O channel for the NVMe bdev has an I/O path for each namespace. An I/O path is
an additional abstraction to submit I/O to a namespace, and consists of an I/O qpair context and a
namespace. If an NVMe bdev has multiple namespaces, an I/O channel for the NVMe bdev has a list of
multiple I/O paths. The I/O channel for the NVMe bdev has a retry I/O list and a path selection
policy.
### Path Error Recovery

If the NVMe driver detects an error on a qpair, it disconnects the qpair and reports the error to
the NVMe bdev module. The NVMe bdev module then starts resetting the corresponding NVMe-oF controller.
The NVMe-oF controller reset consists of the following steps: 1) disconnect and delete all I/O qpairs,
2) disconnect the admin qpair, 3) connect the admin qpair, 4) configure the NVMe-oF controller, and
5) create and connect all I/O qpairs.

If step 3, 4, or 5 fails, the reset reverts to step 3 and is retried after
`reconnect_delay_sec` seconds. The NVMe-oF controller is then deleted automatically if it is not
recovered within `ctrlr_loss_timeout_sec` seconds. If `ctrlr_loss_timeout_sec` is -1, it retries
indefinitely.

By default, error detection on a qpair is very slow for TCP and RDMA transports. For fast error
detection, a global option, `transport_ack_timeout`, is useful.
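These per-controller timeouts are supplied when attaching a path. A minimal sketch reusing the flag spellings from the Usage section below, where `-l` carries `ctrlr_loss_timeout_sec` and `-o` carries `reconnect_delay_sec` (that mapping is inferred from the example there):

~~~{.sh}
# Keep retrying the reconnect every 20 seconds, indefinitely (-l -1)
./scripts/rpc.py bdev_nvme_attach_controller -b Nvme0 -t rdma -a 192.168.100.8 -s 4420 -f ipv4 -n nqn.2016-06.io.spdk:cnode1 -l -1 -o 20
~~~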
### Path Selection

Multipath mode supports two path selection policies, active-passive or active-active.

For both path selection policies, only ANA optimal I/O paths are used unless there are no ANA
optimal I/O paths available.

For the active-passive policy, each I/O channel for an NVMe bdev has a cache that stores the first
found I/O path which is connected and ANA-optimal, and uses it for I/O submission. Some users may want
to specify the preferred I/O path manually. They can dynamically set the preferred I/O path using
the `bdev_nvme_set_preferred_path` RPC. Such an assignment is realized naturally by moving the
I/O path to the head of the I/O path list. By default, if the preferred I/O path is restored,
failback to it is done automatically. The automatic failback can be disabled by a global option,
`disable_auto_failback`. In this case, the `bdev_nvme_set_preferred_path` RPC can be used
to do manual failback.

The active-active policy uses the round-robin algorithm and submits an I/O to each I/O path in
circular order.
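A sketch of switching policies at runtime, reusing commands that appear verbatim in the multipath HOWTO later in this document (the bdev name is illustrative):

~~~{.sh}
# Round-robin I/O across all optimal paths of this bdev
./scripts/rpc.py bdev_nvme_set_multipath_policy -b Nvme0n1 -p active_active
# Inspect the resulting I/O paths
./scripts/rpc.py bdev_nvme_get_io_paths -n Nvme0n1
~~~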
### I/O Retry

The NVMe bdev module has a global option, `bdev_retry_count`, to control the number of retries when
an I/O is returned with an error. Each I/O has a retry count. If the retry count of an I/O is less than
the `bdev_retry_count`, the I/O is allowed to retry and the retry count is incremented.

NOTE: `bdev_retry_count` is not used directly, but it must be non-zero for multipath mode to fail
over to a different path, because the retry count is always checked first when an I/O is returned
with an error.

Each I/O has a timer to schedule an I/O retry at a particular time in the future. Each I/O channel
for an NVMe bdev has a sorted I/O retry list. Retried I/Os are inserted into the I/O retry list.

If an I/O is returned with an error, the I/O completion handler in the NVMe bdev module executes the
following steps:

1. If the DNR (Do Not Retry) bit is set or the retry count exceeds the limit, then complete the
   I/O with the returned error.
2. If the error is a path error, insert the I/O into the I/O retry list with no delay.
3. Otherwise, insert the I/O into the I/O retry list with the delay reported by the CRD (Command
   Retry Delay).

Then the I/O retry poller is scheduled to the closest expiration. If there is no retried I/O,
the I/O retry poller is stopped.

When submitting an I/O, there may be no available I/O path. If there is any I/O path which is
recovering, the I/O is inserted into the I/O retry list with a one-second delay. This may result in
queueing many I/Os indefinitely. To avoid such indefinite queueing, a per-NVMe-oF-controller option,
`fast_io_fail_timeout_sec`, is provided. If the corresponding NVMe-oF controller is not recovered
within `fast_io_fail_timeout_sec` seconds, the I/O is not queued to wait for recovery but is returned
with an I/O error to the upper layer.
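A sketch of raising the retry limit globally, reusing the `bdev_nvme_set_options` invocation from the multipath HOWTO later in this document (the assumption here is that `-r` maps to `bdev_retry_count`, with -1 meaning retry indefinitely):

~~~{.sh}
./scripts/rpc.py bdev_nvme_set_options -r -1
~~~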
### Asymmetric Namespace Access (ANA) Handling

If an I/O is returned with an ANA error or an ANA change notice event is received, the ANA log page
may have changed. In this case, the NVMe bdev module reads the ANA log page to check for ANA state
changes.

As described before, only ANA optimal I/O paths will be used unless there are no ANA optimal paths
available.

If an I/O path is in ANA transition, i.e., its namespace reports the ANA inaccessible state or the ANA
change state, the NVMe bdev module queues I/Os to wait until the namespace becomes accessible again.
The ANA transition should end within ANATT (ANA Transition Time) seconds. If the namespace does
not report the ANA optimized state or the ANA accessible state within ANATT seconds, I/Os are
returned with an I/O error to the upper layer.
|
||||
|
||||
The NVMe driver supports I/O timeout for submitted I/Os. The NVMe bdev module provides three
|
||||
actions when an I/O timeout is notified from the NVMe driver, ABORT, RESET, or NONE. Users can
|
||||
choose one of the actions as a global option, `action_on_timeout`. Users can set different timeout
|
||||
values for I/O commands and admin commands by global options, `timeout_us` and `timeout_admin_us`.
|
||||
|
||||
For ABORT, the NVMe bdev module tries aborting the timed out I/O, and if failed, it starts the
|
||||
NVMe-oF controller reset. For RESET, the NVMe bdev module starts the NVMe-oF controller reset.
|
||||
|
||||
## Usage

The following is an example of attaching two NVMe-oF controllers and aggregating them into a single
NVMe bdev controller `Nvme0`.

```bash
./scripts/rpc.py bdev_nvme_attach_controller -b Nvme0 -t rdma -a 192.168.100.8 -s 4420 -f ipv4 -n nqn.2016-06.io.spdk:cnode1 -l -1 -o 20
./scripts/rpc.py bdev_nvme_attach_controller -b Nvme0 -t rdma -a 192.168.100.9 -s 4420 -f ipv4 -n nqn.2016-06.io.spdk:cnode1 -l -1 -o 20 -x multipath
```

In this example, if these two NVMe-oF controllers have a shared namespace whose namespace ID is 1,
a single NVMe bdev `Nvme0n1` is created. For the NVMe bdev module, the default value of
`bdev_retry_count` is 3 and I/O retry is enabled by default. `ctrlr_loss_timeout_sec` is set to -1
and `reconnect_delay_sec` is set to 20. Hence, the NVMe-oF controller reconnect will be retried once
every 20 seconds until it succeeds.

To confirm that multipath is configured correctly, two RPCs, `bdev_get_bdevs` and
`bdev_nvme_get_controllers`, are available.

```bash
./scripts/rpc.py bdev_get_bdevs -b Nvme0n1
./scripts/rpc.py bdev_nvme_get_controllers -n Nvme0
```

To monitor the current multipath state, the RPC `bdev_nvme_get_io_paths` is available.

```bash
./scripts/rpc.py bdev_nvme_get_io_paths -n Nvme0n1
```

## Limitations

SPDK NVMe multipath is transport protocol independent. Heterogeneous multipath configurations (e.g.,
TCP and RDMA) are supported. However, in this type of configuration, memory domains are not available
yet, because memory domains are currently supported only by the RDMA transport.

The RPCs `bdev_get_iostat` and `bdev_nvme_get_transport_statistics` display I/O statistics, but
neither is aware of multipath.

doc/nvme_spec.md (123 lines)
@ -1,123 +0,0 @@
# Submitting I/O to an NVMe Device {#nvme_spec}

## The NVMe Specification

The NVMe specification describes a hardware interface for interacting with
storage devices. The specification includes network transport definitions for
remote storage as well as a hardware register layout for local PCIe devices.
What follows here is an overview of how an I/O is submitted to a local PCIe
device through SPDK.

NVMe devices allow host software (in our case, the SPDK NVMe driver) to allocate
queue pairs in host memory. The term "host" is used a lot, so to clarify, that's
the system that the NVMe SSD is plugged into. A queue pair consists of two
queues - a submission queue and a completion queue. These queues are more
accurately described as circular rings of fixed size entries. The submission
queue is an array of 64 byte command structures, plus 2 integers (head and tail
indices). The completion queue is similarly an array of 16 byte completion
structures, plus 2 integers (head and tail indices). There are also two 32-bit
registers involved that are called doorbells.

An I/O is submitted to an NVMe device by constructing a 64 byte command, placing
it into the submission queue at the current location of the submission queue
tail index, and then writing the new index of the submission queue tail to the
submission queue tail doorbell register. It's actually valid to copy a whole set
of commands into open slots in the ring and then write the doorbell just one
time to submit the whole batch.

There is a very detailed description of the command submission and completion
process in the NVMe specification, which is conveniently available from the main
page over at [NVM Express](https://nvmexpress.org).

Most importantly, the command itself describes the operation and also, if
necessary, a location in host memory containing a descriptor for host memory
associated with the command. This host memory is the data to be written on a
write command, or the location to place the data on a read command. Data is
transferred to or from this location using a DMA engine on the NVMe device.

The completion queue works similarly, but the device is instead the one writing
entries into the ring. Each entry contains a "phase" bit that toggles between 0
and 1 on each loop through the entire ring. When a queue pair is set up to
generate interrupts, the interrupt contains the index of the completion queue
head. However, SPDK doesn't enable interrupts and instead polls on the phase
bit to detect completions. Interrupts are very heavy operations, so polling this
phase bit is often far more efficient.
## The SPDK NVMe Driver I/O Path

Now that we know how the ring structures work, let's cover how the SPDK NVMe
driver uses them. The user is going to construct a queue pair at some early time
in the life cycle of the program, so that's not part of the "hot" path. Then,
they'll call functions like spdk_nvme_ns_cmd_read() to perform an I/O operation.
The user supplies a data buffer, the target LBA, and the length, as well as
other information like which NVMe namespace the command is targeted at and which
NVMe queue pair to use. Finally, the user provides a callback function and
context pointer that will be called when a completion for the resulting command
is discovered during a later call to spdk_nvme_qpair_process_completions().

The first stage in the driver is allocating a request object to track the operation. The
operations are asynchronous, so it can't simply track the state of the request
on the call stack. Allocating a new request object on the heap would be far too
slow, so SPDK keeps a pre-allocated set of request objects inside of the NVMe
queue pair object - `struct spdk_nvme_qpair`. The number of requests allocated to
the queue pair is larger than the actual queue depth of the NVMe submission
queue because SPDK supports a couple of key convenience features. The first is
software queueing - SPDK will allow the user to submit more requests than the
hardware queue can actually hold and SPDK will automatically queue in software.
The second is splitting. SPDK will split a request for many reasons, some of
which are outlined next. The number of request objects is configurable at queue
pair creation time and if not specified, SPDK will pick a sensible number based
on the hardware queue depth.

The second stage is building the 64 byte NVMe command itself. The command is
built into memory embedded into the request object - not directly into an NVMe
submission queue slot. Once the command has been constructed, SPDK attempts to
obtain an open slot in the NVMe submission queue. For each element in the
submission queue an object called a tracker is allocated. The trackers are
allocated in an array, so they can be quickly looked up by an index. The tracker
itself contains a pointer to the request currently occupying that slot. When a
particular tracker is obtained, the command's CID value is updated with the
index of the tracker. The NVMe specification provides that CID value in the
completion, so the request can be recovered by looking up the tracker via the
CID value and then following the pointer.

Once a tracker (slot) is obtained, the data buffer associated with it is
processed to build a PRP list. That's essentially an NVMe scatter gather list,
although it is a bit more restricted. The user provides SPDK with the virtual
address of the buffer, so SPDK has to go do a page table look up to find the
physical address (pa) or I/O virtual addresses (iova) backing that virtual
memory. A virtually contiguous memory region may not be physically contiguous,
so this may result in a PRP list with multiple elements. Sometimes this may
result in a set of physical addresses that can't actually be expressed as a
single PRP list, so SPDK will automatically split the user operation into two
separate requests transparently. For more information on how memory is managed,
see @ref memory.

The reason the PRP list is not built until a tracker is obtained is because the
PRP list description must be allocated in DMA-able memory and can be quite
large. Since SPDK typically allocates a large number of requests, we didn't want
to allocate enough space to pre-build the worst case scenario PRP list,
especially given that the common case does not require a separate PRP list at
all.

Each NVMe command has two PRP list elements embedded into it, so a separate PRP
list isn't required if the request is 4KiB (or if it is 8KiB and aligned
perfectly). Profiling shows that this section of the code is not a major
contributor to the overall CPU use.

With a tracker filled out, SPDK copies the 64 byte command into the actual NVMe
submission queue slot and then rings the submission queue tail doorbell to tell
the device to go process it. SPDK then returns back to the user, without waiting
for a completion.

The user can periodically call `spdk_nvme_qpair_process_completions()` to tell
SPDK to examine the completion queue. Specifically, it reads the phase bit of
the next expected completion slot and when it flips, looks at the CID value to
find the tracker, which points at the request object. The request object
contains a function pointer that the user provided initially, which is then
called to complete the command.

The `spdk_nvme_qpair_process_completions()` function will keep advancing to the
next completion slot until it runs out of completions, at which point it will
write the completion queue head doorbell to let the device know that it can use
the completion queue slots for new completions and return.

doc/nvmf.md (73 lines changed)
@ -3,7 +3,7 @@
@sa @ref nvme_fabrics_host
@sa @ref nvmf_tgt_tracepoints

## NVMe-oF Target Getting Started Guide {#nvmf_getting_started}
# NVMe-oF Target Getting Started Guide {#nvmf_getting_started}

The SPDK NVMe over Fabrics target is a user space application that presents block devices over a fabric
such as Ethernet, Infiniband or Fibre Channel. SPDK currently supports RDMA and TCP transports.
@ -29,11 +29,16 @@ available [here](https://downloads.openfabrics.org/OFED/).

### Prerequisites {#nvmf_prereqs}

To build nvmf_tgt with the RDMA transport, there are some additional dependencies,
which can be installed using the pkgdep.sh script.
To build nvmf_tgt with the RDMA transport, there are some additional dependencies.

Fedora:
~~~{.sh}
sudo scripts/pkgdep.sh --rdma
dnf install libibverbs-devel librdmacm-devel
~~~

Ubuntu:
~~~{.sh}
apt-get install libibverbs-dev librdmacm-dev
~~~

Then build SPDK with RDMA enabled:
@ -43,7 +48,7 @@ Then build SPDK with RDMA enabled:
make
~~~

Once built, the binary will be in `build/bin`.
Once built, the binary will be in `app/nvmf_tgt`.
### Prerequisites for InfiniBand/RDMA Verbs {#nvmf_prereqs_verbs}

@ -106,58 +111,33 @@ using 1GB hugepages or by pre-reserving memory at application startup with `--me
option. All pre-reserved memory will be registered as a single region, but won't be returned to the
system until the SPDK application is terminated.

Another known issue occurs when using the E810 NICs in RoCE mode. Specifically, the NVMe-oF target
sometimes cannot destroy a qpair, because its posted work requests don't get flushed. This can leave
the NVMe-oF target application unable to terminate cleanly.

## TCP transport support {#nvmf_tcp_transport}

The transport is built into the nvmf_tgt by default, and it does not need any special libraries.

## FC transport support {#nvmf_fc_transport}

To build nvmf_tgt with the FC transport, there is an additional FC LLD (Low Level Driver) code dependency.
Please contact your FC vendor for instructions on obtaining the FC driver module.

### Broadcom FC LLD code

The FC LLD driver for Broadcom FC NVMe capable adapters can be obtained from
https://github.com/ecdufcdrvr/bcmufctdrvr.

### Fetch FC LLD module and then build SPDK with FC enabled

After cloning the SPDK repo and initializing submodules, build the FC LLD library, which can then be
linked with the FC transport:

~~~{.sh}
git clone https://github.com/spdk/spdk --recursive
git clone https://github.com/ecdufcdrvr/bcmufctdrvr fc
cd fc
make DPDK_DIR=../spdk/dpdk/build SPDK_DIR=../spdk
cd ../spdk
./configure --with-fc=../fc/build
make
~~~
## Configuring the SPDK NVMe over Fabrics Target {#nvmf_config}

An NVMe over Fabrics target can be configured using JSON RPCs.
The basic RPCs needed to configure the NVMe-oF subsystem are detailed below. More information about
working with NVMe over Fabrics specific RPCs can be found on the @ref jsonrpc_components_nvmf_tgt RPC page.

Using .ini style configuration files for configuration of the NVMe-oF target is deprecated and should
be replaced with JSON based RPCs. .ini style configuration files can be converted to JSON format by way
of the new script `scripts/config_converter.py`.

### Using RPCs {#nvmf_config_rpc}

Start the nvmf_tgt application with elevated privileges. Once the target is started,
the nvmf_create_transport rpc can be used to initialize a given transport. Below is an
example where the target is started and configured with two different transports.
The RDMA transport is configured with an I/O unit size of 8192 bytes, max I/O size 131072 and an
in capsule data size of 8192 bytes. The TCP transport is configured with an I/O unit size of
The RDMA transport is configured with an I/O unit size of 8192 bytes, 4 max qpairs per controller,
and an in capsule data size of 0 bytes. The TCP transport is configured with an I/O unit size of
16384 bytes, 8 max qpairs per controller, and an in capsule data size of 8192 bytes.

~~~{.sh}
build/bin/nvmf_tgt
scripts/rpc.py nvmf_create_transport -t RDMA -u 8192 -i 131072 -c 8192
scripts/rpc.py nvmf_create_transport -t TCP -u 16384 -m 8 -c 8192
app/nvmf_tgt/nvmf_tgt
scripts/rpc.py nvmf_create_transport -t RDMA -u 8192 -p 4 -c 0
scripts/rpc.py nvmf_create_transport -t TCP -u 16348 -p 8 -c 8192
~~~

Below is an example of creating a malloc bdev and assigning it to a subsystem. Adjust the bdevs,
@ -165,8 +145,8 @@ NQN, serial number, and IP address with RDMA transport to your own circumstances
"rdma" with "TCP", then the subsystem will add a listener with TCP transport.

~~~{.sh}
scripts/rpc.py bdev_malloc_create -b Malloc0 512 512
scripts/rpc.py nvmf_create_subsystem nqn.2016-06.io.spdk:cnode1 -a -s SPDK00000000000001 -d SPDK_Controller1
scripts/rpc.py construct_malloc_bdev -b Malloc0 512 512
scripts/rpc.py nvmf_subsystem_create nqn.2016-06.io.spdk:cnode1 -a -s SPDK00000000000001
scripts/rpc.py nvmf_subsystem_add_ns nqn.2016-06.io.spdk:cnode1 Malloc0
scripts/rpc.py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:cnode1 -t rdma -a 192.168.100.8 -s 4420
~~~
@ -184,8 +164,7 @@ Basic Types
|
||||
year = 4 * digit ;
|
||||
month = '01' | '02' | '03' | '04' | '05' | '06' | '07' | '08' | '09' | '10' | '11' | '12' ;
|
||||
digit = '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' ;
|
||||
hex digit = 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | '0' |
|
||||
'1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' ;
|
||||
hex digit = 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' ;
|
||||
|
||||
NQN Definition
|
||||
NVMe Qualified Name = ( NVMe-oF Discovery NQN | NVMe UUID NQN | NVMe Domain NQN ), '\0' ;
|
||||
@ -197,7 +176,6 @@ NVMe Domain NQN = "nqn.", year, '-', month, '.', reverse domain, ':', utf-8 stri
|
||||
~~~
|
||||
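To make the grammar concrete, here are two well-formed NQNs (the first is the standard discovery NQN from the NVMe-oF specification; the second is the subsystem NQN used in the examples above):

~~~{.sh}
nqn.2014-08.org.nvmexpress.discovery
nqn.2016-06.io.spdk:cnode1
~~~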

Please note that the following types from the definition above are defined elsewhere:

1. utf-8 string: Defined in [rfc 3629](https://tools.ietf.org/html/rfc3629).
2. reverse domain: Equivalent to domain name as defined in [rfc 1034](https://tools.ietf.org/html/rfc1034).

@ -230,7 +208,7 @@ The `-m` core mask option specifies a bit mask of the CPU cores that
SPDK is allowed to execute work items on.
For example, to allow SPDK to use cores 24, 25, 26 and 27:
~~~{.sh}
build/bin/nvmf_tgt -m 0xF000000
app/nvmf_tgt/nvmf_tgt -m 0xF000000
~~~

## Configuring the Linux NVMe over Fabrics Host {#nvmf_host}

@ -268,8 +246,3 @@ nvme disconnect -n "nqn.2016-06.io.spdk:cnode1"

SPDK has a tracing framework for capturing low-level event information at runtime.
@ref nvmf_tgt_tracepoints enables analysis of both performance and application crashes.

## Enabling NVMe-oF Multipath

The SPDK NVMe-oF target and initiator support multiple independent paths to the same NVMe-oF subsystem.
For step-by-step instructions for configuring and switching between paths, see @ref nvmf_multipath_howto.

@ -1,103 +0,0 @@
# NVMe-oF Multipath HOWTO {#nvmf_multipath_howto}

This HOWTO provides step-by-step instructions for setting up a simple SPDK deployment and testing multipath.
It demonstrates configuring path preferences with Asymmetric Namespace Access (ANA), as well as round-robin
path load balancing.

## Build SPDK on both the initiator and target servers

Clone the repo:
~~~{.sh}
git clone https://github.com/spdk/spdk --recursive
~~~

Configure and build SPDK:
~~~{.sh}
cd spdk/
./configure
make -j16
~~~

## Setup hugepages

This should be run once on each server (and after reboots):
~~~{.sh}
cd spdk/
./scripts/setup.sh
~~~

## On target: start and configure SPDK

Start the target in the background and configure it:
~~~{.sh}
cd spdk/
./build/bin/nvmf_tgt -m 0x3 &
./scripts/rpc.py nvmf_create_transport -t tcp -o -u 8192
~~~

Create a subsystem, with `-r` to enable the ANA reporting feature:
~~~{.sh}
./scripts/rpc.py nvmf_create_subsystem nqn.2022-02.io.spdk:cnode0 -a -s SPDK00000000000001 -r
~~~

Create and add a malloc block device:
~~~{.sh}
./scripts/rpc.py bdev_malloc_create 64 512 -b Malloc0
./scripts/rpc.py nvmf_subsystem_add_ns nqn.2022-02.io.spdk:cnode0 Malloc0
~~~

Add two listeners, each with a different `IP:port` pair:
~~~{.sh}
./scripts/rpc.py nvmf_subsystem_add_listener -t tcp -a 172.17.1.13 -s 4420 nqn.2022-02.io.spdk:cnode0
./scripts/rpc.py nvmf_subsystem_add_listener -t tcp -a 172.18.1.13 -s 5520 nqn.2022-02.io.spdk:cnode0
~~~
## On initiator: start and configure bdevperf

Launch the bdevperf process in the background:
~~~{.sh}
cd spdk/
./build/examples/bdevperf -m 0x4 -z -r /tmp/bdevperf.sock -q 128 -o 4096 -w verify -t 90 &> bdevperf.log &
~~~

Configure bdevperf and add two paths:
~~~{.sh}
./scripts/rpc.py -s /tmp/bdevperf.sock bdev_nvme_set_options -r -1
./scripts/rpc.py -s /tmp/bdevperf.sock bdev_nvme_attach_controller -b Nvme0 -t tcp -a 172.17.1.13 -s 4420 -f ipv4 -n nqn.2022-02.io.spdk:cnode0 -l -1 -o 10
./scripts/rpc.py -s /tmp/bdevperf.sock bdev_nvme_attach_controller -b Nvme0 -t tcp -a 172.18.1.13 -s 5520 -f ipv4 -n nqn.2022-02.io.spdk:cnode0 -x multipath -l -1 -o 10
~~~

## Launch a bdevperf test

Connect to the RPC socket of the bdevperf process and start the test:
~~~{.sh}
PYTHONPATH=$PYTHONPATH:/root/src/spdk/python ./examples/bdev/bdevperf/bdevperf.py -t 1 -s /tmp/bdevperf.sock perform_tests
~~~

The RPC command will return, leaving the test to run for 90 seconds in the background. On the target server,
observe that only the first path (port) is receiving packets by checking the queues with `ss -t`.

You can view the paths available to the initiator with:
~~~{.sh}
./scripts/rpc.py -s /tmp/bdevperf.sock bdev_nvme_get_io_paths -n Nvme0n1
~~~

## Switching paths

This can be done on the target server by setting the first path's ANA to `non_optimized`:
~~~{.sh}
./scripts/rpc.py nvmf_subsystem_listener_set_ana_state nqn.2022-02.io.spdk:cnode0 -t tcp -a 172.17.1.13 -s 4420 -n non_optimized
~~~

Use `ss -t` to verify that the traffic has switched to the second path.

## Use round-robin (active_active) path load balancing

First, ensure the ANA for both paths is configured as `optimized` on the target. Then, change the
multipath policy on the initiator to `active_active` (multipath policy is per bdev, so
`bdev_nvme_set_multipath_policy` must be called after `bdev_nvme_attach_controller`):
~~~{.sh}
./scripts/rpc.py -s /tmp/bdevperf.sock bdev_nvme_set_multipath_policy -b Nvme0n1 -p active_active
~~~

Observe with `ss -t` that both connections are receiving traffic (queues build up).

@ -68,7 +68,7 @@ system. This is used for access control.

A user of the NVMe-oF target library begins by creating a target using
spdk_nvmf_tgt_create(), setting up a set of addresses on which to accept
connections by calling spdk_nvmf_tgt_listen_ext(), then creating a subsystem
connections by calling spdk_nvmf_tgt_listen(), then creating a subsystem
using spdk_nvmf_subsystem_create().

Subsystems begin in an inactive state and must be activated by calling
@ -78,13 +78,12 @@ calling spdk_nvmf_subsystem_pause() and resumed by calling
spdk_nvmf_subsystem_resume().

Namespaces may be added to the subsystem by calling
spdk_nvmf_subsystem_add_ns_ext() when the subsystem is inactive or paused.
spdk_nvmf_subsystem_add_ns() when the subsystem is inactive or paused.
Namespaces are bdevs. See @ref bdev for more information about the SPDK bdev
layer. A bdev may be obtained by calling spdk_bdev_get_by_name().

Once a subsystem exists and the target is listening on an address, new
connections will be automatically assigned to poll groups as they are
detected.
connections may be accepted by polling spdk_nvmf_tgt_accept().

All I/O to a subsystem is driven by a poll group, which polls for incoming
network I/O. Poll groups may be created by calling
@ -92,6 +91,14 @@ spdk_nvmf_poll_group_create(). They automatically request to begin polling
upon creation on the thread from which they were created. Most importantly, *a
poll group may only be accessed from the thread on which it was created.*

When spdk_nvmf_tgt_accept() detects a new connection, it will construct a new
struct spdk_nvmf_qpair object and call the user provided `new_qpair_fn`
callback for each new qpair. In response to this callback, the user must
assign the qpair to a poll group by calling spdk_nvmf_poll_group_add().
Remember, a poll group may only be accessed from the thread on which it was created,
so making a call to spdk_nvmf_poll_group_add() may require passing a message
to the appropriate thread.

## Access Control

Access control is performed at the subsystem level by adding allowed listen
@ -104,7 +111,9 @@ and hosts may only be added to inactive or paused subsystems.

A discovery subsystem, as defined by the NVMe-oF specification, is
automatically created for each NVMe-oF target constructed. Connections to the
discovery subsystem are handled in the same way as any other subsystem.
discovery subsystem are handled in the same way as any other subsystem - new
qpairs are created in response to spdk_nvmf_tgt_accept() and they must be
assigned to a poll group.

## Transports

@ -123,7 +132,15 @@ fabrics simultaneously.
The SPDK NVMe-oF target library does not strictly dictate threading model, but
poll groups do all of their polling and I/O processing on the thread they are
created on. Given that, it almost always makes sense to create one poll group
per thread used in the application.
per thread used in the application. New qpairs created in response to
spdk_nvmf_tgt_accept() can be handed out round-robin to the poll groups. This
is how the SPDK NVMe-oF target application currently functions.

More advanced algorithms for distributing qpairs to poll groups are possible.
For instance, a NUMA-aware algorithm would be an improvement over basic
round-robin, where NUMA-aware means assigning qpairs to poll groups running on
CPU cores that are on the same NUMA node as the network adapter and storage
device. Load-aware algorithms also may have benefits.

## Scaling Across CPU Cores
Some files were not shown because too many files have changed in this diff.