#!/usr/bin/env bash
function discover_bdevs() {
	local rootdir=$1
	local config_file=$2
	local cfg_type=$3
	local wait_for_spdk_bdev=${4:-30}
	local rpc_server=/var/tmp/spdk-discover-bdevs.sock

	if [[ ! -e $config_file ]]; then
		echo "Invalid Configuration File: $config_file"
		return 1
	fi

	if [[ -z $cfg_type ]]; then
		cfg_type="-c"
	fi

	# Start the bdev service to query for the list of available
	# bdevs.
	$rootdir/test/app/bdev_svc/bdev_svc -r $rpc_server -i 0 \
		$cfg_type $config_file &> /dev/null &
	stubpid=$!
	while [[ ! -e /var/run/spdk_bdev0 ]]; do
		# When this counter reaches zero the decrement evaluates to 0, the
		# arithmetic command returns non-zero and errexit aborts the test
		((wait_for_spdk_bdev--))
		sleep 1
	done

	# Get all of the bdevs
	$rootdir/scripts/rpc.py -s "$rpc_server" bdev_get_bdevs

	# Shut down the bdev service
	kill $stubpid
	wait $stubpid
	rm -f /var/run/spdk_bdev0
}
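
# Example usage of discover_bdevs (a sketch; assumes $rootdir points at an SPDK
# source tree and create_spdk_bdev_conf has already written $testdir/bdev.conf):
#   bdevs=$(discover_bdevs "$rootdir" "$testdir/bdev.conf" --json)
#   jq -r '.[].name' <<< "$bdevs" # prints discovered bdev names, e.g. Nvme0n1
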
function create_spdk_bdev_conf() {
	local output
	local disk_cfg
	local bdev_io_cache_size=$1
	local bdev_io_pool_size=$2
	local bdev_json_cfg=()
	local bdev_opts=()

	disk_cfg=($(grep -vP "^\s*#" "$DISKCFG"))

	if [[ -n "$bdev_io_cache_size" ]]; then
		bdev_opts+=("\"bdev_io_cache_size\": $bdev_io_cache_size")
	fi

	if [[ -n "$bdev_io_pool_size" ]]; then
		bdev_opts+=("\"bdev_io_pool_size\": $bdev_io_pool_size")
	fi

	# Join array elements with "," whenever they are expanded with [*] below
	local IFS=","
	if [[ ${#bdev_opts[@]} -gt 0 ]]; then
		bdev_json_cfg+=("$(
			cat <<- JSON
				{
					"method": "bdev_set_options",
					"params": {
						${bdev_opts[*]}
					}
				}
			JSON
		)")
	fi

	for i in "${!disk_cfg[@]}"; do
		bdev_json_cfg+=("$(
			cat <<- JSON
				{
					"method": "bdev_nvme_attach_controller",
					"params": {
						"trtype": "PCIe",
						"name": "Nvme${i}",
						"traddr": "${disk_cfg[i]}"
					}
				}
			JSON
		)")
	done

	jq -r '.' <<- JSON > $testdir/bdev.conf
		{
			"subsystems": [
				{
					"subsystem": "bdev",
					"config": [
						${bdev_json_cfg[*]}
					]
				}
			]
		}
	JSON
}
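
# A sketch of the $testdir/bdev.conf this produces for a single disk entry
# (the PCIe address 0000:1a:00.0 is a hypothetical example):
#   {
#     "subsystems": [
#       {
#         "subsystem": "bdev",
#         "config": [
#           {
#             "method": "bdev_nvme_attach_controller",
#             "params": {
#               "trtype": "PCIe",
#               "name": "Nvme0",
#               "traddr": "0000:1a:00.0"
#             }
#           }
#         ]
#       }
#     ]
#   }
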
function is_bdf_not_mounted() {
	local bdf=$1
	local blkname
	local mountpoints

	blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
	mountpoints=$(lsblk /dev/$blkname --output MOUNTPOINT -n | wc -w)

	# 0 mountpoints means exit status 0 ("success"), so the function name
	# reads naturally when used as a condition
	return $mountpoints
}
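
# Example usage of is_bdf_not_mounted (a sketch; the BDF is hypothetical and
# must belong to a device bound to the kernel nvme driver):
#   if is_bdf_not_mounted "0000:1a:00.0"; then
#       echo "0000:1a:00.0 has no mounted partitions; safe to test"
#   fi
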
function get_cores() {
	local cpu_list="$1"
	for cpu in ${cpu_list//,/ }; do
		echo $cpu
	done
}
function get_cores_numa_node() {
	local cores=$1
	for core in $cores; do
		lscpu -p=cpu,node | grep "^$core\b" | awk -F ',' '{print $2}'
	done
}
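
# Example usage of get_cores_numa_node (a sketch; the node numbers depend on
# the host topology):
#   get_cores_numa_node "$(get_cores "0,1,24")"
#   # prints one NUMA node per core, e.g.:
#   # 0
#   # 0
#   # 1
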
function get_numa_node() {
	local plugin=$1
	local disks=$2

	if [[ "$plugin" =~ "nvme" ]]; then
		for bdf in $disks; do
			local driver
			driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}')
			# Use this check to omit blacklisted devices (ones not bound to
			# a userspace driver by the setup.sh script)
			if [[ "$driver" == "vfio-pci" || "$driver" == "uio_pci_generic" ]]; then
				cat /sys/bus/pci/devices/$bdf/numa_node
			fi
		done
	elif [[ "$plugin" =~ "bdev" ]]; then
		local bdevs
		bdevs=$(discover_bdevs $rootdir $testdir/bdev.conf --json)
		for name in $disks; do
			local bdev_bdf
			bdev_bdf=$(jq -r ".[] | select(.name==\"$name\").driver_specific.nvme.pci_address" <<< $bdevs)
			cat /sys/bus/pci/devices/$bdev_bdf/numa_node
		done
	else
		for name in $disks; do
			local bdf
			# Not reading directly from /sys/block/nvme* because of a kernel bug
			# which results in NUMA 0 always getting reported.
			bdf=$(cat /sys/block/$name/device/address)
			cat /sys/bus/pci/devices/$bdf/numa_node
		done
	fi
}
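
# Example usage of get_numa_node (a sketch; the disk identifiers depend on the
# plugin - PCIe BDFs for "nvme", NvmeXn1 bdev names for "bdev", nvmeXn1 block
# devices for kernel modes):
#   disks_numa=($(get_numa_node "kernel-classic-polling" "nvme0n1 nvme1n1"))
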
function get_disks() {
	local plugin=$1
	local disk_cfg

	disk_cfg=($(grep -vP "^\s*#" "$DISKCFG"))

	if [[ "$plugin" =~ "nvme" ]]; then
		# PCI BDF address is enough for nvme-perf and nvme-fio-plugin,
		# so just print them from the configuration file
		echo "${disk_cfg[*]}"
	elif [[ "$plugin" =~ "bdev" ]]; then
		# Generate NvmeXn1 bdev names for bdev-perf and bdev-fio-plugin
		local bdevs
		local disk_no
		disk_no=${#disk_cfg[@]}
		eval echo "Nvme{0..$((disk_no - 1))}n1"
	else
		# Find nvme block devices and only use the ones which
		# are not mounted
		for bdf in "${disk_cfg[@]}"; do
			if is_bdf_not_mounted $bdf; then
				local blkname
				blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
				echo $blkname
			fi
		done
	fi
}
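
# Example usage of get_disks (a sketch; actual output depends on $DISKCFG):
#   get_disks spdk-plugin-nvme # space-separated PCIe BDFs from $DISKCFG
#   get_disks spdk-plugin-bdev # "Nvme0n1 Nvme1n1 ..." via brace expansion
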
function get_disks_on_numa() {
	local devs=($1)
	local numas=($2)
	local numa_no=$3
	local disks_on_numa=0
	local i

	for ((i = 0; i < ${#devs[@]}; i++)); do
		if [[ ${numas[$i]} == $numa_no ]]; then
			disks_on_numa=$((disks_on_numa + 1))
		fi
	done
	echo $disks_on_numa
}
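
# Builds $testdir/config.fio. The same job/filename layout is generated for
# kernel and SPDK fio modes so that results stay comparable. The iodepth is
# calculated per [filename] job section, so each "filename" gets the effective
# queue depth requested for the run, and NUMA node information is appended as
# comments to the "cpus_allowed" and "filename" entries. A warning is printed
# whenever a cross-NUMA assignment is unavoidable. For example, 7 NVMe drives,
# --cpu-allowed=0,1,2 and --iodepth=32 produce sections along these lines:
#   [filename0]
#   iodepth=96
#   cpus_allowed=0 #CPU NUMA Node X
#   filename=/dev/nvme0n1 #NVMe NUMA Node X
#   filename=/dev/nvme1n1 #NVMe NUMA Node X
#   filename=/dev/nvme2n1 #NVMe NUMA Node X
#
#   [filename1]
#   iodepth=64
#   cpus_allowed=1 #CPU NUMA Node X
#   filename=/dev/nvme3n1 #NVMe NUMA Node X
#   filename=/dev/nvme4n1 #NVMe NUMA Node X
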
function create_fio_config() {
	local disk_no=$1
	local plugin=$2
	local disks=($3)
	local disks_numa=($4)
	local cores=($5)
	local total_disks=${#disks[@]}
	local fio_job_section=()
	local num_cores=${#cores[@]}
	local disks_per_core=$((disk_no / num_cores))
	local disks_per_core_mod=$((disk_no % num_cores))
	local cores_numa
	cores_numa=($(get_cores_numa_node "${cores[*]}"))

	# The following part of this function still leverages global variables a lot.
	# It's a mix of local variables passed as arguments to the function and
	# global variables, which is messy.
	# TODO: Modify this to be consistent with how variables are used here. Aim for
	# using only local variables to get rid of globals as much as possible.
	desc="\"Test io_plugin=$PLUGIN Blocksize=${BLK_SIZE} Workload=$RW MIX=${MIX} qd=${IODEPTH}\""

	cp "$testdir/config.fio.tmp" "$testdir/config.fio"
	cat <<- EOF >> $testdir/config.fio
		description=$desc
		rw=$RW
		rwmixread=$MIX
		bs=$BLK_SIZE
		runtime=$RUNTIME
		ramp_time=$RAMP_TIME
		numjobs=$NUMJOBS
		log_avg_msec=$SAMPLING_INT
	EOF

	if $GTOD_REDUCE; then
		echo "gtod_reduce=1" >> $testdir/config.fio
	fi
for i in "${!cores[@]}"; do
local m=0 #Counter of disks per NUMA node
local n=0 #Counter of all disks in test
core_numa=${cores_numa[$i]}
total_disks_per_core=$disks_per_core
# Check how many "stray" disks are unassigned to CPU cores
# Assign one disk to current CPU core and substract it from the total of
# unassigned disks
if [[ "$disks_per_core_mod" -gt "0" ]]; then
total_disks_per_core=$((disks_per_core + 1))
disks_per_core_mod=$((disks_per_core_mod - 1))
fi
# SPDK fio plugin supports submitting/completing I/Os to multiple SSDs from a single thread.
# Therefore, the per thread queue depth is set to the desired IODEPTH/device X the number of devices per thread.
QD=$IODEPTH
if [[ "$NOIOSCALING" = false ]]; then
QD=$((IODEPTH * total_disks_per_core))
fi
fio_job_section+=("")
fio_job_section+=("[filename${i}]")
fio_job_section+=("iodepth=$QD")
fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}")
while [[ "$m" -lt "$total_disks_per_core" ]]; do
# Try to add disks to job section if it's NUMA node matches NUMA
# for currently selected CPU
if [[ "${disks_numa[$n]}" == "$core_numa" ]]; then
if [[ "$plugin" == "spdk-plugin-nvme" ]]; then
fio_job_section+=("filename=trtype=PCIe traddr=${disks[$n]//:/.} ns=1 #NVMe NUMA Node ${disks_numa[$n]}")
elif [[ "$plugin" == "spdk-plugin-bdev" ]]; then
fio_job_section+=("filename=${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
elif [[ "$plugin" =~ "kernel" ]]; then
fio_job_section+=("filename=/dev/${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
fi
				m=$((m + 1))
				# Mark the NUMA node of the n-th disk as "x" to claim the disk
				# for subsequent loop iterations
				disks_numa[$n]="x"
			fi
			n=$((n + 1))

			# If there are no more disks on the same NUMA node as the CPU,
			# switch to the other NUMA node, go back to the start of the loop
			# and try again.
			if [[ $n -ge $total_disks ]]; then
				echo "WARNING! Cannot assign any more NVMes for CPU ${cores[$i]}"
				echo "NVMe assignment for this CPU will be cross-NUMA."
				if [[ "$core_numa" == "1" ]]; then
					core_numa=0
				else
					core_numa=1
				fi
				n=0
			fi
		done
	done
printf "%s\n" "${fio_job_section[@]}" >> $testdir/config.fio
echo "INFO: Generated fio configuration file:"
cat $testdir/config.fio
}
function preconditioning() {
	local dev_name=""
	local filename=""
	local nvme_list

	HUGEMEM=8192 $rootdir/scripts/setup.sh
	cp $testdir/config.fio.tmp $testdir/config.fio
	echo "[Preconditioning]" >> $testdir/config.fio

	# Generate filename argument for FIO.
	# We only want to target NVMes not bound to the kernel nvme driver.
	# If they're still bound to nvme that means they were skipped by
	# setup.sh on purpose.
	nvme_list=$(get_disks nvme)
	for nvme in $nvme_list; do
		dev_name='trtype=PCIe traddr='${nvme//:/.}' ns=1'
		filename+=$(printf %s":" "$dev_name")
	done
	echo "** Preconditioning disks, this can take a while, depending on the size of disks."
	run_spdk_nvme_fio "spdk-plugin-nvme" --filename="$filename" --size=100% --loops=2 --bs=1M \
		--rw=write --iodepth=32 --output-format=normal
	rm -f $testdir/config.fio
}
function get_results() {
	local reads_pct
	local writes_pct

	reads_pct=$(bc -l <<< "scale=3; $2/100")
	writes_pct=$(bc -l <<< "scale=3; 1-$reads_pct")
	case "$1" in
		iops)
			iops=$(jq -r '.jobs[] | .read.iops + .write.iops' $TMP_RESULT_FILE)
			iops=${iops%.*}
			echo $iops
			;;
		mean_lat_usec)
			mean_lat=$(jq -r ".jobs[] | (.read.lat_ns.mean * $reads_pct + .write.lat_ns.mean * $writes_pct)" $TMP_RESULT_FILE)
			mean_lat=${mean_lat%.*}
			echo $((mean_lat / 1000))
			;;
		p90_lat_usec)
			# Parenthesize the "// 0" fallbacks: jq's "//" binds looser than "*",
			# so without the parentheses the percentiles would not be weighted
			p90_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"90.000000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"90.000000\" // 0) * $writes_pct)" $TMP_RESULT_FILE)
			p90_lat=${p90_lat%.*}
			echo $((p90_lat / 1000))
			;;
		p99_lat_usec)
			p99_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"99.000000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.000000\" // 0) * $writes_pct)" $TMP_RESULT_FILE)
			p99_lat=${p99_lat%.*}
			echo $((p99_lat / 1000))
			;;
		p99_99_lat_usec)
			p99_99_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"99.990000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.990000\" // 0) * $writes_pct)" $TMP_RESULT_FILE)
			p99_99_lat=${p99_99_lat%.*}
			echo $((p99_99_lat / 1000))
			;;
		stdev_usec)
			stdev=$(jq -r ".jobs[] | (.read.clat_ns.stddev * $reads_pct + .write.clat_ns.stddev * $writes_pct)" $TMP_RESULT_FILE)
			stdev=${stdev%.*}
			echo $((stdev / 1000))
			;;
		mean_slat_usec)
			mean_slat=$(jq -r ".jobs[] | (.read.slat_ns.mean * $reads_pct + .write.slat_ns.mean * $writes_pct)" $TMP_RESULT_FILE)
			mean_slat=${mean_slat%.*}
			echo $((mean_slat / 1000))
			;;
		mean_clat_usec)
			mean_clat=$(jq -r ".jobs[] | (.read.clat_ns.mean * $reads_pct + .write.clat_ns.mean * $writes_pct)" $TMP_RESULT_FILE)
			mean_clat=${mean_clat%.*}
			echo $((mean_clat / 1000))
			;;
		bw_Kibs)
			bw=$(jq -r ".jobs[] | (.read.bw + .write.bw)" $TMP_RESULT_FILE)
			bw=${bw%.*}
			echo $((bw))
			;;
	esac
}
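
# Example usage of get_results (a sketch; assumes $TMP_RESULT_FILE holds fio
# JSON output and the workload ran with rwmixread=70, i.e. 70% reads):
#   iops=$(get_results iops 70)
#   mean_lat=$(get_results mean_lat_usec 70)
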
function get_bdevperf_results() {
	case "$1" in
		iops)
			iops=$(grep Total $TMP_RESULT_FILE | awk -F 'Total' '{print $2}' | awk '{print $2}')
			iops=${iops%.*}
			echo $iops
			;;
		bw_Kibs)
			bw_MBs=$(grep Total $TMP_RESULT_FILE | awk -F 'Total' '{print $2}' | awk '{print $4}')
			bw_MBs=${bw_MBs%.*}
			echo $((bw_MBs * 1024))
			;;
	esac
}
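
# Example usage of get_bdevperf_results (a sketch; assumes $TMP_RESULT_FILE
# captured bdevperf output, whose "Total" summary line carries IOPS in the 2nd
# and MiB/s in the 4th whitespace-separated field after the "Total" keyword):
#   iops=$(get_bdevperf_results iops)
#   bw_kibs=$(get_bdevperf_results bw_Kibs)
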
function get_nvmeperf_results() {
	local iops
	local bw_MBs
	local mean_lat_usec
	local max_lat_usec
	local min_lat_usec

	read -r iops bw_MBs mean_lat_usec min_lat_usec max_lat_usec <<< $(tr -s " " < $TMP_RESULT_FILE | grep -oP "(?<=Total : )(.*+)")

	# Strip the fractional parts because the calculations below use bash
	# arithmetic expressions instead of "bc"
	iops=${iops%.*}
	bw_MBs=${bw_MBs%.*}
	mean_lat_usec=${mean_lat_usec%.*}
	min_lat_usec=${min_lat_usec%.*}
	max_lat_usec=${max_lat_usec%.*}

	echo "$iops $(bc <<< "$bw_MBs * 1024") $mean_lat_usec $min_lat_usec $max_lat_usec"
}
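
# A sketch of the "Total :" summary line this parses (values are made up; the
# field order follows the "read" statement above - IOPS, MB/s, mean, min and
# max latency in usec):
#   Total : 675340.00 2638.00 47.38 11.24 4993.11
# which the function would echo as "675340 2701312 47 11 4993", with the
# bandwidth converted to KiB/s.
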
function run_spdk_nvme_fio() {
	local plugin=$1
	echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting."
	if [[ "$plugin" == "spdk-plugin-nvme" ]]; then
		LD_PRELOAD=$plugin_dir/spdk_nvme $FIO_BIN $testdir/config.fio --output-format=json \
			"${@:2}" --ioengine=spdk
	elif [[ "$plugin" == "spdk-plugin-bdev" ]]; then
		LD_PRELOAD=$plugin_dir/spdk_bdev $FIO_BIN $testdir/config.fio --output-format=json \
			"${@:2}" --ioengine=spdk_bdev --spdk_json_conf=$testdir/bdev.conf --spdk_mem=4096
	fi

	sleep 1
}
function run_nvme_fio() {
	echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting."
	$FIO_BIN $testdir/config.fio --output-format=json "$@"
	sleep 1
}
function run_bdevperf() {
	echo "** Running bdevperf test, this can take a while, depending on the run-time setting."
	$bdevperf_dir/bdevperf --json $testdir/bdev.conf -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX \
		-t $RUNTIME -m "[$CPUS_ALLOWED]" -r /var/tmp/spdk.sock
	sleep 1
}
function run_nvmeperf() {
	# Prepare the -r argument string for the nvme perf command
	local r_opt
	local disks

	# Limit the number of disks to $1 if needed
	disks=($(get_disks nvme))
	disks=("${disks[@]:0:$1}")
	r_opt=$(printf -- ' -r "trtype:PCIe traddr:%s"' "${disks[@]}")

	echo "** Running nvme perf test, this can take a while, depending on the run-time setting."

	# Run the command in a separate shell as this solves quoting issues related to the r_opt var
	$SHELL -c "$nvmeperf_dir/perf $r_opt -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -c [$CPUS_ALLOWED]"
	sleep 1
}
function wait_for_nvme_reload() {
	local nvmes=$1

	shopt -s extglob
	for disk in $nvmes; do
		cmd="ls /sys/block/$disk/queue/*@(iostats|rq_affinity|nomerges|io_poll_delay)*"
		until $cmd 2> /dev/null; do
			echo "Waiting for full nvme driver reload..."
			sleep 0.5
		done
	done
	# Restore the default; "shopt -q" would only query the option, not unset it
	shopt -u extglob
}
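
# Example usage of wait_for_nvme_reload (a sketch; assumes the kernel nvme
# driver was just rebound, e.g. by scripts/setup.sh reset, so the sysfs queue
# attributes may not be populated yet):
#   wait_for_nvme_reload "nvme0n1 nvme1n1"
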
function verify_disk_number() {
	# Check if we have an appropriate number of disks to carry out the test
	disks=($(get_disks $PLUGIN))
	if [[ $DISKNO == "ALL" ]] || [[ $DISKNO == "all" ]]; then
		DISKNO=${#disks[@]}
	elif [[ $DISKNO -gt ${#disks[@]} ]] || [[ ! $DISKNO =~ ^[0-9]+$ ]]; then
		echo "error: Requested number of devices ($DISKNO) is not a valid number or exceeds the number of devices found (${#disks[@]})"
		false
	fi
}
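
# Example usage of verify_disk_number (a sketch; assumes the caller set the
# DISKNO and PLUGIN globals, as the run_perf.sh wrapper does):
#   DISKNO=ALL PLUGIN=spdk-plugin-bdev
#   verify_disk_number # rewrites DISKNO to the actual number of devices found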