Spdk/scripts/perf/vhost/conf-generator
Michal Berger c1e9ed6d4c perf/vhost: Auto generate VM cpu and disk map configs
The configuration is generated based on the existing host's NUMA
topology (as seen via sysfs) instead of a total number of cpus as
was done before. The new logic attempts to load balance VMs and
their cpus based on the nvme drives' NUMA location. If there are
not enough cpus left under the target node, all remaining nodes
are checked. For the sake of performance, cpus are not mixed
between different numa nodes.

The disk map is created by mapping VMs to existing nvme drives
based on their NUMA location. Extra VMs are assigned in the bus
order of the nvme drives.

SPDK cpus are split by matching the VM-to-nvme NUMA ratio. A static
list can be defined as well to override this behavior.
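
For illustration only (addresses, cpu ids and counts below are
hypothetical): on a host with two nvme drives on node0 and one on
node1, a run with vm_count=5 and vm_cpu_num=2 could yield disk map
entries such as "0000:1a:00.0,Nvme0,2,0 3" and cpu map entries such
as "VM_0_qemu_mask=0,1" with "VM_0_qemu_numa_node=0", plus the
vhost_0_reactor_mask and vhost_0_master_core lines for the SPDK app.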

https://trello.com/c/HSoRtQkO/401-deprecate-vhost-performance-python-helper-script

Signed-off-by: Michal Berger <michal.berger@intel.com>
Change-Id: Ia63c6f9a472a685d252efd110eaba7b114a87d2c
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/12401
Tested-by: SPDK CI Jenkins <sys_sgci@intel.com>
Reviewed-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com>
Reviewed-by: Jim Harris <james.r.harris@intel.com>
Reviewed-by: Karol Latecki <karol.latecki@intel.com>
2023-01-19 21:27:59 +00:00

#!/usr/bin/env bash
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2022 Intel Corporation.
# All rights reserved.
curdir=$(readlink -f "$(dirname "$0")")
rootdir=$(readlink -f "$curdir/../../../")
source "$rootdir/scripts/common.sh"
source "$rootdir/test/scheduler/common.sh"

get_auto_cfg() {
	local vm_cpus vm_node vm vms vms_per_nvme
	local cpu node nodes_idxs node_idx
	local nvmes nvme nvme_idx nvme_diff nvmes_per_node
	local vm_diff aligned_number_of_vms=0
	local diff iter
	local -g auto_cpu_map=() auto_disk_map=() spdk=()

	map_cpus
	get_nvme_numa_map

	nodes_idxs=("${!nodes[@]}")

	# Construct the initial NUMA-aware setup by pinning each VM to a given nvme's node. The
	# first run is meant to pin enough VMs (up to vm_count) to match the number of available
	# nvme ctrls.
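	# For example (hypothetical counts): with 2 nvme ctrls on node0 and 1 on node1, this pass
	# pins vm0 and vm1 to node0 and vm2 to node1; any VMs still left are handled further below.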
	vm=0
	for node in "${nodes_idxs[@]}"; do
		nvmes=(${!nvme_numa_map[node]})
		for ((nvme_idx = 0; nvme_idx < ${#nvmes[@]} && vm < vm_count; vm++, nvme_idx++)); do
			eval "vm${vm}_node=$node"
		done
		nvmes_per_node[node]=${#nvmes[@]}
	done

	vm_diff=$((vm_count - vm))

	# Align the number of extra VMs in case nvme ctrls are not distributed evenly across the
	# existing NUMA nodes.
	# FIXME: This is targeted for systems with only 2 NUMA nodes. Technically, kernel supports
	# more than that - it's possible to achieve setups with > 2 NUMA nodes under virtual env
	# for instance. Should this be of any concern?
	if ((nvmes_per_node[0] < nvmes_per_node[1])); then
		nvme_diff=$((nvmes_per_node[1] - nvmes_per_node[0]))
	elif ((nvmes_per_node[0] > nvmes_per_node[1])); then
		nvme_diff=$((nvmes_per_node[0] - nvmes_per_node[1]))
	else
		nvme_diff=0
	fi

	diff=$((vm_diff + nvme_diff))
	if ((diff % 2 == 0)); then
		aligned_number_of_vms=$((diff / ${#nodes_idxs[@]}))
	fi
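	# For example (hypothetical counts): with vm_count=5, 3 VMs pinned above and the nvme ctrls
	# split 2/1 across the nodes, vm_diff=2 and nvme_diff=1, so diff=3 is odd,
	# aligned_number_of_vms stays 0 and the round-robin path below is taken.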
	# The second run distributes extra VMs across the existing NUMA nodes. If we can distribute
	# an even number of extra VMs (as per vm_count) then simply assign them in bulk. If there's
	# an odd number, do some simple rr balancing where we assign them one by one - first to
	# node0, second to node1, third to node0, etc.
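	# E.g. (hypothetical) with 2 extra VMs left and aligned_number_of_vms=0, the rr path pins
	# vm3 to node0 and vm4 to node1.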
	if ((aligned_number_of_vms)); then
		for node in "${nodes_idxs[@]}"; do
			for ((iter = 0; iter < aligned_number_of_vms && vm < vm_count; iter++, vm++)); do
				eval "vm${vm}_node=$node"
			done
		done
	else
		while ((vm < vm_count)); do
			for node in "${nodes_idxs[@]}"; do
				eval "vm${vm}_node=$node"
				((++vm))
			done
		done
	fi

	local -g vm_numa_map=()
	for ((vm = 0; vm < vm_count; vm++)); do
		# Load balance the cpus across the available numa nodes based on the pinning done
		# prior. If there are no cpus left under the selected node, iterate over all available
		# nodes. If no cpus are left at all, fail. We don't mix cpus from different nodes for
		# the sake of performance.
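		# E.g. (hypothetical cpu ids) with vm_cpu_num=2 and cpus 0-13 still free on node0, the
		# first VM pinned there takes cpus 0,1, leaving 2-13 for the remaining VMs (and SPDK).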
		node_idx=0 node_idx_perc=0
		eval "vm_node=\$vm${vm}_node"
		local -n node_cpus=node_${vm_node}_cpu
		local -n vm_nodes=node_${vm_node}_vm
		vm_numa_map[vm_node]="node_${vm_node}_vm[@]"
		while ((${#node_cpus[@]} < vm_cpu_num && node_idx < ${#nodes_idxs[@]})); do
			vm_node=${nodes_idxs[node_idx]}
			local -n node_cpus=node_${nodes_idxs[node_idx++]}_cpu
		done
		if ((${#node_cpus[@]} < vm_cpu_num)); then
			printf 'Not enough CPUs available for VM %u (CPUs: %u, Nodes: %u, CPUs per VM: %u)\n' \
				"$vm" "${#cpus[@]}" "${#nodes_idxs[@]}" "$vm_cpu_num" >&2
			return 1
		fi
		# Normalize indexes
		node_cpus=("${node_cpus[@]}")
		vm_cpus=("${node_cpus[@]::vm_cpu_num}")
		node_cpus=("${node_cpus[@]:vm_cpu_num}")
		auto_cpu_map+=("$(
			cat <<- CPU_VM
				VM_${vm}_qemu_mask=$(
					IFS=","
					echo "${vm_cpus[*]}"
				)
				VM_${vm}_qemu_numa_node=$vm_node
			CPU_VM
		)")
		# Save a map of each VM->NUMA node to be able to construct the disk map in later steps.
		vm_nodes+=("$vm")
	done

	# auto_cpu_map is ready and all requested VMs should be balanced across all NUMA nodes,
	# making sure each nvme drive will be bound to at least 1 VM placed on the corresponding
	# NUMA node. Now, construct disk_cfg and assign VMs, with a proper split value, to each
	# nvme - extra VMs will be added to the nvme drives in their bus order.
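	# E.g. (hypothetical layout) node0 holding nvmes 0000:1a:00.0 and 0000:1b:00.0 plus VMs 0, 1
	# and 3 ends up with VMs 0 and 3 on the first drive and VM 1 on the second one.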
	local -A nvme_vm_map=()
	local iter nvmes_no=0 vms_no=0
	for node in "${nodes_idxs[@]}"; do
		if [[ ! -v nvme_numa_map[node] ]]; then
			# There are no drives available on that node, skip it
			continue
		fi
		nvmes=(${!nvme_numa_map[node]}) nvmes_no=${#nvmes[@]}
		vms=(${!vm_numa_map[node]}) vms_no=${#vms[@]}
		for ((iter = 0; iter <= (vms_no - nvmes_no <= 0 ? 1 : vms_no - nvmes_no); iter++)); do
			for nvme in "${nvmes[@]}"; do
				if ((${#vms[@]} == 0)); then
					# No VMs on given node or they have been exhausted - skip all remaining drives.
					continue 3
				fi
				nvme_vm_map["$nvme"]="_${nvme//[:.]/_}_[@]"
				local -n nvme_vms=_${nvme//[:.]/_}_
				nvme_vms+=("${vms[0]}") vms=("${vms[@]:1}")
			done
		done
	done

	local sorted_nvmes=()
	sorted_nvmes=($(printf '%s\n' "${!nvme_vm_map[@]}" | sort))
	for nvme in "${!sorted_nvmes[@]}"; do
		vms=(${!nvme_vm_map["${sorted_nvmes[nvme]}"]})
		auto_disk_map+=("${sorted_nvmes[nvme]},Nvme$((nvme++)),${#vms[*]},${vms[*]}")
	done
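	# Each auto_disk_map entry has the form "<pci addr>,Nvme<idx>,<split count>,<vm list>", e.g.
	# (hypothetical) "0000:1a:00.0,Nvme0,2,0 3" - VMs 0 and 3 sharing a 2-way split of Nvme0.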

	get_spdk_cpus || return 1

	auto_cpu_map+=("vhost_0_reactor_mask=[$(
		IFS=","
		echo "${spdk[*]}"
	)]")
	auto_cpu_map+=("vhost_0_master_core=${spdk[0]}")
}
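
# Group nvme ctrls by the NUMA node reported in sysfs. E.g. (hypothetical address) a ctrl at
# 0000:85:00.0 whose numa_node reads 1 lands in node_1_nvme[] and is exposed through
# nvme_numa_map[1].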
get_nvme_numa_map() {
	local nvmes nvme node
	local -g nvme_numa_map=()

	cache_pci_bus

	for nvme in ${pci_bus_cache[0x010802]}; do
		node=$(< "/sys/bus/pci/devices/$nvme/numa_node")
		nvme_numa_map[node]="node_${node}_nvme[@]"
		local -n node_nvmes=node_${node}_nvme
		node_nvmes+=("$nvme")
	done
}
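
# Carve out CPUs for the SPDK app, splitting spdk_cpu_num across the nodes in proportion to the
# number of VMs pinned to each of them (spdk_cpu_list, when set, is used verbatim instead).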
get_spdk_cpus() {
	local -g spdk=()
	local node vms perc
	local cpus_per_node cpus_exhausted=() cpus_remained=()

	if [[ -z $spdk_cpu_num ]]; then
		spdk=(0)
		return 0
	fi

	if [[ -n $spdk_cpu_list ]]; then
		spdk=($(parse_cpu_list <(echo "$spdk_cpu_list")))
		return 0
	fi

	# Start allocating from the NUMA node with the greater number of pinned VMs.
	node_sort=($(for node in "${!vm_numa_map[@]}"; do
		vms=(${!vm_numa_map[node]})
		echo "${#vms[@]}:$node"
	done | sort -rn))
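	# E.g. (hypothetical) node0 holding 3 VMs and node1 holding 2 yields node_sort=("3:0" "2:1").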
	for _node in "${node_sort[@]}"; do
		node=${_node#*:} vms=${_node%:*}
		local -n node_all_cpus=node_${node}_cpu
		perc=$((vms * 100 / vm_count))
		cpus_per_node=$((spdk_cpu_num * perc / 100))
		cpus_per_node=$((cpus_per_node == 0 ? 1 : cpus_per_node))
		if ((${#node_all_cpus[@]} == 0)); then
			printf 'No CPUs left to allocate for SPDK on node%u. Need %u CPUs\n' \
				"$node" "$cpus_per_node" >&2
			cpus_exhausted[node]=1
			continue
		fi
		if ((${#node_all_cpus[@]} < cpus_per_node)); then
			printf 'Not enough CPUs to allocate for SPDK on node%u. Need %u CPUs, getting %u\n' \
				"$node" "$cpus_per_node" "${#node_all_cpus[@]}" >&2
			cpus_per_node=${#node_all_cpus[@]}
			cpus_exhausted[node]=1
		fi
		spdk+=("${node_all_cpus[@]::cpus_per_node}")
		node_all_cpus=("${node_all_cpus[@]:cpus_per_node}")
		cpus_remained+=("${node_all_cpus[@]}")
	done

	# If we didn't allocate the entire number of requested cpus in the initial run, adjust it
	# by adding the remaining portion from the node with the greater number of pinned VMs.
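	# E.g. (hypothetical) if 4 SPDK cpus were requested but only 3 were allocated above, 1 more
	# cpu is taken from the dominant node (or from cpus_remained when ALIGN_FROM_ALL_NODES is set).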
	if ((${#spdk[@]} < spdk_cpu_num)); then
		if [[ -n $ALIGN_FROM_ALL_NODES ]] && ((${#cpus_remained[@]} > 0)); then
			printf 'Trying to get extra CPUs from all nodes\n'
			local -n node_all_cpus=cpus_remained
		else
			node=${node_sort[0]#*:}
			printf 'Trying to get extra CPUs from the dominant node%u to align: %u < %u\n' \
				"$node" "${#spdk[@]}" "$spdk_cpu_num"
			if ((cpus_exhausted[node])); then
				printf 'No CPUs available on node%u\n' "$node"
			else
				local -n node_all_cpus=node_${node}_cpu
			fi
		fi
		spdk+=("${node_all_cpus[@]::spdk_cpu_num-${#spdk[@]}}")
	fi >&2

	if ((${#spdk[@]} != spdk_cpu_num)); then
		printf 'Allocated a different number of SPDK CPUs than requested: requested %u, got %u\n' \
			"$spdk_cpu_num" "${#spdk[@]}"
	else
		printf 'Requested number of SPDK CPUs allocated: %u\n' "$spdk_cpu_num"
	fi >&2
}

_p_disk_map() {
	((${#auto_disk_map[@]} > 0)) || return 0
	printf '%s\n' "${auto_disk_map[@]}"
}

_p_cpu_map() {
	((${#auto_cpu_map[@]} > 0)) || return 0
	printf '%s\n' "${auto_cpu_map[@]}"
}

p_disk_map() {
	cat <<- DISK_MAP
		# Generated automatically by ${0##*/}
		# NVMe Drives: ${#auto_disk_map[@]} VM count: $vm_count
		$(_p_disk_map)
	DISK_MAP
}

p_vms_in_node() {
	((${#vm_numa_map[@]} > 0)) || return 0
	local node vms
	for node in "${!vm_numa_map[@]}"; do
		vms=(${!vm_numa_map[node]})
		echo "Node$node: ${#vms[@]} VMs"
	done
}

p_cpu_map() {
	local node_stats
	mapfile -t node_stats < <(p_vms_in_node)

	cat <<- CPU_MAP
		# Generated automatically by ${0##*/}
		# VM NUMA Nodes: ${#vm_numa_map[@]} VM count: $vm_count CPU Per VM: $vm_cpu_num SPDK CPU count: ${#spdk[@]}
		$(printf '# - %s\n' "${node_stats[@]}")
		$(_p_cpu_map)
	CPU_MAP
}

p_all() {
	p_disk_map
	printf '\n'
	p_cpu_map
}

fetch_env() {
	spdk_cpu_num=${spdk_cpu_num:-1}
	vm_count=${vm_count:-1}
	vm_cpu_num=${vm_cpu_num:-1}

	# Normalize
	spdk_cpu_num=$((spdk_cpu_num <= 0 ? 1 : spdk_cpu_num))
	vm_count=$((vm_count <= 0 ? 1 : vm_count))
	vm_cpu_num=$((vm_cpu_num <= 0 ? 1 : vm_cpu_num))

	cpu_out=${cpu_out:-"$PWD/auto-cpu.conf"}
	disk_out=${disk_out:-"$PWD/auto-disk.conf"}
}

help() {
	cat <<- HELP
		${0##*/}: [-p all|cpu|disk] [-s]

		Configuration is generated based on the system's cpu and nvme topology. Parameters
		are taken directly from the environment:

		  spdk_cpu_list - list of CPUs to assign to the SPDK app
		  spdk_cpu_num  - number of CPUs to use across all NUMA nodes
		                  (spdk_cpu_list takes priority, default: 1)
		  vm_count      - number of VMs to prepare the configuration for (default: 1)
		  vm_cpu_num    - number of CPUs to assign per VM (default: 1)

		Override parameters:
		  vmN_node - overrides the NUMA node selected for VM N; by default, VMs are
		             assigned per node up to the number of nvme drives on that node
		  cpu_out  - with -s, points at the location where to save the cpu conf
		  disk_out - with -s, points at the location where to save the disk conf

		Note: VMs are pinned to nvme drives based on their NUMA location.

		Example:
		  # Allocate 6 cpus for SPDK. Configure 24 VMs, 2 CPUs per VM
		  $ export spdk_cpu_num=6 vm_count=24 vm_cpu_num=2
		  $ ${0##*/} -p all
	HELP
}
print=""
save=no
fetch_env

while getopts :hsp: arg; do
	case "$arg" in
		h)
			help
			exit 0
			;;
		p) print=$OPTARG ;;
		s) save=yes ;;
		*) ;;
	esac
done

get_auto_cfg || exit 1

case "$print" in
	all) p_all ;;
	cpu) p_cpu_map ;;
	disk) p_disk_map ;;
	*) ;;
esac

if [[ $save == yes ]]; then
	p_cpu_map > "$cpu_out"
	p_disk_map > "$disk_out"
fi