#!/usr/bin/env bash
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2022 Intel Corporation.
# All rights reserved.

curdir=$(readlink -f "$(dirname "$0")")
rootdir=$(readlink -f "$curdir/../../../")

source "$rootdir/scripts/common.sh"
source "$rootdir/test/scheduler/common.sh"

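# Note: helpers used below - map_cpus, cache_pci_bus, parse_cpu_list and the nodes/node_<N>_cpu
# arrays they populate - are expected to be provided by the common.sh scripts sourced above.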
get_auto_cfg() {
	local vm_cpus vm_node vm vms vms_per_nvme
	local cpu node nodes_idxs node_idx
	local nvmes nvme nvme_idx nvme_diff nvmes_per_node
	local vm_diff aligned_number_of_vms=0
	local diff iter

	local -g auto_cpu_map=() auto_disk_map=() spdk=()

	map_cpus
	get_nvme_numa_map

	nodes_idxs=("${!nodes[@]}")

	# Construct the initial NUMA-aware setup by pinning each VM to a given nvme's node. The first
	# run pins only as many VMs (out of vm_count) as there are available nvme ctrls.
	vm=0
	for node in "${nodes_idxs[@]}"; do
		nvmes=(${!nvme_numa_map[node]})
		for ((nvme_idx = 0; nvme_idx < ${#nvmes[@]} && vm < vm_count; vm++, nvme_idx++)); do
			eval "vm${vm}_node=$node"
		done
		nvmes_per_node[node]=${#nvmes[@]}
	done

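	# For example (hypothetical topology): with vm_count=6, 4 nvmes on node0 and 2 nvmes on
	# node1, the first run pins vm0..vm3 to node0 and vm4, vm5 to node1, leaving vm_diff=0.
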
	vm_diff=$((vm_count - vm))

	# Balance the extra VMs in case nvme ctrls are not distributed evenly across the existing
	# NUMA nodes.
	# FIXME: This is targeted at systems with only 2 NUMA nodes. Technically, the kernel supports
	# more than that - it's possible to achieve setups with > 2 NUMA nodes under a virtual env,
	# for instance. Should this be of any concern?
	if ((nvmes_per_node[0] < nvmes_per_node[1])); then
		nvme_diff=$((nvmes_per_node[1] - nvmes_per_node[0]))
	elif ((nvmes_per_node[0] > nvmes_per_node[1])); then
		nvme_diff=$((nvmes_per_node[0] - nvmes_per_node[1]))
	else
		nvme_diff=0
	fi

	diff=$((vm_diff + nvme_diff))

	if ((diff % 2 == 0)); then
		aligned_number_of_vms=$((diff / ${#nodes_idxs[@]}))
	fi

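	# e.g. (hypothetical numbers) with 2 nodes, vm_diff=4 and nvme_diff=2: diff=6 is even,
	# so aligned_number_of_vms=3 extra VMs will be assigned to each node in bulk.
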
	# The second run distributes the extra VMs across the existing NUMA nodes. If an even number
	# of extra VMs (as per vm_count) can be handed out, simply assign them in bulk. If there's an
	# odd number, do some simple rr balancing and assign them one by one - first to node0,
	# second to node1, third to node0, etc.
	if ((aligned_number_of_vms)); then
		for node in "${nodes_idxs[@]}"; do
			for ((iter = 0; iter < aligned_number_of_vms && vm < vm_count; iter++, vm++)); do
				eval "vm${vm}_node=$node"
			done
		done
	else
		while ((vm < vm_count)); do
			for node in "${nodes_idxs[@]}"; do
				eval "vm${vm}_node=$node"
				((++vm))
			done
		done
	fi

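	# e.g. with 2 nodes and 3 extra VMs left (odd), the rr pass assigns them alternately:
	# node0, node1, node0.
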
	local -g vm_numa_map=()
	for ((vm = 0; vm < vm_count; vm++)); do
		# Load balance the cpus across the available numa nodes based on the pinning
		# done prior. If there are no cpus left under the selected node, iterate over
		# all available nodes. If no cpus are left at all, fail. We don't allow mixing
		# cpus from different nodes for the sake of performance.
		node_idx=0 node_idx_perc=0
		eval "vm_node=\$vm${vm}_node"

		local -n node_cpus=node_${vm_node}_cpu
		local -n vm_nodes=node_${vm_node}_vm

		vm_numa_map[vm_node]="node_${vm_node}_vm[@]"

		while ((${#node_cpus[@]} < vm_cpu_num && node_idx < ${#nodes_idxs[@]})); do
			vm_node=${nodes_idxs[node_idx]}
			local -n node_cpus=node_${nodes_idxs[node_idx++]}_cpu
		done

		if ((${#node_cpus[@]} < vm_cpu_num)); then
			printf 'Not enough CPUs available for VM %u (CPUs: %u, Nodes: %u, CPUs per VM: %u)\n' \
				"$vm" "${#cpus[@]}" "${#nodes_idxs[@]}" "$vm_cpu_num" >&2
			return 1
		fi

		# Normalize indexes
		node_cpus=("${node_cpus[@]}")

		vm_cpus=("${node_cpus[@]::vm_cpu_num}")
		node_cpus=("${node_cpus[@]:vm_cpu_num}")

		auto_cpu_map+=("$(
			cat <<- CPU_VM
				VM_${vm}_qemu_mask=$(
					IFS=","
					echo "${vm_cpus[*]}"
				)
				VM_${vm}_qemu_numa_node=$vm_node
			CPU_VM
		)")

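		# e.g. a single auto_cpu_map entry (hypothetical values):
		#   VM_0_qemu_mask=0,1
		#   VM_0_qemu_numa_node=0
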
		# Save map of each VM->NUMA node to be able to construct a disk map in later steps.
		vm_nodes+=("$vm")
	done

	# auto_cpu_map is ready - all requested VMs are balanced across the NUMA nodes, making sure
	# each nvme drive will be bound to at least 1 VM placed on the corresponding NUMA node.
	# Now, construct the disk config and assign VMs, with a proper split value, to each nvme -
	# extra VMs are added to the nvme drives in their bus order.
	local -A nvme_vm_map=()
	local iter nvmes_no=0 vms_no=0
	for node in "${nodes_idxs[@]}"; do
		if [[ ! -v nvme_numa_map[node] ]]; then
			# There are no drives available on that node, skip it
			continue
		fi
		nvmes=(${!nvme_numa_map[node]}) nvmes_no=${#nvmes[@]}
		vms=(${!vm_numa_map[node]}) vms_no=${#vms[@]}
		for ((iter = 0; iter <= (vms_no - nvmes_no <= 0 ? 1 : vms_no - nvmes_no); iter++)); do
			for nvme in "${nvmes[@]}"; do
				if ((${#vms[@]} == 0)); then
					# No VMs on the given node or they have been exhausted - skip all remaining drives.
					continue 3
				fi
				nvme_vm_map["$nvme"]="_${nvme//[:.]/_}_[@]"
				local -n nvme_vms=_${nvme//[:.]/_}_
				nvme_vms+=("${vms[0]}") vms=("${vms[@]:1}")
			done
		done
	done

	local sorted_nvmes=()
	sorted_nvmes=($(printf '%s\n' "${!nvme_vm_map[@]}" | sort))
	for nvme in "${!sorted_nvmes[@]}"; do
		vms=(${!nvme_vm_map["${sorted_nvmes[nvme]}"]})
		auto_disk_map+=("${sorted_nvmes[nvme]},Nvme$((nvme++)),${#vms[*]},${vms[*]}")
	done

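	# e.g. a single auto_disk_map entry (hypothetical BDF): 0000:85:00.0,Nvme0,2,0 1
	# i.e. <pci address>,<bdev name>,<number of VMs>,<space-separated list of VMs using it>
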
	get_spdk_cpus || return 1

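	# e.g. with 4 SPDK CPUs picked (hypothetical ids 0-3) this appends vhost_0_reactor_mask=[0,1,2,3]
	# and vhost_0_master_core=0 to auto_cpu_map.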
	auto_cpu_map+=("vhost_0_reactor_mask=[$(
		IFS=","
		echo "${spdk[*]}"
	)]")
	auto_cpu_map+=("vhost_0_master_core=${spdk[0]}")
}

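# Build nvme_numa_map, keyed by NUMA node, where each entry references a node_<N>_nvme array
# holding the PCI addresses of the NVMe ctrls attached to that node.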
get_nvme_numa_map() {
	local nvmes nvme node
	local -g nvme_numa_map=()

	cache_pci_bus

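	# 0x010802 is the PCI class code of NVMe controllers (mass storage / NVM subsystem / NVMe I/O).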
	for nvme in ${pci_bus_cache[0x010802]}; do
		node=$(< "/sys/bus/pci/devices/$nvme/numa_node")
		nvme_numa_map[node]="node_${node}_nvme[@]"
		local -n node_nvmes=node_${node}_nvme
		node_nvmes+=("$nvme")
	done
}

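# Pick CPUs for the SPDK app. spdk_cpu_list, if set, is used directly; otherwise spdk_cpu_num
# CPUs are spread across the NUMA nodes proportionally to the number of VMs pinned to each node
# (e.g., hypothetically, spdk_cpu_num=4 with VMs split evenly between 2 nodes takes 2 CPUs from
# each node).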
get_spdk_cpus() {
	local -g spdk=()
	local node _node node_sort vms perc
	local cpus_per_node cpus_exhausted=() cpus_remained=()

	if [[ -z $spdk_cpu_num ]]; then
		spdk=(0)
		return 0
	fi

	if [[ -n $spdk_cpu_list ]]; then
		spdk=($(parse_cpu_list <(echo "$spdk_cpu_list")))
		return 0
	fi

	# Start allocating from the NUMA node with the greater number of pinned VMs.
	node_sort=($(for node in "${!vm_numa_map[@]}"; do
		vms=(${!vm_numa_map[node]})
		echo "${#vms[@]}:$node"
	done | sort -rn))

	for _node in "${node_sort[@]}"; do
		node=${_node#*:} vms=${_node%:*}
		local -n node_all_cpus=node_${node}_cpu
		perc=$((vms * 100 / vm_count))
		cpus_per_node=$((spdk_cpu_num * perc / 100))
		cpus_per_node=$((cpus_per_node == 0 ? 1 : cpus_per_node))

		if ((${#node_all_cpus[@]} == 0)); then
			printf 'No CPUs left to allocate for SPDK on node%u. Need %u CPUs\n' \
				"$node" "$cpus_per_node" >&2

			cpus_exhausted[node]=1
			continue
		fi
		if ((${#node_all_cpus[@]} < cpus_per_node)); then
			printf 'Not enough CPUs to allocate for SPDK on node%u. Need %u CPUs, getting %u\n' \
				"$node" "$cpus_per_node" "${#node_all_cpus[@]}" >&2
			cpus_per_node=${#node_all_cpus[@]}
			cpus_exhausted[node]=1
		fi

		spdk+=("${node_all_cpus[@]::cpus_per_node}")
		node_all_cpus=("${node_all_cpus[@]:cpus_per_node}")
		cpus_remained+=("${node_all_cpus[@]}")
	done

	# If the initial run didn't allocate the entire number of requested cpus, top it up by
	# adding the remaining portion from the node with the greater number of pinned VMs.
	if ((${#spdk[@]} < spdk_cpu_num)); then
		if [[ -n $ALIGN_FROM_ALL_NODES ]] && ((${#cpus_remained[@]} > 0)); then
			printf 'Trying to get extra CPUs from all nodes\n'
			local -n node_all_cpus=cpus_remained
		else
			node=${node_sort[0]#*:}
			printf 'Trying to get extra CPUs from the dominant node%u to align: %u < %u\n' \
				"$node" "${#spdk[@]}" "$spdk_cpu_num"
			if ((cpus_exhausted[node])); then
				printf 'No CPUs available on node%u\n' "$node"
			else
				local -n node_all_cpus=node_${node}_cpu
			fi
		fi
		spdk+=("${node_all_cpus[@]::spdk_cpu_num-${#spdk[@]}}")
	fi >&2
	if ((${#spdk[@]} != spdk_cpu_num)); then
		printf 'Allocated a different number of SPDK CPUs than requested: requested %u, got %u\n' \
			"$spdk_cpu_num" "${#spdk[@]}"
	else
		printf 'Requested number of SPDK CPUs allocated: %u\n' "$spdk_cpu_num"
	fi >&2
}

_p_disk_map() {
	((${#auto_disk_map[@]} > 0)) || return 0
	printf '%s\n' "${auto_disk_map[@]}"
}

_p_cpu_map() {
	((${#auto_cpu_map[@]} > 0)) || return 0
	printf '%s\n' "${auto_cpu_map[@]}"
}

p_disk_map() {
	cat <<- DISK_MAP
		# Generated automatically by ${0##*/}
		# NVMe Drives: ${#auto_disk_map[@]} VM count: $vm_count
		$(_p_disk_map)
	DISK_MAP
}

p_vms_in_node() {
	((${#vm_numa_map[@]} > 0)) || return 0

	local node vms
	for node in "${!vm_numa_map[@]}"; do
		vms=(${!vm_numa_map[node]})
		echo "Node$node: ${#vms[@]} VMs"
	done
}

p_cpu_map() {
	local node_stats

	mapfile -t node_stats < <(p_vms_in_node)
	cat <<- CPU_MAP
		# Generated automatically by ${0##*/}
		# VM NUMA Nodes: ${#vm_numa_map[@]} VM count: $vm_count CPU Per VM: $vm_cpu_num SPDK CPU count: ${#spdk[@]}
		$(printf '# - %s\n' "${node_stats[@]}")
		$(_p_cpu_map)
	CPU_MAP
}

p_all() {
	p_disk_map
	printf '\n'
	p_cpu_map
}

fetch_env() {
	spdk_cpu_num=${spdk_cpu_num:-1}
	vm_count=${vm_count:-1}
	vm_cpu_num=${vm_cpu_num:-1}

	# Normalize
	spdk_cpu_num=$((spdk_cpu_num <= 0 ? 1 : spdk_cpu_num))
	vm_count=$((vm_count <= 0 ? 1 : vm_count))
	vm_cpu_num=$((vm_cpu_num <= 0 ? 1 : vm_cpu_num))

	cpu_out=${cpu_out:-"$PWD/auto-cpu.conf"}
	disk_out=${disk_out:-"$PWD/auto-disk.conf"}
}

help() {
	cat <<- HELP
		${0##*/}: [-p all|cpu|disk -s]

		Configuration is generated based on the system's cpu and nvme topology. Parameters
		are taken directly from the environment:

		spdk_cpu_list - list of CPUs to assign to a SPDK app
		spdk_cpu_num  - number of CPUs to use across all NUMA nodes
		                (spdk_cpu_list takes priority, default: 1)
		vm_count      - number of VMs to prepare the configuration for (default: 1)
		vm_cpu_num    - number of CPUs to assign per VM (default: 1)

		Override parameters:
		vmN_node - overrides the selected NUMA node for VM N - by default, this is
		           allocated up to the number of nvme drives
		cpu_out  - with -s, points at the location where the cpu conf is saved
		disk_out - with -s, points at the location where the disk conf is saved

		Note: VMs are pinned to nvme drives based on their NUMA location.

		Example:
		# Allocate 6 cpus for SPDK. Configure 24 VMs, 2 CPUs per VM
		$ export spdk_cpu_num=6 vm_count=24 vm_cpu_num=2
		$ ${0##*/} -p all
	HELP
}

print=""
save=no

fetch_env

while getopts :hsp: arg; do
	case "$arg" in
		h)
			help
			exit 0
			;;
		p) print=$OPTARG ;;
		s) save=yes ;;
		*) ;;
	esac
done

get_auto_cfg || exit 1

case "$print" in
	all) p_all ;;
	cpu) p_cpu_map ;;
	disk) p_disk_map ;;
	*) ;;
esac

if [[ $save == yes ]]; then
	p_cpu_map > "$cpu_out"
	p_disk_map > "$disk_out"
fi