From 2b80955c097ace9ba27631846ee49758444dff56 Mon Sep 17 00:00:00 2001
From: Michal Berger
Date: Fri, 6 Nov 2020 14:10:20 +0100
Subject: [PATCH] scripts/setup: Refactor hugepages allocation for Linux

Main changes:

- By default, allocate all hugepages on node0. On NUMA-aware systems,
  processes will most often use the default policy with the local node
  as the preferred one for allocations. This usually is node0, thus
  splitting hugepages evenly across the nodes in the default setup
  would force allocations from a remote node, impacting overall
  performance (in case hugepages on node0 run out). See 68740678e1 as
  a reference.
- Introduce HUGE_EVEN_ALLOC - force setup.sh to evenly distribute
  hugepages across all the nodes.
- Introduce HUGEPGSZ - overrides the default page size.
- Introduce CLEAR_HUGE - remove all hugepages on demand before
  allocation is performed.
- HUGENODE - this can now be a list of nodes to use. NRHUGE will be
  set across all the nodes from the list.

Change-Id: I084829edde3c416e7fc6b7b6abe369cc0631fcd7
Signed-off-by: Michal Berger
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/5042
Tested-by: SPDK CI Jenkins
Community-CI: Mellanox Build Bot
Reviewed-by: Jim Harris
Reviewed-by: Tomasz Zawadzki
---
 scripts/setup.sh | 89 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 69 insertions(+), 20 deletions(-)

diff --git a/scripts/setup.sh b/scripts/setup.sh
index fd6b3a1b0..c84a4eaed 100755
--- a/scripts/setup.sh
+++ b/scripts/setup.sh
@@ -44,12 +44,18 @@ function usage() {
 	echo
 	echo "The following environment variables can be specified."
 	echo "HUGEMEM           Size of hugepage memory to allocate (in MB). 2048 by default."
-	echo "                  For NUMA systems, the hugepages will be evenly distributed"
-	echo "                  between CPU nodes"
+	echo "                  For NUMA systems, the hugepages will be distributed on node0 by"
+	echo "                  default."
+	echo "HUGE_EVEN_ALLOC   If set to 'yes', hugepages will be evenly distributed across all"
+	echo "                  system's NUMA nodes (effectively ignoring anything set in HUGENODE)."
+	echo "                  Uses kernel's default for hugepages size."
 	echo "NRHUGE            Number of hugepages to allocate. This variable overwrites HUGEMEM."
-	echo "HUGENODE          Specific NUMA node to allocate hugepages on. To allocate"
-	echo "                  hugepages on multiple nodes run this script multiple times -"
-	echo "                  once for each node."
+	echo "HUGENODE          Specific NUMA node to allocate hugepages on. Multiple nodes can be"
+	echo "                  separated with commas - NRHUGE will be applied on each node."
+	echo "HUGEPGSZ          Size of the hugepages to use in kB. If not set, kernel's default"
+	echo "                  setting is used."
+	echo "CLEAR_HUGE        If set to 'yes', the attempt to remove hugepages from all nodes will"
+	echo "                  be made prior to allocation."
 	echo "PCI_WHITELIST"
 	echo "PCI_BLACKLIST     Whitespace separated list of PCI devices (NVMe, I/OAT, VMD, Virtio)."
 	echo "                  Each device must be specified as a full PCI address."
@@ -391,6 +397,57 @@ function cleanup_linux() {
 	unset dirs_to_clean files_to_clean opened_files
 }
 
+check_hugepages_alloc() {
+	local hp_int=$1
+	local allocated_hugepages
+
+	echo $((NRHUGE < 0 ? 0 : NRHUGE)) > "$hp_int"
+
+	allocated_hugepages=$(< "$hp_int")
+	if ((allocated_hugepages < NRHUGE)); then
+		cat <<- ERROR
+
+			## ERROR: requested $NRHUGE hugepages but $allocated_hugepages could be allocated ${2:+on node$2}.
+			## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine.
+		ERROR
+		return 1
+	fi
+}
+
+clear_hugepages() { echo 0 > /proc/sys/vm/nr_hugepages; }
+
+configure_linux_hugepages() {
+	local node system_nodes nodes_to_use
+
+	if [[ $CLEAR_HUGE == yes ]]; then
+		clear_hugepages
+	fi
+
+	if [[ $HUGE_EVEN_ALLOC == yes ]]; then
+		clear_hugepages
+		check_hugepages_alloc /proc/sys/vm/nr_hugepages
+		return 0
+	fi
+
+	for node in /sys/devices/system/node/node*; do
+		[[ -e $node ]] || continue
+		nodes[${node##*node}]=$node/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages
+	done
+
+	IFS="," read -ra nodes_to_use <<< "$HUGENODE"
+	if ((${#nodes_to_use[@]} == 0)); then
+		nodes_to_use=(0)
+	fi
+
+	for node in "${nodes_to_use[@]}"; do
+		if [[ -z ${nodes[node]} ]]; then
+			echo "Node $node doesn't exist, ignoring" >&2
+			continue
+		fi
+		check_hugepages_alloc "${nodes[node]}" "$node"
+	done
+}
+
 function configure_linux() {
 	configure_linux_pci
 	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)
@@ -402,20 +459,7 @@ function configure_linux() {
 		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
 	fi
 
-	if [ -z "$HUGENODE" ]; then
-		hugepages_target="/proc/sys/vm/nr_hugepages"
-	else
-		hugepages_target="/sys/devices/system/node/node${HUGENODE}/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages"
-	fi
-
-	echo "$NRHUGE" > "$hugepages_target"
-	allocated_hugepages=$(cat $hugepages_target)
-	if [ "$allocated_hugepages" -lt "$NRHUGE" ]; then
-		echo ""
-		echo "## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated."
-		echo "## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine."
-		exit 1
-	fi
+	configure_linux_hugepages
 
 	if [ "$driver_name" = "vfio-pci" ]; then
 		if [ -n "$TARGET_USER" ]; then
@@ -704,7 +748,12 @@ if [[ $mode == reset && $PCI_BLOCK_SYNC_ON_RESET == yes ]]; then
 fi
 
 if [[ $os == Linux ]]; then
-	HUGEPGSZ=$(($(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')))
+	if [[ -n $HUGEPGSZ && ! -e /sys/kernel/mm/hugepages/hugepages-${HUGEPGSZ}kB ]]; then
+		echo "${HUGEPGSZ}kB is not supported by the running kernel, ignoring" >&2
+		unset -v HUGEPGSZ
+	fi
+
+	HUGEPGSZ=${HUGEPGSZ:-$(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9')}
 	HUGEPGSZ_MB=$((HUGEPGSZ / 1024))
 	: ${NRHUGE=$(((HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB))}
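
For reference, a short sketch of how the new variables compose once this patch
is applied. The values are illustrative only; the commands assume the script is
run from the SPDK repo root, that setup.sh runs as root, and that 2048 kB pages
are supported by the kernel:

	# Default behaviour: allocate NRHUGE default-sized hugepages, all on node0
	sudo NRHUGE=1024 scripts/setup.sh

	# 1024 hugepages of 2048 kB each on node0 and node1 (NRHUGE is applied per node),
	# removing any previously allocated hugepages first
	sudo CLEAR_HUGE=yes HUGEPGSZ=2048 HUGENODE=0,1 NRHUGE=1024 scripts/setup.sh

	# Distribute HUGEMEM worth of hugepages evenly across all NUMA nodes,
	# ignoring HUGENODE and using the kernel's default hugepage size
	sudo HUGE_EVEN_ALLOC=yes HUGEMEM=8192 scripts/setup.sh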