Spdk/scripts/perf/pm/collect-bmc-pm

491 lines
14 KiB
Plaintext
Raw Normal View History

#!/usr/bin/env bash
# Periodically reads power usage through the local BMC (DCMI or SDR sensors, both
# queried with ipmitool) and, optionally, from the CPU RAPL energy counters. The
# collected readings and their averages are dumped to files when the script exits.

# Abort on the first unhandled command failure.
set -e
# Print each argument as a 0x-prefixed hexadecimal number padded to at least two
# digits, one number per line. Arguments may be given in any base printf accepts.
hex() {
	local -a numbers=("$@")

	# printf re-applies the format to every value it is given.
	printf '0x%02x\n' "${numbers[@]}"
}
# Succeed only when running as root (UID 0); otherwise complain on stderr and return 1.
is_root() {
	# Talking to local BMC device requires root privileges
	((UID == 0)) && return 0

	printf '%s, you need to be root to run this script\n' "$USER" >&2
	return 1
}
# Locate the ipmitool binary in PATH.
# Outputs: the resolved path on stdout when found, an error on stderr otherwise.
# Returns: 0 when ipmitool was found, 1 when it is missing.
is_ipmitool() {
	# type -P prints the path itself, which the caller captures.
	type -P ipmitool && return 0

	printf 'ipmitool not detected, cannot run commands against the BMC\n' >&2
	return 1
}
# Best-effort load of the core IPMI kernel modules. Always returns 0.
ipmi_load() {
	local -a drivers=(ipmi_si ipmi_devintf ipmi_msghandler)

	# Silently attempt to load the drivers - a failure here is deliberately ignored
	# since the caller verifies later on whether a device actually showed up.
	if ! modprobe -qa "${drivers[@]}"; then
		return 0
	fi
}
# Verify that the kernel detected and registered at least one BMC under the ipmi
# platform and that ipmitool is available to talk to it.
# Globals:  man_id, prod_id, dev_id, ipmi_ver, platform, board, ipmitool (written)
# Outputs:  BMC details or an error message on stderr
# Returns:  0 when a KCS BMC and ipmitool are present, 1 otherwise
ipmi_supported() {
	local ipmi=/sys/class/ipmi/ipmi0
	# Interface type reported by the kernel. Kept local so it does not leak into
	# the global scope.
	local iface_type
	# Keep these details global for easy access if needed.
	local -g man_id prod_id dev_id ipmi_ver platform board ipmitool

	ipmi_load

	if [[ ! -e $ipmi ]]; then
		printf 'BMC not detected. Please, make sure your platform is IPMI-compatible\n' >&2
		return 1
	fi

	# Look for KCS specifically as this is the type of the interface the script
	# was tested against.
	iface_type=$(< "$ipmi/device/type")
	if [[ $iface_type != kcs ]]; then
		printf 'No supported BMC interface detected (%s) - only KCS is supported\n' "$iface_type" >&2
		return 1
	fi

	man_id=$(< "$ipmi/device/bmc/manufacturer_id")
	prod_id=$(< "$ipmi/device/bmc/product_id")
	dev_id=$(hex "$(< "$ipmi/device/bmc/device_id")")
	ipmi_ver=$(< "$ipmi/device/bmc/ipmi_version")

	# DMI details are optional - fall back to "unknown" in the summary below.
	if [[ -e /sys/class/dmi/id/board_vendor ]]; then
		platform=$(< /sys/class/dmi/id/board_vendor)
	fi
	if [[ -e /sys/class/dmi/id/board_name ]]; then
		board=$(< /sys/class/dmi/id/board_name)
	fi

	# Keep output similar to ipmi_si's
	printf '%s\n' \
		"BMC detected, details below:" \
		"Manufacturer ID: $man_id" \
		"Product ID: $prod_id" \
		"Device ID: $dev_id" \
		"IPMI Version: $ipmi_ver" \
		"Platform: ${platform:-unknown}" \
		"Board: ${board:-unknown}" >&2

	# Verify if we have proper tools to work with. The resolved path is what the
	# rest of the script executes.
	ipmitool=$(is_ipmitool)
}
# Send a raw IPMI request through ipmitool and print the response bytes.
# For the majority of commands we use raw payload to not depend on specific ipmitool
# version and the way how it interprets/parses the returned data. This also allows us
# to inspect the integrity of data more closely to make sure we don't report
# nonsensical values to the user.
# Globals:   ipmitool (read) - command used to reach the BMC
# Arguments: raw request bytes, passed verbatim to "ipmitool raw"
# Outputs:   one 0x-prefixed response byte per line on stdout
# Returns:   0 on success, ipmitool's exit status when the request failed
ipmiraw() {
	local out
	local -a rsp=()

	# Propagate the failure instead of carrying on with an empty response - hex()
	# prints 0x00 when called without arguments, which callers would then take
	# for real data.
	out=$("$ipmitool" raw "$@" 2> /dev/null) || return

	# Split the response on any whitespace (it may span several lines) without
	# subjecting it to pathname expansion. read reports EOF here, hence the guard.
	read -r -d '' -a rsp <<< "$out" || true

	# Nothing came back - emit nothing rather than a made-up byte.
	((${#rsp[@]} > 0)) || return 0

	# Slap hex prefix to work with proper base
	rsp=("${rsp[@]/#/0x}")

	hex "${rsp[@]}"
}
# Issue a DCMI command through ipmiraw.
# Arguments: $1 - DCMI command byte, remaining arguments - command data bytes
# Outputs:   whatever ipmiraw prints for the request
dcmiraw() {
	# Every request is sent as 0x2c <command> 0xdc <data...>. Per the DCMI spec
	# these are the group extension netfn and the DCMI group id.
	ipmiraw 0x2c "$1" 0xdc "${@:2}"
}
# Decode the averaging time periods advertised in enhanced_power_attr and list them.
# Globals:  enhanced_power_attr (read) - [4] holds the number of periods, the
#           period bytes follow from [5] onwards
#           available_time_periods (written) - human readable periods, [0] is "NOW"
#           available_time_periods_in_seconds (written) - period length in seconds
#           mapped to the raw period byte
# Outputs:  the list of periods on stderr
print_dcmi_available_time_periods() {
	local -g available_time_periods=("NOW")
	local -g available_time_periods_in_seconds=()
	# Both tables are indexed by the unit id stored in the two top bits of a period byte.
	local -a unit_names=(seconds minutes hours days)
	local -a unit_secs=(1 60 3600 86400)
	local unit_bits=0xc0 first=5
	local total=${enhanced_power_attr[4]}
	local idx raw unit_id duration

	for ((idx = first; total > 0 && idx < first + total; idx++)); do
		raw=${enhanced_power_attr[idx]}
		unit_id=$((raw >> 6))
		# Whatever is left after masking out the unit bits is the duration.
		duration=$((raw & ~unit_bits))
		# A zero duration is not a usable period.
		((duration != 0)) || continue
		available_time_periods[idx]="$duration ${unit_names[unit_id]:-unknown}"
		available_time_periods_in_seconds[duration * unit_secs[unit_id]]=$raw
	done

	{
		printf 'Available averaging time periods to request:\n'
		printf ' - %s\n' "${available_time_periods[@]}"
	} >&2
}
# Verify if the BMC conforms to the DCMI spec and offers power management.
# Globals:  enhanced_power_attr (written) - raw Enhanced System Power Statistics
#           attributes, left empty when the BMC does not return them
# Outputs:  progress and error messages on stderr
# Returns:  0 when DCMI power readings can be used, 1 otherwise
dcmi_power_support() {
	local caps

	# Table 6-2, Get DCMI Capabilities Command Format
	caps=($(dcmiraw 0x1 0x1)) || {
		printf 'Cannot determine if BMC supports DCMI Power Management capability\n' >&2
		return 1
	}

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Supported DCMI Capabilities:
	#    - Byte 2 Platform capabilities: [0] Power management
	if (((caps[5] & 0x1) == 0)); then
		printf 'BMC does not provide DCMI Power Mangament capability\n' >&2
		return 1
	fi

	# Enhanced System Power Statistics attributes allow to request readings averaged
	# over a time period (last 5 seconds, 30 minutes, 1 hour and so on), which gives
	# a more detailed view on power usage. Without them only the current ("NOW")
	# reading, which should be always available, can be used.
	local -g enhanced_power_attr=()

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Enhanced System Power Statistics attributes
	if enhanced_power_attr=($(dcmiraw 0x1 0x5)); then
		print_dcmi_available_time_periods
	fi

	printf 'Using DCMI Power Management\n' >&2
}
# Fallback which only some platforms may provide (confirmed PowerEdge and CYP): look
# for full, threshold sensors which report overall power usage in Watts. Different
# BMCs may describe such sensors differently in their SDRs so this is not 100%
# reliable - to pick up a proper sensor the search is narrowed down to specific
# entities (System Board or Power Supply). Readings from these sensors should be
# considered as "NOW" readings (without access to min, max readings).
# Globals:  ipmitool, sdr_cache, extra_power_sensors (read), power_sensors (written)
# Outputs:  progress and error messages on stderr
# Returns:  0 when at least one sensor is known, 1 otherwise
sdr_power_support() {
	local -g power_sensors=()
	local name units state owner

	# Cache SDR to speed up sensor readings
	if [[ ! -f $sdr_cache ]]; then
		printf 'Saving SDR cache at %s\n' "$sdr_cache" >&2
		"$ipmitool" sdr dump "$sdr_cache" > /dev/null
	fi

	# Sensors requested by the user go first; an empty list adds nothing.
	power_sensors+=("${extra_power_sensors[@]}")

	# Each CSV line is consumed as: name, <skipped>, unit, status, <skipped>, entity, rest
	while IFS="," read -r name _ units state _ owner _; do
		[[ $units == Watts && $state == ok ]] || continue
		case "$owner" in
			"System Board" | "Power Supply") power_sensors+=("$name") ;;
		esac
	done < <("$ipmitool" -S "$sdr_cache" -vc sdr list full 2>&1)

	if ((${#power_sensors[@]} == 0)); then
		printf 'Cannot locate power sensors\n' >&2
		return 1
	fi

	printf 'Using SDR (Power sensors: %s)\n' "${power_sensors[*]}" >&2
}
# Select the source of the system-wide power readings.
# Globals:  include_cpu, interface (read)
#           support (written) - "dcmi" or "sdr" when a BMC source was selected
#           cpu_support (written) - 1 when CPU readings were requested and RAPL exists
# Outputs:  a note on stderr when the BMC offers no power management support
# Returns:  1 when neither the BMC nor the CPU can provide readings, 0 otherwise
power_support() {
	local -g support cpu_support=0

	if ((include_cpu == 1)); then
		if rapl_supported; then
			cpu_support=1
		fi
	fi

	# An explicitly requested interface overrides the detection below.
	case "$interface" in
		dcmi | sdr)
			"${interface}_power_support"
			support=$interface
			return 0
			;;
	esac

	# No preference - DCMI has priority over SDR.
	if dcmi_power_support; then
		support=dcmi
		return 0
	fi

	if sdr_power_support; then
		support=sdr
		return 0
	fi

	printf 'BMC does not provide Power Management support, cannot gather system-wide power measurements\n' >&2
	((cpu_support)) || return 1

	printf 'Only CPU measurements will be provided\n' >&2
	return 0
}
# Request a power reading over DCMI, record it and log it.
# Globals:  interval, available_time_periods_in_seconds (read)
#           _DCMI, _DCMI_min, _DCMI_max, _DCMI_avg (appended to)
#           power_readings (written) - maps DCMI* labels to the arrays above
# Outputs:  one line per statistic, or an error note, on stderr
get_dcmi_now_reading() {
	local rsp ts timeframe reading=0 min max avg
	local mode=01h get_avg=0 period="" label
	# Table 6-16, Get Power Reading Command:
	local -a get_cmd=(0x2 0x1 0x0 0x0)

	# System Power Statistics mode provides the "NOW" reading by default. In case
	# interval matches one of the periods supported by Enhanced System Power
	# Statistics, that mode is used instead to obtain extra min, max, avg statistics.
	if [[ $interval =~ ^[0-9]+$ ]]; then
		period=${available_time_periods_in_seconds[interval]}
	fi
	if [[ -n $period ]]; then
		get_cmd=(0x2 0x2 "$period" 0x0)
		get_avg=1 mode=02h
	fi

	if ! rsp=($(dcmiraw "${get_cmd[@]}")); then
		printf 'DCMI reading: error\n' >&2
		return 0
	fi

	# Multi-byte fields arrive least significant byte first.
	# Note that the BMC timestamp depends on the hwclock setup which we then attempt
	# to represent in UTC.
	ts=$((rsp[9] | rsp[10] << 8 | rsp[11] << 16 | rsp[12] << 24))
	# This is interpreted differently by different BMCs so for now we make a note of
	# it but don't present it to the user.
	timeframe=$((rsp[13] | rsp[14] << 8 | rsp[15] << 16 | rsp[16] << 24))
	reading=$((rsp[1] | rsp[2] << 8))

	if ((get_avg)); then
		min=$((rsp[3] | rsp[4] << 8))
		max=$((rsp[5] | rsp[6] << 8))
		avg=$((rsp[7] | rsp[8] << 8))
		_DCMI_min+=("$min")
		_DCMI_max+=("$max")
		_DCMI_avg+=("$avg")
		power_readings["DCMI_MIN"]="_DCMI_min[@]"
		power_readings["DCMI_MAX"]="_DCMI_max[@]"
		power_readings["DCMI_AVG"]="_DCMI_avg[@]"
	fi

	_DCMI+=("$reading")
	power_readings["DCMI"]="_DCMI[@]"

	# Each label doubles as the name of the local variable holding its value;
	# statistics which were not requested are still unset and get skipped.
	for label in min max avg reading; do
		[[ -n ${!label} ]] || continue
		printf '(%s) DCMI %s (mode: %s): %u Watts (interval: %ss)\n' \
			"$(utc "$ts")" "$label" "$mode" "${!label}" "$interval" >&2
	done
}
# Read the current value of every known power sensor, record it and log it.
# Globals:  power_sensors, ipmitool, sdr_cache, interval (read)
#           _sensor<N>_readings (appended to) - readings of the sensor at index N
#           power_readings (written) - maps a sensor name to its readings array
# Outputs:  one line per sensor on stderr
# Returns:  1 when there are no sensors to read, 0 otherwise
get_sdr_now_reading() {
	local idx reading unit ts
	local numeric='^-?[0-9]+([.][0-9]+)?$'

	if ((${#power_sensors[@]} == 0)); then
		printf 'No power sensors were provided\n' >&2
		return 1
	fi

	for idx in "${!power_sensors[@]}"; do
		ts=$(utc)
		reading="" unit=""
		# The reading and its unit are taken from the second and third CSV field.
		# Only plain numbers are recorded: whatever else the BMC reports for a
		# sensor must neither end up in the average nor be evaluated as shell code.
		if IFS="," read -r _ reading unit _ \
			< <("$ipmitool" -c -S "$sdr_cache" sdr get "${power_sensors[idx]}") 2> /dev/null \
			&& [[ $reading =~ $numeric ]]; then
			# Append through a nameref instead of eval. The target is a global
			# array so the readings outlive this call.
			local -n sensor_readings=_sensor${idx}_readings
			sensor_readings+=("$reading")
			power_readings["${power_sensors[idx]}"]="_sensor${idx}_readings[@]"
			reading+=" $unit"
		else
			reading=error
		fi
		printf '(%s) Sensor %s reading: %s (interval %ss)\n' \
			"$ts" \
			"${power_sensors[idx]}" \
			"$reading" \
			"$interval" >&2
	done
}
# Succeed when the intel-rapl powercap control type is exposed in sysfs.
rapl_supported() {
	local rapl_root=/sys/class/powercap/intel-rapl

	test -e "$rapl_root"
}
# Derive the power drawn by every RAPL domain since the previous call, record it
# and log it.
# power_uw is usually not available so we need to rely on energy_uj. It's also rarely
# rw so we can't zero it out, hence we need to keep track of the previous counter
# value. For details see kernel documentation (powercap.rst).
# Globals:   interval (read)
#            socket_<id>_uj (appended to) - raw energy counter samples per domain
#            _socket<id>_readings (appended to) - computed readings in Watts
#            power_readings (written) - maps "<domain>-<index>" to its readings array
# Arguments: $1 - powercap sysfs directory to scan, defaults to /sys/class/powercap
# Outputs:   one line per domain on stderr
get_cpu_socket_reading() {
	local rapl=${1:-/sys/class/powercap}
	local socket socket_idx _socket_idx socket_name
	local ts reading delta max_range

	ts=$(utc)

	for socket in "$rapl"/intel-rapl:*; do
		[[ -e $socket ]] || continue

		# Work on the directory name alone so a ':' in the parent path cannot
		# confuse the index extraction.
		socket_idx=${socket##*/}
		socket_idx=${socket_idx#*:}
		socket_name=$(< "$socket/name")

		# Adjust for different domains, see linux/intel_rapl.h
		case "$socket_name" in
			dram | core | uncore) _socket_idx=${socket_idx//:/_} socket_idx=${socket_idx%:*} ;;
			package-*) _socket_idx=$socket_idx socket_name=socket ;;
			psys*) _socket_idx=$socket_idx socket_name=platform ;;
			# Unknown domain - still derive its own id rather than reusing the
			# one left over from the previous iteration.
			*) _socket_idx=${socket_idx//:/_} ;;
		esac

		local -n socket_uj=socket_${_socket_idx}_uj
		socket_uj+=("$(< "$socket/energy_uj")")

		# We need at least two readings for comparison
		((${#socket_uj[@]} > 1)) || continue

		delta=$((${socket_uj[-1]} - ${socket_uj[-2]}))
		if ((delta < 0)); then
			# The counter wrapped around between the two samples. Its range is
			# needed to recover the actual amount of energy used; without it the
			# sample is dropped rather than reported as negative power.
			if [[ ! -r $socket/max_energy_range_uj ]]; then
				printf 'Skipping %s %s reading: energy counter wrapped around\n' \
					"$socket_name" "$socket_idx" >&2
				continue
			fi
			max_range=$(< "$socket/max_energy_range_uj")
			delta=$((delta + max_range))
		fi

		# Convert to Watts - use bc since $interval can be an actual float
		reading=$(bc <<< "scale=2; $delta / 1000000 / $interval")
		[[ -n $reading ]] || continue
		# bc drops the leading zero (.50) - put it back so the value can be parsed
		# as a regular number later on.
		if [[ $reading == .* ]]; then
			reading=0$reading
		fi

		# Append through a nameref instead of eval so the computed value is never
		# evaluated as shell code.
		local -n socket_readings=_socket${_socket_idx}_readings
		socket_readings+=("$reading")
		power_readings["$socket_name-$socket_idx"]="_socket${_socket_idx}_readings[@]"

		printf '(%s) CPU %s %s reading: %s Watts (interval: %ss)\n' \
			"$ts" \
			"$socket_name" \
			"$socket_idx" \
			"$reading" \
			"$interval" >&2
	done
}
# Take a single reading from whichever BMC source was selected in $support.
# Does nothing (and succeeds) when no BMC source is in use.
get_now_reading() {
	if [[ $support == dcmi ]]; then
		get_dcmi_now_reading
	elif [[ $support == sdr ]]; then
		get_sdr_now_reading
	fi
}
# Write all the collected readings, and their average, to per-sensor files.
# Globals:  power_readings (read) - maps a sensor label to "array_name[@]"
#           output_dir, prefix (read)
# Outputs:  progress messages on stderr; for every sensor the files
#           <output_dir>/[<prefix>_]avg_<sensor>.bmc.pm.txt and
#           <output_dir>/[<prefix>_]all_<sensor>.bmc.pm.txt
# Returns:  1 when nothing was collected, 0 otherwise
dump_readings() {
	local sensor reading ref avg total whole base
	local -a readings

	((${#power_readings[@]} > 0)) || return 1
	printf 'Dumping average sensors reading from %s\n' "${!power_readings[*]}" >&2

	for sensor in "${!power_readings[@]}"; do
		# The map holds the name of the array, not the values - expand indirectly.
		ref=${power_readings["$sensor"]}
		readings=("${!ref}")
		if ((${#readings[@]} == 0)); then
			printf 'No readings available for %s sensor\n' "$sensor" >&2
			continue
		fi

		total=0
		for reading in "${readings[@]}"; do
			# Only the integer part takes part in the average. A reading without
			# one (e.g. ".50") counts as 0 instead of breaking the arithmetic.
			whole=0
			if [[ $reading =~ ^-?[0-9]+ ]]; then
				whole=${BASH_REMATCH[0]}
			fi
			# Plain assignment on purpose: (( total += x )) returns non-zero when
			# the sum is 0, which would abort the whole dump under set -e.
			total=$((total + whole))
		done
		avg=$((total / ${#readings[@]}))

		readings+=("Total: ${#readings[@]}")
		base=$output_dir/${prefix:+${prefix}_}
		printf '%u\n' "$avg" > "${base}avg_${sensor}.bmc.pm.txt"
		printf '%s\n' "${readings[@]}" > "${base}all_${sensor}.bmc.pm.txt"
		printf 'Dumped avg to %s\n' "${base}avg_${sensor}.bmc.pm.txt" >&2
		printf 'Dumped all to %s\n' "${base}all_${sensor}.bmc.pm.txt" >&2
	done
}
# Print a date in UTC.
# Arguments: $1 - optional timestamp in seconds since the epoch; the current time
#            is used when it is missing or empty
utc() {
	local -a date_args=(--utc)

	if [[ -n $1 ]]; then
		date_args+=("-d@$1")
	fi

	date "${date_args[@]}"
}
# Exit handler: drop the SDR cache unless it was requested to keep it, then dump
# everything that was collected. Returns whatever dump_readings returns.
cleanup() {
	if [[ $remove_sdr_cache == yes && -f $sdr_cache ]]; then
		rm "$sdr_cache"
	fi

	dump_readings
}
# Main loop: take a reading from every enabled source, wait $interval seconds, repeat.
# Globals:  count (read) - number of iterations, 0 or less means run indefinitely
#           cpu_support, interval (read)
collect_readings() {
	local remaining=$count

	# We need at least two readings to get a meaningful data out of the CPU counters
	if ((remaining == 1 && cpu_support)); then
		remaining=$((remaining + 1))
	fi

	# With a positive count the loop ends once the countdown is used up; otherwise
	# the countdown is not even consulted and the loop never ends on its own.
	until ((count > 0 && remaining-- == 0)); do
		get_now_reading
		if ((cpu_support)); then
			get_cpu_socket_reading
		fi
		sleep "${interval}s"
	done
}
# Print the usage text on stdout.
# The synopsis mirrors the option string given to getopts: -l and -x are plain
# switches which take no argument.
help() {
	cat <<- HELP
Usage: $0 [-h] [-d dir] [-i sdr|dcmi] [-s SENSOR_NAME] [-t interval] [-x] [-l] [-p prefix] [-c count] [-r]
-h - Print this message.
-d - Directory where the results should be saved. Default is /tmp.
-i - Type of interface to use for requesting power usage. "sdr" or "dcmi".
If not set, available interface is used ("dcmi" has priority).
-t - How long to wait before each get power command in seconds. In case
this value matches one of supported averaging time periods special
variant of the command will be used to obtain the reading - this
variant is used only with the "dcmi" interface. Default is 1s.
-s - In case "sdr" interface is in use, try to read data from SENSOR_NAME.
-x - In case "sdr" interface is in use, don't remove SDR cache. This can
speed up subsequent runs of the script.
-l - Save output of the script to a log file (dir/[prefix_]${0##*/}.bmc.pm.log).
-p - Add prefix to saved files.
-c - Read power usage count times. 0 is the default and it means to run
indefinitely.
-r - Include readings from CPU sockets (RAPL-dependent)
When started, ${0##*/} will enter loop to continuously read power usage from either
DCMI interface or dedicated Watts sensors every interval. Each reading will be
logged to stderr. Upon termination, average power usage will be dumped to /tmp or
directory set by -d.
HELP
}
# Defaults - each of them can be overridden from the command line.
output_dir=/tmp
interval=1
remove_sdr_cache=yes
log_to_file=no
prefix=""
count=0
include_cpu=0
# Empty means: pick whichever interface the BMC supports.
interface=""
# Maps a sensor label to the name ("array[@]") of the global array with its readings.
declare -A power_readings=()
declare -a extra_power_sensors=()

while getopts :hi:s:d:t:xlp:c:r arg; do
	case "$arg" in
		h)
			help
			exit 0
			;;
		d) output_dir=$OPTARG ;;
		s) extra_power_sensors+=("$OPTARG") ;;
		i) interface=${OPTARG,,} ;;
		t) interval=$OPTARG ;;
		x) remove_sdr_cache=no ;;
		l) log_to_file=yes ;;
		p) prefix=$OPTARG ;;
		c) count=$OPTARG ;;
		r) include_cpu=1 ;;
		# Bogus options are still tolerated, but no longer silently.
		:) printf 'Option -%s requires an argument, ignoring\n' "$OPTARG" >&2 ;;
		*) printf 'Unknown option -%s, ignoring\n' "$OPTARG" >&2 ;;
	esac
done

# count ends up in arithmetic context where anything but an integer either fails
# or silently evaluates to 0, i.e. "run indefinitely".
if [[ ! $count =~ ^-?[0-9]+$ ]]; then
	printf 'Invalid count (%s), an integer is expected\n' "$count" >&2
	exit 1
fi

case "$interface" in
	"" | dcmi | sdr) ;;
	*) printf 'Unsupported interface (%s), available interface will be used instead\n' "$interface" >&2 ;;
esac

# Root is required only to talk to the BMC, hence the check comes after the option
# parsing - this way -h works for every user.
is_root

declare -r sdr_cache=$output_dir/sdr.cache
declare -r log_file=${prefix:+${prefix}_}${0##*/}.bmc.pm.log

mkdir -p "$output_dir"
if [[ $log_to_file == yes ]]; then
	printf 'Redirecting to %s\n' "$output_dir/$log_file" >&2
	exec > "$output_dir/$log_file" 2>&1
fi

# Dump whatever was collected regardless of how the script terminates.
trap 'cleanup' EXIT

ipmi_supported
power_support
collect_readings