Signed-off-by: Michal Berger <michal.berger@intel.com> Change-Id: I5a4071441b6eed53553624c5ee587b7c91360eb5 Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/16633 Reviewed-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com> Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Jim Harris <james.r.harris@intel.com>
510 lines
14 KiB
Bash
Executable File
510 lines
14 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# SPDX-License-Identifier: BSD-3-Clause
|
|
# Copyright (C) 2022 Intel Corporation
|
|
# All rights reserved.
|
|
|
|
# Abort the script as soon as any unguarded command fails.
set -o errexit
|
|
|
|
hex() {
	# Print each argument as a zero-padded (at least two digits), 0x-prefixed
	# hexadecimal number, one per line.
	local fmt='0x%02x\n'

	printf "$fmt" "$@"
}
|
|
|
|
calc() {
	# Evaluate the expression made of all arguments with bc, using a scale of 2.
	printf 'scale=2; %s\n' "$*" | bc
}
|
|
|
|
is_root() {
	# Talking to the local BMC device requires root privileges. Succeeds for
	# UID 0, otherwise complains on stderr and returns 1.
	if ((UID == 0)); then
		return 0
	fi

	printf '%s, you need to be root to run this script\n' "$USER" >&2
	return 1
}
|
|
|
|
is_ipmitool() {
	# Look ipmitool up in PATH. On success its full path is written to stdout
	# (callers capture it); otherwise an error goes to stderr and 1 is returned.
	type -P ipmitool && return 0

	printf 'ipmitool not detected, cannot run commands against the BMC\n' >&2
	return 1
}
|
|
|
|
ipmi_load() {
	# Quietly try to load the core ipmi drivers. A failure here is not fatal -
	# presence of the actual device is verified later on.
	if ! modprobe -qa ipmi_si ipmi_devintf ipmi_msghandler; then
		return 0
	fi
}
|
|
|
|
ipmi_supported() {
	# Verify if kernel detected and registered at least one BMC under
	# the ipmi platform. Look for KCS specifically as this is the type
	# of the interface the script was tested against.
	#
	# On success the BMC details are printed to stderr and the globals listed
	# below are populated. Returns non-zero when ipmi0 is missing, when the
	# interface is not KCS or when ipmitool cannot be found in PATH.

	local ipmi=/sys/class/ipmi/ipmi0
	# Interface type is needed only here - keep it local instead of leaking a
	# global named "type".
	local iface_type

	# Keep these details global for easy access if needed.
	local -g man_id prod_id dev_id ipmi_ver platform board ipmitool

	ipmi_load

	if [[ ! -e $ipmi ]]; then
		printf 'BMC not detected. Please, make sure your platform is IPMI-compatible\n' >&2
		return 1
	fi

	iface_type=$(< "$ipmi/device/type")

	if [[ $iface_type != kcs ]]; then
		printf 'No supported BMC interface detected (%s) - only KCS is supported\n' "$iface_type" >&2
		return 1
	fi

	man_id=$(< "$ipmi/device/bmc/manufacturer_id")
	prod_id=$(< "$ipmi/device/bmc/product_id")
	dev_id=$(hex "$(< "$ipmi/device/bmc/device_id")")
	ipmi_ver=$(< "$ipmi/device/bmc/ipmi_version")

	# DMI details are optional, "unknown" is reported when they are missing.
	if [[ -e /sys/class/dmi/id/board_vendor ]]; then
		platform=$(< /sys/class/dmi/id/board_vendor)
	fi

	if [[ -e /sys/class/dmi/id/board_name ]]; then
		board=$(< /sys/class/dmi/id/board_name)
	fi

	# Keep output similar to ipmi_si's
	printf '%s\n' \
		"" \
		"BMC detected, details below:" \
		"Manufacturer ID: $man_id" \
		"Product ID: $prod_id" \
		"Device ID: $dev_id" \
		"IPMI Version: $ipmi_ver" \
		"Platform: ${platform:-unknown}" \
		"Board: ${board:-unknown}" \
		"" >&2

	# Verify if we have proper tools to work with. is_ipmitool prints the path
	# of the binary which is what gets stored here; its status is also the
	# status of this function.
	ipmitool=$(is_ipmitool)
}
|
|
|
|
ipmiraw() {
	# For the majority of commands we use raw payload to not depend on specific ipmitool version
	# and the way how it interprets/parses the returned data. This also allows us to inspect the
	# integrity of data more closely to make sure we don't report nonsensical values to the user.
	#
	# All arguments are passed to "ipmitool raw". Response bytes are printed
	# one per line in 0x%02x form. Returns 1, printing nothing, when ipmitool
	# fails so callers can tell a failed request from a valid response.

	local out rsp

	# Capture the status explicitly - without it a failed command would end up
	# as a bogus, successful "0x00" response.
	out=$("$ipmitool" raw "$@" 2> /dev/null) || return 1

	# Split the response into separate bytes
	rsp=($out)
	# hex() called without arguments prints 0x00 - don't fabricate a byte for
	# an empty response.
	((${#rsp[@]} > 0)) || return 0

	# Slap hex prefix to work with proper base
	rsp=("${rsp[@]/#/0x}")

	hex "${rsp[@]}"
}
|
|
|
|
dcmiraw() {
	# Issue a raw request through ipmiraw: 0x2c goes first, then the command
	# byte ($1), then 0xdc, then the remaining arguments as the payload.
	# NOTE(review): 0x2c/0xdc are presumably the group extension NetFn and the
	# DCMI group extension id - confirm against the DCMI spec.
	ipmiraw 0x2c "$1" 0xdc "${@:2}"
}
|
|
|
|
print_dcmi_available_time_periods() {
	# Decode the averaging time periods found in the global enhanced_power_attr
	# array and list them on stderr. Element 4 holds the number of periods and
	# the periods themselves start at element 5. In each period the two top
	# bits select the unit and the remaining six bits hold the duration.
	#
	# Sets two global arrays:
	#  - available_time_periods: human readable descriptions ("NOW" at index 0)
	#  - available_time_periods_in_seconds: raw period byte indexed by the
	#    duration converted to seconds

	local period_count=${enhanced_power_attr[4]}
	local -a unit_names=(seconds minutes hours days)
	local -a unit_secs=(1 60 3600 86400)
	local first=5 idx amount unit_id secs mask=0xc0

	local -g available_time_periods=("NOW")
	local -g available_time_periods_in_seconds=()

	for ((idx = first; idx < first + period_count; idx++)); do
		amount=$((enhanced_power_attr[idx] & ~mask))
		unit_id=$((enhanced_power_attr[idx] >> 6))
		secs=$((amount * unit_secs[unit_id]))
		# Periods with zero duration are not listed
		((amount != 0)) || continue
		available_time_periods[idx]="$amount ${unit_names[unit_id]:-unknown}"
		available_time_periods_in_seconds[secs]=${enhanced_power_attr[idx]}
	done

	{
		printf '\nAvailable averaging time periods to request:\n'
		printf ' - %s\n' "${available_time_periods[@]}"
		printf '\n'
	} >&2
}
|
|
|
|
dcmi_power_support() {
	# Verify if the BMC conforms to the DCMI spec and provides the Power
	# Management capability. Returns 1 when it doesn't or when that cannot be
	# determined. On success the global enhanced_power_attr array holds the
	# response to the Enhanced System Power Statistics attributes query.
	local rsp

	# Table 6-2, Get DCMI Capabilities Command Format
	#
	# The capability bit is read from rsp[5] - a shorter response cannot be
	# trusted, so it is treated the same way as a failed request.
	if ! rsp=($(dcmiraw 0x1 0x1)) || ((${#rsp[@]} < 6)); then
		printf 'Cannot determine if BMC supports DCMI Power Management capability\n' >&2
		return 1
	fi

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Supported DCMI Capabilities:
	#    - Byte 2 Platform capabilities: [0] Power management
	if ((!(rsp[5] & (1 << 0)))); then
		printf 'BMC does not provide DCMI Power Management capability\n' >&2
		return 1
	fi

	# Check if BMC provides Enhanced System Power Statistics attributes - this allows to issue
	# requests for power readings at averaging time period, e.g. from last 5 seconds, 30 minutes,
	# 1 hour and so on. With this we can provide more detailed view on power usage within a
	# specific period of time. Without it, we need to depend only on current reading that should
	# be always available (the "NOW" reading).

	local -g enhanced_power_attr=()

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Enhanced System Power Statistics attributes
	if enhanced_power_attr=($(dcmiraw 0x1 0x5)); then
		print_dcmi_available_time_periods
	fi

	printf 'Using DCMI Power Management\n' >&2
}
|
|
|
|
sdr_power_support() {
	# Fallback for platforms without DCMI (confirmed on PowerEdge and CYP).
	# A full, threshold sensor reporting overall power usage in Watts is
	# searched for. SDRs may describe such sensors differently across BMCs, so
	# this is not 100% reliable - to improve the odds only sensors tied to the
	# "System Board" or "Power Supply" entity are accepted. Readings from such
	# sensors are "NOW" readings (no min, max available).
	#
	# Sensors requested by the user (extra_power_sensors) are always included.
	# The result lands in the global power_sensors array. Returns 1 if it ends
	# up empty.

	local name unit_name state owner

	local -g power_sensors=("${extra_power_sensors[@]}")

	# SDR cache speeds up sensor readings - create it on first use
	if [[ ! -f $sdr_cache ]]; then
		printf 'Saving SDR cache at %s\n' "$sdr_cache" >&2
		"$ipmitool" sdr dump "$sdr_cache" > /dev/null
	fi

	# CSV columns used: 1 - sensor name, 3 - unit, 4 - status, 6 - entity
	while IFS="," read -r name _ unit_name state _ owner _; do
		if [[ $unit_name != Watts || $state != ok ]]; then
			continue
		fi
		case "$owner" in
			"System Board" | "Power Supply") power_sensors+=("$name") ;;
		esac
	done < <("$ipmitool" -S "$sdr_cache" -vc sdr list full 2>&1)

	if ((${#power_sensors[@]} == 0)); then
		printf 'Cannot locate power sensors\n' >&2
		return 1
	fi

	printf 'Using SDR (Power sensors: %s)\n' "${power_sensors[*]}" >&2
}
|
|
|
|
power_support() {
	# Select the source of system-wide power readings and store its name in
	# the global "support" (dcmi or sdr). When -i was used, the requested
	# interface is set up unconditionally, otherwise DCMI is tried first and
	# SDR second. The global cpu_support is set to 1 when CPU readings were
	# requested and RAPL is present.
	#
	# Returns 1 only if neither the BMC nor the CPU can provide any readings.
	local -g support cpu_support=0

	if ((include_cpu == 1)); then
		if rapl_supported; then
			cpu_support=1
		fi
	fi

	# Interface forced by the user - override the autodetection
	if [[ $interface == dcmi || $interface == sdr ]]; then
		"${interface}_power_support"
		support=$interface
		return 0
	fi

	if dcmi_power_support; then
		support=dcmi
		return 0
	fi

	if sdr_power_support; then
		support=sdr
		return 0
	fi

	printf 'BMC does not provide Power Management support, cannot gather system-wide power measurements\n' >&2
	((cpu_support)) || return 1

	printf 'Only CPU measurements will be provided\n' >&2
	return 0
}
|
|
|
|
get_dcmi_now_reading() {
	# Request a single power reading through DCMI, log it to stderr and append
	# it to the global _DCMI array (and _DCMI_min, _DCMI_max, _DCMI_avg when
	# statistics are available). The arrays are registered in power_readings
	# under the DCMI, DCMI_MIN, DCMI_MAX and DCMI_AVG keys.
	#
	# A failed request or a response too short to be decoded is reported as
	# an error and is not recorded, so it cannot skew the averages.
	local rsp reading=0 max min avg ts timeframe mode=01h
	local get_cmd get_avg=0 print

	# Table 6-16, Get Power Reading Command:
	get_cmd=(0x2 0x1 0x0 0x0)

	# We use System Power Statistics mode to get the "NOW" reading by default. In case
	# interval matches one supported by Enhanced System Power Statistics we use that
	# mode to obtain extra min, max, avg statistics.
	if [[ $interval =~ ^[0-9]+$ && -n ${available_time_periods_in_seconds[interval]} ]]; then
		get_cmd=(0x2 0x2 "${available_time_periods_in_seconds[interval]}" 0x0)
		get_avg=1
		mode=02h
	fi

	# The fields decoded below (readings and the timestamp) span rsp[1] to
	# rsp[12] - anything shorter is not a usable response.
	if ! rsp=($(dcmiraw "${get_cmd[@]}")) || ((${#rsp[@]} < 13)); then
		printf 'DCMI reading: error\n' >&2
		return 0
	fi

	# Note that the BMC timestamp depends on the hwclock setup which we then attempt
	# to represent in UTC.
	ts=$((rsp[12] << 24 | rsp[11] << 16 | rsp[10] << 8 | rsp[9]))
	# This is interpreted differently by different BMCs so for now we make a note of
	# it but don't present it to the user.
	timeframe=$((rsp[16] << 24 | rsp[15] << 16 | rsp[14] << 8 | rsp[13]))
	# Multi-byte fields are stored least significant byte first
	reading=$((rsp[2] << 8 | rsp[1]))

	if ((get_avg == 1)); then
		min=$((rsp[4] << 8 | rsp[3]))
		max=$((rsp[6] << 8 | rsp[5]))
		avg=$((rsp[8] << 8 | rsp[7]))
		_DCMI_min+=("$min")
		_DCMI_max+=("$max")
		_DCMI_avg+=("$avg")
		power_readings["DCMI_MIN"]="_DCMI_min[@]"
		power_readings["DCMI_MAX"]="_DCMI_max[@]"
		power_readings["DCMI_AVG"]="_DCMI_avg[@]"
	fi
	_DCMI+=("$reading")
	power_readings["DCMI"]="_DCMI[@]"

	# min, max and avg are left unset in the "NOW" mode and hence are skipped
	for print in min max avg reading; do
		[[ -n ${!print} ]] || continue
		printf '(%s) DCMI %s (mode: %s): %u Watts (interval: %ss)\n' \
			"$(utc "$ts")" \
			"$print" \
			"$mode" \
			"${!print}" \
			"$interval" >&2
	done
}
|
|
|
|
get_sdr_now_reading() {
	# Read the current value of every sensor listed in the global power_sensors
	# array and log it to stderr. Valid readings are appended to the global
	# _sensor<idx>_readings array which is registered in power_readings under
	# the sensor's name.
	#
	# Returns 1 if there are no sensors to read from. A sensor which cannot be
	# read, or which returns something that is not a number, is reported as
	# "error" and its value is not recorded.
	local sensor reading ts unit
	local numeric='^[+-]?([0-9]+([.][0-9]*)?|[.][0-9]+)$'

	if ((${#power_sensors[@]} == 0)); then
		printf 'No power sensors were provided\n' >&2
		return 1
	fi

	for sensor in "${!power_sensors[@]}"; do
		ts=$(utc)
		reading="" unit=""
		# CSV columns used: 2 - reading, 3 - unit. ipmitool's own complaints are
		# silenced, failures are reported below.
		if IFS="," read -r _ reading unit _ \
			< <("$ipmitool" -c -S "$sdr_cache" sdr get "${power_sensors[sensor]}" 2> /dev/null) \
			&& [[ $reading =~ $numeric ]]; then
			# The value comes from an external tool - store it through a nameref
			# rather than eval so it is never interpreted by the shell.
			local -n sensor_readings=_sensor${sensor}_readings
			sensor_readings+=("$reading")
			power_readings["${power_sensors[sensor]}"]="_sensor${sensor}_readings[@]"
			reading+=" $unit"
		else
			reading=error
		fi
		printf '(%s) Sensor %s reading: %s (interval %ss)\n' \
			"$ts" \
			"${power_sensors[sensor]}" \
			"$reading" \
			"$interval" >&2
	done
}
|
|
|
|
rapl_supported() {
	# Succeeds when the intel-rapl powercap node is present in sysfs.
	local rapl_node=/sys/class/powercap/intel-rapl

	test -e "$rapl_node"
}
|
|
|
|
get_cpu_socket_reading() {
	# Sample the energy counter of every intel-rapl node and convert the delta
	# between the two most recent samples to Watts. Readings are logged to
	# stderr, appended to the global _socket<idx>_readings array and registered
	# in power_readings under the "<name>-<idx>" key. The first call for a
	# given node only records the initial counter.
	#
	# $1 - powercap directory to scan (optional, defaults to /sys/class/powercap)
	local rapl=${1:-/sys/class/powercap}
	local socket socket_idx _socket_idx socket_name
	local ts reading

	# power_uw is usually not available so we need to rely on energy_uj. It's also rarely
	# rw so we can't zero it out, hence we need to keep track of the initial counter. For
	# details see kernel documentation (powercap.rst).
	ts=$(utc)
	for socket in "$rapl"/intel-rapl:*; do
		[[ -e $socket ]] || continue

		socket_idx=${socket##*/intel-rapl:} socket_name=$(< "$socket/name")
		# Adjust for different domains, see linux/intel_rapl.h
		case "$socket_name" in
			dram | core | uncore) _socket_idx=${socket_idx//:/_} socket_idx=${socket_idx%:*} ;;
			package-*) _socket_idx=$socket_idx socket_name=socket ;;
			psys*) _socket_idx=$socket_idx socket_name=platform ;;
			# Unknown domain - still give it its own counter instead of reusing the
			# index left over from the previous node.
			*) _socket_idx=${socket_idx//:/_} ;;
		esac

		local -n socket_uj=socket_${_socket_idx}_uj
		socket_uj+=("$(< "$socket/energy_uj")")
		# We need at least two readings for comparison
		((${#socket_uj[@]} > 1)) || continue

		# Convert to Watts - use bc since $interval can be an actual float
		reading=$(calc "(${socket_uj[-1]} - ${socket_uj[-2]}) / 1000000 / $interval")
		# bc returns a decimal fraction which shell arithmetic cannot compare,
		# so the sign is checked as text. Empty output means bc failed.
		if [[ -z $reading || $reading == -* ]]; then
			# Somehow this may happen, probably when the counter wraps over. Consider
			# this as a faulty reading and don't include it since it may impact overall
			# avg. Remaining nodes still need to be sampled to keep their counters
			# in step with the interval.
			printf '(%s) CPU %s %s reading: error(%s) (interval: %ss)\n' \
				"$ts" \
				"$socket_name" \
				"$socket_idx" \
				"$reading" \
				"$interval" >&2
			continue
		fi

		local -n socket_readings=_socket${_socket_idx}_readings
		socket_readings+=("$reading")
		power_readings["$socket_name-$socket_idx"]="_socket${_socket_idx}_readings[@]"

		printf '(%s) CPU %s %s reading: %s Watts (interval: %ss)\n' \
			"$ts" \
			"$socket_name" \
			"$socket_idx" \
			"$reading" \
			"$interval" >&2
	done
}
|
|
|
|
get_now_reading() {
	# Take a reading through whichever interface power_support settled on.
	# Nothing happens if no BMC interface is in use.
	if [[ $support == dcmi ]]; then
		get_dcmi_now_reading
	elif [[ $support == sdr ]]; then
		get_sdr_now_reading
	fi
}
|
|
|
|
dump_readings() {
	# For every sensor registered in power_readings write two files into
	# $output_dir: avg_<sensor> with the average of all collected readings and
	# all_<sensor> with each reading followed by their total count. File names
	# get the optional $prefix and whitespace in sensor names is replaced
	# with "_". Returns 1 when there is nothing to dump.
	local sensor ref sample samples avg sum avg_file all_file

	if ((${#power_readings[@]} == 0)); then
		return 1
	fi
	printf 'Dumping average sensors reading from %s\n' "${!power_readings[*]}" >&2

	for sensor in "${!power_readings[@]}"; do
		# Each entry holds "<array name>[@]" - expand it indirectly
		ref=${power_readings["$sensor"]}
		samples=("${!ref}")
		if ((${#samples[@]} == 0)); then
			printf 'No readings available for %s sensor\n' "$sensor" >&2
			continue
		fi

		sum=0
		for sample in "${samples[@]}"; do
			sum=$(calc "$sum + $sample")
		done
		avg=$(calc "$sum / ${#samples[@]}")

		samples+=("Total: ${#samples[@]}")
		sensor="${sensor//[[:space:]]/_}"
		avg_file="$output_dir/${prefix:+${prefix}_}avg_${sensor}.bmc.pm.txt"
		all_file="$output_dir/${prefix:+${prefix}_}all_${sensor}.bmc.pm.txt"

		printf '%s\n' "$avg" > "$avg_file"
		printf '%s\n' "${samples[@]}" > "$all_file"
		printf 'Dumped avg to %s\n' "$avg_file" >&2
		printf 'Dumped all to %s\n' "$all_file" >&2
	done
}
|
|
|
|
utc() {
	# Print the date in UTC - either the current one or, when $1 is given, the
	# one it points at as the number of seconds since the Epoch.
	if [[ -n $1 ]]; then
		date --utc "-d@$1"
	else
		date --utc
	fi
}
|
|
|
|
cleanup() {
	# Exit handler: drop the SDR cache (unless asked to keep it) and dump
	# whatever readings were collected so far.
	if [[ $remove_sdr_cache == yes && -f $sdr_cache ]]; then
		rm "$sdr_cache"
	fi
	dump_readings
}
|
|
|
|
collect_readings() {
	# Main loop: take a reading from the BMC (and from the CPU sockets when
	# enabled), then sleep for $interval. Runs $count times, or forever when
	# $count is not positive.
	local remaining=$count

	# CPU readings are deltas between two samples, so a single pass would not
	# produce any meaningful data.
	if ((remaining == 1 && cpu_support)); then
		((remaining += 1))
	fi

	until ((count > 0 && remaining-- == 0)); do
		get_now_reading
		((cpu_support)) && get_cpu_socket_reading
		sleep "${interval}s"
	done
}
|
|
|
|
help() {
	# Print usage to stdout. The heredoc body and its terminator are kept at
	# column 0 on purpose so the text doesn't depend on tab indentation.
	cat << HELP

Usage: $0 [-h] [-d dir] [-i sdr|dcmi] [-s SENSOR_NAME] [-t interval] [-x] [-l] [-p prefix] [-c count] [-r]

  -h - Print this message.
  -d - Directory where the results should be saved. Default is /tmp.
  -i - Type of interface to use for requesting power usage. "sdr" or "dcmi".
       If not set, available interface is used ("dcmi" has priority).
  -t - How long to wait before each get power command in seconds. In case
       this value matches one of supported averaging time periods special
       variant of the command will be used to obtain the reading - this
       variant is used only with the "dcmi" interface. Default is 1s.
  -s - In case "sdr" interface is in use, try to read data from SENSOR_NAME.
  -x - In case "sdr" interface is in use, don't remove SDR cache. This can
       speed up subsequent runs of the script.
  -l - Save output of the script to a log file (dir/[prefix_]${0##*/}.bmc.pm.log).
  -p - Add prefix to saved files.
  -c - Read power usage count times. 0 is the default and it means to run
       indefinitely.
  -r - Include readings from CPU sockets (RAPL-dependent)

When started, ${0##*/} will enter loop to continuously read power usage from either
DCMI interface or dedicated Watts sensors every interval. Each reading will be
logged to stderr. Upon termination, average power usage will be dumped to /tmp or
directory set by -d.

HELP
}
|
|
|
|
# Nothing below works without root - check it before anything else.
is_root

# Defaults, each can be changed from the command line (see help)
count=0
include_cpu=0
interval=1
log_to_file=no
output_dir=/tmp
prefix=""
remove_sdr_cache=yes

# Sensors requested with -s and the registry of collected readings (sensor
# name -> "<array name>[@]" holding its readings)
declare -a extra_power_sensors=()
declare -A power_readings=()

# Unknown options and missing arguments are silently ignored
while getopts :hi:s:d:t:xlp:c:r arg; do
	case "$arg" in
		c) count=$OPTARG ;;
		d) output_dir=$OPTARG ;;
		h)
			help
			exit 0
			;;
		i) interface=${OPTARG,,} ;;
		l) log_to_file=yes ;;
		p) prefix=$OPTARG ;;
		r) include_cpu=1 ;;
		s) extra_power_sensors+=("$OPTARG") ;;
		t) interval=$OPTARG ;;
		x) remove_sdr_cache=no ;;
		*) ;;
	esac
done

readonly sdr_cache=$output_dir/sdr.cache
readonly log_file=${prefix:+${prefix}_}${0##*/}.bmc.pm.log

mkdir -p "$output_dir"

# From this point on everything, including the readings, goes to the log file
if [[ $log_to_file == yes ]]; then
	printf 'Redirecting to %s\n' "$output_dir/$log_file" >&2
	exec > "$output_dir/$log_file" 2>&1
fi

# Make sure collected readings are dumped regardless of how the script ends
trap cleanup EXIT

ipmi_supported
power_support

collect_readings
|