#!/usr/bin/env bash
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2022 Intel Corporation
# All rights reserved.
#
# Periodically reads power usage from the local BMC - either through DCMI
# (Get Power Reading) or through Watts sensors described by the SDR - and,
# optionally, from the CPU RAPL energy counters. Every reading is logged to
# stderr; on exit the per-sensor averages and the raw readings are dumped to
# files under the output directory. See help() for the supported options.

set -e

# Print each argument as a 0x-prefixed, at least two digit hex number, one per line.
hex() { printf '0x%02x\n' "$@"; }

# Evaluate the given expression with bc, keeping two fractional digits.
calc() { bc <<< "scale=2; $*"; }

# Return 1 (with a message on stderr) when not running with UID 0.
is_root() {
	# Talking to local BMC device requires root privileges
	if ((UID)); then
		printf '%s, you need to be root to run this script\n' "$USER" >&2
		return 1
	fi
}

# Print the path of ipmitool to stdout, or return 1 if it is not in PATH.
is_ipmitool() {
	if ! type -P ipmitool; then
		printf 'ipmitool not detected, cannot run commands against the BMC\n' >&2
		return 1
	fi
}

# Best-effort load of the ipmi kernel modules; never fails.
ipmi_load() {
	# Silently attempt to load core ipmi drivers - we will pick up the device later on.
	modprobe -qa ipmi_si ipmi_devintf ipmi_msghandler || return 0
}

# Check that /sys/class/ipmi/ipmi0 exists and is a KCS interface, report the
# BMC details on stderr and resolve the ipmitool binary.
# Globals written: man_id prod_id dev_id ipmi_ver platform board ipmitool
# Returns non-zero if no supported BMC interface or no ipmitool was found.
ipmi_supported() {
	# Verify if kernel detected and registered at least one BMC under
	# the ipmi platform. Look for KCS specifically as this the type
	# of the interface the script was tested against.

	local ipmi=/sys/class/ipmi/ipmi0 iface_type

	# Keep these details global for easy access if needed.
	local -g man_id prod_id dev_id ipmi_ver platform board ipmitool

	ipmi_load

	if [[ ! -e $ipmi ]]; then
		printf 'BMC not detected. Please, make sure your platform is IPMI-compatible\n'
		return 1
	fi >&2

	iface_type=$(< "$ipmi/device/type")

	if [[ $iface_type != kcs ]]; then
		printf 'No supported BMC interface detected (%s) - only KCS is supported\n' "$iface_type"
		return 1
	fi >&2

	man_id=$(< "$ipmi/device/bmc/manufacturer_id")
	prod_id=$(< "$ipmi/device/bmc/product_id")
	dev_id=$(hex "$(< "$ipmi/device/bmc/device_id")")
	ipmi_ver=$(< "$ipmi/device/bmc/ipmi_version")

	if [[ -e /sys/class/dmi/id/board_vendor ]]; then
		platform=$(< /sys/class/dmi/id/board_vendor)
	fi

	if [[ -e /sys/class/dmi/id/board_name ]]; then
		board=$(< /sys/class/dmi/id/board_name)
	fi

	# Keep output similar to ipmi_si's. The here-document is deliberately kept at
	# column 0 so it does not depend on tab indentation surviving edits.
	cat << BMC_DEV >&2

BMC detected, details below:
Manufacturer ID: $man_id
Product ID: $prod_id
Device ID: $dev_id
IPMI Version: $ipmi_ver
Platform: ${platform:-unknown}
Board: ${board:-unknown}

BMC_DEV

	# Verify if we have proper tools to work with
	ipmitool=$(is_ipmitool)
}

# Send a raw IPMI request ("$@" is passed to `ipmitool raw`) and print every
# byte of the response as 0xNN, one per line.
# Returns 1 if ipmitool fails; prints nothing if the response carries no data.
ipmiraw() {
	# For the majority of commands we use raw payload to not depend on specific ipmitool version
	# and the way how it interprets/parses the returned data. This also allows us to inspect the
	# integrity of data more closely to make sure we don't report nonsensical values to the user.

	local out
	local -a rsp=()

	# Capture into a scalar first so the exit status of ipmitool is what we test.
	out=$("$ipmitool" raw "$@" 2> /dev/null) || return 1

	# Split on any whitespace (the response may span several lines). read returns
	# non-zero here since the input has no NUL delimiter - that's expected.
	read -r -d '' -a rsp <<< "$out" || :

	# Nothing to convert - don't let hex() make up a 0x00 byte out of no input.
	((${#rsp[@]} > 0)) || return 0

	# Slap hex prefix to work with proper base
	rsp=("${rsp[@]/#/0x}")

	hex "${rsp[@]}"
}

# Send a DCMI request: $1 is the command, remaining arguments are its data bytes.
dcmiraw() {
	local cmd=$1 data=("${@:2}")

	# 0x2c is the netfn and 0xdc the group extension byte put in front of the data.
	ipmiraw 0x2c "$cmd" 0xdc "${data[@]}"
}

# Decode the averaging time periods held in enhanced_power_attr and list them on stderr.
# Globals read:    enhanced_power_attr
# Globals written: available_time_periods (labels),
#                  available_time_periods_in_seconds (seconds -> raw period byte)
print_dcmi_available_time_periods() {
	local time_periods=${enhanced_power_attr[4]}
	local -g available_time_periods=()
	local -g available_time_periods_in_seconds=()

	available_time_periods[0]="NOW"

	if ((time_periods > 0)); then
		local time_idx=5
		local offset=$time_idx
		local unit time time_s units_mask=0xc0
		# Indexed by the two top bits of each period byte.
		local -a units=(seconds minutes hours days)
		local -a to_sec=(1 60 3600 86400)

		while ((offset < time_idx + time_periods)); do
			time=$((enhanced_power_attr[offset] & ~units_mask))
			unit=${units[enhanced_power_attr[offset] >> 6]:-unknown}
			time_s=$((time * to_sec[enhanced_power_attr[offset] >> 6]))
			if ((time != 0)); then
				available_time_periods[offset]="$time $unit"
				available_time_periods_in_seconds[time_s]=${enhanced_power_attr[offset]}
			fi
			((++offset))
		done
	fi

	cat << TIME_PERIODS >&2

Available averaging time periods to request:
$(printf ' - %s\n' "${available_time_periods[@]}")

TIME_PERIODS
}

# Check if the BMC reports the DCMI Power Management capability and, if
# available, fetch the Enhanced System Power Statistics attributes.
# Globals written: enhanced_power_attr (and whatever
#                  print_dcmi_available_time_periods sets)
# Returns 1 if the capability cannot be determined or is not provided.
dcmi_power_support() {
	# Verify if the BMC conforms to the DCMI spec
	local rsp

	# Table 6-2, Get DCMI Capabilities Command Format
	if ! rsp=($(dcmiraw 0x1 0x1)); then
		printf 'Cannot determine if BMC supports DCMI Power Management capability\n' >&2
		return 1
	fi

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Supported DCMI Capabilities:
	#    - Byte 2 Platform capabilities: [0] Power management
	if ((!(rsp[5] & (1 << 0)))); then
		printf 'BMC does not provide DCMI Power Management capability\n' >&2
		return 1
	fi

	# Check if BMC provides Enhanced System Power Statistics attributes - this allows to issue
	# requests for power readings at averaging time period, .e.g. from last 5 seconds, 30 minutes,
	# 1 hour and so on. With this we can provide more detailed view on power usage within a
	# specific period of time. Without it, we need to depend only on current reading that should
	# be always available (the "NOW" reading).
	local -g enhanced_power_attr=()

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Enhanced System Power Statistics attributes
	if enhanced_power_attr=($(dcmiraw 0x1 0x5)); then
		print_dcmi_available_time_periods
	fi

	printf 'Using DCMI Power Management\n' >&2
}

# Collect the names of the SDR sensors to read power usage from.
# Globals read:    sdr_cache ipmitool extra_power_sensors
# Globals written: power_sensors
# Returns 1 if no power sensor could be located.
sdr_power_support() {
	# This is a fallback which only some platforms may provide (confirmed PowerEdge and CYP).
	# We are looking for a full, threshold sensor which reports overall power usage in Watts.
	# Different BMCs may have SDRs which describe such sensor(s) differently so this is not
	# 100% reliable. To make sure we pick up a proper sensor we also narrow it down to a
	# specific entity (System Board or Power Supply). Readings from the sensor should be
	# considered as "NOW" readings (without access to min, max readings).

	local -g power_sensors=()
	local sensor entity unit status

	# Cache SDR to speed up sensor readings
	if [[ ! -f $sdr_cache ]]; then
		printf 'Saving SDR cache at %s\n' "$sdr_cache" >&2
		"$ipmitool" sdr dump "$sdr_cache" > /dev/null
	fi

	# Sensors requested explicitly by the user go first.
	if ((${#extra_power_sensors[@]} > 0)); then
		power_sensors+=("${extra_power_sensors[@]}")
	fi

	while IFS="," read -r sensor _ unit status _ entity _; do
		[[ $unit == Watts && $status == ok ]] || continue
		[[ $entity == "System Board" || $entity == "Power Supply" ]] || continue
		power_sensors+=("$sensor")
	done < <("$ipmitool" -S "$sdr_cache" -vc sdr list full 2>&1)

	if ((${#power_sensors[@]} > 0)); then
		printf 'Using SDR (Power sensors: %s)\n' "${power_sensors[*]}"
	else
		printf 'Cannot locate power sensors\n'
		return 1
	fi >&2
}

# Select the source of the system-wide power readings.
# Globals read:    include_cpu interface
# Globals written: support (dcmi|sdr|unset), cpu_support (0|1)
# Returns 1 if neither the BMC nor the CPU can provide any reading.
power_support() {
	local -g support cpu_support=0

	if ((include_cpu == 1)) && rapl_supported; then
		cpu_support=1
	fi

	if [[ $interface == dcmi || $interface == sdr ]]; then
		# override
		"${interface}_power_support"
		support=$interface
	elif dcmi_power_support; then
		support=dcmi
	elif sdr_power_support; then
		support=sdr
	else
		printf 'BMC does not provide Power Management support, cannot gather system-wide power measurements\n' >&2
		if ((cpu_support)); then
			printf 'Only CPU measurements will be provided\n' >&2
			return 0
		fi
		return 1
	fi
}

# Issue a DCMI Get Power Reading request, log the result on stderr and record it.
# Globals read:    interval available_time_periods_in_seconds
# Globals written: _DCMI, _DCMI_min, _DCMI_max, _DCMI_avg, power_readings
get_dcmi_now_reading() {
	local rsp reading=0 max min avg ts timeframe mode=01h
	local get_cmd get_avg=0 print

	# Table 6-16, Get Power Reading Command:
	get_cmd=(0x2 0x1 0x0 0x0)

	if [[ $interval =~ ^[0-9]+$ && -n ${available_time_periods_in_seconds[interval]} ]]; then
		get_cmd=(0x2 0x2 "${available_time_periods_in_seconds[interval]}" 0x0)
		get_avg=1
		mode=02h
	fi

	# We use System Power Statistics mode to get the "NOW" reading by default. In case
	# interval matches one supported by Enhanced System Power Statistics we use that
	# mode to obtain extra min, max, avg statistics.

	if ! rsp=($(dcmiraw "${get_cmd[@]}")); then
		printf 'DCMI reading: error\n'
	else
		# Note that the BMC timestamp depends on the hwclock setup which we then attempt
		# to represent in UTC.
		ts=$((rsp[12] << 24 | rsp[11] << 16 | rsp[10] << 8 | rsp[9]))
		# This is interpreted differently by different BMCs so for now we make a note of
		# it but don't present it to the user.
		timeframe=$((rsp[16] << 24 | rsp[15] << 16 | rsp[14] << 8 | rsp[13]))
		reading=$((rsp[2] << 8 | rsp[1]))
		if ((get_avg == 1)); then
			min=$((rsp[4] << 8 | rsp[3]))
			max=$((rsp[6] << 8 | rsp[5]))
			avg=$((rsp[8] << 8 | rsp[7]))
			_DCMI_min+=("$min")
			_DCMI_max+=("$max")
			_DCMI_avg+=("$avg")
			power_readings["DCMI_MIN"]="_DCMI_min[@]"
			power_readings["DCMI_MAX"]="_DCMI_max[@]"
			power_readings["DCMI_AVG"]="_DCMI_avg[@]"
		fi
		_DCMI+=("$reading")
		power_readings["DCMI"]="_DCMI[@]"

		# min, max and avg are only set in the averaging mode - skip them otherwise.
		for print in min max avg reading; do
			[[ -n ${!print} ]] || continue
			printf '(%s) DCMI %s (mode: %s): %u Watts (interval: %ss)\n' \
				"$(utc "$ts")" \
				"$print" \
				"$mode" \
				"${!print}" \
				"$interval"
		done
	fi >&2
}

# Read every sensor listed in power_sensors, log the result on stderr and record it.
# Globals read:    power_sensors sdr_cache ipmitool interval
# Globals written: _sensorN_readings (N - index into power_sensors), power_readings
# Returns 1 if power_sensors is empty.
get_sdr_now_reading() {
	local sensor reading=0 ts unit

	if ((${#power_sensors[@]} == 0)); then
		printf 'No power sensors were provided\n' >&2
		return 1
	fi

	for sensor in "${!power_sensors[@]}"; do
		ts=$(utc)
		# Readings must outlive this call, hence the per-sensor global array accessed
		# through a nameref.
		local -n sensor_readings=_sensor${sensor}_readings
		if ! IFS="," read -r _ reading unit _; then
			reading=error
		elif [[ ! $reading =~ ^-?[0-9]*\.?[0-9]+$ ]]; then
			# Not a number - it can't be averaged later on, so don't record it.
			reading=error
		else
			sensor_readings+=("$reading")
			power_readings["${power_sensors[sensor]}"]="_sensor${sensor}_readings[@]"
			reading+=" $unit"
		fi < <("$ipmitool" -c -S "$sdr_cache" sdr get "${power_sensors[sensor]}") 2> /dev/null
		printf '(%s) Sensor %s reading: %s (interval %ss)\n' \
			"$ts" \
			"${power_sensors[sensor]}" \
			"$reading" \
			"$interval" >&2
	done
}

# Succeed if the intel-rapl powercap control type is present in sysfs.
rapl_supported() {
	[[ -e /sys/class/powercap/intel-rapl ]]
}

# Sample energy_uj of every intel-rapl zone and, starting from the second
# sample, log and record the power used within the last interval.
# Globals read:    interval
# Globals written: socket_<idx>_uj (raw counters), _socket<idx>_readings, power_readings
get_cpu_socket_reading() {
	local socket socket_idx _socket_idx socket_name
	local ts reading

	# power_uw is usually not available so we need to relay on energy_uj. It's also rarely
	# rw so we can't zero it out, hence we need to keep track of the initial counter. For
	# details see kernel documentation (powercap.rst).
	ts=$(utc)
	for socket in /sys/class/powercap/intel-rapl:*; do
		[[ -e $socket ]] || continue

		socket_idx=${socket#*:}
		socket_name=$(< "$socket/name")
		# Adjust for different domains, see linux/intel_rapl.h
		case "$socket_name" in
			dram | core | uncore)
				_socket_idx=${socket_idx//:/_}
				socket_idx=${socket_idx%:*}
				;;
			package-*)
				_socket_idx=$socket_idx
				socket_name=socket
				;;
			psys*)
				_socket_idx=$socket_idx
				socket_name=platform
				;;
			*)
				# Unknown domain - still derive a valid variable suffix so we don't
				# reuse the one left over from the previous zone.
				_socket_idx=${socket_idx//:/_}
				;;
		esac

		local -n socket_uj=socket_${_socket_idx}_uj
		socket_uj+=("$(< "$socket/energy_uj")")
		# We need at least two readings for comparison
		((${#socket_uj[@]} > 1)) || continue

		# Convert to Watts - use bc since $interval can be an actual float
		reading=$(calc "(${socket_uj[-1]} - ${socket_uj[-2]}) / 1000000 / $interval")
		# bc returns a fraction (e.g. -.45) which shell arithmetic can't parse, so
		# check the sign on the string itself.
		if [[ $reading == -* ]]; then
			# Somehow this may happen, probably when the counter wraps over. Consider
			# this as a faulty reading and don't include it since it may impact overall
			# avg. Keep going so the remaining zones are still sampled this round.
			printf '(%s) CPU %s %s reading: error(%s) (interval: %ss)\n' \
				"$ts" \
				"$socket_name" \
				"$socket_idx" \
				"$reading" \
				"$interval" >&2
			continue
		fi

		local -n socket_readings=_socket${_socket_idx}_readings
		socket_readings+=("$reading")
		power_readings["$socket_name-$socket_idx"]="_socket${_socket_idx}_readings[@]"

		printf '(%s) CPU %s %s reading: %s Watts (interval: %ss)\n' \
			"$ts" \
			"$socket_name" \
			"$socket_idx" \
			"$reading" \
			"$interval" >&2
	done
}

# Take a single reading through the interface selected by power_support().
get_now_reading() {
	case "$support" in
		dcmi) get_dcmi_now_reading ;;
		sdr) get_sdr_now_reading ;;
		*) ;;
	esac
}

# Write the average (avg_*.bmc.pm.txt) and all the readings (all_*.bmc.pm.txt)
# of every sensor recorded in power_readings to output_dir.
# Globals read: power_readings output_dir prefix
# Returns 1 if nothing was recorded.
dump_readings() {
	local sensor reading readings avg total

	((${#power_readings[@]} > 0)) || return 1
	printf 'Dumping average sensors reading from %s\n' "${!power_readings[*]}" >&2

	for sensor in "${!power_readings[@]}"; do
		# power_readings maps a sensor to "<array name>[@]" - expand it indirectly.
		readings=("${!power_readings["$sensor"]}")
		if ((${#readings[@]} == 0)); then
			printf 'No readings available for %s sensor\n' "$sensor" >&2
			continue
		fi
		total=0
		for reading in "${readings[@]}"; do
			total=$(calc "$total + $reading")
		done
		avg=$(calc "$total / ${#readings[@]}")

		readings+=("Total: ${#readings[@]}")
		# Sensor names may include whitespace - keep the file names free of it.
		sensor="${sensor//[[:space:]]/_}"
		printf '%s\n' "$avg" > "$output_dir/${prefix:+${prefix}_}avg_${sensor}.bmc.pm.txt"
		printf '%s\n' "${readings[@]}" > "$output_dir/${prefix:+${prefix}_}all_${sensor}.bmc.pm.txt"
		printf 'Dumped avg to %s\n' "$output_dir/${prefix:+${prefix}_}avg_${sensor}.bmc.pm.txt" >&2
		printf 'Dumped all to %s\n' "$output_dir/${prefix:+${prefix}_}all_${sensor}.bmc.pm.txt" >&2
	done
}

# Print the current date in UTC or, if $1 is given, the date of that epoch timestamp.
utc() {
	date --utc ${1:+-"d@$1"}
}

# EXIT handler: drop the SDR cache (unless asked to keep it) and dump the readings.
cleanup() {
	if [[ -f $sdr_cache && $remove_sdr_cache == yes ]]; then
		rm "$sdr_cache"
	fi
	dump_readings
}

# Main loop: take a reading every $interval seconds, $count times (forever if count <= 0).
collect_readings() {
	local _count=$count

	if ((_count == 1 && cpu_support)); then
		# We need at least two readings to get a meaningful data
		((_count += 1))
	fi
	while ((count <= 0 ? 1 : _count--)); do
		get_now_reading
		if ((cpu_support)); then
			get_cpu_socket_reading
		fi
		sleep "${interval}s"
	done
}

# Print usage to stdout.
help() {
	cat << HELP

Usage: $0 [-h] [-d dir] [-i sdr|dcmi] [-s SENSOR_NAME] [-t interval] [-x] [-l] [-p prefix] [-c count] [-r]

  -h - Print this message.
  -d - Directory where the results should be saved. Default is /tmp.
  -i - Type of interface to use for requesting power usage. "sdr" or "dcmi".
       If not set, available interface is used ("dcmi" has priority).
  -t - How long to wait before each get power command in seconds. In case
       this value matches one of supported averaging time periods special
       variant of the command will be used to obtain the reading - this
       variant is used only with the "dcmi" interface. Default is 1s.
  -s - In case "sdr" interface is in use, try to read data from SENSOR_NAME.
  -x - In case "sdr" interface is in use, don't remove SDR cache. This can
       speed up subsequent runs of the script.
  -l - Save output of the script to a log file (dir/${0##*/}.bmc.pm.log).
  -p - Add prefix to saved files.
  -c - Read power usage count times. 0 is the default and it means to run
       indefinitely.
  -r - Include readings from CPU sockets (RAPL-dependent)

When started, ${0##*/} will enter loop to continuously read power usage from either
DCMI interface or dedicated Watts sensors every interval. Each reading will be
logged to stderr. Upon termination, average power usage will be dumped to /tmp or
directory set by -d.

HELP
}

output_dir=/tmp
interval=1
remove_sdr_cache=yes
log_to_file=no
prefix=""
count=0
include_cpu=0
interface=""

declare -A power_readings=()
declare -a extra_power_sensors=()

while getopts :hi:s:d:t:xlp:c:r arg; do
	case "$arg" in
		h)
			help
			exit 0
			;;
		d) output_dir=$OPTARG ;;
		s) extra_power_sensors+=("$OPTARG") ;;
		i) interface=${OPTARG,,} ;;
		t) interval=$OPTARG ;;
		x) remove_sdr_cache=no ;;
		l) log_to_file=yes ;;
		p) prefix=$OPTARG ;;
		c) count=$OPTARG ;;
		r) include_cpu=1 ;;
		*) ;;
	esac
done

# Checked only after the options are parsed so that -h works without root.
is_root

declare -r sdr_cache=$output_dir/sdr.cache
declare -r log_file=${prefix:+${prefix}_}${0##*/}.bmc.pm.log

mkdir -p "$output_dir"
if [[ $log_to_file == yes ]]; then
	printf 'Redirecting to %s\n' "$output_dir/$log_file" >&2
	exec > "$output_dir/$log_file" 2>&1
fi

trap 'cleanup' EXIT

ipmi_supported
power_support

collect_readings