/* SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2021 Intel Corporation.
 *   All rights reserved.
 */

#include "spdk/stdinc.h"
#include "spdk/likely.h"
#include "spdk/event.h"
#include "spdk/log.h"
#include "spdk/env.h"

#include "spdk/thread.h"
#include "spdk_internal/event.h"
#include "spdk/scheduler.h"
#include "spdk_internal/usdt.h"

static uint32_t g_main_lcore;

struct core_stats {
	uint64_t busy;
	uint64_t idle;
	uint32_t thread_count;
};

static struct core_stats *g_cores;

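/* Balancing thresholds, expressed as integer percentages of a scheduling period.
 * A thread below g_scheduler_load_limit is treated as idle, a core at or above
 * g_scheduler_core_limit is considered full, and g_scheduler_core_busy marks a core
 * busy enough that its remaining load is clamped when a thread is moved off of it.
 * All three can be overridden through the scheduler JSON options (see set_opts()). */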
uint8_t g_scheduler_load_limit = 20;
uint8_t g_scheduler_core_limit = 80;
uint8_t g_scheduler_core_busy = 95;

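/* Return busy time as an integer percentage of total (busy + idle) time,
 * or 0 when no time has elapsed. */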
static uint8_t
_busy_pct(uint64_t busy, uint64_t idle)
{
	if ((busy + idle) == 0) {
		return 0;
	}

	return busy * 100 / (busy + idle);
}

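/* Return the percentage of the last scheduling period that the thread spent busy. */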
static uint8_t
_get_thread_load(struct spdk_scheduler_thread_info *thread_info)
{
	uint64_t busy, idle;

	busy = thread_info->current_stats.busy_tsc;
	idle = thread_info->current_stats.idle_tsc;

	/* return percentage of time thread was busy */
	return _busy_pct(busy, idle);
}

typedef void (*_foreach_fn)(struct spdk_scheduler_thread_info *thread_info);

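/* Invoke fn on every thread of every core described in cores_info. */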
static void
_foreach_thread(struct spdk_scheduler_core_info *cores_info, _foreach_fn fn)
{
	struct spdk_scheduler_core_info *core;
	uint32_t i, j;

	SPDK_ENV_FOREACH_CORE(i) {
		core = &cores_info[i];
		for (j = 0; j < core->threads_count; j++) {
			fn(&core->thread_infos[j]);
		}
	}
}

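/* Account for moving a thread to dst_core: credit its busy_tsc to the destination,
 * remove it from the source core's stats, update both thread counts and record the
 * new lcore in thread_info. If the source core was at or above g_scheduler_core_busy,
 * its remaining load is kept at g_scheduler_core_limit so that other threads are not
 * moved onto it during the same scheduling period. */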
static void
_move_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core)
{
	struct core_stats *dst = &g_cores[dst_core];
	struct core_stats *src = &g_cores[thread_info->lcore];
	uint64_t busy_tsc = thread_info->current_stats.busy_tsc;
	uint8_t busy_pct = _busy_pct(src->busy, src->idle);
	uint64_t tsc;

	SPDK_DTRACE_PROBE2(dynsched_move, thread_info, dst_core);

	if (src == dst) {
		/* Don't modify stats if thread is already on that core. */
		return;
	}

	dst->busy += spdk_min(UINT64_MAX - dst->busy, busy_tsc);
	dst->idle -= spdk_min(dst->idle, busy_tsc);
	dst->thread_count++;

	/* Adjust busy/idle from core as if thread was not present on it.
	 * Core load will reflect the sum of all remaining threads on it. */
	src->busy -= spdk_min(src->busy, busy_tsc);
	src->idle += spdk_min(UINT64_MAX - src->idle, busy_tsc);

	if (busy_pct >= g_scheduler_core_busy &&
	    _busy_pct(src->busy, src->idle) < g_scheduler_core_limit) {
		/* This core was so busy that we cannot assume all of busy_tsc
		 * consumed by the moved thread will now be idle_tsc - it's
		 * very possible the remaining threads will use these cycles
		 * as busy_tsc.
		 *
		 * So make sure we don't drop the updated estimate below
		 * g_scheduler_core_limit, so that other cores can't
		 * move threads to this core during this scheduling
		 * period.
		 */
		tsc = src->busy + src->idle;
		src->busy = tsc * g_scheduler_core_limit / 100;
		src->idle = tsc - src->busy;
	}
	assert(src->thread_count > 0);
	src->thread_count--;

	thread_info->lcore = dst_core;
}

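/* Return true when a core runs more than one thread and its busy percentage is at
 * or above g_scheduler_core_limit. */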
static bool
_is_core_at_limit(uint32_t core_id)
{
	struct core_stats *core = &g_cores[core_id];
	uint64_t busy, idle;

	/* Core with no or single thread cannot be over the limit. */
	if (core->thread_count <= 1) {
		return false;
	}

	busy = core->busy;
	idle = core->idle;

	/* No work was done, exit before possible division by 0. */
	if (busy == 0) {
		return false;
	}

	/* Work done was less than the limit */
	if (_busy_pct(busy, idle) < g_scheduler_core_limit) {
		return false;
	}

	return true;
}

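/* Check whether dst_core can absorb the thread: always true for the thread's current
 * core, for reactors in interrupt mode and for cores without threads; otherwise the
 * destination needs enough idle_tsc to take the thread's busy_tsc without crossing
 * g_scheduler_core_limit. */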
static bool
_can_core_fit_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core)
{
	struct core_stats *dst = &g_cores[dst_core];
	uint64_t new_busy_tsc, new_idle_tsc;

	/* Thread can always fit on the core it's currently on. */
	if (thread_info->lcore == dst_core) {
		return true;
	}

	/* Reactors in interrupt mode do not update stats, so a thread
	 * can always fit into a reactor in interrupt mode. */
	if (dst->busy + dst->idle == 0) {
		return true;
	}

	/* Core has no threads. */
	if (dst->thread_count == 0) {
		return true;
	}

	/* Core doesn't have enough idle_tsc to take this thread. */
	if (dst->idle < thread_info->current_stats.busy_tsc) {
		return false;
	}

	new_busy_tsc = dst->busy + thread_info->current_stats.busy_tsc;
	new_idle_tsc = dst->idle - thread_info->current_stats.busy_tsc;

	/* Core cannot fit this thread if it would put it over the
	 * g_scheduler_core_limit. */
	return _busy_pct(new_busy_tsc, new_idle_tsc) < g_scheduler_core_limit;
}

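/* Pick a destination core for an active thread. The main lcore is preferred, then a
 * lower-numbered core that can fit the thread; when the current core is over the
 * limit, any core that fits (or, failing that, the least busy one) is acceptable.
 * Falls back to the current core if nothing better is found. */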
static uint32_t
_find_optimal_core(struct spdk_scheduler_thread_info *thread_info)
{
	uint32_t i;
	uint32_t current_lcore = thread_info->lcore;
	uint32_t least_busy_lcore = thread_info->lcore;
	struct spdk_thread *thread;
	struct spdk_cpuset *cpumask;
	bool core_at_limit = _is_core_at_limit(current_lcore);

	thread = spdk_thread_get_by_id(thread_info->thread_id);
	if (thread == NULL) {
		return current_lcore;
	}
	cpumask = spdk_thread_get_cpumask(thread);

	/* Find a core that can fit the thread. */
	SPDK_ENV_FOREACH_CORE(i) {
		/* Ignore cores outside cpumask. */
		if (!spdk_cpuset_get_cpu(cpumask, i)) {
			continue;
		}

		/* Search for least busy core. */
		if (g_cores[i].busy < g_cores[least_busy_lcore].busy) {
			least_busy_lcore = i;
		}

		/* Skip cores that cannot fit the thread, as well as the current core. */
		if (!_can_core_fit_thread(thread_info, i) || i == current_lcore) {
			continue;
		}
		if (i == g_main_lcore) {
			/* First consider g_main_lcore, consolidate threads on main lcore if possible. */
			return i;
		} else if (i < current_lcore && current_lcore != g_main_lcore) {
			/* Lower core id was found, move to consolidate threads on lowest core ids. */
			return i;
		} else if (core_at_limit) {
			/* When core is over the limit, any core id is better than current one. */
			return i;
		}
	}

	/* For cores over the limit, place the thread on the least busy core
	 * to balance threads. */
	if (core_at_limit) {
		return least_busy_lcore;
	}

	/* If no better core is found, remain on the same one. */
	return current_lcore;
}

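/* Scheduler init: record the main lcore, try to enable the dpdk governor, allocate
 * the per-core stats array and default the scheduling period to one second if none
 * has been configured yet. */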
static int
init(void)
{
	g_main_lcore = spdk_env_get_current_core();

	if (spdk_governor_set("dpdk_governor") != 0) {
		SPDK_NOTICELOG("Unable to initialize dpdk governor\n");
	}

	g_cores = calloc(spdk_env_get_last_core() + 1, sizeof(struct core_stats));
	if (g_cores == NULL) {
		SPDK_ERRLOG("Failed to allocate memory for dynamic scheduler core stats.\n");
		return -ENOMEM;
	}

	if (spdk_scheduler_get_period() == 0) {
		/* set default scheduling period to one second */
		spdk_scheduler_set_period(SPDK_SEC_TO_USEC);
	}

	return 0;
}

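/* Release the per-core stats and detach the governor. */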
static void
deinit(void)
{
	free(g_cores);
	g_cores = NULL;
	spdk_governor_set(NULL);
}

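/* First balancing pass: move threads below g_scheduler_load_limit to the main core. */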
static void
_balance_idle(struct spdk_scheduler_thread_info *thread_info)
{
	if (_get_thread_load(thread_info) >= g_scheduler_load_limit) {
		return;
	}
	/* This thread is idle, move it to the main core. */
	_move_thread(thread_info, g_main_lcore);
}

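/* Second balancing pass: move threads at or above g_scheduler_load_limit to the core
 * chosen by _find_optimal_core(). */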
static void
_balance_active(struct spdk_scheduler_thread_info *thread_info)
{
	uint32_t target_lcore;

	if (_get_thread_load(thread_info) < g_scheduler_load_limit) {
		return;
	}

	/* This thread is active. */
	target_lcore = _find_optimal_core(thread_info);
	_move_thread(thread_info, target_lcore);
}

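/* Scheduler entry point, called once per scheduling period: snapshot per-core stats,
 * run the idle and active balancing passes, pick interrupt vs. polled mode for each
 * core and ask the governor to adjust the main core frequency. */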
static void
balance(struct spdk_scheduler_core_info *cores_info, uint32_t cores_count)
{
	struct spdk_reactor *reactor;
	struct spdk_governor *governor;
	struct spdk_scheduler_core_info *core;
	struct core_stats *main_core;
	uint32_t i;
	int rc;
	bool busy_threads_present = false;

	SPDK_DTRACE_PROBE1(dynsched_balance, cores_count);

	SPDK_ENV_FOREACH_CORE(i) {
		g_cores[i].thread_count = cores_info[i].threads_count;
		g_cores[i].busy = cores_info[i].current_busy_tsc;
		g_cores[i].idle = cores_info[i].current_idle_tsc;
		SPDK_DTRACE_PROBE2(dynsched_core_info, i, &cores_info[i]);
	}
	main_core = &g_cores[g_main_lcore];

	/* Distribute threads in two passes, to make sure updated core stats are considered on each pass.
	 * 1) Move all idle threads to main core. */
	_foreach_thread(cores_info, _balance_idle);
	/* 2) Distribute active threads across all cores. */
	_foreach_thread(cores_info, _balance_active);

	/* Switch unused cores to interrupt mode and switch cores to polled mode
	 * if they will be used after rebalancing. */
	SPDK_ENV_FOREACH_CORE(i) {
		reactor = spdk_reactor_get(i);
		core = &cores_info[i];
		/* We can switch mode only if the reactor does not already have any threads. */
		if (g_cores[i].thread_count == 0 && TAILQ_EMPTY(&reactor->threads)) {
			core->interrupt_mode = true;
		} else if (g_cores[i].thread_count != 0) {
			core->interrupt_mode = false;
			if (i != g_main_lcore) {
				/* If a thread is present on a core other than g_main_lcore,
				 * it has to be busy. */
				busy_threads_present = true;
			}
		}
	}

	governor = spdk_governor_get();
	if (governor == NULL) {
		return;
	}

	/* Change main core frequency if needed */
	if (busy_threads_present) {
		rc = governor->set_core_freq_max(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("setting maximum frequency for core %u failed\n", g_main_lcore);
		}
	} else if (main_core->busy > main_core->idle) {
		rc = governor->core_freq_up(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("increasing frequency for core %u failed\n", g_main_lcore);
		}
	} else {
		rc = governor->core_freq_down(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("lowering frequency for core %u failed\n", g_main_lcore);
		}
	}
}

struct json_scheduler_opts {
	uint8_t load_limit;
	uint8_t core_limit;
	uint8_t core_busy;
};

static const struct spdk_json_object_decoder sched_decoders[] = {
	{"load_limit", offsetof(struct json_scheduler_opts, load_limit), spdk_json_decode_uint8, true},
	{"core_limit", offsetof(struct json_scheduler_opts, core_limit), spdk_json_decode_uint8, true},
	{"core_busy", offsetof(struct json_scheduler_opts, core_busy), spdk_json_decode_uint8, true},
};

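/* Apply "load_limit", "core_limit" and "core_busy" from the scheduler options JSON.
 * Every field is optional; fields that are absent keep their current values. */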
static int
set_opts(const struct spdk_json_val *opts)
{
	struct json_scheduler_opts scheduler_opts;

	scheduler_opts.load_limit = g_scheduler_load_limit;
	scheduler_opts.core_limit = g_scheduler_core_limit;
	scheduler_opts.core_busy = g_scheduler_core_busy;

	if (opts != NULL) {
		if (spdk_json_decode_object_relaxed(opts, sched_decoders,
						    SPDK_COUNTOF(sched_decoders), &scheduler_opts)) {
			SPDK_ERRLOG("Decoding scheduler opts JSON failed\n");
			return -1;
		}
	}

	SPDK_NOTICELOG("Setting scheduler load limit to %d\n", scheduler_opts.load_limit);
	g_scheduler_load_limit = scheduler_opts.load_limit;
	SPDK_NOTICELOG("Setting scheduler core limit to %d\n", scheduler_opts.core_limit);
	g_scheduler_core_limit = scheduler_opts.core_limit;
	SPDK_NOTICELOG("Setting scheduler core busy to %d\n", scheduler_opts.core_busy);
	g_scheduler_core_busy = scheduler_opts.core_busy;

	return 0;
}

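/* Report the currently configured thresholds. */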
static void
get_opts(struct spdk_json_write_ctx *ctx)
{
	spdk_json_write_named_uint8(ctx, "load_limit", g_scheduler_load_limit);
	spdk_json_write_named_uint8(ctx, "core_limit", g_scheduler_core_limit);
	spdk_json_write_named_uint8(ctx, "core_busy", g_scheduler_core_busy);
}

static struct spdk_scheduler scheduler_dynamic = {
	.name = "dynamic",
	.init = init,
	.deinit = deinit,
	.balance = balance,
	.set_opts = set_opts,
	.get_opts = get_opts,
};

SPDK_SCHEDULER_REGISTER(scheduler_dynamic);