From 1b866473f81b887506e59499bb0e399682d78851 Mon Sep 17 00:00:00 2001 From: Piotr Gorski Date: Thu, 28 Aug 2025 14:36:55 +0200 Subject: [PATCH] bore-cachy Signed-off-by: Piotr Gorski --- include/linux/sched.h | 29 +++ include/linux/sched/bore.h | 39 ++++ init/Kconfig | 17 ++ kernel/Kconfig.hz | 17 ++ kernel/fork.c | 8 + kernel/futex/waitwake.c | 11 ++ kernel/sched/Makefile | 1 + kernel/sched/bore.c | 393 +++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 12 ++ kernel/sched/debug.c | 61 +++++- kernel/sched/fair.c | 128 +++++++++--- kernel/sched/features.h | 3 + kernel/sched/sched.h | 9 + 13 files changed, 701 insertions(+), 27 deletions(-) create mode 100644 include/linux/sched/bore.h create mode 100644 kernel/sched/bore.c diff --git a/include/linux/sched.h b/include/linux/sched.h index 0d1d70ade..2a0aba074 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -782,6 +782,32 @@ struct kmap_ctrl { #endif }; +#ifdef CONFIG_SCHED_BORE +#define BORE_BC_TIMESTAMP_SHIFT 16 + +struct bore_bc { + u64 timestamp: 48; + u64 penalty: 16; +}; + +struct bore_ctx { + struct bore_bc subtree; + struct bore_bc group; + u64 burst_time; + u16 prev_penalty; + u16 curr_penalty; + union { + u16 penalty; + struct { + u8 _; + u8 score; + }; + }; + bool stop_update; + bool futex_waiting; +}; +#endif /* CONFIG_SCHED_BORE */ + struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -842,6 +868,9 @@ struct task_struct { #ifdef CONFIG_SCHED_CLASS_EXT struct sched_ext_entity scx; #endif +#ifdef CONFIG_SCHED_BORE + struct bore_ctx bore; +#endif /* CONFIG_SCHED_BORE */ const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/bore.h b/include/linux/sched/bore.h new file mode 100644 index 000000000..5afb5fed4 --- /dev/null +++ b/include/linux/sched/bore.h @@ -0,0 +1,39 @@ +#ifndef _KERNEL_SCHED_BORE_H +#define _KERNEL_SCHED_BORE_H + +#include +#include +#include +#include +#include + +#define SCHED_BORE_AUTHOR "Masahito Suzuki" +#define SCHED_BORE_PROGNAME "BORE CPU Scheduler modification" + +#define SCHED_BORE_VERSION "6.5.2" + +extern u8 __read_mostly sched_bore; +extern u8 __read_mostly sched_burst_inherit_type; +extern u8 __read_mostly sched_burst_smoothness; +extern u8 __read_mostly sched_burst_penalty_offset; +extern uint __read_mostly sched_burst_penalty_scale; +extern uint __read_mostly sched_burst_cache_lifetime; + +extern u8 effective_prio_bore(struct task_struct *p); +extern void update_curr_bore(struct task_struct *p, u64 delta_exec); +extern void restart_burst_bore(struct task_struct *p); +extern void restart_burst_rescale_deadline_bore(struct task_struct *p); +extern void task_fork_bore(struct task_struct *p, struct task_struct *parent, + u64 clone_flags, u64 now); +extern void sched_init_bore(void); +extern void reset_task_bore(struct task_struct *p); + +extern int sched_bore_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sched_burst_inherit_type_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos); + +extern void reweight_entity( + struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight); + +#endif /* _KERNEL_SCHED_BORE_H */ diff --git a/init/Kconfig b/init/Kconfig index a3aaeb908..99200183c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1369,6 +1369,23 @@ config CHECKPOINT_RESTORE If unsure, say N here. 
+config SCHED_BORE + bool "Burst-Oriented Response Enhancer" + default y + help + In desktop and mobile computing, one might prefer interactive + tasks to stay responsive no matter what is running in the background. + + Enabling this kernel feature makes the scheduler discriminate + between tasks by their burst time (the runtime accumulated since a + task last slept or yielded) and prioritize the less bursty ones. + Such tasks usually include window compositors, widget backends, + terminal emulators, video playback, games, and so on. + At a small cost to scheduling fairness, it may improve + responsiveness, especially under heavy background workloads. + + If unsure, say Y here. + config SCHED_AUTOGROUP bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 0f78364ef..cf64ee42c 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -81,3 +81,20 @@ config HZ config SCHED_HRTICK def_bool HIGH_RES_TIMERS + +config MIN_BASE_SLICE_NS + int "Default value for min_base_slice_ns" + default 2000000 + help + The BORE Scheduler automatically calculates the optimal base + slice for the configured HZ using the following equation: + + base_slice_ns = + 1000000000/HZ * DIV_ROUND_UP(min_base_slice_ns, 1000000000/HZ) + + This option sets the default lower bound of the base slice + to prevent the loss of task throughput due to overscheduling. + + Setting this value too high can cause the system to boot with + an unnecessarily large base slice, resulting in high scheduling + latency and poor system responsiveness. diff --git a/kernel/fork.c b/kernel/fork.c index a21920fef..cea9f2300 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -117,6 +117,10 @@ #include #include +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + #include #define CREATE_TRACE_POINTS @@ -2532,6 +2536,10 @@ __latent_entropy struct task_struct *copy_process( * Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); +#ifdef CONFIG_SCHED_BORE + if (likely(p->pid)) + task_fork_bore(p, current, clone_flags, p->start_time); +#endif /* CONFIG_SCHED_BORE */ /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index a9056acb7..e60425086 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -4,6 +4,9 @@ #include #include #include +#ifdef CONFIG_SCHED_BORE +#include +#endif // CONFIG_SCHED_BORE #include "futex.h" @@ -367,7 +370,15 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, * is no timeout, or if it has yet to expire.
*/ if (!timeout || timeout->task) +#ifdef CONFIG_SCHED_BORE + { + current->bore.futex_waiting = true; +#endif // CONFIG_SCHED_BORE schedule(); +#ifdef CONFIG_SCHED_BORE + current->bore.futex_waiting = false; + } +#endif // CONFIG_SCHED_BORE } __set_current_state(TASK_RUNNING); } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 976092b7b..83cbd093b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -32,3 +32,4 @@ obj-y += core.o obj-y += fair.o obj-y += build_policy.o obj-y += build_utility.o +obj-$(CONFIG_SCHED_BORE) += bore.o diff --git a/kernel/sched/bore.c b/kernel/sched/bore.c new file mode 100644 index 000000000..c9e76eda3 --- /dev/null +++ b/kernel/sched/bore.c @@ -0,0 +1,393 @@ +/* + * Burst-Oriented Response Enhancer (BORE) CPU Scheduler + * Copyright (C) 2021-2025 Masahito Suzuki + */ +#include +#include +#include +#include "sched.h" + +#ifdef CONFIG_SCHED_BORE +u8 __read_mostly sched_bore = 1; +u8 __read_mostly sched_burst_inherit_type = 2; +u8 __read_mostly sched_burst_smoothness = 1; +u8 __read_mostly sched_burst_penalty_offset = 24; +uint __read_mostly sched_burst_penalty_scale = 1536; +uint __read_mostly sched_burst_cache_lifetime = 75000000; +static int __maybe_unused maxval_prio = 39; +static int __maybe_unused maxval_6_bits = 63; +static int __maybe_unused maxval_8_bits = 255; +static int __maybe_unused maxval_12_bits = 4095; + +#define MAX_BURST_PENALTY ((40U << 8) - 1) +#define BURST_CACHE_STOP_COUNT 63 + +static u32 (*inherit_penalty_fn)(struct task_struct *, u64, u64); + +static inline u32 log2p1_u64_u32fp(u64 v, u8 fp) { + if (!v) return 0; + u32 exponent = fls64(v), + mantissa = (u32)(v << (64 - exponent) << 1 >> (64 - fp)); + return exponent << fp | mantissa; +} + +static inline u32 calc_burst_penalty(u64 burst_time) { + u32 greed = log2p1_u64_u32fp(burst_time, 8), + tolerance = sched_burst_penalty_offset << 8, + penalty = max(0, (s32)(greed - tolerance)), + scaled_penalty = penalty * sched_burst_penalty_scale >> 10; + return min(MAX_BURST_PENALTY, scaled_penalty); +} + +static inline u64 rescale_slice(u64 delta, u8 old_prio, u8 new_prio) { + u64 unscaled, rescaled; + unscaled = mul_u64_u32_shr(delta , sched_prio_to_weight[old_prio], 10); + rescaled = mul_u64_u32_shr(unscaled, sched_prio_to_wmult [new_prio], 22); + return rescaled; +} + +static inline u32 binary_smooth(u32 new, u32 old) { + if (new <= old) return new; + + u32 increment = new - old, + shift = sched_burst_smoothness, + divisor = 1U << shift; + + return old + ((increment + divisor - 1) >> shift); +} + +static void reweight_task_by_prio(struct task_struct *p, int prio) { + if (task_has_idle_policy(p)) return; + + struct sched_entity *se = &p->se; + unsigned long weight = scale_load(sched_prio_to_weight[prio]); + + if (se->on_rq) { + p->bore.stop_update = true; + reweight_entity(cfs_rq_of(se), se, weight); + p->bore.stop_update = false; + } else + se->load.weight = weight; + se->load.inv_weight = sched_prio_to_wmult[prio]; +} + +u8 effective_prio_bore(struct task_struct *p) { + int prio = p->static_prio - MAX_RT_PRIO; + if (likely(sched_bore)) + prio += p->bore.score; + return (u8)clamp(prio, 0, maxval_prio); +} + +static void update_penalty(struct task_struct *p) { + struct bore_ctx *ctx = &p->bore; + + u8 prev_prio = effective_prio_bore(p); + u32 penalty = 0; + + if (!(p->flags & PF_KTHREAD)) { + u32 curr_penalty = ctx->curr_penalty; + penalty = ctx->prev_penalty; + if (penalty < curr_penalty) + penalty = curr_penalty; + } + ctx->penalty = penalty; + + u8 new_prio = 
effective_prio_bore(p); + if (new_prio != prev_prio) + reweight_task_by_prio(p, new_prio); +} + +void update_curr_bore(struct task_struct *p, u64 delta_exec) { + struct bore_ctx *ctx = &p->bore; + if (ctx->stop_update) return; + + ctx->burst_time += delta_exec; + u32 curr_penalty = ctx->curr_penalty = calc_burst_penalty(ctx->burst_time); + + if (curr_penalty <= ctx->prev_penalty) return; + update_penalty(p); +} + +void restart_burst_bore(struct task_struct *p) { + struct bore_ctx *ctx = &p->bore; + u32 new_penalty = binary_smooth(ctx->curr_penalty, ctx->prev_penalty); + ctx->prev_penalty = new_penalty; + ctx->curr_penalty = 0; + ctx->burst_time = 0; + update_penalty(p); +} + +void restart_burst_rescale_deadline_bore(struct task_struct *p) { + struct sched_entity *se = &p->se; + s64 vscaled, vremain = se->deadline - se->vruntime; + + u8 old_prio = effective_prio_bore(p); + restart_burst_bore(p); + u8 new_prio = effective_prio_bore(p); + + if (old_prio > new_prio) { + vscaled = rescale_slice(abs(vremain), old_prio, new_prio); + if (unlikely(vremain < 0)) + vscaled = -vscaled; + se->deadline = se->vruntime + vscaled; + } +} + +static inline bool task_is_bore_eligible(struct task_struct *p) +{return p && p->sched_class == &fair_sched_class && !p->exit_state;} + +#ifndef for_each_child_task +#define for_each_child_task(p, t) \ + list_for_each_entry(t, &(p)->children, sibling) +#endif + +static inline u32 count_children_upto2(struct task_struct *p) { + struct list_head *head = &p->children; + struct list_head *next = head->next; + return (next != head) + (next->next != head); +} + +static inline bool burst_cache_expired(struct bore_bc *bc, u64 now) { + u64 timestamp = bc->timestamp << BORE_BC_TIMESTAMP_SHIFT; + return now - timestamp > sched_burst_cache_lifetime; +} + +static void update_burst_cache(struct bore_bc *bc, + struct task_struct *p, u32 count, u32 total, u64 now) { + u32 average = count ? 
total / count : 0; + bc->penalty = max(average, p->bore.penalty); + bc->timestamp = now >> BORE_BC_TIMESTAMP_SHIFT; +} + +static u32 inherit_none(struct task_struct *parent, + u64 clone_flags, u64 now) +{ return 0; } + +static u32 inherit_from_parent(struct task_struct *parent, + u64 clone_flags, u64 now) { + if (clone_flags & CLONE_PARENT) + parent = parent->real_parent; + + struct bore_bc *bc = &parent->bore.subtree; + + if (burst_cache_expired(bc, now)) { + struct task_struct *child; + u32 count = 0, total = 0; + for_each_child_task(parent, child) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + if (!task_is_bore_eligible(child)) continue; + count++; + total += child->bore.penalty; + } + + update_burst_cache(bc, parent, count, total, now); + } + + return bc->penalty; +} + +static u32 inherit_from_ancestor_hub(struct task_struct *parent, + u64 clone_flags, u64 now) { + struct task_struct *ancestor = parent; + u32 sole_child_count = 0; + + if (clone_flags & CLONE_PARENT) { + ancestor = ancestor->real_parent; + sole_child_count = 1; + } + + for (struct task_struct *next; + (next = ancestor->real_parent) != ancestor && + count_children_upto2(ancestor) <= sole_child_count; + ancestor = next, sole_child_count = 1) {} + + struct bore_bc *bc = &ancestor->bore.subtree; + + if (burst_cache_expired(bc, now)) { + struct task_struct *direct_child; + u32 count = 0, total = 0; + for_each_child_task(ancestor, direct_child) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + struct task_struct *descendant = direct_child; + while (count_children_upto2(descendant) == 1) + descendant = list_first_entry(&descendant->children, + struct task_struct, sibling); + + if (!task_is_bore_eligible(descendant)) continue; + count++; + total += descendant->bore.penalty; + } + + update_burst_cache(bc, ancestor, count, total, now); + } + + return bc->penalty; +} + +static u32 inherit_from_thread_group(struct task_struct *p, u64 now) { + struct task_struct *leader = p->group_leader; + struct bore_bc *bc = &leader->bore.group; + + if (burst_cache_expired(bc, now)) { + struct task_struct *sibling; + u32 count = 0, total = 0; + + for_each_thread(leader, sibling) { + if (count >= BURST_CACHE_STOP_COUNT) break; + + if (!task_is_bore_eligible(sibling)) continue; + count++; + total += sibling->bore.penalty; + } + + update_burst_cache(bc, leader, count, total, now); + } + + return bc->penalty; +} + +void task_fork_bore(struct task_struct *p, + struct task_struct *parent, u64 clone_flags, u64 now) { + if (!task_is_bore_eligible(p) || unlikely(!sched_bore)) return; + + struct bore_ctx *ctx = &p->bore; + u32 inherited_penalty = (clone_flags & CLONE_THREAD)? 
+ inherit_from_thread_group(parent, now): + inherit_penalty_fn(parent, clone_flags, now); + + if (ctx->prev_penalty < inherited_penalty) + ctx->prev_penalty = inherited_penalty; + ctx->curr_penalty = 0; + ctx->burst_time = 0; + ctx->stop_update = false; + ctx->futex_waiting = false; + update_penalty(p); +} + +void reset_task_bore(struct task_struct *p) +{ memset(&p->bore, 0, sizeof(struct bore_ctx)); } + +static void update_inherit_type(void) { + switch(sched_burst_inherit_type) { + case 1: + inherit_penalty_fn = inherit_from_parent; + break; + case 2: + inherit_penalty_fn = inherit_from_ancestor_hub; + break; + default: + inherit_penalty_fn = inherit_none; + } +} + +void __init sched_init_bore(void) { + printk(KERN_INFO "%s %s by %s\n", + SCHED_BORE_PROGNAME, SCHED_BORE_VERSION, SCHED_BORE_AUTHOR); + + reset_task_bore(&init_task); + update_inherit_type(); +} + +static void readjust_all_task_weights(void) { + struct task_struct *task; + struct rq *rq; + struct rq_flags rf; + + scoped_guard(write_lock_irq, &tasklist_lock) + for_each_process(task) { + if (!task_is_bore_eligible(task)) continue; + rq = task_rq_lock(task, &rf); + update_rq_clock(rq); + reweight_task_by_prio(task, effective_prio_bore(task)); + task_rq_unlock(rq, task, &rf); + } +} + +int sched_bore_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + readjust_all_task_weights(); + + return 0; +} + +int sched_burst_inherit_type_update_handler(const struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + update_inherit_type(); + + return 0; +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_bore_sysctls[] = { + { + .procname = "sched_bore", + .data = &sched_bore, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_bore_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_burst_inherit_type", + .data = &sched_burst_inherit_type, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = sched_burst_inherit_type_update_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "sched_burst_smoothness", + .data = &sched_burst_smoothness, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, + { + .procname = "sched_burst_penalty_offset", + .data = &sched_burst_penalty_offset, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &maxval_6_bits, + }, + { + .procname = "sched_burst_penalty_scale", + .data = &sched_burst_penalty_scale, + .maxlen = sizeof(uint), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &maxval_12_bits, + }, + { + .procname = "sched_burst_cache_lifetime", + .data = &sched_burst_cache_lifetime, + .maxlen = sizeof(uint), + .mode = 0644, + .proc_handler = proc_douintvec, + }, +}; + +static int __init sched_bore_sysctl_init(void) { + register_sysctl_init("kernel", sched_bore_sysctls); + return 0; +} +late_initcall(sched_bore_sysctl_init); + +#endif // CONFIG_SYSCTL +#endif /* CONFIG_SCHED_BORE */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4b1953b6c..3d78a3c07 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -97,6 +97,10 @@ 
#include "../../io_uring/io-wq.h" #include "../smpboot.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); @@ -1370,7 +1374,11 @@ int tg_nop(struct task_group *tg, void *data) void set_load_weight(struct task_struct *p, bool update_load) { +#ifdef CONFIG_SCHED_BORE + int prio = effective_prio_bore(p); +#else /* !CONFIG_SCHED_BORE */ int prio = p->static_prio - MAX_RT_PRIO; +#endif /* CONFIG_SCHED_BORE */ struct load_weight lw; if (task_has_idle_policy(p)) { @@ -8406,6 +8414,10 @@ void __init sched_init(void) BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); #endif +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); +#endif /* CONFIG_SCHED_BORE */ + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 9815f9a0c..88bafe9f9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -167,7 +167,53 @@ static const struct file_operations sched_feat_fops = { }; #ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_BORE +#define DEFINE_SYSCTL_SCHED_FUNC(name, update_func) \ +static ssize_t sched_##name##_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) \ +{ \ + char buf[16]; \ + unsigned int value; \ +\ + if (cnt > 15) \ + cnt = 15; \ +\ + if (copy_from_user(&buf, ubuf, cnt)) \ + return -EFAULT; \ + buf[cnt] = '\0'; \ +\ + if (kstrtouint(buf, 10, &value)) \ + return -EINVAL; \ +\ + sysctl_sched_##name = value; \ + sched_update_##update_func(); \ +\ + *ppos += cnt; \ + return cnt; \ +} \ +\ +static int sched_##name##_show(struct seq_file *m, void *v) \ +{ \ + seq_printf(m, "%d\n", sysctl_sched_##name); \ + return 0; \ +} \ +\ +static int sched_##name##_open(struct inode *inode, struct file *filp) \ +{ \ + return single_open(filp, sched_##name##_show, NULL); \ +} \ +\ +static const struct file_operations sched_##name##_fops = { \ + .open = sched_##name##_open, \ + .write = sched_##name##_write, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; +DEFINE_SYSCTL_SCHED_FUNC(min_base_slice, min_base_slice) + +#undef DEFINE_SYSCTL_SCHED_FUNC +#else /* !CONFIG_SCHED_BORE */ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -213,7 +259,7 @@ static const struct file_operations sched_scaling_fops = { .llseek = seq_lseek, .release = single_release, }; - +#endif /* CONFIG_SCHED_BORE */ #endif /* SMP */ #ifdef CONFIG_PREEMPT_DYNAMIC @@ -504,13 +550,20 @@ static __init int sched_init_debug(void) debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); #endif +#ifdef CONFIG_SCHED_BORE + debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); + debugfs_create_u32("base_slice_ns", 0444, debugfs_sched, &sysctl_sched_base_slice); +#else /* !CONFIG_SCHED_BORE */ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); +#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); #ifdef CONFIG_SMP +#if !defined(CONFIG_SCHED_BORE) debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); +#endif /* CONFIG_SCHED_BORE */ debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, 
debugfs_sched, &sysctl_sched_nr_migrate); @@ -755,6 +808,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); +#ifdef CONFIG_SCHED_BORE + SEQ_printf(m, " %2d", p->bore.score); +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif @@ -1245,6 +1301,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.load.weight); #ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_BORE + P(bore.score); +#endif /* CONFIG_SCHED_BORE */ P(se.avg.load_sum); P(se.avg.runnable_sum); P(se.avg.util_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7091e1364..a1e9ce4dc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -55,6 +55,10 @@ #include "stats.h" #include "autogroup.h" +#ifdef CONFIG_SCHED_BORE +#include +#endif /* CONFIG_SCHED_BORE */ + /* * The initial- and re-scaling of tunables is configurable * @@ -64,28 +68,32 @@ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * - * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * BORE : default SCHED_TUNABLESCALING_NONE = *1 constant + * EEVDF: default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ +#ifdef CONFIG_SCHED_BORE +unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +#else /* !CONFIG_SCHED_BORE */ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; +#endif /* CONFIG_SCHED_BORE */ /* * Minimal preemption granularity for CPU-bound tasks: * - * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) + * BORE : base_slice = minimum multiple of nsecs_per_tick >= min_base_slice + * (default min_base_slice = 2000000 constant, units: nanoseconds) + * EEVDF: default 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds */ -#ifdef CONFIG_CACHY -unsigned int sysctl_sched_base_slice = 350000ULL; -static unsigned int normalized_sysctl_sched_base_slice = 350000ULL; -#else -unsigned int sysctl_sched_base_slice = 750000ULL; -static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; -#endif +#ifdef CONFIG_SCHED_BORE +static const unsigned int nsecs_per_tick = 1000000000ULL / HZ; +unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; +__read_mostly uint sysctl_sched_base_slice = nsecs_per_tick; +#else /* !CONFIG_SCHED_BORE */ +unsigned int sysctl_sched_base_slice = 700000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; +#endif /* CONFIG_SCHED_BORE */ -#ifdef CONFIG_CACHY -const_debug unsigned int sysctl_sched_migration_cost = 300000UL; -#else const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -#endif static int __init setup_sched_thermal_decay_shift(char *str) { @@ -130,12 +138,8 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ -#ifdef CONFIG_CACHY -static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; -#else static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif -#endif #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ @@ -201,6 +205,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ +#ifdef CONFIG_SCHED_BORE +static void update_sysctl(void) { + sysctl_sched_base_slice = nsecs_per_tick * + max(1UL, DIV_ROUND_UP(sysctl_sched_min_base_slice, nsecs_per_tick)); +} +void sched_update_min_base_slice(void) { update_sysctl(); } +#else /* !CONFIG_SCHED_BORE */ static unsigned int get_update_sysctl_factor(void) { unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); @@ -231,6 +242,7 @@ static void update_sysctl(void) SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } +#endif /* CONFIG_SCHED_BORE */ void __init sched_init_granularity(void) { @@ -708,6 +720,9 @@ static s64 entity_lag(u64 avruntime, struct sched_entity *se) vlag = avruntime - se->vruntime; limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); +#ifdef CONFIG_SCHED_BORE + limit >>= !!sched_bore; +#endif /* CONFIG_SCHED_BORE */ return clamp(vlag, -limit, limit); } @@ -954,7 +969,16 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; +#if !defined(CONFIG_SCHED_BORE) if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) +#else /* CONFIG_SCHED_BORE */ + bool run_to_parity = likely(sched_bore) ? + sched_feat(RUN_TO_PARITY_BORE) : sched_feat(RUN_TO_PARITY); + if (run_to_parity && curr && protect_slice(curr) && + (!entity_is_task(curr) || + !task_of(curr)->bore.futex_waiting || + unlikely(!sched_bore))) +#endif /* CONFIG_SCHED_BORE */ return curr; /* Pick the leftmost entity if it's eligible */ @@ -1013,6 +1037,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ #ifdef CONFIG_SMP +#if !defined(CONFIG_SCHED_BORE) int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1024,6 +1049,7 @@ int sched_update_scaling(void) return 0; } +#endif /* CONFIG_SCHED_BORE */ #endif #endif @@ -1261,6 +1287,9 @@ static void update_curr(struct cfs_rq *cfs_rq) if (entity_is_task(curr)) { struct task_struct *p = task_of(curr); +#ifdef CONFIG_SCHED_BORE + update_curr_bore(p, delta_exec); +#endif /* CONFIG_SCHED_BORE */ update_curr_task(p, delta_exec); /* @@ -3909,7 +3938,7 @@ static void reweight_eevdf(struct sched_entity *se, u64 avruntime, se->deadline = avruntime + vslice; } -static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, +void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; @@ -5309,12 +5338,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 vslice, vruntime = avg_vruntime(cfs_rq); + u64 vslice = 0, vruntime = avg_vruntime(cfs_rq); s64 lag = 0; if (!se->custom_slice) se->slice = sysctl_sched_base_slice; - vslice = calc_delta_fair(se->slice, se); /* * Due to how V is constructed as the weighted average of entities, @@ -5399,7 +5427,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->rel_deadline = 0; return; } - +#ifdef CONFIG_SCHED_BORE + if (entity_is_task(se) && + likely(sched_bore) && + task_of(se)->bore.futex_waiting) + goto vslice_found; +#endif /* !CONFIG_SCHED_BORE */ + vslice = calc_delta_fair(se->slice, se); +#ifdef CONFIG_SCHED_BORE + if (likely(sched_bore)) + vslice >>= !!(flags & (ENQUEUE_INITIAL | ENQUEUE_WAKEUP)); + 
else +#endif /* CONFIG_SCHED_BORE */ /* * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks @@ -5408,6 +5447,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) vslice /= 2; +#ifdef CONFIG_SCHED_BORE +vslice_found: +#endif /* CONFIG_SCHED_BORE */ /* * EEVDF: vd_i = ve_i + r_i/w_i */ @@ -5420,7 +5462,7 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static inline bool cfs_bandwidth_used(void); static void -requeue_delayed_entity(struct sched_entity *se); +requeue_delayed_entity(struct sched_entity *se, int flags); static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -5585,6 +5627,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { update_load_avg(cfs_rq, se, 0); +#ifdef CONFIG_SCHED_BORE + if (sched_feat(DELAY_ZERO) && likely(sched_bore)) + update_entity_lag(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ set_delayed(se); return false; } @@ -7011,7 +7057,7 @@ static int sched_idle_cpu(int cpu) #endif static void -requeue_delayed_entity(struct sched_entity *se) +requeue_delayed_entity(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -7024,13 +7070,22 @@ requeue_delayed_entity(struct sched_entity *se) SCHED_WARN_ON(!se->on_rq); if (sched_feat(DELAY_ZERO)) { +#ifdef CONFIG_SCHED_BORE + if (likely(sched_bore)) + flags |= ENQUEUE_WAKEUP; + else { +#endif /* CONFIG_SCHED_BORE */ + flags = 0; update_entity_lag(cfs_rq, se); +#ifdef CONFIG_SCHED_BORE + } +#endif /* CONFIG_SCHED_BORE */ if (se->vlag > 0) { cfs_rq->nr_running--; if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->vlag = 0; - place_entity(cfs_rq, se, 0); + place_entity(cfs_rq, se, flags); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); cfs_rq->nr_running++; @@ -7067,7 +7122,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_enqueue(&rq->cfs, p); if (flags & ENQUEUE_DELAYED) { - requeue_delayed_entity(se); + requeue_delayed_entity(se, flags); return; } @@ -7085,7 +7140,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { if (se->on_rq) { if (se->sched_delayed) - requeue_delayed_entity(se); + requeue_delayed_entity(se, flags); break; } cfs_rq = cfs_rq_of(se); @@ -7315,6 +7370,15 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); +#ifdef CONFIG_SCHED_BORE + struct cfs_rq *cfs_rq = &rq->cfs; + struct sched_entity *se = &p->se; + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { + if (cfs_rq->curr == se) + update_curr(cfs_rq_of(&p->se)); + restart_burst_bore(p); + } +#endif /* CONFIG_SCHED_BORE */ if (dequeue_entities(rq, &p->se, flags) < 0) return false; @@ -9144,16 +9208,25 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? */ +#if !defined(CONFIG_SCHED_BORE) if (unlikely(rq->nr_running == 1)) return; clear_buddies(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ update_rq_clock(rq); /* * Update run-time statistics of the 'current'. 
*/ update_curr(cfs_rq); +#ifdef CONFIG_SCHED_BORE + restart_burst_rescale_deadline_bore(curr); + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); +#endif /* CONFIG_SCHED_BORE */ /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() @@ -13345,6 +13418,9 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) SCHED_WARN_ON(p->se.sched_delayed); attach_task_cfs_rq(p); +#ifdef CONFIG_SCHED_BORE + reset_task_bore(p); +#endif /* CONFIG_SCHED_BORE */ set_task_max_allowed_capacity(p); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 050d75030..b81b5a147 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -18,6 +18,9 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true) * 0-lag point or until is has exhausted it's slice. */ SCHED_FEAT(RUN_TO_PARITY, true) +#ifdef CONFIG_SCHED_BORE +SCHED_FEAT(RUN_TO_PARITY_BORE, false) +#endif /* CONFIG_SCHED_BORE */ /* * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for * current. diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ad090ec9b..536fda965 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2085,7 +2085,11 @@ static inline void update_sched_domain_debugfs(void) { } static inline void dirty_sched_domain_sysctl(int cpu) { } #endif +#ifdef CONFIG_SCHED_BORE +extern void sched_update_min_base_slice(void); +#else /* !CONFIG_SCHED_BORE */ extern int sched_update_scaling(void); +#endif /* CONFIG_SCHED_BORE */ static inline const struct cpumask *task_user_cpus(struct task_struct *p) { @@ -2807,7 +2811,12 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_BORE +extern unsigned int sysctl_sched_min_base_slice; +extern __read_mostly uint sysctl_sched_base_slice; +#else /* !CONFIG_SCHED_BORE */ extern unsigned int sysctl_sched_base_slice; +#endif /* CONFIG_SCHED_BORE */ #ifdef CONFIG_SCHED_DEBUG extern int sysctl_resched_latency_warn_ms; -- 2.51.0
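For reference, the fixed-point burst-to-penalty mapping implemented above by log2p1_u64_u32fp(), calc_burst_penalty() and the score byte consumed by effective_prio_bore() can be modelled in userspace. The sketch below is an illustration only, not part of the patch; it assumes the default tunables declared in kernel/sched/bore.c (sched_burst_penalty_offset = 24, sched_burst_penalty_scale = 1536), and the file name and helper names (bore_penalty_demo.c, fls64_user, penalty_offset, penalty_scale, samples_ns) are illustrative stand-ins rather than kernel symbols.

/*
 * Illustration only -- NOT part of the patch above.
 * Userspace model of the burst-to-penalty arithmetic in kernel/sched/bore.c,
 * using the default tunables declared there
 * (sched_burst_penalty_offset = 24, sched_burst_penalty_scale = 1536).
 * Build: cc -O2 -o bore_penalty_demo bore_penalty_demo.c
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_BURST_PENALTY ((40u << 8) - 1)      /* caps the score at 39 nice levels */

static const unsigned int penalty_offset = 24;   /* stand-in for sched_burst_penalty_offset */
static const unsigned int penalty_scale  = 1536; /* stand-in for sched_burst_penalty_scale  */

/* userspace stand-in for the kernel's fls64(): 1-based index of the MSB */
static unsigned int fls64_user(uint64_t v)
{
        return v ? 64 - (unsigned int)__builtin_clzll(v) : 0;
}

/* fixed-point log2(v) + 1 with 'fp' fractional bits, as in bore.c */
static uint32_t log2p1_u64_u32fp(uint64_t v, unsigned int fp)
{
        if (!v)
                return 0;
        unsigned int exponent = fls64_user(v);
        uint32_t mantissa = (uint32_t)(v << (64 - exponent) << 1 >> (64 - fp));
        return exponent << fp | mantissa;
}

/* mirrors calc_burst_penalty(): 8.8 fixed-point penalty from a burst length */
static uint32_t calc_burst_penalty(uint64_t burst_time_ns)
{
        uint32_t greed = log2p1_u64_u32fp(burst_time_ns, 8);
        int32_t penalty = (int32_t)greed - (int32_t)(penalty_offset << 8);
        if (penalty < 0)
                penalty = 0;
        uint32_t scaled = (uint32_t)penalty * penalty_scale >> 10;
        return scaled < MAX_BURST_PENALTY ? scaled : MAX_BURST_PENALTY;
}

int main(void)
{
        /* sample burst lengths, in nanoseconds */
        const uint64_t samples_ns[] = {
                100000ull, 1000000ull, 10000000ull,
                100000000ull, 1000000000ull, 10000000000ull,
        };

        printf("%14s %9s %6s\n", "burst (ns)", "penalty", "score");
        for (size_t i = 0; i < sizeof(samples_ns) / sizeof(samples_ns[0]); i++) {
                uint32_t penalty = calc_burst_penalty(samples_ns[i]);
                /*
                 * The score is the high byte of the 16-bit penalty; it is what
                 * effective_prio_bore() adds to (static_prio - MAX_RT_PRIO),
                 * i.e. a 0..39 nice-level handicap for bursty tasks.
                 */
                printf("%14llu %9u %6u\n", (unsigned long long)samples_ns[i],
                       (unsigned int)penalty, (unsigned int)(penalty >> 8));
        }
        return 0;
}

With these defaults the model shows that bursts shorter than roughly 2^23 ns (about 8 ms) accrue no penalty at all, and the handicap then grows by about 1.5 nice levels per doubling of the burst length until it saturates at 39. The base-slice side is simpler: update_sysctl() above computes base_slice_ns = nsecs_per_tick * max(1, DIV_ROUND_UP(min_base_slice_ns, nsecs_per_tick)), so with HZ=1000 (tick = 1,000,000 ns) and the default min_base_slice_ns = 2,000,000 the base slice is exactly 2,000,000 ns, while with HZ=300 (tick = 3,333,333 ns) it rounds up to one full tick, 3,333,333 ns.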