From 1454d6c7055db07368339dc6a16a03d5a85f8fe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20G=C3=B3rski?= Date: Tue, 24 Oct 2023 21:41:31 +0200 Subject: [PATCH] 6.6: EEVDF adaptation to BORE 3.2.8 (#829) Signed-off-by: Piotr Gorski --- linux-tkg-patches/6.6/0003-eevdf.patch | 1394 ++++++++++++++++++++++++ 1 file changed, 1394 insertions(+) create mode 100644 linux-tkg-patches/6.6/0003-eevdf.patch diff --git a/linux-tkg-patches/6.6/0003-eevdf.patch b/linux-tkg-patches/6.6/0003-eevdf.patch new file mode 100644 index 0000000..f48058c --- /dev/null +++ b/linux-tkg-patches/6.6/0003-eevdf.patch @@ -0,0 +1,1394 @@ +From b4e32953bb6b22b413ac442ec155e8a7660f90a3 Mon Sep 17 00:00:00 2001 +From: Ingo Molnar +Date: Tue, 19 Sep 2023 10:31:15 +0200 +Subject: [PATCH 01/11] sched/fair: Rename check_preempt_wakeup() to + check_preempt_wakeup_fair() + +Other scheduling classes already postfix their similar methods +with the class name. + +Signed-off-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +--- + kernel/sched/fair.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index df348aa55..ab1ad125c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8052,7 +8052,7 @@ static void set_next_buddy(struct sched_entity *se) + /* + * Preempt the current task with a newly woken task if needed: + */ +-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) + { + struct task_struct *curr = rq->curr; + struct sched_entity *se = &curr->se, *pse = &p->se; +@@ -12908,7 +12908,7 @@ DEFINE_SCHED_CLASS(fair) = { + .yield_task = yield_task_fair, + .yield_to_task = yield_to_task_fair, + +- .check_preempt_curr = check_preempt_wakeup, ++ .check_preempt_curr = check_preempt_wakeup_fair, + + .pick_next_task = __pick_next_task_fair, + .put_prev_task = put_prev_task_fair, +-- +2.42.0 + + +From 
f5ca136342a997ec670cc0276b23cd3dc892d708 Mon Sep 17 00:00:00 2001 +From: Ingo Molnar +Date: Tue, 19 Sep 2023 10:38:21 +0200 +Subject: [PATCH 02/11] sched/fair: Rename check_preempt_curr() to + wakeup_preempt() + +The name is a bit opaque - make it clear that this is about wakeup +preemption. + +Also rename the ->check_preempt_curr() methods similarly. + +Signed-off-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +--- + kernel/sched/core.c | 14 +++++++------- + kernel/sched/deadline.c | 10 +++++----- + kernel/sched/fair.c | 10 +++++----- + kernel/sched/idle.c | 4 ++-- + kernel/sched/rt.c | 6 +++--- + kernel/sched/sched.h | 4 ++-- + kernel/sched/stop_task.c | 4 ++-- + 7 files changed, 26 insertions(+), 26 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 802551e00..28768a3b2 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -2218,10 +2218,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, + p->sched_class->prio_changed(rq, p, oldprio); + } + +-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) ++void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) + { + if (p->sched_class == rq->curr->sched_class) +- rq->curr->sched_class->check_preempt_curr(rq, p, flags); ++ rq->curr->sched_class->wakeup_preempt(rq, p, flags); + else if (sched_class_above(p->sched_class, rq->curr->sched_class)) + resched_curr(rq); + +@@ -2527,7 +2527,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, + rq_lock(rq, rf); + WARN_ON_ONCE(task_cpu(p) != new_cpu); + activate_task(rq, p, 0); +- check_preempt_curr(rq, p, 0); ++ wakeup_preempt(rq, p, 0); + + return rq; + } +@@ -3409,7 +3409,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) + deactivate_task(src_rq, p, 0); + set_task_cpu(p, cpu); + activate_task(dst_rq, p, 0); +- check_preempt_curr(dst_rq, p, 0); ++ wakeup_preempt(dst_rq, p, 0); + + rq_unpin_lock(dst_rq, &drf); + rq_unpin_lock(src_rq, 
&srf); +@@ -3785,7 +3785,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + } + + activate_task(rq, p, en_flags); +- check_preempt_curr(rq, p, wake_flags); ++ wakeup_preempt(rq, p, wake_flags); + + ttwu_do_wakeup(p); + +@@ -3856,7 +3856,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) + * it should preempt the task that is current now. + */ + update_rq_clock(rq); +- check_preempt_curr(rq, p, wake_flags); ++ wakeup_preempt(rq, p, wake_flags); + } + ttwu_do_wakeup(p); + ret = 1; +@@ -4871,7 +4871,7 @@ void wake_up_new_task(struct task_struct *p) + + activate_task(rq, p, ENQUEUE_NOCLOCK); + trace_sched_wakeup_new(p); +- check_preempt_curr(rq, p, WF_FORK); ++ wakeup_preempt(rq, p, WF_FORK); + #ifdef CONFIG_SMP + if (p->sched_class->task_woken) { + /* +diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c +index 58b542bf2..fb1996a67 100644 +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -763,7 +763,7 @@ static inline void deadline_queue_pull_task(struct rq *rq) + + static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); + static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); +-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); ++static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags); + + static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, + struct rq *rq) +@@ -1175,7 +1175,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) + + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (dl_task(rq->curr)) +- check_preempt_curr_dl(rq, p, 0); ++ wakeup_preempt_dl(rq, p, 0); + else + resched_curr(rq); + +@@ -1939,7 +1939,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + * Only called when both the current and waking task are -deadline + * tasks. 
+ */ +-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, ++static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, + int flags) + { + if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { +@@ -2652,7 +2652,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) + deadline_queue_push_tasks(rq); + #endif + if (dl_task(rq->curr)) +- check_preempt_curr_dl(rq, p, 0); ++ wakeup_preempt_dl(rq, p, 0); + else + resched_curr(rq); + } else { +@@ -2721,7 +2721,7 @@ DEFINE_SCHED_CLASS(dl) = { + .dequeue_task = dequeue_task_dl, + .yield_task = yield_task_dl, + +- .check_preempt_curr = check_preempt_curr_dl, ++ .wakeup_preempt = wakeup_preempt_dl, + + .pick_next_task = pick_next_task_dl, + .put_prev_task = put_prev_task_dl, +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index ab1ad125c..785e8611e 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8065,7 +8065,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int + + /* + * This is possible from callers such as attach_tasks(), in which we +- * unconditionally check_preempt_curr() after an enqueue (which may have ++ * unconditionally wakeup_preempt() after an enqueue (which may have + * lead to a throttle). This both saves work and prevents false + * next-buddy nomination below. 
+ */ +@@ -8972,7 +8972,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) + + WARN_ON_ONCE(task_rq(p) != rq); + activate_task(rq, p, ENQUEUE_NOCLOCK); +- check_preempt_curr(rq, p, 0); ++ wakeup_preempt(rq, p, 0); + } + + /* +@@ -12447,7 +12447,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) + if (p->prio > oldprio) + resched_curr(rq); + } else +- check_preempt_curr(rq, p, 0); ++ wakeup_preempt(rq, p, 0); + } + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -12549,7 +12549,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) + if (task_current(rq, p)) + resched_curr(rq); + else +- check_preempt_curr(rq, p, 0); ++ wakeup_preempt(rq, p, 0); + } + } + +@@ -12908,7 +12908,7 @@ DEFINE_SCHED_CLASS(fair) = { + .yield_task = yield_task_fair, + .yield_to_task = yield_to_task_fair, + +- .check_preempt_curr = check_preempt_wakeup_fair, ++ .wakeup_preempt = check_preempt_wakeup_fair, + + .pick_next_task = __pick_next_task_fair, + .put_prev_task = put_prev_task_fair, +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index 5007b25c5..565f8374d 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -401,7 +401,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + /* + * Idle tasks are unconditionally rescheduled: + */ +-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) ++static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) + { + resched_curr(rq); + } +@@ -482,7 +482,7 @@ DEFINE_SCHED_CLASS(idle) = { + /* dequeue is not valid, we print a debug message there: */ + .dequeue_task = dequeue_task_idle, + +- .check_preempt_curr = check_preempt_curr_idle, ++ .wakeup_preempt = wakeup_preempt_idle, + + .pick_next_task = pick_next_task_idle, + .put_prev_task = put_prev_task_idle, +diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c +index 0597ba0f8..3e442fa3f 100644 +--- a/kernel/sched/rt.c ++++ b/kernel/sched/rt.c +@@ -953,7 +953,7 @@ 
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) + + /* + * When we're idle and a woken (rt) task is +- * throttled check_preempt_curr() will set ++ * throttled wakeup_preempt() will set + * skip_update and the time between the wakeup + * and this unthrottle will get accounted as + * 'runtime'. +@@ -1715,7 +1715,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + /* + * Preempt the current task with a newly woken task if needed: + */ +-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) ++static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) + { + if (p->prio < rq->curr->prio) { + resched_curr(rq); +@@ -2702,7 +2702,7 @@ DEFINE_SCHED_CLASS(rt) = { + .dequeue_task = dequeue_task_rt, + .yield_task = yield_task_rt, + +- .check_preempt_curr = check_preempt_curr_rt, ++ .wakeup_preempt = wakeup_preempt_rt, + + .pick_next_task = pick_next_task_rt, + .put_prev_task = put_prev_task_rt, +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 048462724..c3ca10d02 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2239,7 +2239,7 @@ struct sched_class { + void (*yield_task) (struct rq *rq); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p); + +- void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); ++ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + + struct task_struct *(*pick_next_task)(struct rq *rq); + +@@ -2513,7 +2513,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) + extern void activate_task(struct rq *rq, struct task_struct *p, int flags); + extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +-extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); ++extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); + + #ifdef CONFIG_PREEMPT_RT + #define SCHED_NR_MIGRATE_BREAK 8 
+diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c +index 85590599b..6cf7304e6 100644 +--- a/kernel/sched/stop_task.c ++++ b/kernel/sched/stop_task.c +@@ -23,7 +23,7 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + #endif /* CONFIG_SMP */ + + static void +-check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) ++wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags) + { + /* we're never preempted */ + } +@@ -120,7 +120,7 @@ DEFINE_SCHED_CLASS(stop) = { + .dequeue_task = dequeue_task_stop, + .yield_task = yield_task_stop, + +- .check_preempt_curr = check_preempt_curr_stop, ++ .wakeup_preempt = wakeup_preempt_stop, + + .pick_next_task = pick_next_task_stop, + .put_prev_task = put_prev_task_stop, +-- +2.42.0 + + +From 685dbae13ab75814a3706a3741db931d7d56bb8e Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior +Date: Wed, 20 Sep 2023 15:00:24 +0200 +Subject: [PATCH 03/11] sched/debug: Remove the + /proc/sys/kernel/sched_child_runs_first sysctl + +The /proc/sys/kernel/sched_child_runs_first knob is no longer connected since: + + 5e963f2bd4654 ("sched/fair: Commit to EEVDF") + +Remove it. 
+ +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Ingo Molnar +Link: https://lore.kernel.org/r/20230920130025.412071-2-bigeasy@linutronix.de +--- + kernel/sched/debug.c | 1 - + kernel/sched/fair.c | 13 ------------- + kernel/sched/sched.h | 2 -- + 3 files changed, 16 deletions(-) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 4c3d0d9f3..132dfd1e6 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -864,7 +864,6 @@ static void sched_debug_header(struct seq_file *m) + #define PN(x) \ + SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + PN(sysctl_sched_base_slice); +- P(sysctl_sched_child_runs_first); + P(sysctl_sched_features); + #undef PN + #undef P +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 785e8611e..829817ef6 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -78,12 +78,6 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; + unsigned int sysctl_sched_base_slice = 750000ULL; + static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; + +-/* +- * After fork, child runs first. If set to 0 (default) then +- * parent will (try to) run first. 
+- */ +-unsigned int sysctl_sched_child_runs_first __read_mostly; +- + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + + int sched_thermal_decay_shift; +@@ -145,13 +139,6 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; + + #ifdef CONFIG_SYSCTL + static struct ctl_table sched_fair_sysctls[] = { +- { +- .procname = "sched_child_runs_first", +- .data = &sysctl_sched_child_runs_first, +- .maxlen = sizeof(unsigned int), +- .mode = 0644, +- .proc_handler = proc_dointvec, +- }, + #ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index c3ca10d02..7fc32a87f 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -109,8 +109,6 @@ extern __read_mostly int scheduler_running; + extern unsigned long calc_load_update; + extern atomic_long_t calc_load_tasks; + +-extern unsigned int sysctl_sched_child_runs_first; +- + extern void calc_global_load_tick(struct rq *this_rq); + extern long calc_load_fold_active(struct rq *this_rq, long adjust); + +-- +2.42.0 + + +From 7ad030f0be19209fd944ace9be02bcc0d7b4962a Mon Sep 17 00:00:00 2001 +From: Yiwei Lin +Date: Fri, 20 Oct 2023 13:56:17 +0800 +Subject: [PATCH 04/11] sched/fair: Remove unused 'curr' argument from + pick_next_entity() + +The 'curr' argument of pick_next_entity() has become unused after +the EEVDF changes. + +[ mingo: Updated the changelog. 
] + +Signed-off-by: Yiwei Lin +Signed-off-by: Ingo Molnar +Link: https://lore.kernel.org/r/20231020055617.42064-1-s921975628@gmail.com +--- + kernel/sched/fair.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 829817ef6..a30c03cc2 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -5254,7 +5254,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + * 4) do not run the "skip" process, if something else is available + */ + static struct sched_entity * +-pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) ++pick_next_entity(struct cfs_rq *cfs_rq) + { + /* + * Enabling NEXT_BUDDY will affect latency but not fairness. +@@ -8144,7 +8144,7 @@ static struct task_struct *pick_task_fair(struct rq *rq) + goto again; + } + +- se = pick_next_entity(cfs_rq, curr); ++ se = pick_next_entity(cfs_rq); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + +@@ -8207,7 +8207,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf + } + } + +- se = pick_next_entity(cfs_rq, curr); ++ se = pick_next_entity(cfs_rq); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + +@@ -8246,7 +8246,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf + put_prev_task(rq, prev); + + do { +- se = pick_next_entity(cfs_rq, NULL); ++ se = pick_next_entity(cfs_rq); + set_next_entity(cfs_rq, se); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); +-- +2.42.0 + + +From 3745fd9933ee1ca1d4dc4dde0a39c2da275c110f Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Sat, 14 Oct 2023 23:12:20 +0200 +Subject: [PATCH 05/11] sched/eevdf: Add feature comments + +Signed-off-by: Peter Zijlstra (Intel) +--- + kernel/sched/features.h | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index f77016823..ef91684d8 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ 
-5,7 +5,14 @@ + * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. + */ + SCHED_FEAT(PLACE_LAG, true) ++/* ++ * Give new tasks half a slice to ease into the competition. ++ */ + SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) ++/* ++ * Inhibit (wakeup) preemption until the current task has either matched the ++ * 0-lag point or until is has exhausted it's slice. ++ */ + SCHED_FEAT(RUN_TO_PARITY, true) + + /* +-- +2.42.0 + + +From 1346e28f7dfb59207364f366e4235ee7304150e8 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Wed, 4 Oct 2023 12:43:53 +0200 +Subject: [PATCH 06/11] sched/eevdf: Remove min_vruntime_copy + +Since commit e8f331bcc270 ("sched/smp: Use lag to simplify +cross-runqueue placement") the min_vruntime_copy is no longer used. + +Signed-off-by: Peter Zijlstra (Intel) +--- + kernel/sched/fair.c | 5 ++--- + kernel/sched/sched.h | 4 ---- + 2 files changed, 2 insertions(+), 7 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index a30c03cc2..c118381f0 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -774,8 +774,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) + } + + /* ensure we never gain time by being placed backwards. 
*/ +- u64_u32_store(cfs_rq->min_vruntime, +- __update_min_vruntime(cfs_rq, vruntime)); ++ cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); + } + + static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +@@ -12571,7 +12570,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + void init_cfs_rq(struct cfs_rq *cfs_rq) + { + cfs_rq->tasks_timeline = RB_ROOT_CACHED; +- u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); ++ cfs_rq->min_vruntime = (u64)(-(1LL << 20)); + #ifdef CONFIG_SMP + raw_spin_lock_init(&cfs_rq->removed.lock); + #endif +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 7fc32a87f..f447bfbb1 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -558,10 +558,6 @@ struct cfs_rq { + u64 min_vruntime_fi; + #endif + +-#ifndef CONFIG_64BIT +- u64 min_vruntime_copy; +-#endif +- + struct rb_root_cached tasks_timeline; + + /* +-- +2.42.0 + + +From a707549ff96d3887c6cd6f4adbed195f8f01ab34 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Mon, 22 May 2023 13:46:30 +0200 +Subject: [PATCH 07/11] sched/eevdf: Use sched_attr::sched_runtime to set + request/slice suggestion + +Allow applications to directly set a suggested request/slice length using +sched_attr::sched_runtime. + +The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms] +which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100. + +Applications should strive to use their periodic runtime at a high +confidence interval (95%+) as the target slice. Using a smaller slice +will introduce undue preemptions, while using a larger value will +increase latency. + +For all the following examples assume a scheduling quantum of 8, and for +consistency all examples have W=4: + + {A,B,C,D}(w=1,r=8): + + ABCD... 
+ +---+---+---+--- + + t=0, V=1.5 t=1, V=3.5 + A |------< A |------< + B |------< B |------< + C |------< C |------< + D |------< D |------< + ---+*------+-------+--- ---+--*----+-------+--- + + t=2, V=5.5 t=3, V=7.5 + A |------< A |------< + B |------< B |------< + C |------< C |------< + D |------< D |------< + ---+----*--+-------+--- ---+------*+-------+--- + +Note: 4 identical tasks in FIFO order + +~~~ + + {A,B}(w=1,r=16) C(w=2,r=16) + + AACCBBCC... + +---+---+---+--- + + t=0, V=1.25 t=2, V=5.25 + A |--------------< A |--------------< + B |--------------< B |--------------< + C |------< C |------< + ---+*------+-------+--- ---+----*--+-------+--- + + t=4, V=8.25 t=6, V=12.25 + A |--------------< A |--------------< + B |--------------< B |--------------< + C |------< C |------< + ---+-------*-------+--- ---+-------+---*---+--- + +Note: 1 heavy task -- because q=8, double r such that the deadline of the w=2 + task doesn't go below q. + +Note: observe the full schedule becomes: W*max(r_i/w_i) = 4*2q = 8q in length. + +Note: the period of the heavy task is half the full period at: + W*(r_i/w_i) = 4*(2q/2) = 4q + +~~~ + + {A,C,D}(w=1,r=16) B(w=1,r=8): + + BAACCBDD... + +---+---+---+--- + + t=0, V=1.5 t=1, V=3.5 + A |--------------< A |---------------< + B |------< B |------< + C |--------------< C |--------------< + D |--------------< D |--------------< + ---+*------+-------+--- ---+--*----+-------+--- + + t=3, V=7.5 t=5, V=11.5 + A |---------------< A |---------------< + B |------< B |------< + C |--------------< C |--------------< + D |--------------< D |--------------< + ---+------*+-------+--- ---+-------+--*----+--- + + t=6, V=13.5 + A |---------------< + B |------< + C |--------------< + D |--------------< + ---+-------+----*--+--- + +Note: 1 short task -- again double r so that the deadline of the short task + won't be below q. Made B short because its not the leftmost task, but is + eligible with the 0,1,2,3 spread. 
+ +Note: like with the heavy task, the period of the short task observes: + W*(r_i/w_i) = 4*(1q/1) = 4q + +~~~ + + A(w=1,r=16) B(w=1,r=8) C(w=2,r=16) + + BCCAABCC... + +---+---+---+--- + + t=0, V=1.25 t=1, V=3.25 + A |--------------< A |--------------< + B |------< B |------< + C |------< C |------< + ---+*------+-------+--- ---+--*----+-------+--- + + t=3, V=7.25 t=5, V=11.25 + A |--------------< A |--------------< + B |------< B |------< + C |------< C |------< + ---+------*+-------+--- ---+-------+--*----+--- + + t=6, V=13.25 + A |--------------< + B |------< + C |------< + ---+-------+----*--+--- + +Note: 1 heavy and 1 short task -- combine them all. + +Note: both the short and heavy task end up with a period of 4q + +~~~ + + A(w=1,r=16) B(w=2,r=16) C(w=1,r=8) + + BBCAABBC... + +---+---+---+--- + + t=0, V=1 t=2, V=5 + A |--------------< A |--------------< + B |------< B |------< + C |------< C |------< + ---+*------+-------+--- ---+----*--+-------+--- + + t=3, V=7 t=5, V=11 + A |--------------< A |--------------< + B |------< B |------< + C |------< C |------< + ---+------*+-------+--- ---+-------+--*----+--- + + t=7, V=15 + A |--------------< + B |------< + C |------< + ---+-------+------*+--- + +Note: as before but permuted + +~~~ + +From all this it can be deduced that, for the steady state: + + - the total period (P) of a schedule is: W*max(r_i/w_i) + - the average period of a task is: W*(r_i/w_i) + - each task obtains the fair share: w_i/W of each full period P + +Signed-off-by: Peter Zijlstra (Intel) +--- + include/linux/sched.h | 3 +++ + kernel/sched/core.c | 33 ++++++++++++++++++++++++++------- + kernel/sched/debug.c | 3 ++- + kernel/sched/fair.c | 6 ++++-- + 4 files changed, 35 insertions(+), 10 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 77f01ac38..e90e58b2c 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -555,6 +555,9 @@ struct sched_entity { + struct list_head group_node; + unsigned int 
on_rq; + ++ unsigned int custom_slice : 1; ++ /* 31 bits hole */ ++ + u64 exec_start; + u64 sum_exec_runtime; + u64 prev_sum_exec_runtime; +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 28768a3b2..d914ec370 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4501,7 +4501,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.nr_migrations = 0; + p->se.vruntime = 0; + p->se.vlag = 0; +- p->se.slice = sysctl_sched_base_slice; + INIT_LIST_HEAD(&p->se.group_node); + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -4755,6 +4754,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + + p->prio = p->normal_prio = p->static_prio; + set_load_weight(p, false); ++ p->se.custom_slice = 0; ++ p->se.slice = sysctl_sched_base_slice; + + /* + * We don't need the reset flag anymore after the fork. It has +@@ -7523,10 +7524,20 @@ static void __setscheduler_params(struct task_struct *p, + + p->policy = policy; + +- if (dl_policy(policy)) ++ if (dl_policy(policy)) { + __setparam_dl(p, attr); +- else if (fair_policy(policy)) ++ } else if (fair_policy(policy)) { + p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ if (attr->sched_runtime) { ++ p->se.custom_slice = 1; ++ p->se.slice = clamp_t(u64, attr->sched_runtime, ++ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ ++ NSEC_PER_MSEC*100); /* HZ=100 / 10 */ ++ } else { ++ p->se.custom_slice = 0; ++ p->se.slice = sysctl_sched_base_slice; ++ } ++ } + + /* + * __sched_setscheduler() ensures attr->sched_priority == 0 when +@@ -7711,7 +7722,9 @@ static int __sched_setscheduler(struct task_struct *p, + * but store a possible modification of reset_on_fork. 
+ */ + if (unlikely(policy == p->policy)) { +- if (fair_policy(policy) && attr->sched_nice != task_nice(p)) ++ if (fair_policy(policy) && ++ (attr->sched_nice != task_nice(p) || ++ (attr->sched_runtime && attr->sched_runtime != p->se.slice))) + goto change; + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + goto change; +@@ -7857,6 +7870,9 @@ static int _sched_setscheduler(struct task_struct *p, int policy, + .sched_nice = PRIO_TO_NICE(p->static_prio), + }; + ++ if (p->se.custom_slice) ++ attr.sched_runtime = p->se.slice; ++ + /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ + if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; +@@ -8033,12 +8049,14 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a + + static void get_params(struct task_struct *p, struct sched_attr *attr) + { +- if (task_has_dl_policy(p)) ++ if (task_has_dl_policy(p)) { + __getparam_dl(p, attr); +- else if (task_has_rt_policy(p)) ++ } else if (task_has_rt_policy(p)) { + attr->sched_priority = p->rt_priority; +- else ++ } else { + attr->sched_nice = task_nice(p); ++ attr->sched_runtime = p->se.slice; ++ } + } + + /** +@@ -10057,6 +10075,7 @@ void __init sched_init(void) + } + + set_load_weight(&init_task, false); ++ init_task.se.slice = sysctl_sched_base_slice; + + /* + * The boot idle thread does lazy MMU switching as well: +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 132dfd1e6..762c899df 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -579,11 +579,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + else + SEQ_printf(m, " %c", task_state_to_char(p)); + +- SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", ++ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", + p->comm, task_pid_nr(p), + SPLIT_NS(p->se.vruntime), + entity_eligible(cfs_rq_of(&p->se), 
&p->se) ? 'E' : 'N', + SPLIT_NS(p->se.deadline), ++ p->se.custom_slice ? 'S' : ' ', + SPLIT_NS(p->se.slice), + SPLIT_NS(p->se.sum_exec_runtime), + (long long)(p->nvcsw + p->nivcsw), +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index c118381f0..a2cb468a9 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1012,7 +1012,8 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) + * nice) while the request time r_i is determined by + * sysctl_sched_base_slice. + */ +- se->slice = sysctl_sched_base_slice; ++ if (!se->custom_slice) ++ se->slice = sysctl_sched_base_slice; + + /* + * EEVDF: vd_i = ve_i + r_i / w_i +@@ -4962,7 +4963,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + u64 vslice, vruntime = avg_vruntime(cfs_rq); + s64 lag = 0; + +- se->slice = sysctl_sched_base_slice; ++ if (!se->custom_slice) ++ se->slice = sysctl_sched_base_slice; + vslice = calc_delta_fair(se->slice, se); + + /* +-- +2.42.0 + + +From 27a6fd2cdb4818ab014d744d6b28a10e2ba3c7b0 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Tue, 26 Sep 2023 14:32:32 +0200 +Subject: [PATCH 08/11] sched/eevdf: Allow shorter slices to wakeup-preempt + +Part of the reason to have shorter slices is to improve +responsiveness. Allow shorter slices to preempt longer slices on +wakeup. 
+ + Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Sum delay ms | + + 100ms massive_intr 500us cyclictest NO_PREEMPT_SHORT + + 1 massive_intr:(5) | 846018.956 ms | 779188 | avg: 0.273 ms | max: 58.337 ms | sum:212545.245 ms | + 2 massive_intr:(5) | 853450.693 ms | 792269 | avg: 0.275 ms | max: 71.193 ms | sum:218263.588 ms | + 3 massive_intr:(5) | 843888.920 ms | 771456 | avg: 0.277 ms | max: 92.405 ms | sum:213353.221 ms | + 1 chromium-browse:(8) | 53015.889 ms | 131766 | avg: 0.463 ms | max: 36.341 ms | sum:60959.230 ms | + 2 chromium-browse:(8) | 53864.088 ms | 136962 | avg: 0.480 ms | max: 27.091 ms | sum:65687.681 ms | + 3 chromium-browse:(9) | 53637.904 ms | 132637 | avg: 0.481 ms | max: 24.756 ms | sum:63781.673 ms | + 1 cyclictest:(5) | 12615.604 ms | 639689 | avg: 0.471 ms | max: 32.272 ms | sum:301351.094 ms | + 2 cyclictest:(5) | 12511.583 ms | 642578 | avg: 0.448 ms | max: 44.243 ms | sum:287632.830 ms | + 3 cyclictest:(5) | 12545.867 ms | 635953 | avg: 0.475 ms | max: 25.530 ms | sum:302374.658 ms | + + 100ms massive_intr 500us cyclictest PREEMPT_SHORT + + 1 massive_intr:(5) | 839843.919 ms | 837384 | avg: 0.264 ms | max: 74.366 ms | sum:221476.885 ms | + 2 massive_intr:(5) | 852449.913 ms | 845086 | avg: 0.252 ms | max: 68.162 ms | sum:212595.968 ms | + 3 massive_intr:(5) | 839180.725 ms | 836883 | avg: 0.266 ms | max: 69.742 ms | sum:222812.038 ms | + 1 chromium-browse:(11) | 54591.481 ms | 138388 | avg: 0.458 ms | max: 35.427 ms | sum:63401.508 ms | + 2 chromium-browse:(8) | 52034.541 ms | 132276 | avg: 0.436 ms | max: 31.826 ms | sum:57732.958 ms | + 3 chromium-browse:(8) | 55231.771 ms | 141892 | avg: 0.469 ms | max: 27.607 ms | sum:66538.697 ms | + 1 cyclictest:(5) | 13156.391 ms | 667412 | avg: 0.373 ms | max: 38.247 ms | sum:249174.502 ms | + 2 cyclictest:(5) | 12688.939 ms | 665144 | avg: 0.374 ms | max: 33.548 ms | sum:248509.392 ms | + 3 cyclictest:(5) | 13475.623 ms | 669110 | avg: 0.370 ms | max: 37.819 ms | sum:247673.390 ms 
| + +As per the numbers, this makes the max-delay of cyclictest (short slice) +more consistent and consistently drops the sum-delay. The +trade-off is that the massive_intr (long slice) gets more context +switches and a slight increase in sum-delay. + +[mike: numbers] +Signed-off-by: Peter Zijlstra (Intel) +Tested-by: Mike Galbraith +--- + kernel/sched/fair.c | 11 ++++++++--- + kernel/sched/features.h | 4 ++++ + 2 files changed, 12 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index a2cb468a9..969bea908 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8108,9 +8108,14 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int + cfs_rq = cfs_rq_of(se); + update_curr(cfs_rq); + +- /* +- * XXX pick_eevdf(cfs_rq) != se ? +- */ ++ if (sched_feat(PREEMPT_SHORT) && pse->slice < se->slice && ++ entity_eligible(cfs_rq, pse) && ++ (s64)(pse->deadline - se->deadline) < 0 && ++ se->vlag == se->deadline) { ++ /* negate RUN_TO_PARITY */ ++ se->vlag = se->deadline - 1; ++ } ++ + if (pick_eevdf(cfs_rq) == pse) + goto preempt; + +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index ef91684d8..35428a3b8 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -14,6 +14,10 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) + * 0-lag point or until is has exhausted it's slice. + */ + SCHED_FEAT(RUN_TO_PARITY, true) ++/* ++ * Allow tasks with a shorter slice to disregard RUN_TO_PARITY ++ */ ++SCHED_FEAT(PREEMPT_SHORT, true) + + /* + * Prefer to schedule the task we woke last (assuming it failed +-- +2.42.0 + + +From 01679ff43f2fa3f41694107816af6aeeba5d9fb2 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Tue, 26 Sep 2023 14:39:41 +0200 +Subject: [PATCH 09/11] sched/eevdf: Revenge of the Sith^WSleeper + +For tasks that have received excess service (negative lag) allow them +to gain parity (zero lag) by sleeping. 
+ + slice 30000000 (*10) + # Min Latencies: 00041 + # Avg Latencies: 00712 + # Max Latencies: 287353 + + slice 3000000 (default) + # Min Latencies: 00054 + # Avg Latencies: 00436 + # Max Latencies: 23531 + + slice 300000 (/10) + # Min Latencies: 00054 + # Avg Latencies: 00061 + # Max Latencies: 05245 + +It sucks for many other things though... so let it be an experiment. + +Signed-off-by: Peter Zijlstra (Intel) +--- + kernel/sched/fair.c | 36 ++++++++++++++++++++++++++++++++++++ + kernel/sched/features.h | 6 ++++++ + 2 files changed, 42 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 969bea908..b148a654d 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -4957,6 +4957,33 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} + + #endif /* CONFIG_SMP */ + ++static inline u64 ++entity_vlag_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) ++{ ++ u64 now, vdelta; ++ s64 delta; ++ ++ if (!(flags & ENQUEUE_WAKEUP)) ++ return se->vlag; ++ ++ if (flags & ENQUEUE_MIGRATED) ++ return 0; ++ ++ now = rq_clock_task(rq_of(cfs_rq)); ++ delta = now - se->exec_start; ++ if (delta < 0) ++ return se->vlag; ++ ++ if (sched_feat(GENTLE_SLEEPER)) ++ delta /= 2; ++ ++ vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load); ++ if (vdelta < -se->vlag) ++ return se->vlag + vdelta; ++ ++ return 0; ++} ++ + static void + place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +@@ -4981,6 +5008,15 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + + lag = se->vlag; + ++ /* ++ * Allow tasks that have received too much service (negative ++ * lag) to (re)gain parity (zero lag) by sleeping for the ++ * equivalent duration. This ensures they will be readily ++ * eligible. 
++ */ ++ if (sched_feat(PLACE_SLEEPER) && lag < 0) ++ lag = entity_vlag_sleeper(cfs_rq, se, flags); ++ + /* + * If we want to place a task and preserve lag, we have to + * consider the effect of the new entity on the weighted +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 35428a3b8..926511713 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -18,6 +18,12 @@ SCHED_FEAT(RUN_TO_PARITY, true) + * Allow tasks with a shorter slice to disregard RUN_TO_PARITY + */ + SCHED_FEAT(PREEMPT_SHORT, true) ++/* ++ * Let sleepers earn back lag, but not more than 0-lag. GENTLE_SLEEPERS earn at ++ * half the speed. ++ */ ++SCHED_FEAT(PLACE_SLEEPER, false) ++SCHED_FEAT(GENTLE_SLEEPER, true) + + /* + * Prefer to schedule the task we woke last (assuming it failed +-- +2.42.0 + + +From cec920740e1ee07076fad5585640f7089d0aff48 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Thu, 5 Oct 2023 15:30:13 +0200 +Subject: [PATCH 10/11] sched/eevdf: Disable entity_eligible() + +Disable entity_eligible() entirely, this makes tasks much easier to +pick, but also gives rise to degenerate cases like: + +t=92 V=16 + A |----< + B |< +>C |----------------< + D |< + E |< + F |< + G |< + |---------|-----*---|---------|---------|---- + +hence, default disable. 
+ +Suggested-by: Youssef Esmat +Signed-off-by: Peter Zijlstra (Intel) +--- + kernel/sched/fair.c | 3 +++ + kernel/sched/features.h | 11 +++++++++++ + 2 files changed, 14 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b148a654d..00e4a32a6 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -724,6 +724,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) + */ + int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) + { ++ if (sched_feat(EVDF)) ++ return true; ++ + struct sched_entity *curr = cfs_rq->curr; + s64 avg = cfs_rq->avg_vruntime; + long load = cfs_rq->avg_load; +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 926511713..8c7887fd8 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -24,6 +24,17 @@ SCHED_FEAT(PREEMPT_SHORT, true) + */ + SCHED_FEAT(PLACE_SLEEPER, false) + SCHED_FEAT(GENTLE_SLEEPER, true) ++/* ++ * Disable the eligibility check -- always true. ++ * ++ * Selecting this allows short tasks, in the presence of a long task, to walk ++ * far past 0-lag and create a window where newly placed tasks will come in and ++ * starve the long task. ++ * ++ * Behaves quite terrible for mixed slice workloads as a result, very much not ++ * recommended. ++ */ ++SCHED_FEAT(EVDF, false) + + /* + * Prefer to schedule the task we woke last (assuming it failed +-- +2.42.0 + + +From df2ccad31aa54f13bd882c0d9e943cd1a0adc12e Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Fri, 15 Sep 2023 00:48:45 +0200 +Subject: [PATCH 11/11] sched/eevdf: Delay dequeue + +For tasks that have negative-lag (have received 'excess' service), delay the +dequeue and keep them in the runnable tree until they're eligible again. Or +rather, keep them until they're selected again, since finding their eligibility +crossover point is expensive. 
+ +The effect is a bit like a sleeper bonus: the tasks keep contending for service +until either they get a wakeup or until they're selected again and are really +dequeued. + +This means that any actual dequeue happens with positive lag (service owed), +so such tasks are more readily run when woken next. + +Signed-off-by: Peter Zijlstra (Intel) +--- + include/linux/sched.h | 1 + + kernel/sched/core.c | 88 +++++++++++++++++++++++++++++++++-------- + kernel/sched/fair.c | 11 ++++++ + kernel/sched/features.h | 11 ++++++ + kernel/sched/sched.h | 3 +- + 5 files changed, 97 insertions(+), 17 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index e90e58b2c..b6d834a47 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -894,6 +894,7 @@ struct task_struct { + unsigned sched_reset_on_fork:1; + unsigned sched_contributes_to_load:1; + unsigned sched_migrated:1; ++ unsigned sched_delayed:1; + + /* Force alignment to the next boundary: */ + unsigned :0; +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index d914ec370..cb5641bd8 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -3850,12 +3850,23 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) + + rq = __task_rq_lock(p, &rf); + if (task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ if (unlikely(p->sched_delayed)) { ++ p->sched_delayed = 0; ++ /* mustn't run a delayed task */ ++ WARN_ON_ONCE(task_on_cpu(rq, p)); ++ if (sched_feat(GENTLE_DELAY)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); ++ if (p->se.vlag > 0) ++ p->se.vlag = 0; ++ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); ++ } ++ } + if (!task_on_cpu(rq, p)) { + /* + * When on_rq && !on_cpu the task is preempted, see if + * it should preempt the task that is current now. 
+ */ +- update_rq_clock(rq); + wakeup_preempt(rq, p, wake_flags); + } + ttwu_do_wakeup(p); +@@ -6535,6 +6546,24 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + # define SM_MASK_PREEMPT SM_PREEMPT + #endif + ++static void deschedule_task(struct rq *rq, struct task_struct *p, unsigned long prev_state) ++{ ++ p->sched_contributes_to_load = ++ (prev_state & TASK_UNINTERRUPTIBLE) && ++ !(prev_state & TASK_NOLOAD) && ++ !(prev_state & TASK_FROZEN); ++ ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible++; ++ ++ deactivate_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); ++ ++ if (p->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++} ++ + /* + * __schedule() is the main scheduler function. + * +@@ -6619,6 +6648,8 @@ static void __sched notrace __schedule(unsigned int sched_mode) + + switch_count = &prev->nivcsw; + ++ WARN_ON_ONCE(prev->sched_delayed); ++ + /* + * We must load prev->state once (task_struct::state is volatile), such + * that we form a control dependency vs deactivate_task() below. +@@ -6628,14 +6659,6 @@ static void __sched notrace __schedule(unsigned int sched_mode) + if (signal_pending_state(prev_state, prev)) { + WRITE_ONCE(prev->__state, TASK_RUNNING); + } else { +- prev->sched_contributes_to_load = +- (prev_state & TASK_UNINTERRUPTIBLE) && +- !(prev_state & TASK_NOLOAD) && +- !(prev_state & TASK_FROZEN); +- +- if (prev->sched_contributes_to_load) +- rq->nr_uninterruptible++; +- + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) +@@ -6647,17 +6670,50 @@ static void __sched notrace __schedule(unsigned int sched_mode) + * + * After this, schedule() must not care about p->state any more. 
+ */ +- deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); +- +- if (prev->in_iowait) { +- atomic_inc(&rq->nr_iowait); +- delayacct_blkio_start(); +- } ++ if (sched_feat(DELAY_DEQUEUE) && ++ prev->sched_class->delay_dequeue_task && ++ prev->sched_class->delay_dequeue_task(rq, prev)) ++ prev->sched_delayed = 1; ++ else ++ deschedule_task(rq, prev, prev_state); + } + switch_count = &prev->nvcsw; + } + +- next = pick_next_task(rq, prev, &rf); ++ for (struct task_struct *tmp = prev;;) { ++ unsigned long tmp_state; ++ ++ next = pick_next_task(rq, tmp, &rf); ++ if (unlikely(tmp != prev)) ++ finish_task(tmp); ++ ++ if (likely(!next->sched_delayed)) ++ break; ++ ++ next->sched_delayed = 0; ++ ++ /* ++ * A sched_delayed task must not be runnable at this point, see ++ * ttwu_runnable(). ++ */ ++ tmp_state = READ_ONCE(next->__state); ++ if (WARN_ON_ONCE(!tmp_state)) ++ break; ++ ++ prepare_task(next); ++ /* ++ * Order ->on_cpu and ->on_rq, see the comments in ++ * try_to_wake_up(). Normally this is smp_mb__after_spinlock() ++ * above. 
++ */ ++ smp_wmb(); ++ deschedule_task(rq, next, tmp_state); ++ if (sched_feat(GENTLE_DELAY) && next->se.vlag > 0) ++ next->se.vlag = 0; ++ ++ tmp = next; ++ } ++ + clear_tsk_need_resched(prev); + clear_preempt_need_resched(); + #ifdef CONFIG_SCHED_DEBUG +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 00e4a32a6..b25a6ad0c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8347,6 +8347,16 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq) + return pick_next_task_fair(rq, NULL, NULL); + } + ++static bool delay_dequeue_task_fair(struct rq *rq, struct task_struct *p) ++{ ++ struct sched_entity *se = &p->se; ++ struct cfs_rq *cfs_rq = cfs_rq_of(se); ++ ++ update_curr(cfs_rq); ++ ++ return !entity_eligible(cfs_rq, se); ++} ++ + /* + * Account for a descheduled task: + */ +@@ -12942,6 +12952,7 @@ DEFINE_SCHED_CLASS(fair) = { + + .wakeup_preempt = check_preempt_wakeup_fair, + ++ .delay_dequeue_task = delay_dequeue_task_fair, + .pick_next_task = __pick_next_task_fair, + .put_prev_task = put_prev_task_fair, + .set_next_task = set_next_task_fair, +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 8c7887fd8..82e01cd97 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -35,6 +35,17 @@ SCHED_FEAT(GENTLE_SLEEPER, true) + * recommended. + */ + SCHED_FEAT(EVDF, false) ++/* ++ * Delay dequeueing tasks until they get selected or woken. ++ * ++ * By delaying the dequeue for non-eligible tasks, they remain in the ++ * competition and can burn off their negative lag. When they get selected ++ * they'll have positive lag by definition. ++ * ++ * GENTLE_DELAY clips the lag on dequeue (or wakeup) to 0. 
++ */ ++SCHED_FEAT(DELAY_DEQUEUE, true) ++SCHED_FEAT(GENTLE_DELAY, true) + + /* + * Prefer to schedule the task we woke last (assuming it failed +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index f447bfbb1..e6ea7b63d 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2235,6 +2235,7 @@ struct sched_class { + + void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + ++ bool (*delay_dequeue_task)(struct rq *rq, struct task_struct *p); + struct task_struct *(*pick_next_task)(struct rq *rq); + + void (*put_prev_task)(struct rq *rq, struct task_struct *p); +@@ -2288,7 +2289,7 @@ struct sched_class { + + static inline void put_prev_task(struct rq *rq, struct task_struct *prev) + { +- WARN_ON_ONCE(rq->curr != prev); ++// WARN_ON_ONCE(rq->curr != prev); + prev->sched_class->put_prev_task(rq, prev); + } + +-- +2.42.0 +