From d9da42be995a37fbde81f8440da65820dffdaff9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20G=C3=B3rski?= Date: Tue, 24 Oct 2023 21:07:12 +0200 Subject: [PATCH] 6.5: EEVDF adaptation to BORE 3.2.8 (#828) Signed-off-by: Piotr Gorski --- linux-tkg-patches/6.5/0003-eevdf.patch | 1463 +++++++++++++++++++++++- 1 file changed, 1429 insertions(+), 34 deletions(-) diff --git a/linux-tkg-patches/6.5/0003-eevdf.patch b/linux-tkg-patches/6.5/0003-eevdf.patch index df6d958..6b0242b 100644 --- a/linux-tkg-patches/6.5/0003-eevdf.patch +++ b/linux-tkg-patches/6.5/0003-eevdf.patch @@ -1,7 +1,7 @@ -From 36fe8c8127c34a4f11bd765ca493046ea51b9062 Mon Sep 17 00:00:00 2001 +From 75dc528ce438f0de9dc4488f3de7c03a5464a6a1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 31 May 2023 13:58:40 +0200 -Subject: [PATCH 01/17] sched/fair: Add cfs_rq::avg_vruntime +Subject: [PATCH 01/28] sched/fair: Add cfs_rq::avg_vruntime In order to move to an eligibility based scheduling policy, we need to have a better approximation of the ideal scheduler. @@ -295,10 +295,10 @@ index e93e006a9..4ccb73d85 100644 2.42.0 -From 1d91e841fd914ff551a2fb1d4a491d2daccc988f Mon Sep 17 00:00:00 2001 +From 9839c6f0a4dec304f2577c71cc53fa5adab33ff4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 31 May 2023 13:58:41 +0200 -Subject: [PATCH 02/17] sched/fair: Remove sched_feat(START_DEBIT) +Subject: [PATCH 02/28] sched/fair: Remove sched_feat(START_DEBIT) With the introduction of avg_vruntime() there is no need to use worse approximations. Take the 0-lag point as starting point for inserting @@ -372,10 +372,10 @@ index ee7f23c76..fa828b365 100644 2.42.0 -From 1b7ae7e0939db6a3755332c771f132cb85de09d0 Mon Sep 17 00:00:00 2001 +From e2b83c59e712b31572aab92c651739b8577af01c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 31 May 2023 13:58:42 +0200 -Subject: [PATCH 03/17] sched/fair: Add lag based placement +Subject: [PATCH 03/28] sched/fair: Add lag based placement With the introduction of avg_vruntime, it is possible to approximate lag (the entire purpose of introducing it in fact). Use this to do lag @@ -661,10 +661,10 @@ index fa828b365..7958a10fe 100644 2.42.0 -From ca2997665fe5bdca860ef2dc0f3072a1ef10e0d3 Mon Sep 17 00:00:00 2001 +From 84e74d6704600343fcf3d4e2d6e8ce4d4228d8b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 31 May 2023 13:58:43 +0200 -Subject: [PATCH 04/17] rbtree: Add rb_add_augmented_cached() helper +Subject: [PATCH 04/28] rbtree: Add rb_add_augmented_cached() helper While slightly sub-optimal, updating the augmented data while going down the tree during lookup would be faster -- alas the augment @@ -719,10 +719,10 @@ index 7ee7ed5de..6dbc5a1bf 100644 2.42.0 -From 4a03369ce3240bc6d517059071d91760ee6224e4 Mon Sep 17 00:00:00 2001 +From 6439cd7527a2c4c59045cf79f60192116441288a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 31 May 2023 13:58:44 +0200 -Subject: [PATCH 05/17] sched/fair: Implement an EEVDF-like scheduling policy +Subject: [PATCH 05/28] sched/fair: Implement an EEVDF-like scheduling policy Where CFS is currently a WFQ based scheduler with only a single knob, the weight. 
The addition of a second, latency oriented parameter, @@ -1398,10 +1398,10 @@ index 4ccb73d85..1fc81dd7f 100644 2.42.0 -From 449a35d8af95c27bed6e2b3953dec01a8617c68e Mon Sep 17 00:00:00 2001 +From c29b5b1a88b3ed15813fc58e9a1d41e64a1d6511 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 31 May 2023 13:58:45 +0200 -Subject: [PATCH 06/17] sched/fair: Commit to lag based placement +Subject: [PATCH 06/28] sched/fair: Commit to lag based placement Removes the FAIR_SLEEPERS code in favour of the new LAG based placement. @@ -1526,10 +1526,10 @@ index 60cce1e6f..2a830eccd 100644 2.42.0 -From 0d15fadb3b9277144df12a3f1af2bd9e868e4d63 Mon Sep 17 00:00:00 2001 +From e891794bcc3eb5eb3e5942ec269f79355b7d0d8f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 31 May 2023 13:58:46 +0200 -Subject: [PATCH 07/17] sched/smp: Use lag to simplify cross-runqueue placement +Subject: [PATCH 07/28] sched/smp: Use lag to simplify cross-runqueue placement Using lag is both more correct and simpler when moving between runqueues. @@ -1793,10 +1793,10 @@ index 91f25d6c8..b7daccfb2 100644 2.42.0 -From 42396c41c6120cdeb72480c3af02ec4876c50d82 Mon Sep 17 00:00:00 2001 +From 991720a7eda97d92a66a5c94fd85617de0307b27 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 31 May 2023 13:58:47 +0200 -Subject: [PATCH 08/17] sched/fair: Commit to EEVDF +Subject: [PATCH 08/28] sched/fair: Commit to EEVDF EEVDF is a better defined scheduling policy, as a result it has less heuristics/tunables. There is no compelling reason to keep CFS around. @@ -2569,10 +2569,10 @@ index 1fc81dd7f..83bbcd35c 100644 2.42.0 -From 08697aba12aedc549b68fa7d45302466114462ab Mon Sep 17 00:00:00 2001 +From 80d62dca8d49a1a1de964786d19b350b7e910365 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 31 May 2023 13:58:48 +0200 -Subject: [PATCH 09/17] sched/debug: Rename sysctl_sched_min_granularity to +Subject: [PATCH 09/28] sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice EEVDF uses this tunable as the base request/slice -- make sure the @@ -2685,10 +2685,10 @@ index 83bbcd35c..e21f6a048 100644 2.42.0 -From a7c65d5f4a52a536b4df4272895b70febf77238b Mon Sep 17 00:00:00 2001 +From 1d3a784709658acd993cab0118c3a251321aaea3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 31 May 2023 13:58:49 +0200 -Subject: [PATCH 10/17] sched/fair: Propagate enqueue flags into place_entity() +Subject: [PATCH 10/28] sched/fair: Propagate enqueue flags into place_entity() This allows place_entity() to consider ENQUEUE_WAKEUP and ENQUEUE_MIGRATED. 
@@ -2766,10 +2766,10 @@ index e21f6a048..576d371c8 100644 2.42.0 -From e31e544605b41e66131d6b63b24b3ce1f377d866 Mon Sep 17 00:00:00 2001 +From 5b1ff22164a1098fd0a71a8bbc2e14387df3950b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Aug 2023 15:40:59 +0200 -Subject: [PATCH 11/17] sched/eevdf: Curb wakeup-preemption +Subject: [PATCH 11/28] sched/eevdf: Curb wakeup-preemption Mike and others noticed that EEVDF does like to over-schedule quite a bit -- which does hurt performance of a number of benchmarks / @@ -2858,10 +2858,10 @@ index 54334ca5c..546d212ef 100644 2.42.0 -From 9307eff09160b7f12e27112a0ed5d4723f2767c2 Mon Sep 17 00:00:00 2001 +From 1989d7c6cb34c8e293574249320ee716bb5b47b9 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Thu, 24 Aug 2023 13:33:42 +0530 -Subject: [PATCH 12/17] sched/eevdf/doc: Modify the documented knob to +Subject: [PATCH 12/28] sched/eevdf/doc: Modify the documented knob to base_slice_ns as well After committing the scheduler to EEVDF, we renamed the 'min_granularity_ns' @@ -2897,10 +2897,10 @@ index 03db55504..f68919800 100644 2.42.0 -From 27413005ea70c604267673df79f14296c2ae4eae Mon Sep 17 00:00:00 2001 +From eb9c5a4550dcb41730ede47d1554fcc634d3463a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Sep 2023 00:48:55 +0200 -Subject: [PATCH 13/17] sched/eevdf: Also update slice on placement +Subject: [PATCH 13/28] sched/eevdf: Also update slice on placement Tasks that never consume their full slice would not update their slice value. This means that tasks that are spawned before the sysctl scaling keep their @@ -2936,10 +2936,10 @@ index 1cdc95725..efbcdc69c 100644 2.42.0 -From f38ec3c65aa3ae0971c65c0968033391d3c8cae6 Mon Sep 17 00:00:00 2001 +From 9bf86297a3305c99ef82aa77fef7b39c8cc763c1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Sep 2023 14:29:50 +0200 -Subject: [PATCH 14/17] sched/eevdf: Fix avg_vruntime() +Subject: [PATCH 14/28] sched/eevdf: Fix avg_vruntime() The expectation is that placing a task at avg_vruntime() makes it eligible. Turns out there is a corner case where this is not the case. @@ -2991,10 +2991,10 @@ index efbcdc69c..9dbf3ce61 100644 2.42.0 -From 38ee54edf4535b987660980eba5dede6f0cb5c4b Mon Sep 17 00:00:00 2001 +From a23808a0a84a235f0471ad3a2b9bb3398792bf44 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 Oct 2023 21:24:45 +0200 -Subject: [PATCH 15/17] sched/eevdf: Fix min_deadline heap integrity +Subject: [PATCH 15/28] sched/eevdf: Fix min_deadline heap integrity Marek and Biju reported instances of: @@ -3044,10 +3044,10 @@ index 9dbf3ce61..a0f1d9578 100644 2.42.0 -From f183a7a169faad61743f1a3068529dd73fcfc936 Mon Sep 17 00:00:00 2001 +From 4e6428fd79ddf1f53e6171ead190c359551cdfda Mon Sep 17 00:00:00 2001 From: Benjamin Segall Date: Fri, 29 Sep 2023 17:09:30 -0700 -Subject: [PATCH 16/17] sched/eevdf: Fix pick_eevdf() +Subject: [PATCH 16/28] sched/eevdf: Fix pick_eevdf() The old pick_eevdf() could fail to find the actual earliest eligible deadline when it descended to the right looking for min_deadline, but @@ -3193,10 +3193,10 @@ index a0f1d9578..caec9b43c 100644 2.42.0 -From 07a48d2365d2d9739be3de2172b5c9ecc5ef867e Mon Sep 17 00:00:00 2001 +From d872c56e45bbe2bb687568cf2f15f6819916b565 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Oct 2023 16:59:47 +0200 -Subject: [PATCH 17/17] sched/eevdf: Fix heap corruption more +Subject: [PATCH 17/28] sched/eevdf: Fix heap corruption more Because someone is a flaming idiot... 
:/ @@ -3225,3 +3225,1398 @@ index caec9b43c..d0d912960 100644 -- 2.42.0 + +From b33c87a2d228bfc3f7950a681268b615757efd8a Mon Sep 17 00:00:00 2001 +From: Ingo Molnar +Date: Tue, 19 Sep 2023 10:31:15 +0200 +Subject: [PATCH 18/28] sched/fair: Rename check_preempt_wakeup() to + check_preempt_wakeup_fair() + +Other scheduling classes already postfix their similar methods +with the class name. + +Signed-off-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +--- + kernel/sched/fair.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index d0d912960..89774e7e2 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -7966,7 +7966,7 @@ static void set_next_buddy(struct sched_entity *se) + /* + * Preempt the current task with a newly woken task if needed: + */ +-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) + { + struct task_struct *curr = rq->curr; + struct sched_entity *se = &curr->se, *pse = &p->se; +@@ -12680,7 +12680,7 @@ DEFINE_SCHED_CLASS(fair) = { + .yield_task = yield_task_fair, + .yield_to_task = yield_to_task_fair, + +- .check_preempt_curr = check_preempt_wakeup, ++ .check_preempt_curr = check_preempt_wakeup_fair, + + .pick_next_task = __pick_next_task_fair, + .put_prev_task = put_prev_task_fair, +-- +2.42.0 + + +From 2c79e453add0658cd6750eb81fbc241816c31b84 Mon Sep 17 00:00:00 2001 +From: Ingo Molnar +Date: Tue, 19 Sep 2023 10:38:21 +0200 +Subject: [PATCH 19/28] sched/fair: Rename check_preempt_curr() to + wakeup_preempt() + +The name is a bit opaque - make it clear that this is about wakeup +preemption. + +Also rename the ->check_preempt_curr() methods similarly. 
+ +Signed-off-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +--- + kernel/sched/core.c | 14 +++++++------- + kernel/sched/deadline.c | 10 +++++----- + kernel/sched/fair.c | 10 +++++----- + kernel/sched/idle.c | 4 ++-- + kernel/sched/rt.c | 6 +++--- + kernel/sched/sched.h | 4 ++-- + kernel/sched/stop_task.c | 4 ++-- + 7 files changed, 26 insertions(+), 26 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index be77d999d..4d851de8e 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -2198,10 +2198,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, + p->sched_class->prio_changed(rq, p, oldprio); + } + +-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) ++void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) + { + if (p->sched_class == rq->curr->sched_class) +- rq->curr->sched_class->check_preempt_curr(rq, p, flags); ++ rq->curr->sched_class->wakeup_preempt(rq, p, flags); + else if (sched_class_above(p->sched_class, rq->curr->sched_class)) + resched_curr(rq); + +@@ -2507,7 +2507,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, + rq_lock(rq, rf); + WARN_ON_ONCE(task_cpu(p) != new_cpu); + activate_task(rq, p, 0); +- check_preempt_curr(rq, p, 0); ++ wakeup_preempt(rq, p, 0); + + return rq; + } +@@ -3389,7 +3389,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) + deactivate_task(src_rq, p, 0); + set_task_cpu(p, cpu); + activate_task(dst_rq, p, 0); +- check_preempt_curr(dst_rq, p, 0); ++ wakeup_preempt(dst_rq, p, 0); + + rq_unpin_lock(dst_rq, &drf); + rq_unpin_lock(src_rq, &srf); +@@ -3774,7 +3774,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + } + + activate_task(rq, p, en_flags); +- check_preempt_curr(rq, p, wake_flags); ++ wakeup_preempt(rq, p, wake_flags); + + ttwu_do_wakeup(p); + +@@ -3845,7 +3845,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) + * it should preempt the task that is current now. 
+ */ + update_rq_clock(rq); +- check_preempt_curr(rq, p, wake_flags); ++ wakeup_preempt(rq, p, wake_flags); + } + ttwu_do_wakeup(p); + ret = 1; +@@ -4872,7 +4872,7 @@ void wake_up_new_task(struct task_struct *p) + + activate_task(rq, p, ENQUEUE_NOCLOCK); + trace_sched_wakeup_new(p); +- check_preempt_curr(rq, p, WF_FORK); ++ wakeup_preempt(rq, p, WF_FORK); + #ifdef CONFIG_SMP + if (p->sched_class->task_woken) { + /* +diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c +index 58b542bf2..fb1996a67 100644 +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -763,7 +763,7 @@ static inline void deadline_queue_pull_task(struct rq *rq) + + static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); + static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); +-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); ++static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags); + + static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, + struct rq *rq) +@@ -1175,7 +1175,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) + + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (dl_task(rq->curr)) +- check_preempt_curr_dl(rq, p, 0); ++ wakeup_preempt_dl(rq, p, 0); + else + resched_curr(rq); + +@@ -1939,7 +1939,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + * Only called when both the current and waking task are -deadline + * tasks. + */ +-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, ++static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, + int flags) + { + if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { +@@ -2652,7 +2652,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) + deadline_queue_push_tasks(rq); + #endif + if (dl_task(rq->curr)) +- check_preempt_curr_dl(rq, p, 0); ++ wakeup_preempt_dl(rq, p, 0); + else + resched_curr(rq); + } else { +@@ -2721,7 +2721,7 @@ DEFINE_SCHED_CLASS(dl) = { + .dequeue_task = dequeue_task_dl, + .yield_task = yield_task_dl, + +- .check_preempt_curr = check_preempt_curr_dl, ++ .wakeup_preempt = wakeup_preempt_dl, + + .pick_next_task = pick_next_task_dl, + .put_prev_task = put_prev_task_dl, +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 89774e7e2..ab95f1312 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -7979,7 +7979,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int + + /* + * This is possible from callers such as attach_tasks(), in which we +- * unconditionally check_preempt_curr() after an enqueue (which may have ++ * unconditionally wakeup_preempt() after an enqueue (which may have + * lead to a throttle). This both saves work and prevents false + * next-buddy nomination below. 
+ */ +@@ -8880,7 +8880,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) + + WARN_ON_ONCE(task_rq(p) != rq); + activate_task(rq, p, ENQUEUE_NOCLOCK); +- check_preempt_curr(rq, p, 0); ++ wakeup_preempt(rq, p, 0); + } + + /* +@@ -12219,7 +12219,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) + if (p->prio > oldprio) + resched_curr(rq); + } else +- check_preempt_curr(rq, p, 0); ++ wakeup_preempt(rq, p, 0); + } + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -12321,7 +12321,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) + if (task_current(rq, p)) + resched_curr(rq); + else +- check_preempt_curr(rq, p, 0); ++ wakeup_preempt(rq, p, 0); + } + } + +@@ -12680,7 +12680,7 @@ DEFINE_SCHED_CLASS(fair) = { + .yield_task = yield_task_fair, + .yield_to_task = yield_to_task_fair, + +- .check_preempt_curr = check_preempt_wakeup_fair, ++ .wakeup_preempt = check_preempt_wakeup_fair, + + .pick_next_task = __pick_next_task_fair, + .put_prev_task = put_prev_task_fair, +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index 5007b25c5..565f8374d 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -401,7 +401,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + /* + * Idle tasks are unconditionally rescheduled: + */ +-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) ++static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) + { + resched_curr(rq); + } +@@ -482,7 +482,7 @@ DEFINE_SCHED_CLASS(idle) = { + /* dequeue is not valid, we print a debug message there: */ + .dequeue_task = dequeue_task_idle, + +- .check_preempt_curr = check_preempt_curr_idle, ++ .wakeup_preempt = wakeup_preempt_idle, + + .pick_next_task = pick_next_task_idle, + .put_prev_task = put_prev_task_idle, +diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c +index 185d3d749..6f066d5e0 100644 +--- a/kernel/sched/rt.c ++++ b/kernel/sched/rt.c +@@ -953,7 +953,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) + + /* + * When we're idle and a woken (rt) task is +- * throttled check_preempt_curr() will set ++ * throttled wakeup_preempt() will set + * skip_update and the time between the wakeup + * and this unthrottle will get accounted as + * 'runtime'. 
+@@ -1715,7 +1715,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + /* + * Preempt the current task with a newly woken task if needed: + */ +-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) ++static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) + { + if (p->prio < rq->curr->prio) { + resched_curr(rq); +@@ -2702,7 +2702,7 @@ DEFINE_SCHED_CLASS(rt) = { + .dequeue_task = dequeue_task_rt, + .yield_task = yield_task_rt, + +- .check_preempt_curr = check_preempt_curr_rt, ++ .wakeup_preempt = wakeup_preempt_rt, + + .pick_next_task = pick_next_task_rt, + .put_prev_task = put_prev_task_rt, +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 576d371c8..26c235d3a 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2217,7 +2217,7 @@ struct sched_class { + void (*yield_task) (struct rq *rq); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p); + +- void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); ++ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + + struct task_struct *(*pick_next_task)(struct rq *rq); + +@@ -2490,7 +2490,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) + extern void activate_task(struct rq *rq, struct task_struct *p, int flags); + extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +-extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); ++extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); + + #ifdef CONFIG_PREEMPT_RT + #define SCHED_NR_MIGRATE_BREAK 8 +diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c +index 85590599b..6cf7304e6 100644 +--- a/kernel/sched/stop_task.c ++++ b/kernel/sched/stop_task.c +@@ -23,7 +23,7 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + #endif /* CONFIG_SMP */ + + static void +-check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) ++wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags) + { + /* we're never preempted */ + } +@@ -120,7 +120,7 @@ DEFINE_SCHED_CLASS(stop) = { + .dequeue_task = dequeue_task_stop, + .yield_task = yield_task_stop, + +- .check_preempt_curr = check_preempt_curr_stop, ++ .wakeup_preempt = wakeup_preempt_stop, + + .pick_next_task = pick_next_task_stop, + .put_prev_task = put_prev_task_stop, +-- +2.42.0 + + +From a4c874c05e95ba4e22151f5ac13074b8c1d861e8 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior +Date: Wed, 20 Sep 2023 15:00:24 +0200 +Subject: [PATCH 20/28] sched/debug: Remove the + /proc/sys/kernel/sched_child_runs_first sysctl + +The /proc/sys/kernel/sched_child_runs_first knob is no longer connected since: + + 5e963f2bd4654 ("sched/fair: Commit to EEVDF") + +Remove it. 
+ +Signed-off-by: Sebastian Andrzej Siewior +Signed-off-by: Ingo Molnar +Link: https://lore.kernel.org/r/20230920130025.412071-2-bigeasy@linutronix.de +--- + kernel/sched/debug.c | 1 - + kernel/sched/fair.c | 13 ------------- + kernel/sched/sched.h | 2 -- + 3 files changed, 16 deletions(-) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 2c5bb64f5..003fe3fb4 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -863,7 +863,6 @@ static void sched_debug_header(struct seq_file *m) + #define PN(x) \ + SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + PN(sysctl_sched_base_slice); +- P(sysctl_sched_child_runs_first); + P(sysctl_sched_features); + #undef PN + #undef P +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index ab95f1312..23d769b9c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -78,12 +78,6 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; + unsigned int sysctl_sched_base_slice = 750000ULL; + static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; + +-/* +- * After fork, child runs first. If set to 0 (default) then +- * parent will (try to) run first. +- */ +-unsigned int sysctl_sched_child_runs_first __read_mostly; +- + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + + int sched_thermal_decay_shift; +@@ -145,13 +139,6 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; + + #ifdef CONFIG_SYSCTL + static struct ctl_table sched_fair_sysctls[] = { +- { +- .procname = "sched_child_runs_first", +- .data = &sysctl_sched_child_runs_first, +- .maxlen = sizeof(unsigned int), +- .mode = 0644, +- .proc_handler = proc_dointvec, +- }, + #ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 26c235d3a..ab53f7eca 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -109,8 +109,6 @@ extern __read_mostly int scheduler_running; + extern unsigned long calc_load_update; + extern atomic_long_t calc_load_tasks; + +-extern unsigned int sysctl_sched_child_runs_first; +- + extern void calc_global_load_tick(struct rq *this_rq); + extern long calc_load_fold_active(struct rq *this_rq, long adjust); + +-- +2.42.0 + + +From 01a0126a1b54a0984126857645838cb03adac52e Mon Sep 17 00:00:00 2001 +From: Yiwei Lin +Date: Fri, 20 Oct 2023 13:56:17 +0800 +Subject: [PATCH 21/28] sched/fair: Remove unused 'curr' argument from + pick_next_entity() + +The 'curr' argument of pick_next_entity() has become unused after +the EEVDF changes. + +[ mingo: Updated the changelog. ] + +Signed-off-by: Yiwei Lin +Signed-off-by: Ingo Molnar +Link: https://lore.kernel.org/r/20231020055617.42064-1-s921975628@gmail.com +--- + kernel/sched/fair.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 23d769b9c..fe6c762d5 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -5242,7 +5242,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + * 4) do not run the "skip" process, if something else is available + */ + static struct sched_entity * +-pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) ++pick_next_entity(struct cfs_rq *cfs_rq) + { + /* + * Enabling NEXT_BUDDY will affect latency but not fairness. 
+@@ -8058,7 +8058,7 @@ static struct task_struct *pick_task_fair(struct rq *rq) + goto again; + } + +- se = pick_next_entity(cfs_rq, curr); ++ se = pick_next_entity(cfs_rq); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + +@@ -8121,7 +8121,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf + } + } + +- se = pick_next_entity(cfs_rq, curr); ++ se = pick_next_entity(cfs_rq); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + +@@ -8160,7 +8160,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf + put_prev_task(rq, prev); + + do { +- se = pick_next_entity(cfs_rq, NULL); ++ se = pick_next_entity(cfs_rq); + set_next_entity(cfs_rq, se); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); +-- +2.42.0 + + +From 0ba49853099d2af9d4bb8e2c945f8dc6daa81216 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Sat, 14 Oct 2023 23:12:20 +0200 +Subject: [PATCH 22/28] sched/eevdf: Add feature comments + +Signed-off-by: Peter Zijlstra (Intel) +--- + kernel/sched/features.h | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 546d212ef..46b65fdc6 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -5,7 +5,14 @@ + * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. + */ + SCHED_FEAT(PLACE_LAG, true) ++/* ++ * Give new tasks half a slice to ease into the competition. ++ */ + SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) ++/* ++ * Inhibit (wakeup) preemption until the current task has either matched the ++ * 0-lag point or until is has exhausted it's slice. ++ */ + SCHED_FEAT(RUN_TO_PARITY, true) + + /* +-- +2.42.0 + + +From 8db4bc9bec401af5bf78f071471728dac9420171 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Wed, 4 Oct 2023 12:43:53 +0200 +Subject: [PATCH 23/28] sched/eevdf: Remove min_vruntime_copy + +Since commit e8f331bcc270 ("sched/smp: Use lag to simplify +cross-runqueue placement") the min_vruntime_copy is no longer used. + +Signed-off-by: Peter Zijlstra (Intel) +--- + kernel/sched/fair.c | 5 ++--- + kernel/sched/sched.h | 4 ---- + 2 files changed, 2 insertions(+), 7 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index fe6c762d5..fbf907804 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -774,8 +774,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) + } + + /* ensure we never gain time by being placed backwards. 
*/ +- u64_u32_store(cfs_rq->min_vruntime, +- __update_min_vruntime(cfs_rq, vruntime)); ++ cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); + } + + static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +@@ -12343,7 +12342,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + void init_cfs_rq(struct cfs_rq *cfs_rq) + { + cfs_rq->tasks_timeline = RB_ROOT_CACHED; +- u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); ++ cfs_rq->min_vruntime = (u64)(-(1LL << 20)); + #ifdef CONFIG_SMP + raw_spin_lock_init(&cfs_rq->removed.lock); + #endif +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index ab53f7eca..0a1957994 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -556,10 +556,6 @@ struct cfs_rq { + u64 min_vruntime_fi; + #endif + +-#ifndef CONFIG_64BIT +- u64 min_vruntime_copy; +-#endif +- + struct rb_root_cached tasks_timeline; + + /* +-- +2.42.0 + + +From b46653f0195e9b8f467671caa47973a9bf518ee6 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Mon, 22 May 2023 13:46:30 +0200 +Subject: [PATCH 24/28] sched/eevdf: Use sched_attr::sched_runtime to set + request/slice suggestion + +Allow applications to directly set a suggested request/slice length using +sched_attr::sched_runtime. + +The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms] +which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100. + +Applications should strive to use their periodic runtime at a high +confidence interval (95%+) as the target slice. Using a smaller slice +will introduce undue preemptions, while using a larger value will +increase latency. + +For all the following examples assume a scheduling quantum of 8, and for +consistency all examples have W=4: + + {A,B,C,D}(w=1,r=8): + + ABCD... + +---+---+---+--- + + t=0, V=1.5 t=1, V=3.5 + A |------< A |------< + B |------< B |------< + C |------< C |------< + D |------< D |------< + ---+*------+-------+--- ---+--*----+-------+--- + + t=2, V=5.5 t=3, V=7.5 + A |------< A |------< + B |------< B |------< + C |------< C |------< + D |------< D |------< + ---+----*--+-------+--- ---+------*+-------+--- + +Note: 4 identical tasks in FIFO order + +~~~ + + {A,B}(w=1,r=16) C(w=2,r=16) + + AACCBBCC... + +---+---+---+--- + + t=0, V=1.25 t=2, V=5.25 + A |--------------< A |--------------< + B |--------------< B |--------------< + C |------< C |------< + ---+*------+-------+--- ---+----*--+-------+--- + + t=4, V=8.25 t=6, V=12.25 + A |--------------< A |--------------< + B |--------------< B |--------------< + C |------< C |------< + ---+-------*-------+--- ---+-------+---*---+--- + +Note: 1 heavy task -- because q=8, double r such that the deadline of the w=2 + task doesn't go below q. + +Note: observe the full schedule becomes: W*max(r_i/w_i) = 4*2q = 8q in length. + +Note: the period of the heavy task is half the full period at: + W*(r_i/w_i) = 4*(2q/2) = 4q + +~~~ + + {A,C,D}(w=1,r=16) B(w=1,r=8): + + BAACCBDD... 
+ +---+---+---+--- + + t=0, V=1.5 t=1, V=3.5 + A |--------------< A |---------------< + B |------< B |------< + C |--------------< C |--------------< + D |--------------< D |--------------< + ---+*------+-------+--- ---+--*----+-------+--- + + t=3, V=7.5 t=5, V=11.5 + A |---------------< A |---------------< + B |------< B |------< + C |--------------< C |--------------< + D |--------------< D |--------------< + ---+------*+-------+--- ---+-------+--*----+--- + + t=6, V=13.5 + A |---------------< + B |------< + C |--------------< + D |--------------< + ---+-------+----*--+--- + +Note: 1 short task -- again double r so that the deadline of the short task + won't be below q. Made B short because its not the leftmost task, but is + eligible with the 0,1,2,3 spread. + +Note: like with the heavy task, the period of the short task observes: + W*(r_i/w_i) = 4*(1q/1) = 4q + +~~~ + + A(w=1,r=16) B(w=1,r=8) C(w=2,r=16) + + BCCAABCC... + +---+---+---+--- + + t=0, V=1.25 t=1, V=3.25 + A |--------------< A |--------------< + B |------< B |------< + C |------< C |------< + ---+*------+-------+--- ---+--*----+-------+--- + + t=3, V=7.25 t=5, V=11.25 + A |--------------< A |--------------< + B |------< B |------< + C |------< C |------< + ---+------*+-------+--- ---+-------+--*----+--- + + t=6, V=13.25 + A |--------------< + B |------< + C |------< + ---+-------+----*--+--- + +Note: 1 heavy and 1 short task -- combine them all. + +Note: both the short and heavy task end up with a period of 4q + +~~~ + + A(w=1,r=16) B(w=2,r=16) C(w=1,r=8) + + BBCAABBC... + +---+---+---+--- + + t=0, V=1 t=2, V=5 + A |--------------< A |--------------< + B |------< B |------< + C |------< C |------< + ---+*------+-------+--- ---+----*--+-------+--- + + t=3, V=7 t=5, V=11 + A |--------------< A |--------------< + B |------< B |------< + C |------< C |------< + ---+------*+-------+--- ---+-------+--*----+--- + + t=7, V=15 + A |--------------< + B |------< + C |------< + ---+-------+------*+--- + +Note: as before but permuted + +~~~ + +From all this it can be deduced that, for the steady state: + + - the total period (P) of a schedule is: W*max(r_i/w_i) + - the average period of a task is: W*(r_i/w_i) + - each task obtains the fair share: w_i/W of each full period P + +Signed-off-by: Peter Zijlstra (Intel) +--- + include/linux/sched.h | 3 +++ + kernel/sched/core.c | 33 ++++++++++++++++++++++++++------- + kernel/sched/debug.c | 3 ++- + kernel/sched/fair.c | 6 ++++-- + 4 files changed, 35 insertions(+), 10 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 35331c35f..e0a81ce05 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -555,6 +555,9 @@ struct sched_entity { + struct list_head group_node; + unsigned int on_rq; + ++ unsigned int custom_slice : 1; ++ /* 31 bits hole */ ++ + u64 exec_start; + u64 sum_exec_runtime; + u64 prev_sum_exec_runtime; +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 4d851de8e..6fcccd15a 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4502,7 +4502,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.nr_migrations = 0; + p->se.vruntime = 0; + p->se.vlag = 0; +- p->se.slice = sysctl_sched_base_slice; + INIT_LIST_HEAD(&p->se.group_node); + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -4756,6 +4755,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + + p->prio = p->normal_prio = p->static_prio; + set_load_weight(p, false); ++ p->se.custom_slice = 0; ++ p->se.slice = 
sysctl_sched_base_slice; + + /* + * We don't need the reset flag anymore after the fork. It has +@@ -7527,10 +7528,20 @@ static void __setscheduler_params(struct task_struct *p, + + p->policy = policy; + +- if (dl_policy(policy)) ++ if (dl_policy(policy)) { + __setparam_dl(p, attr); +- else if (fair_policy(policy)) ++ } else if (fair_policy(policy)) { + p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ if (attr->sched_runtime) { ++ p->se.custom_slice = 1; ++ p->se.slice = clamp_t(u64, attr->sched_runtime, ++ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ ++ NSEC_PER_MSEC*100); /* HZ=100 / 10 */ ++ } else { ++ p->se.custom_slice = 0; ++ p->se.slice = sysctl_sched_base_slice; ++ } ++ } + + /* + * __sched_setscheduler() ensures attr->sched_priority == 0 when +@@ -7715,7 +7726,9 @@ static int __sched_setscheduler(struct task_struct *p, + * but store a possible modification of reset_on_fork. + */ + if (unlikely(policy == p->policy)) { +- if (fair_policy(policy) && attr->sched_nice != task_nice(p)) ++ if (fair_policy(policy) && ++ (attr->sched_nice != task_nice(p) || ++ (attr->sched_runtime && attr->sched_runtime != p->se.slice))) + goto change; + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + goto change; +@@ -7861,6 +7874,9 @@ static int _sched_setscheduler(struct task_struct *p, int policy, + .sched_nice = PRIO_TO_NICE(p->static_prio), + }; + ++ if (p->se.custom_slice) ++ attr.sched_runtime = p->se.slice; ++ + /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ + if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; +@@ -8037,12 +8053,14 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a + + static void get_params(struct task_struct *p, struct sched_attr *attr) + { +- if (task_has_dl_policy(p)) ++ if (task_has_dl_policy(p)) { + __getparam_dl(p, attr); +- else if (task_has_rt_policy(p)) ++ } else if (task_has_rt_policy(p)) { + attr->sched_priority = p->rt_priority; +- else ++ } else { + attr->sched_nice = task_nice(p); ++ attr->sched_runtime = p->se.slice; ++ } + } + + /** +@@ -10061,6 +10079,7 @@ void __init sched_init(void) + } + + set_load_weight(&init_task, false); ++ init_task.se.slice = sysctl_sched_base_slice, + + /* + * The boot idle thread does lazy MMU switching as well: +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 003fe3fb4..37ffe69a9 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -578,11 +578,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + else + SEQ_printf(m, " %c", task_state_to_char(p)); + +- SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", ++ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", + p->comm, task_pid_nr(p), + SPLIT_NS(p->se.vruntime), + entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N', + SPLIT_NS(p->se.deadline), ++ p->se.custom_slice ? 'S' : ' ', + SPLIT_NS(p->se.slice), + SPLIT_NS(p->se.sum_exec_runtime), + (long long)(p->nvcsw + p->nivcsw), +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index fbf907804..357005f0d 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1011,7 +1011,8 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) + * nice) while the request time r_i is determined by + * sysctl_sched_base_slice. 
+ */ +- se->slice = sysctl_sched_base_slice; ++ if (!se->custom_slice) ++ se->slice = sysctl_sched_base_slice; + + /* + * EEVDF: vd_i = ve_i + r_i / w_i +@@ -4961,7 +4962,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + u64 vslice, vruntime = avg_vruntime(cfs_rq); + s64 lag = 0; + +- se->slice = sysctl_sched_base_slice; ++ if (!se->custom_slice) ++ se->slice = sysctl_sched_base_slice; + vslice = calc_delta_fair(se->slice, se); + + /* +-- +2.42.0 + + +From 03a5b5f7a486c044e165ddec3253ebdbc0039031 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Tue, 26 Sep 2023 14:32:32 +0200 +Subject: [PATCH 25/28] sched/eevdf: Allow shorter slices to wakeup-preempt + +Part of the reason to have shorter slices is to improve +responsiveness. Allow shorter slices to preempt longer slices on +wakeup. + + Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Sum delay ms | + + 100ms massive_intr 500us cyclictest NO_PREEMPT_SHORT + + 1 massive_intr:(5) | 846018.956 ms | 779188 | avg: 0.273 ms | max: 58.337 ms | sum:212545.245 ms | + 2 massive_intr:(5) | 853450.693 ms | 792269 | avg: 0.275 ms | max: 71.193 ms | sum:218263.588 ms | + 3 massive_intr:(5) | 843888.920 ms | 771456 | avg: 0.277 ms | max: 92.405 ms | sum:213353.221 ms | + 1 chromium-browse:(8) | 53015.889 ms | 131766 | avg: 0.463 ms | max: 36.341 ms | sum:60959.230 ms | + 2 chromium-browse:(8) | 53864.088 ms | 136962 | avg: 0.480 ms | max: 27.091 ms | sum:65687.681 ms | + 3 chromium-browse:(9) | 53637.904 ms | 132637 | avg: 0.481 ms | max: 24.756 ms | sum:63781.673 ms | + 1 cyclictest:(5) | 12615.604 ms | 639689 | avg: 0.471 ms | max: 32.272 ms | sum:301351.094 ms | + 2 cyclictest:(5) | 12511.583 ms | 642578 | avg: 0.448 ms | max: 44.243 ms | sum:287632.830 ms | + 3 cyclictest:(5) | 12545.867 ms | 635953 | avg: 0.475 ms | max: 25.530 ms | sum:302374.658 ms | + + 100ms massive_intr 500us cyclictest PREEMPT_SHORT + + 1 massive_intr:(5) | 839843.919 ms | 837384 | avg: 0.264 ms | max: 74.366 ms | sum:221476.885 ms | + 2 massive_intr:(5) | 852449.913 ms | 845086 | avg: 0.252 ms | max: 68.162 ms | sum:212595.968 ms | + 3 massive_intr:(5) | 839180.725 ms | 836883 | avg: 0.266 ms | max: 69.742 ms | sum:222812.038 ms | + 1 chromium-browse:(11) | 54591.481 ms | 138388 | avg: 0.458 ms | max: 35.427 ms | sum:63401.508 ms | + 2 chromium-browse:(8) | 52034.541 ms | 132276 | avg: 0.436 ms | max: 31.826 ms | sum:57732.958 ms | + 3 chromium-browse:(8) | 55231.771 ms | 141892 | avg: 0.469 ms | max: 27.607 ms | sum:66538.697 ms | + 1 cyclictest:(5) | 13156.391 ms | 667412 | avg: 0.373 ms | max: 38.247 ms | sum:249174.502 ms | + 2 cyclictest:(5) | 12688.939 ms | 665144 | avg: 0.374 ms | max: 33.548 ms | sum:248509.392 ms | + 3 cyclictest:(5) | 13475.623 ms | 669110 | avg: 0.370 ms | max: 37.819 ms | sum:247673.390 ms | + +As per the numbers the, this makes cyclictest (short slice) it's +max-delay more consistent and consistency drops the sum-delay. The +trade-off is that the massive_intr (long slice) gets more context +switches and a slight increase in sum-delay. 
+ +[mike: numbers] +Signed-off-by: Peter Zijlstra (Intel) +Tested-by: Mike Galbraith +--- + kernel/sched/fair.c | 11 ++++++++--- + kernel/sched/features.h | 4 ++++ + 2 files changed, 12 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 357005f0d..b16c70e3b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8022,9 +8022,14 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int + cfs_rq = cfs_rq_of(se); + update_curr(cfs_rq); + +- /* +- * XXX pick_eevdf(cfs_rq) != se ? +- */ ++ if (sched_feat(PREEMPT_SHORT) && pse->slice < se->slice && ++ entity_eligible(cfs_rq, pse) && ++ (s64)(pse->deadline - se->deadline) < 0 && ++ se->vlag == se->deadline) { ++ /* negate RUN_TO_PARITY */ ++ se->vlag = se->deadline - 1; ++ } ++ + if (pick_eevdf(cfs_rq) == pse) + goto preempt; + +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 46b65fdc6..642f1de58 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -14,6 +14,10 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) + * 0-lag point or until is has exhausted it's slice. + */ + SCHED_FEAT(RUN_TO_PARITY, true) ++/* ++ * Allow tasks with a shorter slice to disregard RUN_TO_PARITY ++ */ ++SCHED_FEAT(PREEMPT_SHORT, true) + + /* + * Prefer to schedule the task we woke last (assuming it failed +-- +2.42.0 + + +From 3c04ff192a3569a6c0ee6924678bc598efc1af59 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Tue, 26 Sep 2023 14:39:41 +0200 +Subject: [PATCH 26/28] sched/eevdf: Revenge of the Sith^WSleeper + +For tasks that have received excess service (negative lag) allow them +to gain parity (zero lag) by sleeping. + + slice 30000000 (*10) + # Min Latencies: 00041 + # Avg Latencies: 00712 + # Max Latencies: 287353 + + slice 3000000 (default) + # Min Latencies: 00054 + # Avg Latencies: 00436 + # Max Latencies: 23531 + + slice 300000 (/10) + # Min Latencies: 00054 + # Avg Latencies: 00061 + # Max Latencies: 05245 + +It sucks for many other things though... so let it be an experiment. + +Signed-off-by: Peter Zijlstra (Intel) +--- + kernel/sched/fair.c | 36 ++++++++++++++++++++++++++++++++++++ + kernel/sched/features.h | 6 ++++++ + 2 files changed, 42 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b16c70e3b..10009c713 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -4956,6 +4956,33 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} + + #endif /* CONFIG_SMP */ + ++static inline u64 ++entity_vlag_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) ++{ ++ u64 now, vdelta; ++ s64 delta; ++ ++ if (!(flags & ENQUEUE_WAKEUP)) ++ return se->vlag; ++ ++ if (flags & ENQUEUE_MIGRATED) ++ return 0; ++ ++ now = rq_clock_task(rq_of(cfs_rq)); ++ delta = now - se->exec_start; ++ if (delta < 0) ++ return se->vlag; ++ ++ if (sched_feat(GENTLE_SLEEPER)) ++ delta /= 2; ++ ++ vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load); ++ if (vdelta < -se->vlag) ++ return se->vlag + vdelta; ++ ++ return 0; ++} ++ + static void + place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +@@ -4980,6 +5007,15 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + + lag = se->vlag; + ++ /* ++ * Allow tasks that have received too much service (negative ++ * lag) to (re)gain parity (zero lag) by sleeping for the ++ * equivalent duration. This ensures they will be readily ++ * eligible. 
++ */ ++ if (sched_feat(PLACE_SLEEPER) && lag < 0) ++ lag = entity_vlag_sleeper(cfs_rq, se, flags); ++ + /* + * If we want to place a task and preserve lag, we have to + * consider the effect of the new entity on the weighted +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 642f1de58..97b5f6dd9 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -18,6 +18,12 @@ SCHED_FEAT(RUN_TO_PARITY, true) + * Allow tasks with a shorter slice to disregard RUN_TO_PARITY + */ + SCHED_FEAT(PREEMPT_SHORT, true) ++/* ++ * Let sleepers earn back lag, but not more than 0-lag. GENTLE_SLEEPERS earn at ++ * half the speed. ++ */ ++SCHED_FEAT(PLACE_SLEEPER, false) ++SCHED_FEAT(GENTLE_SLEEPER, true) + + /* + * Prefer to schedule the task we woke last (assuming it failed +-- +2.42.0 + + +From 445a0b07989b3da3116f8f6bf7dd59806a79cc32 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Thu, 5 Oct 2023 15:30:13 +0200 +Subject: [PATCH 27/28] sched/eevdf: Disable entity_eligible() + +Disable entity_eligible() entirely, this makes tasks much easier to +pick, but also gives rise to degenerate cases like: + +t=92 V=16 + A |----< + B |< +>C |----------------< + D |< + E |< + F |< + G |< + |---------|-----*---|---------|---------|---- + +hence, default disable. + +Suggested-by: Youssef Esmat +Signed-off-by: Peter Zijlstra (Intel) +--- + kernel/sched/fair.c | 3 +++ + kernel/sched/features.h | 11 +++++++++++ + 2 files changed, 14 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 10009c713..1bdd95677 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -724,6 +724,9 @@ void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) + */ + int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) + { ++ if (sched_feat(EVDF)) ++ return true; ++ + struct sched_entity *curr = cfs_rq->curr; + s64 avg = cfs_rq->avg_vruntime; + long load = cfs_rq->avg_load; +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 97b5f6dd9..dacef8e1b 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -24,6 +24,17 @@ SCHED_FEAT(PREEMPT_SHORT, true) + */ + SCHED_FEAT(PLACE_SLEEPER, false) + SCHED_FEAT(GENTLE_SLEEPER, true) ++/* ++ * Disable the eligibility check -- always true. ++ * ++ * Selecting this allows short tasks, in the presence of a long task, to walk ++ * far past 0-lag and create a window where newly placed tasks will come in and ++ * starve the long task. ++ * ++ * Behaves quite terrible for mixed slice workloads as a result, very much not ++ * recommended. ++ */ ++SCHED_FEAT(EVDF, false) + + /* + * Prefer to schedule the task we woke last (assuming it failed +-- +2.42.0 + + +From f6be5a98f494e65149331150991c7079a7f9338c Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Fri, 15 Sep 2023 00:48:45 +0200 +Subject: [PATCH 28/28] sched/eevdf: Delay dequeue + +For tasks that have negative-lag (have received 'excess' service), delay the +dequeue and keep them in the runnable tree until they're eligible again. Or +rather, keep them until they're selected again, since finding their eligibility +crossover point is expensive. + +The effect is a bit like sleeper bonus, the tasks keep contending for service +until either they get a wakeup or until they're selected again and are really +dequeued. + +This means that any actual dequeue happens with positive lag (serviced owed) +and are more readily ran when woken next. 
+ +Signed-off-by: Peter Zijlstra (Intel) +--- + include/linux/sched.h | 1 + + kernel/sched/core.c | 88 +++++++++++++++++++++++++++++++++-------- + kernel/sched/fair.c | 11 ++++++ + kernel/sched/features.h | 11 ++++++ + kernel/sched/sched.h | 3 +- + 5 files changed, 97 insertions(+), 17 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index e0a81ce05..93c03b162 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -894,6 +894,7 @@ struct task_struct { + unsigned sched_reset_on_fork:1; + unsigned sched_contributes_to_load:1; + unsigned sched_migrated:1; ++ unsigned sched_delayed:1; + + /* Force alignment to the next boundary: */ + unsigned :0; +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 6fcccd15a..9f56638b6 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -3839,12 +3839,23 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) + + rq = __task_rq_lock(p, &rf); + if (task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ if (unlikely(p->sched_delayed)) { ++ p->sched_delayed = 0; ++ /* mustn't run a delayed task */ ++ WARN_ON_ONCE(task_on_cpu(rq, p)); ++ if (sched_feat(GENTLE_DELAY)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); ++ if (p->se.vlag > 0) ++ p->se.vlag = 0; ++ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); ++ } ++ } + if (!task_on_cpu(rq, p)) { + /* + * When on_rq && !on_cpu the task is preempted, see if + * it should preempt the task that is current now. + */ +- update_rq_clock(rq); + wakeup_preempt(rq, p, wake_flags); + } + ttwu_do_wakeup(p); +@@ -6552,6 +6563,24 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + # define SM_MASK_PREEMPT SM_PREEMPT + #endif + ++static void deschedule_task(struct rq *rq, struct task_struct *p, unsigned long prev_state) ++{ ++ p->sched_contributes_to_load = ++ (prev_state & TASK_UNINTERRUPTIBLE) && ++ !(prev_state & TASK_NOLOAD) && ++ !(prev_state & TASK_FROZEN); ++ ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible++; ++ ++ deactivate_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); ++ ++ if (p->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++} ++ + /* + * __schedule() is the main scheduler function. + * +@@ -6636,6 +6665,8 @@ static void __sched notrace __schedule(unsigned int sched_mode) + + switch_count = &prev->nivcsw; + ++ WARN_ON_ONCE(prev->sched_delayed); ++ + /* + * We must load prev->state once (task_struct::state is volatile), such + * that we form a control dependency vs deactivate_task() below. +@@ -6645,14 +6676,6 @@ static void __sched notrace __schedule(unsigned int sched_mode) + if (signal_pending_state(prev_state, prev)) { + WRITE_ONCE(prev->__state, TASK_RUNNING); + } else { +- prev->sched_contributes_to_load = +- (prev_state & TASK_UNINTERRUPTIBLE) && +- !(prev_state & TASK_NOLOAD) && +- !(prev_state & TASK_FROZEN); +- +- if (prev->sched_contributes_to_load) +- rq->nr_uninterruptible++; +- + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) +@@ -6664,17 +6687,50 @@ static void __sched notrace __schedule(unsigned int sched_mode) + * + * After this, schedule() must not care about p->state any more. 
+ */ +- deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); +- +- if (prev->in_iowait) { +- atomic_inc(&rq->nr_iowait); +- delayacct_blkio_start(); +- } ++ if (sched_feat(DELAY_DEQUEUE) && ++ prev->sched_class->delay_dequeue_task && ++ prev->sched_class->delay_dequeue_task(rq, prev)) ++ prev->sched_delayed = 1; ++ else ++ deschedule_task(rq, prev, prev_state); + } + switch_count = &prev->nvcsw; + } + +- next = pick_next_task(rq, prev, &rf); ++ for (struct task_struct *tmp = prev;;) { ++ unsigned long tmp_state; ++ ++ next = pick_next_task(rq, tmp, &rf); ++ if (unlikely(tmp != prev)) ++ finish_task(tmp); ++ ++ if (likely(!next->sched_delayed)) ++ break; ++ ++ next->sched_delayed = 0; ++ ++ /* ++ * A sched_delayed task must not be runnable at this point, see ++ * ttwu_runnable(). ++ */ ++ tmp_state = READ_ONCE(next->__state); ++ if (WARN_ON_ONCE(!tmp_state)) ++ break; ++ ++ prepare_task(next); ++ /* ++ * Order ->on_cpu and ->on_rq, see the comments in ++ * try_to_wake_up(). Normally this is smp_mb__after_spinlock() ++ * above. ++ */ ++ smp_wmb(); ++ deschedule_task(rq, next, tmp_state); ++ if (sched_feat(GENTLE_DELAY) && next->se.vlag > 0) ++ next->se.vlag = 0; ++ ++ tmp = next; ++ } ++ + clear_tsk_need_resched(prev); + clear_preempt_need_resched(); + #ifdef CONFIG_SCHED_DEBUG +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 1bdd95677..8c1d8bbe7 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8260,6 +8260,16 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq) + return pick_next_task_fair(rq, NULL, NULL); + } + ++static bool delay_dequeue_task_fair(struct rq *rq, struct task_struct *p) ++{ ++ struct sched_entity *se = &p->se; ++ struct cfs_rq *cfs_rq = cfs_rq_of(se); ++ ++ update_curr(cfs_rq); ++ ++ return !entity_eligible(cfs_rq, se); ++} ++ + /* + * Account for a descheduled task: + */ +@@ -12714,6 +12724,7 @@ DEFINE_SCHED_CLASS(fair) = { + + .wakeup_preempt = check_preempt_wakeup_fair, + ++ .delay_dequeue_task = delay_dequeue_task_fair, + .pick_next_task = __pick_next_task_fair, + .put_prev_task = put_prev_task_fair, + .set_next_task = set_next_task_fair, +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index dacef8e1b..fd2c963b7 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -35,6 +35,17 @@ SCHED_FEAT(GENTLE_SLEEPER, true) + * recommended. + */ + SCHED_FEAT(EVDF, false) ++/* ++ * Delay dequeueing tasks until they get selected or woken. ++ * ++ * By delaying the dequeue for non-eligible tasks, they remain in the ++ * competition and can burn off their negative lag. When they get selected ++ * they'll have positive lag by definition. ++ * ++ * GENTLE_DELAY clips the lag on dequeue (or wakeup) to 0. 
++ */ ++SCHED_FEAT(DELAY_DEQUEUE, true) ++SCHED_FEAT(GENTLE_DELAY, true) + + /* + * Prefer to schedule the task we woke last (assuming it failed +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 0a1957994..50bca9b72 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2213,6 +2213,7 @@ struct sched_class { + + void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + ++ bool (*delay_dequeue_task)(struct rq *rq, struct task_struct *p); + struct task_struct *(*pick_next_task)(struct rq *rq); + + void (*put_prev_task)(struct rq *rq, struct task_struct *p); +@@ -2266,7 +2267,7 @@ struct sched_class { + + static inline void put_prev_task(struct rq *rq, struct task_struct *prev) + { +- WARN_ON_ONCE(rq->curr != prev); ++// WARN_ON_ONCE(rq->curr != prev); + prev->sched_class->put_prev_task(rq, prev); + } + +-- +2.42.0 +
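
A minimal userspace sketch (not part of the diff above; it assumes a kernel built
with PATCH 24/28 "sched/eevdf: Use sched_attr::sched_runtime to set request/slice
suggestion" applied): a SCHED_OTHER task can suggest its own request/slice through
sched_attr::sched_runtime. glibc provides no wrapper for sched_setattr(2), so the
raw syscall is used and the VER0 layout of struct sched_attr is declared locally;
the kernel clamps the suggestion to the 0.1[ms] <= slice <= 100[ms] range described
in the changelog.

  #define _GNU_SOURCE
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/syscall.h>

  /* VER0 layout of the uapi struct sched_attr (linux/sched/types.h) */
  struct sched_attr {
          uint32_t size;
          uint32_t sched_policy;
          uint64_t sched_flags;
          int32_t  sched_nice;
          uint32_t sched_priority;
          uint64_t sched_runtime;    /* doubles as the slice suggestion */
          uint64_t sched_deadline;
          uint64_t sched_period;
  };

  int main(void)
  {
          struct sched_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.size          = sizeof(attr);
          attr.sched_policy  = 0;              /* SCHED_OTHER */
          attr.sched_runtime = 500 * 1000ULL;  /* suggest a 0.5ms slice */

          /* pid 0 == calling thread, flags == 0 */
          if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
                  perror("sched_setattr");
                  return 1;
          }
          printf("custom EEVDF slice requested for pid %d\n", getpid());
          return 0;
  }

With the series applied, the request shows up as the 'S' marker and slice column that
the same change adds to the /proc/sched_debug task listing. As the changelog notes,
a smaller slice than the task's periodic runtime introduces undue preemptions, while
a larger one increases latency; with PREEMPT_SHORT enabled a shorter slice also lets
the task wakeup-preempt longer-slice peers.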